source: trunk/SRC/Matrix/cmset_op.pro @ 134

Last change on this file since 134 was 134, checked in by navarro, 18 years ago

change *.pro file properties (del eof-style, del executable, set keywords Id

  • Property svn:keywords set to Id
File size: 16.2 KB
Line 
1;+
2; @hidden
3;
4; @todo seb
5;-
6;
7;; Simplified version of CMSET_OP_UNIQ which sorts, and takes the
8;; "first" value, whatever that may mean.
9function cmset_op_uniq, a
10;
11  compile_opt idl2, strictarrsubs
12;
13  if n_elements(a) LE 1 then return, 0L
14
15  ii = sort(a) & b = a[ii]
16  wh = where(b NE shift(b, +1L), ct)
17  if ct GT 0 then return, ii[wh]
18
19  return, 0L
20;
21end
22;+
23;
24; @file_comments
25; Performs an AND, OR, or XOR operation between two sets
26;
27;   Description: SET_OP performs three common operations between two sets.  The
28;   three supported functions of OP are:
29;
30;        OP          Meaning
31;      'AND' - to find the intersection of A and B;
32;      'OR'  - to find the union of A and B;
33;      'XOR' - to find the those elements who are members of A or B
34;              but not both;
35;
36;   Sets as defined here are one dimensional arrays composed of
37;   numeric or string types.  Comparisons of equality between elements
38;   are done using the IDL EQ operator.
39;
40;   The complements of either set can be taken as well, by using the
41;   NOT1 and NOT2 keywords.  For example, it may be desireable to find
42;   the elements in A but not B, or B but not A (they are different!).
43;   The following IDL expressions achieve each of those effects:
44;
45;      SET = CMSET_OP(A, 'AND', /NOT2, B)   ; A but not B
46;      SET = CMSET_OP(/NOT1, A, 'AND', B)   ; B but not A
47;
48;   Note the distinction between NOT1 and NOT2.  NOT1 refers to the
49;   first set (A) and NOT2 refers to the second (B).  Their ordered
50;   placement in the calling sequence is entirely optional, but the
51;   above ordering makes the logical meaning explicit.
52;
53;   NOT1 and NOT2 can only be set for the 'AND' operator, and never
54;   simultaneously.  This is because the results of an operation with
55;   'OR' or 'XOR' and any combination of NOTs -- or with 'AND' and
56;   both NOTs -- formally cannot produce a defined result.
57;
58;   The implementation depends on the type of operands.  For integer
59;   types, a fast technique using HISTOGRAM is used.  However, this
60;   algorithm becomes inefficient when the dynamic range in the data
61;   is large.  For those cases, and for other data types, a technique
62;   based on SORT() is used.  Thus the compute time should scale
63;   roughly as (A+B)*ALOG(A+B) or better, rather than (A*B) for the
64;   brute force approach.  For large arrays this is a significant
65;   benefit.
66;
67; @categories array
68;
69; @param A {in}{required} The two sets to be operated on.  A one dimensional array of
70;          either numeric or string type.  A and B must be of the same
71;          type.  Empty sets are permitted, and are either represented
72;          as an undefined variable, or by setting EMPTY1 or EMPTY2.
73;
74; @param B {in}{required} See A
75;
76; @param OP {in}{required} a string, the operation to be performed.  Must be one of
77;        'AND', 'OR' or 'XOR' (lower or mixed case is permitted).
78;        Other operations will cause an error message to be produced.
79;
80; @keyword NOT1 If set and OP is 'AND', then the complement of A (for
81;                NOT1) or B (for NOT2) will be used in the operation.
82;                NOT1 and NOT2 cannot be set simultaneously.
83;
84; @keyword NOT2 See NOT1
85;
86; @keyword EMPTY1 If set, then A (for EMPTY1) or B (for EMPTY2) are
87;                 assumed to be the empty set.  The actual values
88;                 passed as A or B are then ignored.
89;
90; @keyword EMPTY2 See EMPTY1
91;
92; @keyword INDEX if set, then return a list of indices instead of the array
93;           values themselves.  The "slower" set operations are always
94;           performed in this case.
95;
96;           The indices refer to the *combined* array [A,B].  To
97;           clarify, in the following call: I = CMSET_OP(..., /INDEX);
98;           returned values from 0 to NA-1 refer to A[I], and values
99;           from NA to NA+NB-1 refer to B[I-NA].
100;
101; @keyword COUNT upon return, the number of elements in the result set.
102;           This is only important when the result set is the empty
103;           set, in which case COUNT is set to zero.
104;
105; @returns The resulting set as a one-dimensional array.  The set may be
106;   represented by either an array of data values (default), or an
107;   array of indices (if INDEX is set).  Duplicate elements, if any,
108;   are removed, and element order may not be preserved.
109;
110;   The empty set is represented as a return value of -1L, and COUNT
111;   is set to zero.  Note that the only way to recognize the empty set
112;   is to examine COUNT.
113;
114; SEE ALSO:
115;
116;   SET_UTILS.PRO by RSI
117;
118; @history Written, CM, 23 Feb 2000
119;   Added empty set capability, CM, 25 Feb 2000
120;   Documentation clarification, CM 02 Mar 2000
121;   Incompatible but more consistent reworking of EMPTY keywords, CM,
122;     04 Mar 2000
123;   Minor documentation clarifications, CM, 26 Mar 2000
124;   Corrected bug in empty_arg special case, CM 06 Apr 2000
125;   Add INDEX keyword, CM 31 Jul 2000
126;   Clarify INDEX keyword documentation, CM 06 Sep 2000
127;   Made INDEX keyword always force SLOW_SET_OP, CM 06 Sep 2000
128;   Added CMSET_OP_UNIQ, and ability to select FIRST_UNIQUE or
129;     LAST_UNIQUE values, CM, 18 Sep 2000
130;   Removed FIRST_UNIQUE and LAST_UNIQUE, and streamlined
131;     CMSET_OP_UNIQ until problems with SORT can be understood, CM, 20
132;     Sep 2000 (thanks to Ben Tupper)
133;   Still trying to get documentation of INDEX and NOT right, CM, 28
134;     Sep 2000 (no code changes)
135;   Correct bug for AND case, when input sets A and B each only have
136;     one unique value, and the values are equal.  CM, 04 Mar 2004
137;     (thanks to James B. jbattat at cfa dot harvard dot edu)
138;   Add support for the cases where the input data types are mixed,
139;      but still compatible; also, attempt to return the same data
140;      type that was passed in; CM, 05 Feb 2005
141;   Fix bug in type checking (thanks to "marit"), CM, 10 Dec 2005
142;   Work around a stupidity in the built-in IDL HISTOGRAM routine,
143;      which tries to "help" you by restricting the MIN/MAX to the
144;      range of the input variable (thanks to Will Maddox), CM, 16 Jan 2006
145;
146;   Author: Craig B. Markwardt, NASA/GSFC Code 662, Greenbelt, MD 20770
147;   craigm@lheamail.gsfc.nasa.gov
148;
149; @version $Id: cmset_op.pro,v 1.6 2006/01/16 19:45:22 craigm Exp $
150;
151; @examples Utility function, similar to UNIQ, but allowing choice of taking
152; first or last unique element, or non-unique elements.
153; Unfortunately this doesn't work because of implementation dependent
154; versions of the SORT() function.
155;
156; function cmset_op_uniq, a, first=first, non=non, count=ct, sort=sortit
157;   if n_elements(a) LE 1 then return, 0L
158;   sh = (2L*keyword_set(first)-1L)*(-2L*keyword_set(non)+1)
159;
160;   if keyword_set(sortit) then begin
161;       ;; Sort it manually
162;       ii = sort(a) & b = a[ii]
163;       if keyword_set(non) then wh = where(b EQ shift(b, sh), ct) $
164;       else                     wh = where(b NE shift(b, sh), ct)
165;       if ct GT 0 then return, ii[wh]
166;   endif else begin
167;       ;; Use the user's values directly
168;       if keyword_set(non) then wh = where(a EQ shift(a, sh), ct) $
169;       else                     wh = where(a NE shift(a, sh), ct)
170;       if ct GT 0 then return, wh
171;   endelse
172;
173;   if keyword_set(first) then return, 0L else return, n_elements(a)-1
174; end
175
176; Simplified version of CMSET_OP_UNIQ which sorts, and takes the
177; "first" value, whatever that may mean.
178;
179;-
180
181
182function cmset_op, a, op0, b, not1=not1, not2=not2, count=count, $
183              empty1=empty1, empty2=empty2, maxarray=ma, index=index
184;
185  compile_opt idl2, strictarrsubs
186;
187
188  on_error, 2 ;; return on error
189  count = 0L
190  index0 = -1L
191  ;; Histogram technique is used for array sizes < 32,000 elements
192  if n_elements(ma) EQ 0 then ma = 32L*1024L
193
194  ;; Check the number of arguments
195  if n_params() LT 3 then begin
196      ARG_ERR:
197      message, 'USAGE: SET = CMSET_OP(A, OP, B [, COUNT=ct])', /info
198      message, '  KEYWORDS: /NOT1, /NOT2, /EMPTY1, /EMPTY2, INDEX', /info
199      return, -1L
200  endif
201  if n_elements(op0) EQ 0 then goto, ARG_ERR
202  kind = keyword_set(index)
203  fst = 1L
204  if keyword_set(last) then fst = 0L
205  if keyword_set(first) then fst = 1L
206
207  ;; Check the operation
208  sz = size(op0)
209  if sz[sz[0]+1] NE 7 then begin
210      OP_ERR:
211      message, "ERROR: OP must be 'AND', 'OR' or 'XOR'"
212      return, -1L
213  endif
214  op = strupcase(op0)
215  if op NE 'AND' AND op NE 'OR' AND op NE 'XOR' then goto, OP_ERR
216
217  ;; Check NOT1 and NOT2
218  if keyword_set(not1) AND keyword_set(not2) then begin
219      message, "ERROR: NOT1 and NOT2 cannot be set simultaneously"
220      return, -1L
221  endif
222  if (keyword_set(not1) OR keyword_set(not2)) AND $
223    (op EQ 'OR' OR op EQ 'XOR') then begin
224      message, "ERROR: NOT1 and NOT2 cannot be set with 'OR' or 'XOR'"
225      return, -1L
226  endif
227
228  ;; Special cases for empty set
229  n1 = n_elements(a) & n2 = n_elements(b)
230  if keyword_set(empty1) then n1 = 0L
231  if keyword_set(empty2) then n2 = 0L
232  if n1 EQ 0 OR n2 EQ 0 then begin
233      ;; Eliminate duplicates
234      if n1 GT 0 then a1 = cmset_op_uniq(a)
235      if n2 GT 0 then b1 = cmset_op_uniq(b)
236      n1 = n_elements(a1) < n1 & n2 = n_elements(b1) < n2
237      case op of
238          'OR': if n1 EQ 0 then goto, RET_A1 else goto, RET_B1
239         'XOR': if n1 EQ 0 then goto, RET_B1 else goto, RET_A1
240         'AND': begin
241             if keyword_set(not1) AND n1 EQ 0 then goto, RET_B1
242             if keyword_set(not2) AND n2 EQ 0 then goto, RET_A1
243             return, -1L
244         end
245     endcase
246     return, -1L
247     RET_A1:
248     count = n1
249     if kind then begin
250         if count GT 0 then return, a1 else return, -1L
251     endif
252     if count GT 0 then return, a[a1] else return, -1L
253     RET_B1:
254     count = n2
255     if kind then begin
256         if count GT 0 then return, b1+n1 else return, -1L
257     endif         
258     if count GT 0 then return, b[b1] else return, -1L
259 endif
260
261  ;; Allow data to have different types, but they must be at least of
262  ;; the same "base" type.  That is, you can't combine a number with a
263  ;; string, etc.
264  ;; basetype 0:undefined 1:real number 6:complex number 7:string
265  ;;     8:structure 10:pointer 11:object
266
267  ;;          0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
268  basetype = [0, 1, 1, 1, 1, 1, 6, 7, 8, 6,10,11, 1, 1, 1, 1]
269
270  ;; Check types of operands
271  sz1 = size(a) & tp1 = sz1[sz1[0]+1]
272  sz2 = size(b) & tp2 = sz2[sz2[0]+1]
273  if tp1 LT 0 OR tp1 GE 16 OR tp2 LT 0 OR tp2 GE 16 then begin
274      message, 'ERROR: unrecognized data types for operands'
275      return, -1
276  endif
277  if basetype[tp1] NE basetype[tp2] then begin
278      TYPE1_ERR:
279      message, 'ERROR: both A and B must be of the same type'
280      return, -1L
281  endif
282  if tp1 EQ 8 OR tp1 EQ 10 OR tp1 EQ 11 then begin
283      TYPE2_ERR:
284      message, 'ERROR: operands must be a numeric or string type'
285      return, -1L
286  endif
287
288  ;; Now use two different kinds of algorithms: a slower but more
289  ;; general algorithm for generic types, and the histogram technique
290  ;; for integer types.  Even for integer types, if there is too much
291  ;; dynamic range, then the slow method is used.
292
293  if tp1 GE 4 AND tp1 LE 9 then begin
294      ;; String and real types, or large int arrays
295      SLOW_SET_OP:
296      case op of
297          'OR': begin
298              uu = [a,b]    ;; OR is simple; just take unique values
299              index0 = cmset_op_uniq(uu)
300              count = n_elements(index0)
301              if kind then return, index0
302              return, uu[index0]
303          end
304
305          'XOR': begin
306              ;; Make ordered list of set union
307              ai = cmset_op_uniq(a) & na = n_elements(ai)
308              bi = cmset_op_uniq(b) & nb = n_elements(bi)
309              ui = [ai, bi+n1]
310              uu = [a,b]    & uu = uu[ui] ;; Raw union...
311              us = sort(uu) & uu = uu[us] ;; ...and sort
312              if kind then ui = ui[temporary(us)] else ui = 0
313
314              ;; Values in one set only will not have duplicates
315              wh1 = where(uu NE shift(uu, -1), count1)
316              if count1 EQ 0 then return, -1L
317              wh = where(wh1[1:*]-wh1 EQ 1, count)
318              if wh1[0] EQ 0 then begin
319                  if count GT 0 then wh = [-1L, wh] else wh = [-1L]
320                  count = n_elements(wh)
321              endif
322              if count EQ 0 then return, -1
323              if kind then return, ui[wh1[wh+1]]
324              return, uu[wh1[wh+1]]
325          end
326
327          'AND': begin
328              ;; Make ordered list of set union
329              ai = cmset_op_uniq(a) & na = n_elements(ai)
330              bi = cmset_op_uniq(b) & nb = n_elements(bi)
331              ui = [ai, bi+n1]
332              uu = [a,b]    & uu = uu[ui]  ;; Raw union...
333              us = sort(uu) & uu = uu[us]  ;; ...and sort
334              if kind then ui = ui[us] else ui = 0
335
336              if NOT keyword_set(not1) AND NOT keyword_set(not2) then begin
337
338                  ;; Special case: if there are one in each set, and
339                  ;; they are equal, then the SHIFT() technique below
340                  ;; fails.  Do this one by hand.
341                  if na EQ 1 AND nb EQ 1 AND uu[0] EQ uu[1] then begin
342                      count = 1L
343                      if kind then return, 0L
344                      return, [uu[0]]
345                  endif
346
347                  ;; If neither "NOT" is set, then find duplicates
348                  us = 0L  ;; Save memory
349                  wh = where(uu EQ shift(uu, -1L), count) ;; Find non-unique
350                  if count EQ 0 then return, -1L
351                  ;; This should always select the element from A
352                  ;; rather than B (the smaller of the two)
353                  if kind then return, (ui[wh] < ui[wh+1])
354                  return, uu[wh]
355              endif
356
357              ;; For "NOT" cases, we need to identify by set
358              ii = make_array(na+nb, value=1b)
359              if keyword_set(not1) then ii[0:na-1] = 0
360              if keyword_set(not2) then ii[na:*]   = 0
361              ii = ii[temporary(us)]
362
363              ;; Remove any duplicates
364              wh1 = where(uu EQ shift(uu, -1L), count1) ;; Find non-unique
365              if count1 GT 0 then ii[wh1, wh1+1] = 0
366              ;; Remainder is the desired set
367              wh = where(ii, count)
368              if count EQ 0 then return, -1L
369              if kind then return, ui[wh]
370              return, uu[wh]
371          end
372
373      endcase
374      return, -1L  ;; DEFAULT CASE
375  endif else begin
376
377      ;; INDEX keyword forces the "slow" operation
378      if kind then goto, SLOW_SET_OP
379
380      ;; Integer types - use histogram technique if the data range
381      ;; is small enough, otherwise use the "slow" technique above
382      min1 = min(a, max=max1) & min2 = min(b, max=max2)
383      minn = min1 < min2 & maxx = max1 > max2
384      nbins = maxx-minn+1
385      if (maxx-minn) GT floor(ma[0]) then goto, SLOW_SET_OP
386
387      ;; Work around a stupidity in the built-in IDL HISTOGRAM routine
388      if (tp1 EQ 2 OR tp2 EQ 2) AND (minn LT -32768 OR maxx GT 32767) then $
389        goto, SLOW_SET_OP
390
391      ;; Following operations create a histogram of the integer values.
392      ha = histogram(a, min=minn, max=maxx) < 1
393      hb = histogram(b, min=minn, max=maxx) < 1
394
395      ;; Compute NOT cases
396      if keyword_set(not1) then ha = 1b - ha 
397      if keyword_set(not2) then hb = 1b - hb
398      case op of
399          ;; Boolean operations
400          'AND': mask = temporary(ha) AND temporary(hb)
401           'OR': mask = temporary(ha)  OR temporary(hb)
402          'XOR': mask = temporary(ha) XOR temporary(hb)
403      endcase
404
405      wh = where(temporary(mask), count)
406      if count EQ 0 then return, -1L
407     
408      result = temporary(wh+minn)
409      if tp1 NE tp2 then return, result
410      szr = size(result) & tpr = szr[szr[0]+1]
411
412      ;; Cast to the original type if necessary
413      if tpr NE tp1 then begin
414          fresult = make_array(n_elements(result), type=tp1)
415          fresult[0] = temporary(result)
416          result = temporary(fresult)
417      endif
418
419      return, result
420
421  endelse
422
423  return, -1L  ;; DEFAULT CASE
424end
425
426;     Here is how I did the INDEX stuff with fast histogramming.  It
427;     works, but is complicated, so I forced it to go to SLOW_SET_OP.
428;     ha = histogram(a, min=minn, max=maxx, reverse=ra) < 1
429;     rr = ra[0:nbins] & mask = rr NE rr[1:*] & ra = ra[rr]*mask-1L+mask
430;     hb = histogram(b, min=minn, max=maxx, reverse=rb) < 1
431;     rr = rb[0:nbins] & mask = rr NE rr[1:*] & rb = rb[rr]*mask-1L+mask
432;     ...  AND/OR/XOR NOT masking here ...
433;     ra = ra[wh] & rb = rb[wh]
434;     return, ra*(ra GE 0) + (rb+n1)*(ra LT 0) ;; is last 'ra' right?
435
Note: See TracBrowser for help on using the repository browser.