source: trunk/SRC/Matrix/cmset_op.pro @ 378

Last change on this file since 378 was 372, checked in by pinsard, 16 years ago

improvements of headers (alignments)

  • Property svn:keywords set to Id
File size: 16.0 KB
Line 
1;+
2;
3; @hidden
4;
5; @file_comments
6; Simplified version of CMSET_OP_UNIQ which sorts, and takes the
7; "first" value, whatever that may mean.
8;
9; @todo
10; seb
11;
12;-
13FUNCTION cmset_op_uniq, a
14;
15  compile_opt idl2, strictarrsubs
16;
17  if n_elements(a) LE 1 then return, 0L
18
19  ii = sort(a) & b = a[ii]
20  wh = where(b NE shift(b, +1L), ct)
21  if ct GT 0 then return, ii[wh]
22
23  return, 0L
24;
25end
26;+
27;
28; @file_comments
29; Performs an AND, OR, or XOR operation between two sets
30;
31; Description: SET_OP performs three common operations between two sets. The
32; three supported functions of OP are:
33;
34;        OP          Meaning
35;      'AND' - to find the intersection of A and B;
36;      'OR'  - to find the union of A and B;
37;      'XOR' - to find the those elements who are members of A or B
38;              but not both;
39;
40;   Sets as defined here is one dimensional array composed of
41;   numeric or string types. Comparisons of equality between elements
42;   are done using the IDL EQ operator.
43;
44;   The complements of either set can be taken as well, by using the
45;   NOT1 and NOT2 keywords. For example, it may be desirable to find
46;   the elements in A but not B, or B but not A (they are different!).
47;   The following IDL expressions achieve each of those effects:
48;
49;      SET = CMSET_OP(A, 'AND', /NOT2, B)   ; A but not B
50;      SET = CMSET_OP(/NOT1, A, 'AND', B)   ; B but not A
51;
52;   Note the distinction between NOT1 and NOT2.  NOT1 refers to the
53;   first set (A) and NOT2 refers to the second (B).  Their ordered
54;   placement in the calling sequence is entirely optional, but the
55;   above ordering makes the logical meaning explicit.
56;
57;   NOT1 and NOT2 can only be set for the 'AND' operator, and never
58;   simultaneously. This is because the results of an operation with
59;   'OR' or 'XOR' and any combination of NOTs -- or with 'AND' and
60;   both NOTs -- formally cannot produce a defined result.
61;
62;   The implementation depends on the type of operands. For integer
63;   types, a fast technique using HISTOGRAM is used. However, this
64;   algorithm becomes inefficient when the dynamic range in the data
65;   is large. For those cases, and for other data types, a technique
66;   based on SORT() is used. Thus the compute time should scale
67;   roughly as (A+B)*ALOG(A+B) or better, rather than (A*B) for the
68;   brute force approach. For large arrays this is a significant
69;   benefit.
70;
71; @categories
72; Array
73;
74; @param A {in}{required}
75; The two sets to be operated on. A one dimensional array of
76; either numeric or string type. A and B must be of the same
77; type. Empty sets are permitted, and are either represented
78; as an undefined variable, or by setting EMPTY1 or EMPTY2.
79;
80; @param B {in}{required}
81; See A
82;
83; @param OP0 {in}{required}{type=string}
84; a string, the operation to be performed. Must be one of
85; 'AND', 'OR' or 'XOR' (lower or mixed case is permitted).
86; Other operations will cause an error message to be produced.
87;
88; @keyword NOT1
89; If set and OP is 'AND', then the complement of A (for
90; NOT1) or B (for NOT2) will be used in the operation.
91; NOT1 and NOT2 cannot be set simultaneously.
92;
93; @keyword NOT2
94; See NOT1
95;
96; @keyword EMPTY1
97; If set, then A (for EMPTY1) or B (for EMPTY2) are
98; assumed to be the empty set. The actual values
99; passed as A or B are then ignored.
100;
101; @keyword EMPTY2
102; See EMPTY1
103;
104; @keyword INDEX
105; if set, then return a list of indexes instead of the array
106; values themselves. The "slower" set operations are always
107; performed in this case.
108;
109; The indexes refer to the *combined* array [A,B]. To
110; clarify, in the following call: I = CMSET_OP(..., /INDEX);
111; returned values from 0 to NA-1 refer to A[I], and values
112; from NA to NA+NB-1 refer to B[I-NA].
113;
114; @keyword COUNT
115; upon return, the number of elements in the result set.
116; This is only important when the result set is the empty
117; set, in which case COUNT is set to zero.
118;
119; @returns
120; The resulting set as a one-dimensional array. The set may be
121; represented by either an array of data values (default), or an
122; array of indexes (if INDEX is set). Duplicate elements, if any,
123; are removed, and element order may not be preserved.
124;
125; The empty set is represented as a return value of -1L, and COUNT
126; is set to zero. Note that the only way to recognize the empty set
127; is to examine COUNT.
128;
129; SEE ALSO:
130;
131;  SET_UTILS.PRO by RSI
132;
133; @history
134; Written, CM, 23 Feb 2000
135;   Added empty set capability, CM, 25 Feb 2000
136;   Documentation clarification, CM 02 Mar 2000
137;   Incompatible but more consistent reworking of EMPTY keywords, CM,
138;     04 Mar 2000
139;   Minor documentation clarifications, CM, 26 Mar 2000
140;   Corrected bug in empty_arg special case, CM 06 Apr 2000
141;   Add INDEX keyword, CM 31 Jul 2000
142;   Clarify INDEX keyword documentation, CM 06 Sep 2000
143;   Made INDEX keyword always force SLOW_SET_OP, CM 06 Sep 2000
144;   Added CMSET_OP_UNIQ, and ability to select FIRST_UNIQUE or
145;     LAST_UNIQUE values, CM, 18 Sep 2000
146;   Removed FIRST_UNIQUE and LAST_UNIQUE, and streamlined
147;     CMSET_OP_UNIQ until problems with SORT can be understood, CM, 20
148;     Sep 2000 (thanks to Ben Tupper)
149;   Still trying to get documentation of INDEX and NOT right, CM, 28
150;     Sep 2000 (no code changes)
151;   Correct bug for AND case, when input sets A and B each only have
152;     one unique value, and the values are equal.  CM, 04 Mar 2004
153;     (thanks to James B. jbattat at cfa dot harvard dot edu)
154;   Add support for the cases where the input data types are mixed,
155;      but still compatible; also, attempt to return the same data
156;      type that was passed in; CM, 05 Feb 2005
157;   Fix bug in type checking (thanks to "marit"), CM, 10 Dec 2005
158;   Work around a stupidity in the built-in IDL HISTOGRAM routine,
159;      which tries to "help" you by restricting the MIN/MAX to the
160;      range of the input variable (thanks to Will Maddox), CM, 16 Jan 2006
161;
162;   Author: Craig B. Markwardt, NASA/GSFC Code 662, Greenbelt, MD 20770
163;   craigm\@lheamail.gsfc.nasa.gov
164;
165; @version
166; $Id$
167;
168; @examples
169; Utility function, similar to UNIQ, but allowing choice of taking
170; first or last unique element, or non-unique elements.
171; Unfortunately this doesn't work because of implementation dependent
172; versions of the SORT() function.
173;
174; function cmset_op_uniq, a, first=first, non=non, count=ct, sort=sortit
175;   if n_elements(a) LE 1 then return, 0L
176;   sh = (2L*keyword_set(first)-1L)*(-2L*keyword_set(non)+1)
177;
178;   if keyword_set(sortit) then begin
179;       ;; Sort it manually
180;       ii = sort(a) & b = a[ii]
181;       if keyword_set(non) then wh = where(b EQ shift(b, sh), ct) $
182;       else                     wh = where(b NE shift(b, sh), ct)
183;       if ct GT 0 then return, ii[wh]
184;   endif else begin
185;       ;; Use the user's values directly
186;       if keyword_set(non) then wh = where(a EQ shift(a, sh), ct) $
187;       else                     wh = where(a NE shift(a, sh), ct)
188;       if ct GT 0 then return, wh
189;   endelse
190;
191;   if keyword_set(first) then return, 0L else return, n_elements(a)-1
192; end
193
194; Simplified version of CMSET_OP_UNIQ which sorts, and takes the
195; "first" value, whatever that may mean.
196;
197;-
198FUNCTION cmset_op, a, op0, b, NOT1=not1, NOT2=not2, COUNT=count, $
199              EMPTY1=empty1, EMPTY2=empty2, MAXARRAY=ma, INDEX=index
200;
201  compile_opt idl2, strictarrsubs
202;
203
204  on_error, 2 ;; return on error
205  count = 0L
206  index0 = -1L
207  ;; Histogram technique is used for array sizes < 32,000 elements
208  if n_elements(ma) EQ 0 then ma = 32L*1024L
209
210  ;; Check the number of arguments
211  if n_params() LT 3 then begin
212      ARG_ERR:
213      message, 'USAGE: SET = CMSET_OP(A, OP, B [, COUNT=ct])', /info
214      message, '  KEYWORDS: /NOT1, /NOT2, /EMPTY1, /EMPTY2, INDEX', /info
215      return, -1L
216  endif
217  if n_elements(op0) EQ 0 then goto, ARG_ERR
218  kind = keyword_set(index)
219  fst = 1L
220  if keyword_set(last) then fst = 0L
221  if keyword_set(first) then fst = 1L
222
223  ;; Check the operation
224  sz = size(op0)
225  if sz[sz[0]+1] NE 7 then begin
226      OP_ERR:
227      message, "ERROR: OP must be 'AND', 'OR' or 'XOR'"
228      return, -1L
229  endif
230  op = strupcase(op0)
231  if op NE 'AND' AND op NE 'OR' AND op NE 'XOR' then goto, OP_ERR
232
233  ;; Check NOT1 and NOT2
234  if keyword_set(not1) AND keyword_set(not2) then begin
235      message, "ERROR: NOT1 and NOT2 cannot be set simultaneously"
236      return, -1L
237  endif
238  if (keyword_set(not1) OR keyword_set(not2)) AND $
239    (op EQ 'OR' OR op EQ 'XOR') then begin
240      message, "ERROR: NOT1 and NOT2 cannot be set with 'OR' or 'XOR'"
241      return, -1L
242  endif
243
244  ;; Special cases for empty set
245  n1 = n_elements(a) & n2 = n_elements(b)
246  if keyword_set(empty1) then n1 = 0L
247  if keyword_set(empty2) then n2 = 0L
248  if n1 EQ 0 OR n2 EQ 0 then begin
249      ;; Eliminate duplicates
250      if n1 GT 0 then a1 = cmset_op_uniq(a)
251      if n2 GT 0 then b1 = cmset_op_uniq(b)
252      n1 = n_elements(a1) < n1 & n2 = n_elements(b1) < n2
253      case op of
254          'OR': if n1 EQ 0 then goto, RET_A1 else goto, RET_B1
255         'XOR': if n1 EQ 0 then goto, RET_B1 else goto, RET_A1
256         'AND': begin
257             if keyword_set(not1) AND n1 EQ 0 then goto, RET_B1
258             if keyword_set(not2) AND n2 EQ 0 then goto, RET_A1
259             return, -1L
260         end
261     endcase
262     return, -1L
263     RET_A1:
264     count = n1
265     if kind then begin
266         if count GT 0 then return, a1 else return, -1L
267     endif
268     if count GT 0 then return, a[a1] else return, -1L
269     RET_B1:
270     count = n2
271     if kind then begin
272         if count GT 0 then return, b1+n1 else return, -1L
273     endif
274     if count GT 0 then return, b[b1] else return, -1L
275 endif
276
277  ;; Allow data to have different types, but they must be at least of
278  ;; the same "base" type.  That is, you can't combine a number with a
279  ;; string, etc.
280  ;; basetype 0:undefined 1:real number 6:complex number 7:string
281  ;;     8:structure 10:pointer 11:object
282
283  ;;          0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
284  basetype = [0, 1, 1, 1, 1, 1, 6, 7, 8, 6,10,11, 1, 1, 1, 1]
285
286  ;; Check types of operands
287  sz1 = size(a) & tp1 = sz1[sz1[0]+1]
288  sz2 = size(b) & tp2 = sz2[sz2[0]+1]
289  if tp1 LT 0 OR tp1 GE 16 OR tp2 LT 0 OR tp2 GE 16 then begin
290      message, 'ERROR: unrecognized data types for operands'
291      return, -1
292  endif
293  if basetype[tp1] NE basetype[tp2] then begin
294      TYPE1_ERR:
295      message, 'ERROR: both A and B must be of the same type'
296      return, -1L
297  endif
298  if tp1 EQ 8 OR tp1 EQ 10 OR tp1 EQ 11 then begin
299      TYPE2_ERR:
300      message, 'ERROR: operands must be a numeric or string type'
301      return, -1L
302  endif
303
304  ;; Now use two different kinds of algorithms: a slower but more
305  ;; general algorithm for generic types, and the histogram technique
306  ;; for integer types.  Even for integer types, if there is too much
307  ;; dynamic range, then the slow method is used.
308
309  if tp1 GE 4 AND tp1 LE 9 then begin
310      ;; String and real types, or large int arrays
311      SLOW_SET_OP:
312      case op of
313          'OR': begin
314              uu = [a,b]    ;; OR is simple; just take unique values
315              index0 = cmset_op_uniq(uu)
316              count = n_elements(index0)
317              if kind then return, index0
318              return, uu[index0]
319          end
320
321          'XOR': begin
322              ;; Make ordered list of set union
323              ai = cmset_op_uniq(a) & na = n_elements(ai)
324              bi = cmset_op_uniq(b) & nb = n_elements(bi)
325              ui = [ai, bi+n1]
326              uu = [a,b]    & uu = uu[ui] ;; Raw union...
327              us = sort(uu) & uu = uu[us] ;; ...and sort
328              if kind then ui = ui[temporary(us)] else ui = 0
329
330              ;; Values in one set only will not have duplicates
331              wh1 = where(uu NE shift(uu, -1), count1)
332              if count1 EQ 0 then return, -1L
333              wh = where(wh1[1:*]-wh1 EQ 1, count)
334              if wh1[0] EQ 0 then begin
335                  if count GT 0 then wh = [-1L, wh] else wh = [-1L]
336                  count = n_elements(wh)
337              endif
338              if count EQ 0 then return, -1
339              if kind then return, ui[wh1[wh+1]]
340              return, uu[wh1[wh+1]]
341          end
342
343          'AND': begin
344              ;; Make ordered list of set union
345              ai = cmset_op_uniq(a) & na = n_elements(ai)
346              bi = cmset_op_uniq(b) & nb = n_elements(bi)
347              ui = [ai, bi+n1]
348              uu = [a,b]    & uu = uu[ui]  ;; Raw union...
349              us = sort(uu) & uu = uu[us]  ;; ...and sort
350              if kind then ui = ui[us] else ui = 0
351
352              if NOT keyword_set(not1) AND NOT keyword_set(not2) then begin
353
354                  ;; Special case: if there is one in each set, and
355                  ;; they are equal, then the SHIFT() technique below
356                  ;; fails.  Do this one by hand.
357                  if na EQ 1 AND nb EQ 1 AND uu[0] EQ uu[1] then begin
358                      count = 1L
359                      if kind then return, 0L
360                      return, [uu[0]]
361                  endif
362
363                  ;; If neither "NOT" is set, then find duplicates
364                  us = 0L  ;; Save memory
365                  wh = where(uu EQ shift(uu, -1L), count) ;; Find non-unique
366                  if count EQ 0 then return, -1L
367                  ;; This should always select the element from A
368                  ;; rather than B (the smaller of the two)
369                  if kind then return, (ui[wh] < ui[wh+1])
370                  return, uu[wh]
371              endif
372
373              ;; For "NOT" cases, we need to identify by set
374              ii = make_array(na+nb, value=1b)
375              if keyword_set(not1) then ii[0:na-1] = 0
376              if keyword_set(not2) then ii[na:*]   = 0
377              ii = ii[temporary(us)]
378
379              ;; Remove any duplicates
380              wh1 = where(uu EQ shift(uu, -1L), count1) ;; Find non-unique
381              if count1 GT 0 then ii[wh1, wh1+1] = 0
382              ;; Remainder is the desired set
383              wh = where(ii, count)
384              if count EQ 0 then return, -1L
385              if kind then return, ui[wh]
386              return, uu[wh]
387          end
388
389      endcase
390      return, -1L  ;; DEFAULT CASE
391  endif else begin
392
393      ;; INDEX keyword forces the "slow" operation
394      if kind then goto, SLOW_SET_OP
395
396      ;; Integer types - use histogram technique if the data range
397      ;; is small enough, otherwise use the "slow" technique above
398      min1 = min(a, max=max1) & min2 = min(b, max=max2)
399      minn = min1 < min2 & maxx = max1 > max2
400      nbins = maxx-minn+1
401      if (maxx-minn) GT floor(ma[0]) then goto, SLOW_SET_OP
402
403      ;; Work around a stupidity in the built-in IDL HISTOGRAM routine
404      if (tp1 EQ 2 OR tp2 EQ 2) AND (minn LT -32768 OR maxx GT 32767) then $
405        goto, SLOW_SET_OP
406
407      ;; Following operations create a histogram of the integer values.
408      ha = histogram(a, min=minn, max=maxx) < 1
409      hb = histogram(b, min=minn, max=maxx) < 1
410
411      ;; Compute NOT cases
412      if keyword_set(not1) then ha = 1b - ha
413      if keyword_set(not2) then hb = 1b - hb
414      case op of
415          ;; Boolean operations
416          'AND': mask = temporary(ha) AND temporary(hb)
417           'OR': mask = temporary(ha)  OR temporary(hb)
418          'XOR': mask = temporary(ha) XOR temporary(hb)
419      endcase
420
421      wh = where(temporary(mask), count)
422      if count EQ 0 then return, -1L
423
424      result = temporary(wh+minn)
425      if tp1 NE tp2 then return, result
426      szr = size(result) & tpr = szr[szr[0]+1]
427
428      ;; Cast to the original type if necessary
429      if tpr NE tp1 then begin
430          fresult = make_array(n_elements(result), type=tp1)
431          fresult[0] = temporary(result)
432          result = temporary(fresult)
433      endif
434
435      return, result
436
437  endelse
438
439  return, -1L  ;; DEFAULT CASE
440end
441
442;     Here is how I did the INDEX stuff with fast histogramming.  It
443;     works, but is complicated, so I forced it to go to SLOW_SET_OP.
444;     ha = histogram(a, min=minn, max=maxx, reverse=ra) < 1
445;     rr = ra[0:nbins] & mask = rr NE rr[1:*] & ra = ra[rr]*mask-1L+mask
446;     hb = histogram(b, min=minn, max=maxx, reverse=rb) < 1
447;     rr = rb[0:nbins] & mask = rr NE rr[1:*] & rb = rb[rr]*mask-1L+mask
448;     ...  AND/OR/XOR NOT masking here ...
449;     ra = ra[wh] & rb = rb[wh]
450;     return, ra*(ra GE 0) + (rb+n1)*(ra LT 0) ;; is last 'ra' right?
451
Note: See TracBrowser for help on using the repository browser.