1 | /* |
2 | * This Source Code Form is subject to the terms of the Mozilla Public |
3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5 | * |
6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
7 | */ |
8 | |
9 | /* |
10 | * In this file we have a number of functions that search a column |
11 | * using binary search. The column must be sorted or reverse sorted |
12 | * (for the SORTfnd* functions), or there must be an order index (for |
13 | * the ORDERfnd* functions). |
14 | * |
15 | * All functions return a BUN, i.e. an offset from the start of the |
16 | * column. The SORTfnd and ORDERfnd functions return BUN_NONE if the |
17 | * value does not occur in the column. |
18 | * |
19 | * The ORDERfnd* functions return an offset in the order index, so to |
20 | * get the actual position, (the OID, not the BUN), read the order |
21 | * index at that offset. |
22 | * |
23 | * The *fndfirst functions return the BUN of the *first* value in the |
24 | * column that is greater (less) than or equal to the value being |
25 | * searched (when the column is sorted in ascending (descending) |
26 | * order). |
27 | * |
28 | * The *fndlast functions return the BUN of the *first* value in the |
29 | * column that is greater (less) than the value being searched (when |
30 | * the column is sorted in ascending (descending) order). |
31 | * |
32 | * If the value to be found occurs, the following relationship holds: |
33 | * |
34 | * SORTfndfirst(b, v) <= SORTfnd(b, v) < SORTfndlast(b, v) |
35 | * ORDERfndfirst(b, v) <= ORDERfnd(b, v) < ORDERfndlast(b, v) |
36 | * |
37 | * and the range from *fndfirst (included) up to *fndlast (not |
38 | * included) are all values in the column that are equal to the value. |
39 | * |
40 | * If the value to be found does not occur, SORTfnd and ORDERfnd |
41 | * return BUN_NONE, and the other functions return the location of the |
42 | * next larger value, or BATcount if the value being searched for is |
43 | * larger (smaller if reverse sorted) than any in the column. |
44 | * |
45 | * Note that the NIL value is considered smaller than all other values. |
46 | */ |
47 | |
48 | #include "monetdb_config.h" |
49 | #include "gdk.h" |
50 | #include "gdk_private.h" |
51 | |
52 | #define VALUE(x) (vars ? \ |
53 | vars + VarHeapVal(vals, (x), width) : \ |
54 | (const char *) vals + ((x) * width)) |
55 | |
56 | #define bte_LT(a, b) ((a) < (b)) |
57 | #define bte_LE(a, b) ((a) <= (b)) |
58 | #define bte_GT(a, b) ((a) > (b)) |
59 | #define bte_GE(a, b) ((a) >= (b)) |
60 | #define bte_EQ(a, b) ((a) == (b)) |
61 | #define sht_LT(a, b) ((a) < (b)) |
62 | #define sht_LE(a, b) ((a) <= (b)) |
63 | #define sht_GT(a, b) ((a) > (b)) |
64 | #define sht_GE(a, b) ((a) >= (b)) |
65 | #define sht_EQ(a, b) ((a) == (b)) |
66 | #define int_LT(a, b) ((a) < (b)) |
67 | #define int_LE(a, b) ((a) <= (b)) |
68 | #define int_GT(a, b) ((a) > (b)) |
69 | #define int_GE(a, b) ((a) >= (b)) |
70 | #define int_EQ(a, b) ((a) == (b)) |
71 | #define lng_LT(a, b) ((a) < (b)) |
72 | #define lng_LE(a, b) ((a) <= (b)) |
73 | #define lng_GT(a, b) ((a) > (b)) |
74 | #define lng_GE(a, b) ((a) >= (b)) |
75 | #define lng_EQ(a, b) ((a) == (b)) |
76 | #ifdef HAVE_HGE |
77 | #define hge_LT(a, b) ((a) < (b)) |
78 | #define hge_LE(a, b) ((a) <= (b)) |
79 | #define hge_GT(a, b) ((a) > (b)) |
80 | #define hge_GE(a, b) ((a) >= (b)) |
81 | #define hge_EQ(a, b) ((a) == (b)) |
82 | #endif |
83 | #define flt_LT(a, b) (!is_flt_nil(b) && (is_flt_nil(a) || (a) < (b))) |
84 | #define flt_LE(a, b) (is_flt_nil(a) || (!is_flt_nil(b) && (a) <= (b))) |
85 | #define flt_GT(a, b) (!is_flt_nil(a) && (is_flt_nil(b) || (a) > (b))) |
86 | #define flt_GE(a, b) (is_flt_nil(b) || (!is_flt_nil(a) && (a) >= (b))) |
87 | #define flt_EQ(a, b) (is_flt_nil(a) ? is_flt_nil(b) : !is_flt_nil(b) && (a) == (b)) |
88 | #define dbl_LT(a, b) (!is_dbl_nil(b) && (is_dbl_nil(a) || (a) < (b))) |
89 | #define dbl_LE(a, b) (is_dbl_nil(a) || (!is_dbl_nil(b) && (a) <= (b))) |
90 | #define dbl_GT(a, b) (!is_dbl_nil(a) && (is_dbl_nil(b) || (a) > (b))) |
91 | #define dbl_GE(a, b) (is_dbl_nil(b) || (!is_dbl_nil(a) && (a) >= (b))) |
92 | #define dbl_EQ(a, b) (is_dbl_nil(a) ? is_dbl_nil(b) : !is_dbl_nil(b) && (a) == (b)) |
93 | |
94 | #define BINSEARCHFUNC(TYPE) \ |
95 | BUN \ |
96 | binsearch_##TYPE(const oid *restrict indir, oid offset, \ |
97 | const TYPE *restrict vals, BUN lo, BUN hi, TYPE v, \ |
98 | int ordering, int last) \ |
99 | { \ |
100 | BUN mid; \ |
101 | TYPE x; \ |
102 | \ |
103 | assert(ordering == 1 || ordering == -1); \ |
104 | assert(lo <= hi); \ |
105 | \ |
106 | if (ordering > 0) { \ |
107 | if (indir) { \ |
108 | if (last > 0) { \ |
109 | x = vals[indir[lo] - offset]; \ |
110 | if (TYPE##_GT(x, v)) \ |
111 | return lo; \ |
112 | x = vals[indir[hi] - offset]; \ |
113 | if (TYPE##_LE(x, v)) \ |
114 | return hi + 1; \ |
115 | \ |
116 | /* loop invariant: */ \ |
117 | /* value@lo <= v < value@hi */ \ |
118 | while (hi - lo > 1) { \ |
119 | mid = (hi + lo) / 2; \ |
120 | x = vals[indir[mid] - offset]; \ |
121 | if (TYPE##_GT(x, v)) \ |
122 | hi = mid; \ |
123 | else \ |
124 | lo = mid; \ |
125 | } \ |
126 | } else { \ |
127 | x = vals[indir[lo] - offset]; \ |
128 | if (TYPE##_GE(x, v)) \ |
129 | return last == 0 || TYPE##_EQ(x, v) ? lo : BUN_NONE; \ |
130 | x = vals[indir[hi] - offset]; \ |
131 | if (TYPE##_LT(x, v)) \ |
132 | return last == 0 ? hi + 1 : BUN_NONE; \ |
133 | \ |
134 | /* loop invariant: */ \ |
135 | /* value@lo < v <= value@hi */ \ |
136 | while (hi - lo > 1) { \ |
137 | mid = (hi + lo) / 2; \ |
138 | x = vals[indir[mid] - offset]; \ |
139 | if (TYPE##_GE(x, v)) \ |
140 | hi = mid; \ |
141 | else \ |
142 | lo = mid; \ |
143 | } \ |
144 | } \ |
145 | } else { \ |
146 | if (last > 0) { \ |
147 | x = vals[lo]; \ |
148 | if (TYPE##_GT(x, v)) \ |
149 | return lo; \ |
150 | x = vals[hi]; \ |
151 | if (TYPE##_LE(x, v)) \ |
152 | return hi + 1; \ |
153 | \ |
154 | /* loop invariant: */ \ |
155 | /* value@lo <= v < value@hi */ \ |
156 | while (hi - lo > 1) { \ |
157 | mid = (hi + lo) / 2; \ |
158 | x = vals[mid]; \ |
159 | if (TYPE##_GT(x, v)) \ |
160 | hi = mid; \ |
161 | else \ |
162 | lo = mid; \ |
163 | } \ |
164 | } else { \ |
165 | x = vals[lo]; \ |
166 | if (TYPE##_GE(x, v)) \ |
167 | return last == 0 || TYPE##_EQ(x, v) ? lo : BUN_NONE; \ |
168 | x = vals[hi]; \ |
169 | if (TYPE##_LT(x, v)) \ |
170 | return last == 0 ? hi + 1 : BUN_NONE; \ |
171 | \ |
172 | /* loop invariant: */ \ |
173 | /* value@lo < v <= value@hi */ \ |
174 | while (hi - lo > 1) { \ |
175 | mid = (hi + lo) / 2; \ |
176 | x = vals[mid]; \ |
177 | if (TYPE##_GE(x, v)) \ |
178 | hi = mid; \ |
179 | else \ |
180 | lo = mid; \ |
181 | } \ |
182 | } \ |
183 | } \ |
184 | } else { \ |
185 | if (indir) { \ |
186 | if (last > 0) { \ |
187 | x = vals[indir[lo] - offset]; \ |
188 | if (TYPE##_LT(x, v)) \ |
189 | return lo; \ |
190 | x = vals[indir[hi] - offset]; \ |
191 | if (TYPE##_GE(x, v)) \ |
192 | return hi + 1; \ |
193 | \ |
194 | /* loop invariant: */ \ |
195 | /* value@lo >= v > value@hi */ \ |
196 | while (hi - lo > 1) { \ |
197 | mid = (hi + lo) / 2; \ |
198 | x = vals[indir[mid] - offset]; \ |
199 | if (TYPE##_LT(x, v)) \ |
200 | hi = mid; \ |
201 | else \ |
202 | lo = mid; \ |
203 | } \ |
204 | } else { \ |
205 | x = vals[indir[lo] - offset]; \ |
206 | if (TYPE##_LE(x, v)) \ |
207 | return last == 0 || TYPE##_EQ(x, v) ? lo : BUN_NONE; \ |
208 | x = vals[indir[hi] - offset]; \ |
209 | if (TYPE##_GT(x, v)) \ |
210 | return last == 0 ? hi + 1 : BUN_NONE; \ |
211 | \ |
212 | /* loop invariant: */ \ |
213 | /* value@lo > v >= value@hi */ \ |
214 | while (hi - lo > 1) { \ |
215 | mid = (hi + lo) / 2; \ |
216 | x = vals[indir[mid] - offset]; \ |
217 | if (TYPE##_LE(x, v)) \ |
218 | hi = mid; \ |
219 | else \ |
220 | lo = mid; \ |
221 | } \ |
222 | } \ |
223 | } else { \ |
224 | if (last > 0) { \ |
225 | x = vals[lo]; \ |
226 | if (TYPE##_LT(x, v)) \ |
227 | return lo; \ |
228 | x = vals[hi]; \ |
229 | if (TYPE##_GE(x, v)) \ |
230 | return hi + 1; \ |
231 | \ |
232 | /* loop invariant: */ \ |
233 | /* value@lo >= v > value@hi */ \ |
234 | while (hi - lo > 1) { \ |
235 | mid = (hi + lo) / 2; \ |
236 | x = vals[mid]; \ |
237 | if (TYPE##_LT(x, v)) \ |
238 | hi = mid; \ |
239 | else \ |
240 | lo = mid; \ |
241 | } \ |
242 | } else { \ |
243 | x = vals[lo]; \ |
244 | if (TYPE##_LE(x, v)) \ |
245 | return last == 0 || TYPE##_EQ(x, v) ? lo : BUN_NONE; \ |
246 | x = vals[hi]; \ |
247 | if (TYPE##_GT(x, v)) \ |
248 | return last == 0 ? hi + 1 : BUN_NONE; \ |
249 | \ |
250 | /* loop invariant: */ \ |
251 | /* value@lo > v >= value@hi */ \ |
252 | while (hi - lo > 1) { \ |
253 | mid = (hi + lo) / 2; \ |
254 | x = vals[mid]; \ |
255 | if (TYPE##_LE(x, v)) \ |
256 | hi = mid; \ |
257 | else \ |
258 | lo = mid; \ |
259 | } \ |
260 | } \ |
261 | } \ |
262 | } \ |
263 | return last >= 0 || (x = (indir ? vals[indir[hi] - offset] : vals[hi]), TYPE##_EQ(x, v)) ? hi : BUN_NONE; \ |
264 | } |
265 | |
266 | BINSEARCHFUNC(bte) |
267 | BINSEARCHFUNC(sht) |
268 | BINSEARCHFUNC(int) |
269 | BINSEARCHFUNC(lng) |
270 | #ifdef HAVE_HGE |
271 | BINSEARCHFUNC(hge) |
272 | #endif |
273 | BINSEARCHFUNC(flt) |
274 | BINSEARCHFUNC(dbl) |
275 | |
276 | /* Do a binary search for the first/last occurrence of v between lo and hi |
277 | * (lo inclusive, hi not inclusive) in vals/vars. |
278 | * If last > 0, return the index of the first value > v; if last == 0, |
279 | * return the index of the first value >= v; if last < 0, return |
280 | * BUN_NONE if v does not occur, any BUN where v occurs otherwise. |
281 | * If ordering is -1, the values are sorted in reverse order and hence |
282 | * all comparisons are reversed. |
283 | */ |
284 | BUN |
285 | binsearch(const oid *restrict indir, oid offset, |
286 | int type, const void *restrict vals, const char * restrict vars, |
287 | int width, BUN lo, BUN hi, const void *restrict v, |
288 | int ordering, int last) |
289 | { |
290 | BUN mid; |
291 | int c; |
292 | int (*cmp)(const void *, const void *); |
293 | |
294 | assert(ordering == 1 || ordering == -1); |
295 | assert(lo < hi); |
296 | |
297 | --hi; /* now hi is inclusive */ |
298 | |
299 | switch (ATOMbasetype(type)) { |
300 | /* TYPE_oid is covered by TYPE_int/TYPE_lng */ |
301 | case TYPE_bte: |
302 | return binsearch_bte(indir, offset, (const bte *) vals, |
303 | lo, hi, *(const bte *) v, ordering, last); |
304 | case TYPE_sht: |
305 | return binsearch_sht(indir, offset, (const sht *) vals, |
306 | lo, hi, *(const sht *) v, ordering, last); |
307 | case TYPE_int: |
308 | return binsearch_int(indir, offset, (const int *) vals, |
309 | lo, hi, *(const int *) v, ordering, last); |
310 | case TYPE_lng: |
311 | return binsearch_lng(indir, offset, (const lng *) vals, |
312 | lo, hi, *(const lng *) v, ordering, last); |
313 | #ifdef HAVE_HGE |
314 | case TYPE_hge: |
315 | return binsearch_hge(indir, offset, (const hge *) vals, |
316 | lo, hi, *(const hge *) v, ordering, last); |
317 | #endif |
318 | case TYPE_flt: |
319 | return binsearch_flt(indir, offset, (const flt *) vals, |
320 | lo, hi, *(const flt *) v, ordering, last); |
321 | case TYPE_dbl: |
322 | return binsearch_dbl(indir, offset, (const dbl *) vals, |
323 | lo, hi, *(const dbl *) v, ordering, last); |
324 | } |
325 | |
326 | cmp = ATOMcompare(type); |
327 | |
328 | if (last > 0) { |
329 | if ((c = ordering * cmp(VALUE(indir ? indir[lo] - offset : lo), v)) > 0) |
330 | return lo; |
331 | if ((c = ordering * cmp(VALUE(indir ? indir[hi] - offset : hi), v)) <= 0) |
332 | return hi + 1; |
333 | } else if (last == 0) { |
334 | if ((c = ordering * cmp(VALUE(indir ? indir[lo] - offset : lo), v)) >= 0) |
335 | return lo; |
336 | if ((c = ordering * cmp(VALUE(indir ? indir[hi] - offset : hi), v)) < 0) |
337 | return hi + 1; |
338 | } else { |
339 | if ((c = ordering * cmp(VALUE(indir ? indir[lo] - offset : lo), v)) > 0) |
340 | return BUN_NONE; |
341 | if (c == 0) |
342 | return lo; |
343 | if ((c = ordering * cmp(VALUE(indir ? indir[hi] - offset : hi), v)) < 0) |
344 | return BUN_NONE; |
345 | if (c == 0) |
346 | return hi; |
347 | if (hi - lo <= 1) { |
348 | /* not the first, not the last, and nothing in |
349 | * between */ |
350 | return BUN_NONE; |
351 | } |
352 | } |
353 | |
354 | /* loop invariant: |
355 | * last ? value@lo <= v < value@hi : value@lo < v <= value@hi |
356 | * |
357 | * This version does some more work in the inner loop than the |
358 | * type-expanded versions (ordering and indir checks) but is |
359 | * slow due to the function call and the needed check for |
360 | * vars (in VALUE()) already, so we're beyond caring. */ |
361 | while (hi - lo > 1) { |
362 | mid = (hi + lo) / 2; |
363 | if ((c = ordering * cmp(VALUE(indir ? indir[mid] - offset : mid), v)) > 0 || |
364 | (last <= 0 && c == 0)) |
365 | hi = mid; |
366 | else |
367 | lo = mid; |
368 | } |
369 | return last >= 0 || cmp(VALUE(indir ? indir[hi] - offset : hi), v) == 0 ? hi : BUN_NONE; |
370 | } |
371 | |
372 | /* Return the BUN of any tail value in b that is equal to v; if no |
373 | * match is found, return BUN_NONE. b must be sorted (reverse or |
374 | * forward). */ |
375 | BUN |
376 | SORTfnd(BAT *b, const void *v) |
377 | { |
378 | if (BATcount(b) == 0) |
379 | return BUN_NONE; |
380 | if (BATtdense(b)) { |
381 | if (is_oid_nil(*(oid*)v) || |
382 | *(oid*)v < b->tseqbase || |
383 | *(oid*)v >= b->tseqbase + BATcount(b)) |
384 | return BUN_NONE; |
385 | return *(oid*)v - b->tseqbase; |
386 | } |
387 | if (b->ttype == TYPE_void) { |
388 | if (b->tvheap) { |
389 | struct canditer ci; |
390 | canditer_init(&ci, NULL, b); |
391 | return canditer_search(&ci, *(const oid*)v, false); |
392 | } |
393 | assert(is_oid_nil(b->tseqbase)); |
394 | if (is_oid_nil(*(const oid *) v)) |
395 | return 0; |
396 | return BUN_NONE; |
397 | } |
398 | return binsearch(NULL, 0, b->ttype, Tloc(b, 0), |
399 | b->tvheap ? b->tvheap->base : NULL, b->twidth, 0, |
400 | BATcount(b), v, b->tsorted ? 1 : -1, -1); |
401 | } |
402 | |
403 | /* use orderidx, returns BUN on order index */ |
404 | BUN |
405 | ORDERfnd(BAT *b, const void *v) |
406 | { |
407 | assert(b->torderidx); |
408 | if (BATcount(b) == 0) |
409 | return BUN_NONE; |
410 | return binsearch((oid *) b->torderidx->base + ORDERIDXOFF, 0, b->ttype, |
411 | Tloc(b, 0), b->tvheap ? b->tvheap->base : NULL, |
412 | b->twidth, 0, BATcount(b), v, 1, -1); |
413 | } |
414 | |
415 | /* Return the BUN of the first (lowest numbered) tail value that is |
416 | * equal to v; if no match is found, return the BUN of the next higher |
417 | * value in b. b must be sorted (reverse or forward). */ |
418 | BUN |
419 | SORTfndfirst(BAT *b, const void *v) |
420 | { |
421 | if (BATcount(b) == 0) |
422 | return 0; |
423 | if (BATtdense(b)) { |
424 | if (is_oid_nil(*(oid*)v) || *(oid*)v <= b->tseqbase) |
425 | return 0; |
426 | if (*(oid*)v >= b->tseqbase + BATcount(b)) |
427 | return BATcount(b); |
428 | return *(oid*)v - b->tseqbase; |
429 | } |
430 | if (b->ttype == TYPE_void) { |
431 | if (b->tvheap) { |
432 | struct canditer ci; |
433 | canditer_init(&ci, NULL, b); |
434 | return canditer_search(&ci, *(const oid*)v, true); |
435 | } |
436 | assert(is_oid_nil(b->tseqbase)); |
437 | return 0; |
438 | } |
439 | return binsearch(NULL, 0, b->ttype, Tloc(b, 0), |
440 | b->tvheap ? b->tvheap->base : NULL, b->twidth, 0, |
441 | BATcount(b), v, b->tsorted ? 1 : -1, 0); |
442 | } |
443 | |
444 | /* use orderidx, returns BUN on order index */ |
445 | BUN |
446 | ORDERfndfirst(BAT *b, const void *v) |
447 | { |
448 | assert(b->torderidx); |
449 | if (BATcount(b) == 0) |
450 | return 0; |
451 | return binsearch((oid *) b->torderidx->base + ORDERIDXOFF, 0, b->ttype, |
452 | Tloc(b, 0), b->tvheap ? b->tvheap->base : NULL, |
453 | b->twidth, 0, BATcount(b), v, 1, 0); |
454 | } |
455 | |
456 | /* Return the BUN of the first (lowest numbered) tail value beyond v. |
457 | * b must be sorted (reverse or forward). */ |
458 | BUN |
459 | SORTfndlast(BAT *b, const void *v) |
460 | { |
461 | if (BATcount(b) == 0) |
462 | return 0; |
463 | if (BATtdense(b)) { |
464 | if (is_oid_nil(*(oid*)v) || *(oid*)v <= b->tseqbase) |
465 | return 0; |
466 | if (*(oid*)v >= b->tseqbase + BATcount(b)) |
467 | return BATcount(b); |
468 | return *(oid*)v - b->tseqbase; |
469 | } |
470 | if (b->ttype == TYPE_void) { |
471 | if (b->tvheap) { |
472 | struct canditer ci; |
473 | canditer_init(&ci, NULL, b); |
474 | return canditer_search(&ci, *(const oid*)v + 1, true); |
475 | } |
476 | assert(is_oid_nil(b->tseqbase)); |
477 | return BATcount(b); |
478 | } |
479 | return binsearch(NULL, 0, b->ttype, Tloc(b, 0), |
480 | b->tvheap ? b->tvheap->base : NULL, b->twidth, 0, |
481 | BATcount(b), v, b->tsorted ? 1 : -1, 1); |
482 | } |
483 | |
484 | /* use orderidx, returns BUN on order index */ |
485 | BUN |
486 | ORDERfndlast(BAT *b, const void *v) |
487 | { |
488 | assert(b->torderidx); |
489 | if (BATcount(b) == 0) |
490 | return 0; |
491 | return binsearch((oid *) b->torderidx->base + ORDERIDXOFF, 0, b->ttype, |
492 | Tloc(b, 0), b->tvheap ? b->tvheap->base : NULL, |
493 | b->twidth, 0, BATcount(b), v, 1, 1); |
494 | } |
495 | |