1/*-------------------------------------------------------------------------
2 *
3 * integerset.c
4 * Data structure to hold a large set of 64-bit integers efficiently
5 *
6 * IntegerSet provides an in-memory data structure to hold a set of
7 * arbitrary 64-bit integers. Internally, the values are stored in a
8 * B-tree, with a special packed representation at the leaf level using
9 * the Simple-8b algorithm, which can pack clusters of nearby values
10 * very tightly.
11 *
12 * Memory consumption depends on the number of values stored, but also
13 * on how far the values are from each other. In the best case, with
14 * long runs of consecutive integers, memory consumption can be as low as
15 * 0.1 bytes per integer. In the worst case, if integers are more than
16 * 2^32 apart, it uses about 8 bytes per integer. In typical use, the
17 * consumption per integer is somewhere between those extremes, depending
18 * on the range of integers stored, and how "clustered" they are.
19 *
20 *
21 * Interface
22 * ---------
23 *
24 * intset_create - Create a new, empty set
25 * intset_add_member - Add an integer to the set
26 * intset_is_member - Test if an integer is in the set
27 * intset_begin_iterate - Begin iterating through all integers in set
28 * intset_iterate_next - Return next set member, if any
29 *
30 * intset_create() creates the set in the current memory context. Subsequent
31 * operations that add to the data structure will continue to allocate from
32 * that same context, even if it's not current anymore.
33 *
34 * Note that there is no function to free an integer set. If you need to do
35 * that, create a dedicated memory context to hold it, and destroy the memory
36 * context instead.
37 *
38 *
39 * Limitations
40 * -----------
41 *
42 * - Values must be added in order. (Random insertions would require
43 * splitting nodes, which hasn't been implemented.)
44 *
45 * - Values cannot be added while iteration is in progress.
46 *
47 * - No support for removing values.
48 *
49 * None of these limitations are fundamental to the data structure, so they
50 * could be lifted if needed, by writing some new code. But the current
51 * users of this facility don't need them.
52 *
53 *
54 * References
55 * ----------
56 *
57 * Simple-8b encoding is based on:
58 *
59 * Vo Ngoc Anh, Alistair Moffat, Index compression using 64-bit words,
60 * Software - Practice & Experience, v.40 n.2, p.131-147, February 2010
61 * (https://doi.org/10.1002/spe.948)
62 *
63 *
64 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
65 * Portions Copyright (c) 1994, Regents of the University of California
66 *
67 * IDENTIFICATION
68 * src/backend/lib/integerset.c
69 *
70 *-------------------------------------------------------------------------
71 */
72#include "postgres.h"
73
74#include "access/htup_details.h"
75#include "lib/integerset.h"
76#include "port/pg_bitutils.h"
77#include "utils/memutils.h"
78
79
80/*
81 * Maximum number of integers that can be encoded in a single Simple-8b
82 * codeword. (Defined here before anything else, so that we can size arrays
83 * using this.)
84 */
85#define SIMPLE8B_MAX_VALUES_PER_CODEWORD 240
86
87/*
88 * Parameters for shape of the in-memory B-tree.
89 *
90 * These set the size of each internal and leaf node. They don't necessarily
91 * need to be the same, because the tree is just an in-memory structure.
92 * With the default 64, each node is about 1 kb.
93 *
94 * If you change these, you must recalculate MAX_TREE_LEVELS, too!
95 */
96#define MAX_INTERNAL_ITEMS 64
97#define MAX_LEAF_ITEMS 64
98
99/*
100 * Maximum height of the tree.
101 *
102 * MAX_TREE_ITEMS is calculated from the "fan-out" of the B-tree. The
103 * theoretical maximum number of items that we can store in a set is 2^64,
104 * so MAX_TREE_LEVELS should be set so that:
105 *
106 * MAX_LEAF_ITEMS * MAX_INTERNAL_ITEMS ^ (MAX_TREE_LEVELS - 1) >= 2^64.
107 *
108 * In practice, we'll need far fewer levels, because you will run out of
109 * memory long before reaching that number, but let's be conservative.
110 */
111#define MAX_TREE_LEVELS 11
112
113/*
114 * Node structures, for the in-memory B-tree.
115 *
116 * An internal node holds a number of downlink pointers to leaf nodes, or
117 * to internal nodes on a lower level. For each downlink, the key value
118 * corresponding to the lower level node is stored in a sorted array. The
119 * stored key values are low keys. In other words, if the downlink has value
120 * X, then all items stored on that child are >= X.
121 *
122 * Each leaf node holds a number of "items", with a varying number of
123 * integers packed into each item. Each item consists of two 64-bit words:
124 * The first word holds the first integer stored in the item, in plain format.
125 * The second word contains between 0 and 240 more integers, packed using
126 * Simple-8b encoding. By storing the first integer in plain, unpacked,
127 * format, we can use binary search to quickly find an item that holds (or
128 * would hold) a particular integer. And by storing the rest in packed form,
129 * we still get pretty good memory density, if there are clusters of integers
130 * with similar values.
131 *
132 * Each leaf node also has a pointer to the next leaf node, so that the leaf
133 * nodes can be easily walked from beginning to end when iterating.
134 */
135typedef struct intset_node intset_node;
136typedef struct intset_leaf_node intset_leaf_node;
137typedef struct intset_internal_node intset_internal_node;
138
139/* Common structure of both leaf and internal nodes. */
140struct intset_node
141{
142 uint16 level; /* tree level of this node */
143 uint16 num_items; /* number of items in this node */
144};
145
146/* Internal node */
147struct intset_internal_node
148{
149 /* common header, must match intset_node */
150 uint16 level; /* >= 1 on internal nodes */
151 uint16 num_items;
152
153 /*
154 * 'values' is an array of key values, and 'downlinks' are pointers to
155 * lower-level nodes, corresponding to the key values.
156 */
157 uint64 values[MAX_INTERNAL_ITEMS];
158 intset_node *downlinks[MAX_INTERNAL_ITEMS];
159};
160
161/* Leaf node */
162typedef struct
163{
164 uint64 first; /* first integer in this item */
165 uint64 codeword; /* simple8b encoded differences from 'first' */
166} leaf_item;
167
168#define MAX_VALUES_PER_LEAF_ITEM (1 + SIMPLE8B_MAX_VALUES_PER_CODEWORD)
169
170struct intset_leaf_node
171{
172 /* common header, must match intset_node */
173 uint16 level; /* 0 on leafs */
174 uint16 num_items;
175
176 intset_leaf_node *next; /* right sibling, if any */
177
178 leaf_item items[MAX_LEAF_ITEMS];
179};
180
181/*
182 * We buffer insertions in a simple array, before packing and inserting them
183 * into the B-tree. MAX_BUFFERED_VALUES sets the size of the buffer. The
184 * encoder assumes that it is large enough that we can always fill a leaf
185 * item with buffered new items. In other words, MAX_BUFFERED_VALUES must be
186 * larger than MAX_VALUES_PER_LEAF_ITEM. For efficiency, make it much larger.
187 */
188#define MAX_BUFFERED_VALUES (MAX_VALUES_PER_LEAF_ITEM * 2)
189
190/*
191 * IntegerSet is the top-level object representing the set.
192 *
193 * The integers are stored in an in-memory B-tree structure, plus an array
194 * for newly-added integers. IntegerSet also tracks information about memory
195 * usage, as well as the current position when iterating the set with
196 * intset_begin_iterate / intset_iterate_next.
197 */
198struct IntegerSet
199{
200 /*
201 * 'context' is the memory context holding this integer set and all its
202 * tree nodes.
203 *
204 * 'mem_used' tracks the amount of memory used. We don't do anything with
205 * it in integerset.c itself, but the callers can ask for it with
206 * intset_memory_usage().
207 */
208 MemoryContext context;
209 uint64 mem_used;
210
211 uint64 num_entries; /* total # of values in the set */
212 uint64 highest_value; /* highest value stored in this set */
213
214 /*
215 * B-tree to hold the packed values.
216 *
217 * 'rightmost_nodes' hold pointers to the rightmost node on each level.
218 * rightmost_parent[0] is rightmost leaf, rightmost_parent[1] is its
219 * parent, and so forth, all the way up to the root. These are needed when
220 * adding new values. (Currently, we require that new values are added at
221 * the end.)
222 */
223 int num_levels; /* height of the tree */
224 intset_node *root; /* root node */
225 intset_node *rightmost_nodes[MAX_TREE_LEVELS];
226 intset_leaf_node *leftmost_leaf; /* leftmost leaf node */
227
228 /*
229 * Holding area for new items that haven't been inserted to the tree yet.
230 */
231 uint64 buffered_values[MAX_BUFFERED_VALUES];
232 int num_buffered_values;
233
234 /*
235 * Iterator support.
236 *
237 * 'iter_values' is an array of integers ready to be returned to the
238 * caller; 'iter_num_values' is the length of that array, and
239 * 'iter_valueno' is the next index. 'iter_node' and 'iter_itemno' point
240 * to the leaf node, and item within the leaf node, to get the next batch
241 * of values from.
242 *
243 * Normally, 'iter_values' points to 'iter_values_buf', which holds items
244 * decoded from a leaf item. But after we have scanned the whole B-tree,
245 * we iterate through all the unbuffered values, too, by pointing
246 * iter_values to 'buffered_values'.
247 */
248 bool iter_active; /* is iteration in progress? */
249
250 const uint64 *iter_values;
251 int iter_num_values; /* number of elements in 'iter_values' */
252 int iter_valueno; /* next index into 'iter_values' */
253
254 intset_leaf_node *iter_node; /* current leaf node */
255 int iter_itemno; /* next item in 'iter_node' to decode */
256
257 uint64 iter_values_buf[MAX_VALUES_PER_LEAF_ITEM];
258};
259
260/*
261 * Prototypes for internal functions.
262 */
263static void intset_update_upper(IntegerSet *intset, int level,
264 intset_node *child, uint64 child_key);
265static void intset_flush_buffered_values(IntegerSet *intset);
266
267static int intset_binsrch_uint64(uint64 value, uint64 *arr, int arr_elems,
268 bool nextkey);
269static int intset_binsrch_leaf(uint64 value, leaf_item *arr, int arr_elems,
270 bool nextkey);
271
272static uint64 simple8b_encode(const uint64 *ints, int *num_encoded, uint64 base);
273static int simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base);
274static bool simple8b_contains(uint64 codeword, uint64 key, uint64 base);
275
276
277/*
278 * Create a new, initially empty, integer set.
279 *
280 * The integer set is created in the current memory context.
281 * We will do all subsequent allocations in the same context, too, regardless
282 * of which memory context is current when new integers are added to the set.
283 */
284IntegerSet *
285intset_create(void)
286{
287 IntegerSet *intset;
288
289 intset = (IntegerSet *) palloc(sizeof(IntegerSet));
290 intset->context = CurrentMemoryContext;
291 intset->mem_used = GetMemoryChunkSpace(intset);
292
293 intset->num_entries = 0;
294 intset->highest_value = 0;
295
296 intset->num_levels = 0;
297 intset->root = NULL;
298 memset(intset->rightmost_nodes, 0, sizeof(intset->rightmost_nodes));
299 intset->leftmost_leaf = NULL;
300
301 intset->num_buffered_values = 0;
302
303 intset->iter_active = false;
304 intset->iter_node = NULL;
305 intset->iter_itemno = 0;
306 intset->iter_valueno = 0;
307 intset->iter_num_values = 0;
308 intset->iter_values = NULL;
309
310 return intset;
311}
312
313/*
314 * Allocate a new node.
315 */
316static intset_internal_node *
317intset_new_internal_node(IntegerSet *intset)
318{
319 intset_internal_node *n;
320
321 n = (intset_internal_node *) MemoryContextAlloc(intset->context,
322 sizeof(intset_internal_node));
323 intset->mem_used += GetMemoryChunkSpace(n);
324
325 n->level = 0; /* caller must set */
326 n->num_items = 0;
327
328 return n;
329}
330
331static intset_leaf_node *
332intset_new_leaf_node(IntegerSet *intset)
333{
334 intset_leaf_node *n;
335
336 n = (intset_leaf_node *) MemoryContextAlloc(intset->context,
337 sizeof(intset_leaf_node));
338 intset->mem_used += GetMemoryChunkSpace(n);
339
340 n->level = 0;
341 n->num_items = 0;
342 n->next = NULL;
343
344 return n;
345}
346
347/*
348 * Return the number of entries in the integer set.
349 */
350uint64
351intset_num_entries(IntegerSet *intset)
352{
353 return intset->num_entries;
354}
355
356/*
357 * Return the amount of memory used by the integer set.
358 */
359uint64
360intset_memory_usage(IntegerSet *intset)
361{
362 return intset->mem_used;
363}
364
365/*
366 * Add a value to the set.
367 *
368 * Values must be added in order.
369 */
370void
371intset_add_member(IntegerSet *intset, uint64 x)
372{
373 if (intset->iter_active)
374 elog(ERROR, "cannot add new values to integer set while iteration is in progress");
375
376 if (x <= intset->highest_value && intset->num_entries > 0)
377 elog(ERROR, "cannot add value to integer set out of order");
378
379 if (intset->num_buffered_values >= MAX_BUFFERED_VALUES)
380 {
381 /* Time to flush our buffer */
382 intset_flush_buffered_values(intset);
383 Assert(intset->num_buffered_values < MAX_BUFFERED_VALUES);
384 }
385
386 /* Add it to the buffer of newly-added values */
387 intset->buffered_values[intset->num_buffered_values] = x;
388 intset->num_buffered_values++;
389 intset->num_entries++;
390 intset->highest_value = x;
391}
392
393/*
394 * Take a batch of buffered values, and pack them into the B-tree.
395 */
396static void
397intset_flush_buffered_values(IntegerSet *intset)
398{
399 uint64 *values = intset->buffered_values;
400 uint64 num_values = intset->num_buffered_values;
401 int num_packed = 0;
402 intset_leaf_node *leaf;
403
404 leaf = (intset_leaf_node *) intset->rightmost_nodes[0];
405
406 /*
407 * If the tree is completely empty, create the first leaf page, which is
408 * also the root.
409 */
410 if (leaf == NULL)
411 {
412 /*
413 * This is the very first item in the set.
414 *
415 * Allocate root node. It's also a leaf.
416 */
417 leaf = intset_new_leaf_node(intset);
418
419 intset->root = (intset_node *) leaf;
420 intset->leftmost_leaf = leaf;
421 intset->rightmost_nodes[0] = (intset_node *) leaf;
422 intset->num_levels = 1;
423 }
424
425 /*
426 * If there are less than MAX_VALUES_PER_LEAF_ITEM values in the buffer,
427 * stop. In most cases, we cannot encode that many values in a single
428 * value, but this way, the encoder doesn't have to worry about running
429 * out of input.
430 */
431 while (num_values - num_packed >= MAX_VALUES_PER_LEAF_ITEM)
432 {
433 leaf_item item;
434 int num_encoded;
435
436 /*
437 * Construct the next leaf item, packing as many buffered values as
438 * possible.
439 */
440 item.first = values[num_packed];
441 item.codeword = simple8b_encode(&values[num_packed + 1],
442 &num_encoded,
443 item.first);
444
445 /*
446 * Add the item to the node, allocating a new node if the old one is
447 * full.
448 */
449 if (leaf->num_items >= MAX_LEAF_ITEMS)
450 {
451 /* Allocate new leaf and link it to the tree */
452 intset_leaf_node *old_leaf = leaf;
453
454 leaf = intset_new_leaf_node(intset);
455 old_leaf->next = leaf;
456 intset->rightmost_nodes[0] = (intset_node *) leaf;
457 intset_update_upper(intset, 1, (intset_node *) leaf, item.first);
458 }
459 leaf->items[leaf->num_items++] = item;
460
461 num_packed += 1 + num_encoded;
462 }
463
464 /*
465 * Move any remaining buffered values to the beginning of the array.
466 */
467 if (num_packed < intset->num_buffered_values)
468 {
469 memmove(&intset->buffered_values[0],
470 &intset->buffered_values[num_packed],
471 (intset->num_buffered_values - num_packed) * sizeof(uint64));
472 }
473 intset->num_buffered_values -= num_packed;
474}
475
476/*
477 * Insert a downlink into parent node, after creating a new node.
478 *
479 * Recurses if the parent node is full, too.
480 */
481static void
482intset_update_upper(IntegerSet *intset, int level, intset_node *child,
483 uint64 child_key)
484{
485 intset_internal_node *parent;
486
487 Assert(level > 0);
488
489 /*
490 * Create a new root node, if necessary.
491 */
492 if (level >= intset->num_levels)
493 {
494 intset_node *oldroot = intset->root;
495 uint64 downlink_key;
496
497 /* MAX_TREE_LEVELS should be more than enough, this shouldn't happen */
498 if (intset->num_levels == MAX_TREE_LEVELS)
499 elog(ERROR, "could not expand integer set, maximum number of levels reached");
500 intset->num_levels++;
501
502 /*
503 * Get the first value on the old root page, to be used as the
504 * downlink.
505 */
506 if (intset->root->level == 0)
507 downlink_key = ((intset_leaf_node *) oldroot)->items[0].first;
508 else
509 downlink_key = ((intset_internal_node *) oldroot)->values[0];
510
511 parent = intset_new_internal_node(intset);
512 parent->level = level;
513 parent->values[0] = downlink_key;
514 parent->downlinks[0] = oldroot;
515 parent->num_items = 1;
516
517 intset->root = (intset_node *) parent;
518 intset->rightmost_nodes[level] = (intset_node *) parent;
519 }
520
521 /*
522 * Place the downlink on the parent page.
523 */
524 parent = (intset_internal_node *) intset->rightmost_nodes[level];
525
526 if (parent->num_items < MAX_INTERNAL_ITEMS)
527 {
528 parent->values[parent->num_items] = child_key;
529 parent->downlinks[parent->num_items] = child;
530 parent->num_items++;
531 }
532 else
533 {
534 /*
535 * Doesn't fit. Allocate new parent, with the downlink as the first
536 * item on it, and recursively insert the downlink to the new parent
537 * to the grandparent.
538 */
539 parent = intset_new_internal_node(intset);
540 parent->level = level;
541 parent->values[0] = child_key;
542 parent->downlinks[0] = child;
543 parent->num_items = 1;
544
545 intset->rightmost_nodes[level] = (intset_node *) parent;
546
547 intset_update_upper(intset, level + 1, (intset_node *) parent, child_key);
548 }
549}
550
551/*
552 * Does the set contain the given value?
553 */
554bool
555intset_is_member(IntegerSet *intset, uint64 x)
556{
557 intset_node *node;
558 intset_leaf_node *leaf;
559 int level;
560 int itemno;
561 leaf_item *item;
562
563 /*
564 * The value might be in the buffer of newly-added values.
565 */
566 if (intset->num_buffered_values > 0 && x >= intset->buffered_values[0])
567 {
568 int itemno;
569
570 itemno = intset_binsrch_uint64(x,
571 intset->buffered_values,
572 intset->num_buffered_values,
573 false);
574 if (itemno >= intset->num_buffered_values)
575 return false;
576 else
577 return (intset->buffered_values[itemno] == x);
578 }
579
580 /*
581 * Start from the root, and walk down the B-tree to find the right leaf
582 * node.
583 */
584 if (!intset->root)
585 return false;
586 node = intset->root;
587 for (level = intset->num_levels - 1; level > 0; level--)
588 {
589 intset_internal_node *n = (intset_internal_node *) node;
590
591 Assert(node->level == level);
592
593 itemno = intset_binsrch_uint64(x, n->values, n->num_items, true);
594 if (itemno == 0)
595 return false;
596 node = n->downlinks[itemno - 1];
597 }
598 Assert(node->level == 0);
599 leaf = (intset_leaf_node *) node;
600
601 /*
602 * Binary search to find the right item on the leaf page
603 */
604 itemno = intset_binsrch_leaf(x, leaf->items, leaf->num_items, true);
605 if (itemno == 0)
606 return false;
607 item = &leaf->items[itemno - 1];
608
609 /* Is this a match to the first value on the item? */
610 if (item->first == x)
611 return true;
612 Assert(x > item->first);
613
614 /* Is it in the packed codeword? */
615 if (simple8b_contains(item->codeword, x, item->first))
616 return true;
617
618 return false;
619}
620
621/*
622 * Begin in-order scan through all the values.
623 *
624 * While the iteration is in-progress, you cannot add new values to the set.
625 */
626void
627intset_begin_iterate(IntegerSet *intset)
628{
629 /* Note that we allow an iteration to be abandoned midway */
630 intset->iter_active = true;
631 intset->iter_node = intset->leftmost_leaf;
632 intset->iter_itemno = 0;
633 intset->iter_valueno = 0;
634 intset->iter_num_values = 0;
635 intset->iter_values = intset->iter_values_buf;
636}
637
638/*
639 * Returns the next integer, when iterating.
640 *
641 * intset_begin_iterate() must be called first. intset_iterate_next() returns
642 * the next value in the set. Returns true, if there was another value, and
643 * stores the value in *next. Otherwise, returns false.
644 */
645bool
646intset_iterate_next(IntegerSet *intset, uint64 *next)
647{
648 Assert(intset->iter_active);
649 for (;;)
650 {
651 /* Return next iter_values[] entry if any */
652 if (intset->iter_valueno < intset->iter_num_values)
653 {
654 *next = intset->iter_values[intset->iter_valueno++];
655 return true;
656 }
657
658 /* Decode next item in current leaf node, if any */
659 if (intset->iter_node &&
660 intset->iter_itemno < intset->iter_node->num_items)
661 {
662 leaf_item *item;
663 int num_decoded;
664
665 item = &intset->iter_node->items[intset->iter_itemno++];
666
667 intset->iter_values_buf[0] = item->first;
668 num_decoded = simple8b_decode(item->codeword,
669 &intset->iter_values_buf[1],
670 item->first);
671 intset->iter_num_values = num_decoded + 1;
672 intset->iter_valueno = 0;
673 continue;
674 }
675
676 /* No more items on this leaf, step to next node */
677 if (intset->iter_node)
678 {
679 intset->iter_node = intset->iter_node->next;
680 intset->iter_itemno = 0;
681 continue;
682 }
683
684 /*
685 * We have reached the end of the B-tree. But we might still have
686 * some integers in the buffer of newly-added values.
687 */
688 if (intset->iter_values == (const uint64 *) intset->iter_values_buf)
689 {
690 intset->iter_values = intset->buffered_values;
691 intset->iter_num_values = intset->num_buffered_values;
692 intset->iter_valueno = 0;
693 continue;
694 }
695
696 break;
697 }
698
699 /* No more results. */
700 intset->iter_active = false;
701 *next = 0; /* prevent uninitialized-variable warnings */
702 return false;
703}
704
705/*
706 * intset_binsrch_uint64() -- search a sorted array of uint64s
707 *
708 * Returns the first position with key equal or less than the given key.
709 * The returned position would be the "insert" location for the given key,
710 * that is, the position where the new key should be inserted to.
711 *
712 * 'nextkey' affects the behavior on equal keys. If true, and there is an
713 * equal key in the array, this returns the position immediately after the
714 * equal key. If false, this returns the position of the equal key itself.
715 */
716static int
717intset_binsrch_uint64(uint64 item, uint64 *arr, int arr_elems, bool nextkey)
718{
719 int low,
720 high,
721 mid;
722
723 low = 0;
724 high = arr_elems;
725 while (high > low)
726 {
727 mid = low + (high - low) / 2;
728
729 if (nextkey)
730 {
731 if (item >= arr[mid])
732 low = mid + 1;
733 else
734 high = mid;
735 }
736 else
737 {
738 if (item > arr[mid])
739 low = mid + 1;
740 else
741 high = mid;
742 }
743 }
744
745 return low;
746}
747
748/* same, but for an array of leaf items */
749static int
750intset_binsrch_leaf(uint64 item, leaf_item *arr, int arr_elems, bool nextkey)
751{
752 int low,
753 high,
754 mid;
755
756 low = 0;
757 high = arr_elems;
758 while (high > low)
759 {
760 mid = low + (high - low) / 2;
761
762 if (nextkey)
763 {
764 if (item >= arr[mid].first)
765 low = mid + 1;
766 else
767 high = mid;
768 }
769 else
770 {
771 if (item > arr[mid].first)
772 low = mid + 1;
773 else
774 high = mid;
775 }
776 }
777
778 return low;
779}
780
781/*
782 * Simple-8b encoding.
783 *
784 * The simple-8b algorithm packs between 1 and 240 integers into 64-bit words,
785 * called "codewords". The number of integers packed into a single codeword
786 * depends on the integers being packed; small integers are encoded using
787 * fewer bits than large integers. A single codeword can store a single
788 * 60-bit integer, or two 30-bit integers, for example.
789 *
790 * Since we're storing a unique, sorted, set of integers, we actually encode
791 * the *differences* between consecutive integers. That way, clusters of
792 * integers that are close to each other are packed efficiently, regardless
793 * of their absolute values.
794 *
795 * In Simple-8b, each codeword consists of a 4-bit selector, which indicates
796 * how many integers are encoded in the codeword, and the encoded integers are
797 * packed into the remaining 60 bits. The selector allows for 16 different
798 * ways of using the remaining 60 bits, called "modes". The number of integers
799 * packed into a single codeword in each mode is listed in the simple8b_modes
800 * table below. For example, consider the following codeword:
801 *
802 * 20-bit integer 20-bit integer 20-bit integer
803 * 1101 00000000000000010010 01111010000100100000 00000000000000010100
804 * ^
805 * selector
806 *
807 * The selector 1101 is 13 in decimal. From the modes table below, we see
808 * that it means that the codeword encodes three 20-bit integers. In decimal,
809 * those integers are 18, 500000 and 20. Because we encode deltas rather than
810 * absolute values, the actual values that they represent are 18, 500018 and
811 * 500038.
812 *
813 * Modes 0 and 1 are a bit special; they encode a run of 240 or 120 zeroes
814 * (which means 240 or 120 consecutive integers, since we're encoding the
815 * deltas between integers), without using the rest of the codeword bits
816 * for anything.
817 *
818 * Simple-8b cannot encode integers larger than 60 bits. Values larger than
819 * that are always stored in the 'first' field of a leaf item, never in the
820 * packed codeword. If there is a sequence of integers that are more than
821 * 2^60 apart, the codeword will go unused on those items. To represent that,
822 * we use a magic EMPTY_CODEWORD codeword value.
823 */
824static const struct simple8b_mode
825{
826 uint8 bits_per_int;
827 uint8 num_ints;
828} simple8b_modes[17] =
829
830{
831 {0, 240}, /* mode 0: 240 zeroes */
832 {0, 120}, /* mode 1: 120 zeroes */
833 {1, 60}, /* mode 2: sixty 1-bit integers */
834 {2, 30}, /* mode 3: thirty 2-bit integers */
835 {3, 20}, /* mode 4: twenty 3-bit integers */
836 {4, 15}, /* mode 5: fifteen 4-bit integers */
837 {5, 12}, /* mode 6: twelve 5-bit integers */
838 {6, 10}, /* mode 7: ten 6-bit integers */
839 {7, 8}, /* mode 8: eight 7-bit integers (four bits
840 * are wasted) */
841 {8, 7}, /* mode 9: seven 8-bit integers (four bits
842 * are wasted) */
843 {10, 6}, /* mode 10: six 10-bit integers */
844 {12, 5}, /* mode 11: five 12-bit integers */
845 {15, 4}, /* mode 12: four 15-bit integers */
846 {20, 3}, /* mode 13: three 20-bit integers */
847 {30, 2}, /* mode 14: two 30-bit integers */
848 {60, 1}, /* mode 15: one 60-bit integer */
849
850 {0, 0} /* sentinel value */
851};
852
853/*
854 * EMPTY_CODEWORD is a special value, used to indicate "no values".
855 * It is used if the next value is too large to be encoded with Simple-8b.
856 *
857 * This value looks like a mode-0 codeword, but we can distinguish it
858 * because a regular mode-0 codeword would have zeroes in the unused bits.
859 */
860#define EMPTY_CODEWORD UINT64CONST(0x0FFFFFFFFFFFFFFF)
861
862/*
863 * Encode a number of integers into a Simple-8b codeword.
864 *
865 * (What we actually encode are deltas between successive integers.
866 * "base" is the value before ints[0].)
867 *
868 * The input array must contain at least SIMPLE8B_MAX_VALUES_PER_CODEWORD
869 * elements, ensuring that we can produce a full codeword.
870 *
871 * Returns the encoded codeword, and sets *num_encoded to the number of
872 * input integers that were encoded. That can be zero, if the first delta
873 * is too large to be encoded.
874 */
875static uint64
876simple8b_encode(const uint64 *ints, int *num_encoded, uint64 base)
877{
878 int selector;
879 int nints;
880 int bits;
881 uint64 diff;
882 uint64 last_val;
883 uint64 codeword;
884 int i;
885
886 Assert(ints[0] > base);
887
888 /*
889 * Select the "mode" to use for this codeword.
890 *
891 * In each iteration, check if the next value can be represented in the
892 * current mode we're considering. If it's too large, then step up the
893 * mode to a wider one, and repeat. If it fits, move on to the next
894 * integer. Repeat until the codeword is full, given the current mode.
895 *
896 * Note that we don't have any way to represent unused slots in the
897 * codeword, so we require each codeword to be "full". It is always
898 * possible to produce a full codeword unless the very first delta is too
899 * large to be encoded. For example, if the first delta is small but the
900 * second is too large to be encoded, we'll end up using the last "mode",
901 * which has nints == 1.
902 */
903 selector = 0;
904 nints = simple8b_modes[0].num_ints;
905 bits = simple8b_modes[0].bits_per_int;
906 diff = ints[0] - base - 1;
907 last_val = ints[0];
908 i = 0; /* number of deltas we have accepted */
909 for (;;)
910 {
911 if (diff >= (UINT64CONST(1) << bits))
912 {
913 /* too large, step up to next mode */
914 selector++;
915 nints = simple8b_modes[selector].num_ints;
916 bits = simple8b_modes[selector].bits_per_int;
917 /* we might already have accepted enough deltas for this mode */
918 if (i >= nints)
919 break;
920 }
921 else
922 {
923 /* accept this delta; then done if codeword is full */
924 i++;
925 if (i >= nints)
926 break;
927 /* examine next delta */
928 Assert(ints[i] > last_val);
929 diff = ints[i] - last_val - 1;
930 last_val = ints[i];
931 }
932 }
933
934 if (nints == 0)
935 {
936 /*
937 * The first delta is too large to be encoded with Simple-8b.
938 *
939 * If there is at least one not-too-large integer in the input, we
940 * will encode it using mode 15 (or a more compact mode). Hence, we
941 * can only get here if the *first* delta is >= 2^60.
942 */
943 Assert(i == 0);
944 *num_encoded = 0;
945 return EMPTY_CODEWORD;
946 }
947
948 /*
949 * Encode the integers using the selected mode. Note that we shift them
950 * into the codeword in reverse order, so that they will come out in the
951 * correct order in the decoder.
952 */
953 codeword = 0;
954 if (bits > 0)
955 {
956 for (i = nints - 1; i > 0; i--)
957 {
958 diff = ints[i] - ints[i - 1] - 1;
959 codeword |= diff;
960 codeword <<= bits;
961 }
962 diff = ints[0] - base - 1;
963 codeword |= diff;
964 }
965
966 /* add selector to the codeword, and return */
967 codeword |= (uint64) selector << 60;
968
969 *num_encoded = nints;
970 return codeword;
971}
972
973/*
974 * Decode a codeword into an array of integers.
975 * Returns the number of integers decoded.
976 */
977static int
978simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base)
979{
980 int selector = (codeword >> 60);
981 int nints = simple8b_modes[selector].num_ints;
982 int bits = simple8b_modes[selector].bits_per_int;
983 uint64 mask = (UINT64CONST(1) << bits) - 1;
984 uint64 curr_value;
985
986 if (codeword == EMPTY_CODEWORD)
987 return 0;
988
989 curr_value = base;
990 for (int i = 0; i < nints; i++)
991 {
992 uint64 diff = codeword & mask;
993
994 curr_value += 1 + diff;
995 decoded[i] = curr_value;
996 codeword >>= bits;
997 }
998 return nints;
999}
1000
1001/*
1002 * This is very similar to simple8b_decode(), but instead of decoding all
1003 * the values to an array, it just checks if the given "key" is part of
1004 * the codeword.
1005 */
1006static bool
1007simple8b_contains(uint64 codeword, uint64 key, uint64 base)
1008{
1009 int selector = (codeword >> 60);
1010 int nints = simple8b_modes[selector].num_ints;
1011 int bits = simple8b_modes[selector].bits_per_int;
1012
1013 if (codeword == EMPTY_CODEWORD)
1014 return false;
1015
1016 if (bits == 0)
1017 {
1018 /* Special handling for 0-bit cases. */
1019 return (key - base) <= nints;
1020 }
1021 else
1022 {
1023 uint64 mask = (UINT64CONST(1) << bits) - 1;
1024 uint64 curr_value;
1025
1026 curr_value = base;
1027 for (int i = 0; i < nints; i++)
1028 {
1029 uint64 diff = codeword & mask;
1030
1031 curr_value += 1 + diff;
1032
1033 if (curr_value >= key)
1034 {
1035 if (curr_value == key)
1036 return true;
1037 else
1038 return false;
1039 }
1040
1041 codeword >>= bits;
1042 }
1043 }
1044 return false;
1045}
1046