| 1 | /* |
| 2 | * This Source Code Form is subject to the terms of the Mozilla Public |
| 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
| 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 5 | * |
| 6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
| 7 | */ |
| 8 | |
| 9 | /* |
| 10 | * @a Lefteris Sidirourgos, Hannes Muehleisen |
| 11 | * @* Low level sample facilities |
| 12 | * |
| 13 | * This sampling implementation generates a sorted set of OIDs by |
| 14 | * calling the random number generator, and uses a binary tree to |
| 15 | * eliminate duplicates. The elements of the tree are then used to |
| 16 | * create a sorted sample BAT. This implementation has a logarithmic |
| 17 | * complexity that only depends on the sample size. |
| 18 | * |
| 19 | * There is a pathological case when the sample size is almost the |
| 20 | * size of the BAT. Then, many collisions occur and performance |
| 21 | * degrades. To catch this, we switch to antiset semantics when the |
| 22 | * sample size is larger than half the BAT size. Then, we generate the |
| 23 | * values that should be omitted from the sample. |
| 24 | */ |
| 25 | |
| 26 | #include "monetdb_config.h" |
| 27 | #include "gdk.h" |
| 28 | #include "gdk_private.h" |
| 29 | #include "xoshiro256starstar.h" |
| 30 | |
| 31 | /* this is a straightforward implementation of a binary tree */ |
| 32 | struct oidtreenode { |
| 33 | oid o; |
| 34 | struct oidtreenode *left; |
| 35 | struct oidtreenode *right; |
| 36 | }; |
| 37 | |
| 38 | static int |
| 39 | OIDTreeMaybeInsert(struct oidtreenode *tree, oid o, BUN allocated) |
| 40 | { |
| 41 | struct oidtreenode **nodep; |
| 42 | |
| 43 | if (allocated == 0) { |
| 44 | tree->left = tree->right = NULL; |
| 45 | tree->o = o; |
| 46 | return 1; |
| 47 | } |
| 48 | nodep = &tree; |
| 49 | while (*nodep) { |
| 50 | if (o == (*nodep)->o) |
| 51 | return 0; |
| 52 | if (o < (*nodep)->o) |
| 53 | nodep = &(*nodep)->left; |
| 54 | else |
| 55 | nodep = &(*nodep)->right; |
| 56 | } |
| 57 | *nodep = &tree[allocated]; |
| 58 | tree[allocated].left = tree[allocated].right = NULL; |
| 59 | tree[allocated].o = o; |
| 60 | return 1; |
| 61 | } |
| 62 | |
| 63 | /* inorder traversal, gives us a sorted BAT */ |
| 64 | static void |
| 65 | OIDTreeToBAT(struct oidtreenode *node, BAT *bat) |
| 66 | { |
| 67 | if (node->left != NULL) |
| 68 | OIDTreeToBAT(node->left, bat); |
| 69 | ((oid *) bat->theap.base)[bat->batCount++] = node->o; |
| 70 | if (node->right != NULL ) |
| 71 | OIDTreeToBAT(node->right, bat); |
| 72 | } |
| 73 | |
| 74 | /* Antiset traversal, give us all values but the ones in the tree */ |
| 75 | static void |
| 76 | OIDTreeToBATAntiset(struct oidtreenode *node, BAT *bat, oid start, oid stop) |
| 77 | { |
| 78 | oid noid; |
| 79 | |
| 80 | if (node->left != NULL) |
| 81 | OIDTreeToBATAntiset(node->left, bat, start, node->o); |
| 82 | else |
| 83 | for (noid = start; noid < node->o; noid++) |
| 84 | ((oid *) bat->theap.base)[bat->batCount++] = noid; |
| 85 | |
| 86 | if (node->right != NULL) |
| 87 | OIDTreeToBATAntiset(node->right, bat, node->o + 1, stop); |
| 88 | else |
| 89 | for (noid = node->o+1; noid < stop; noid++) |
| 90 | ((oid *) bat->theap.base)[bat->batCount++] = noid; |
| 91 | } |
| 92 | |
| 93 | static BAT * |
| 94 | do_batsample(BAT *b, BUN n, random_state_engine rse, MT_Lock *lock) |
| 95 | { |
| 96 | BAT *bn; |
| 97 | BUN cnt, slen; |
| 98 | BUN rescnt; |
| 99 | struct oidtreenode *tree = NULL; |
| 100 | |
| 101 | BATcheck(b, "BATsample" , NULL); |
| 102 | ERRORcheck(n > BUN_MAX, "BATsample: sample size larger than BUN_MAX\n" , NULL); |
| 103 | cnt = BATcount(b); |
| 104 | /* empty sample size */ |
| 105 | if (n == 0) { |
| 106 | bn = BATdense(0, 0, 0); |
| 107 | } else if (cnt <= n) { |
| 108 | /* sample size is larger than the input BAT, return |
| 109 | * all oids */ |
| 110 | bn = BATdense(0, b->hseqbase, cnt); |
| 111 | } else { |
| 112 | oid minoid = b->hseqbase; |
| 113 | oid maxoid = b->hseqbase + cnt; |
| 114 | |
| 115 | /* if someone samples more than half of our tree, we |
| 116 | * do the antiset */ |
| 117 | bool antiset = n > cnt / 2; |
| 118 | slen = n; |
| 119 | if (antiset) |
| 120 | n = cnt - n; |
| 121 | |
| 122 | tree = GDKmalloc(n * sizeof(struct oidtreenode)); |
| 123 | if (tree == NULL) { |
| 124 | return NULL; |
| 125 | } |
| 126 | bn = COLnew(0, TYPE_oid, slen, TRANSIENT); |
| 127 | if (bn == NULL) { |
| 128 | GDKfree(tree); |
| 129 | return NULL; |
| 130 | } |
| 131 | |
| 132 | /* while we do not have enough sample OIDs yet */ |
| 133 | if (lock) /* serialize access to random state engine */ |
| 134 | MT_lock_set(lock); |
| 135 | for (rescnt = 0; rescnt < n; rescnt++) { |
| 136 | oid candoid; |
| 137 | do { |
| 138 | candoid = minoid + next(rse) % cnt; |
| 139 | /* if that candidate OID was already |
| 140 | * generated, try again */ |
| 141 | } while (!OIDTreeMaybeInsert(tree, candoid, rescnt)); |
| 142 | } |
| 143 | if (lock) |
| 144 | MT_lock_unset(lock); |
| 145 | if (!antiset) { |
| 146 | OIDTreeToBAT(tree, bn); |
| 147 | } else { |
| 148 | OIDTreeToBATAntiset(tree, bn, minoid, maxoid); |
| 149 | } |
| 150 | GDKfree(tree); |
| 151 | |
| 152 | BATsetcount(bn, slen); |
| 153 | bn->trevsorted = bn->batCount <= 1; |
| 154 | bn->tsorted = true; |
| 155 | bn->tkey = true; |
| 156 | bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? *(oid *) Tloc(bn, 0) : oid_nil; |
| 157 | } |
| 158 | ALGODEBUG fprintf(stderr, "#BATsample(" ALGOBATFMT "," BUNFMT ")=" |
| 159 | ALGOOPTBATFMT "\n" , |
| 160 | ALGOBATPAR(b), n, ALGOOPTBATPAR(bn)); |
| 161 | return bn; |
| 162 | } |
| 163 | |
| 164 | /* BATsample implements sampling for BATs */ |
| 165 | BAT * |
| 166 | BATsample_with_seed(BAT *b, BUN n, unsigned seed) |
| 167 | { |
| 168 | random_state_engine rse; |
| 169 | |
| 170 | init_random_state_engine(rse, (uint64_t) seed); |
| 171 | |
| 172 | return do_batsample(b, n, rse, NULL); |
| 173 | } |
| 174 | |
| 175 | BAT * |
| 176 | BATsample(BAT *b, BUN n) |
| 177 | { |
| 178 | static random_state_engine rse; |
| 179 | static MT_Lock rse_lock = MT_LOCK_INITIALIZER("rse_lock" ); |
| 180 | |
| 181 | MT_lock_set(&rse_lock); |
| 182 | if (rse[0] == 0 && rse[1] == 0 && rse[2] == 0 && rse[3] == 0) |
| 183 | init_random_state_engine(rse, (uint64_t) GDKusec()); |
| 184 | MT_lock_unset(&rse_lock); |
| 185 | return do_batsample(b, n, rse, &rse_lock); |
| 186 | } |
| 187 | |