| 1 | /* | 
|---|
| 2 | * This Source Code Form is subject to the terms of the Mozilla Public | 
|---|
| 3 | * License, v. 2.0.  If a copy of the MPL was not distributed with this | 
|---|
| 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. | 
|---|
| 5 | * | 
|---|
| 6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. | 
|---|
| 7 | */ | 
|---|
| 8 |  | 
|---|
| 9 | /* | 
|---|
| 10 | * @a Lefteris Sidirourgos, Hannes Muehleisen | 
|---|
| 11 | * @* Low level sample facilities | 
|---|
| 12 | * | 
|---|
| 13 | * This sampling implementation generates a sorted set of OIDs by | 
|---|
| 14 | * calling the random number generator, and uses a binary tree to | 
|---|
| 15 | * eliminate duplicates.  The elements of the tree are then used to | 
|---|
| 16 | * create a sorted sample BAT.  This implementation has a logarithmic | 
|---|
| 17 | * complexity that only depends on the sample size. | 
|---|
| 18 | * | 
|---|
| 19 | * There is a pathological case when the sample size is almost the | 
|---|
| 20 | * size of the BAT.  Then, many collisions occur and performance | 
|---|
| 21 | * degrades. To catch this, we switch to antiset semantics when the | 
|---|
| 22 | * sample size is larger than half the BAT size. Then, we generate the | 
|---|
| 23 | * values that should be omitted from the sample. | 
|---|
| 24 | */ | 
|---|
| 25 |  | 
|---|
| 26 | #include "monetdb_config.h" | 
|---|
| 27 | #include "gdk.h" | 
|---|
| 28 | #include "gdk_private.h" | 
|---|
| 29 | #include "xoshiro256starstar.h" | 
|---|
| 30 |  | 
|---|
| 31 | /* this is a straightforward implementation of a binary tree */ | 
|---|
| 32 | struct oidtreenode { | 
|---|
| 33 | oid o; | 
|---|
| 34 | struct oidtreenode *left; | 
|---|
| 35 | struct oidtreenode *right; | 
|---|
| 36 | }; | 
|---|
| 37 |  | 
|---|
| 38 | static int | 
|---|
| 39 | OIDTreeMaybeInsert(struct oidtreenode *tree, oid o, BUN allocated) | 
|---|
| 40 | { | 
|---|
| 41 | struct oidtreenode **nodep; | 
|---|
| 42 |  | 
|---|
| 43 | if (allocated == 0) { | 
|---|
| 44 | tree->left = tree->right = NULL; | 
|---|
| 45 | tree->o = o; | 
|---|
| 46 | return 1; | 
|---|
| 47 | } | 
|---|
| 48 | nodep = &tree; | 
|---|
| 49 | while (*nodep) { | 
|---|
| 50 | if (o == (*nodep)->o) | 
|---|
| 51 | return 0; | 
|---|
| 52 | if (o < (*nodep)->o) | 
|---|
| 53 | nodep = &(*nodep)->left; | 
|---|
| 54 | else | 
|---|
| 55 | nodep = &(*nodep)->right; | 
|---|
| 56 | } | 
|---|
| 57 | *nodep = &tree[allocated]; | 
|---|
| 58 | tree[allocated].left = tree[allocated].right = NULL; | 
|---|
| 59 | tree[allocated].o = o; | 
|---|
| 60 | return 1; | 
|---|
| 61 | } | 
|---|
| 62 |  | 
|---|
| 63 | /* inorder traversal, gives us a sorted BAT */ | 
|---|
| 64 | static void | 
|---|
| 65 | OIDTreeToBAT(struct oidtreenode *node, BAT *bat) | 
|---|
| 66 | { | 
|---|
| 67 | if (node->left != NULL) | 
|---|
| 68 | OIDTreeToBAT(node->left, bat); | 
|---|
| 69 | ((oid *) bat->theap.base)[bat->batCount++] = node->o; | 
|---|
| 70 | if (node->right != NULL ) | 
|---|
| 71 | OIDTreeToBAT(node->right, bat); | 
|---|
| 72 | } | 
|---|
| 73 |  | 
|---|
| 74 | /* Antiset traversal, give us all values but the ones in the tree */ | 
|---|
| 75 | static void | 
|---|
| 76 | OIDTreeToBATAntiset(struct oidtreenode *node, BAT *bat, oid start, oid stop) | 
|---|
| 77 | { | 
|---|
| 78 | oid noid; | 
|---|
| 79 |  | 
|---|
| 80 | if (node->left != NULL) | 
|---|
| 81 | OIDTreeToBATAntiset(node->left, bat, start, node->o); | 
|---|
| 82 | else | 
|---|
| 83 | for (noid = start; noid < node->o; noid++) | 
|---|
| 84 | ((oid *) bat->theap.base)[bat->batCount++] = noid; | 
|---|
| 85 |  | 
|---|
| 86 | if (node->right != NULL) | 
|---|
| 87 | OIDTreeToBATAntiset(node->right, bat, node->o + 1, stop); | 
|---|
| 88 | else | 
|---|
| 89 | for (noid = node->o+1; noid < stop; noid++) | 
|---|
| 90 | ((oid *) bat->theap.base)[bat->batCount++] = noid; | 
|---|
| 91 | } | 
|---|
| 92 |  | 
|---|
| 93 | static BAT * | 
|---|
| 94 | do_batsample(BAT *b, BUN n, random_state_engine rse, MT_Lock *lock) | 
|---|
| 95 | { | 
|---|
| 96 | BAT *bn; | 
|---|
| 97 | BUN cnt, slen; | 
|---|
| 98 | BUN rescnt; | 
|---|
| 99 | struct oidtreenode *tree = NULL; | 
|---|
| 100 |  | 
|---|
| 101 | BATcheck(b, "BATsample", NULL); | 
|---|
| 102 | ERRORcheck(n > BUN_MAX, "BATsample: sample size larger than BUN_MAX\n", NULL); | 
|---|
| 103 | cnt = BATcount(b); | 
|---|
| 104 | /* empty sample size */ | 
|---|
| 105 | if (n == 0) { | 
|---|
| 106 | bn = BATdense(0, 0, 0); | 
|---|
| 107 | } else if (cnt <= n) { | 
|---|
| 108 | /* sample size is larger than the input BAT, return | 
|---|
| 109 | * all oids */ | 
|---|
| 110 | bn = BATdense(0, b->hseqbase, cnt); | 
|---|
| 111 | } else { | 
|---|
| 112 | oid minoid = b->hseqbase; | 
|---|
| 113 | oid maxoid = b->hseqbase + cnt; | 
|---|
| 114 |  | 
|---|
| 115 | /* if someone samples more than half of our tree, we | 
|---|
| 116 | * do the antiset */ | 
|---|
| 117 | bool antiset = n > cnt / 2; | 
|---|
| 118 | slen = n; | 
|---|
| 119 | if (antiset) | 
|---|
| 120 | n = cnt - n; | 
|---|
| 121 |  | 
|---|
| 122 | tree = GDKmalloc(n * sizeof(struct oidtreenode)); | 
|---|
| 123 | if (tree == NULL) { | 
|---|
| 124 | return NULL; | 
|---|
| 125 | } | 
|---|
| 126 | bn = COLnew(0, TYPE_oid, slen, TRANSIENT); | 
|---|
| 127 | if (bn == NULL) { | 
|---|
| 128 | GDKfree(tree); | 
|---|
| 129 | return NULL; | 
|---|
| 130 | } | 
|---|
| 131 |  | 
|---|
| 132 | /* while we do not have enough sample OIDs yet */ | 
|---|
| 133 | if (lock)	/* serialize access to random state engine */ | 
|---|
| 134 | MT_lock_set(lock); | 
|---|
| 135 | for (rescnt = 0; rescnt < n; rescnt++) { | 
|---|
| 136 | oid candoid; | 
|---|
| 137 | do { | 
|---|
| 138 | candoid = minoid + next(rse) % cnt; | 
|---|
| 139 | /* if that candidate OID was already | 
|---|
| 140 | * generated, try again */ | 
|---|
| 141 | } while (!OIDTreeMaybeInsert(tree, candoid, rescnt)); | 
|---|
| 142 | } | 
|---|
| 143 | if (lock) | 
|---|
| 144 | MT_lock_unset(lock); | 
|---|
| 145 | if (!antiset) { | 
|---|
| 146 | OIDTreeToBAT(tree, bn); | 
|---|
| 147 | } else { | 
|---|
| 148 | OIDTreeToBATAntiset(tree, bn, minoid, maxoid); | 
|---|
| 149 | } | 
|---|
| 150 | GDKfree(tree); | 
|---|
| 151 |  | 
|---|
| 152 | BATsetcount(bn, slen); | 
|---|
| 153 | bn->trevsorted = bn->batCount <= 1; | 
|---|
| 154 | bn->tsorted = true; | 
|---|
| 155 | bn->tkey = true; | 
|---|
| 156 | bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? *(oid *) Tloc(bn, 0) : oid_nil; | 
|---|
| 157 | } | 
|---|
| 158 | ALGODEBUG fprintf(stderr, "#BATsample("ALGOBATFMT ","BUNFMT ")=" | 
|---|
| 159 | ALGOOPTBATFMT "\n", | 
|---|
| 160 | ALGOBATPAR(b), n, ALGOOPTBATPAR(bn)); | 
|---|
| 161 | return bn; | 
|---|
| 162 | } | 
|---|
| 163 |  | 
|---|
| 164 | /* BATsample implements sampling for BATs */ | 
|---|
| 165 | BAT * | 
|---|
| 166 | BATsample_with_seed(BAT *b, BUN n, unsigned seed) | 
|---|
| 167 | { | 
|---|
| 168 | random_state_engine rse; | 
|---|
| 169 |  | 
|---|
| 170 | init_random_state_engine(rse, (uint64_t) seed); | 
|---|
| 171 |  | 
|---|
| 172 | return do_batsample(b, n, rse, NULL); | 
|---|
| 173 | } | 
|---|
| 174 |  | 
|---|
| 175 | BAT * | 
|---|
| 176 | BATsample(BAT *b, BUN n) | 
|---|
| 177 | { | 
|---|
| 178 | static random_state_engine rse; | 
|---|
| 179 | static MT_Lock rse_lock = MT_LOCK_INITIALIZER( "rse_lock"); | 
|---|
| 180 |  | 
|---|
| 181 | MT_lock_set(&rse_lock); | 
|---|
| 182 | if (rse[0] == 0 && rse[1] == 0 && rse[2] == 0 && rse[3] == 0) | 
|---|
| 183 | init_random_state_engine(rse, (uint64_t) GDKusec()); | 
|---|
| 184 | MT_lock_unset(&rse_lock); | 
|---|
| 185 | return do_batsample(b, n, rse, &rse_lock); | 
|---|
| 186 | } | 
|---|
| 187 |  | 
|---|