1 | /* |
2 | * This Source Code Form is subject to the terms of the Mozilla Public |
3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5 | * |
6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
7 | */ |
8 | |
9 | /* |
10 | * @a Lefteris Sidirourgos, Hannes Muehleisen |
11 | * @* Low level sample facilities |
12 | * |
13 | * This sampling implementation generates a sorted set of OIDs by |
14 | * calling the random number generator, and uses a binary tree to |
15 | * eliminate duplicates. The elements of the tree are then used to |
16 | * create a sorted sample BAT. This implementation has a logarithmic |
17 | * complexity that only depends on the sample size. |
18 | * |
19 | * There is a pathological case when the sample size is almost the |
20 | * size of the BAT. Then, many collisions occur and performance |
21 | * degrades. To catch this, we switch to antiset semantics when the |
22 | * sample size is larger than half the BAT size. Then, we generate the |
23 | * values that should be omitted from the sample. |
24 | */ |
25 | |
26 | #include "monetdb_config.h" |
27 | #include "gdk.h" |
28 | #include "gdk_private.h" |
29 | #include "xoshiro256starstar.h" |
30 | |
31 | /* this is a straightforward implementation of a binary tree */ |
32 | struct oidtreenode { |
33 | oid o; |
34 | struct oidtreenode *left; |
35 | struct oidtreenode *right; |
36 | }; |
37 | |
38 | static int |
39 | OIDTreeMaybeInsert(struct oidtreenode *tree, oid o, BUN allocated) |
40 | { |
41 | struct oidtreenode **nodep; |
42 | |
43 | if (allocated == 0) { |
44 | tree->left = tree->right = NULL; |
45 | tree->o = o; |
46 | return 1; |
47 | } |
48 | nodep = &tree; |
49 | while (*nodep) { |
50 | if (o == (*nodep)->o) |
51 | return 0; |
52 | if (o < (*nodep)->o) |
53 | nodep = &(*nodep)->left; |
54 | else |
55 | nodep = &(*nodep)->right; |
56 | } |
57 | *nodep = &tree[allocated]; |
58 | tree[allocated].left = tree[allocated].right = NULL; |
59 | tree[allocated].o = o; |
60 | return 1; |
61 | } |
62 | |
63 | /* inorder traversal, gives us a sorted BAT */ |
64 | static void |
65 | OIDTreeToBAT(struct oidtreenode *node, BAT *bat) |
66 | { |
67 | if (node->left != NULL) |
68 | OIDTreeToBAT(node->left, bat); |
69 | ((oid *) bat->theap.base)[bat->batCount++] = node->o; |
70 | if (node->right != NULL ) |
71 | OIDTreeToBAT(node->right, bat); |
72 | } |
73 | |
74 | /* Antiset traversal, give us all values but the ones in the tree */ |
75 | static void |
76 | OIDTreeToBATAntiset(struct oidtreenode *node, BAT *bat, oid start, oid stop) |
77 | { |
78 | oid noid; |
79 | |
80 | if (node->left != NULL) |
81 | OIDTreeToBATAntiset(node->left, bat, start, node->o); |
82 | else |
83 | for (noid = start; noid < node->o; noid++) |
84 | ((oid *) bat->theap.base)[bat->batCount++] = noid; |
85 | |
86 | if (node->right != NULL) |
87 | OIDTreeToBATAntiset(node->right, bat, node->o + 1, stop); |
88 | else |
89 | for (noid = node->o+1; noid < stop; noid++) |
90 | ((oid *) bat->theap.base)[bat->batCount++] = noid; |
91 | } |
92 | |
93 | static BAT * |
94 | do_batsample(BAT *b, BUN n, random_state_engine rse, MT_Lock *lock) |
95 | { |
96 | BAT *bn; |
97 | BUN cnt, slen; |
98 | BUN rescnt; |
99 | struct oidtreenode *tree = NULL; |
100 | |
101 | BATcheck(b, "BATsample" , NULL); |
102 | ERRORcheck(n > BUN_MAX, "BATsample: sample size larger than BUN_MAX\n" , NULL); |
103 | cnt = BATcount(b); |
104 | /* empty sample size */ |
105 | if (n == 0) { |
106 | bn = BATdense(0, 0, 0); |
107 | } else if (cnt <= n) { |
108 | /* sample size is larger than the input BAT, return |
109 | * all oids */ |
110 | bn = BATdense(0, b->hseqbase, cnt); |
111 | } else { |
112 | oid minoid = b->hseqbase; |
113 | oid maxoid = b->hseqbase + cnt; |
114 | |
115 | /* if someone samples more than half of our tree, we |
116 | * do the antiset */ |
117 | bool antiset = n > cnt / 2; |
118 | slen = n; |
119 | if (antiset) |
120 | n = cnt - n; |
121 | |
122 | tree = GDKmalloc(n * sizeof(struct oidtreenode)); |
123 | if (tree == NULL) { |
124 | return NULL; |
125 | } |
126 | bn = COLnew(0, TYPE_oid, slen, TRANSIENT); |
127 | if (bn == NULL) { |
128 | GDKfree(tree); |
129 | return NULL; |
130 | } |
131 | |
132 | /* while we do not have enough sample OIDs yet */ |
133 | if (lock) /* serialize access to random state engine */ |
134 | MT_lock_set(lock); |
135 | for (rescnt = 0; rescnt < n; rescnt++) { |
136 | oid candoid; |
137 | do { |
138 | candoid = minoid + next(rse) % cnt; |
139 | /* if that candidate OID was already |
140 | * generated, try again */ |
141 | } while (!OIDTreeMaybeInsert(tree, candoid, rescnt)); |
142 | } |
143 | if (lock) |
144 | MT_lock_unset(lock); |
145 | if (!antiset) { |
146 | OIDTreeToBAT(tree, bn); |
147 | } else { |
148 | OIDTreeToBATAntiset(tree, bn, minoid, maxoid); |
149 | } |
150 | GDKfree(tree); |
151 | |
152 | BATsetcount(bn, slen); |
153 | bn->trevsorted = bn->batCount <= 1; |
154 | bn->tsorted = true; |
155 | bn->tkey = true; |
156 | bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? *(oid *) Tloc(bn, 0) : oid_nil; |
157 | } |
158 | ALGODEBUG fprintf(stderr, "#BATsample(" ALGOBATFMT "," BUNFMT ")=" |
159 | ALGOOPTBATFMT "\n" , |
160 | ALGOBATPAR(b), n, ALGOOPTBATPAR(bn)); |
161 | return bn; |
162 | } |
163 | |
164 | /* BATsample implements sampling for BATs */ |
165 | BAT * |
166 | BATsample_with_seed(BAT *b, BUN n, unsigned seed) |
167 | { |
168 | random_state_engine rse; |
169 | |
170 | init_random_state_engine(rse, (uint64_t) seed); |
171 | |
172 | return do_batsample(b, n, rse, NULL); |
173 | } |
174 | |
175 | BAT * |
176 | BATsample(BAT *b, BUN n) |
177 | { |
178 | static random_state_engine rse; |
179 | static MT_Lock rse_lock = MT_LOCK_INITIALIZER("rse_lock" ); |
180 | |
181 | MT_lock_set(&rse_lock); |
182 | if (rse[0] == 0 && rse[1] == 0 && rse[2] == 0 && rse[3] == 0) |
183 | init_random_state_engine(rse, (uint64_t) GDKusec()); |
184 | MT_lock_unset(&rse_lock); |
185 | return do_batsample(b, n, rse, &rse_lock); |
186 | } |
187 | |