1/*
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 *
6 * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7 */
8
9/*
10 * @a Lefteris Sidirourgos, Hannes Muehleisen
11 * @* Low level sample facilities
12 *
13 * This sampling implementation generates a sorted set of OIDs by
14 * calling the random number generator, and uses a binary tree to
15 * eliminate duplicates. The elements of the tree are then used to
16 * create a sorted sample BAT. This implementation has a logarithmic
17 * complexity that only depends on the sample size.
18 *
19 * There is a pathological case when the sample size is almost the
20 * size of the BAT. Then, many collisions occur and performance
21 * degrades. To catch this, we switch to antiset semantics when the
22 * sample size is larger than half the BAT size. Then, we generate the
23 * values that should be omitted from the sample.
24 */
25
26#include "monetdb_config.h"
27#include "gdk.h"
28#include "gdk_private.h"
29#include "xoshiro256starstar.h"
30
31/* this is a straightforward implementation of a binary tree */
32struct oidtreenode {
33 oid o;
34 struct oidtreenode *left;
35 struct oidtreenode *right;
36};
37
38static int
39OIDTreeMaybeInsert(struct oidtreenode *tree, oid o, BUN allocated)
40{
41 struct oidtreenode **nodep;
42
43 if (allocated == 0) {
44 tree->left = tree->right = NULL;
45 tree->o = o;
46 return 1;
47 }
48 nodep = &tree;
49 while (*nodep) {
50 if (o == (*nodep)->o)
51 return 0;
52 if (o < (*nodep)->o)
53 nodep = &(*nodep)->left;
54 else
55 nodep = &(*nodep)->right;
56 }
57 *nodep = &tree[allocated];
58 tree[allocated].left = tree[allocated].right = NULL;
59 tree[allocated].o = o;
60 return 1;
61}
62
63/* inorder traversal, gives us a sorted BAT */
64static void
65OIDTreeToBAT(struct oidtreenode *node, BAT *bat)
66{
67 if (node->left != NULL)
68 OIDTreeToBAT(node->left, bat);
69 ((oid *) bat->theap.base)[bat->batCount++] = node->o;
70 if (node->right != NULL )
71 OIDTreeToBAT(node->right, bat);
72}
73
74/* Antiset traversal, give us all values but the ones in the tree */
75static void
76OIDTreeToBATAntiset(struct oidtreenode *node, BAT *bat, oid start, oid stop)
77{
78 oid noid;
79
80 if (node->left != NULL)
81 OIDTreeToBATAntiset(node->left, bat, start, node->o);
82 else
83 for (noid = start; noid < node->o; noid++)
84 ((oid *) bat->theap.base)[bat->batCount++] = noid;
85
86 if (node->right != NULL)
87 OIDTreeToBATAntiset(node->right, bat, node->o + 1, stop);
88 else
89 for (noid = node->o+1; noid < stop; noid++)
90 ((oid *) bat->theap.base)[bat->batCount++] = noid;
91}
92
93static BAT *
94do_batsample(BAT *b, BUN n, random_state_engine rse, MT_Lock *lock)
95{
96 BAT *bn;
97 BUN cnt, slen;
98 BUN rescnt;
99 struct oidtreenode *tree = NULL;
100
101 BATcheck(b, "BATsample", NULL);
102 ERRORcheck(n > BUN_MAX, "BATsample: sample size larger than BUN_MAX\n", NULL);
103 cnt = BATcount(b);
104 /* empty sample size */
105 if (n == 0) {
106 bn = BATdense(0, 0, 0);
107 } else if (cnt <= n) {
108 /* sample size is larger than the input BAT, return
109 * all oids */
110 bn = BATdense(0, b->hseqbase, cnt);
111 } else {
112 oid minoid = b->hseqbase;
113 oid maxoid = b->hseqbase + cnt;
114
115 /* if someone samples more than half of our tree, we
116 * do the antiset */
117 bool antiset = n > cnt / 2;
118 slen = n;
119 if (antiset)
120 n = cnt - n;
121
122 tree = GDKmalloc(n * sizeof(struct oidtreenode));
123 if (tree == NULL) {
124 return NULL;
125 }
126 bn = COLnew(0, TYPE_oid, slen, TRANSIENT);
127 if (bn == NULL) {
128 GDKfree(tree);
129 return NULL;
130 }
131
132 /* while we do not have enough sample OIDs yet */
133 if (lock) /* serialize access to random state engine */
134 MT_lock_set(lock);
135 for (rescnt = 0; rescnt < n; rescnt++) {
136 oid candoid;
137 do {
138 candoid = minoid + next(rse) % cnt;
139 /* if that candidate OID was already
140 * generated, try again */
141 } while (!OIDTreeMaybeInsert(tree, candoid, rescnt));
142 }
143 if (lock)
144 MT_lock_unset(lock);
145 if (!antiset) {
146 OIDTreeToBAT(tree, bn);
147 } else {
148 OIDTreeToBATAntiset(tree, bn, minoid, maxoid);
149 }
150 GDKfree(tree);
151
152 BATsetcount(bn, slen);
153 bn->trevsorted = bn->batCount <= 1;
154 bn->tsorted = true;
155 bn->tkey = true;
156 bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? *(oid *) Tloc(bn, 0) : oid_nil;
157 }
158 ALGODEBUG fprintf(stderr, "#BATsample(" ALGOBATFMT "," BUNFMT ")="
159 ALGOOPTBATFMT "\n",
160 ALGOBATPAR(b), n, ALGOOPTBATPAR(bn));
161 return bn;
162}
163
164/* BATsample implements sampling for BATs */
165BAT *
166BATsample_with_seed(BAT *b, BUN n, unsigned seed)
167{
168 random_state_engine rse;
169
170 init_random_state_engine(rse, (uint64_t) seed);
171
172 return do_batsample(b, n, rse, NULL);
173}
174
175BAT *
176BATsample(BAT *b, BUN n)
177{
178 static random_state_engine rse;
179 static MT_Lock rse_lock = MT_LOCK_INITIALIZER("rse_lock");
180
181 MT_lock_set(&rse_lock);
182 if (rse[0] == 0 && rse[1] == 0 && rse[2] == 0 && rse[3] == 0)
183 init_random_state_engine(rse, (uint64_t) GDKusec());
184 MT_lock_unset(&rse_lock);
185 return do_batsample(b, n, rse, &rse_lock);
186}
187