1/*
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 *
6 * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7 */
8
9/*
10 * @a Lefteris Sidirourgos
11 * @d 30/08/2011
12 * @+ The sampling facilities
13 *
14 * In the context of the SciBORQ project, we introduce a number of sampling
15 * techniques in the MonetDB software stack. Our goal is to provide methods
16 * for performing sampling (uniform and weighted) over a) the result of a
17 * query, b) the base tables, and c) the entire database schema. Sampling
18 * can be performed during query execution, as well as during data loading in
19 * the case of predefined sampling indexes. In addition to the sampling
20 * methods, a number of query plan optimisations for sampling are introduced on
21 * the SQL and MAL level.
22 *
23 * Besides the sampling methods, SciBORQ also aims at multi-layered bounded
24 * query execution. That is steering query execution over many layers of
25 * samples with different size in order to achieve either strict error bounds
26 * or limited execution time. For more details see the SciBORQ module.
27 *
28 * In the following, details are presented on the implementation and the usage
29 * of each sampling method.
30 */
31
32#include "monetdb_config.h"
33#include "gdk.h"
34#include "mal_exception.h"
35#include "sample.h"
36// TODO: Go through this documentation and update it with an explanation about seeds.
37/*
38 * @- Uniform Sampling.
39 *
40 * A new SQL operator has been added to support sampling the result of a query.
41 * The syntax for sampling is:
42 * SELECT ... FROM ... WHERE ... SAMPLE s
43 *
44 * where s if is an integer greater than 1, it defines the number of rows to be
45 * in the sample. If s is a double between [0.0,1.0] the it refers to the
46 * percentage of the result to be sampled. That is if s=0.3 then the sample
47 * will be 30% the size of the query result.
48 *
49 * SAMPLE is been treated as LIMIT, ORDER BY, etc., that means that it can only
50 * be in the outer most SELECT clause, i.e., SAMPLE cannot appear in a
51 * subquery. However, if this is needed, then one may define a function, for
52 * example
53 *
54 * CREATE FUNCTION mysample ()
55 * RETURNS TABLE(col a,...)
56 * BEGIN
57 * RETURN
58 * SELECT a,...
59 * FROM name_table
60 * SAMPLE 100;
61 * end;
62 *
63 * and then use function mysample() for example to populate a new table with
64 * the sample. E.g.,
65 *
66 * INSERT INTO sample_table (SELECT * FROM mysample());
67 *
68 */
69
70str
71SAMPLEuniform(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) {
72
73 bat *r, *b;
74 lng sample_size;
75 unsigned seed;
76 (void) cntxt;
77
78 BAT *br, *bb;
79
80 r = getArgReference_bat(stk, pci, 0);
81 b = getArgReference_bat(stk, pci, 1);
82
83 if ((bb = BATdescriptor(*b)) == NULL) {
84 throw(MAL, "sample.subuniform", INTERNAL_BAT_ACCESS);
85 }
86
87 if (getArgType(mb, pci, 2) == TYPE_dbl)
88 {
89 dbl pr = *getArgReference_dbl(stk, pci, 2);
90
91 if ( pr < 0.0 || pr > 1.0 ) {
92 BBPunfix(bb->batCacheid);
93 throw(MAL, "sample.subuniform", ILLEGAL_ARGUMENT
94 " p should be between 0 and 1.0" );
95 } else if (pr == 0) {/* special case */
96 sample_size = 0;
97 // TODO: Add special case for pr == 1.0.
98 } else {
99 sample_size = (lng) (pr*(double)BATcount(bb));
100 }
101 } else {
102 sample_size = *getArgReference_lng(stk, pci, 2);
103 }
104
105 if (pci->argc == 4) {
106 seed = (unsigned) *getArgReference_int(stk, pci, 3);
107 br = BATsample_with_seed(bb, (BUN) sample_size, seed);
108 }
109 else {
110 br = BATsample(bb, (BUN) sample_size);
111 }
112
113 BBPunfix(bb->batCacheid);
114 if (br == NULL)
115 throw(MAL, "sample.subuniform", OPERATION_FAILED);
116
117 BBPkeepref(*r = br->batCacheid);
118 return MAL_SUCCEED;
119}
120