1 | /* |
2 | * This Source Code Form is subject to the terms of the Mozilla Public |
3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5 | * |
6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
7 | */ |
8 | |
9 | /* |
10 | * @a Lefteris Sidirourgos |
11 | * @d 30/08/2011 |
12 | * @+ The sampling facilities |
13 | * |
14 | * In the context of the SciBORQ project, we introduce a number of sampling |
15 | * techniques in the MonetDB software stack. Our goal is to provide methods |
16 | * for performing sampling (uniform and weighted) over a) the result of a |
17 | * query, b) the base tables, and c) the entire database schema. Sampling |
18 | * can be performed during query execution, as well as during data loading in |
19 | * the case of predefined sampling indexes. In addition to the sampling |
20 | * methods, a number of query plan optimisations for sampling are introduced on |
21 | * the SQL and MAL level. |
22 | * |
23 | * Besides the sampling methods, SciBORQ also aims at multi-layered bounded |
 * query execution. That is, steering query execution over many layers of
 * samples of different sizes in order to achieve either strict error bounds
26 | * or limited execution time. For more details see the SciBORQ module. |
27 | * |
28 | * In the following, details are presented on the implementation and the usage |
29 | * of each sampling method. |
30 | */ |
31 | |
32 | #include "monetdb_config.h" |
33 | #include "gdk.h" |
34 | #include "mal_exception.h" |
35 | #include "sample.h" |
36 | // TODO: Go through this documentation and update it with an explanation about seeds. |
37 | /* |
38 | * @- Uniform Sampling. |
39 | * |
40 | * A new SQL operator has been added to support sampling the result of a query. |
41 | * The syntax for sampling is: |
42 | * SELECT ... FROM ... WHERE ... SAMPLE s |
43 | * |
 * where, if s is an integer greater than 1, it defines the number of rows to
 * be in the sample. If s is a double in the range [0.0,1.0], then it refers to
 * the fraction of the result to be sampled. That is, if s=0.3 then the sample
 * will be 30% the size of the query result.
48 | * |
 * SAMPLE is treated like LIMIT, ORDER BY, etc., which means that it can only
 * be in the outermost SELECT clause, i.e., SAMPLE cannot appear in a
 * subquery. However, if this is needed, then one may define a function, for
 * example
53 | * |
54 | * CREATE FUNCTION mysample () |
55 | * RETURNS TABLE(col a,...) |
56 | * BEGIN |
57 | * RETURN |
58 | * SELECT a,... |
59 | * FROM name_table |
60 | * SAMPLE 100; |
61 | * end; |
62 | * |
63 | * and then use function mysample() for example to populate a new table with |
64 | * the sample. E.g., |
65 | * |
66 | * INSERT INTO sample_table (SELECT * FROM mysample()); |
67 | * |
68 | */ |
69 | |
70 | str |
71 | SAMPLEuniform(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) { |
72 | |
73 | bat *r, *b; |
74 | lng sample_size; |
75 | unsigned seed; |
76 | (void) cntxt; |
77 | |
78 | BAT *br, *bb; |
79 | |
80 | r = getArgReference_bat(stk, pci, 0); |
81 | b = getArgReference_bat(stk, pci, 1); |
82 | |
83 | if ((bb = BATdescriptor(*b)) == NULL) { |
84 | throw(MAL, "sample.subuniform" , INTERNAL_BAT_ACCESS); |
85 | } |
86 | |
87 | if (getArgType(mb, pci, 2) == TYPE_dbl) |
88 | { |
89 | dbl pr = *getArgReference_dbl(stk, pci, 2); |
90 | |
91 | if ( pr < 0.0 || pr > 1.0 ) { |
92 | BBPunfix(bb->batCacheid); |
93 | throw(MAL, "sample.subuniform" , ILLEGAL_ARGUMENT |
94 | " p should be between 0 and 1.0" ); |
95 | } else if (pr == 0) {/* special case */ |
96 | sample_size = 0; |
97 | // TODO: Add special case for pr == 1.0. |
98 | } else { |
99 | sample_size = (lng) (pr*(double)BATcount(bb)); |
100 | } |
101 | } else { |
102 | sample_size = *getArgReference_lng(stk, pci, 2); |
103 | } |
104 | |
105 | if (pci->argc == 4) { |
106 | seed = (unsigned) *getArgReference_int(stk, pci, 3); |
107 | br = BATsample_with_seed(bb, (BUN) sample_size, seed); |
108 | } |
109 | else { |
110 | br = BATsample(bb, (BUN) sample_size); |
111 | } |
112 | |
113 | BBPunfix(bb->batCacheid); |
114 | if (br == NULL) |
115 | throw(MAL, "sample.subuniform" , OPERATION_FAILED); |
116 | |
117 | BBPkeepref(*r = br->batCacheid); |
118 | return MAL_SUCCEED; |
119 | } |
120 | |