| 1 | /* |
| 2 | * This Source Code Form is subject to the terms of the Mozilla Public |
| 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
| 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 5 | * |
| 6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
| 7 | */ |
| 8 | |
| 9 | /* |
| 10 | * @a Lefteris Sidirourgos |
| 11 | * @d 30/08/2011 |
| 12 | * @+ The sampling facilities |
| 13 | * |
| 14 | * In the context of the SciBORQ project, we introduce a number of sampling |
| 15 | * techniques in the MonetDB software stack. Our goal is to provide methods |
| 16 | * for performing sampling (uniform and weighted) over a) the result of a |
| 17 | * query, b) the base tables, and c) the entire database schema. Sampling |
| 18 | * can be performed during query execution, as well as during data loading in |
| 19 | * the case of predefined sampling indexes. In addition to the sampling |
| 20 | * methods, a number of query plan optimisations for sampling are introduced on |
| 21 | * the SQL and MAL level. |
| 22 | * |
| 23 | * Besides the sampling methods, SciBORQ also aims at multi-layered bounded |
| 24 | * query execution. That is steering query execution over many layers of |
| 25 | * samples with different size in order to achieve either strict error bounds |
| 26 | * or limited execution time. For more details see the SciBORQ module. |
| 27 | * |
| 28 | * In the following, details are presented on the implementation and the usage |
| 29 | * of each sampling method. |
| 30 | */ |
| 31 | |
| 32 | #include "monetdb_config.h" |
| 33 | #include "gdk.h" |
| 34 | #include "mal_exception.h" |
| 35 | #include "sample.h" |
| 36 | // TODO: Go through this documentation and update it with an explanation about seeds. |
| 37 | /* |
| 38 | * @- Uniform Sampling. |
| 39 | * |
| 40 | * A new SQL operator has been added to support sampling the result of a query. |
| 41 | * The syntax for sampling is: |
| 42 | * SELECT ... FROM ... WHERE ... SAMPLE s |
| 43 | * |
| 44 | * where s if is an integer greater than 1, it defines the number of rows to be |
| 45 | * in the sample. If s is a double between [0.0,1.0] the it refers to the |
| 46 | * percentage of the result to be sampled. That is if s=0.3 then the sample |
| 47 | * will be 30% the size of the query result. |
| 48 | * |
| 49 | * SAMPLE is been treated as LIMIT, ORDER BY, etc., that means that it can only |
| 50 | * be in the outer most SELECT clause, i.e., SAMPLE cannot appear in a |
| 51 | * subquery. However, if this is needed, then one may define a function, for |
| 52 | * example |
| 53 | * |
| 54 | * CREATE FUNCTION mysample () |
| 55 | * RETURNS TABLE(col a,...) |
| 56 | * BEGIN |
| 57 | * RETURN |
| 58 | * SELECT a,... |
| 59 | * FROM name_table |
| 60 | * SAMPLE 100; |
| 61 | * end; |
| 62 | * |
| 63 | * and then use function mysample() for example to populate a new table with |
| 64 | * the sample. E.g., |
| 65 | * |
| 66 | * INSERT INTO sample_table (SELECT * FROM mysample()); |
| 67 | * |
| 68 | */ |
| 69 | |
| 70 | str |
| 71 | SAMPLEuniform(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) { |
| 72 | |
| 73 | bat *r, *b; |
| 74 | lng sample_size; |
| 75 | unsigned seed; |
| 76 | (void) cntxt; |
| 77 | |
| 78 | BAT *br, *bb; |
| 79 | |
| 80 | r = getArgReference_bat(stk, pci, 0); |
| 81 | b = getArgReference_bat(stk, pci, 1); |
| 82 | |
| 83 | if ((bb = BATdescriptor(*b)) == NULL) { |
| 84 | throw(MAL, "sample.subuniform" , INTERNAL_BAT_ACCESS); |
| 85 | } |
| 86 | |
| 87 | if (getArgType(mb, pci, 2) == TYPE_dbl) |
| 88 | { |
| 89 | dbl pr = *getArgReference_dbl(stk, pci, 2); |
| 90 | |
| 91 | if ( pr < 0.0 || pr > 1.0 ) { |
| 92 | BBPunfix(bb->batCacheid); |
| 93 | throw(MAL, "sample.subuniform" , ILLEGAL_ARGUMENT |
| 94 | " p should be between 0 and 1.0" ); |
| 95 | } else if (pr == 0) {/* special case */ |
| 96 | sample_size = 0; |
| 97 | // TODO: Add special case for pr == 1.0. |
| 98 | } else { |
| 99 | sample_size = (lng) (pr*(double)BATcount(bb)); |
| 100 | } |
| 101 | } else { |
| 102 | sample_size = *getArgReference_lng(stk, pci, 2); |
| 103 | } |
| 104 | |
| 105 | if (pci->argc == 4) { |
| 106 | seed = (unsigned) *getArgReference_int(stk, pci, 3); |
| 107 | br = BATsample_with_seed(bb, (BUN) sample_size, seed); |
| 108 | } |
| 109 | else { |
| 110 | br = BATsample(bb, (BUN) sample_size); |
| 111 | } |
| 112 | |
| 113 | BBPunfix(bb->batCacheid); |
| 114 | if (br == NULL) |
| 115 | throw(MAL, "sample.subuniform" , OPERATION_FAILED); |
| 116 | |
| 117 | BBPkeepref(*r = br->batCacheid); |
| 118 | return MAL_SUCCEED; |
| 119 | } |
| 120 | |