1/*
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 *
6 * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7 */
8
9/*
10 * @- The Problem
11 * When creating a join, we want to make a unique key of the attributes on both
12 * sides and then join these keys. Consider the following BATs.
13 *
14 * @verbatim
15 * orders customer link
16 * ==================== ===================== ===========
17 * zipcode h_nr zipcode hnr oid cid
18 * o1 13 9 c1 11 10 o1 c5
19 * o2 11 10 c2 11 11 o2 c1
20 * o3 11 11 c3 12 2 o3 c2
21 * o4 12 5 c4 12 1 o4 nil
22 * o5 11 10 c5 13 9 o5 c1
23 * o6 12 2 c6 14 7 o6 c3
24 * o7 13 9 o7 c5
25 * o8 12 1 o8 c4
26 * o9 13 9 o9 c5
27 * @end verbatim
28 *
29 * The current approach is designed to take minimal memory, as our previous
30 * solutions to the problem did not scale well. In case of singular keys,
31 * the link is executed by a simple join. Before going into the join, we
32 * make sure the end result size is not too large, which is done by looking
33 * at relation sizes (if the other key is unique) or, if that is not possible,
34 * by computing the exact join size.
35 *
36 * The join algorithm was also improved to do dynamic sampling to determine
37 * with high accuracy the join size, so that we can alloc in one go a memory
38 * region of sufficient size. This also reduces the ds\_link memory requirements.
39 *
40 * For compound keys, those that consist of multiple attributes, we now compute
41 * a derived column that contains an integer hash value derived from all
42 * key columns.
43 * This is done by computing a hash value for each individual key column
44 * and combining those by bitwise XOR and left-rotation. That is, for each
45 * column,we rotate the working hash value by N bits and XOR the hash value
46 * of the column over it. The working hash value is initialized with zero,
47 * and after all columns are processed, this working value is used as output.
48 * Computing the hash value for all columns in the key for one table is done
49 * by the command hash(). Hence, we do hash on both sides, and join
50 * that together with a simple join:
51 *
52 * @code{join(hash(keys), hash(keys.reverse);}
53 *
54 * One complication of this procedure are nil values:
55 * @table
56 * @itemize
57 * @item
58 * it may happen that the final hash-value (an int formed by a
59 * random bit pattern) accidentally has the value of int(nil).
60 * Notice that join never matches nil values.
61 * Hence these accidental nils must be replaced by a begin value (currently: 0).
62 * @item
63 * in case any of the compound key values is nil, our nil semantics
64 * require us that those tuples may never match on a join. Consequently,
65 * during the hash() processing of all compound key columns for computing
66 * the hash value, we also maintain a bit-bat that records which tuples had
67 * a nil value. The bit-bat is initialized to false, and the results of the
68 * nil-check on each column is OR-ed to it.
69 * Afterwards, the hash-value of all tuples that have this nil-bit set to
70 * TRUE are forced to int(nil), which will exclude them from matching.
71 * @end itemize
72 *
73 * Joining on hash values produces a @emph{superset} of the join result:
74 * it may happen that two different key combinations hash on the same value,
75 * which will make them match on the join (false hits). The final part
76 * of the ds\_link therefore consists of filtering out the false hits.
77 * This is done incrementally by joining back the join result to the original
78 * columns, incrementally one by one for each pair of corresponding
79 * columns. These values are compared with each other and we AND the
80 * result of this comparison together for each pair of columns.
81 * The bat containing these bits is initialized to all TRUE and serves as
82 * final result after all column pairs have been compared.
83 * The initial join result is finally filtered with this bit-bat.
84 *
85 * Joining back from the initial join-result to the original columns on
86 * both sides takes quite a lot of memory. For this reason, the false
87 * hit-filtering is done in slices (not all tuples at one time).
88 * In this way the memory requirements of this phase are kept low.
89 * In fact, the most memory demanding part of the join is the int-join
90 * on hash number, which takes N*24 bytes (where N= |L| = |R|).
91 * In comparison, the previous CTmultigroup/CTmultiderive approach
92 * took N*48 bytes. Additionally, by making it possible to use merge-sort,
93 * it avoids severe performance degradation (memory thrashing) as produced
94 * by the old ds\_link when the inner join relation would be larger than memory.
95 *
96 * If ds\_link performance is still an issue, the sort-merge join used here
97 * could be replaced by partitioned hash-join with radix-cluster/decluster.
98 *
99 * @+ Implementation
100 */
101#ifndef _MKEY_H
102#define _MKEY_H
103
104/*#define _DEBUG_MKEY_ */
105
106#include "mal.h"
107#include "mal_interpreter.h"
108#include "mal_exception.h"
109
110mal_export str MKEYrotate(lng *ret, const lng *v, const int *nbits);
111mal_export str MKEYhash(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr p);
112mal_export str MKEYrotate_xor_hash(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr p);
113mal_export str MKEYbulk_rotate_xor_hash(bat *ret, const bat *hid, const int *nbits, const bat *bid);
114mal_export str MKEYbulkconst_rotate_xor_hash(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr p);
115mal_export str MKEYconstbulk_rotate_xor_hash(bat *ret, const lng *h, const int *nbits, const bat *bid);
116mal_export str MKEYbathash(bat *res, const bat *bid);
117
118#endif /* _MKEY_H */
119