mkey.h source code [MonetDB/monetdb5/modules/mal/mkey.h]

1	/*
2	* This Source Code Form is subject to the terms of the Mozilla Public
3	* License, v. 2.0. If a copy of the MPL was not distributed with this
4	* file, You can obtain one at http://mozilla.org/MPL/2.0/.
5	*
6	* Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7	*/
8
9	/*
10	* @- The Problem
11	* When creating a join, we want to make a unique key of the attributes on both
12	* sides and then join these keys. Consider the following BATs.
13	*
14	* @verbatim
15	* orders customer link
16	* ==================== ===================== ===========
17	* zipcode h_nr zipcode hnr oid cid
18	* o1 13 9 c1 11 10 o1 c5
19	* o2 11 10 c2 11 11 o2 c1
20	* o3 11 11 c3 12 2 o3 c2
21	* o4 12 5 c4 12 1 o4 nil
22	* o5 11 10 c5 13 9 o5 c1
23	* o6 12 2 c6 14 7 o6 c3
24	* o7 13 9 o7 c5
25	* o8 12 1 o8 c4
26	* o9 13 9 o9 c5
27	* @end verbatim
28	*
29	* The current approach is designed to take minimal memory, as our previous
30	* solutions to the problem did not scale well. In case of singular keys,
31	* the link is executed by a simple join. Before going into the join, we
32	* make sure the end result size is not too large, which is done by looking
33	* at relation sizes (if the other key is unique) or, if that is not possible,
34	* by computing the exact join size.
35	*
36	* The join algorithm was also improved to do dynamic sampling to determine
37	* with high accuracy the join size, so that we can alloc in one go a memory
38	* region of sufficient size. This also reduces the ds\_link memory requirements.
39	*
40	* For compound keys, those that consist of multiple attributes, we now compute
41	* a derived column that contains an integer hash value derived from all
42	* key columns.
43	* This is done by computing a hash value for each individual key column
44	* and combining those by bitwise XOR and left-rotation. That is, for each
45	* column,we rotate the working hash value by N bits and XOR the hash value
46	* of the column over it. The working hash value is initialized with zero,
47	* and after all columns are processed, this working value is used as output.
48	* Computing the hash value for all columns in the key for one table is done
49	* by the command hash(). Hence, we do hash on both sides, and join
50	* that together with a simple join:
51	*
52	* @code{join(hash(keys), hash(keys.reverse);}
53	*
54	* One complication of this procedure are nil values:
55	* @table
56	* @itemize
57	* @item
58	* it may happen that the final hash-value (an int formed by a
59	* random bit pattern) accidentally has the value of int(nil).
60	* Notice that join never matches nil values.
61	* Hence these accidental nils must be replaced by a begin value (currently: 0).
62	* @item
63	* in case any of the compound key values is nil, our nil semantics
64	* require us that those tuples may never match on a join. Consequently,
65	* during the hash() processing of all compound key columns for computing
66	* the hash value, we also maintain a bit-bat that records which tuples had
67	* a nil value. The bit-bat is initialized to false, and the results of the
68	* nil-check on each column is OR-ed to it.
69	* Afterwards, the hash-value of all tuples that have this nil-bit set to
70	* TRUE are forced to int(nil), which will exclude them from matching.
71	* @end itemize
72	*
73	* Joining on hash values produces a @emph{superset} of the join result:
74	* it may happen that two different key combinations hash on the same value,
75	* which will make them match on the join (false hits). The final part
76	* of the ds\_link therefore consists of filtering out the false hits.
77	* This is done incrementally by joining back the join result to the original
78	* columns, incrementally one by one for each pair of corresponding
79	* columns. These values are compared with each other and we AND the
80	* result of this comparison together for each pair of columns.
81	* The bat containing these bits is initialized to all TRUE and serves as
82	* final result after all column pairs have been compared.
83	* The initial join result is finally filtered with this bit-bat.
84	*
85	* Joining back from the initial join-result to the original columns on
86	* both sides takes quite a lot of memory. For this reason, the false
87	* hit-filtering is done in slices (not all tuples at one time).
88	* In this way the memory requirements of this phase are kept low.
89	* In fact, the most memory demanding part of the join is the int-join
90	* on hash number, which takes N*24 bytes (where N= \|L\| = \|R\|).
91	* In comparison, the previous CTmultigroup/CTmultiderive approach
92	* took N*48 bytes. Additionally, by making it possible to use merge-sort,
93	* it avoids severe performance degradation (memory thrashing) as produced
94	* by the old ds\_link when the inner join relation would be larger than memory.
95	*
96	* If ds\_link performance is still an issue, the sort-merge join used here
97	* could be replaced by partitioned hash-join with radix-cluster/decluster.
98	*
99	* @+ Implementation
100	*/
101	#ifndef _MKEY_H
102	#define _MKEY_H
103
104	/#define _DEBUG_MKEY_ /
105
106	#include "mal.h"
107	#include "mal_interpreter.h"
108	#include "mal_exception.h"
109
110	mal_export str MKEYrotate(lng ret, const* lng v, const* int *nbits);
111	mal_export str MKEYhash(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr p);
112	mal_export str MKEYrotate_xor_hash(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr p);
113	mal_export str MKEYbulk_rotate_xor_hash(bat ret, const* bat hid, const* int nbits, const* bat *bid);
114	mal_export str MKEYbulkconst_rotate_xor_hash(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr p);
115	mal_export str MKEYconstbulk_rotate_xor_hash(bat ret, const* lng h, const* int nbits, const* bat *bid);
116	mal_export str MKEYbathash(bat res, const* bat *bid);
117
118	#endif /* _MKEY_H */
119

Browse the source code of MonetDB/monetdb5/modules/mal/mkey.h