Set.h source code [ClickHouse/dbms/src/Interpreters/Set.h]

1	#pragma once
2
3	#include <shared_mutex>
4	#include <Core/Block.h>
5	#include <DataStreams/SizeLimits.h>
6	#include <DataTypes/IDataType.h>
7	#include <Interpreters/SetVariants.h>
8	#include <Interpreters/Context.h>
9	#include <Parsers/IAST.h>
10	#include <Storages/MergeTree/BoolMask.h>
11
12	#include <common/logger_useful.h>
13
14
15	namespace DB
16	{
17
18	struct Range;
19	class FieldWithInfinity;
20
21	class IFunctionBase;
22	using FunctionBasePtr = std::shared_ptr<IFunctionBase>;
23
24
25	/* Data structure for implementation of IN expression.*
26	*/
27	class Set
28	{
29	public:
30	/// 'fill_set_elements': in addition to hash table
31	/// (that is useful only for checking that some value is in the set and may not store the original values),
32	/// store all set elements in explicit form.
33	/// This is needed for subsequent use for index.
34	Set(const SizeLimits & limits_, bool fill_set_elements_)
35	: log(&Logger::get("Set")),
36	limits (limits_), fill_set_elements(fill_set_elements_)
37	{
38	}
39
40	bool empty() const { return data.empty(); }
41
42	/* Set can be created either from AST or from a stream of data (subquery result).*
43	*/
44
45	/* Create a Set from expression (specified literally in the query).*
46	* 'types' - types of what are on the left hand side of IN.
47	* 'node' - list of values: 1, 2, 3 or list of tuples: (1, 2), (3, 4), (5, 6).
48	* 'fill_set_elements' - if true, fill vector of elements. For primary key to work.
49	*/
50	void createFromAST(const DataTypes & types, ASTPtr node, const Context & context);
51
52	/* Create a Set from stream.*
53	* Call setHeader, then call insertFromBlock for each block.
54	*/
55	void setHeader(const Block & header);
56
57	/// Returns false, if some limit was exceeded and no need to insert more data.
58	bool insertFromBlock(const Block & block);
59	/// Call after all blocks were inserted. To get the information that set is already created.
60	void finishInsert() { is_created = true; }
61
62	bool isCreated() const { return is_created; }
63
64	/* For columns of 'block', check belonging of corresponding rows to the set.*
65	* Return UInt8 column with the result.
66	*/
67	ColumnPtr execute(const Block & block, bool negative) const;
68
69	size_t getTotalRowCount() const { return data.getTotalRowCount(); }
70	size_t getTotalByteCount() const { return data.getTotalByteCount(); }
71
72	const DataTypes & getDataTypes() const { return data_types; }
73	const DataTypes & getElementsTypes() const { return set_elements_types; }
74
75	bool hasExplicitSetElements() const { return fill_set_elements; }
76	Columns getSetElements() const { return { set_elements.begin(), set_elements.end() }; }
77
78	void checkColumnsNumber(size_t num_key_columns) const;
79	void checkTypesEqual(size_t set_type_idx, const DataTypePtr & other_type) const;
80
81	private:
82	size_t keys_size = `0`;
83	Sizes key_sizes;
84
85	SetVariants data;
86
87	/* How IN works with Nullable types.*
88	*
89	* For simplicity reasons, all NULL values and any tuples with at least one NULL element are ignored in the Set.
90	* And for left hand side values, that are NULLs or contain any NULLs, we return 0 (means that element is not in Set).
91	*
92	* If we want more standard compliant behaviour, we must return NULL
93	* if lhs is NULL and set is not empty or if lhs is not in set, but set contains at least one NULL.
94	* It is more complicated with tuples.
95	* For example,
96	* (1, NULL, 2) IN ((1, NULL, 3)) must return 0,
97	* but (1, NULL, 2) IN ((1, 1111, 2)) must return NULL.
98	*
99	* We have not implemented such sophisticated behaviour.
100	*/
101
102	/* The data types from which the set was created.*
103	* When checking for belonging to a set, the types of columns to be checked must match with them.
104	*/
105	DataTypes data_types;
106
107	/// Types for set_elements.
108	DataTypes set_elements_types;
109
110	Logger * log;
111
112	/// Limitations on the maximum size of the set
113	SizeLimits limits;
114
115	/// Do we need to additionally store all elements of the set in explicit form for subsequent use for index.
116	bool fill_set_elements;
117
118	/// Check if set contains all the data.
119	bool is_created = false;
120
121	/// If in the left part columns contains the same types as the elements of the set.
122	void executeOrdinary(
123	const ColumnRawPtrs & key_columns,
124	ColumnUInt8::Container & vec_res,
125	bool negative,
126	const PaddedPODArray<UInt8> * null_map) const;
127
128	/// Collected elements of `Set`.
129	/// It is necessary for the index to work on the primary key in the IN statement.
130	std::vector<IColumn::WrappedPtr> set_elements;
131
132	/* Protects work with the set in the functions `insertFromBlock` and `execute`.*
133	* These functions can be called simultaneously from different threads only when using StorageSet,
134	* and StorageSet calls only these two functions.
135	* Therefore, the rest of the functions for working with set are not protected.
136	*/
137	mutable std::shared_mutex rwlock;
138
139	template <typename Method>
140	void insertFromBlockImpl(
141	Method & method,
142	const ColumnRawPtrs & key_columns,
143	size_t rows,
144	SetVariants & variants,
145	ConstNullMapPtr null_map,
146	ColumnUInt8::Container * out_filter);
147
148	template <typename Method, bool has_null_map, bool build_filter>
149	void insertFromBlockImplCase(
150	Method & method,
151	const ColumnRawPtrs & key_columns,
152	size_t rows,
153	SetVariants & variants,
154	ConstNullMapPtr null_map,
155	ColumnUInt8::Container * out_filter);
156
157	template <typename Method>
158	void executeImpl(
159	Method & method,
160	const ColumnRawPtrs & key_columns,
161	ColumnUInt8::Container & vec_res,
162	bool negative,
163	size_t rows,
164	ConstNullMapPtr null_map) const;
165
166	template <typename Method, bool has_null_map>
167	void executeImplCase(
168	Method & method,
169	const ColumnRawPtrs & key_columns,
170	ColumnUInt8::Container & vec_res,
171	bool negative,
172	size_t rows,
173	ConstNullMapPtr null_map) const;
174	};
175
176	using SetPtr = std::shared_ptr<Set>;
177	using ConstSetPtr = std::shared_ptr<const Set>;
178	using Sets = std::vector<SetPtr>;
179
180
181	class IFunction;
182	using FunctionPtr = std::shared_ptr<IFunction>;
183
184	/// Class for mayBeTrueInRange function.
185	class MergeTreeSetIndex
186	{
187	public:
188	/* Mapping for tuple positions from Set::set_elements to*
189	* position of pk index and functions chain applied to this column.
190	*/
191	struct KeyTuplePositionMapping
192	{
193	size_t tuple_index;
194	size_t key_index;
195	std::vector<FunctionBasePtr> functions;
196	};
197
198	MergeTreeSetIndex(const Columns & set_elements, std::vector<KeyTuplePositionMapping> && indexes_mapping_);
199
200	size_t size() const { return ordered_set.at(`0`)->size(); }
201
202	BoolMask mayBeTrueInRange(const std::vector<Range> & key_ranges, const DataTypes & data_types);
203
204	private:
205	Columns ordered_set;
206	std::vector<KeyTuplePositionMapping> indexes_mapping;
207	};
208
209	}
210

Browse the source code of ClickHouse/dbms/src/Interpreters/Set.h