SummingSortedBlockInputStream.cpp source code [ClickHouse/dbms/src/DataStreams/SummingSortedBlockInputStream.cpp]

1	#include <DataStreams/SummingSortedBlockInputStream.h>
2	#include <DataTypes/DataTypesNumber.h>
3	#include <DataTypes/NestedUtils.h>
4	#include <DataTypes/DataTypeTuple.h>
5	#include <DataTypes/DataTypeArray.h>
6	#include <DataTypes/DataTypeAggregateFunction.h>
7	#include <Columns/ColumnAggregateFunction.h>
8	#include <Columns/ColumnTuple.h>
9	#include <Common/StringUtils/StringUtils.h>
10	#include <Common/FieldVisitors.h>
11	#include <common/logger_useful.h>
12	#include <Common/typeid_cast.h>
13	#include <Common/assert_cast.h>
14
15	#include <AggregateFunctions/AggregateFunctionFactory.h>
16	#include <Functions/FunctionFactory.h>
17	#include <Functions/FunctionHelpers.h>
18	#include <Interpreters/Context.h>
19
20
21	namespace DB
22	{
23
24	namespace ErrorCodes
25	{
26	extern const int LOGICAL_ERROR;
27	}
28
29
30	namespace
31	{
32	bool isInPrimaryKey(const SortDescription & description, const std::string & name, const size_t number)
33	{
34	for (auto & desc : description)
35	if (desc.column_name == name \|\| (desc.column_name.empty() && desc.column_number == number))
36	return true;
37
38	return false;
39	}
40	}
41
42
43	SummingSortedBlockInputStream::SummingSortedBlockInputStream(
44	const BlockInputStreams & inputs_,
45	const SortDescription & description_,
46	/// List of columns to be summed. If empty, all numeric columns that are not in the description are taken.
47	const Names & column_names_to_sum,
48	size_t max_block_size_)
49	: MergingSortedBlockInputStream (inputs_, description_, max_block_size_)
50	{
51	current_row.resize(num_columns);
52
53	/// name of nested structure -> the column numbers that refer to it.
54	std::unordered_map<std::string, std::vector<size_t>> discovered_maps;
55
56	/* Fill in the column numbers, which must be summed.*
57	* This can only be numeric columns that are not part of the sort key.
58	* If a non-empty column_names_to_sum is specified, then we only take these columns.
59	* Some columns from column_names_to_sum may not be found. This is ignored.
60	*/
61	for (size_t i = `0`; i < num_columns; ++i)
62	{
63	const ColumnWithTypeAndName & column = header.safeGetByPosition(i);
64
65	/// Discover nested Maps and find columns for summation
66	if (typeid_cast<const DataTypeArray *>(column.type.get()))
67	{
68	const auto map_name = Nested::extractTableName(column.name);
69	/// if nested table name ends with `Map` it is a possible candidate for special handling
70	if (map_name == column.name \|\| !endsWith(map_name, "Map"))
71	{
72	column_numbers_not_to_aggregate.push_back(i);
73	continue;
74	}
75
76	discovered_maps [map_name].emplace_back(i);
77	}
78	else
79	{
80	bool is_agg_func = WhichDataType (column.type).isAggregateFunction();
81
82	/// There are special const columns for example after prewere sections.
83	if ((!column.type ->isSummable() && !is_agg_func) \|\| isColumnConst(*column.column))
84	{
85	column_numbers_not_to_aggregate.push_back(i);
86	continue;
87	}
88
89	/// Are they inside the PK?
90	if (isInPrimaryKey(description, column.name, i))
91	{
92	column_numbers_not_to_aggregate.push_back(i);
93	continue;
94	}
95
96	if (column_names_to_sum.empty()
97	\|\| column_names_to_sum.end() !=
98	std::find(column_names_to_sum.begin(), column_names_to_sum.end(), column.name))
99	{
100	// Create aggregator to sum this column
101	AggregateDescription desc;
102	desc.is_agg_func_type = is_agg_func;
103	desc.column_numbers = {i};
104
105	if (!is_agg_func)
106	{
107	desc.init("sumWithOverflow", {column.type});
108	}
109
110	columns_to_aggregate.emplace_back(std::move(desc));
111	}
112	else
113	{
114	// Column is not going to be summed, use last value
115	column_numbers_not_to_aggregate.push_back(i);
116	}
117	}
118	}
119
120	/// select actual nested Maps from list of candidates
121	for (const auto & map : discovered_maps)
122	{
123	/// map should contain at least two elements (key -> value)
124	if (map.second.size() < `2`)
125	{
126	for (auto col : map.second)
127	column_numbers_not_to_aggregate.push_back(col);
128	continue;
129	}
130
131	/// no elements of map could be in primary key
132	auto column_num_it = map.second.begin();
133	for (; column_num_it != map.second.end(); ++column_num_it)
134	if (isInPrimaryKey(description, header.safeGetByPosition(column_num_it).name, column_num_it))
135	break;
136	if (column_num_it != map.second.end())
137	{
138	for (auto col : map.second)
139	column_numbers_not_to_aggregate.push_back(col);
140	continue;
141	}
142
143	DataTypes argument_types;
144	AggregateDescription desc;
145	MapDescription map_desc;
146
147	column_num_it = map.second.begin();
148	for (; column_num_it != map.second.end(); ++column_num_it)
149	{
150	const ColumnWithTypeAndName & key_col = header.safeGetByPosition(*column_num_it);
151	const String & name = key_col.name;
152	const IDataType & nested_type = *static_cast<const DataTypeArray *>(key_col.type.get())->getNestedType();
153
154	if (column_num_it == map.second.begin()
155	\|\| endsWith(name, "ID")
156	\|\| endsWith(name, "Key")
157	\|\| endsWith(name, "Type"))
158	{
159	if (!nested_type.isValueRepresentedByInteger())
160	break;
161
162	map_desc.key_col_nums.push_back(*column_num_it);
163	}
164	else
165	{
166	if (!nested_type.isSummable())
167	break;
168
169	map_desc.val_col_nums.push_back(*column_num_it);
170	}
171
172	// Add column to function arguments
173	desc.column_numbers.push_back(*column_num_it);
174	argument_types.push_back(key_col.type);
175	}
176
177	if (column_num_it != map.second.end())
178	{
179	for (auto col : map.second)
180	column_numbers_not_to_aggregate.push_back(col);
181	continue;
182	}
183
184	if (map_desc.key_col_nums.size() == `1`)
185	{
186	// Create summation for all value columns in the map
187	desc.init("sumMapWithOverflow", argument_types);
188	columns_to_aggregate.emplace_back(std::move(desc));
189	}
190	else
191	{
192	// Fall back to legacy mergeMaps for composite keys
193	for (auto col : map.second)
194	column_numbers_not_to_aggregate.push_back(col);
195	maps_to_sum.emplace_back(std::move(map_desc));
196	}
197	}
198	}
199
200
201	void SummingSortedBlockInputStream::insertCurrentRowIfNeeded(MutableColumns & merged_columns)
202	{
203	/// We have nothing to aggregate. It means that it could be non-zero, because we have columns_not_to_aggregate.
204	if (columns_to_aggregate.empty())
205	current_row_is_zero = false;
206
207	for (auto & desc : columns_to_aggregate)
208	{
209	// Do not insert if the aggregation state hasn't been created
210	if (desc.created)
211	{
212	if (desc.is_agg_func_type)
213	{
214	current_row_is_zero = false;
215	}
216	else
217	{
218	try
219	{
220	desc.function ->insertResultInto(desc.state.data(), *desc.merged_column);
221
222	/// Update zero status of current row
223	if (desc.column_numbers.size() == `1`)
224	{
225	// Flag row as non-empty if at least one column number if non-zero
226	current_row_is_zero = current_row_is_zero && desc.merged_column ->isDefaultAt(desc.merged_column ->size() - `1`);
227	}
228	else
229	{
230	/// It is sumMapWithOverflow aggregate function.
231	/// Assume that the row isn't empty in this case (just because it is compatible with previous version)
232	current_row_is_zero = false;
233	}
234	}
235	catch (...)
236	{
237	desc.destroyState();
238	throw;
239	}
240	}
241	desc.destroyState();
242	}
243	else
244	desc.merged_column ->insertDefault();
245	}
246
247	/// If it is "zero" row, then rollback the insertion
248	/// (at this moment we need rollback only cols from columns_to_aggregate)
249	if (current_row_is_zero)
250	{
251	for (auto & desc : columns_to_aggregate)
252	desc.merged_column ->popBack(`1`);
253
254	return;
255	}
256
257	for (auto i : column_numbers_not_to_aggregate)
258	merged_columns [i]->insert(current_row [i]);
259
260	/// Update per-block and per-group flags
261	++merged_rows;
262	}
263
264
265	Block SummingSortedBlockInputStream::readImpl()
266	{
267	if (finished)
268	return Block ();
269
270	MutableColumns merged_columns;
271	init(merged_columns);
272
273	if (has_collation)
274	throw Exception ("Logical error: " + getName() + " does not support collations", ErrorCodes::LOGICAL_ERROR);
275
276	if (merged_columns.empty())
277	return {};
278
279	/// Update aggregation result columns for current block
280	for (auto & desc : columns_to_aggregate)
281	{
282	// Wrap aggregated columns in a tuple to match function signature
283	if (!desc.is_agg_func_type && isTuple(desc.function ->getReturnType()))
284	{
285	size_t tuple_size = desc.column_numbers.size();
286	MutableColumns tuple_columns(tuple_size);
287	for (size_t i = `0`; i < tuple_size; ++i)
288	tuple_columns [i] = header.safeGetByPosition(desc.column_numbers [i]).column ->cloneEmpty();
289
290	desc.merged_column = ColumnTuple::create(std::move(tuple_columns));
291	}
292	else
293	desc.merged_column = header.safeGetByPosition(desc.column_numbers [`0`]).column ->cloneEmpty();
294	}
295
296	merge(merged_columns, queue_without_collation);
297	Block res = header.cloneWithColumns(std::move(merged_columns));
298
299	/// Place aggregation results into block.
300	for (auto & desc : columns_to_aggregate)
301	{
302	if (!desc.is_agg_func_type && isTuple(desc.function ->getReturnType()))
303	{
304	/// Unpack tuple into block.
305	size_t tuple_size = desc.column_numbers.size();
306	for (size_t i = `0`; i < tuple_size; ++i)
307	res.getByPosition(desc.column_numbers [i]).column = assert_cast<const ColumnTuple &>(*desc.merged_column).getColumnPtr(i);
308	}
309	else
310	res.getByPosition(desc.column_numbers [`0`]).column = std::move(desc.merged_column);
311	}
312
313	return res;
314	}
315
316
317	void SummingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::priority_queue<SortCursor> & queue)
318	{
319	merged_rows = `0`;
320
321	/// Take the rows in needed order and put them in `merged_columns` until rows no more than `max_block_size`
322	while (!queue.empty())
323	{
324	SortCursor current = queue.top();
325
326	setPrimaryKeyRef(next_key, current);
327
328	bool key_differs;
329
330	if (current_key.empty()) /// The first key encountered.
331	{
332	key_differs = true;
333	current_row_is_zero = true;
334	}
335	else
336	key_differs = next_key != current_key;
337
338	if (key_differs)
339	{
340	if (!current_key.empty())
341	/// Write the data for the previous group.
342	insertCurrentRowIfNeeded(merged_columns);
343
344	if (merged_rows >= max_block_size)
345	{
346	/// The block is now full and the last row is calculated completely.
347	current_key.reset();
348	return;
349	}
350
351	current_key.swap(next_key);
352
353	setRow(current_row, current);
354
355	/// Reset aggregation states for next row
356	for (auto & desc : columns_to_aggregate)
357	desc.createState();
358
359	// Start aggregations with current row
360	addRow(current);
361
362	if (maps_to_sum.empty())
363	{
364	/// We have only columns_to_aggregate. The status of current row will be determined
365	/// in 'insertCurrentRowIfNeeded' method on the values of aggregate functions.
366	current_row_is_zero = true;
367	}
368	else
369	{
370	/// We have complex maps that will be summed with 'mergeMap' method.
371	/// The single row is considered non zero, and the status after merging with other rows
372	/// will be determined in the branch below (when key_differs == false).
373	current_row_is_zero = false;
374	}
375	}
376	else
377	{
378	addRow(current);
379
380	// Merge maps only for same rows
381	for (const auto & desc : maps_to_sum)
382	if (mergeMap(desc, current_row, current))
383	current_row_is_zero = false;
384	}
385
386	queue.pop();
387
388	if (!current ->isLast())
389	{
390	current ->next();
391	queue.push(current);
392	}
393	else
394	{
395	/// We get the next block from the corresponding source, if there is one.
396	fetchNextBlock(current, queue);
397	}
398	}
399
400	/// We will write the data for the last group, if it is non-zero.
401	/// If it is zero, and without it the output stream will be empty, we will write it anyway.
402	insertCurrentRowIfNeeded(merged_columns);
403	finished = true;
404	}
405
406
407	bool SummingSortedBlockInputStream::mergeMap(const MapDescription & desc, Row & row, SortCursor & cursor)
408	{
409	/// Strongly non-optimal.
410
411	Row & left = row;
412	Row right(left.size());
413
414	for (size_t col_num : desc.key_col_nums)
415	right [col_num] = (*cursor ->all_columns [col_num])[cursor ->pos].template get<Array>();
416
417	for (size_t col_num : desc.val_col_nums)
418	right [col_num] = (*cursor ->all_columns [col_num])[cursor ->pos].template get<Array>();
419
420	auto at_ith_column_jth_row = [&](const Row & matrix, size_t i, size_t j) -> const Field &
421	{
422	return matrix [i].get<Array>()[j];
423	};
424
425	auto tuple_of_nth_columns_at_jth_row = [&](const Row & matrix, const ColumnNumbers & col_nums, size_t j) -> Array
426	{
427	size_t size = col_nums.size();
428	Array res(size);
429	for (size_t col_num_index = `0`; col_num_index < size; ++col_num_index)
430	res [col_num_index] = at_ith_column_jth_row (matrix, col_nums [col_num_index], j);
431	return res;
432	};
433
434	std::map<Array, Array> merged;
435
436	auto accumulate = [](Array & dst, const Array & src)
437	{
438	bool has_non_zero = false;
439	size_t size = dst.size();
440	for (size_t i = `0`; i < size; ++i)
441	if (applyVisitor(FieldVisitorSum (src [i]), dst [i]))
442	has_non_zero = true;
443	return has_non_zero;
444	};
445
446	auto merge = [&](const Row & matrix)
447	{
448	size_t rows = matrix [desc.key_col_nums [`0`]].get<Array>().size();
449
450	for (size_t j = `0`; j < rows; ++j)
451	{
452	Array key = tuple_of_nth_columns_at_jth_row (matrix, desc.key_col_nums, j);
453	Array value = tuple_of_nth_columns_at_jth_row (matrix, desc.val_col_nums, j);
454
455	auto it = merged.find(key);
456	if (merged.end() == it)
457	merged.emplace(std::move(key), std::move(value));
458	else
459	{
460	if (!accumulate (it ->second, value))
461	merged.erase(it);
462	}
463	}
464	};
465
466	merge (left);
467	merge (right);
468
469	for (size_t col_num : desc.key_col_nums)
470	row [col_num] = Array (merged.size());
471	for (size_t col_num : desc.val_col_nums)
472	row [col_num] = Array (merged.size());
473
474	size_t row_num = `0`;
475	for (const auto & key_value : merged)
476	{
477	for (size_t col_num_index = `0`, size = desc.key_col_nums.size(); col_num_index < size; ++col_num_index)
478	row [desc.key_col_nums [col_num_index]].get<Array>()[row_num] = key_value.first [col_num_index];
479
480	for (size_t col_num_index = `0`, size = desc.val_col_nums.size(); col_num_index < size; ++col_num_index)
481	row [desc.val_col_nums [col_num_index]].get<Array>()[row_num] = key_value.second [col_num_index];
482
483	++row_num;
484	}
485
486	return row_num != `0`;
487	}
488
489
490	void SummingSortedBlockInputStream::addRow(SortCursor & cursor)
491	{
492	for (auto & desc : columns_to_aggregate)
493	{
494	if (!desc.created)
495	throw Exception ("Logical error in SummingSortedBlockInputStream, there are no description", ErrorCodes::LOGICAL_ERROR);
496
497	if (desc.is_agg_func_type)
498	{
499	// desc.state is not used for AggregateFunction types
500	auto & col = cursor ->all_columns [desc.column_numbers [`0`]];
501	assert_cast<ColumnAggregateFunction &>(desc.merged_column).insertMergeFrom(col, cursor ->pos);
502	}
503	else
504	{
505	// Specialized case for unary functions
506	if (desc.column_numbers.size() == `1`)
507	{
508	auto & col = cursor ->all_columns [desc.column_numbers [`0`]];
509	desc.add_function(desc.function.get(), desc.state.data(), &col, cursor ->pos, nullptr);
510	}
511	else
512	{
513	// Gather all source columns into a vector
514	ColumnRawPtrs columns(desc.column_numbers.size());
515	for (size_t i = `0`; i < desc.column_numbers.size(); ++i)
516	columns [i] = cursor ->all_columns [desc.column_numbers [i]];
517
518	desc.add_function(desc.function.get(), desc.state.data(), columns.data(), cursor ->pos, nullptr);
519	}
520	}
521	}
522	}
523
524	}
525

Browse the source code of ClickHouse/dbms/src/DataStreams/SummingSortedBlockInputStream.cpp