1 | #include <Common/Exception.h> |
2 | #include <Common/FieldVisitors.h> |
3 | |
4 | #include <Core/Block.h> |
5 | |
6 | #include <IO/WriteBufferFromString.h> |
7 | #include <IO/Operators.h> |
8 | |
9 | #include <Common/typeid_cast.h> |
10 | #include <Common/assert_cast.h> |
11 | |
12 | #include <Columns/ColumnConst.h> |
13 | |
14 | #include <iterator> |
15 | #include <memory> |
16 | |
17 | |
18 | namespace DB |
19 | { |
20 | |
21 | namespace ErrorCodes |
22 | { |
23 | extern const int POSITION_OUT_OF_BOUND; |
24 | extern const int NOT_FOUND_COLUMN_IN_BLOCK; |
25 | extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; |
26 | extern const int BLOCKS_HAVE_DIFFERENT_STRUCTURE; |
27 | } |
28 | |
29 | |
30 | Block::Block(std::initializer_list<ColumnWithTypeAndName> il) : data{il} |
31 | { |
32 | initializeIndexByName(); |
33 | } |
34 | |
35 | |
36 | Block::Block(const ColumnsWithTypeAndName & data_) : data{data_} |
37 | { |
38 | initializeIndexByName(); |
39 | } |
40 | |
41 | |
42 | void Block::initializeIndexByName() |
43 | { |
44 | for (size_t i = 0, size = data.size(); i < size; ++i) |
45 | index_by_name[data[i].name] = i; |
46 | } |
47 | |
48 | |
49 | void Block::insert(size_t position, const ColumnWithTypeAndName & elem) |
50 | { |
51 | if (position > data.size()) |
52 | throw Exception("Position out of bound in Block::insert(), max position = " |
53 | + toString(data.size()), ErrorCodes::POSITION_OUT_OF_BOUND); |
54 | |
55 | for (auto & name_pos : index_by_name) |
56 | if (name_pos.second >= position) |
57 | ++name_pos.second; |
58 | |
59 | index_by_name.emplace(elem.name, position); |
60 | data.emplace(data.begin() + position, elem); |
61 | } |
62 | |
63 | void Block::insert(size_t position, ColumnWithTypeAndName && elem) |
64 | { |
65 | if (position > data.size()) |
66 | throw Exception("Position out of bound in Block::insert(), max position = " |
67 | + toString(data.size()), ErrorCodes::POSITION_OUT_OF_BOUND); |
68 | |
69 | for (auto & name_pos : index_by_name) |
70 | if (name_pos.second >= position) |
71 | ++name_pos.second; |
72 | |
73 | index_by_name.emplace(elem.name, position); |
74 | data.emplace(data.begin() + position, std::move(elem)); |
75 | } |
76 | |
77 | |
78 | void Block::insert(const ColumnWithTypeAndName & elem) |
79 | { |
80 | index_by_name.emplace(elem.name, data.size()); |
81 | data.emplace_back(elem); |
82 | } |
83 | |
84 | void Block::insert(ColumnWithTypeAndName && elem) |
85 | { |
86 | index_by_name.emplace(elem.name, data.size()); |
87 | data.emplace_back(std::move(elem)); |
88 | } |
89 | |
90 | |
91 | void Block::insertUnique(const ColumnWithTypeAndName & elem) |
92 | { |
93 | if (index_by_name.end() == index_by_name.find(elem.name)) |
94 | insert(elem); |
95 | } |
96 | |
97 | void Block::insertUnique(ColumnWithTypeAndName && elem) |
98 | { |
99 | if (index_by_name.end() == index_by_name.find(elem.name)) |
100 | insert(std::move(elem)); |
101 | } |
102 | |
103 | |
104 | void Block::erase(const std::set<size_t> & positions) |
105 | { |
106 | for (auto it = positions.rbegin(); it != positions.rend(); ++it) |
107 | erase(*it); |
108 | } |
109 | |
110 | |
111 | void Block::erase(size_t position) |
112 | { |
113 | if (data.empty()) |
114 | throw Exception("Block is empty" , ErrorCodes::POSITION_OUT_OF_BOUND); |
115 | |
116 | if (position >= data.size()) |
117 | throw Exception("Position out of bound in Block::erase(), max position = " |
118 | + toString(data.size() - 1), ErrorCodes::POSITION_OUT_OF_BOUND); |
119 | |
120 | eraseImpl(position); |
121 | } |
122 | |
123 | |
124 | void Block::eraseImpl(size_t position) |
125 | { |
126 | data.erase(data.begin() + position); |
127 | |
128 | for (auto it = index_by_name.begin(); it != index_by_name.end();) |
129 | { |
130 | if (it->second == position) |
131 | index_by_name.erase(it++); |
132 | else |
133 | { |
134 | if (it->second > position) |
135 | --it->second; |
136 | ++it; |
137 | } |
138 | } |
139 | } |
140 | |
141 | |
142 | void Block::erase(const String & name) |
143 | { |
144 | auto index_it = index_by_name.find(name); |
145 | if (index_it == index_by_name.end()) |
146 | throw Exception("No such name in Block::erase(): '" |
147 | + name + "'" , ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); |
148 | |
149 | eraseImpl(index_it->second); |
150 | } |
151 | |
152 | |
153 | ColumnWithTypeAndName & Block::safeGetByPosition(size_t position) |
154 | { |
155 | if (data.empty()) |
156 | throw Exception("Block is empty" , ErrorCodes::POSITION_OUT_OF_BOUND); |
157 | |
158 | if (position >= data.size()) |
159 | throw Exception("Position " + toString(position) |
160 | + " is out of bound in Block::safeGetByPosition(), max position = " |
161 | + toString(data.size() - 1) |
162 | + ", there are columns: " + dumpNames(), ErrorCodes::POSITION_OUT_OF_BOUND); |
163 | |
164 | return data[position]; |
165 | } |
166 | |
167 | |
168 | const ColumnWithTypeAndName & Block::safeGetByPosition(size_t position) const |
169 | { |
170 | if (data.empty()) |
171 | throw Exception("Block is empty" , ErrorCodes::POSITION_OUT_OF_BOUND); |
172 | |
173 | if (position >= data.size()) |
174 | throw Exception("Position " + toString(position) |
175 | + " is out of bound in Block::safeGetByPosition(), max position = " |
176 | + toString(data.size() - 1) |
177 | + ", there are columns: " + dumpNames(), ErrorCodes::POSITION_OUT_OF_BOUND); |
178 | |
179 | return data[position]; |
180 | } |
181 | |
182 | |
183 | ColumnWithTypeAndName & Block::getByName(const std::string & name) |
184 | { |
185 | auto it = index_by_name.find(name); |
186 | if (index_by_name.end() == it) |
187 | throw Exception("Not found column " + name + " in block. There are only columns: " + dumpNames() |
188 | , ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); |
189 | |
190 | return data[it->second]; |
191 | } |
192 | |
193 | |
194 | const ColumnWithTypeAndName & Block::getByName(const std::string & name) const |
195 | { |
196 | auto it = index_by_name.find(name); |
197 | if (index_by_name.end() == it) |
198 | throw Exception("Not found column " + name + " in block. There are only columns: " + dumpNames() |
199 | , ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); |
200 | |
201 | return data[it->second]; |
202 | } |
203 | |
204 | |
205 | bool Block::has(const std::string & name) const |
206 | { |
207 | return index_by_name.end() != index_by_name.find(name); |
208 | } |
209 | |
210 | |
211 | size_t Block::getPositionByName(const std::string & name) const |
212 | { |
213 | auto it = index_by_name.find(name); |
214 | if (index_by_name.end() == it) |
215 | throw Exception("Not found column " + name + " in block. There are only columns: " + dumpNames() |
216 | , ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); |
217 | |
218 | return it->second; |
219 | } |
220 | |
221 | |
222 | void Block::checkNumberOfRows(bool allow_null_columns) const |
223 | { |
224 | ssize_t rows = -1; |
225 | for (const auto & elem : data) |
226 | { |
227 | if (!elem.column && allow_null_columns) |
228 | continue; |
229 | |
230 | if (!elem.column) |
231 | throw Exception("Column " + elem.name + " in block is nullptr, in method checkNumberOfRows." |
232 | , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
233 | |
234 | ssize_t size = elem.column->size(); |
235 | |
236 | if (rows == -1) |
237 | rows = size; |
238 | else if (rows != size) |
239 | throw Exception("Sizes of columns doesn't match: " |
240 | + data.front().name + ": " + toString(rows) |
241 | + ", " + elem.name + ": " + toString(size) |
242 | , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
243 | } |
244 | } |
245 | |
246 | |
247 | size_t Block::rows() const |
248 | { |
249 | for (const auto & elem : data) |
250 | if (elem.column) |
251 | return elem.column->size(); |
252 | |
253 | return 0; |
254 | } |
255 | |
256 | |
257 | size_t Block::bytes() const |
258 | { |
259 | size_t res = 0; |
260 | for (const auto & elem : data) |
261 | res += elem.column->byteSize(); |
262 | |
263 | return res; |
264 | } |
265 | |
266 | size_t Block::allocatedBytes() const |
267 | { |
268 | size_t res = 0; |
269 | for (const auto & elem : data) |
270 | res += elem.column->allocatedBytes(); |
271 | |
272 | return res; |
273 | } |
274 | |
275 | std::string Block::dumpNames() const |
276 | { |
277 | WriteBufferFromOwnString out; |
278 | for (auto it = data.begin(); it != data.end(); ++it) |
279 | { |
280 | if (it != data.begin()) |
281 | out << ", " ; |
282 | out << it->name; |
283 | } |
284 | return out.str(); |
285 | } |
286 | |
287 | |
288 | std::string Block::dumpStructure() const |
289 | { |
290 | WriteBufferFromOwnString out; |
291 | for (auto it = data.begin(); it != data.end(); ++it) |
292 | { |
293 | if (it != data.begin()) |
294 | out << ", " ; |
295 | it->dumpStructure(out); |
296 | } |
297 | return out.str(); |
298 | } |
299 | |
300 | |
301 | Block Block::cloneEmpty() const |
302 | { |
303 | Block res; |
304 | |
305 | for (const auto & elem : data) |
306 | res.insert(elem.cloneEmpty()); |
307 | |
308 | return res; |
309 | } |
310 | |
311 | |
312 | MutableColumns Block::cloneEmptyColumns() const |
313 | { |
314 | size_t num_columns = data.size(); |
315 | MutableColumns columns(num_columns); |
316 | for (size_t i = 0; i < num_columns; ++i) |
317 | columns[i] = data[i].column ? data[i].column->cloneEmpty() : data[i].type->createColumn(); |
318 | return columns; |
319 | } |
320 | |
321 | |
322 | Columns Block::getColumns() const |
323 | { |
324 | size_t num_columns = data.size(); |
325 | Columns columns(num_columns); |
326 | for (size_t i = 0; i < num_columns; ++i) |
327 | columns[i] = data[i].column; |
328 | return columns; |
329 | } |
330 | |
331 | |
332 | MutableColumns Block::mutateColumns() |
333 | { |
334 | size_t num_columns = data.size(); |
335 | MutableColumns columns(num_columns); |
336 | for (size_t i = 0; i < num_columns; ++i) |
337 | columns[i] = data[i].column ? (*std::move(data[i].column)).mutate() : data[i].type->createColumn(); |
338 | return columns; |
339 | } |
340 | |
341 | |
342 | void Block::setColumns(MutableColumns && columns) |
343 | { |
344 | /// TODO: assert if |columns| doesn't match |data|! |
345 | size_t num_columns = data.size(); |
346 | for (size_t i = 0; i < num_columns; ++i) |
347 | data[i].column = std::move(columns[i]); |
348 | } |
349 | |
350 | |
351 | void Block::setColumns(const Columns & columns) |
352 | { |
353 | /// TODO: assert if |columns| doesn't match |data|! |
354 | size_t num_columns = data.size(); |
355 | for (size_t i = 0; i < num_columns; ++i) |
356 | data[i].column = columns[i]; |
357 | } |
358 | |
359 | |
360 | |
361 | Block Block::cloneWithColumns(MutableColumns && columns) const |
362 | { |
363 | Block res; |
364 | |
365 | size_t num_columns = data.size(); |
366 | for (size_t i = 0; i < num_columns; ++i) |
367 | res.insert({ std::move(columns[i]), data[i].type, data[i].name }); |
368 | |
369 | return res; |
370 | } |
371 | |
372 | |
373 | Block Block::cloneWithColumns(const Columns & columns) const |
374 | { |
375 | Block res; |
376 | |
377 | size_t num_columns = data.size(); |
378 | |
379 | if (num_columns != columns.size()) |
380 | throw Exception("Cannot clone block with columns because block has " + toString(num_columns) + " columns, " |
381 | "but " + toString(columns.size()) + " columns given." , ErrorCodes::LOGICAL_ERROR); |
382 | |
383 | for (size_t i = 0; i < num_columns; ++i) |
384 | res.insert({ columns[i], data[i].type, data[i].name }); |
385 | |
386 | return res; |
387 | } |
388 | |
389 | |
390 | Block Block::cloneWithoutColumns() const |
391 | { |
392 | Block res; |
393 | |
394 | size_t num_columns = data.size(); |
395 | for (size_t i = 0; i < num_columns; ++i) |
396 | res.insert({ nullptr, data[i].type, data[i].name }); |
397 | |
398 | return res; |
399 | } |
400 | |
401 | |
402 | Block Block::sortColumns() const |
403 | { |
404 | Block sorted_block; |
405 | |
406 | for (const auto & name : index_by_name) |
407 | sorted_block.insert(data[name.second]); |
408 | |
409 | return sorted_block; |
410 | } |
411 | |
412 | |
413 | const ColumnsWithTypeAndName & Block::getColumnsWithTypeAndName() const |
414 | { |
415 | return data; |
416 | } |
417 | |
418 | |
419 | NamesAndTypesList Block::getNamesAndTypesList() const |
420 | { |
421 | NamesAndTypesList res; |
422 | |
423 | for (const auto & elem : data) |
424 | res.emplace_back(elem.name, elem.type); |
425 | |
426 | return res; |
427 | } |
428 | |
429 | |
430 | Names Block::getNames() const |
431 | { |
432 | Names res; |
433 | res.reserve(columns()); |
434 | |
435 | for (const auto & elem : data) |
436 | res.push_back(elem.name); |
437 | |
438 | return res; |
439 | } |
440 | |
441 | |
442 | DataTypes Block::getDataTypes() const |
443 | { |
444 | DataTypes res; |
445 | res.reserve(columns()); |
446 | |
447 | for (const auto & elem : data) |
448 | res.push_back(elem.type); |
449 | |
450 | return res; |
451 | } |
452 | |
453 | |
454 | template <typename ReturnType> |
455 | static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, const std::string & context_description) |
456 | { |
457 | auto on_error = [](const std::string & message [[maybe_unused]], int code [[maybe_unused]]) |
458 | { |
459 | if constexpr (std::is_same_v<ReturnType, void>) |
460 | throw Exception(message, code); |
461 | else |
462 | return false; |
463 | }; |
464 | |
465 | size_t columns = rhs.columns(); |
466 | if (lhs.columns() != columns) |
467 | return on_error("Block structure mismatch in " + context_description + " stream: different number of columns:\n" |
468 | + lhs.dumpStructure() + "\n" + rhs.dumpStructure(), ErrorCodes::BLOCKS_HAVE_DIFFERENT_STRUCTURE); |
469 | |
470 | for (size_t i = 0; i < columns; ++i) |
471 | { |
472 | const auto & expected = rhs.getByPosition(i); |
473 | const auto & actual = lhs.getByPosition(i); |
474 | |
475 | if (actual.name != expected.name) |
476 | return on_error("Block structure mismatch in " + context_description + " stream: different names of columns:\n" |
477 | + lhs.dumpStructure() + "\n" + rhs.dumpStructure(), ErrorCodes::BLOCKS_HAVE_DIFFERENT_STRUCTURE); |
478 | |
479 | if (!actual.type->equals(*expected.type)) |
480 | return on_error("Block structure mismatch in " + context_description + " stream: different types:\n" |
481 | + lhs.dumpStructure() + "\n" + rhs.dumpStructure(), ErrorCodes::BLOCKS_HAVE_DIFFERENT_STRUCTURE); |
482 | |
483 | if (!actual.column || !expected.column) |
484 | continue; |
485 | |
486 | if (actual.column->getName() != expected.column->getName()) |
487 | return on_error("Block structure mismatch in " + context_description + " stream: different columns:\n" |
488 | + lhs.dumpStructure() + "\n" + rhs.dumpStructure(), ErrorCodes::BLOCKS_HAVE_DIFFERENT_STRUCTURE); |
489 | |
490 | if (isColumnConst(*actual.column) && isColumnConst(*expected.column)) |
491 | { |
492 | Field actual_value = assert_cast<const ColumnConst &>(*actual.column).getField(); |
493 | Field expected_value = assert_cast<const ColumnConst &>(*expected.column).getField(); |
494 | |
495 | if (actual_value != expected_value) |
496 | return on_error("Block structure mismatch in " + context_description + " stream: different values of constants, actual: " |
497 | + applyVisitor(FieldVisitorToString(), actual_value) + ", expected: " + applyVisitor(FieldVisitorToString(), expected_value), |
498 | ErrorCodes::BLOCKS_HAVE_DIFFERENT_STRUCTURE); |
499 | } |
500 | } |
501 | |
502 | return ReturnType(true); |
503 | } |
504 | |
505 | |
506 | bool blocksHaveEqualStructure(const Block & lhs, const Block & rhs) |
507 | { |
508 | return checkBlockStructure<bool>(lhs, rhs, {}); |
509 | } |
510 | |
511 | |
512 | void assertBlocksHaveEqualStructure(const Block & lhs, const Block & rhs, const std::string & context_description) |
513 | { |
514 | checkBlockStructure<void>(lhs, rhs, context_description); |
515 | } |
516 | |
517 | |
518 | void getBlocksDifference(const Block & lhs, const Block & rhs, std::string & out_lhs_diff, std::string & out_rhs_diff) |
519 | { |
520 | /// The traditional task: the largest common subsequence (LCS). |
521 | /// Assume that order is important. If this becomes wrong once, let's simplify it: for example, make 2 sets. |
522 | |
523 | std::vector<std::vector<int>> lcs(lhs.columns() + 1); |
524 | for (auto & v : lcs) |
525 | v.resize(rhs.columns() + 1); |
526 | |
527 | for (size_t i = 1; i <= lhs.columns(); ++i) |
528 | { |
529 | for (size_t j = 1; j <= rhs.columns(); ++j) |
530 | { |
531 | if (lhs.safeGetByPosition(i - 1) == rhs.safeGetByPosition(j - 1)) |
532 | lcs[i][j] = lcs[i - 1][j - 1] + 1; |
533 | else |
534 | lcs[i][j] = std::max(lcs[i - 1][j], lcs[i][j - 1]); |
535 | } |
536 | } |
537 | |
538 | /// Now go back and collect the answer. |
539 | ColumnsWithTypeAndName left_columns; |
540 | ColumnsWithTypeAndName right_columns; |
541 | size_t l = lhs.columns(); |
542 | size_t r = rhs.columns(); |
543 | while (l > 0 && r > 0) |
544 | { |
545 | if (lhs.safeGetByPosition(l - 1) == rhs.safeGetByPosition(r - 1)) |
546 | { |
547 | /// This element is in both sequences, so it does not get into `diff`. |
548 | --l; |
549 | --r; |
550 | } |
551 | else |
552 | { |
553 | /// Small heuristics: most often used when getting a difference for (expected_block, actual_block). |
554 | /// Therefore, the preference will be given to the field, which is in the left block (expected_block), therefore |
555 | /// in `diff` the column from `actual_block` will get. |
556 | if (lcs[l][r - 1] >= lcs[l - 1][r]) |
557 | right_columns.push_back(rhs.safeGetByPosition(--r)); |
558 | else |
559 | left_columns.push_back(lhs.safeGetByPosition(--l)); |
560 | } |
561 | } |
562 | |
563 | while (l > 0) |
564 | left_columns.push_back(lhs.safeGetByPosition(--l)); |
565 | while (r > 0) |
566 | right_columns.push_back(rhs.safeGetByPosition(--r)); |
567 | |
568 | WriteBufferFromString lhs_diff_writer(out_lhs_diff); |
569 | WriteBufferFromString rhs_diff_writer(out_rhs_diff); |
570 | |
571 | for (auto it = left_columns.rbegin(); it != left_columns.rend(); ++it) |
572 | { |
573 | lhs_diff_writer << it->dumpStructure(); |
574 | lhs_diff_writer << ", position: " << lhs.getPositionByName(it->name) << '\n'; |
575 | } |
576 | for (auto it = right_columns.rbegin(); it != right_columns.rend(); ++it) |
577 | { |
578 | rhs_diff_writer << it->dumpStructure(); |
579 | rhs_diff_writer << ", position: " << rhs.getPositionByName(it->name) << '\n'; |
580 | } |
581 | } |
582 | |
583 | |
584 | void Block::clear() |
585 | { |
586 | info = BlockInfo(); |
587 | data.clear(); |
588 | index_by_name.clear(); |
589 | } |
590 | |
591 | void Block::swap(Block & other) noexcept |
592 | { |
593 | std::swap(info, other.info); |
594 | data.swap(other.data); |
595 | index_by_name.swap(other.index_by_name); |
596 | } |
597 | |
598 | |
599 | void Block::updateHash(SipHash & hash) const |
600 | { |
601 | for (size_t row_no = 0, num_rows = rows(); row_no < num_rows; ++row_no) |
602 | for (const auto & col : data) |
603 | col.column->updateHashWithValue(row_no, hash); |
604 | } |
605 | |
606 | } |
607 | |