#include <algorithm>
#include <optional>

#include <Poco/File.h>
#include <Poco/DirectoryIterator.h>

#include <Storages/MergeTree/MergeTreeIndexGranularity.h>
#include <Storages/MergeTree/checkDataPart.h>
#include <DataStreams/MarkInCompressedFile.h>
#include <Compression/CompressedReadBuffer.h>
#include <IO/HashingReadBuffer.h>
#include <Common/CurrentMetrics.h>


namespace CurrentMetrics
{
    extern const Metric ReplicatedChecks;
}

namespace DB
{

namespace ErrorCodes
{
    extern const int CORRUPTED_DATA;
    extern const int LOGICAL_ERROR;
    extern const int INCORRECT_MARK;
    extern const int EMPTY_LIST_OF_COLUMNS_PASSED;
    extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
}


namespace
{

/** Reads and checksums a single stream (a pair of .bin and .mrk files) belonging to a single column or secondary index.
  */
class Stream
{
public:
    String base_name;
    String bin_file_extension;
    String mrk_file_extension;
    String bin_file_path;
    String mrk_file_path;
private:
    const MergeTreeIndexGranularity & index_granularity;
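    /// Buffer chain for the .bin file: raw file -> hash of compressed data -> decompression -> hash of uncompressed data.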
    ReadBufferFromFile file_buf;
    HashingReadBuffer compressed_hashing_buf;
    CompressedReadBuffer uncompressing_buf;
    size_t mark_position = 0;
public:
    HashingReadBuffer uncompressed_hashing_buf;

private:
    ReadBufferFromFile mrk_file_buf;

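    /// Read one mark from the marks file. A .mrk2 mark also stores the number of rows in its granule;
    /// for plain .mrk files the row count is taken from the index granularity.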
    std::pair<MarkInCompressedFile, size_t> readMarkFromFile()
    {
        size_t mrk_rows;
        MarkInCompressedFile mrk_mark;
        readIntBinary(mrk_mark.offset_in_compressed_file, mrk_hashing_buf);
        readIntBinary(mrk_mark.offset_in_decompressed_block, mrk_hashing_buf);
        if (mrk_file_extension == ".mrk2")
            readIntBinary(mrk_rows, mrk_hashing_buf);
        else
            mrk_rows = index_granularity.getMarkRows(mark_position);

        return {mrk_mark, mrk_rows};
    }
public:
    HashingReadBuffer mrk_hashing_buf;

    Stream(
        const String & path,
        const String & base_name_,
        const String & bin_file_extension_,
        const String & mrk_file_extension_,
        const MergeTreeIndexGranularity & index_granularity_)
        : base_name(base_name_)
        , bin_file_extension(bin_file_extension_)
        , mrk_file_extension(mrk_file_extension_)
        , bin_file_path(path + base_name + bin_file_extension)
        , mrk_file_path(path + base_name + mrk_file_extension)
        , index_granularity(index_granularity_)
        , file_buf(bin_file_path)
        , compressed_hashing_buf(file_buf)
        , uncompressing_buf(compressed_hashing_buf)
        , uncompressed_hashing_buf(uncompressing_buf)
        , mrk_file_buf(mrk_file_path)
        , mrk_hashing_buf(mrk_file_buf)
    {}

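    /// Check that the next mark in the .mrk file points to the current position in the decompressed data
    /// and covers the expected number of rows. With only_read = true the mark is consumed but not validated.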
    void assertMark(bool only_read = false)
    {
        auto [mrk_mark, mrk_rows] = readMarkFromFile();
        bool has_alternative_mark = false;
        MarkInCompressedFile alternative_data_mark = {};
        MarkInCompressedFile data_mark = {};

        /// If the mark falls exactly on the border between blocks, a mark pointing to the end of the previous block
        /// and a mark pointing to the beginning of the next block are both acceptable.
        if (!uncompressed_hashing_buf.hasPendingData())
        {
            /// Get a mark pointing to the end of the previous block.
            has_alternative_mark = true;
            alternative_data_mark.offset_in_compressed_file = compressed_hashing_buf.count() - uncompressing_buf.getSizeCompressed();
            alternative_data_mark.offset_in_decompressed_block = uncompressed_hashing_buf.offset();

            if (mrk_mark == alternative_data_mark)
            {
                mark_position++;
                return;
            }

            uncompressed_hashing_buf.next();

            /// At the end of the file, `compressed_hashing_buf.count()` points to the end of the file even before calling `next()`,
            /// so the check above does not work correctly there. For simplicity, we do not check the last mark.
            if (uncompressed_hashing_buf.eof())
            {
                mark_position++;
                return;
            }
        }

        data_mark.offset_in_compressed_file = compressed_hashing_buf.count() - uncompressing_buf.getSizeCompressed();
        data_mark.offset_in_decompressed_block = uncompressed_hashing_buf.offset();

        if (!only_read && (mrk_mark != data_mark || mrk_rows != index_granularity.getMarkRows(mark_position)))
            throw Exception("Incorrect mark: " + data_mark.toStringWithRows(index_granularity.getMarkRows(mark_position)) +
                (has_alternative_mark ? " or " + alternative_data_mark.toString() : "") + " in data, " +
                mrk_mark.toStringWithRows(mrk_rows) + " in " + mrk_file_path + " file", ErrorCodes::INCORRECT_MARK);

        mark_position++;
    }

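    /// Check that both the data file and the marks file have been read completely.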
    void assertEnd()
    {
        if (!uncompressed_hashing_buf.eof())
            throw Exception("EOF expected in " + bin_file_path + " file"
                + " at position "
                + toString(compressed_hashing_buf.count()) + " (compressed), "
                + toString(uncompressed_hashing_buf.count()) + " (uncompressed)", ErrorCodes::CORRUPTED_DATA);

        /// Maybe we have the final mark.
        if (index_granularity.hasFinalMark())
        {
            auto final_mark_rows = readMarkFromFile().second;
            if (final_mark_rows != 0)
                throw Exception("Incorrect final mark at the end of " + mrk_file_path + ": expected 0 rows, got " + toString(final_mark_rows), ErrorCodes::CORRUPTED_DATA);
        }

        if (!mrk_hashing_buf.eof())
            throw Exception("EOF expected in " + mrk_file_path + " file"
                + " at position "
                + toString(mrk_hashing_buf.count()), ErrorCodes::CORRUPTED_DATA);
    }

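    /// Record the accumulated hashes of the .bin file (compressed and uncompressed) and of the .mrk file in the part's checksums.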
    void saveChecksums(MergeTreeData::DataPart::Checksums & checksums)
    {
        checksums.files[base_name + bin_file_extension] = MergeTreeData::DataPart::Checksums::Checksum(
            compressed_hashing_buf.count(), compressed_hashing_buf.getHash(),
            uncompressed_hashing_buf.count(), uncompressed_hashing_buf.getHash());

        checksums.files[base_name + mrk_file_extension] = MergeTreeData::DataPart::Checksums::Checksum(
            mrk_hashing_buf.count(), mrk_hashing_buf.getHash());
    }
};

}


MergeTreeData::DataPart::Checksums checkDataPart(
    const String & full_path,
    const MergeTreeIndexGranularity & adaptive_index_granularity,
    const String & mrk_file_extension,
    bool require_checksums,
    const DataTypes & primary_key_data_types,
    const MergeTreeIndices & indices,
    std::function<bool()> is_cancelled)
{
    Logger * log = &Logger::get("checkDataPart");

    /** Responsibility:
      * - read the list of columns from columns.txt;
      * - read checksums if they exist;
      * - read (and validate the checksum of) primary.idx; obtain the number of marks;
      * - read data files and marks for each stream of each column; calculate and validate checksums;
      * - check that each column has the same number of rows;
      * - check that all mark files have the same size.
      */

    CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedChecks};

    String path = full_path;
    if (!path.empty() && path.back() != '/')
        path += "/";

    NamesAndTypesList columns;

    {
        ReadBufferFromFile buf(path + "columns.txt");
        columns.readText(buf);
        assertEOF(buf);
    }

    /// Checksums from the file checksums.txt. May be absent. If present, they are subsequently compared with the actual data checksums.
    MergeTreeData::DataPart::Checksums checksums_txt;

    if (require_checksums || Poco::File(path + "checksums.txt").exists())
    {
        ReadBufferFromFile buf(path + "checksums.txt");
        checksums_txt.read(buf);
        assertEOF(buf);
    }

    /// Real checksums based on the contents of the data. Must correspond to checksums.txt. If not, the data is broken.
    MergeTreeData::DataPart::Checksums checksums_data;

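    /// Check the primary index: deserialize every primary key value to count the marks and hash primary.idx.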
    size_t marks_in_primary_key = 0;
    if (!primary_key_data_types.empty())
    {
        ReadBufferFromFile file_buf(path + "primary.idx");
        HashingReadBuffer hashing_buf(file_buf);

        size_t key_size = primary_key_data_types.size();
        MutableColumns tmp_columns(key_size);

        for (size_t j = 0; j < key_size; ++j)
            tmp_columns[j] = primary_key_data_types[j]->createColumn();

        while (!hashing_buf.eof())
        {
            if (is_cancelled())
                return {};

            ++marks_in_primary_key;
            for (size_t j = 0; j < key_size; ++j)
                primary_key_data_types[j]->deserializeBinary(*tmp_columns[j].get(), hashing_buf);
        }

        size_t primary_idx_size = hashing_buf.count();

        checksums_data.files["primary.idx"] = MergeTreeData::DataPart::Checksums::Checksum(primary_idx_size, hashing_buf.getHash());
    }

    /// Optional files count.txt, partition.dat, minmax_*.idx, ttl.txt. Just calculate checksums for existing files.
    Poco::DirectoryIterator dir_end;
    for (Poco::DirectoryIterator dir_it(path); dir_it != dir_end; ++dir_it)
    {
        const String & file_name = dir_it.name();
        if (file_name == "count.txt"
            || file_name == "partition.dat"
            || (startsWith(file_name, "minmax_") && endsWith(file_name, ".idx"))
            || file_name == "ttl.txt")
        {
            ReadBufferFromFile file_buf(dir_it->path());
            HashingReadBuffer hashing_buf(file_buf);
            hashing_buf.tryIgnore(std::numeric_limits<size_t>::max());
            checksums_data.files[file_name] = MergeTreeData::DataPart::Checksums::Checksum(hashing_buf.count(), hashing_buf.getHash());
        }
    }

    if (is_cancelled())
        return {};

    /// If a count.txt file exists, use it as the source of truth for the number of rows. Otherwise just check that all columns have the same number of rows.
    std::optional<size_t> rows;

    if (Poco::File(path + "count.txt").exists())
    {
        ReadBufferFromFile buf(path + "count.txt");
        size_t count = 0;
        readText(count, buf);
        assertEOF(buf);
        rows = count;
    }

    /// Read and check skip indices.
    for (const auto & index : indices)
    {
        Stream stream(path, index->getFileName(), ".idx", mrk_file_extension, adaptive_index_granularity);
        size_t mark_num = 0;

        while (!stream.uncompressed_hashing_buf.eof())
        {
            if (stream.mrk_hashing_buf.eof())
                throw Exception("Unexpected end of mrk file while reading index " + index->name,
                    ErrorCodes::CORRUPTED_DATA);
            try
            {
                stream.assertMark();
            }
            catch (Exception & e)
            {
                e.addMessage("Cannot read mark " + toString(mark_num)
                    + " in file " + stream.mrk_file_path
                    + ", mrk file offset: " + toString(stream.mrk_hashing_buf.count()));
                throw;
            }
            try
            {
                index->createIndexGranule()->deserializeBinary(stream.uncompressed_hashing_buf);
            }
            catch (Exception & e)
            {
                e.addMessage("Cannot read granule " + toString(mark_num)
                    + " in file " + stream.bin_file_path
                    + ", mrk file offset: " + toString(stream.mrk_hashing_buf.count()));
                throw;
            }
            ++mark_num;
            if (is_cancelled())
                return {};
        }

        stream.assertEnd();
        stream.saveChecksums(checksums_data);
    }

    /// Read all columns, calculate checksums and validate marks.
    for (const NameAndTypePair & name_type : columns)
    {
        LOG_DEBUG(log, "Checking column " + name_type.name + " in " + path);

        std::map<String, Stream> streams;
        size_t column_size = 0;
        size_t mark_num = 0;

        IDataType::DeserializeBinaryBulkStatePtr state;
        IDataType::DeserializeBinaryBulkSettings settings;
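        /// For every substream of the column, lazily create a Stream (a .bin/.mrk pair) and hand its uncompressed buffer to the deserializer.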
        settings.getter = [&](const IDataType::SubstreamPath & substream_path)
        {
            String file_name = IDataType::getFileNameForStream(name_type.name, substream_path);
            auto & stream = streams.try_emplace(file_name, path, file_name, ".bin", mrk_file_extension, adaptive_index_granularity).first->second;
            return &stream.uncompressed_hashing_buf;
        };

        /// Prefixes have to be read before the data because the first mark points past the prefix.
        name_type.type->deserializeBinaryBulkStatePrefix(settings, state);

        while (true)
        {
            /// Check that the marks point to the current position in each stream's file.
            bool marks_eof = false;
            name_type.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path)
            {
                String file_name = IDataType::getFileNameForStream(name_type.name, substream_path);

                auto & stream = streams.try_emplace(file_name, path, file_name, ".bin", mrk_file_extension, adaptive_index_granularity).first->second;
                try
                {
                    /// A LowCardinality dictionary column is not read monotonically, so its marks may be inconsistent with
                    /// the offset position in the file. In that case we still read the data and the marks file, but don't check that the marks match.
                    bool only_read = !substream_path.empty() && substream_path.back().type == IDataType::Substream::DictionaryKeys;
                    if (!stream.mrk_hashing_buf.eof())
                        stream.assertMark(only_read);
                    else
                        marks_eof = true;
                }
                catch (Exception & e)
                {
                    e.addMessage("Cannot read mark " + toString(mark_num) + " at row " + toString(column_size)
                        + " in file " + stream.mrk_file_path
                        + ", mrk file offset: " + toString(stream.mrk_hashing_buf.count()));
                    throw;
                }
            }, settings.path);

            size_t rows_after_mark = adaptive_index_granularity.getMarkRows(mark_num);
            ++mark_num;

            /// Read index_granularity rows from the column.
            /// NOTE Shared array sizes of Nested columns are read more than once. That's OK.

            MutableColumnPtr tmp_column = name_type.type->createColumn();
            name_type.type->deserializeBinaryBulkWithMultipleStreams(*tmp_column, rows_after_mark, settings, state);

            size_t read_size = tmp_column->size();
            column_size += read_size;

            /// We have already checked all marks except the final one (it will be checked in assertEnd()).
            if (mark_num == adaptive_index_granularity.getMarksCountWithoutFinal())
                break;
            else if (marks_eof)
                throw Exception("Unexpected end of mrk file while reading column " + name_type.name, ErrorCodes::CORRUPTED_DATA);

            if (is_cancelled())
                return {};
        }

        /// Check that the number of rows is the same in every column.
        if (!rows)
            rows = column_size;
        else if (*rows != column_size)
            throw Exception{"Unexpected number of rows in column "
                + name_type.name + " (" + toString(column_size) + ", expected: " + toString(*rows) + ")",
                ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH};

        /// Save checksums for the column.
        name_type.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path)
        {
            String file_name = IDataType::getFileNameForStream(name_type.name, substream_path);
            auto stream_it = streams.find(file_name);
            if (stream_it == streams.end())
                throw Exception("Logical error: cannot find stream " + file_name, ErrorCodes::LOGICAL_ERROR);

            stream_it->second.assertEnd();
            stream_it->second.saveChecksums(checksums_data);
        }, {});

        if (is_cancelled())
            return {};
    }

    if (!rows)
        throw Exception("No columns in data part", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED);

    if (!primary_key_data_types.empty())
    {
        size_t expected_marks = adaptive_index_granularity.getMarksCount();
        if (expected_marks != marks_in_primary_key)
        {
            throw Exception("Size of primary key doesn't match expected number of marks."
                " Number of rows in columns: " + toString(*rows)
                + ", expected number of marks: " + toString(expected_marks)
                + ", size of primary key: " + toString(marks_in_primary_key),
                ErrorCodes::CORRUPTED_DATA);
        }
    }

    if (require_checksums || !checksums_txt.files.empty())
        checksums_txt.checkEqual(checksums_data, true);

    return checksums_data;
}

MergeTreeData::DataPart::Checksums checkDataPart(
    MergeTreeData::DataPartPtr data_part,
    bool require_checksums,
    const DataTypes & primary_key_data_types,
    const MergeTreeIndices & indices,
    std::function<bool()> is_cancelled)
{
    return checkDataPart(
        data_part->getFullPath(),
        data_part->index_granularity,
        data_part->index_granularity_info.marks_file_extension,
        require_checksums,
        primary_key_data_types,
        indices,
        is_cancelled);
}


}