1#pragma once
2#include <Core/Types.h>
3#include <IO/ReadBuffer.h>
4#include <IO/WriteBuffer.h>
5#include <city.h>
6#include <map>
7#include <optional>
8
9
10class SipHash;
11
12
13namespace DB
14{
15
16/// Checksum of one file.
17struct MergeTreeDataPartChecksum
18{
19 using uint128 = CityHash_v1_0_2::uint128;
20
21 UInt64 file_size {};
22 uint128 file_hash {};
23
24 bool is_compressed = false;
25 UInt64 uncompressed_size {};
26 uint128 uncompressed_hash {};
27
28 MergeTreeDataPartChecksum() {}
29 MergeTreeDataPartChecksum(UInt64 file_size_, uint128 file_hash_) : file_size(file_size_), file_hash(file_hash_) {}
30 MergeTreeDataPartChecksum(UInt64 file_size_, uint128 file_hash_, UInt64 uncompressed_size_, uint128 uncompressed_hash_)
31 : file_size(file_size_), file_hash(file_hash_), is_compressed(true),
32 uncompressed_size(uncompressed_size_), uncompressed_hash(uncompressed_hash_) {}
33
34 void checkEqual(const MergeTreeDataPartChecksum & rhs, bool have_uncompressed, const String & name) const;
35 void checkSize(const String & path) const;
36};
37
38
39/** Checksums of all non-temporary files.
40 * For compressed files, the check sum and the size of the decompressed data are stored to not depend on the compression method.
41 */
42struct MergeTreeDataPartChecksums
43{
44 using Checksum = MergeTreeDataPartChecksum;
45
46 /// The order is important.
47 using FileChecksums = std::map<String, Checksum>;
48 FileChecksums files;
49
50 void addFile(const String & file_name, UInt64 file_size, Checksum::uint128 file_hash);
51
52 void add(MergeTreeDataPartChecksums && rhs_checksums);
53
54 bool empty() const
55 {
56 return files.empty();
57 }
58
59 /// Checks that the set of columns and their checksums are the same. If not, throws an exception.
60 /// If have_uncompressed, for compressed files it compares the checksums of the decompressed data.
61 /// Otherwise, it compares only the checksums of the files.
62 void checkEqual(const MergeTreeDataPartChecksums & rhs, bool have_uncompressed) const;
63
64 static bool isBadChecksumsErrorCode(int code);
65
66 /// Checks that the directory contains all the needed files of the correct size. Does not check the checksum.
67 void checkSizes(const String & path) const;
68
69 /// Returns false if the checksum is too old.
70 bool read(ReadBuffer & in);
71 /// Assume that header with version (the first line) is read
72 bool read(ReadBuffer & in, size_t format_version);
73 bool read_v2(ReadBuffer & in);
74 bool read_v3(ReadBuffer & in);
75 bool read_v4(ReadBuffer & in);
76
77 void write(WriteBuffer & out) const;
78
79 /// Checksum from the set of checksums of .bin files (for deduplication).
80 void computeTotalChecksumDataOnly(SipHash & hash) const;
81
82 /// SipHash of all all files hashes represented as hex string
83 String getTotalChecksumHex() const;
84
85 String getSerializedString() const;
86 static MergeTreeDataPartChecksums deserializeFrom(const String & s);
87
88 UInt64 getTotalSizeOnDisk() const;
89};
90
91
92/// A kind of MergeTreeDataPartChecksums intended to be stored in ZooKeeper (to save its RAM)
93/// MinimalisticDataPartChecksums and MergeTreeDataPartChecksums have the same serialization format
94/// for versions less than MINIMAL_VERSION_WITH_MINIMALISTIC_CHECKSUMS.
95struct MinimalisticDataPartChecksums
96{
97 UInt64 num_compressed_files = 0;
98 UInt64 num_uncompressed_files = 0;
99
100 using uint128 = MergeTreeDataPartChecksum::uint128;
101 uint128 hash_of_all_files {};
102 uint128 hash_of_uncompressed_files {};
103 uint128 uncompressed_hash_of_compressed_files {};
104
105 bool operator==(const MinimalisticDataPartChecksums & other) const
106 {
107 return num_compressed_files == other.num_compressed_files
108 && num_uncompressed_files == other.num_uncompressed_files
109 && hash_of_all_files == other.hash_of_all_files
110 && hash_of_uncompressed_files == other.hash_of_uncompressed_files
111 && uncompressed_hash_of_compressed_files == other.uncompressed_hash_of_compressed_files;
112 }
113
114 /// Is set only for old formats
115 std::optional<MergeTreeDataPartChecksums> full_checksums;
116
117 static constexpr size_t MINIMAL_VERSION_WITH_MINIMALISTIC_CHECKSUMS = 5;
118
119 MinimalisticDataPartChecksums() = default;
120 void computeTotalChecksums(const MergeTreeDataPartChecksums & full_checksums);
121
122 bool deserialize(ReadBuffer & in);
123 void deserializeWithoutHeader(ReadBuffer & in);
124 static MinimalisticDataPartChecksums deserializeFrom(const String & s);
125
126 void serialize(WriteBuffer & to) const;
127 void serializeWithoutHeader(WriteBuffer & to) const;
128 String getSerializedString();
129 static String getSerializedString(const MergeTreeDataPartChecksums & full_checksums, bool minimalistic);
130
131 void checkEqual(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const;
132 void checkEqual(const MergeTreeDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const;
133 void checkEqualImpl(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const;
134};
135
136
137}
138