1#pragma once
2
3#include <Core/Block.h>
4#include <Core/Types.h>
5#include <Core/NamesAndTypes.h>
6#include <Storages/MergeTree/MergeTreeIndexGranularity.h>
7#include <Storages/MergeTree/MergeTreeIndexGranularityInfo.h>
8#include <Storages/MergeTree/MergeTreeIndices.h>
9#include <Storages/MergeTree/MergeTreePartInfo.h>
10#include <Storages/MergeTree/MergeTreePartition.h>
11#include <Storages/MergeTree/MergeTreeDataPartChecksum.h>
12#include <Storages/MergeTree/MergeTreeDataPartTTLInfo.h>
13#include <Storages/MergeTree/KeyCondition.h>
14#include <Columns/IColumn.h>
15
16#include <Poco/Path.h>
17
18#include <shared_mutex>
19
20
21namespace DB
22{
23
24struct ColumnSize;
25class MergeTreeData;
26struct FutureMergedMutatedPart;
27
28
29/// Description of the data part.
30struct MergeTreeDataPart
31{
32 using Checksums = MergeTreeDataPartChecksums;
33 using Checksum = MergeTreeDataPartChecksums::Checksum;
34
35 MergeTreeDataPart(const MergeTreeData & storage_, const DiskPtr & disk_, const String & name_, const MergeTreePartInfo & info_);
36
37 MergeTreeDataPart(MergeTreeData & storage_, const DiskPtr & disk_, const String & name_);
38
39 /// Returns the name of a column with minimum compressed size (as returned by getColumnSize()).
40 /// If no checksums are present returns the name of the first physically existing column.
41 String getColumnNameWithMinumumCompressedSize() const;
42
43 /// NOTE: Returns zeros if column files are not found in checksums.
44 /// NOTE: You must ensure that no ALTERs are in progress when calculating ColumnSizes.
45 /// (either by locking columns_lock, or by locking table structure).
46 ColumnSize getColumnSize(const String & name, const IDataType & type) const;
47
48 ColumnSize getTotalColumnsSize() const;
49
50 size_t getFileSizeOrZero(const String & file_name) const;
51
52 /// Returns full path to part dir
53 String getFullPath() const;
54
55 /// Returns part->name with prefixes like 'tmp_<name>'
56 String getNameWithPrefix() const;
57
58 /// Generate the new name for this part according to `new_part_info` and min/max dates from the old name.
59 /// This is useful when you want to change e.g. block numbers or the mutation version of the part.
60 String getNewName(const MergeTreePartInfo & new_part_info) const;
61
62 bool contains(const MergeTreeDataPart & other) const { return info.contains(other.info); }
63
64 /// If the partition key includes date column (a common case), these functions will return min and max values for this column.
65 DayNum getMinDate() const;
66 DayNum getMaxDate() const;
67
68 /// otherwise, if the partition key includes dateTime column (also a common case), these functions will return min and max values for this column.
69 time_t getMinTime() const;
70 time_t getMaxTime() const;
71
72 bool isEmpty() const { return rows_count == 0; }
73
74 const MergeTreeData & storage;
75
76 DiskPtr disk;
77 String name;
78 MergeTreePartInfo info;
79
80 /// A directory path (relative to storage's path) where part data is actually stored
81 /// Examples: 'detached/tmp_fetch_<name>', 'tmp_<name>', '<name>'
82 mutable String relative_path;
83
84 size_t rows_count = 0;
85 std::atomic<UInt64> bytes_on_disk {0}; /// 0 - if not counted;
86 /// Is used from several threads without locks (it is changed with ALTER).
87 /// May not contain size of checksums.txt and columns.txt
88 time_t modification_time = 0;
89 /// When the part is removed from the working set. Changes once.
90 mutable std::atomic<time_t> remove_time { std::numeric_limits<time_t>::max() };
91
92 /// If true, the destructor will delete the directory with the part.
93 bool is_temp = false;
94
95 /// If true it means that there are no ZooKeeper node for this part, so it should be deleted only from filesystem
96 bool is_duplicate = false;
97
98 /// Frozen by ALTER TABLE ... FREEZE ... It is used for information purposes in system.parts table.
99 mutable std::atomic<bool> is_frozen {false};
100
101 /**
102 * Part state is a stage of its lifetime. States are ordered and state of a part could be increased only.
103 * Part state should be modified under data_parts mutex.
104 *
105 * Possible state transitions:
106 * Temporary -> Precommitted: we are trying to commit a fetched, inserted or merged part to active set
107 * Precommitted -> Outdated: we could not to add a part to active set and doing a rollback (for example it is duplicated part)
108 * Precommitted -> Commited: we successfully committed a part to active dataset
109 * Precommitted -> Outdated: a part was replaced by a covering part or DROP PARTITION
110 * Outdated -> Deleting: a cleaner selected this part for deletion
111 * Deleting -> Outdated: if an ZooKeeper error occurred during the deletion, we will retry deletion
112 * Committed -> DeleteOnDestroy if part was moved to another disk
113 */
114 enum class State
115 {
116 Temporary, /// the part is generating now, it is not in data_parts list
117 PreCommitted, /// the part is in data_parts, but not used for SELECTs
118 Committed, /// active data part, used by current and upcoming SELECTs
119 Outdated, /// not active data part, but could be used by only current SELECTs, could be deleted after SELECTs finishes
120 Deleting, /// not active data part with identity refcounter, it is deleting right now by a cleaner
121 DeleteOnDestroy, /// part was moved to another disk and should be deleted in own destructor
122 };
123
124 using TTLInfo = MergeTreeDataPartTTLInfo;
125 using TTLInfos = MergeTreeDataPartTTLInfos;
126
127 TTLInfos ttl_infos;
128
129 /// Current state of the part. If the part is in working set already, it should be accessed via data_parts mutex
130 mutable State state{State::Temporary};
131
132 /// Returns name of state
133 static String stateToString(State state);
134 String stateString() const;
135
136 String getNameWithState() const
137 {
138 return name + " (state " + stateString() + ")";
139 }
140
141 /// Returns true if state of part is one of affordable_states
142 bool checkState(const std::initializer_list<State> & affordable_states) const
143 {
144 for (auto affordable_state : affordable_states)
145 {
146 if (state == affordable_state)
147 return true;
148 }
149 return false;
150 }
151
152 /// Throws an exception if state of the part is not in affordable_states
153 void assertState(const std::initializer_list<State> & affordable_states) const;
154
155 /// In comparison with lambdas, it is move assignable and could has several overloaded operator()
156 struct StatesFilter
157 {
158 std::initializer_list<State> affordable_states;
159 StatesFilter(const std::initializer_list<State> & affordable_states_) : affordable_states(affordable_states_) {}
160
161 bool operator() (const std::shared_ptr<const MergeTreeDataPart> & part) const
162 {
163 return part->checkState(affordable_states);
164 }
165 };
166
167 /// Returns a lambda that returns true only for part with states from specified list
168 static inline StatesFilter getStatesFilter(const std::initializer_list<State> & affordable_states)
169 {
170 return StatesFilter(affordable_states);
171 }
172
173 /// Primary key (correspond to primary.idx file).
174 /// Always loaded in RAM. Contains each index_granularity-th value of primary key tuple.
175 /// Note that marks (also correspond to primary key) is not always in RAM, but cached. See MarkCache.h.
176 using Index = Columns;
177 Index index;
178
179 MergeTreePartition partition;
180
181 /// Amount of rows between marks
182 /// As index always loaded into memory
183 MergeTreeIndexGranularity index_granularity;
184
185 /// Index that for each part stores min and max values of a set of columns. This allows quickly excluding
186 /// parts based on conditions on these columns imposed by a query.
187 /// Currently this index is built using only columns required by partition expression, but in principle it
188 /// can be built using any set of columns.
189 struct MinMaxIndex
190 {
191 /// A direct product of ranges for each key column. See Storages/MergeTree/KeyCondition.cpp for details.
192 std::vector<Range> parallelogram;
193 bool initialized = false;
194
195 public:
196 MinMaxIndex() = default;
197
198 /// For month-based partitioning.
199 MinMaxIndex(DayNum min_date, DayNum max_date)
200 : parallelogram(1, Range(min_date, true, max_date, true))
201 , initialized(true)
202 {
203 }
204
205 void load(const MergeTreeData & storage, const String & part_path);
206 void store(const MergeTreeData & storage, const String & part_path, Checksums & checksums) const;
207 void store(const Names & column_names, const DataTypes & data_types, const String & part_path, Checksums & checksums) const;
208
209 void update(const Block & block, const Names & column_names);
210 void merge(const MinMaxIndex & other);
211 };
212
213 MinMaxIndex minmax_idx;
214
215 Checksums checksums;
216
217 /// Columns description.
218 NamesAndTypesList columns;
219
220 /// Columns with values, that all have been zeroed by expired ttl
221 NameSet empty_columns;
222
223 using ColumnToSize = std::map<std::string, UInt64>;
224
225 /** It is blocked for writing when changing columns, checksums or any part files.
226 * Locked to read when reading columns, checksums or any part files.
227 */
228 mutable std::shared_mutex columns_lock;
229
230 MergeTreeIndexGranularityInfo index_granularity_info;
231
232 ~MergeTreeDataPart();
233
234 /// Calculate the total size of the entire directory with all the files
235 static UInt64 calculateTotalSizeOnDisk(const String & from);
236
237 void remove() const;
238
239 /// Makes checks and move part to new directory
240 /// Changes only relative_dir_name, you need to update other metadata (name, is_temp) explicitly
241 void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists = true) const;
242
243 /// Generate unique path to detach part
244 String getRelativePathForDetachedPart(const String & prefix) const;
245
246 /// Moves a part to detached/ directory and adds prefix to its name
247 void renameToDetached(const String & prefix) const;
248
249 /// Makes clone of a part in detached/ directory via hard links
250 void makeCloneInDetached(const String & prefix) const;
251
252 /// Makes full clone of part in detached/ on another disk
253 void makeCloneOnDiskDetached(const ReservationPtr & reservation) const;
254
255 /// Populates columns_to_size map (compressed size).
256 void accumulateColumnSizes(ColumnToSize & column_to_size) const;
257
258 /// Initialize columns (from columns.txt if exists, or create from column files if not).
259 /// Load checksums from checksums.txt if exists. Load index if required.
260 void loadColumnsChecksumsIndexes(bool require_columns_checksums, bool check_consistency);
261
262 /// Checks that .bin and .mrk files exist
263 bool hasColumnFiles(const String & column, const IDataType & type) const;
264
265 /// For data in RAM ('index')
266 UInt64 getIndexSizeInBytes() const;
267 UInt64 getIndexSizeInAllocatedBytes() const;
268 UInt64 getMarksCount() const;
269
270private:
271 /// Reads columns names and types from columns.txt
272 void loadColumns(bool require);
273
274 /// If checksums.txt exists, reads files' checksums (and sizes) from it
275 void loadChecksums(bool require);
276
277 /// Loads marks index granularity into memory
278 void loadIndexGranularity();
279
280 /// Loads index file.
281 void loadIndex();
282
283 /// Load rows count for this part from disk (for the newer storage format version).
284 /// For the older format version calculates rows count from the size of a column with a fixed size.
285 void loadRowsCount();
286
287 /// Loads ttl infos in json format from file ttl.txt. If file doesn`t exists assigns ttl infos with all zeros
288 void loadTTLInfos();
289
290 void loadPartitionAndMinMaxIndex();
291
292 void checkConsistency(bool require_part_metadata);
293
294 ColumnSize getColumnSizeImpl(const String & name, const IDataType & type, std::unordered_set<String> * processed_substreams) const;
295};
296
297
298using MergeTreeDataPartState = MergeTreeDataPart::State;
299
300}
301