| 1 | #pragma once |
| 2 | |
#include <Core/Block.h>
#include <Core/Types.h>
#include <Core/NamesAndTypes.h>
#include <Storages/MergeTree/MergeTreeIndexGranularity.h>
#include <Storages/MergeTree/MergeTreeIndexGranularityInfo.h>
#include <Storages/MergeTree/MergeTreeIndices.h>
#include <Storages/MergeTree/MergeTreePartInfo.h>
#include <Storages/MergeTree/MergeTreePartition.h>
#include <Storages/MergeTree/MergeTreeDataPartChecksum.h>
#include <Storages/MergeTree/MergeTreeDataPartTTLInfo.h>
#include <Storages/MergeTree/KeyCondition.h>
#include <Columns/IColumn.h>

#include <Poco/Path.h>

#include <algorithm>
#include <shared_mutex>
#include <vector>
| 19 | |
| 20 | |
| 21 | namespace DB |
| 22 | { |
| 23 | |
/// Forward declarations of types that are defined in other headers.
class MergeTreeData;
struct ColumnSize;
struct FutureMergedMutatedPart;
| 27 | |
| 28 | |
| 29 | /// Description of the data part. |
| 30 | struct MergeTreeDataPart |
| 31 | { |
| 32 | using Checksums = MergeTreeDataPartChecksums; |
| 33 | using Checksum = MergeTreeDataPartChecksums::Checksum; |
| 34 | |
| 35 | MergeTreeDataPart(const MergeTreeData & storage_, const DiskPtr & disk_, const String & name_, const MergeTreePartInfo & info_); |
| 36 | |
| 37 | MergeTreeDataPart(MergeTreeData & storage_, const DiskPtr & disk_, const String & name_); |
| 38 | |
| 39 | /// Returns the name of a column with minimum compressed size (as returned by getColumnSize()). |
| 40 | /// If no checksums are present returns the name of the first physically existing column. |
| 41 | String getColumnNameWithMinumumCompressedSize() const; |
| 42 | |
| 43 | /// NOTE: Returns zeros if column files are not found in checksums. |
| 44 | /// NOTE: You must ensure that no ALTERs are in progress when calculating ColumnSizes. |
| 45 | /// (either by locking columns_lock, or by locking table structure). |
| 46 | ColumnSize getColumnSize(const String & name, const IDataType & type) const; |
| 47 | |
| 48 | ColumnSize getTotalColumnsSize() const; |
| 49 | |
| 50 | size_t getFileSizeOrZero(const String & file_name) const; |
| 51 | |
| 52 | /// Returns full path to part dir |
| 53 | String getFullPath() const; |
| 54 | |
| 55 | /// Returns part->name with prefixes like 'tmp_<name>' |
| 56 | String getNameWithPrefix() const; |
| 57 | |
| 58 | /// Generate the new name for this part according to `new_part_info` and min/max dates from the old name. |
| 59 | /// This is useful when you want to change e.g. block numbers or the mutation version of the part. |
| 60 | String getNewName(const MergeTreePartInfo & new_part_info) const; |
| 61 | |
| 62 | bool contains(const MergeTreeDataPart & other) const { return info.contains(other.info); } |
| 63 | |
| 64 | /// If the partition key includes date column (a common case), these functions will return min and max values for this column. |
| 65 | DayNum getMinDate() const; |
| 66 | DayNum getMaxDate() const; |
| 67 | |
| 68 | /// otherwise, if the partition key includes dateTime column (also a common case), these functions will return min and max values for this column. |
| 69 | time_t getMinTime() const; |
| 70 | time_t getMaxTime() const; |
| 71 | |
| 72 | bool isEmpty() const { return rows_count == 0; } |
| 73 | |
| 74 | const MergeTreeData & storage; |
| 75 | |
| 76 | DiskPtr disk; |
| 77 | String name; |
| 78 | MergeTreePartInfo info; |
| 79 | |
| 80 | /// A directory path (relative to storage's path) where part data is actually stored |
| 81 | /// Examples: 'detached/tmp_fetch_<name>', 'tmp_<name>', '<name>' |
| 82 | mutable String relative_path; |
| 83 | |
| 84 | size_t rows_count = 0; |
| 85 | std::atomic<UInt64> bytes_on_disk {0}; /// 0 - if not counted; |
| 86 | /// Is used from several threads without locks (it is changed with ALTER). |
| 87 | /// May not contain size of checksums.txt and columns.txt |
| 88 | time_t modification_time = 0; |
| 89 | /// When the part is removed from the working set. Changes once. |
| 90 | mutable std::atomic<time_t> remove_time { std::numeric_limits<time_t>::max() }; |
| 91 | |
| 92 | /// If true, the destructor will delete the directory with the part. |
| 93 | bool is_temp = false; |
| 94 | |
| 95 | /// If true it means that there are no ZooKeeper node for this part, so it should be deleted only from filesystem |
| 96 | bool is_duplicate = false; |
| 97 | |
| 98 | /// Frozen by ALTER TABLE ... FREEZE ... It is used for information purposes in system.parts table. |
| 99 | mutable std::atomic<bool> is_frozen {false}; |
| 100 | |
| 101 | /** |
| 102 | * Part state is a stage of its lifetime. States are ordered and state of a part could be increased only. |
| 103 | * Part state should be modified under data_parts mutex. |
| 104 | * |
| 105 | * Possible state transitions: |
| 106 | * Temporary -> Precommitted: we are trying to commit a fetched, inserted or merged part to active set |
| 107 | * Precommitted -> Outdated: we could not to add a part to active set and doing a rollback (for example it is duplicated part) |
| 108 | * Precommitted -> Commited: we successfully committed a part to active dataset |
| 109 | * Precommitted -> Outdated: a part was replaced by a covering part or DROP PARTITION |
| 110 | * Outdated -> Deleting: a cleaner selected this part for deletion |
| 111 | * Deleting -> Outdated: if an ZooKeeper error occurred during the deletion, we will retry deletion |
| 112 | * Committed -> DeleteOnDestroy if part was moved to another disk |
| 113 | */ |
| 114 | enum class State |
| 115 | { |
| 116 | Temporary, /// the part is generating now, it is not in data_parts list |
| 117 | PreCommitted, /// the part is in data_parts, but not used for SELECTs |
| 118 | Committed, /// active data part, used by current and upcoming SELECTs |
| 119 | Outdated, /// not active data part, but could be used by only current SELECTs, could be deleted after SELECTs finishes |
| 120 | Deleting, /// not active data part with identity refcounter, it is deleting right now by a cleaner |
| 121 | DeleteOnDestroy, /// part was moved to another disk and should be deleted in own destructor |
| 122 | }; |
| 123 | |
| 124 | using TTLInfo = MergeTreeDataPartTTLInfo; |
| 125 | using TTLInfos = MergeTreeDataPartTTLInfos; |
| 126 | |
| 127 | TTLInfos ttl_infos; |
| 128 | |
| 129 | /// Current state of the part. If the part is in working set already, it should be accessed via data_parts mutex |
| 130 | mutable State state{State::Temporary}; |
| 131 | |
| 132 | /// Returns name of state |
| 133 | static String stateToString(State state); |
| 134 | String stateString() const; |
| 135 | |
| 136 | String getNameWithState() const |
| 137 | { |
| 138 | return name + " (state " + stateString() + ")" ; |
| 139 | } |
| 140 | |
| 141 | /// Returns true if state of part is one of affordable_states |
| 142 | bool checkState(const std::initializer_list<State> & affordable_states) const |
| 143 | { |
| 144 | for (auto affordable_state : affordable_states) |
| 145 | { |
| 146 | if (state == affordable_state) |
| 147 | return true; |
| 148 | } |
| 149 | return false; |
| 150 | } |
| 151 | |
| 152 | /// Throws an exception if state of the part is not in affordable_states |
| 153 | void assertState(const std::initializer_list<State> & affordable_states) const; |
| 154 | |
| 155 | /// In comparison with lambdas, it is move assignable and could has several overloaded operator() |
| 156 | struct StatesFilter |
| 157 | { |
| 158 | std::initializer_list<State> affordable_states; |
| 159 | StatesFilter(const std::initializer_list<State> & affordable_states_) : affordable_states(affordable_states_) {} |
| 160 | |
| 161 | bool operator() (const std::shared_ptr<const MergeTreeDataPart> & part) const |
| 162 | { |
| 163 | return part->checkState(affordable_states); |
| 164 | } |
| 165 | }; |
| 166 | |
| 167 | /// Returns a lambda that returns true only for part with states from specified list |
| 168 | static inline StatesFilter getStatesFilter(const std::initializer_list<State> & affordable_states) |
| 169 | { |
| 170 | return StatesFilter(affordable_states); |
| 171 | } |
| 172 | |
| 173 | /// Primary key (correspond to primary.idx file). |
| 174 | /// Always loaded in RAM. Contains each index_granularity-th value of primary key tuple. |
| 175 | /// Note that marks (also correspond to primary key) is not always in RAM, but cached. See MarkCache.h. |
| 176 | using Index = Columns; |
| 177 | Index index; |
| 178 | |
| 179 | MergeTreePartition partition; |
| 180 | |
| 181 | /// Amount of rows between marks |
| 182 | /// As index always loaded into memory |
| 183 | MergeTreeIndexGranularity index_granularity; |
| 184 | |
| 185 | /// Index that for each part stores min and max values of a set of columns. This allows quickly excluding |
| 186 | /// parts based on conditions on these columns imposed by a query. |
| 187 | /// Currently this index is built using only columns required by partition expression, but in principle it |
| 188 | /// can be built using any set of columns. |
| 189 | struct MinMaxIndex |
| 190 | { |
| 191 | /// A direct product of ranges for each key column. See Storages/MergeTree/KeyCondition.cpp for details. |
| 192 | std::vector<Range> parallelogram; |
| 193 | bool initialized = false; |
| 194 | |
| 195 | public: |
| 196 | MinMaxIndex() = default; |
| 197 | |
| 198 | /// For month-based partitioning. |
| 199 | MinMaxIndex(DayNum min_date, DayNum max_date) |
| 200 | : parallelogram(1, Range(min_date, true, max_date, true)) |
| 201 | , initialized(true) |
| 202 | { |
| 203 | } |
| 204 | |
| 205 | void load(const MergeTreeData & storage, const String & part_path); |
| 206 | void store(const MergeTreeData & storage, const String & part_path, Checksums & checksums) const; |
| 207 | void store(const Names & column_names, const DataTypes & data_types, const String & part_path, Checksums & checksums) const; |
| 208 | |
| 209 | void update(const Block & block, const Names & column_names); |
| 210 | void merge(const MinMaxIndex & other); |
| 211 | }; |
| 212 | |
| 213 | MinMaxIndex minmax_idx; |
| 214 | |
| 215 | Checksums checksums; |
| 216 | |
| 217 | /// Columns description. |
| 218 | NamesAndTypesList columns; |
| 219 | |
| 220 | /// Columns with values, that all have been zeroed by expired ttl |
| 221 | NameSet empty_columns; |
| 222 | |
| 223 | using ColumnToSize = std::map<std::string, UInt64>; |
| 224 | |
| 225 | /** It is blocked for writing when changing columns, checksums or any part files. |
| 226 | * Locked to read when reading columns, checksums or any part files. |
| 227 | */ |
| 228 | mutable std::shared_mutex columns_lock; |
| 229 | |
| 230 | MergeTreeIndexGranularityInfo index_granularity_info; |
| 231 | |
| 232 | ~MergeTreeDataPart(); |
| 233 | |
| 234 | /// Calculate the total size of the entire directory with all the files |
| 235 | static UInt64 calculateTotalSizeOnDisk(const String & from); |
| 236 | |
| 237 | void remove() const; |
| 238 | |
| 239 | /// Makes checks and move part to new directory |
| 240 | /// Changes only relative_dir_name, you need to update other metadata (name, is_temp) explicitly |
| 241 | void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists = true) const; |
| 242 | |
| 243 | /// Generate unique path to detach part |
| 244 | String getRelativePathForDetachedPart(const String & prefix) const; |
| 245 | |
| 246 | /// Moves a part to detached/ directory and adds prefix to its name |
| 247 | void renameToDetached(const String & prefix) const; |
| 248 | |
| 249 | /// Makes clone of a part in detached/ directory via hard links |
| 250 | void makeCloneInDetached(const String & prefix) const; |
| 251 | |
| 252 | /// Makes full clone of part in detached/ on another disk |
| 253 | void makeCloneOnDiskDetached(const ReservationPtr & reservation) const; |
| 254 | |
| 255 | /// Populates columns_to_size map (compressed size). |
| 256 | void accumulateColumnSizes(ColumnToSize & column_to_size) const; |
| 257 | |
| 258 | /// Initialize columns (from columns.txt if exists, or create from column files if not). |
| 259 | /// Load checksums from checksums.txt if exists. Load index if required. |
| 260 | void loadColumnsChecksumsIndexes(bool require_columns_checksums, bool check_consistency); |
| 261 | |
| 262 | /// Checks that .bin and .mrk files exist |
| 263 | bool hasColumnFiles(const String & column, const IDataType & type) const; |
| 264 | |
| 265 | /// For data in RAM ('index') |
| 266 | UInt64 getIndexSizeInBytes() const; |
| 267 | UInt64 getIndexSizeInAllocatedBytes() const; |
| 268 | UInt64 getMarksCount() const; |
| 269 | |
| 270 | private: |
| 271 | /// Reads columns names and types from columns.txt |
| 272 | void loadColumns(bool require); |
| 273 | |
| 274 | /// If checksums.txt exists, reads files' checksums (and sizes) from it |
| 275 | void loadChecksums(bool require); |
| 276 | |
| 277 | /// Loads marks index granularity into memory |
| 278 | void loadIndexGranularity(); |
| 279 | |
| 280 | /// Loads index file. |
| 281 | void loadIndex(); |
| 282 | |
| 283 | /// Load rows count for this part from disk (for the newer storage format version). |
| 284 | /// For the older format version calculates rows count from the size of a column with a fixed size. |
| 285 | void loadRowsCount(); |
| 286 | |
| 287 | /// Loads ttl infos in json format from file ttl.txt. If file doesn`t exists assigns ttl infos with all zeros |
| 288 | void loadTTLInfos(); |
| 289 | |
| 290 | void loadPartitionAndMinMaxIndex(); |
| 291 | |
| 292 | void checkConsistency(bool require_part_metadata); |
| 293 | |
| 294 | ColumnSize getColumnSizeImpl(const String & name, const IDataType & type, std::unordered_set<String> * processed_substreams) const; |
| 295 | }; |
| 296 | |
| 297 | |
| 298 | using MergeTreeDataPartState = MergeTreeDataPart::State; |
| 299 | |
| 300 | } |
| 301 | |