| 1 | #pragma once | 
|---|
| 2 |  | 
|---|
| 3 | #include <Core/Block.h> | 
|---|
| 4 | #include <Core/Types.h> | 
|---|
| 5 | #include <Core/NamesAndTypes.h> | 
|---|
| 6 | #include <Storages/MergeTree/MergeTreeIndexGranularity.h> | 
|---|
| 7 | #include <Storages/MergeTree/MergeTreeIndexGranularityInfo.h> | 
|---|
| 8 | #include <Storages/MergeTree/MergeTreeIndices.h> | 
|---|
| 9 | #include <Storages/MergeTree/MergeTreePartInfo.h> | 
|---|
| 10 | #include <Storages/MergeTree/MergeTreePartition.h> | 
|---|
| 11 | #include <Storages/MergeTree/MergeTreeDataPartChecksum.h> | 
|---|
| 12 | #include <Storages/MergeTree/MergeTreeDataPartTTLInfo.h> | 
|---|
| 13 | #include <Storages/MergeTree/KeyCondition.h> | 
|---|
| 14 | #include <Columns/IColumn.h> | 
|---|
| 15 |  | 
|---|
| 16 | #include <Poco/Path.h> | 
|---|
| 17 |  | 
|---|
| 18 | #include <shared_mutex> | 
|---|
| 19 |  | 
|---|
| 20 |  | 
|---|
| 21 | namespace DB | 
|---|
| 22 | { | 
|---|
| 23 |  | 
|---|
/// Forward declarations. In the declarations below these types are only named
/// (as a reference, or as the return type of a function that is not defined here).
class MergeTreeData;             /// Referenced as `storage` by MergeTreeDataPart.
struct ColumnSize;               /// Returned by getColumnSize() / getTotalColumnsSize().
struct FutureMergedMutatedPart;  /// Not referenced in the visible part of this header; presumably kept for includers - TODO confirm.
|---|
| 27 |  | 
|---|
| 28 |  | 
|---|
| 29 | /// Description of the data part. | 
|---|
| 30 | struct MergeTreeDataPart | 
|---|
| 31 | { | 
|---|
| 32 | using Checksums = MergeTreeDataPartChecksums; | 
|---|
| 33 | using Checksum = MergeTreeDataPartChecksums::Checksum; | 
|---|
| 34 |  | 
|---|
| 35 | MergeTreeDataPart(const MergeTreeData & storage_, const DiskPtr & disk_, const String & name_, const MergeTreePartInfo & info_); | 
|---|
| 36 |  | 
|---|
| 37 | MergeTreeDataPart(MergeTreeData & storage_, const DiskPtr & disk_, const String & name_); | 
|---|
| 38 |  | 
|---|
| 39 | /// Returns the name of a column with minimum compressed size (as returned by getColumnSize()). | 
|---|
| 40 | /// If no checksums are present returns the name of the first physically existing column. | 
|---|
| 41 | String getColumnNameWithMinumumCompressedSize() const; | 
|---|
| 42 |  | 
|---|
| 43 | /// NOTE: Returns zeros if column files are not found in checksums. | 
|---|
| 44 | /// NOTE: You must ensure that no ALTERs are in progress when calculating ColumnSizes. | 
|---|
| 45 | ///   (either by locking columns_lock, or by locking table structure). | 
|---|
| 46 | ColumnSize getColumnSize(const String & name, const IDataType & type) const; | 
|---|
| 47 |  | 
|---|
| 48 | ColumnSize getTotalColumnsSize() const; | 
|---|
| 49 |  | 
|---|
| 50 | size_t getFileSizeOrZero(const String & file_name) const; | 
|---|
| 51 |  | 
|---|
| 52 | /// Returns full path to part dir | 
|---|
| 53 | String getFullPath() const; | 
|---|
| 54 |  | 
|---|
| 55 | /// Returns part->name with prefixes like 'tmp_<name>' | 
|---|
| 56 | String getNameWithPrefix() const; | 
|---|
| 57 |  | 
|---|
| 58 | /// Generate the new name for this part according to `new_part_info` and min/max dates from the old name. | 
|---|
| 59 | /// This is useful when you want to change e.g. block numbers or the mutation version of the part. | 
|---|
| 60 | String getNewName(const MergeTreePartInfo & new_part_info) const; | 
|---|
| 61 |  | 
|---|
| 62 | bool contains(const MergeTreeDataPart & other) const { return info.contains(other.info); } | 
|---|
| 63 |  | 
|---|
| 64 | /// If the partition key includes date column (a common case), these functions will return min and max values for this column. | 
|---|
| 65 | DayNum getMinDate() const; | 
|---|
| 66 | DayNum getMaxDate() const; | 
|---|
| 67 |  | 
|---|
| 68 | /// otherwise, if the partition key includes dateTime column (also a common case), these functions will return min and max values for this column. | 
|---|
| 69 | time_t getMinTime() const; | 
|---|
| 70 | time_t getMaxTime() const; | 
|---|
| 71 |  | 
|---|
| 72 | bool isEmpty() const { return rows_count == 0; } | 
|---|
| 73 |  | 
|---|
| 74 | const MergeTreeData & storage; | 
|---|
| 75 |  | 
|---|
| 76 | DiskPtr disk; | 
|---|
| 77 | String name; | 
|---|
| 78 | MergeTreePartInfo info; | 
|---|
| 79 |  | 
|---|
| 80 | /// A directory path (relative to storage's path) where part data is actually stored | 
|---|
| 81 | /// Examples: 'detached/tmp_fetch_<name>', 'tmp_<name>', '<name>' | 
|---|
| 82 | mutable String relative_path; | 
|---|
| 83 |  | 
|---|
| 84 | size_t rows_count = 0; | 
|---|
| 85 | std::atomic<UInt64> bytes_on_disk {0};  /// 0 - if not counted; | 
|---|
| 86 | /// Is used from several threads without locks (it is changed with ALTER). | 
|---|
| 87 | /// May not contain size of checksums.txt and columns.txt | 
|---|
| 88 | time_t modification_time = 0; | 
|---|
| 89 | /// When the part is removed from the working set. Changes once. | 
|---|
| 90 | mutable std::atomic<time_t> remove_time { std::numeric_limits<time_t>::max() }; | 
|---|
| 91 |  | 
|---|
| 92 | /// If true, the destructor will delete the directory with the part. | 
|---|
| 93 | bool is_temp = false; | 
|---|
| 94 |  | 
|---|
| 95 | /// If true it means that there are no ZooKeeper node for this part, so it should be deleted only from filesystem | 
|---|
| 96 | bool is_duplicate = false; | 
|---|
| 97 |  | 
|---|
| 98 | /// Frozen by ALTER TABLE ... FREEZE ... It is used for information purposes in system.parts table. | 
|---|
| 99 | mutable std::atomic<bool> is_frozen {false}; | 
|---|
| 100 |  | 
|---|
| 101 | /** | 
|---|
| 102 | * Part state is a stage of its lifetime. States are ordered and state of a part could be increased only. | 
|---|
| 103 | * Part state should be modified under data_parts mutex. | 
|---|
| 104 | * | 
|---|
| 105 | * Possible state transitions: | 
|---|
| 106 | * Temporary -> Precommitted:   we are trying to commit a fetched, inserted or merged part to active set | 
|---|
| 107 | * Precommitted -> Outdated:    we could not to add a part to active set and doing a rollback (for example it is duplicated part) | 
|---|
| 108 | * Precommitted -> Commited:    we successfully committed a part to active dataset | 
|---|
| 109 | * Precommitted -> Outdated:    a part was replaced by a covering part or DROP PARTITION | 
|---|
| 110 | * Outdated -> Deleting:        a cleaner selected this part for deletion | 
|---|
| 111 | * Deleting -> Outdated:        if an ZooKeeper error occurred during the deletion, we will retry deletion | 
|---|
| 112 | * Committed -> DeleteOnDestroy if part was moved to another disk | 
|---|
| 113 | */ | 
|---|
| 114 | enum class State | 
|---|
| 115 | { | 
|---|
| 116 | Temporary,       /// the part is generating now, it is not in data_parts list | 
|---|
| 117 | PreCommitted,    /// the part is in data_parts, but not used for SELECTs | 
|---|
| 118 | Committed,       /// active data part, used by current and upcoming SELECTs | 
|---|
| 119 | Outdated,        /// not active data part, but could be used by only current SELECTs, could be deleted after SELECTs finishes | 
|---|
| 120 | Deleting,        /// not active data part with identity refcounter, it is deleting right now by a cleaner | 
|---|
| 121 | DeleteOnDestroy, /// part was moved to another disk and should be deleted in own destructor | 
|---|
| 122 | }; | 
|---|
| 123 |  | 
|---|
| 124 | using TTLInfo = MergeTreeDataPartTTLInfo; | 
|---|
| 125 | using TTLInfos = MergeTreeDataPartTTLInfos; | 
|---|
| 126 |  | 
|---|
| 127 | TTLInfos ttl_infos; | 
|---|
| 128 |  | 
|---|
| 129 | /// Current state of the part. If the part is in working set already, it should be accessed via data_parts mutex | 
|---|
| 130 | mutable State state{State::Temporary}; | 
|---|
| 131 |  | 
|---|
| 132 | /// Returns name of state | 
|---|
| 133 | static String stateToString(State state); | 
|---|
| 134 | String stateString() const; | 
|---|
| 135 |  | 
|---|
| 136 | String getNameWithState() const | 
|---|
| 137 | { | 
|---|
| 138 | return name + " (state "+ stateString() + ")"; | 
|---|
| 139 | } | 
|---|
| 140 |  | 
|---|
| 141 | /// Returns true if state of part is one of affordable_states | 
|---|
| 142 | bool checkState(const std::initializer_list<State> & affordable_states) const | 
|---|
| 143 | { | 
|---|
| 144 | for (auto affordable_state : affordable_states) | 
|---|
| 145 | { | 
|---|
| 146 | if (state == affordable_state) | 
|---|
| 147 | return true; | 
|---|
| 148 | } | 
|---|
| 149 | return false; | 
|---|
| 150 | } | 
|---|
| 151 |  | 
|---|
| 152 | /// Throws an exception if state of the part is not in affordable_states | 
|---|
| 153 | void assertState(const std::initializer_list<State> & affordable_states) const; | 
|---|
| 154 |  | 
|---|
| 155 | /// In comparison with lambdas, it is move assignable and could has several overloaded operator() | 
|---|
| 156 | struct StatesFilter | 
|---|
| 157 | { | 
|---|
| 158 | std::initializer_list<State> affordable_states; | 
|---|
| 159 | StatesFilter(const std::initializer_list<State> & affordable_states_) : affordable_states(affordable_states_) {} | 
|---|
| 160 |  | 
|---|
| 161 | bool operator() (const std::shared_ptr<const MergeTreeDataPart> & part) const | 
|---|
| 162 | { | 
|---|
| 163 | return part->checkState(affordable_states); | 
|---|
| 164 | } | 
|---|
| 165 | }; | 
|---|
| 166 |  | 
|---|
| 167 | /// Returns a lambda that returns true only for part with states from specified list | 
|---|
| 168 | static inline StatesFilter getStatesFilter(const std::initializer_list<State> & affordable_states) | 
|---|
| 169 | { | 
|---|
| 170 | return StatesFilter(affordable_states); | 
|---|
| 171 | } | 
|---|
| 172 |  | 
|---|
| 173 | /// Primary key (correspond to primary.idx file). | 
|---|
| 174 | /// Always loaded in RAM. Contains each index_granularity-th value of primary key tuple. | 
|---|
| 175 | /// Note that marks (also correspond to primary key) is not always in RAM, but cached. See MarkCache.h. | 
|---|
| 176 | using Index = Columns; | 
|---|
| 177 | Index index; | 
|---|
| 178 |  | 
|---|
| 179 | MergeTreePartition partition; | 
|---|
| 180 |  | 
|---|
| 181 | /// Amount of rows between marks | 
|---|
| 182 | /// As index always loaded into memory | 
|---|
| 183 | MergeTreeIndexGranularity index_granularity; | 
|---|
| 184 |  | 
|---|
| 185 | /// Index that for each part stores min and max values of a set of columns. This allows quickly excluding | 
|---|
| 186 | /// parts based on conditions on these columns imposed by a query. | 
|---|
| 187 | /// Currently this index is built using only columns required by partition expression, but in principle it | 
|---|
| 188 | /// can be built using any set of columns. | 
|---|
| 189 | struct MinMaxIndex | 
|---|
| 190 | { | 
|---|
| 191 | /// A direct product of ranges for each key column. See Storages/MergeTree/KeyCondition.cpp for details. | 
|---|
| 192 | std::vector<Range> parallelogram; | 
|---|
| 193 | bool initialized = false; | 
|---|
| 194 |  | 
|---|
| 195 | public: | 
|---|
| 196 | MinMaxIndex() = default; | 
|---|
| 197 |  | 
|---|
| 198 | /// For month-based partitioning. | 
|---|
| 199 | MinMaxIndex(DayNum min_date, DayNum max_date) | 
|---|
| 200 | : parallelogram(1, Range(min_date, true, max_date, true)) | 
|---|
| 201 | , initialized(true) | 
|---|
| 202 | { | 
|---|
| 203 | } | 
|---|
| 204 |  | 
|---|
| 205 | void load(const MergeTreeData & storage, const String & part_path); | 
|---|
| 206 | void store(const MergeTreeData & storage, const String & part_path, Checksums & checksums) const; | 
|---|
| 207 | void store(const Names & column_names, const DataTypes & data_types, const String & part_path, Checksums & checksums) const; | 
|---|
| 208 |  | 
|---|
| 209 | void update(const Block & block, const Names & column_names); | 
|---|
| 210 | void merge(const MinMaxIndex & other); | 
|---|
| 211 | }; | 
|---|
| 212 |  | 
|---|
| 213 | MinMaxIndex minmax_idx; | 
|---|
| 214 |  | 
|---|
| 215 | Checksums checksums; | 
|---|
| 216 |  | 
|---|
| 217 | /// Columns description. | 
|---|
| 218 | NamesAndTypesList columns; | 
|---|
| 219 |  | 
|---|
| 220 | /// Columns with values, that all have been zeroed by expired ttl | 
|---|
| 221 | NameSet empty_columns; | 
|---|
| 222 |  | 
|---|
| 223 | using ColumnToSize = std::map<std::string, UInt64>; | 
|---|
| 224 |  | 
|---|
| 225 | /** It is blocked for writing when changing columns, checksums or any part files. | 
|---|
| 226 | * Locked to read when reading columns, checksums or any part files. | 
|---|
| 227 | */ | 
|---|
| 228 | mutable std::shared_mutex columns_lock; | 
|---|
| 229 |  | 
|---|
| 230 | MergeTreeIndexGranularityInfo index_granularity_info; | 
|---|
| 231 |  | 
|---|
| 232 | ~MergeTreeDataPart(); | 
|---|
| 233 |  | 
|---|
| 234 | /// Calculate the total size of the entire directory with all the files | 
|---|
| 235 | static UInt64 calculateTotalSizeOnDisk(const String & from); | 
|---|
| 236 |  | 
|---|
| 237 | void remove() const; | 
|---|
| 238 |  | 
|---|
| 239 | /// Makes checks and move part to new directory | 
|---|
| 240 | /// Changes only relative_dir_name, you need to update other metadata (name, is_temp) explicitly | 
|---|
| 241 | void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists = true) const; | 
|---|
| 242 |  | 
|---|
| 243 | /// Generate unique path to detach part | 
|---|
| 244 | String getRelativePathForDetachedPart(const String & prefix) const; | 
|---|
| 245 |  | 
|---|
| 246 | /// Moves a part to detached/ directory and adds prefix to its name | 
|---|
| 247 | void renameToDetached(const String & prefix) const; | 
|---|
| 248 |  | 
|---|
| 249 | /// Makes clone of a part in detached/ directory via hard links | 
|---|
| 250 | void makeCloneInDetached(const String & prefix) const; | 
|---|
| 251 |  | 
|---|
| 252 | /// Makes full clone of part in detached/ on another disk | 
|---|
| 253 | void makeCloneOnDiskDetached(const ReservationPtr & reservation) const; | 
|---|
| 254 |  | 
|---|
| 255 | /// Populates columns_to_size map (compressed size). | 
|---|
| 256 | void accumulateColumnSizes(ColumnToSize & column_to_size) const; | 
|---|
| 257 |  | 
|---|
| 258 | /// Initialize columns (from columns.txt if exists, or create from column files if not). | 
|---|
| 259 | /// Load checksums from checksums.txt if exists. Load index if required. | 
|---|
| 260 | void loadColumnsChecksumsIndexes(bool require_columns_checksums, bool check_consistency); | 
|---|
| 261 |  | 
|---|
| 262 | /// Checks that .bin and .mrk files exist | 
|---|
| 263 | bool hasColumnFiles(const String & column, const IDataType & type) const; | 
|---|
| 264 |  | 
|---|
| 265 | /// For data in RAM ('index') | 
|---|
| 266 | UInt64 getIndexSizeInBytes() const; | 
|---|
| 267 | UInt64 getIndexSizeInAllocatedBytes() const; | 
|---|
| 268 | UInt64 getMarksCount() const; | 
|---|
| 269 |  | 
|---|
| 270 | private: | 
|---|
| 271 | /// Reads columns names and types from columns.txt | 
|---|
| 272 | void loadColumns(bool require); | 
|---|
| 273 |  | 
|---|
| 274 | /// If checksums.txt exists, reads files' checksums (and sizes) from it | 
|---|
| 275 | void loadChecksums(bool require); | 
|---|
| 276 |  | 
|---|
| 277 | /// Loads marks index granularity into memory | 
|---|
| 278 | void loadIndexGranularity(); | 
|---|
| 279 |  | 
|---|
| 280 | /// Loads index file. | 
|---|
| 281 | void loadIndex(); | 
|---|
| 282 |  | 
|---|
| 283 | /// Load rows count for this part from disk (for the newer storage format version). | 
|---|
| 284 | /// For the older format version calculates rows count from the size of a column with a fixed size. | 
|---|
| 285 | void loadRowsCount(); | 
|---|
| 286 |  | 
|---|
| 287 | /// Loads ttl infos in json format from file ttl.txt. If file doesn`t exists assigns ttl infos with all zeros | 
|---|
| 288 | void loadTTLInfos(); | 
|---|
| 289 |  | 
|---|
| 290 | void loadPartitionAndMinMaxIndex(); | 
|---|
| 291 |  | 
|---|
| 292 | void checkConsistency(bool require_part_metadata); | 
|---|
| 293 |  | 
|---|
| 294 | ColumnSize getColumnSizeImpl(const String & name, const IDataType & type, std::unordered_set<String> * processed_substreams) const; | 
|---|
| 295 | }; | 
|---|
| 296 |  | 
|---|
| 297 |  | 
|---|
/// Namespace-level shorthand for the part lifecycle state enum, MergeTreeDataPart::State.
using MergeTreeDataPartState = MergeTreeDataPart::State;
|---|
| 299 |  | 
|---|
| 300 | } | 
|---|
| 301 |  | 
|---|