//===----------------------------------------------------------------------===//
// DuckDB
//
// duckdb/common/hive_partitioning.hpp
//
//
//===----------------------------------------------------------------------===//
8
9#pragma once
10
11#include "duckdb/common/types/column/partitioned_column_data.hpp"
12#include "duckdb/execution/expression_executor.hpp"
13#include "duckdb/optimizer/filter_combiner.hpp"
14#include "duckdb/optimizer/statistics_propagator.hpp"
15#include "duckdb/planner/expression_iterator.hpp"
16#include "duckdb/planner/table_filter.hpp"
17#include "re2/re2.h"
18
19#include <iostream>
20#include <sstream>
21
22namespace duckdb {
23
24class HivePartitioning {
25public:
26 //! Parse a filename that follows the hive partitioning scheme
27 DUCKDB_API static std::map<string, string> Parse(const string &filename);
28 DUCKDB_API static std::map<string, string> Parse(const string &filename, duckdb_re2::RE2 &regex);
29 //! Prunes a list of filenames based on a set of filters, can be used by TableFunctions in the
30 //! pushdown_complex_filter function to skip files with filename-based filters. Also removes the filters that always
31 //! evaluate to true.
32 DUCKDB_API static void ApplyFiltersToFileList(ClientContext &context, vector<string> &files,
33 vector<unique_ptr<Expression>> &filters,
34 unordered_map<string, column_t> &column_map, idx_t table_index,
35 bool hive_enabled, bool filename_enabled);
36
37 //! Returns the compiled regex pattern to match hive partitions
38 DUCKDB_API static const string REGEX_STRING;
39};
40
41struct HivePartitionKey {
42 //! Columns by which we want to partition
43 vector<Value> values;
44 //! Precomputed hash of values
45 hash_t hash;
46
47 struct Hash {
48 std::size_t operator()(const HivePartitionKey &k) const {
49 return k.hash;
50 }
51 };
52
53 struct Equality {
54 bool operator()(const HivePartitionKey &a, const HivePartitionKey &b) const {
55 if (a.values.size() != b.values.size()) {
56 return false;
57 }
58 for (idx_t i = 0; i < a.values.size(); i++) {
59 if (!Value::NotDistinctFrom(lvalue: a.values[i], rvalue: b.values[i])) {
60 return false;
61 }
62 }
63 return true;
64 }
65 };
66};
67
68//! Maps hive partitions to partition_ids
69typedef unordered_map<HivePartitionKey, idx_t, HivePartitionKey::Hash, HivePartitionKey::Equality> hive_partition_map_t;
70
71//! class shared between HivePartitionColumnData classes that synchronizes partition discovery between threads.
72//! each HivePartitionedColumnData will hold a local copy of the key->partition map
73class GlobalHivePartitionState {
74public:
75 mutex lock;
76 hive_partition_map_t partition_map;
77 //! Used for incremental updating local copies of the partition map;
78 vector<hive_partition_map_t::const_iterator> partitions;
79};
80
81class HivePartitionedColumnData : public PartitionedColumnData {
82public:
83 HivePartitionedColumnData(ClientContext &context, vector<LogicalType> types, vector<idx_t> partition_by_cols,
84 shared_ptr<GlobalHivePartitionState> global_state = nullptr)
85 : PartitionedColumnData(PartitionedColumnDataType::HIVE, context, std::move(types)),
86 global_state(std::move(global_state)), group_by_columns(std::move(partition_by_cols)),
87 hashes_v(LogicalType::HASH) {
88 InitializeKeys();
89 }
90 HivePartitionedColumnData(const HivePartitionedColumnData &other);
91 void ComputePartitionIndices(PartitionedColumnDataAppendState &state, DataChunk &input) override;
92
93 //! Reverse lookup map to reconstruct keys from a partition id
94 std::map<idx_t, const HivePartitionKey *> GetReverseMap();
95
96protected:
97 //! Create allocators for all currently registered partitions
98 void GrowAllocators();
99 //! Create append states for all currently registered partitions
100 void GrowAppendState(PartitionedColumnDataAppendState &state);
101 //! Create and initialize partitions for all currently registered partitions
102 void GrowPartitions(PartitionedColumnDataAppendState &state);
103 //! Register a newly discovered partition
104 idx_t RegisterNewPartition(HivePartitionKey key, PartitionedColumnDataAppendState &state);
105 //! Copy the newly added entries in the global_state.map to the local_partition_map (requires lock!)
106 void SynchronizeLocalMap();
107
108private:
109 void InitializeKeys();
110
111protected:
112 //! Shared HivePartitionedColumnData should always have a global state to allow parallel key discovery
113 shared_ptr<GlobalHivePartitionState> global_state;
114 //! Thread-local copy of the partition map
115 hive_partition_map_t local_partition_map;
116 //! The columns that make up the key
117 vector<idx_t> group_by_columns;
118 //! Thread-local pre-allocated vector for hashes
119 Vector hashes_v;
120 //! Thread-local pre-allocated HivePartitionKeys
121 vector<HivePartitionKey> keys;
122};
123
124} // namespace duckdb
125