hive_partitioning.hpp source code [Velox/build/_deps/duckdb-src/src/include/duckdb/common/hive_partitioning.hpp]

1	//===----------------------------------------------------------------------===//
2	// DuckDB
3	//
4	// duckdb/common/hive_partitioning.hpp
5	//
6	//
7	//===----------------------------------------------------------------------===//
8
9	#pragma once
10
11	#include "duckdb/common/types/column/partitioned_column_data.hpp"
12	#include "duckdb/execution/expression_executor.hpp"
13	#include "duckdb/optimizer/filter_combiner.hpp"
14	#include "duckdb/optimizer/statistics_propagator.hpp"
15	#include "duckdb/planner/expression_iterator.hpp"
16	#include "duckdb/planner/table_filter.hpp"
17	#include "re2/re2.h"
18
19	#include <iostream>
20	#include <sstream>
21
22	namespace duckdb {
23
24	class HivePartitioning {
25	public:
26	//! Parse a filename that follows the hive partitioning scheme
27	DUCKDB_API static std::map<string, string> Parse(const string &filename);
28	DUCKDB_API static std::map<string, string> Parse(const string &filename, duckdb_re2::RE2 &regex);
29	//! Prunes a list of filenames based on a set of filters, can be used by TableFunctions in the
30	//! pushdown_complex_filter function to skip files with filename-based filters. Also removes the filters that always
31	//! evaluate to true.
32	DUCKDB_API static void ApplyFiltersToFileList(ClientContext &context, vector<string> &files,
33	vector<unique_ptr<Expression>> &filters,
34	unordered_map<string, column_t> &column_map, idx_t table_index,
35	bool hive_enabled, bool filename_enabled);
36
37	//! Returns the compiled regex pattern to match hive partitions
38	DUCKDB_API static const string REGEX_STRING;
39	};
40
41	struct HivePartitionKey {
42	//! Columns by which we want to partition
43	vector<Value> values;
44	//! Precomputed hash of values
45	hash_t hash;
46
47	struct Hash {
48	std::size_t operator()(const HivePartitionKey &k) const {
49	return k.hash;
50	}
51	};
52
53	struct Equality {
54	bool operator()(const HivePartitionKey &a, const HivePartitionKey &b) const {
55	if (a.values.size() != b.values.size()) {
56	return false;
57	}
58	for (idx_t i = `0`; i < a.values.size(); i++) {
59	if (!Value::NotDistinctFrom(lvalue: a.values [i], rvalue: b.values [i])) {
60	return false;
61	}
62	}
63	return true;
64	}
65	};
66	};
67
68	//! Maps hive partitions to partition_ids
69	typedef unordered_map<HivePartitionKey, idx_t, HivePartitionKey::Hash, HivePartitionKey::Equality> hive_partition_map_t;
70
71	//! class shared between HivePartitionColumnData classes that synchronizes partition discovery between threads.
72	//! each HivePartitionedColumnData will hold a local copy of the key->partition map
73	class GlobalHivePartitionState {
74	public:
75	mutex lock;
76	hive_partition_map_t partition_map;
77	//! Used for incremental updating local copies of the partition map;
78	vector<hive_partition_map_t::const_iterator> partitions;
79	};
80
81	class HivePartitionedColumnData : public PartitionedColumnData {
82	public:
83	HivePartitionedColumnData(ClientContext &context, vector<LogicalType> types, vector<idx_t> partition_by_cols,
84	shared_ptr<GlobalHivePartitionState> global_state = nullptr)
85	: PartitionedColumnData (PartitionedColumnDataType::HIVE, context, std::move(types)),
86	global_state (std::move(global_state)), group_by_columns (std::move(partition_by_cols)),
87	hashes_v (LogicalType::HASH) {
88	InitializeKeys();
89	}
90	HivePartitionedColumnData(const HivePartitionedColumnData &other);
91	void ComputePartitionIndices(PartitionedColumnDataAppendState &state, DataChunk &input) override;
92
93	//! Reverse lookup map to reconstruct keys from a partition id
94	std::map<idx_t, const HivePartitionKey *> GetReverseMap();
95
96	protected:
97	//! Create allocators for all currently registered partitions
98	void GrowAllocators();
99	//! Create append states for all currently registered partitions
100	void GrowAppendState(PartitionedColumnDataAppendState &state);
101	//! Create and initialize partitions for all currently registered partitions
102	void GrowPartitions(PartitionedColumnDataAppendState &state);
103	//! Register a newly discovered partition
104	idx_t RegisterNewPartition(HivePartitionKey key, PartitionedColumnDataAppendState &state);
105	//! Copy the newly added entries in the global_state.map to the local_partition_map (requires lock!)
106	void SynchronizeLocalMap();
107
108	private:
109	void InitializeKeys();
110
111	protected:
112	//! Shared HivePartitionedColumnData should always have a global state to allow parallel key discovery
113	shared_ptr<GlobalHivePartitionState> global_state;
114	//! Thread-local copy of the partition map
115	hive_partition_map_t local_partition_map;
116	//! The columns that make up the key
117	vector<idx_t> group_by_columns;
118	//! Thread-local pre-allocated vector for hashes
119	Vector hashes_v;
120	//! Thread-local pre-allocated HivePartitionKeys
121	vector<HivePartitionKey> keys;
122	};
123
124	} // namespace duckdb
125

Browse the source code of Velox/build/_deps/duckdb-src/src/include/duckdb/common/hive_partitioning.hpp