1 | //===----------------------------------------------------------------------===// |
2 | // DuckDB |
3 | // |
4 | // duckdb/common/multi_file_reader.hpp |
5 | // |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #pragma once |
10 | |
11 | #include "duckdb/common/types.hpp" |
12 | #include "duckdb/common/multi_file_reader_options.hpp" |
13 | #include "duckdb/common/enums/file_glob_options.hpp" |
14 | #include "duckdb/common/union_by_name.hpp" |
15 | #include "duckdb/common/optional_ptr.hpp" |
16 | #include "duckdb/common/types/value.hpp" |
17 | |
18 | namespace duckdb { |
// Forward declarations of types that this header only uses by reference or pointer
class ClientContext;
class DataChunk;
class Expression;
class LogicalGet;
class TableFilterSet;
class TableFunction;
class TableFunctionSet;
26 | |
27 | struct HivePartitioningIndex { |
28 | HivePartitioningIndex(string value, idx_t index); |
29 | |
30 | string value; |
31 | idx_t index; |
32 | |
33 | DUCKDB_API void Serialize(Serializer &serializer) const; |
34 | DUCKDB_API static HivePartitioningIndex Deserialize(Deserializer &source); |
35 | }; |
36 | |
37 | //! The bind data for the multi-file reader, obtained through MultiFileReader::BindReader |
38 | struct MultiFileReaderBindData { |
39 | //! The index of the filename column (if any) |
40 | idx_t filename_idx = DConstants::INVALID_INDEX; |
41 | //! The set of hive partitioning indexes (if any) |
42 | vector<HivePartitioningIndex> hive_partitioning_indexes; |
43 | |
44 | DUCKDB_API void Serialize(Serializer &serializer) const; |
45 | DUCKDB_API static MultiFileReaderBindData Deserialize(Deserializer &source); |
46 | }; |
47 | |
48 | struct MultiFileFilterEntry { |
49 | idx_t index = DConstants::INVALID_INDEX; |
50 | bool is_constant = false; |
51 | }; |
52 | |
53 | struct MultiFileConstantEntry { |
54 | MultiFileConstantEntry(idx_t column_id, Value value_p) : column_id(column_id), value(std::move(value_p)) { |
55 | } |
56 | |
57 | //! The column id to apply the constant value to |
58 | idx_t column_id; |
59 | //! The constant value |
60 | Value value; |
61 | }; |
62 | |
63 | struct MultiFileReaderData { |
64 | //! The column ids to read from the file |
65 | vector<idx_t> column_ids; |
66 | //! The mapping of column id -> result column id |
67 | //! The result chunk will be filled as follows: chunk.data[column_mapping[i]] = ReadColumn(column_ids[i]); |
68 | vector<idx_t> column_mapping; |
69 | //! Whether or not there are no columns to read. This can happen when a file only consists of constants |
70 | bool empty_columns = false; |
71 | //! Filters can point to either (1) local columns in the file, or (2) constant values in the `constant_map` |
72 | //! This map specifies where the to-be-filtered value can be found |
73 | vector<MultiFileFilterEntry> filter_map; |
74 | //! The set of table filters |
75 | optional_ptr<TableFilterSet> filters; |
76 | //! The constants that should be applied at the various positions |
77 | vector<MultiFileConstantEntry> constant_map; |
78 | //! Map of column_id -> cast, used when reading multiple files when files have diverging types |
79 | //! for the same column |
80 | unordered_map<column_t, LogicalType> cast_map; |
81 | }; |
82 | |
83 | struct MultiFileReader { |
84 | //! Add the parameters for multi-file readers (e.g. union_by_name, filename) to a table function |
85 | DUCKDB_API static void AddParameters(TableFunction &table_function); |
86 | //! Performs any globbing for the multi-file reader and returns a list of files to be read |
87 | DUCKDB_API static vector<string> GetFileList(ClientContext &context, const Value &input, const string &name, |
88 | FileGlobOptions options = FileGlobOptions::DISALLOW_EMPTY); |
89 | //! Parse the named parameters of a multi-file reader |
90 | DUCKDB_API static bool ParseOption(const string &key, const Value &val, MultiFileReaderOptions &options); |
91 | //! Perform complex filter pushdown into the multi-file reader, potentially filtering out files that should be read |
92 | //! If "true" the first file has been eliminated |
93 | DUCKDB_API static bool ComplexFilterPushdown(ClientContext &context, vector<string> &files, |
94 | const MultiFileReaderOptions &options, LogicalGet &get, |
95 | vector<unique_ptr<Expression>> &filters); |
96 | //! Bind the options of the multi-file reader, potentially emitting any extra columns that are required |
97 | DUCKDB_API static MultiFileReaderBindData BindOptions(MultiFileReaderOptions &options, const vector<string> &files, |
98 | vector<LogicalType> &return_types, vector<string> &names); |
99 | //! Finalize the bind phase of the multi-file reader after we know (1) the required (output) columns, and (2) the |
100 | //! pushed down table filters |
101 | DUCKDB_API static void FinalizeBind(const MultiFileReaderOptions &file_options, |
102 | const MultiFileReaderBindData &options, const string &filename, |
103 | const vector<string> &local_names, const vector<LogicalType> &global_types, |
104 | const vector<string> &global_names, const vector<column_t> &global_column_ids, |
105 | MultiFileReaderData &reader_data); |
106 | //! Create all required mappings from the global types/names to the file-local types/names |
107 | DUCKDB_API static void CreateMapping(const string &file_name, const vector<LogicalType> &local_types, |
108 | const vector<string> &local_names, const vector<LogicalType> &global_types, |
109 | const vector<string> &global_names, const vector<column_t> &global_column_ids, |
110 | optional_ptr<TableFilterSet> filters, MultiFileReaderData &reader_data, |
111 | const string &initial_file); |
112 | //! Finalize the reading of a chunk - applying any constants that are required |
113 | DUCKDB_API static void FinalizeChunk(const MultiFileReaderBindData &bind_data, |
114 | const MultiFileReaderData &reader_data, DataChunk &chunk); |
115 | //! Creates a table function set from a single reader function (including e.g. list parameters, etc) |
116 | DUCKDB_API static TableFunctionSet CreateFunctionSet(TableFunction table_function); |
117 | |
118 | template <class READER_CLASS, class RESULT_CLASS, class OPTIONS_CLASS> |
119 | static MultiFileReaderBindData BindUnionReader(ClientContext &context, vector<LogicalType> &return_types, |
120 | vector<string> &names, RESULT_CLASS &result, |
121 | OPTIONS_CLASS &options) { |
122 | D_ASSERT(options.file_options.union_by_name); |
123 | vector<string> union_col_names; |
124 | vector<LogicalType> union_col_types; |
125 | // obtain the set of union column names + types by unifying the types of all of the files |
126 | // note that this requires opening readers for each file and reading the metadata of each file |
127 | auto union_readers = |
128 | UnionByName::UnionCols<READER_CLASS>(context, result.files, union_col_types, union_col_names, options); |
129 | |
130 | std::move(union_readers.begin(), union_readers.end(), std::back_inserter(result.union_readers)); |
131 | // perform the binding on the obtained set of names + types |
132 | auto bind_data = |
133 | MultiFileReader::BindOptions(options&: options.file_options, files: result.files, return_types&: union_col_types, names&: union_col_names); |
134 | names = union_col_names; |
135 | return_types = union_col_types; |
136 | result.Initialize(result.union_readers[0]); |
137 | D_ASSERT(names.size() == return_types.size()); |
138 | return bind_data; |
139 | } |
140 | |
141 | template <class READER_CLASS, class RESULT_CLASS, class OPTIONS_CLASS> |
142 | static MultiFileReaderBindData BindReader(ClientContext &context, vector<LogicalType> &return_types, |
143 | vector<string> &names, RESULT_CLASS &result, OPTIONS_CLASS &options) { |
144 | if (options.file_options.union_by_name) { |
145 | return BindUnionReader<READER_CLASS>(context, return_types, names, result, options); |
146 | } else { |
147 | shared_ptr<READER_CLASS> reader; |
148 | reader = make_shared<READER_CLASS>(context, result.files[0], options); |
149 | return_types = reader->return_types; |
150 | names = reader->names; |
151 | result.Initialize(std::move(reader)); |
152 | return MultiFileReader::BindOptions(options&: options.file_options, files: result.files, return_types, names); |
153 | } |
154 | } |
155 | |
156 | template <class READER_CLASS> |
157 | static void InitializeReader(READER_CLASS &reader, const MultiFileReaderOptions &options, |
158 | const MultiFileReaderBindData &bind_data, const vector<LogicalType> &global_types, |
159 | const vector<string> &global_names, const vector<column_t> &global_column_ids, |
160 | optional_ptr<TableFilterSet> table_filters, const string &initial_file) { |
161 | FinalizeBind(file_options: options, options: bind_data, filename: reader.GetFileName(), local_names: reader.GetNames(), global_types, global_names, |
162 | global_column_ids, reader_data&: reader.reader_data); |
163 | CreateMapping(file_name: reader.GetFileName(), local_types: reader.GetTypes(), local_names: reader.GetNames(), global_types, global_names, |
164 | global_column_ids, filters: table_filters, reader_data&: reader.reader_data, initial_file); |
165 | reader.reader_data.filters = table_filters; |
166 | } |
167 | |
168 | template <class BIND_DATA> |
169 | static void PruneReaders(BIND_DATA &data) { |
170 | unordered_set<string> file_set; |
171 | for (auto &file : data.files) { |
172 | file_set.insert(file); |
173 | } |
174 | |
175 | if (data.initial_reader) { |
176 | // check if the initial reader should still be read |
177 | auto entry = file_set.find(data.initial_reader->GetFileName()); |
178 | if (entry == file_set.end()) { |
179 | data.initial_reader.reset(); |
180 | } |
181 | } |
182 | for (idx_t r = 0; r < data.union_readers.size(); r++) { |
183 | // check if the union reader should still be read or not |
184 | auto entry = file_set.find(data.union_readers[r]->GetFileName()); |
185 | if (entry == file_set.end()) { |
186 | data.union_readers.erase(data.union_readers.begin() + r); |
187 | r--; |
188 | continue; |
189 | } |
190 | } |
191 | } |
192 | |
193 | private: |
194 | static void CreateNameMapping(const string &file_name, const vector<LogicalType> &local_types, |
195 | const vector<string> &local_names, const vector<LogicalType> &global_types, |
196 | const vector<string> &global_names, const vector<column_t> &global_column_ids, |
197 | MultiFileReaderData &reader_data, const string &initial_file); |
198 | }; |
199 | |
200 | } // namespace duckdb |
201 | |