1//===----------------------------------------------------------------------===//
2// DuckDB
3//
4// duckdb/common/multi_file_reader.hpp
5//
6//
7//===----------------------------------------------------------------------===//
8
9#pragma once
10
11#include "duckdb/common/types.hpp"
12#include "duckdb/common/multi_file_reader_options.hpp"
13#include "duckdb/common/enums/file_glob_options.hpp"
14#include "duckdb/common/union_by_name.hpp"
15#include "duckdb/common/optional_ptr.hpp"
16#include "duckdb/common/types/value.hpp"
17
18namespace duckdb {
19class TableFunction;
20class TableFunctionSet;
21class TableFilterSet;
22class LogicalGet;
23class Expression;
24class ClientContext;
25class DataChunk;
26
27struct HivePartitioningIndex {
28 HivePartitioningIndex(string value, idx_t index);
29
30 string value;
31 idx_t index;
32
33 DUCKDB_API void Serialize(Serializer &serializer) const;
34 DUCKDB_API static HivePartitioningIndex Deserialize(Deserializer &source);
35};
36
37//! The bind data for the multi-file reader, obtained through MultiFileReader::BindReader
38struct MultiFileReaderBindData {
39 //! The index of the filename column (if any)
40 idx_t filename_idx = DConstants::INVALID_INDEX;
41 //! The set of hive partitioning indexes (if any)
42 vector<HivePartitioningIndex> hive_partitioning_indexes;
43
44 DUCKDB_API void Serialize(Serializer &serializer) const;
45 DUCKDB_API static MultiFileReaderBindData Deserialize(Deserializer &source);
46};
47
48struct MultiFileFilterEntry {
49 idx_t index = DConstants::INVALID_INDEX;
50 bool is_constant = false;
51};
52
53struct MultiFileConstantEntry {
54 MultiFileConstantEntry(idx_t column_id, Value value_p) : column_id(column_id), value(std::move(value_p)) {
55 }
56
57 //! The column id to apply the constant value to
58 idx_t column_id;
59 //! The constant value
60 Value value;
61};
62
63struct MultiFileReaderData {
64 //! The column ids to read from the file
65 vector<idx_t> column_ids;
66 //! The mapping of column id -> result column id
67 //! The result chunk will be filled as follows: chunk.data[column_mapping[i]] = ReadColumn(column_ids[i]);
68 vector<idx_t> column_mapping;
69 //! Whether or not there are no columns to read. This can happen when a file only consists of constants
70 bool empty_columns = false;
71 //! Filters can point to either (1) local columns in the file, or (2) constant values in the `constant_map`
72 //! This map specifies where the to-be-filtered value can be found
73 vector<MultiFileFilterEntry> filter_map;
74 //! The set of table filters
75 optional_ptr<TableFilterSet> filters;
76 //! The constants that should be applied at the various positions
77 vector<MultiFileConstantEntry> constant_map;
78 //! Map of column_id -> cast, used when reading multiple files when files have diverging types
79 //! for the same column
80 unordered_map<column_t, LogicalType> cast_map;
81};
82
83struct MultiFileReader {
84 //! Add the parameters for multi-file readers (e.g. union_by_name, filename) to a table function
85 DUCKDB_API static void AddParameters(TableFunction &table_function);
86 //! Performs any globbing for the multi-file reader and returns a list of files to be read
87 DUCKDB_API static vector<string> GetFileList(ClientContext &context, const Value &input, const string &name,
88 FileGlobOptions options = FileGlobOptions::DISALLOW_EMPTY);
89 //! Parse the named parameters of a multi-file reader
90 DUCKDB_API static bool ParseOption(const string &key, const Value &val, MultiFileReaderOptions &options);
91 //! Perform complex filter pushdown into the multi-file reader, potentially filtering out files that should be read
92 //! If "true" the first file has been eliminated
93 DUCKDB_API static bool ComplexFilterPushdown(ClientContext &context, vector<string> &files,
94 const MultiFileReaderOptions &options, LogicalGet &get,
95 vector<unique_ptr<Expression>> &filters);
96 //! Bind the options of the multi-file reader, potentially emitting any extra columns that are required
97 DUCKDB_API static MultiFileReaderBindData BindOptions(MultiFileReaderOptions &options, const vector<string> &files,
98 vector<LogicalType> &return_types, vector<string> &names);
99 //! Finalize the bind phase of the multi-file reader after we know (1) the required (output) columns, and (2) the
100 //! pushed down table filters
101 DUCKDB_API static void FinalizeBind(const MultiFileReaderOptions &file_options,
102 const MultiFileReaderBindData &options, const string &filename,
103 const vector<string> &local_names, const vector<LogicalType> &global_types,
104 const vector<string> &global_names, const vector<column_t> &global_column_ids,
105 MultiFileReaderData &reader_data);
106 //! Create all required mappings from the global types/names to the file-local types/names
107 DUCKDB_API static void CreateMapping(const string &file_name, const vector<LogicalType> &local_types,
108 const vector<string> &local_names, const vector<LogicalType> &global_types,
109 const vector<string> &global_names, const vector<column_t> &global_column_ids,
110 optional_ptr<TableFilterSet> filters, MultiFileReaderData &reader_data,
111 const string &initial_file);
112 //! Finalize the reading of a chunk - applying any constants that are required
113 DUCKDB_API static void FinalizeChunk(const MultiFileReaderBindData &bind_data,
114 const MultiFileReaderData &reader_data, DataChunk &chunk);
115 //! Creates a table function set from a single reader function (including e.g. list parameters, etc)
116 DUCKDB_API static TableFunctionSet CreateFunctionSet(TableFunction table_function);
117
118 template <class READER_CLASS, class RESULT_CLASS, class OPTIONS_CLASS>
119 static MultiFileReaderBindData BindUnionReader(ClientContext &context, vector<LogicalType> &return_types,
120 vector<string> &names, RESULT_CLASS &result,
121 OPTIONS_CLASS &options) {
122 D_ASSERT(options.file_options.union_by_name);
123 vector<string> union_col_names;
124 vector<LogicalType> union_col_types;
125 // obtain the set of union column names + types by unifying the types of all of the files
126 // note that this requires opening readers for each file and reading the metadata of each file
127 auto union_readers =
128 UnionByName::UnionCols<READER_CLASS>(context, result.files, union_col_types, union_col_names, options);
129
130 std::move(union_readers.begin(), union_readers.end(), std::back_inserter(result.union_readers));
131 // perform the binding on the obtained set of names + types
132 auto bind_data =
133 MultiFileReader::BindOptions(options&: options.file_options, files: result.files, return_types&: union_col_types, names&: union_col_names);
134 names = union_col_names;
135 return_types = union_col_types;
136 result.Initialize(result.union_readers[0]);
137 D_ASSERT(names.size() == return_types.size());
138 return bind_data;
139 }
140
141 template <class READER_CLASS, class RESULT_CLASS, class OPTIONS_CLASS>
142 static MultiFileReaderBindData BindReader(ClientContext &context, vector<LogicalType> &return_types,
143 vector<string> &names, RESULT_CLASS &result, OPTIONS_CLASS &options) {
144 if (options.file_options.union_by_name) {
145 return BindUnionReader<READER_CLASS>(context, return_types, names, result, options);
146 } else {
147 shared_ptr<READER_CLASS> reader;
148 reader = make_shared<READER_CLASS>(context, result.files[0], options);
149 return_types = reader->return_types;
150 names = reader->names;
151 result.Initialize(std::move(reader));
152 return MultiFileReader::BindOptions(options&: options.file_options, files: result.files, return_types, names);
153 }
154 }
155
156 template <class READER_CLASS>
157 static void InitializeReader(READER_CLASS &reader, const MultiFileReaderOptions &options,
158 const MultiFileReaderBindData &bind_data, const vector<LogicalType> &global_types,
159 const vector<string> &global_names, const vector<column_t> &global_column_ids,
160 optional_ptr<TableFilterSet> table_filters, const string &initial_file) {
161 FinalizeBind(file_options: options, options: bind_data, filename: reader.GetFileName(), local_names: reader.GetNames(), global_types, global_names,
162 global_column_ids, reader_data&: reader.reader_data);
163 CreateMapping(file_name: reader.GetFileName(), local_types: reader.GetTypes(), local_names: reader.GetNames(), global_types, global_names,
164 global_column_ids, filters: table_filters, reader_data&: reader.reader_data, initial_file);
165 reader.reader_data.filters = table_filters;
166 }
167
168 template <class BIND_DATA>
169 static void PruneReaders(BIND_DATA &data) {
170 unordered_set<string> file_set;
171 for (auto &file : data.files) {
172 file_set.insert(file);
173 }
174
175 if (data.initial_reader) {
176 // check if the initial reader should still be read
177 auto entry = file_set.find(data.initial_reader->GetFileName());
178 if (entry == file_set.end()) {
179 data.initial_reader.reset();
180 }
181 }
182 for (idx_t r = 0; r < data.union_readers.size(); r++) {
183 // check if the union reader should still be read or not
184 auto entry = file_set.find(data.union_readers[r]->GetFileName());
185 if (entry == file_set.end()) {
186 data.union_readers.erase(data.union_readers.begin() + r);
187 r--;
188 continue;
189 }
190 }
191 }
192
193private:
194 static void CreateNameMapping(const string &file_name, const vector<LogicalType> &local_types,
195 const vector<string> &local_names, const vector<LogicalType> &global_types,
196 const vector<string> &global_names, const vector<column_t> &global_column_ids,
197 MultiFileReaderData &reader_data, const string &initial_file);
198};
199
200} // namespace duckdb
201