| 1 | //===----------------------------------------------------------------------===// |
| 2 | // DuckDB |
| 3 | // |
| 4 | // duckdb/common/multi_file_reader.hpp |
| 5 | // |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #pragma once |
| 10 | |
| 11 | #include "duckdb/common/types.hpp" |
| 12 | #include "duckdb/common/multi_file_reader_options.hpp" |
| 13 | #include "duckdb/common/enums/file_glob_options.hpp" |
| 14 | #include "duckdb/common/union_by_name.hpp" |
| 15 | #include "duckdb/common/optional_ptr.hpp" |
| 16 | #include "duckdb/common/types/value.hpp" |
| 17 | |
| 18 | namespace duckdb { |
// Forward declarations of types that this header only uses by reference or pointer
class ClientContext;
class DataChunk;
class Expression;
class LogicalGet;
class TableFilterSet;
class TableFunction;
class TableFunctionSet;
| 26 | |
| 27 | struct HivePartitioningIndex { |
| 28 | HivePartitioningIndex(string value, idx_t index); |
| 29 | |
| 30 | string value; |
| 31 | idx_t index; |
| 32 | |
| 33 | DUCKDB_API void Serialize(Serializer &serializer) const; |
| 34 | DUCKDB_API static HivePartitioningIndex Deserialize(Deserializer &source); |
| 35 | }; |
| 36 | |
| 37 | //! The bind data for the multi-file reader, obtained through MultiFileReader::BindReader |
| 38 | struct MultiFileReaderBindData { |
| 39 | //! The index of the filename column (if any) |
| 40 | idx_t filename_idx = DConstants::INVALID_INDEX; |
| 41 | //! The set of hive partitioning indexes (if any) |
| 42 | vector<HivePartitioningIndex> hive_partitioning_indexes; |
| 43 | |
| 44 | DUCKDB_API void Serialize(Serializer &serializer) const; |
| 45 | DUCKDB_API static MultiFileReaderBindData Deserialize(Deserializer &source); |
| 46 | }; |
| 47 | |
| 48 | struct MultiFileFilterEntry { |
| 49 | idx_t index = DConstants::INVALID_INDEX; |
| 50 | bool is_constant = false; |
| 51 | }; |
| 52 | |
| 53 | struct MultiFileConstantEntry { |
| 54 | MultiFileConstantEntry(idx_t column_id, Value value_p) : column_id(column_id), value(std::move(value_p)) { |
| 55 | } |
| 56 | |
| 57 | //! The column id to apply the constant value to |
| 58 | idx_t column_id; |
| 59 | //! The constant value |
| 60 | Value value; |
| 61 | }; |
| 62 | |
| 63 | struct MultiFileReaderData { |
| 64 | //! The column ids to read from the file |
| 65 | vector<idx_t> column_ids; |
| 66 | //! The mapping of column id -> result column id |
| 67 | //! The result chunk will be filled as follows: chunk.data[column_mapping[i]] = ReadColumn(column_ids[i]); |
| 68 | vector<idx_t> column_mapping; |
| 69 | //! Whether or not there are no columns to read. This can happen when a file only consists of constants |
| 70 | bool empty_columns = false; |
| 71 | //! Filters can point to either (1) local columns in the file, or (2) constant values in the `constant_map` |
| 72 | //! This map specifies where the to-be-filtered value can be found |
| 73 | vector<MultiFileFilterEntry> filter_map; |
| 74 | //! The set of table filters |
| 75 | optional_ptr<TableFilterSet> filters; |
| 76 | //! The constants that should be applied at the various positions |
| 77 | vector<MultiFileConstantEntry> constant_map; |
| 78 | //! Map of column_id -> cast, used when reading multiple files when files have diverging types |
| 79 | //! for the same column |
| 80 | unordered_map<column_t, LogicalType> cast_map; |
| 81 | }; |
| 82 | |
| 83 | struct MultiFileReader { |
| 84 | //! Add the parameters for multi-file readers (e.g. union_by_name, filename) to a table function |
| 85 | DUCKDB_API static void AddParameters(TableFunction &table_function); |
| 86 | //! Performs any globbing for the multi-file reader and returns a list of files to be read |
| 87 | DUCKDB_API static vector<string> GetFileList(ClientContext &context, const Value &input, const string &name, |
| 88 | FileGlobOptions options = FileGlobOptions::DISALLOW_EMPTY); |
| 89 | //! Parse the named parameters of a multi-file reader |
| 90 | DUCKDB_API static bool ParseOption(const string &key, const Value &val, MultiFileReaderOptions &options); |
| 91 | //! Perform complex filter pushdown into the multi-file reader, potentially filtering out files that should be read |
| 92 | //! If "true" the first file has been eliminated |
| 93 | DUCKDB_API static bool ComplexFilterPushdown(ClientContext &context, vector<string> &files, |
| 94 | const MultiFileReaderOptions &options, LogicalGet &get, |
| 95 | vector<unique_ptr<Expression>> &filters); |
| 96 | //! Bind the options of the multi-file reader, potentially emitting any extra columns that are required |
| 97 | DUCKDB_API static MultiFileReaderBindData BindOptions(MultiFileReaderOptions &options, const vector<string> &files, |
| 98 | vector<LogicalType> &return_types, vector<string> &names); |
| 99 | //! Finalize the bind phase of the multi-file reader after we know (1) the required (output) columns, and (2) the |
| 100 | //! pushed down table filters |
| 101 | DUCKDB_API static void FinalizeBind(const MultiFileReaderOptions &file_options, |
| 102 | const MultiFileReaderBindData &options, const string &filename, |
| 103 | const vector<string> &local_names, const vector<LogicalType> &global_types, |
| 104 | const vector<string> &global_names, const vector<column_t> &global_column_ids, |
| 105 | MultiFileReaderData &reader_data); |
| 106 | //! Create all required mappings from the global types/names to the file-local types/names |
| 107 | DUCKDB_API static void CreateMapping(const string &file_name, const vector<LogicalType> &local_types, |
| 108 | const vector<string> &local_names, const vector<LogicalType> &global_types, |
| 109 | const vector<string> &global_names, const vector<column_t> &global_column_ids, |
| 110 | optional_ptr<TableFilterSet> filters, MultiFileReaderData &reader_data, |
| 111 | const string &initial_file); |
| 112 | //! Finalize the reading of a chunk - applying any constants that are required |
| 113 | DUCKDB_API static void FinalizeChunk(const MultiFileReaderBindData &bind_data, |
| 114 | const MultiFileReaderData &reader_data, DataChunk &chunk); |
| 115 | //! Creates a table function set from a single reader function (including e.g. list parameters, etc) |
| 116 | DUCKDB_API static TableFunctionSet CreateFunctionSet(TableFunction table_function); |
| 117 | |
| 118 | template <class READER_CLASS, class RESULT_CLASS, class OPTIONS_CLASS> |
| 119 | static MultiFileReaderBindData BindUnionReader(ClientContext &context, vector<LogicalType> &return_types, |
| 120 | vector<string> &names, RESULT_CLASS &result, |
| 121 | OPTIONS_CLASS &options) { |
| 122 | D_ASSERT(options.file_options.union_by_name); |
| 123 | vector<string> union_col_names; |
| 124 | vector<LogicalType> union_col_types; |
| 125 | // obtain the set of union column names + types by unifying the types of all of the files |
| 126 | // note that this requires opening readers for each file and reading the metadata of each file |
| 127 | auto union_readers = |
| 128 | UnionByName::UnionCols<READER_CLASS>(context, result.files, union_col_types, union_col_names, options); |
| 129 | |
| 130 | std::move(union_readers.begin(), union_readers.end(), std::back_inserter(result.union_readers)); |
| 131 | // perform the binding on the obtained set of names + types |
| 132 | auto bind_data = |
| 133 | MultiFileReader::BindOptions(options&: options.file_options, files: result.files, return_types&: union_col_types, names&: union_col_names); |
| 134 | names = union_col_names; |
| 135 | return_types = union_col_types; |
| 136 | result.Initialize(result.union_readers[0]); |
| 137 | D_ASSERT(names.size() == return_types.size()); |
| 138 | return bind_data; |
| 139 | } |
| 140 | |
| 141 | template <class READER_CLASS, class RESULT_CLASS, class OPTIONS_CLASS> |
| 142 | static MultiFileReaderBindData BindReader(ClientContext &context, vector<LogicalType> &return_types, |
| 143 | vector<string> &names, RESULT_CLASS &result, OPTIONS_CLASS &options) { |
| 144 | if (options.file_options.union_by_name) { |
| 145 | return BindUnionReader<READER_CLASS>(context, return_types, names, result, options); |
| 146 | } else { |
| 147 | shared_ptr<READER_CLASS> reader; |
| 148 | reader = make_shared<READER_CLASS>(context, result.files[0], options); |
| 149 | return_types = reader->return_types; |
| 150 | names = reader->names; |
| 151 | result.Initialize(std::move(reader)); |
| 152 | return MultiFileReader::BindOptions(options&: options.file_options, files: result.files, return_types, names); |
| 153 | } |
| 154 | } |
| 155 | |
| 156 | template <class READER_CLASS> |
| 157 | static void InitializeReader(READER_CLASS &reader, const MultiFileReaderOptions &options, |
| 158 | const MultiFileReaderBindData &bind_data, const vector<LogicalType> &global_types, |
| 159 | const vector<string> &global_names, const vector<column_t> &global_column_ids, |
| 160 | optional_ptr<TableFilterSet> table_filters, const string &initial_file) { |
| 161 | FinalizeBind(file_options: options, options: bind_data, filename: reader.GetFileName(), local_names: reader.GetNames(), global_types, global_names, |
| 162 | global_column_ids, reader_data&: reader.reader_data); |
| 163 | CreateMapping(file_name: reader.GetFileName(), local_types: reader.GetTypes(), local_names: reader.GetNames(), global_types, global_names, |
| 164 | global_column_ids, filters: table_filters, reader_data&: reader.reader_data, initial_file); |
| 165 | reader.reader_data.filters = table_filters; |
| 166 | } |
| 167 | |
| 168 | template <class BIND_DATA> |
| 169 | static void PruneReaders(BIND_DATA &data) { |
| 170 | unordered_set<string> file_set; |
| 171 | for (auto &file : data.files) { |
| 172 | file_set.insert(file); |
| 173 | } |
| 174 | |
| 175 | if (data.initial_reader) { |
| 176 | // check if the initial reader should still be read |
| 177 | auto entry = file_set.find(data.initial_reader->GetFileName()); |
| 178 | if (entry == file_set.end()) { |
| 179 | data.initial_reader.reset(); |
| 180 | } |
| 181 | } |
| 182 | for (idx_t r = 0; r < data.union_readers.size(); r++) { |
| 183 | // check if the union reader should still be read or not |
| 184 | auto entry = file_set.find(data.union_readers[r]->GetFileName()); |
| 185 | if (entry == file_set.end()) { |
| 186 | data.union_readers.erase(data.union_readers.begin() + r); |
| 187 | r--; |
| 188 | continue; |
| 189 | } |
| 190 | } |
| 191 | } |
| 192 | |
| 193 | private: |
| 194 | static void CreateNameMapping(const string &file_name, const vector<LogicalType> &local_types, |
| 195 | const vector<string> &local_names, const vector<LogicalType> &global_types, |
| 196 | const vector<string> &global_names, const vector<column_t> &global_column_ids, |
| 197 | MultiFileReaderData &reader_data, const string &initial_file); |
| 198 | }; |
| 199 | |
| 200 | } // namespace duckdb |
| 201 | |