| 1 | //===----------------------------------------------------------------------===// | 
|---|
| 2 | //                         DuckDB | 
|---|
| 3 | // | 
|---|
| 4 | // duckdb/common/multi_file_reader.hpp | 
|---|
| 5 | // | 
|---|
| 6 | // | 
|---|
| 7 | //===----------------------------------------------------------------------===// | 
|---|
| 8 |  | 
|---|
| 9 | #pragma once | 
|---|
| 10 |  | 
|---|
| 11 | #include "duckdb/common/types.hpp" | 
|---|
| 12 | #include "duckdb/common/multi_file_reader_options.hpp" | 
|---|
| 13 | #include "duckdb/common/enums/file_glob_options.hpp" | 
|---|
| 14 | #include "duckdb/common/union_by_name.hpp" | 
|---|
| 15 | #include "duckdb/common/optional_ptr.hpp" | 
|---|
| 16 | #include "duckdb/common/types/value.hpp" | 
|---|
| 17 |  | 
|---|
| 18 | namespace duckdb { | 
|---|
// Forward declarations of types that are defined elsewhere in DuckDB
class ClientContext;
class DataChunk;
class Expression;
class LogicalGet;
class TableFilterSet;
class TableFunction;
class TableFunctionSet;
|---|
| 26 |  | 
|---|
| 27 | struct HivePartitioningIndex { | 
|---|
| 28 | HivePartitioningIndex(string value, idx_t index); | 
|---|
| 29 |  | 
|---|
| 30 | string value; | 
|---|
| 31 | idx_t index; | 
|---|
| 32 |  | 
|---|
| 33 | DUCKDB_API void Serialize(Serializer &serializer) const; | 
|---|
| 34 | DUCKDB_API static HivePartitioningIndex Deserialize(Deserializer &source); | 
|---|
| 35 | }; | 
|---|
| 36 |  | 
|---|
| 37 | //! The bind data for the multi-file reader, obtained through MultiFileReader::BindReader | 
|---|
| 38 | struct MultiFileReaderBindData { | 
|---|
| 39 | //! The index of the filename column (if any) | 
|---|
| 40 | idx_t filename_idx = DConstants::INVALID_INDEX; | 
|---|
| 41 | //! The set of hive partitioning indexes (if any) | 
|---|
| 42 | vector<HivePartitioningIndex> hive_partitioning_indexes; | 
|---|
| 43 |  | 
|---|
| 44 | DUCKDB_API void Serialize(Serializer &serializer) const; | 
|---|
| 45 | DUCKDB_API static MultiFileReaderBindData Deserialize(Deserializer &source); | 
|---|
| 46 | }; | 
|---|
| 47 |  | 
|---|
| 48 | struct MultiFileFilterEntry { | 
|---|
| 49 | idx_t index = DConstants::INVALID_INDEX; | 
|---|
| 50 | bool is_constant = false; | 
|---|
| 51 | }; | 
|---|
| 52 |  | 
|---|
| 53 | struct MultiFileConstantEntry { | 
|---|
| 54 | MultiFileConstantEntry(idx_t column_id, Value value_p) : column_id(column_id), value(std::move(value_p)) { | 
|---|
| 55 | } | 
|---|
| 56 |  | 
|---|
| 57 | //! The column id to apply the constant value to | 
|---|
| 58 | idx_t column_id; | 
|---|
| 59 | //! The constant value | 
|---|
| 60 | Value value; | 
|---|
| 61 | }; | 
|---|
| 62 |  | 
|---|
| 63 | struct MultiFileReaderData { | 
|---|
| 64 | //! The column ids to read from the file | 
|---|
| 65 | vector<idx_t> column_ids; | 
|---|
| 66 | //! The mapping of column id -> result column id | 
|---|
| 67 | //! The result chunk will be filled as follows: chunk.data[column_mapping[i]] = ReadColumn(column_ids[i]); | 
|---|
| 68 | vector<idx_t> column_mapping; | 
|---|
| 69 | //! Whether or not there are no columns to read. This can happen when a file only consists of constants | 
|---|
| 70 | bool empty_columns = false; | 
|---|
| 71 | //! Filters can point to either (1) local columns in the file, or (2) constant values in the `constant_map` | 
|---|
| 72 | //! This map specifies where the to-be-filtered value can be found | 
|---|
| 73 | vector<MultiFileFilterEntry> filter_map; | 
|---|
| 74 | //! The set of table filters | 
|---|
| 75 | optional_ptr<TableFilterSet> filters; | 
|---|
| 76 | //! The constants that should be applied at the various positions | 
|---|
| 77 | vector<MultiFileConstantEntry> constant_map; | 
|---|
| 78 | //! Map of column_id -> cast, used when reading multiple files when files have diverging types | 
|---|
| 79 | //! for the same column | 
|---|
| 80 | unordered_map<column_t, LogicalType> cast_map; | 
|---|
| 81 | }; | 
|---|
| 82 |  | 
|---|
| 83 | struct MultiFileReader { | 
|---|
| 84 | //! Add the parameters for multi-file readers (e.g. union_by_name, filename) to a table function | 
|---|
| 85 | DUCKDB_API static void AddParameters(TableFunction &table_function); | 
|---|
| 86 | //! Performs any globbing for the multi-file reader and returns a list of files to be read | 
|---|
| 87 | DUCKDB_API static vector<string> GetFileList(ClientContext &context, const Value &input, const string &name, | 
|---|
| 88 | FileGlobOptions options = FileGlobOptions::DISALLOW_EMPTY); | 
|---|
| 89 | //! Parse the named parameters of a multi-file reader | 
|---|
| 90 | DUCKDB_API static bool ParseOption(const string &key, const Value &val, MultiFileReaderOptions &options); | 
|---|
| 91 | //! Perform complex filter pushdown into the multi-file reader, potentially filtering out files that should be read | 
|---|
| 92 | //! If "true" the first file has been eliminated | 
|---|
| 93 | DUCKDB_API static bool ComplexFilterPushdown(ClientContext &context, vector<string> &files, | 
|---|
| 94 | const MultiFileReaderOptions &options, LogicalGet &get, | 
|---|
| 95 | vector<unique_ptr<Expression>> &filters); | 
|---|
| 96 | //! Bind the options of the multi-file reader, potentially emitting any extra columns that are required | 
|---|
| 97 | DUCKDB_API static MultiFileReaderBindData BindOptions(MultiFileReaderOptions &options, const vector<string> &files, | 
|---|
| 98 | vector<LogicalType> &return_types, vector<string> &names); | 
|---|
| 99 | //! Finalize the bind phase of the multi-file reader after we know (1) the required (output) columns, and (2) the | 
|---|
| 100 | //! pushed down table filters | 
|---|
| 101 | DUCKDB_API static void FinalizeBind(const MultiFileReaderOptions &file_options, | 
|---|
| 102 | const MultiFileReaderBindData &options, const string &filename, | 
|---|
| 103 | const vector<string> &local_names, const vector<LogicalType> &global_types, | 
|---|
| 104 | const vector<string> &global_names, const vector<column_t> &global_column_ids, | 
|---|
| 105 | MultiFileReaderData &reader_data); | 
|---|
| 106 | //! Create all required mappings from the global types/names to the file-local types/names | 
|---|
| 107 | DUCKDB_API static void CreateMapping(const string &file_name, const vector<LogicalType> &local_types, | 
|---|
| 108 | const vector<string> &local_names, const vector<LogicalType> &global_types, | 
|---|
| 109 | const vector<string> &global_names, const vector<column_t> &global_column_ids, | 
|---|
| 110 | optional_ptr<TableFilterSet> filters, MultiFileReaderData &reader_data, | 
|---|
| 111 | const string &initial_file); | 
|---|
| 112 | //! Finalize the reading of a chunk - applying any constants that are required | 
|---|
| 113 | DUCKDB_API static void FinalizeChunk(const MultiFileReaderBindData &bind_data, | 
|---|
| 114 | const MultiFileReaderData &reader_data, DataChunk &chunk); | 
|---|
| 115 | //! Creates a table function set from a single reader function (including e.g. list parameters, etc) | 
|---|
| 116 | DUCKDB_API static TableFunctionSet CreateFunctionSet(TableFunction table_function); | 
|---|
| 117 |  | 
|---|
| 118 | template <class READER_CLASS, class RESULT_CLASS, class OPTIONS_CLASS> | 
|---|
| 119 | static MultiFileReaderBindData BindUnionReader(ClientContext &context, vector<LogicalType> &return_types, | 
|---|
| 120 | vector<string> &names, RESULT_CLASS &result, | 
|---|
| 121 | OPTIONS_CLASS &options) { | 
|---|
| 122 | D_ASSERT(options.file_options.union_by_name); | 
|---|
| 123 | vector<string> union_col_names; | 
|---|
| 124 | vector<LogicalType> union_col_types; | 
|---|
| 125 | // obtain the set of union column names + types by unifying the types of all of the files | 
|---|
| 126 | // note that this requires opening readers for each file and reading the metadata of each file | 
|---|
| 127 | auto union_readers = | 
|---|
| 128 | UnionByName::UnionCols<READER_CLASS>(context, result.files, union_col_types, union_col_names, options); | 
|---|
| 129 |  | 
|---|
| 130 | std::move(union_readers.begin(), union_readers.end(), std::back_inserter(result.union_readers)); | 
|---|
| 131 | // perform the binding on the obtained set of names + types | 
|---|
| 132 | auto bind_data = | 
|---|
| 133 | MultiFileReader::BindOptions(options&: options.file_options, files: result.files, return_types&: union_col_types, names&: union_col_names); | 
|---|
| 134 | names = union_col_names; | 
|---|
| 135 | return_types = union_col_types; | 
|---|
| 136 | result.Initialize(result.union_readers[0]); | 
|---|
| 137 | D_ASSERT(names.size() == return_types.size()); | 
|---|
| 138 | return bind_data; | 
|---|
| 139 | } | 
|---|
| 140 |  | 
|---|
| 141 | template <class READER_CLASS, class RESULT_CLASS, class OPTIONS_CLASS> | 
|---|
| 142 | static MultiFileReaderBindData BindReader(ClientContext &context, vector<LogicalType> &return_types, | 
|---|
| 143 | vector<string> &names, RESULT_CLASS &result, OPTIONS_CLASS &options) { | 
|---|
| 144 | if (options.file_options.union_by_name) { | 
|---|
| 145 | return BindUnionReader<READER_CLASS>(context, return_types, names, result, options); | 
|---|
| 146 | } else { | 
|---|
| 147 | shared_ptr<READER_CLASS> reader; | 
|---|
| 148 | reader = make_shared<READER_CLASS>(context, result.files[0], options); | 
|---|
| 149 | return_types = reader->return_types; | 
|---|
| 150 | names = reader->names; | 
|---|
| 151 | result.Initialize(std::move(reader)); | 
|---|
| 152 | return MultiFileReader::BindOptions(options&: options.file_options, files: result.files, return_types, names); | 
|---|
| 153 | } | 
|---|
| 154 | } | 
|---|
| 155 |  | 
|---|
| 156 | template <class READER_CLASS> | 
|---|
| 157 | static void InitializeReader(READER_CLASS &reader, const MultiFileReaderOptions &options, | 
|---|
| 158 | const MultiFileReaderBindData &bind_data, const vector<LogicalType> &global_types, | 
|---|
| 159 | const vector<string> &global_names, const vector<column_t> &global_column_ids, | 
|---|
| 160 | optional_ptr<TableFilterSet> table_filters, const string &initial_file) { | 
|---|
| 161 | FinalizeBind(file_options: options, options: bind_data, filename: reader.GetFileName(), local_names: reader.GetNames(), global_types, global_names, | 
|---|
| 162 | global_column_ids, reader_data&: reader.reader_data); | 
|---|
| 163 | CreateMapping(file_name: reader.GetFileName(), local_types: reader.GetTypes(), local_names: reader.GetNames(), global_types, global_names, | 
|---|
| 164 | global_column_ids, filters: table_filters, reader_data&: reader.reader_data, initial_file); | 
|---|
| 165 | reader.reader_data.filters = table_filters; | 
|---|
| 166 | } | 
|---|
| 167 |  | 
|---|
| 168 | template <class BIND_DATA> | 
|---|
| 169 | static void PruneReaders(BIND_DATA &data) { | 
|---|
| 170 | unordered_set<string> file_set; | 
|---|
| 171 | for (auto &file : data.files) { | 
|---|
| 172 | file_set.insert(file); | 
|---|
| 173 | } | 
|---|
| 174 |  | 
|---|
| 175 | if (data.initial_reader) { | 
|---|
| 176 | // check if the initial reader should still be read | 
|---|
| 177 | auto entry = file_set.find(data.initial_reader->GetFileName()); | 
|---|
| 178 | if (entry == file_set.end()) { | 
|---|
| 179 | data.initial_reader.reset(); | 
|---|
| 180 | } | 
|---|
| 181 | } | 
|---|
| 182 | for (idx_t r = 0; r < data.union_readers.size(); r++) { | 
|---|
| 183 | // check if the union reader should still be read or not | 
|---|
| 184 | auto entry = file_set.find(data.union_readers[r]->GetFileName()); | 
|---|
| 185 | if (entry == file_set.end()) { | 
|---|
| 186 | data.union_readers.erase(data.union_readers.begin() + r); | 
|---|
| 187 | r--; | 
|---|
| 188 | continue; | 
|---|
| 189 | } | 
|---|
| 190 | } | 
|---|
| 191 | } | 
|---|
| 192 |  | 
|---|
| 193 | private: | 
|---|
| 194 | static void CreateNameMapping(const string &file_name, const vector<LogicalType> &local_types, | 
|---|
| 195 | const vector<string> &local_names, const vector<LogicalType> &global_types, | 
|---|
| 196 | const vector<string> &global_names, const vector<column_t> &global_column_ids, | 
|---|
| 197 | MultiFileReaderData &reader_data, const string &initial_file); | 
|---|
| 198 | }; | 
|---|
| 199 |  | 
|---|
| 200 | } // namespace duckdb | 
|---|
| 201 |  | 
|---|