1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef PARQUET_ARROW_READER_H |
19 | #define PARQUET_ARROW_READER_H |
20 | |
21 | #include <cstdint> |
22 | #include <memory> |
23 | #include <vector> |
24 | |
25 | #include "parquet/util/visibility.h" |
26 | |
27 | #include "arrow/io/interfaces.h" |
28 | #include "arrow/util/macros.h" |
29 | |
// Forward declarations of the Arrow core types that the declarations in this
// header refer to (as return types, pointers or smart-pointer targets), so that
// the corresponding Arrow headers need not be included here.
30 | namespace arrow { |
31 | |
32 | class Array; |
33 | class ChunkedArray; |
34 | class MemoryPool; |
35 | class RecordBatchReader; |
36 | class Schema; |
37 | class Status; |
38 | class Table; |
39 | |
40 | } // namespace arrow |
41 | |
42 | namespace parquet { |
43 | |
// Forward declarations of core Parquet types used only by pointer, reference or
// smart pointer in the signatures of this header.
44 | class FileMetaData; |
45 | class ParquetFileReader; |
46 | class ReaderProperties; |
47 | |
48 | namespace arrow { |
49 | |
// Forward declarations of the reader classes of this header, so that they can
// name each other (friend declarations, return types) before being defined.
50 | class ColumnChunkReader; |
51 | class ColumnReader; |
52 | class RowGroupReader; |
53 | |
54 | // Arrow read adapter class for deserializing Parquet files as Arrow row |
55 | // batches. |
56 | // |
57 | // This interface caters for different use cases and thus provides different |
58 | // interfaces. In its most simplistic form, we cater for a user that wants to |
59 | // read the whole Parquet at once with the FileReader::ReadTable method. |
60 | // |
61 | // More advanced users who also want to implement parallelism on top of a |
62 | // single Parquet file should do this on the RowGroup level. For this, they can |
63 | // call FileReader::RowGroup(i)->ReadTable to receive only the specified |
64 | // RowGroup as a table. |
65 | // |
66 | // In the most advanced situation, where a consumer wants to independently read |
67 | // RowGroups in parallel and consume each column individually, they can call |
68 | // FileReader::RowGroup(i)->Column(j)->Read and receive an arrow::ChunkedArray |
69 | // instance. |
70 | // |
71 | // TODO(wesm): nested data does not always make sense with this user |
72 | // interface unless you are only reading a single leaf node from a branch of |
73 | // a table. For example: |
74 | // |
75 | // repeated group data { |
76 | // optional group record { |
77 | // optional int32 val1; |
78 | // optional byte_array val2; |
79 | // optional bool val3; |
80 | // } |
81 | // optional int32 val4; |
82 | // } |
83 | // |
84 | // In the Parquet file, there are 4 leaf nodes: |
85 | // |
86 | // * data.record.val1 |
87 | // * data.record.val2 |
88 | // * data.record.val3 |
89 | // * data.val4 |
90 | // |
91 | // When materializing this data in an Arrow array, we would have: |
92 | // |
93 | // data: list<struct< |
94 | // record: struct< |
95 | // val1: int32, |
96 | // val2: string (= list<uint8>), |
97 | // val3: bool, |
98 | // >, |
99 | // val4: int32 |
100 | // >> |
101 | // |
102 | // However, in the Parquet format, each leaf node has its own repetition and |
103 | // definition levels describing the structure of the intermediate nodes in |
104 | // this array structure. Thus, we will need to scan the leaf data for a group |
105 | // of leaf nodes part of the same type tree to create a single result Arrow |
106 | // nested array structure. |
107 | // |
108 | // This is additionally complicated by "chunky" repeated fields or very large |
109 | // byte arrays. |
// Arrow-facing reader for a Parquet file. The only data member is a
// std::unique_ptr to the opaque Impl class declared at the bottom, so the
// implementation details stay out of this header. Fallible operations return
// ::arrow::Status and deliver their result through an out-parameter.
110 | class PARQUET_EXPORT FileReader { |
111 | public: |
// Takes ownership of `reader` (it is passed as std::unique_ptr by value).
// NOTE(review): `pool` is presumably the pool used for Arrow allocations made
// while reading - confirm against the implementation.
112 | FileReader(::arrow::MemoryPool* pool, std::unique_ptr<ParquetFileReader> reader); |
113 | |
114 | // Since the distribution of columns amongst a Parquet file's row groups may |
115 | // be uneven (the number of values in each column chunk can be different), we |
116 | // provide a column-oriented read interface. The ColumnReader hides the |
117 | // details of paging through the file's row groups and yielding |
118 | // fully-materialized arrow::Array instances. The reader is handed back via out. |
119 | // |
120 | // Returns error status if the column of interest is not flat. |
121 | ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out); |
122 | |
123 | /// \brief Return the Arrow schema obtained by applying a selection of column indices. |
124 | /// \returns error status if passed wrong indices. |
125 | ::arrow::Status GetSchema(const std::vector<int>& indices, |
126 | std::shared_ptr<::arrow::Schema>* out); |
127 | |
128 | // Read column i as a whole into a ChunkedArray. |
129 | ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::ChunkedArray>* out); |
130 | |
131 | /// \note Deprecated since 0.12; same read, but with a single Array as output. |
132 | ARROW_DEPRECATED("Use version with ChunkedArray output" ) |
133 | ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::Array>* out); |
134 | |
135 | // NOTE: Experimental API |
136 | // Reads a specific top level schema field into a ChunkedArray. |
137 | // The index i refers to the index of the top level schema field, which may |
138 | // be nested or flat - e.g. |
139 | // |
140 | // 0 foo.bar |
141 | // foo.bar.baz |
142 | // foo.qux |
143 | // 1 foo2 |
144 | // 2 foo3 |
145 | // |
146 | // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc |
147 | ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::ChunkedArray>* out); |
148 | |
149 | /// \note Deprecated since 0.12; same read, but with a single Array as output. |
150 | ARROW_DEPRECATED("Use version with ChunkedArray output" ) |
151 | ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::Array>* out); |
152 | |
153 | /// \brief Return a RecordBatchReader of row groups selected from row_group_indices, the |
154 | /// ordering in row_group_indices matters. |
155 | /// \returns error Status if row_group_indices contains invalid index |
156 | ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices, |
157 | std::shared_ptr<::arrow::RecordBatchReader>* out); |
158 | |
159 | /// \brief Return a RecordBatchReader of row groups selected from row_group_indices, |
160 | /// whose columns are selected by column_indices. The ordering in row_group_indices |
161 | /// and column_indices matter. |
162 | /// \returns error Status if either row_group_indices or column_indices contains invalid |
163 | /// index |
164 | ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices, |
165 | const std::vector<int>& column_indices, |
166 | std::shared_ptr<::arrow::RecordBatchReader>* out); |
167 | |
168 | // Read the whole file (no column selection) into a Table |
169 | ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out); |
170 | |
171 | // Read a table of columns into a Table. Read only the indicated column |
172 | // indices (relative to the schema) |
173 | ::arrow::Status ReadTable(const std::vector<int>& column_indices, |
174 | std::shared_ptr<::arrow::Table>* out); |
175 | |
// Read the single row group with index i into a Table.
// NOTE(review): column_indices presumably restricts the read to those columns,
// as in ReadTable above - confirm against the implementation.
176 | ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices, |
177 | std::shared_ptr<::arrow::Table>* out); |
178 | |
// Overload of ReadRowGroup without a column selection.
179 | ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out); |
180 | |
// Read the row groups listed in row_groups into one Table.
// NOTE(review): whether the order of row_groups is preserved in the result is
// not visible from this header - confirm before relying on it.
181 | ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups, |
182 | const std::vector<int>& column_indices, |
183 | std::shared_ptr<::arrow::Table>* out); |
184 | |
// Overload of ReadRowGroups without a column selection.
185 | ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups, |
186 | std::shared_ptr<::arrow::Table>* out); |
187 | |
188 | /// \brief Scan file contents with one thread, return number of rows (via num_rows) |
189 | ::arrow::Status ScanContents(std::vector<int> columns, const int32_t column_batch_size, |
190 | int64_t* num_rows); |
191 | |
192 | /// \brief Return a reader for the RowGroup, this object must not outlive the |
193 | /// FileReader. |
194 | std::shared_ptr<RowGroupReader> RowGroup(int row_group_index); |
195 | |
// NOTE(review): presumably the number of row groups in the underlying file.
196 | int num_row_groups() const; |
197 | |
// Read-only access to a ParquetFileReader; the caller receives a raw pointer.
// NOTE(review): presumably the reader handed to the constructor - confirm.
198 | const ParquetFileReader* parquet_reader() const; |
199 | |
200 | /// Set the number of threads to use during reads of multiple columns. By |
201 | /// default only 1 thread is used |
202 | /// \deprecated Use set_use_threads instead. |
203 | ARROW_DEPRECATED("Use set_use_threads instead" ) |
204 | void set_num_threads(int num_threads); |
205 | |
206 | /// Set whether to use multiple threads during reads of multiple columns. |
207 | /// By default only one thread is used. |
208 | void set_use_threads(bool use_threads); |
209 | |
210 | virtual ~FileReader(); |
211 | |
212 | private: |
// The per-row-group and per-column-chunk readers are friends; both store a
// raw FileReader::Impl* in their own declarations.
213 | friend ColumnChunkReader; |
214 | friend RowGroupReader; |
215 | |
// Opaque implementation class, not exported and not defined in this header.
216 | class PARQUET_NO_EXPORT Impl; |
217 | std::unique_ptr<Impl> impl_; |
218 | }; |
219 | |
// Reader bound to one row group of a file. The constructor is private and
// FileReader is a friend, so instances come from FileReader (see
// FileReader::RowGroup). The object stores a raw FileReader::Impl* and the
// row group index it was created with.
220 | class PARQUET_EXPORT RowGroupReader { |
221 | public: |
// Return a reader for the column chunk with the given column index.
// NOTE(review): presumably scoped to this reader's row group - confirm.
222 | std::shared_ptr<ColumnChunkReader> Column(int column_index); |
223 | |
// Read into a Table, either restricted by column_indices or (second
// overload) without a column selection.
// NOTE(review): presumably reads only this row group - confirm.
224 | ::arrow::Status ReadTable(const std::vector<int>& column_indices, |
225 | std::shared_ptr<::arrow::Table>* out); |
226 | ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out); |
227 | |
228 | virtual ~RowGroupReader(); |
229 | |
230 | private: |
231 | friend FileReader; |
232 | RowGroupReader(FileReader::Impl* reader, int row_group_index); |
233 | |
// Raw pointer to the FileReader implementation.
// NOTE(review): presumably non-owning, which would be why this object must
// not outlive the FileReader it came from - confirm.
234 | FileReader::Impl* impl_; |
235 | int row_group_index_; |
236 | }; |
237 | |
// Reader for a single column chunk, identified by a (row group index, column
// index) pair. The constructor is private and RowGroupReader is a friend, so
// instances come from RowGroupReader (see RowGroupReader::Column).
238 | class PARQUET_EXPORT ColumnChunkReader { |
239 | public: |
// Read into a ChunkedArray returned through out.
// NOTE(review): presumably reads exactly the column chunk this reader was
// created for - confirm against the implementation.
240 | ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out); |
241 | |
242 | /// \note Deprecated since 0.12; same read, but with a single Array as output. |
243 | ARROW_DEPRECATED("Use version with ChunkedArray output" ) |
244 | ::arrow::Status Read(std::shared_ptr<::arrow::Array>* out); |
245 | |
246 | virtual ~ColumnChunkReader(); |
247 | |
248 | private: |
249 | friend RowGroupReader; |
250 | ColumnChunkReader(FileReader::Impl* impl, int row_group_index, int column_index); |
251 | |
// Raw pointer to the FileReader implementation.
// NOTE(review): presumably non-owning, so it must stay valid while this
// reader is in use - confirm.
252 | FileReader::Impl* impl_; |
// NOTE(review): members are declared column_index_ then row_group_index_, the
// reverse of the constructor's parameter order; members are initialized in
// declaration order, so the constructor's initializer list should follow it.
253 | int column_index_; |
254 | int row_group_index_; |
255 | }; |
256 | |
257 | // At this point, the column reader is a stream iterator. It only knows how to |
258 | // read the next batch of values for a particular column from the file until it |
259 | // runs out. |
260 | // |
261 | // We also do not expose any internal Parquet details, such as row groups. This |
262 | // might change in the future. |
// Batch-wise reader for one column. State is held in an opaque
// ColumnReaderImpl behind a std::unique_ptr; the constructor is private, so
// instances are created by the friends listed at the bottom of the class.
263 | class PARQUET_EXPORT ColumnReader { |
264 | public: |
// Opaque implementation type, not exported and not defined in this header.
265 | class PARQUET_NO_EXPORT ColumnReaderImpl; |
266 | virtual ~ColumnReader(); |
267 | |
268 | // Read the next batch of at most batch_size values. The actual size of the |
269 | // returned array may be less than the passed size depending on how much data is |
270 | // available in the file. |
271 | // |
272 | // When all the data in the file has been exhausted, the result is set to |
273 | // nullptr. |
274 | // |
275 | // Returns Status::OK on a successful read, including if you have exhausted |
276 | // the data available in the file. |
277 | ::arrow::Status NextBatch(int64_t batch_size, |
278 | std::shared_ptr<::arrow::ChunkedArray>* out); |
279 | |
280 | /// \note Deprecated since 0.12; same read, but with a single Array as output. |
281 | ARROW_DEPRECATED("Use version with ChunkedArray output" ) |
282 | ::arrow::Status NextBatch(int64_t batch_size, std::shared_ptr<::arrow::Array>* out); |
283 | |
284 | private: |
285 | std::unique_ptr<ColumnReaderImpl> impl_; |
// Takes ownership of the implementation object (std::unique_ptr by value).
286 | explicit ColumnReader(std::unique_ptr<ColumnReaderImpl> impl); |
287 | |
// NOTE(review): PrimitiveImpl and StructImpl are not declared in this header;
// presumably they are implementation classes defined elsewhere - confirm.
288 | friend class FileReader; |
289 | friend class PrimitiveImpl; |
290 | friend class StructImpl; |
291 | }; |
292 | |
293 | // Helper function to create a file reader from an implementation of an Arrow |
294 | // readable file. The new FileReader is returned through reader. |
295 | // |
296 | // metadata : separately-computed file metadata, can be nullptr |
// NOTE(review): allocator and properties are presumably forwarded to the
// created reader - confirm against the implementation.
297 | PARQUET_EXPORT |
298 | ::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file, |
299 | ::arrow::MemoryPool* allocator, |
300 | const ReaderProperties& properties, |
301 | const std::shared_ptr<FileMetaData>& metadata, |
302 | std::unique_ptr<FileReader>* reader); |
303 | |
// Overload of OpenFile that takes neither reader properties nor pre-computed
// file metadata; the new FileReader is returned through reader.
// NOTE(review): presumably default properties are used and the metadata is
// read from the file itself - confirm against the implementation.
304 | PARQUET_EXPORT |
305 | ::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file, |
306 | ::arrow::MemoryPool* allocator, |
307 | std::unique_ptr<FileReader>* reader); |
308 | |
309 | } // namespace arrow |
310 | } // namespace parquet |
311 | |
312 | #endif // PARQUET_ARROW_READER_H |
313 | |