1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#ifndef PARQUET_ARROW_READER_H
19#define PARQUET_ARROW_READER_H
20
21#include <cstdint>
22#include <memory>
23#include <vector>
24
25#include "parquet/util/visibility.h"
26
27#include "arrow/io/interfaces.h"
28#include "arrow/util/macros.h"
29
30namespace arrow {
31
32class Array;
33class ChunkedArray;
34class MemoryPool;
35class RecordBatchReader;
36class Schema;
37class Status;
38class Table;
39
40} // namespace arrow
41
42namespace parquet {
43
44class FileMetaData;
45class ParquetFileReader;
46class ReaderProperties;
47
48namespace arrow {
49
50class ColumnChunkReader;
51class ColumnReader;
52class RowGroupReader;
53
54// Arrow read adapter class for deserializing Parquet files as Arrow row
55// batches.
56//
// This interface caters to different use cases and thus provides different
// entry points. In its simplest form, we cater to a user that wants to
// read the whole Parquet file at once with the FileReader::ReadTable method.
60//
61// More advanced users that also want to implement parallelism on top of each
62// single Parquet files should do this on the RowGroup level. For this, they can
63// call FileReader::RowGroup(i)->ReadTable to receive only the specified
64// RowGroup as a table.
65//
// In the most advanced situation, where a consumer wants to independently read
// RowGroups in parallel and consume each column individually, they can call
// FileReader::RowGroup(i)->Column(j)->Read and receive an arrow::ChunkedArray
// instance.
70//
71// TODO(wesm): nested data does not always make sense with this user
72// interface unless you are only reading a single leaf node from a branch of
73// a table. For example:
74//
75// repeated group data {
76// optional group record {
77// optional int32 val1;
78// optional byte_array val2;
79// optional bool val3;
80// }
81// optional int32 val4;
82// }
83//
// In the Parquet file, there are 4 leaf nodes:
85//
86// * data.record.val1
87// * data.record.val2
88// * data.record.val3
89// * data.val4
90//
91// When materializing this data in an Arrow array, we would have:
92//
93// data: list<struct<
94// record: struct<
95// val1: int32,
96// val2: string (= list<uint8>),
97// val3: bool,
98// >,
99// val4: int32
100// >>
101//
102// However, in the Parquet format, each leaf node has its own repetition and
103// definition levels describing the structure of the intermediate nodes in
104// this array structure. Thus, we will need to scan the leaf data for a group
105// of leaf nodes part of the same type tree to create a single result Arrow
106// nested array structure.
107//
// This is additionally complicated by "chunky" repeated fields or very large
// byte arrays.
class PARQUET_EXPORT FileReader {
 public:
  /// \brief Construct a FileReader that deserializes via the given
  /// ParquetFileReader, allocating Arrow memory from `pool`.
  FileReader(::arrow::MemoryPool* pool, std::unique_ptr<ParquetFileReader> reader);

  // Since the distribution of columns amongst a Parquet file's row groups may
  // be uneven (the number of values in each column chunk can be different), we
  // provide a column-oriented read interface. The ColumnReader hides the
  // details of paging through the file's row groups and yielding
  // fully-materialized arrow::Array instances
  //
  // Returns error status if the column of interest is not flat.
  ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out);

  /// \brief Return the Arrow schema resulting from applying a selection of
  /// column indices.
  /// \returns error status if passed wrong indices.
  ::arrow::Status GetSchema(const std::vector<int>& indices,
                            std::shared_ptr<::arrow::Schema>* out);

  // Read column i as a whole into a ChunkedArray.
  ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::ChunkedArray>* out);

  /// \note Deprecated since 0.12
  ARROW_DEPRECATED("Use version with ChunkedArray output")
  ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::Array>* out);

  // NOTE: Experimental API
  // Reads a specific top level schema field into a ChunkedArray
  // The index i refers to the index of the top level schema field, which may
  // be nested or flat - e.g.
  //
  // 0 foo.bar
  //   foo.bar.baz
  //   foo.qux
  // 1 foo2
  // 2 foo3
  //
  // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc
  ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::ChunkedArray>* out);

  /// \note Deprecated since 0.12
  ARROW_DEPRECATED("Use version with ChunkedArray output")
  ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::Array>* out);

  /// \brief Return a RecordBatchReader of row groups selected from row_group_indices, the
  /// ordering in row_group_indices matters.
  /// \returns error Status if row_group_indices contains invalid index
  ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
                                       std::shared_ptr<::arrow::RecordBatchReader>* out);

  /// \brief Return a RecordBatchReader of row groups selected from row_group_indices,
  /// whose columns are selected by column_indices. The ordering in row_group_indices
  /// and column_indices matter.
  /// \returns error Status if either row_group_indices or column_indices contains invalid
  /// index
  ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
                                       const std::vector<int>& column_indices,
                                       std::shared_ptr<::arrow::RecordBatchReader>* out);

  // Read all columns of the file into a Table
  ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out);

  // Read a table of columns into a Table. Read only the indicated column
  // indices (relative to the schema)
  ::arrow::Status ReadTable(const std::vector<int>& column_indices,
                            std::shared_ptr<::arrow::Table>* out);

  /// \brief Read the indicated columns of row group i into a Table.
  ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
                               std::shared_ptr<::arrow::Table>* out);

  /// \brief Read all columns of row group i into a Table.
  ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out);

  /// \brief Read the indicated columns of the selected row groups into a Table.
  ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
                                const std::vector<int>& column_indices,
                                std::shared_ptr<::arrow::Table>* out);

  /// \brief Read all columns of the selected row groups into a Table.
  ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
                                std::shared_ptr<::arrow::Table>* out);

  /// \brief Scan file contents with one thread, return number of rows
  ::arrow::Status ScanContents(std::vector<int> columns, const int32_t column_batch_size,
                               int64_t* num_rows);

  /// \brief Return a reader for the RowGroup, this object must not outlive the
  /// FileReader.
  std::shared_ptr<RowGroupReader> RowGroup(int row_group_index);

  /// \brief Return the number of row groups in the file.
  int num_row_groups() const;

  /// \brief Return the underlying ParquetFileReader (non-owning; remains
  /// valid only while this FileReader is alive).
  const ParquetFileReader* parquet_reader() const;

  /// Set the number of threads to use during reads of multiple columns. By
  /// default only 1 thread is used
  /// \deprecated Use set_use_threads instead.
  ARROW_DEPRECATED("Use set_use_threads instead")
  void set_num_threads(int num_threads);

  /// Set whether to use multiple threads during reads of multiple columns.
  /// By default only one thread is used.
  void set_use_threads(bool use_threads);

  virtual ~FileReader();

 private:
  friend ColumnChunkReader;
  friend RowGroupReader;

  // PIMPL: hides implementation details from this public header.
  class PARQUET_NO_EXPORT Impl;
  std::unique_ptr<Impl> impl_;
};
219
/// \brief Arrow read adapter for a single row group of a Parquet file.
///
/// Obtained from FileReader::RowGroup(i); must not outlive the FileReader
/// that created it (impl_ points into the FileReader's implementation).
class PARQUET_EXPORT RowGroupReader {
 public:
  /// \brief Return a reader for the column chunk at column_index within this
  /// row group.
  std::shared_ptr<ColumnChunkReader> Column(int column_index);

  /// \brief Read the indicated columns of this row group into a Table.
  ::arrow::Status ReadTable(const std::vector<int>& column_indices,
                            std::shared_ptr<::arrow::Table>* out);
  /// \brief Read all columns of this row group into a Table.
  ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out);

  virtual ~RowGroupReader();

 private:
  // Constructed only by FileReader (friend below).
  friend FileReader;
  RowGroupReader(FileReader::Impl* reader, int row_group_index);

  FileReader::Impl* impl_;  // non-owning; owned by the parent FileReader
  int row_group_index_;     // which row group this reader targets
};
237
/// \brief Arrow read adapter for a single column chunk (one column within one
/// row group).
///
/// Obtained from RowGroupReader::Column(j); holds a non-owning pointer into
/// the parent FileReader's implementation.
class PARQUET_EXPORT ColumnChunkReader {
 public:
  /// \brief Read the whole column chunk into a ChunkedArray.
  ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out);

  /// \note Deprecated since 0.12
  ARROW_DEPRECATED("Use version with ChunkedArray output")
  ::arrow::Status Read(std::shared_ptr<::arrow::Array>* out);

  virtual ~ColumnChunkReader();

 private:
  // Constructed only by RowGroupReader (friend below).
  friend RowGroupReader;
  ColumnChunkReader(FileReader::Impl* impl, int row_group_index, int column_index);

  FileReader::Impl* impl_;  // non-owning; owned by the parent FileReader
  int column_index_;
  int row_group_index_;
};
256
257// At this point, the column reader is a stream iterator. It only knows how to
258// read the next batch of values for a particular column from the file until it
259// runs out.
260//
261// We also do not expose any internal Parquet details, such as row groups. This
262// might change in the future.
class PARQUET_EXPORT ColumnReader {
 public:
  // Opaque implementation class (PIMPL); defined out of line.
  class PARQUET_NO_EXPORT ColumnReaderImpl;
  virtual ~ColumnReader();

  // Scan the next array of the indicated size. The actual size of the
  // returned array may be less than the passed size depending how much data is
  // available in the file.
  //
  // When all the data in the file has been exhausted, the result is set to
  // nullptr.
  //
  // Returns Status::OK on a successful read, including if you have exhausted
  // the data available in the file.
  ::arrow::Status NextBatch(int64_t batch_size,
                            std::shared_ptr<::arrow::ChunkedArray>* out);

  /// \note Deprecated since 0.12
  ARROW_DEPRECATED("Use version with ChunkedArray output")
  ::arrow::Status NextBatch(int64_t batch_size, std::shared_ptr<::arrow::Array>* out);

 private:
  std::unique_ptr<ColumnReaderImpl> impl_;
  // Constructible only by the friends declared below.
  explicit ColumnReader(std::unique_ptr<ColumnReaderImpl> impl);

  friend class FileReader;
  friend class PrimitiveImpl;
  friend class StructImpl;
};
292
/// \brief Create a FileReader from an implementation of an Arrow readable
/// file interface.
///
/// \param file the readable file to deserialize from
/// \param allocator memory pool used for Arrow allocations
/// \param properties reader properties for the underlying ParquetFileReader
/// \param metadata separately-computed file metadata, can be nullptr
/// \param reader out: the constructed FileReader
PARQUET_EXPORT
::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file,
                         ::arrow::MemoryPool* allocator,
                         const ReaderProperties& properties,
                         const std::shared_ptr<FileMetaData>& metadata,
                         std::unique_ptr<FileReader>* reader);
303
/// \brief Convenience overload of OpenFile that omits explicit reader
/// properties and precomputed metadata.
PARQUET_EXPORT
::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file,
                         ::arrow::MemoryPool* allocator,
                         std::unique_ptr<FileReader>* reader);
308
309} // namespace arrow
310} // namespace parquet
311
312#endif // PARQUET_ARROW_READER_H
313