reader.h source code [arrow/parquet/arrow/reader.h]

1	// Licensed to the Apache Software Foundation (ASF) under one
2	// or more contributor license agreements. See the NOTICE file
3	// distributed with this work for additional information
4	// regarding copyright ownership. The ASF licenses this file
5	// to you under the Apache License, Version 2.0 (the
6	// "License"); you may not use this file except in compliance
7	// with the License. You may obtain a copy of the License at
8	//
9	// http://www.apache.org/licenses/LICENSE-2.0
10	//
11	// Unless required by applicable law or agreed to in writing,
12	// software distributed under the License is distributed on an
13	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14	// KIND, either express or implied. See the License for the
15	// specific language governing permissions and limitations
16	// under the License.
17
18	#ifndef PARQUET_ARROW_READER_H
19	#define PARQUET_ARROW_READER_H
20
21	#include <cstdint>
22	#include <memory>
23	#include <vector>
24
25	#include "parquet/util/visibility.h"
26
27	#include "arrow/io/interfaces.h"
28	#include "arrow/util/macros.h"
29
// Forward declarations of the Arrow types referenced by the signatures in this
// header. Only pointers, references and smart pointers to these types are used
// below, so the full definitions are not needed here.
namespace arrow {

class Array;
class ChunkedArray;
class MemoryPool;
class RecordBatchReader;
class Schema;
class Status;
class Table;

}  // namespace arrow
41
42	namespace parquet {
43
// Forward declarations of the core Parquet types that appear in the
// signatures declared further down in this header.
class FileMetaData;
class ParquetFileReader;
class ReaderProperties;
47
48	namespace arrow {
49
// Forward declarations of the reader classes defined later in this header, so
// that FileReader can name them before their definitions.
class ColumnChunkReader;
class ColumnReader;
class RowGroupReader;
53
54	// Arrow read adapter class for deserializing Parquet files as Arrow row
55	// batches.
56	//
57	// This interfaces caters for different use cases and thus provides different
58	// interfaces. In its most simplistic form, we cater for a user that wants to
59	// read the whole Parquet at once with the FileReader::ReadTable method.
60	//
61	// More advanced users that also want to implement parallelism on top of each
62	// single Parquet files should do this on the RowGroup level. For this, they can
63	// call FileReader::RowGroup(i)->ReadTable to receive only the specified
64	// RowGroup as a table.
65	//
66	// In the most advanced situation, where a consumer wants to independently read
67	// RowGroups in parallel and consume each column individually, they can call
68	// FileReader::RowGroup(i)->Column(j)->Read and receive an arrow::Column
69	// instance.
70	//
71	// TODO(wesm): nested data does not always make sense with this user
72	// interface unless you are only reading a single leaf node from a branch of
73	// a table. For example:
74	//
75	// repeated group data {
76	// optional group record {
77	// optional int32 val1;
78	// optional byte_array val2;
79	// optional bool val3;
80	// }
81	// optional int32 val4;
82	// }
83	//
84	// In the Parquet file, there are 3 leaf nodes:
85	//
86	// data.record.val1*
87	// data.record.val2*
88	// data.record.val3*
89	// data.val4*
90	//
91	// When materializing this data in an Arrow array, we would have:
92	//
93	// data: list<struct<
94	// record: struct<
95	// val1: int32,
96	// val2: string (= list<uint8>),
97	// val3: bool,
98	// >,
99	// val4: int32
100	// >>
101	//
102	// However, in the Parquet format, each leaf node has its own repetition and
103	// definition levels describing the structure of the intermediate nodes in
104	// this array structure. Thus, we will need to scan the leaf data for a group
105	// of leaf nodes part of the same type tree to create a single result Arrow
106	// nested array structure.
107	//
108	// This is additionally complicated "chunky" repeated fields or very large byte
109	// arrays
110	class PARQUET_EXPORT FileReader {
111	public:
112	FileReader(::arrow::MemoryPool* pool, std::unique_ptr<ParquetFileReader> reader);
113
114	// Since the distribution of columns amongst a Parquet file's row groups may
115	// be uneven (the number of values in each column chunk can be different), we
116	// provide a column-oriented read interface. The ColumnReader hides the
117	// details of paging through the file's row groups and yielding
118	// fully-materialized arrow::Array instances
119	//
120	// Returns error status if the column of interest is not flat.
121	::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out);
122
123	/// \brief Return arrow schema by apply selection of column indices.
124	/// \returns error status if passed wrong indices.
125	::arrow::Status GetSchema(const std::vector<int>& indices,
126	std::shared_ptr<::arrow::Schema>* out);
127
128	// Read column as a whole into an Array.
129	::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::ChunkedArray>* out);
130
131	/// \note Deprecated since 0.12
132	ARROW_DEPRECATED("Use version with ChunkedArray output")
133	::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::Array>* out);
134
135	// NOTE: Experimental API
136	// Reads a specific top level schema field into an Array
137	// The index i refers the index of the top level schema field, which may
138	// be nested or flat - e.g.
139	//
140	// 0 foo.bar
141	// foo.bar.baz
142	// foo.qux
143	// 1 foo2
144	// 2 foo3
145	//
146	// i=0 will read the entire foo struct, i=1 the foo2 primitive column etc
147	::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::ChunkedArray>* out);
148
149	/// \note Deprecated since 0.12
150	ARROW_DEPRECATED("Use version with ChunkedArray output")
151	::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::Array>* out);
152
153	/// \brief Return a RecordBatchReader of row groups selected from row_group_indices, the
154	/// ordering in row_group_indices matters.
155	/// \returns error Status if row_group_indices contains invalid index
156	::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
157	std::shared_ptr<::arrow::RecordBatchReader>* out);
158
159	/// \brief Return a RecordBatchReader of row groups selected from row_group_indices,
160	/// whose columns are selected by column_indices. The ordering in row_group_indices
161	/// and column_indices matter.
162	/// \returns error Status if either row_group_indices or column_indices contains invalid
163	/// index
164	::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
165	const std::vector<int>& column_indices,
166	std::shared_ptr<::arrow::RecordBatchReader>* out);
167
168	// Read a table of columns into a Table
169	::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out);
170
171	// Read a table of columns into a Table. Read only the indicated column
172	// indices (relative to the schema)
173	::arrow::Status ReadTable(const std::vector<int>& column_indices,
174	std::shared_ptr<::arrow::Table>* out);
175
176	::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
177	std::shared_ptr<::arrow::Table>* out);
178
179	::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out);
180
181	::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
182	const std::vector<int>& column_indices,
183	std::shared_ptr<::arrow::Table>* out);
184
185	::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
186	std::shared_ptr<::arrow::Table>* out);
187
188	/// \brief Scan file contents with one thread, return number of rows
189	::arrow::Status ScanContents(std::vector<int> columns, const int32_t column_batch_size,
190	int64_t* num_rows);
191
192	/// \brief Return a reader for the RowGroup, this object must not outlive the
193	/// FileReader.
194	std::shared_ptr<RowGroupReader> RowGroup(int row_group_index);
195
196	int num_row_groups() const;
197
198	const ParquetFileReader* parquet_reader() const;
199
200	/// Set the number of threads to use during reads of multiple columns. By
201	/// default only 1 thread is used
202	/// \deprecated Use set_use_threads instead.
203	ARROW_DEPRECATED("Use set_use_threads instead")
204	void set_num_threads(int num_threads);
205
206	/// Set whether to use multiple threads during reads of multiple columns.
207	/// By default only one thread is used.
208	void set_use_threads(bool use_threads);
209
210	virtual ~FileReader();
211
212	private:
213	friend ColumnChunkReader;
214	friend RowGroupReader;
215
216	class PARQUET_NO_EXPORT Impl;
217	std::unique_ptr<Impl> impl_;
218	};
219
220	class PARQUET_EXPORT RowGroupReader {
221	public:
222	std::shared_ptr<ColumnChunkReader> Column(int column_index);
223
224	::arrow::Status ReadTable(const std::vector<int>& column_indices,
225	std::shared_ptr<::arrow::Table>* out);
226	::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out);
227
228	virtual ~RowGroupReader();
229
230	private:
231	friend FileReader;
232	RowGroupReader(FileReader::Impl* reader, int row_group_index);
233
234	FileReader::Impl* impl_;
235	int row_group_index_;
236	};
237
238	class PARQUET_EXPORT ColumnChunkReader {
239	public:
240	::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out);
241
242	/// \note Deprecated since 0.12
243	ARROW_DEPRECATED("Use version with ChunkedArray output")
244	::arrow::Status Read(std::shared_ptr<::arrow::Array>* out);
245
246	virtual ~ColumnChunkReader();
247
248	private:
249	friend RowGroupReader;
250	ColumnChunkReader(FileReader::Impl* impl, int row_group_index, int column_index);
251
252	FileReader::Impl* impl_;
253	int column_index_;
254	int row_group_index_;
255	};
256
257	// At this point, the column reader is a stream iterator. It only knows how to
258	// read the next batch of values for a particular column from the file until it
259	// runs out.
260	//
261	// We also do not expose any internal Parquet details, such as row groups. This
262	// might change in the future.
263	class PARQUET_EXPORT ColumnReader {
264	public:
265	class PARQUET_NO_EXPORT ColumnReaderImpl;
266	virtual ~ColumnReader();
267
268	// Scan the next array of the indicated size. The actual size of the
269	// returned array may be less than the passed size depending how much data is
270	// available in the file.
271	//
272	// When all the data in the file has been exhausted, the result is set to
273	// nullptr.
274	//
275	// Returns Status::OK on a successful read, including if you have exhausted
276	// the data available in the file.
277	::arrow::Status NextBatch(int64_t batch_size,
278	std::shared_ptr<::arrow::ChunkedArray>* out);
279
280	/// \note Deprecated since 0.12
281	ARROW_DEPRECATED("Use version with ChunkedArray output")
282	::arrow::Status NextBatch(int64_t batch_size, std::shared_ptr<::arrow::Array>* out);
283
284	private:
285	std::unique_ptr<ColumnReaderImpl> impl_;
286	explicit ColumnReader(std::unique_ptr<ColumnReaderImpl> impl);
287
288	friend class FileReader;
289	friend class PrimitiveImpl;
290	friend class StructImpl;
291	};
292
293	// Helper function to create a file reader from an implementation of an Arrow
294	// readable file
295	//
296	// metadata : separately-computed file metadata, can be nullptr
297	PARQUET_EXPORT
298	::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file,
299	::arrow::MemoryPool* allocator,
300	const ReaderProperties& properties,
301	const std::shared_ptr<FileMetaData>& metadata,
302	std::unique_ptr<FileReader>* reader);
303
304	PARQUET_EXPORT
305	::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file,
306	::arrow::MemoryPool* allocator,
307	std::unique_ptr<FileReader>* reader);
308
309	} // namespace arrow
310	} // namespace parquet
311
312	#endif // PARQUET_ARROW_READER_H
313

Browse the source code of arrow/parquet/arrow/reader.h