1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#ifndef PARQUET_RECORD_READER_H
19#define PARQUET_RECORD_READER_H
20
21#include <cstdint>
22#include <memory>
23#include <vector>
24
25#include "arrow/memory_pool.h"
26
27#include "parquet/util/macros.h"
28#include "parquet/util/memory.h"
29
30namespace arrow {
31
32class Array;
33
34} // namespace arrow
35
36namespace parquet {
37
38class ColumnDescriptor;
39class PageReader;
40
41namespace internal {
42
43/// \brief Stateful column reader that delimits semantic records for both flat
44/// and nested columns
45///
46/// \note API EXPERIMENTAL
47/// \since 1.3.0
48class RecordReader {
49 public:
50 // So that we can create subclasses
51 class RecordReaderImpl;
52
53 static std::shared_ptr<RecordReader> Make(
54 const ColumnDescriptor* descr,
55 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
56
57 virtual ~RecordReader();
58
59 /// \brief Decoded definition levels
60 const int16_t* def_levels() const;
61
62 /// \brief Decoded repetition levels
63 const int16_t* rep_levels() const;
64
65 /// \brief Decoded values, including nulls, if any
66 const uint8_t* values() const;
67
68 /// \brief Attempt to read indicated number of records from column chunk
69 /// \return number of records read
70 int64_t ReadRecords(int64_t num_records);
71
72 /// \brief Pre-allocate space for data. Results in better flat read performance
73 void Reserve(int64_t num_values);
74
75 /// \brief Clear consumed values and repetition/definition levels as the
76 /// result of calling ReadRecords
77 void Reset();
78
79 std::shared_ptr<ResizableBuffer> ReleaseValues();
80 std::shared_ptr<ResizableBuffer> ReleaseIsValid();
81
82 /// \brief Number of values written including nulls (if any)
83 int64_t values_written() const;
84
85 /// \brief Number of definition / repetition levels (from those that have
86 /// been decoded) that have been consumed inside the reader.
87 int64_t levels_position() const;
88
89 /// \brief Number of definition / repetition levels that have been written
90 /// internally in the reader
91 int64_t levels_written() const;
92
93 /// \brief Number of nulls in the leaf
94 int64_t null_count() const;
95
96 /// \brief True if the leaf values are nullable
97 bool nullable_values() const;
98
99 /// \brief Return true if the record reader has more internal data yet to
100 /// process
101 bool HasMoreData() const;
102
103 /// \brief Advance record reader to the next row group
104 /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader
105 void SetPageReader(std::unique_ptr<PageReader> reader);
106
107 void DebugPrintState();
108
109 // For BYTE_ARRAY, FIXED_LEN_BYTE_ARRAY types that may have chunked output
110 std::vector<std::shared_ptr<::arrow::Array>> GetBuilderChunks();
111
112 private:
113 std::unique_ptr<RecordReaderImpl> impl_;
114 explicit RecordReader(RecordReaderImpl* impl);
115};
116
117} // namespace internal
118} // namespace parquet
119
120#endif // PARQUET_RECORD_READER_H
121