1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18// This module defines an abstract interface for iterating through pages in a
19// Parquet column chunk within a row group. It could be extended in the future
20// to iterate through all data pages in all chunks in a file.
21
22#ifndef PARQUET_COLUMN_PAGE_H
23#define PARQUET_COLUMN_PAGE_H
24
25#include <cstdint>
26#include <memory>
27#include <string>
28
29#include "parquet/statistics.h"
30#include "parquet/types.h"
31
32namespace parquet {
33
34// TODO: Parallel processing is not yet safe because of memory-ownership
35// semantics (the PageReader may or may not own the memory referenced by a
36// page)
37//
38// TODO(wesm): In the future Parquet implementations may store the crc code
39// in format::PageHeader. parquet-mr currently does not, so we also skip it
40// here, both on the read and write path
41class Page {
42 public:
43 Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
44 : buffer_(buffer), type_(type) {}
45
46 PageType::type type() const { return type_; }
47
48 std::shared_ptr<Buffer> buffer() const { return buffer_; }
49
50 // @returns: a pointer to the page's data
51 const uint8_t* data() const { return buffer_->data(); }
52
53 // @returns: the total size in bytes of the page's data buffer
54 int32_t size() const { return static_cast<int32_t>(buffer_->size()); }
55
56 private:
57 std::shared_ptr<Buffer> buffer_;
58 PageType::type type_;
59};
60
61/// \brief Base type for DataPageV1 and DataPageV2 including common attributes
62class DataPage : public Page {
63 public:
64 int32_t num_values() const { return num_values_; }
65 Encoding::type encoding() const { return encoding_; }
66 const EncodedStatistics& statistics() const { return statistics_; }
67
68 protected:
69 DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
70 Encoding::type encoding,
71 const EncodedStatistics& statistics = EncodedStatistics())
72 : Page(buffer, type),
73 num_values_(num_values),
74 encoding_(encoding),
75 statistics_(statistics) {}
76
77 int32_t num_values_;
78 Encoding::type encoding_;
79 EncodedStatistics statistics_;
80};
81
82class DataPageV1 : public DataPage {
83 public:
84 DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
85 Encoding::type encoding, Encoding::type definition_level_encoding,
86 Encoding::type repetition_level_encoding,
87 const EncodedStatistics& statistics = EncodedStatistics())
88 : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, statistics),
89 definition_level_encoding_(definition_level_encoding),
90 repetition_level_encoding_(repetition_level_encoding) {}
91
92 Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }
93
94 Encoding::type definition_level_encoding() const { return definition_level_encoding_; }
95
96 private:
97 Encoding::type definition_level_encoding_;
98 Encoding::type repetition_level_encoding_;
99};
100
101class CompressedDataPage : public DataPageV1 {
102 public:
103 CompressedDataPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
104 Encoding::type encoding, Encoding::type definition_level_encoding,
105 Encoding::type repetition_level_encoding, int64_t uncompressed_size,
106 const EncodedStatistics& statistics = EncodedStatistics())
107 : DataPageV1(buffer, num_values, encoding, definition_level_encoding,
108 repetition_level_encoding, statistics),
109 uncompressed_size_(uncompressed_size) {}
110
111 int64_t uncompressed_size() const { return uncompressed_size_; }
112
113 private:
114 int64_t uncompressed_size_;
115};
116
117class DataPageV2 : public DataPage {
118 public:
119 DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
120 int32_t num_rows, Encoding::type encoding,
121 int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
122 bool is_compressed = false)
123 : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding),
124 num_nulls_(num_nulls),
125 num_rows_(num_rows),
126 definition_levels_byte_length_(definition_levels_byte_length),
127 repetition_levels_byte_length_(repetition_levels_byte_length),
128 is_compressed_(is_compressed) {}
129
130 int32_t num_nulls() const { return num_nulls_; }
131
132 int32_t num_rows() const { return num_rows_; }
133
134 int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }
135
136 int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }
137
138 bool is_compressed() const { return is_compressed_; }
139
140 private:
141 int32_t num_nulls_;
142 int32_t num_rows_;
143 int32_t definition_levels_byte_length_;
144 int32_t repetition_levels_byte_length_;
145 bool is_compressed_;
146
147 // TODO(wesm): format::DataPageHeaderV2.statistics
148};
149
150class DictionaryPage : public Page {
151 public:
152 DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
153 Encoding::type encoding, bool is_sorted = false)
154 : Page(buffer, PageType::DICTIONARY_PAGE),
155 num_values_(num_values),
156 encoding_(encoding),
157 is_sorted_(is_sorted) {}
158
159 int32_t num_values() const { return num_values_; }
160
161 Encoding::type encoding() const { return encoding_; }
162
163 bool is_sorted() const { return is_sorted_; }
164
165 private:
166 int32_t num_values_;
167 Encoding::type encoding_;
168 bool is_sorted_;
169};
170
171} // namespace parquet
172
173#endif // PARQUET_COLUMN_PAGE_H
174