1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
// This module defines the classes that represent the pages of a Parquet
// column chunk within a row group: the common Page base class plus data
// (V1 and V2), compressed-data and dictionary pages. The interface for
// iterating through those pages is not declared in this header.
21
22#ifndef PARQUET_COLUMN_PAGE_H
23#define PARQUET_COLUMN_PAGE_H
24
25#include <cstdint>
26#include <memory>
27#include <string>
28
29#include "parquet/statistics.h"
30#include "parquet/types.h"
31#include "parquet/util/memory.h"
32
33namespace parquet {
34
35// TODO: Parallel processing is not yet safe because of memory-ownership
36// semantics (the PageReader may or may not own the memory referenced by a
37// page)
38//
39// TODO(wesm): In the future Parquet implementations may store the crc code
40// in format::PageHeader. parquet-mr currently does not, so we also skip it
41// here, both on the read and write path
42class Page {
43 public:
44 Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
45 : buffer_(buffer), type_(type) {}
46
47 PageType::type type() const { return type_; }
48
49 std::shared_ptr<Buffer> buffer() const { return buffer_; }
50
51 // @returns: a pointer to the page's data
52 const uint8_t* data() const { return buffer_->data(); }
53
54 // @returns: the total size in bytes of the page's data buffer
55 int32_t size() const { return static_cast<int32_t>(buffer_->size()); }
56
57 private:
58 std::shared_ptr<Buffer> buffer_;
59 PageType::type type_;
60};
61
62class DataPage : public Page {
63 public:
64 DataPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
65 Encoding::type encoding, Encoding::type definition_level_encoding,
66 Encoding::type repetition_level_encoding,
67 const EncodedStatistics& statistics = EncodedStatistics())
68 : Page(buffer, PageType::DATA_PAGE),
69 num_values_(num_values),
70 encoding_(encoding),
71 definition_level_encoding_(definition_level_encoding),
72 repetition_level_encoding_(repetition_level_encoding),
73 statistics_(statistics) {}
74
75 int32_t num_values() const { return num_values_; }
76
77 Encoding::type encoding() const { return encoding_; }
78
79 Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }
80
81 Encoding::type definition_level_encoding() const { return definition_level_encoding_; }
82
83 const EncodedStatistics& statistics() const { return statistics_; }
84
85 private:
86 int32_t num_values_;
87 Encoding::type encoding_;
88 Encoding::type definition_level_encoding_;
89 Encoding::type repetition_level_encoding_;
90 EncodedStatistics statistics_;
91};
92
93class CompressedDataPage : public DataPage {
94 public:
95 CompressedDataPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
96 Encoding::type encoding, Encoding::type definition_level_encoding,
97 Encoding::type repetition_level_encoding, int64_t uncompressed_size,
98 const EncodedStatistics& statistics = EncodedStatistics())
99 : DataPage(buffer, num_values, encoding, definition_level_encoding,
100 repetition_level_encoding, statistics),
101 uncompressed_size_(uncompressed_size) {}
102
103 int64_t uncompressed_size() const { return uncompressed_size_; }
104
105 private:
106 int64_t uncompressed_size_;
107};
108
109class DataPageV2 : public Page {
110 public:
111 DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
112 int32_t num_rows, Encoding::type encoding,
113 int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
114 bool is_compressed = false)
115 : Page(buffer, PageType::DATA_PAGE_V2),
116 num_values_(num_values),
117 num_nulls_(num_nulls),
118 num_rows_(num_rows),
119 encoding_(encoding),
120 definition_levels_byte_length_(definition_levels_byte_length),
121 repetition_levels_byte_length_(repetition_levels_byte_length),
122 is_compressed_(is_compressed) {}
123
124 int32_t num_values() const { return num_values_; }
125
126 int32_t num_nulls() const { return num_nulls_; }
127
128 int32_t num_rows() const { return num_rows_; }
129
130 Encoding::type encoding() const { return encoding_; }
131
132 int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }
133
134 int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }
135
136 bool is_compressed() const { return is_compressed_; }
137
138 private:
139 int32_t num_values_;
140 int32_t num_nulls_;
141 int32_t num_rows_;
142 Encoding::type encoding_;
143 int32_t definition_levels_byte_length_;
144 int32_t repetition_levels_byte_length_;
145 bool is_compressed_;
146
147 // TODO(wesm): format::DataPageHeaderV2.statistics
148};
149
150class DictionaryPage : public Page {
151 public:
152 DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
153 Encoding::type encoding, bool is_sorted = false)
154 : Page(buffer, PageType::DICTIONARY_PAGE),
155 num_values_(num_values),
156 encoding_(encoding),
157 is_sorted_(is_sorted) {}
158
159 int32_t num_values() const { return num_values_; }
160
161 Encoding::type encoding() const { return encoding_; }
162
163 bool is_sorted() const { return is_sorted_; }
164
165 private:
166 int32_t num_values_;
167 Encoding::type encoding_;
168 bool is_sorted_;
169};
170
171} // namespace parquet
172
173#endif // PARQUET_COLUMN_PAGE_H
174