1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#ifndef PARQUET_FILE_METADATA_H
19#define PARQUET_FILE_METADATA_H
20
21#include <cstdint>
22#include <memory>
23#include <string>
24#include <vector>
25
26#include "arrow/util/key_value_metadata.h"
27
28#include "parquet/platform.h"
29#include "parquet/properties.h"
30#include "parquet/types.h"
31
32namespace parquet {
33
34class ColumnDescriptor;
35class EncodedStatistics;
36class Statistics;
37class SchemaDescriptor;
38
39namespace schema {
40
41class ColumnPath;
42
43} // namespace schema
44
45using KeyValueMetadata = ::arrow::KeyValueMetadata;
46
47class PARQUET_EXPORT ApplicationVersion {
48 public:
49 // Known Versions with Issues
50 static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
51 static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
52 static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
53 static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
54 // Regular expression for the version format
55 // major . minor . patch unknown - prerelease.x + build info
56 // Eg: 1.5.0ab-cdh5.5.0+cd
57 static constexpr char const* VERSION_FORMAT =
58 "^(\\d+)\\.(\\d+)\\.(\\d+)([^-+]*)?(?:-([^+]*))?(?:\\+(.*))?$";
59 // Regular expression for the application format
60 // application_name version VERSION_FORMAT (build build_name)
61 // Eg: parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
62 static constexpr char const* APPLICATION_FORMAT =
63 "(.*?)\\s*(?:(version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?)?)";
64
65 // Application that wrote the file. e.g. "IMPALA"
66 std::string application_;
67 // Build name
68 std::string build_;
69
70 // Version of the application that wrote the file, expressed as
71 // (<major>.<minor>.<patch>). Unmatched parts default to 0.
72 // "1.2.3" => {1, 2, 3}
73 // "1.2" => {0, 0, 0}
74 // "1.2-cdh5" => {0, 0, 0}
75 // TODO (majetideepak): Implement support for pre_release
76 struct {
77 int major;
78 int minor;
79 int patch;
80 std::string unknown;
81 std::string pre_release;
82 std::string build_info;
83 } version;
84
85 ApplicationVersion() {}
86 explicit ApplicationVersion(const std::string& created_by);
87 ApplicationVersion(const std::string& application, int major, int minor, int patch);
88
89 // Returns true if version is strictly less than other_version
90 bool VersionLt(const ApplicationVersion& other_version) const;
91
92 // Returns true if version is strictly less than other_version
93 bool VersionEq(const ApplicationVersion& other_version) const;
94
95 // Checks if the Version has the correct statistics for a given column
96 bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics,
97 SortOrder::type sort_order = SortOrder::SIGNED) const;
98};
99
100class PARQUET_EXPORT ColumnChunkMetaData {
101 public:
102 // API convenience to get a MetaData accessor
103 static std::unique_ptr<ColumnChunkMetaData> Make(
104 const void* metadata, const ColumnDescriptor* descr,
105 const ApplicationVersion* writer_version = NULLPTR);
106
107 ~ColumnChunkMetaData();
108
109 // column chunk
110 int64_t file_offset() const;
111
112 // parameter is only used when a dataset is spread across multiple files
113 const std::string& file_path() const;
114
115 // column metadata
116 Type::type type() const;
117 int64_t num_values() const;
118 std::shared_ptr<schema::ColumnPath> path_in_schema() const;
119 bool is_stats_set() const;
120 std::shared_ptr<Statistics> statistics() const;
121 Compression::type compression() const;
122 const std::vector<Encoding::type>& encodings() const;
123 bool has_dictionary_page() const;
124 int64_t dictionary_page_offset() const;
125 int64_t data_page_offset() const;
126 bool has_index_page() const;
127 int64_t index_page_offset() const;
128 int64_t total_compressed_size() const;
129 int64_t total_uncompressed_size() const;
130
131 private:
132 explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr,
133 const ApplicationVersion* writer_version = NULLPTR);
134 // PIMPL Idiom
135 class ColumnChunkMetaDataImpl;
136 std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
137};
138
139class PARQUET_EXPORT RowGroupMetaData {
140 public:
141 // API convenience to get a MetaData accessor
142 static std::unique_ptr<RowGroupMetaData> Make(
143 const void* metadata, const SchemaDescriptor* schema,
144 const ApplicationVersion* writer_version = NULLPTR);
145
146 ~RowGroupMetaData();
147
148 // row-group metadata
149 int num_columns() const;
150 int64_t num_rows() const;
151 int64_t total_byte_size() const;
152 // Return const-pointer to make it clear that this object is not to be copied
153 const SchemaDescriptor* schema() const;
154 std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) const;
155
156 private:
157 explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema,
158 const ApplicationVersion* writer_version = NULLPTR);
159 // PIMPL Idiom
160 class RowGroupMetaDataImpl;
161 std::unique_ptr<RowGroupMetaDataImpl> impl_;
162};
163
164class FileMetaDataBuilder;
165
166class PARQUET_EXPORT FileMetaData {
167 public:
168 // API convenience to get a MetaData accessor
169 static std::shared_ptr<FileMetaData> Make(const void* serialized_metadata,
170 uint32_t* metadata_len);
171
172 ~FileMetaData();
173
174 // file metadata
175 uint32_t size() const;
176
177 int num_columns() const;
178
179 int64_t num_rows() const;
180
181 int num_row_groups() const;
182 ParquetVersion::type version() const;
183 const std::string& created_by() const;
184 int num_schema_elements() const;
185 std::unique_ptr<RowGroupMetaData> RowGroup(int i) const;
186
187 const ApplicationVersion& writer_version() const;
188
189 void WriteTo(::arrow::io::OutputStream* dst) const;
190
191 // Return const-pointer to make it clear that this object is not to be copied
192 const SchemaDescriptor* schema() const;
193
194 std::shared_ptr<const KeyValueMetadata> key_value_metadata() const;
195
196 // Set file_path ColumnChunk fields to a particular value
197 void set_file_path(const std::string& path);
198
199 // Merge row-group metadata from "other" FileMetaData object
200 void AppendRowGroups(const FileMetaData& other);
201
202 private:
203 friend FileMetaDataBuilder;
204 explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len);
205
206 // PIMPL Idiom
207 FileMetaData();
208 class FileMetaDataImpl;
209 std::unique_ptr<FileMetaDataImpl> impl_;
210};
211
212// Builder API
213class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
214 public:
215 // API convenience to get a MetaData reader
216 static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
217 const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column);
218
219 static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
220 const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column,
221 void* contents);
222
223 ~ColumnChunkMetaDataBuilder();
224
225 // column chunk
226 // Used when a dataset is spread across multiple files
227 void set_file_path(const std::string& path);
228 // column metadata
229 void SetStatistics(const EncodedStatistics& stats);
230 // get the column descriptor
231 const ColumnDescriptor* descr() const;
232 // commit the metadata
233 void Finish(int64_t num_values, int64_t dictonary_page_offset,
234 int64_t index_page_offset, int64_t data_page_offset,
235 int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
236 bool dictionary_fallback);
237
238 // The metadata contents, suitable for passing to ColumnChunkMetaData::Make
239 const void* contents() const;
240
241 // For writing metadata at end of column chunk
242 void WriteTo(::arrow::io::OutputStream* sink);
243
244 private:
245 explicit ColumnChunkMetaDataBuilder(const std::shared_ptr<WriterProperties>& props,
246 const ColumnDescriptor* column);
247 explicit ColumnChunkMetaDataBuilder(const std::shared_ptr<WriterProperties>& props,
248 const ColumnDescriptor* column, void* contents);
249 // PIMPL Idiom
250 class ColumnChunkMetaDataBuilderImpl;
251 std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_;
252};
253
254class PARQUET_EXPORT RowGroupMetaDataBuilder {
255 public:
256 // API convenience to get a MetaData reader
257 static std::unique_ptr<RowGroupMetaDataBuilder> Make(
258 const std::shared_ptr<WriterProperties>& props, const SchemaDescriptor* schema_,
259 void* contents);
260
261 ~RowGroupMetaDataBuilder();
262
263 ColumnChunkMetaDataBuilder* NextColumnChunk();
264 int num_columns();
265 int64_t num_rows();
266 int current_column() const;
267
268 void set_num_rows(int64_t num_rows);
269
270 // commit the metadata
271 void Finish(int64_t total_bytes_written);
272
273 private:
274 explicit RowGroupMetaDataBuilder(const std::shared_ptr<WriterProperties>& props,
275 const SchemaDescriptor* schema_, void* contents);
276 // PIMPL Idiom
277 class RowGroupMetaDataBuilderImpl;
278 std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_;
279};
280
281class PARQUET_EXPORT FileMetaDataBuilder {
282 public:
283 // API convenience to get a MetaData reader
284 static std::unique_ptr<FileMetaDataBuilder> Make(
285 const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props,
286 const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR);
287
288 ~FileMetaDataBuilder();
289
290 // The prior RowGroupMetaDataBuilder (if any) is destroyed
291 RowGroupMetaDataBuilder* AppendRowGroup();
292
293 // Complete the Thrift structure
294 std::unique_ptr<FileMetaData> Finish();
295
296 private:
297 explicit FileMetaDataBuilder(
298 const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props,
299 const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR);
300 // PIMPL Idiom
301 class FileMetaDataBuilderImpl;
302 std::unique_ptr<FileMetaDataBuilderImpl> impl_;
303};
304
// Converts a ParquetVersion::type value to a std::string.
// NOTE(review): presumably a human-readable version name; the exact strings
// are produced by the implementation, which is not visible in this header.
PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver);
306
307} // namespace parquet
308
309#endif // PARQUET_FILE_METADATA_H
310