1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#ifndef PARQUET_FILE_METADATA_H
19#define PARQUET_FILE_METADATA_H
20
21#include <memory>
22#include <set>
23#include <string>
24#include <vector>
25
26#include "arrow/util/key_value_metadata.h"
27
28#include "parquet/properties.h"
29#include "parquet/schema.h"
30#include "parquet/statistics.h"
31#include "parquet/types.h"
32#include "parquet/util/macros.h"
33#include "parquet/util/memory.h"
34#include "parquet/util/visibility.h"
35
36namespace parquet {
37
38using KeyValueMetadata = ::arrow::KeyValueMetadata;
39
40class PARQUET_EXPORT ApplicationVersion {
41 public:
42 // Known Versions with Issues
43 static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
44 static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
45 static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
46 static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
47 // Regular expression for the version format
48 // major . minor . patch unknown - prerelease.x + build info
49 // Eg: 1.5.0ab-cdh5.5.0+cd
50 static constexpr char const* VERSION_FORMAT =
51 "^(\\d+)\\.(\\d+)\\.(\\d+)([^-+]*)?(?:-([^+]*))?(?:\\+(.*))?$";
52 // Regular expression for the application format
53 // application_name version VERSION_FORMAT (build build_name)
54 // Eg: parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
55 static constexpr char const* APPLICATION_FORMAT =
56 "(.*?)\\s*(?:(version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?)?)";
57
58 // Application that wrote the file. e.g. "IMPALA"
59 std::string application_;
60 // Build name
61 std::string build_;
62
63 // Version of the application that wrote the file, expressed as
64 // (<major>.<minor>.<patch>). Unmatched parts default to 0.
65 // "1.2.3" => {1, 2, 3}
66 // "1.2" => {0, 0, 0}
67 // "1.2-cdh5" => {0, 0, 0}
68 // TODO (majetideepak): Implement support for pre_release
69 struct {
70 int major;
71 int minor;
72 int patch;
73 std::string unknown;
74 std::string pre_release;
75 std::string build_info;
76 } version;
77
78 ApplicationVersion() {}
79 explicit ApplicationVersion(const std::string& created_by);
80 ApplicationVersion(const std::string& application, int major, int minor, int patch);
81
82 // Returns true if version is strictly less than other_version
83 bool VersionLt(const ApplicationVersion& other_version) const;
84
85 // Returns true if version is strictly less than other_version
86 bool VersionEq(const ApplicationVersion& other_version) const;
87
88 // Checks if the Version has the correct statistics for a given column
89 bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics,
90 SortOrder::type sort_order = SortOrder::SIGNED) const;
91};
92
93class PARQUET_EXPORT ColumnChunkMetaData {
94 public:
95 // API convenience to get a MetaData accessor
96 static std::unique_ptr<ColumnChunkMetaData> Make(
97 const void* metadata, const ColumnDescriptor* descr,
98 const ApplicationVersion* writer_version = NULLPTR);
99
100 ~ColumnChunkMetaData();
101
102 // column chunk
103 int64_t file_offset() const;
104 // parameter is only used when a dataset is spread across multiple files
105 const std::string& file_path() const;
106 // column metadata
107 Type::type type() const;
108 int64_t num_values() const;
109 std::shared_ptr<schema::ColumnPath> path_in_schema() const;
110 bool is_stats_set() const;
111 std::shared_ptr<RowGroupStatistics> statistics() const;
112 Compression::type compression() const;
113 const std::vector<Encoding::type>& encodings() const;
114 bool has_dictionary_page() const;
115 int64_t dictionary_page_offset() const;
116 int64_t data_page_offset() const;
117 bool has_index_page() const;
118 int64_t index_page_offset() const;
119 int64_t total_compressed_size() const;
120 int64_t total_uncompressed_size() const;
121
122 private:
123 explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr,
124 const ApplicationVersion* writer_version = NULLPTR);
125 // PIMPL Idiom
126 class ColumnChunkMetaDataImpl;
127 std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
128};
129
130class PARQUET_EXPORT RowGroupMetaData {
131 public:
132 // API convenience to get a MetaData accessor
133 static std::unique_ptr<RowGroupMetaData> Make(
134 const void* metadata, const SchemaDescriptor* schema,
135 const ApplicationVersion* writer_version = NULLPTR);
136
137 ~RowGroupMetaData();
138
139 // row-group metadata
140 int num_columns() const;
141 int64_t num_rows() const;
142 int64_t total_byte_size() const;
143 // Return const-pointer to make it clear that this object is not to be copied
144 const SchemaDescriptor* schema() const;
145 std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) const;
146
147 private:
148 explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema,
149 const ApplicationVersion* writer_version = NULLPTR);
150 // PIMPL Idiom
151 class RowGroupMetaDataImpl;
152 std::unique_ptr<RowGroupMetaDataImpl> impl_;
153};
154
155class FileMetaDataBuilder;
156
157class PARQUET_EXPORT FileMetaData {
158 public:
159 // API convenience to get a MetaData accessor
160 static std::shared_ptr<FileMetaData> Make(const void* serialized_metadata,
161 uint32_t* metadata_len);
162
163 ~FileMetaData();
164
165 // file metadata
166 uint32_t size() const;
167 int num_columns() const;
168 int64_t num_rows() const;
169 int num_row_groups() const;
170 ParquetVersion::type version() const;
171 const std::string& created_by() const;
172 int num_schema_elements() const;
173 std::unique_ptr<RowGroupMetaData> RowGroup(int i) const;
174
175 const ApplicationVersion& writer_version() const;
176
177 void WriteTo(OutputStream* dst) const;
178
179 // Return const-pointer to make it clear that this object is not to be copied
180 const SchemaDescriptor* schema() const;
181
182 std::shared_ptr<const KeyValueMetadata> key_value_metadata() const;
183
184 private:
185 friend FileMetaDataBuilder;
186 explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len);
187
188 // PIMPL Idiom
189 FileMetaData();
190 class FileMetaDataImpl;
191 std::unique_ptr<FileMetaDataImpl> impl_;
192};
193
194// Builder API
195class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
196 public:
197 // API convenience to get a MetaData reader
198 static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
199 const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column);
200
201 static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
202 const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column,
203 void* contents);
204
205 ~ColumnChunkMetaDataBuilder();
206
207 // column chunk
208 // Used when a dataset is spread across multiple files
209 void set_file_path(const std::string& path);
210 // column metadata
211 void SetStatistics(bool is_signed, const EncodedStatistics& stats);
212 // get the column descriptor
213 const ColumnDescriptor* descr() const;
214 // commit the metadata
215 void Finish(int64_t num_values, int64_t dictonary_page_offset,
216 int64_t index_page_offset, int64_t data_page_offset,
217 int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
218 bool dictionary_fallback);
219
220 // The metadata contents, suitable for passing to ColumnChunkMetaData::Make
221 const void* contents() const;
222
223 // For writing metadata at end of column chunk
224 void WriteTo(OutputStream* sink);
225
226 private:
227 explicit ColumnChunkMetaDataBuilder(const std::shared_ptr<WriterProperties>& props,
228 const ColumnDescriptor* column);
229 explicit ColumnChunkMetaDataBuilder(const std::shared_ptr<WriterProperties>& props,
230 const ColumnDescriptor* column, void* contents);
231 // PIMPL Idiom
232 class ColumnChunkMetaDataBuilderImpl;
233 std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_;
234};
235
236class PARQUET_EXPORT RowGroupMetaDataBuilder {
237 public:
238 // API convenience to get a MetaData reader
239 static std::unique_ptr<RowGroupMetaDataBuilder> Make(
240 const std::shared_ptr<WriterProperties>& props, const SchemaDescriptor* schema_,
241 void* contents);
242
243 ~RowGroupMetaDataBuilder();
244
245 ColumnChunkMetaDataBuilder* NextColumnChunk();
246 int num_columns();
247 int64_t num_rows();
248 int current_column() const;
249
250 void set_num_rows(int64_t num_rows);
251
252 // commit the metadata
253 void Finish(int64_t total_bytes_written);
254
255 private:
256 explicit RowGroupMetaDataBuilder(const std::shared_ptr<WriterProperties>& props,
257 const SchemaDescriptor* schema_, void* contents);
258 // PIMPL Idiom
259 class RowGroupMetaDataBuilderImpl;
260 std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_;
261};
262
263class PARQUET_EXPORT FileMetaDataBuilder {
264 public:
265 // API convenience to get a MetaData reader
266 static std::unique_ptr<FileMetaDataBuilder> Make(
267 const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props,
268 const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR);
269
270 ~FileMetaDataBuilder();
271
272 // The prior RowGroupMetaDataBuilder (if any) is destroyed
273 RowGroupMetaDataBuilder* AppendRowGroup();
274
275 // Complete the Thrift structure
276 std::unique_ptr<FileMetaData> Finish();
277
278 private:
279 explicit FileMetaDataBuilder(
280 const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props,
281 const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR);
282 // PIMPL Idiom
283 class FileMetaDataBuilderImpl;
284 std::unique_ptr<FileMetaDataBuilderImpl> impl_;
285};
286
287PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver);
288
289} // namespace parquet
290
291#endif // PARQUET_FILE_METADATA_H
292