1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef PARQUET_FILE_METADATA_H |
19 | #define PARQUET_FILE_METADATA_H |
20 | |
21 | #include <cstdint> |
22 | #include <memory> |
23 | #include <string> |
24 | #include <vector> |
25 | |
26 | #include "arrow/util/key_value_metadata.h" |
27 | |
28 | #include "parquet/platform.h" |
29 | #include "parquet/properties.h" |
30 | #include "parquet/types.h" |
31 | |
32 | namespace parquet { |
33 | |
34 | class ColumnDescriptor; |
35 | class EncodedStatistics; |
36 | class Statistics; |
37 | class SchemaDescriptor; |
38 | |
39 | namespace schema { |
40 | |
41 | class ColumnPath; |
42 | |
43 | } // namespace schema |
44 | |
45 | using KeyValueMetadata = ::arrow::KeyValueMetadata; |
46 | |
47 | class PARQUET_EXPORT ApplicationVersion { |
48 | public: |
49 | // Known Versions with Issues |
50 | static const ApplicationVersion& PARQUET_251_FIXED_VERSION(); |
51 | static const ApplicationVersion& PARQUET_816_FIXED_VERSION(); |
52 | static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION(); |
53 | static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION(); |
54 | // Regular expression for the version format |
55 | // major . minor . patch unknown - prerelease.x + build info |
56 | // Eg: 1.5.0ab-cdh5.5.0+cd |
57 | static constexpr char const* VERSION_FORMAT = |
58 | "^(\\d+)\\.(\\d+)\\.(\\d+)([^-+]*)?(?:-([^+]*))?(?:\\+(.*))?$" ; |
59 | // Regular expression for the application format |
60 | // application_name version VERSION_FORMAT (build build_name) |
61 | // Eg: parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd) |
62 | static constexpr char const* APPLICATION_FORMAT = |
63 | "(.*?)\\s*(?:(version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?)?)" ; |
64 | |
65 | // Application that wrote the file. e.g. "IMPALA" |
66 | std::string application_; |
67 | // Build name |
68 | std::string build_; |
69 | |
70 | // Version of the application that wrote the file, expressed as |
71 | // (<major>.<minor>.<patch>). Unmatched parts default to 0. |
72 | // "1.2.3" => {1, 2, 3} |
73 | // "1.2" => {0, 0, 0} |
74 | // "1.2-cdh5" => {0, 0, 0} |
75 | // TODO (majetideepak): Implement support for pre_release |
76 | struct { |
77 | int major; |
78 | int minor; |
79 | int patch; |
80 | std::string unknown; |
81 | std::string pre_release; |
82 | std::string build_info; |
83 | } version; |
84 | |
85 | ApplicationVersion() {} |
86 | explicit ApplicationVersion(const std::string& created_by); |
87 | ApplicationVersion(const std::string& application, int major, int minor, int patch); |
88 | |
89 | // Returns true if version is strictly less than other_version |
90 | bool VersionLt(const ApplicationVersion& other_version) const; |
91 | |
92 | // Returns true if version is strictly less than other_version |
93 | bool VersionEq(const ApplicationVersion& other_version) const; |
94 | |
95 | // Checks if the Version has the correct statistics for a given column |
96 | bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics, |
97 | SortOrder::type sort_order = SortOrder::SIGNED) const; |
98 | }; |
99 | |
100 | class PARQUET_EXPORT ColumnChunkMetaData { |
101 | public: |
102 | // API convenience to get a MetaData accessor |
103 | static std::unique_ptr<ColumnChunkMetaData> Make( |
104 | const void* metadata, const ColumnDescriptor* descr, |
105 | const ApplicationVersion* writer_version = NULLPTR); |
106 | |
107 | ~ColumnChunkMetaData(); |
108 | |
109 | // column chunk |
110 | int64_t file_offset() const; |
111 | |
112 | // parameter is only used when a dataset is spread across multiple files |
113 | const std::string& file_path() const; |
114 | |
115 | // column metadata |
116 | Type::type type() const; |
117 | int64_t num_values() const; |
118 | std::shared_ptr<schema::ColumnPath> path_in_schema() const; |
119 | bool is_stats_set() const; |
120 | std::shared_ptr<Statistics> statistics() const; |
121 | Compression::type compression() const; |
122 | const std::vector<Encoding::type>& encodings() const; |
123 | bool has_dictionary_page() const; |
124 | int64_t dictionary_page_offset() const; |
125 | int64_t data_page_offset() const; |
126 | bool has_index_page() const; |
127 | int64_t index_page_offset() const; |
128 | int64_t total_compressed_size() const; |
129 | int64_t total_uncompressed_size() const; |
130 | |
131 | private: |
132 | explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, |
133 | const ApplicationVersion* writer_version = NULLPTR); |
134 | // PIMPL Idiom |
135 | class ColumnChunkMetaDataImpl; |
136 | std::unique_ptr<ColumnChunkMetaDataImpl> impl_; |
137 | }; |
138 | |
139 | class PARQUET_EXPORT RowGroupMetaData { |
140 | public: |
141 | // API convenience to get a MetaData accessor |
142 | static std::unique_ptr<RowGroupMetaData> Make( |
143 | const void* metadata, const SchemaDescriptor* schema, |
144 | const ApplicationVersion* writer_version = NULLPTR); |
145 | |
146 | ~RowGroupMetaData(); |
147 | |
148 | // row-group metadata |
149 | int num_columns() const; |
150 | int64_t num_rows() const; |
151 | int64_t total_byte_size() const; |
152 | // Return const-pointer to make it clear that this object is not to be copied |
153 | const SchemaDescriptor* schema() const; |
154 | std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) const; |
155 | |
156 | private: |
157 | explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, |
158 | const ApplicationVersion* writer_version = NULLPTR); |
159 | // PIMPL Idiom |
160 | class RowGroupMetaDataImpl; |
161 | std::unique_ptr<RowGroupMetaDataImpl> impl_; |
162 | }; |
163 | |
164 | class FileMetaDataBuilder; |
165 | |
166 | class PARQUET_EXPORT FileMetaData { |
167 | public: |
168 | // API convenience to get a MetaData accessor |
169 | static std::shared_ptr<FileMetaData> Make(const void* serialized_metadata, |
170 | uint32_t* metadata_len); |
171 | |
172 | ~FileMetaData(); |
173 | |
174 | // file metadata |
175 | uint32_t size() const; |
176 | |
177 | int num_columns() const; |
178 | |
179 | int64_t num_rows() const; |
180 | |
181 | int num_row_groups() const; |
182 | ParquetVersion::type version() const; |
183 | const std::string& created_by() const; |
184 | int num_schema_elements() const; |
185 | std::unique_ptr<RowGroupMetaData> RowGroup(int i) const; |
186 | |
187 | const ApplicationVersion& writer_version() const; |
188 | |
189 | void WriteTo(::arrow::io::OutputStream* dst) const; |
190 | |
191 | // Return const-pointer to make it clear that this object is not to be copied |
192 | const SchemaDescriptor* schema() const; |
193 | |
194 | std::shared_ptr<const KeyValueMetadata> key_value_metadata() const; |
195 | |
196 | // Set file_path ColumnChunk fields to a particular value |
197 | void set_file_path(const std::string& path); |
198 | |
199 | // Merge row-group metadata from "other" FileMetaData object |
200 | void AppendRowGroups(const FileMetaData& other); |
201 | |
202 | private: |
203 | friend FileMetaDataBuilder; |
204 | explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len); |
205 | |
206 | // PIMPL Idiom |
207 | FileMetaData(); |
208 | class FileMetaDataImpl; |
209 | std::unique_ptr<FileMetaDataImpl> impl_; |
210 | }; |
211 | |
212 | // Builder API |
213 | class PARQUET_EXPORT ColumnChunkMetaDataBuilder { |
214 | public: |
215 | // API convenience to get a MetaData reader |
216 | static std::unique_ptr<ColumnChunkMetaDataBuilder> Make( |
217 | const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column); |
218 | |
219 | static std::unique_ptr<ColumnChunkMetaDataBuilder> Make( |
220 | const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column, |
221 | void* contents); |
222 | |
223 | ~ColumnChunkMetaDataBuilder(); |
224 | |
225 | // column chunk |
226 | // Used when a dataset is spread across multiple files |
227 | void set_file_path(const std::string& path); |
228 | // column metadata |
229 | void SetStatistics(const EncodedStatistics& stats); |
230 | // get the column descriptor |
231 | const ColumnDescriptor* descr() const; |
232 | // commit the metadata |
233 | void Finish(int64_t num_values, int64_t dictonary_page_offset, |
234 | int64_t index_page_offset, int64_t data_page_offset, |
235 | int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, |
236 | bool dictionary_fallback); |
237 | |
238 | // The metadata contents, suitable for passing to ColumnChunkMetaData::Make |
239 | const void* contents() const; |
240 | |
241 | // For writing metadata at end of column chunk |
242 | void WriteTo(::arrow::io::OutputStream* sink); |
243 | |
244 | private: |
245 | explicit ColumnChunkMetaDataBuilder(const std::shared_ptr<WriterProperties>& props, |
246 | const ColumnDescriptor* column); |
247 | explicit ColumnChunkMetaDataBuilder(const std::shared_ptr<WriterProperties>& props, |
248 | const ColumnDescriptor* column, void* contents); |
249 | // PIMPL Idiom |
250 | class ColumnChunkMetaDataBuilderImpl; |
251 | std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_; |
252 | }; |
253 | |
254 | class PARQUET_EXPORT RowGroupMetaDataBuilder { |
255 | public: |
256 | // API convenience to get a MetaData reader |
257 | static std::unique_ptr<RowGroupMetaDataBuilder> Make( |
258 | const std::shared_ptr<WriterProperties>& props, const SchemaDescriptor* schema_, |
259 | void* contents); |
260 | |
261 | ~RowGroupMetaDataBuilder(); |
262 | |
263 | ColumnChunkMetaDataBuilder* NextColumnChunk(); |
264 | int num_columns(); |
265 | int64_t num_rows(); |
266 | int current_column() const; |
267 | |
268 | void set_num_rows(int64_t num_rows); |
269 | |
270 | // commit the metadata |
271 | void Finish(int64_t total_bytes_written); |
272 | |
273 | private: |
274 | explicit RowGroupMetaDataBuilder(const std::shared_ptr<WriterProperties>& props, |
275 | const SchemaDescriptor* schema_, void* contents); |
276 | // PIMPL Idiom |
277 | class RowGroupMetaDataBuilderImpl; |
278 | std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_; |
279 | }; |
280 | |
281 | class PARQUET_EXPORT FileMetaDataBuilder { |
282 | public: |
283 | // API convenience to get a MetaData reader |
284 | static std::unique_ptr<FileMetaDataBuilder> Make( |
285 | const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props, |
286 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR); |
287 | |
288 | ~FileMetaDataBuilder(); |
289 | |
290 | // The prior RowGroupMetaDataBuilder (if any) is destroyed |
291 | RowGroupMetaDataBuilder* AppendRowGroup(); |
292 | |
293 | // Complete the Thrift structure |
294 | std::unique_ptr<FileMetaData> Finish(); |
295 | |
296 | private: |
297 | explicit FileMetaDataBuilder( |
298 | const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props, |
299 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR); |
300 | // PIMPL Idiom |
301 | class FileMetaDataBuilderImpl; |
302 | std::unique_ptr<FileMetaDataBuilderImpl> impl_; |
303 | }; |
304 | |
305 | PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver); |
306 | |
307 | } // namespace parquet |
308 | |
309 | #endif // PARQUET_FILE_METADATA_H |
310 | |