| 1 | // Licensed to the Apache Software Foundation (ASF) under one |
| 2 | // or more contributor license agreements. See the NOTICE file |
| 3 | // distributed with this work for additional information |
| 4 | // regarding copyright ownership. The ASF licenses this file |
| 5 | // to you under the Apache License, Version 2.0 (the |
| 6 | // "License"); you may not use this file except in compliance |
| 7 | // with the License. You may obtain a copy of the License at |
| 8 | // |
| 9 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | // |
| 11 | // Unless required by applicable law or agreed to in writing, |
| 12 | // software distributed under the License is distributed on an |
| 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | // KIND, either express or implied. See the License for the |
| 15 | // specific language governing permissions and limitations |
| 16 | // under the License. |
| 17 | |
| 18 | #ifndef PARQUET_FILE_METADATA_H |
| 19 | #define PARQUET_FILE_METADATA_H |
| 20 | |
| 21 | #include <cstdint> |
| 22 | #include <memory> |
| 23 | #include <string> |
| 24 | #include <vector> |
| 25 | |
| 26 | #include "arrow/util/key_value_metadata.h" |
| 27 | |
| 28 | #include "parquet/platform.h" |
| 29 | #include "parquet/properties.h" |
| 30 | #include "parquet/types.h" |
| 31 | |
| 32 | namespace parquet { |
| 33 | |
| 34 | class ColumnDescriptor; |
| 35 | class EncodedStatistics; |
| 36 | class Statistics; |
| 37 | class SchemaDescriptor; |
| 38 | |
| 39 | namespace schema { |
| 40 | |
| 41 | class ColumnPath; |
| 42 | |
| 43 | } // namespace schema |
| 44 | |
| 45 | using KeyValueMetadata = ::arrow::KeyValueMetadata; |
| 46 | |
| 47 | class PARQUET_EXPORT ApplicationVersion { |
| 48 | public: |
| 49 | // Known Versions with Issues |
| 50 | static const ApplicationVersion& PARQUET_251_FIXED_VERSION(); |
| 51 | static const ApplicationVersion& PARQUET_816_FIXED_VERSION(); |
| 52 | static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION(); |
| 53 | static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION(); |
| 54 | // Regular expression for the version format |
| 55 | // major . minor . patch unknown - prerelease.x + build info |
| 56 | // Eg: 1.5.0ab-cdh5.5.0+cd |
| 57 | static constexpr char const* VERSION_FORMAT = |
| 58 | "^(\\d+)\\.(\\d+)\\.(\\d+)([^-+]*)?(?:-([^+]*))?(?:\\+(.*))?$" ; |
| 59 | // Regular expression for the application format |
| 60 | // application_name version VERSION_FORMAT (build build_name) |
| 61 | // Eg: parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd) |
| 62 | static constexpr char const* APPLICATION_FORMAT = |
| 63 | "(.*?)\\s*(?:(version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?)?)" ; |
| 64 | |
| 65 | // Application that wrote the file. e.g. "IMPALA" |
| 66 | std::string application_; |
| 67 | // Build name |
| 68 | std::string build_; |
| 69 | |
| 70 | // Version of the application that wrote the file, expressed as |
| 71 | // (<major>.<minor>.<patch>). Unmatched parts default to 0. |
| 72 | // "1.2.3" => {1, 2, 3} |
| 73 | // "1.2" => {0, 0, 0} |
| 74 | // "1.2-cdh5" => {0, 0, 0} |
| 75 | // TODO (majetideepak): Implement support for pre_release |
| 76 | struct { |
| 77 | int major; |
| 78 | int minor; |
| 79 | int patch; |
| 80 | std::string unknown; |
| 81 | std::string pre_release; |
| 82 | std::string build_info; |
| 83 | } version; |
| 84 | |
| 85 | ApplicationVersion() {} |
| 86 | explicit ApplicationVersion(const std::string& created_by); |
| 87 | ApplicationVersion(const std::string& application, int major, int minor, int patch); |
| 88 | |
| 89 | // Returns true if version is strictly less than other_version |
| 90 | bool VersionLt(const ApplicationVersion& other_version) const; |
| 91 | |
| 92 | // Returns true if version is strictly less than other_version |
| 93 | bool VersionEq(const ApplicationVersion& other_version) const; |
| 94 | |
| 95 | // Checks if the Version has the correct statistics for a given column |
| 96 | bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics, |
| 97 | SortOrder::type sort_order = SortOrder::SIGNED) const; |
| 98 | }; |
| 99 | |
| 100 | class PARQUET_EXPORT ColumnChunkMetaData { |
| 101 | public: |
| 102 | // API convenience to get a MetaData accessor |
| 103 | static std::unique_ptr<ColumnChunkMetaData> Make( |
| 104 | const void* metadata, const ColumnDescriptor* descr, |
| 105 | const ApplicationVersion* writer_version = NULLPTR); |
| 106 | |
| 107 | ~ColumnChunkMetaData(); |
| 108 | |
| 109 | // column chunk |
| 110 | int64_t file_offset() const; |
| 111 | |
| 112 | // parameter is only used when a dataset is spread across multiple files |
| 113 | const std::string& file_path() const; |
| 114 | |
| 115 | // column metadata |
| 116 | Type::type type() const; |
| 117 | int64_t num_values() const; |
| 118 | std::shared_ptr<schema::ColumnPath> path_in_schema() const; |
| 119 | bool is_stats_set() const; |
| 120 | std::shared_ptr<Statistics> statistics() const; |
| 121 | Compression::type compression() const; |
| 122 | const std::vector<Encoding::type>& encodings() const; |
| 123 | bool has_dictionary_page() const; |
| 124 | int64_t dictionary_page_offset() const; |
| 125 | int64_t data_page_offset() const; |
| 126 | bool has_index_page() const; |
| 127 | int64_t index_page_offset() const; |
| 128 | int64_t total_compressed_size() const; |
| 129 | int64_t total_uncompressed_size() const; |
| 130 | |
| 131 | private: |
| 132 | explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, |
| 133 | const ApplicationVersion* writer_version = NULLPTR); |
| 134 | // PIMPL Idiom |
| 135 | class ColumnChunkMetaDataImpl; |
| 136 | std::unique_ptr<ColumnChunkMetaDataImpl> impl_; |
| 137 | }; |
| 138 | |
| 139 | class PARQUET_EXPORT RowGroupMetaData { |
| 140 | public: |
| 141 | // API convenience to get a MetaData accessor |
| 142 | static std::unique_ptr<RowGroupMetaData> Make( |
| 143 | const void* metadata, const SchemaDescriptor* schema, |
| 144 | const ApplicationVersion* writer_version = NULLPTR); |
| 145 | |
| 146 | ~RowGroupMetaData(); |
| 147 | |
| 148 | // row-group metadata |
| 149 | int num_columns() const; |
| 150 | int64_t num_rows() const; |
| 151 | int64_t total_byte_size() const; |
| 152 | // Return const-pointer to make it clear that this object is not to be copied |
| 153 | const SchemaDescriptor* schema() const; |
| 154 | std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) const; |
| 155 | |
| 156 | private: |
| 157 | explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, |
| 158 | const ApplicationVersion* writer_version = NULLPTR); |
| 159 | // PIMPL Idiom |
| 160 | class RowGroupMetaDataImpl; |
| 161 | std::unique_ptr<RowGroupMetaDataImpl> impl_; |
| 162 | }; |
| 163 | |
| 164 | class FileMetaDataBuilder; |
| 165 | |
| 166 | class PARQUET_EXPORT FileMetaData { |
| 167 | public: |
| 168 | // API convenience to get a MetaData accessor |
| 169 | static std::shared_ptr<FileMetaData> Make(const void* serialized_metadata, |
| 170 | uint32_t* metadata_len); |
| 171 | |
| 172 | ~FileMetaData(); |
| 173 | |
| 174 | // file metadata |
| 175 | uint32_t size() const; |
| 176 | |
| 177 | int num_columns() const; |
| 178 | |
| 179 | int64_t num_rows() const; |
| 180 | |
| 181 | int num_row_groups() const; |
| 182 | ParquetVersion::type version() const; |
| 183 | const std::string& created_by() const; |
| 184 | int num_schema_elements() const; |
| 185 | std::unique_ptr<RowGroupMetaData> RowGroup(int i) const; |
| 186 | |
| 187 | const ApplicationVersion& writer_version() const; |
| 188 | |
| 189 | void WriteTo(::arrow::io::OutputStream* dst) const; |
| 190 | |
| 191 | // Return const-pointer to make it clear that this object is not to be copied |
| 192 | const SchemaDescriptor* schema() const; |
| 193 | |
| 194 | std::shared_ptr<const KeyValueMetadata> key_value_metadata() const; |
| 195 | |
| 196 | // Set file_path ColumnChunk fields to a particular value |
| 197 | void set_file_path(const std::string& path); |
| 198 | |
| 199 | // Merge row-group metadata from "other" FileMetaData object |
| 200 | void AppendRowGroups(const FileMetaData& other); |
| 201 | |
| 202 | private: |
| 203 | friend FileMetaDataBuilder; |
| 204 | explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len); |
| 205 | |
| 206 | // PIMPL Idiom |
| 207 | FileMetaData(); |
| 208 | class FileMetaDataImpl; |
| 209 | std::unique_ptr<FileMetaDataImpl> impl_; |
| 210 | }; |
| 211 | |
| 212 | // Builder API |
| 213 | class PARQUET_EXPORT ColumnChunkMetaDataBuilder { |
| 214 | public: |
| 215 | // API convenience to get a MetaData reader |
| 216 | static std::unique_ptr<ColumnChunkMetaDataBuilder> Make( |
| 217 | const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column); |
| 218 | |
| 219 | static std::unique_ptr<ColumnChunkMetaDataBuilder> Make( |
| 220 | const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column, |
| 221 | void* contents); |
| 222 | |
| 223 | ~ColumnChunkMetaDataBuilder(); |
| 224 | |
| 225 | // column chunk |
| 226 | // Used when a dataset is spread across multiple files |
| 227 | void set_file_path(const std::string& path); |
| 228 | // column metadata |
| 229 | void SetStatistics(const EncodedStatistics& stats); |
| 230 | // get the column descriptor |
| 231 | const ColumnDescriptor* descr() const; |
| 232 | // commit the metadata |
| 233 | void Finish(int64_t num_values, int64_t dictonary_page_offset, |
| 234 | int64_t index_page_offset, int64_t data_page_offset, |
| 235 | int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, |
| 236 | bool dictionary_fallback); |
| 237 | |
| 238 | // The metadata contents, suitable for passing to ColumnChunkMetaData::Make |
| 239 | const void* contents() const; |
| 240 | |
| 241 | // For writing metadata at end of column chunk |
| 242 | void WriteTo(::arrow::io::OutputStream* sink); |
| 243 | |
| 244 | private: |
| 245 | explicit ColumnChunkMetaDataBuilder(const std::shared_ptr<WriterProperties>& props, |
| 246 | const ColumnDescriptor* column); |
| 247 | explicit ColumnChunkMetaDataBuilder(const std::shared_ptr<WriterProperties>& props, |
| 248 | const ColumnDescriptor* column, void* contents); |
| 249 | // PIMPL Idiom |
| 250 | class ColumnChunkMetaDataBuilderImpl; |
| 251 | std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_; |
| 252 | }; |
| 253 | |
| 254 | class PARQUET_EXPORT RowGroupMetaDataBuilder { |
| 255 | public: |
| 256 | // API convenience to get a MetaData reader |
| 257 | static std::unique_ptr<RowGroupMetaDataBuilder> Make( |
| 258 | const std::shared_ptr<WriterProperties>& props, const SchemaDescriptor* schema_, |
| 259 | void* contents); |
| 260 | |
| 261 | ~RowGroupMetaDataBuilder(); |
| 262 | |
| 263 | ColumnChunkMetaDataBuilder* NextColumnChunk(); |
| 264 | int num_columns(); |
| 265 | int64_t num_rows(); |
| 266 | int current_column() const; |
| 267 | |
| 268 | void set_num_rows(int64_t num_rows); |
| 269 | |
| 270 | // commit the metadata |
| 271 | void Finish(int64_t total_bytes_written); |
| 272 | |
| 273 | private: |
| 274 | explicit RowGroupMetaDataBuilder(const std::shared_ptr<WriterProperties>& props, |
| 275 | const SchemaDescriptor* schema_, void* contents); |
| 276 | // PIMPL Idiom |
| 277 | class RowGroupMetaDataBuilderImpl; |
| 278 | std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_; |
| 279 | }; |
| 280 | |
| 281 | class PARQUET_EXPORT FileMetaDataBuilder { |
| 282 | public: |
| 283 | // API convenience to get a MetaData reader |
| 284 | static std::unique_ptr<FileMetaDataBuilder> Make( |
| 285 | const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props, |
| 286 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR); |
| 287 | |
| 288 | ~FileMetaDataBuilder(); |
| 289 | |
| 290 | // The prior RowGroupMetaDataBuilder (if any) is destroyed |
| 291 | RowGroupMetaDataBuilder* AppendRowGroup(); |
| 292 | |
| 293 | // Complete the Thrift structure |
| 294 | std::unique_ptr<FileMetaData> Finish(); |
| 295 | |
| 296 | private: |
| 297 | explicit FileMetaDataBuilder( |
| 298 | const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props, |
| 299 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR); |
| 300 | // PIMPL Idiom |
| 301 | class FileMetaDataBuilderImpl; |
| 302 | std::unique_ptr<FileMetaDataBuilderImpl> impl_; |
| 303 | }; |
| 304 | |
| 305 | PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver); |
| 306 | |
| 307 | } // namespace parquet |
| 308 | |
| 309 | #endif // PARQUET_FILE_METADATA_H |
| 310 | |