| 1 | // Licensed to the Apache Software Foundation (ASF) under one |
| 2 | // or more contributor license agreements. See the NOTICE file |
| 3 | // distributed with this work for additional information |
| 4 | // regarding copyright ownership. The ASF licenses this file |
| 5 | // to you under the Apache License, Version 2.0 (the |
| 6 | // "License"); you may not use this file except in compliance |
| 7 | // with the License. You may obtain a copy of the License at |
| 8 | // |
| 9 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | // |
| 11 | // Unless required by applicable law or agreed to in writing, |
| 12 | // software distributed under the License is distributed on an |
| 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | // KIND, either express or implied. See the License for the |
| 15 | // specific language governing permissions and limitations |
| 16 | // under the License. |
| 17 | |
| 18 | #ifndef PARQUET_TYPES_H |
| 19 | #define PARQUET_TYPES_H |
| 20 | |
#include <algorithm>
#include <cinttypes>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <memory>
#include <sstream>
#include <string>
| 27 | |
| 28 | #include "parquet/platform.h" |
| 29 | |
// Forward declaration only: lets this header refer to ::arrow::util::Codec
// by name without pulling in its definition.
namespace arrow {
namespace util {
class Codec;
}  // namespace util
}  // namespace arrow
| 37 | |
| 38 | namespace parquet { |
| 39 | |
| 40 | // ---------------------------------------------------------------------- |
| 41 | // Metadata enums to match Thrift metadata |
| 42 | // |
| 43 | // The reason we maintain our own enums is to avoid transitive dependency on |
| 44 | // the compiled Thrift headers (and thus thrift/Thrift.h) for users of the |
| 45 | // public API. After building parquet-cpp, you should not need to include |
| 46 | // Thrift headers in your application. This means some boilerplate to convert |
| 47 | // between our types and Parquet's Thrift types. |
| 48 | // |
| 49 | // We can also add special values like NONE to distinguish between metadata |
| 50 | // values being set and not set. As an example consider ConvertedType and |
| 51 | // CompressionCodec |
| 52 | |
// Physical types. Mirrors parquet::Type; every enumerator value is pinned
// explicitly.
struct Type {
  enum type {
    BOOLEAN = 0,
    INT32 = 1,
    INT64 = 2,
    INT96 = 3,
    FLOAT = 4,
    DOUBLE = 5,
    BYTE_ARRAY = 6,
    FIXED_LEN_BYTE_ARRAY = 7,
    // Sentinel: must remain the final enumerator.
    UNDEFINED = 8
  };
};
| 68 | |
// Legacy converted types. Mirrors parquet::ConvertedType.
//
// Enumerator values are spelled out explicitly; note the gap between
// INTERVAL (22) and NA (25).
struct ConvertedType {
  enum type {
    NONE = 0,
    UTF8 = 1,
    MAP = 2,
    MAP_KEY_VALUE = 3,
    LIST = 4,
    ENUM = 5,
    DECIMAL = 6,
    DATE = 7,
    TIME_MILLIS = 8,
    TIME_MICROS = 9,
    TIMESTAMP_MILLIS = 10,
    TIMESTAMP_MICROS = 11,
    UINT_8 = 12,
    UINT_16 = 13,
    UINT_32 = 14,
    UINT_64 = 15,
    INT_8 = 16,
    INT_16 = 17,
    INT_32 = 18,
    INT_64 = 19,
    JSON = 20,
    BSON = 21,
    INTERVAL = 22,
    NA = 25,
    // Sentinel: must remain the final enumerator.
    UNDEFINED = 26
  };
};
| 100 | |
// Forward declaration of format::LogicalType so that it can be named in the
// declarations below without including its definition.
namespace format {
class LogicalType;
}  // namespace format
| 107 | |
// Field repetition levels. Mirrors parquet::FieldRepetitionType.
struct Repetition {
  enum type {
    REQUIRED = 0,
    OPTIONAL = 1,
    REPEATED = 2,
    // Sentinel: must remain the final enumerator.
    UNDEFINED = 3
  };
};
| 112 | |
// Sort order used for page and column statistics.
//
// Reference:
//   parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
//   format/converter/ParquetMetadataConverter.java
//
// Each type is associated with a sort order (for example, UTF8 columns should
// use UNSIGNED), and column statistics are aggregated using a sort order. As
// of parquet-format version 2.3.1 the order used to aggregate statistics is
// always SIGNED and is not stored in the Parquet file, so statistics are
// discarded for types that need an unsigned order. See PARQUET-686.
struct SortOrder {
  enum type {
    SIGNED = 0,
    UNSIGNED = 1,
    UNKNOWN = 2
  };
};
| 125 | |
namespace schema {

// Plain aggregate carrying decimal parameters. Member order matters: the
// struct is brace-initialized positionally elsewhere in this header as
// {isset, scale, precision}.
struct DecimalMetadata {
  bool isset;         // whether scale/precision carry meaningful values
  int32_t scale;      // decimal scale
  int32_t precision;  // decimal precision
};

}  // namespace schema
| 135 | |
| 136 | /// \brief Implementation of parquet.thrift LogicalType types. |
| 137 | class PARQUET_EXPORT LogicalType { |
| 138 | public: |
| 139 | struct Type { |
| 140 | enum type { |
| 141 | UNKNOWN = 0, |
| 142 | STRING = 1, |
| 143 | MAP, |
| 144 | LIST, |
| 145 | ENUM, |
| 146 | DECIMAL, |
| 147 | DATE, |
| 148 | TIME, |
| 149 | TIMESTAMP, |
| 150 | INTERVAL, |
| 151 | INT, |
| 152 | NIL, // Thrift NullType |
| 153 | JSON, |
| 154 | BSON, |
| 155 | UUID, |
| 156 | NONE |
| 157 | }; |
| 158 | }; |
| 159 | |
| 160 | struct TimeUnit { |
| 161 | enum unit { UNKNOWN = 0, MILLIS = 1, MICROS, NANOS }; |
| 162 | }; |
| 163 | |
| 164 | /// \brief If possible, return a logical type equivalent to the given legacy |
| 165 | /// converted type (and decimal metadata if applicable). |
| 166 | static std::shared_ptr<const LogicalType> FromConvertedType( |
| 167 | const parquet::ConvertedType::type converted_type, |
| 168 | const parquet::schema::DecimalMetadata converted_decimal_metadata = {false, -1, |
| 169 | -1}); |
| 170 | |
| 171 | /// \brief Return the logical type represented by the Thrift intermediary object. |
| 172 | static std::shared_ptr<const LogicalType> FromThrift( |
| 173 | const parquet::format::LogicalType& thrift_logical_type); |
| 174 | |
| 175 | /// \brief Return the explicitly requested logical type. |
| 176 | static std::shared_ptr<const LogicalType> String(); |
| 177 | static std::shared_ptr<const LogicalType> Map(); |
| 178 | static std::shared_ptr<const LogicalType> List(); |
| 179 | static std::shared_ptr<const LogicalType> Enum(); |
| 180 | static std::shared_ptr<const LogicalType> Decimal(int32_t precision, int32_t scale = 0); |
| 181 | static std::shared_ptr<const LogicalType> Date(); |
| 182 | static std::shared_ptr<const LogicalType> Time(bool is_adjusted_to_utc, |
| 183 | LogicalType::TimeUnit::unit time_unit); |
| 184 | |
| 185 | /// \brief Create a Timestamp logical type |
| 186 | /// \param[in] is_adjusted_to_utc set true if the data is UTC-normalized |
| 187 | /// \param[in] time_unit the resolution of the timestamp |
| 188 | /// \param[in] is_from_converted_type if true, the timestamp was generated |
| 189 | /// by translating a legacy converted type of TIMESTAMP_MILLIS or |
| 190 | /// TIMESTAMP_MICROS. Default is false. |
| 191 | /// \param[in] force_set_converted_type if true, always set the |
| 192 | /// legacy ConvertedType TIMESTAMP_MICROS and TIMESTAMP_MILLIS |
| 193 | /// metadata. Default is false |
| 194 | static std::shared_ptr<const LogicalType> Timestamp( |
| 195 | bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit, |
| 196 | bool is_from_converted_type = false, bool force_set_converted_type = false); |
| 197 | |
| 198 | static std::shared_ptr<const LogicalType> Interval(); |
| 199 | static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed); |
| 200 | static std::shared_ptr<const LogicalType> Null(); |
| 201 | static std::shared_ptr<const LogicalType> JSON(); |
| 202 | static std::shared_ptr<const LogicalType> BSON(); |
| 203 | static std::shared_ptr<const LogicalType> UUID(); |
| 204 | static std::shared_ptr<const LogicalType> None(); |
| 205 | static std::shared_ptr<const LogicalType> Unknown(); |
| 206 | |
| 207 | /// \brief Return true if this logical type is consistent with the given underlying |
| 208 | /// physical type. |
| 209 | bool is_applicable(parquet::Type::type primitive_type, |
| 210 | int32_t primitive_length = -1) const; |
| 211 | |
| 212 | /// \brief Return true if this logical type is equivalent to the given legacy converted |
| 213 | /// type (and decimal metadata if applicable). |
| 214 | bool is_compatible(parquet::ConvertedType::type converted_type, |
| 215 | parquet::schema::DecimalMetadata converted_decimal_metadata = { |
| 216 | false, -1, -1}) const; |
| 217 | |
| 218 | /// \brief If possible, return the legacy converted type (and decimal metadata if |
| 219 | /// applicable) equivalent to this logical type. |
| 220 | parquet::ConvertedType::type ToConvertedType( |
| 221 | parquet::schema::DecimalMetadata* out_decimal_metadata) const; |
| 222 | |
| 223 | /// \brief Return a printable representation of this logical type. |
| 224 | std::string ToString() const; |
| 225 | |
| 226 | /// \brief Return a JSON representation of this logical type. |
| 227 | std::string ToJSON() const; |
| 228 | |
| 229 | /// \brief Return a serializable Thrift object for this logical type. |
| 230 | parquet::format::LogicalType ToThrift() const; |
| 231 | |
| 232 | /// \brief Return true if the given logical type is equivalent to this logical type. |
| 233 | bool Equals(const LogicalType& other) const; |
| 234 | |
| 235 | /// \brief Return the enumerated type of this logical type. |
| 236 | LogicalType::Type::type type() const; |
| 237 | |
| 238 | /// \brief Return the appropriate sort order for this logical type. |
| 239 | SortOrder::type sort_order() const; |
| 240 | |
| 241 | // Type checks ... |
| 242 | bool is_string() const; |
| 243 | bool is_map() const; |
| 244 | bool is_list() const; |
| 245 | bool is_enum() const; |
| 246 | bool is_decimal() const; |
| 247 | bool is_date() const; |
| 248 | bool is_time() const; |
| 249 | bool is_timestamp() const; |
| 250 | bool is_interval() const; |
| 251 | bool is_int() const; |
| 252 | bool is_null() const; |
| 253 | bool is_JSON() const; |
| 254 | bool is_BSON() const; |
| 255 | bool is_UUID() const; |
| 256 | bool is_none() const; |
| 257 | /// \brief Return true if this logical type is of a known type. |
| 258 | bool is_valid() const; |
| 259 | bool is_invalid() const; |
| 260 | /// \brief Return true if this logical type is suitable for a schema GroupNode. |
| 261 | bool is_nested() const; |
| 262 | bool is_nonnested() const; |
| 263 | /// \brief Return true if this logical type is included in the Thrift output for its |
| 264 | /// node. |
| 265 | bool is_serialized() const; |
| 266 | |
| 267 | LogicalType(const LogicalType&) = delete; |
| 268 | LogicalType& operator=(const LogicalType&) = delete; |
| 269 | virtual ~LogicalType() noexcept; |
| 270 | |
| 271 | protected: |
| 272 | LogicalType(); |
| 273 | |
| 274 | class Impl; |
| 275 | std::unique_ptr<const Impl> impl_; |
| 276 | }; |
| 277 | |
| 278 | /// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8. |
| 279 | class PARQUET_EXPORT StringLogicalType : public LogicalType { |
| 280 | public: |
| 281 | static std::shared_ptr<const LogicalType> Make(); |
| 282 | |
| 283 | private: |
| 284 | StringLogicalType() = default; |
| 285 | }; |
| 286 | |
| 287 | /// \brief Allowed for group nodes only. |
| 288 | class PARQUET_EXPORT MapLogicalType : public LogicalType { |
| 289 | public: |
| 290 | static std::shared_ptr<const LogicalType> Make(); |
| 291 | |
| 292 | private: |
| 293 | MapLogicalType() = default; |
| 294 | }; |
| 295 | |
| 296 | /// \brief Allowed for group nodes only. |
| 297 | class PARQUET_EXPORT ListLogicalType : public LogicalType { |
| 298 | public: |
| 299 | static std::shared_ptr<const LogicalType> Make(); |
| 300 | |
| 301 | private: |
| 302 | ListLogicalType() = default; |
| 303 | }; |
| 304 | |
| 305 | /// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8. |
| 306 | class PARQUET_EXPORT EnumLogicalType : public LogicalType { |
| 307 | public: |
| 308 | static std::shared_ptr<const LogicalType> Make(); |
| 309 | |
| 310 | private: |
| 311 | EnumLogicalType() = default; |
| 312 | }; |
| 313 | |
| 314 | /// \brief Allowed for physical type INT32, INT64, FIXED_LEN_BYTE_ARRAY, or BYTE_ARRAY, |
| 315 | /// depending on the precision. |
| 316 | class PARQUET_EXPORT DecimalLogicalType : public LogicalType { |
| 317 | public: |
| 318 | static std::shared_ptr<const LogicalType> Make(int32_t precision, int32_t scale = 0); |
| 319 | int32_t precision() const; |
| 320 | int32_t scale() const; |
| 321 | |
| 322 | private: |
| 323 | DecimalLogicalType() = default; |
| 324 | }; |
| 325 | |
| 326 | /// \brief Allowed for physical type INT32. |
| 327 | class PARQUET_EXPORT DateLogicalType : public LogicalType { |
| 328 | public: |
| 329 | static std::shared_ptr<const LogicalType> Make(); |
| 330 | |
| 331 | private: |
| 332 | DateLogicalType() = default; |
| 333 | }; |
| 334 | |
| 335 | /// \brief Allowed for physical type INT32 (for MILLIS) or INT64 (for MICROS and NANOS). |
| 336 | class PARQUET_EXPORT TimeLogicalType : public LogicalType { |
| 337 | public: |
| 338 | static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc, |
| 339 | LogicalType::TimeUnit::unit time_unit); |
| 340 | bool is_adjusted_to_utc() const; |
| 341 | LogicalType::TimeUnit::unit time_unit() const; |
| 342 | |
| 343 | private: |
| 344 | TimeLogicalType() = default; |
| 345 | }; |
| 346 | |
| 347 | /// \brief Allowed for physical type INT64. |
| 348 | class PARQUET_EXPORT TimestampLogicalType : public LogicalType { |
| 349 | public: |
| 350 | static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc, |
| 351 | LogicalType::TimeUnit::unit time_unit, |
| 352 | bool is_from_converted_type = false, |
| 353 | bool force_set_converted_type = false); |
| 354 | bool is_adjusted_to_utc() const; |
| 355 | LogicalType::TimeUnit::unit time_unit() const; |
| 356 | |
| 357 | /// \brief If true, will not set LogicalType in Thrift metadata |
| 358 | bool is_from_converted_type() const; |
| 359 | |
| 360 | /// \brief If true, will set ConvertedType for micros and millis |
| 361 | /// resolution in legacy ConvertedType Thrift metadata |
| 362 | bool force_set_converted_type() const; |
| 363 | |
| 364 | private: |
| 365 | TimestampLogicalType() = default; |
| 366 | }; |
| 367 | |
| 368 | /// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 12 |
| 369 | class PARQUET_EXPORT IntervalLogicalType : public LogicalType { |
| 370 | public: |
| 371 | static std::shared_ptr<const LogicalType> Make(); |
| 372 | |
| 373 | private: |
| 374 | IntervalLogicalType() = default; |
| 375 | }; |
| 376 | |
| 377 | /// \brief Allowed for physical type INT32 (for bit widths 8, 16, and 32) and INT64 |
| 378 | /// (for bit width 64). |
| 379 | class PARQUET_EXPORT IntLogicalType : public LogicalType { |
| 380 | public: |
| 381 | static std::shared_ptr<const LogicalType> Make(int bit_width, bool is_signed); |
| 382 | int bit_width() const; |
| 383 | bool is_signed() const; |
| 384 | |
| 385 | private: |
| 386 | IntLogicalType() = default; |
| 387 | }; |
| 388 | |
| 389 | /// \brief Allowed for any physical type. |
| 390 | class PARQUET_EXPORT NullLogicalType : public LogicalType { |
| 391 | public: |
| 392 | static std::shared_ptr<const LogicalType> Make(); |
| 393 | |
| 394 | private: |
| 395 | NullLogicalType() = default; |
| 396 | }; |
| 397 | |
| 398 | /// \brief Allowed for physical type BYTE_ARRAY. |
| 399 | class PARQUET_EXPORT JSONLogicalType : public LogicalType { |
| 400 | public: |
| 401 | static std::shared_ptr<const LogicalType> Make(); |
| 402 | |
| 403 | private: |
| 404 | JSONLogicalType() = default; |
| 405 | }; |
| 406 | |
| 407 | /// \brief Allowed for physical type BYTE_ARRAY. |
| 408 | class PARQUET_EXPORT BSONLogicalType : public LogicalType { |
| 409 | public: |
| 410 | static std::shared_ptr<const LogicalType> Make(); |
| 411 | |
| 412 | private: |
| 413 | BSONLogicalType() = default; |
| 414 | }; |
| 415 | |
| 416 | /// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 16, |
| 417 | /// must encode raw UUID bytes. |
| 418 | class PARQUET_EXPORT UUIDLogicalType : public LogicalType { |
| 419 | public: |
| 420 | static std::shared_ptr<const LogicalType> Make(); |
| 421 | |
| 422 | private: |
| 423 | UUIDLogicalType() = default; |
| 424 | }; |
| 425 | |
| 426 | /// \brief Allowed for any physical type. |
| 427 | class PARQUET_EXPORT NoLogicalType : public LogicalType { |
| 428 | public: |
| 429 | static std::shared_ptr<const LogicalType> Make(); |
| 430 | |
| 431 | private: |
| 432 | NoLogicalType() = default; |
| 433 | }; |
| 434 | |
| 435 | /// \brief Allowed for any type. |
| 436 | class PARQUET_EXPORT UnknownLogicalType : public LogicalType { |
| 437 | public: |
| 438 | static std::shared_ptr<const LogicalType> Make(); |
| 439 | |
| 440 | private: |
| 441 | UnknownLogicalType() = default; |
| 442 | }; |
| 443 | |
// Data encodings. Mirrors parquet::Encoding.
//
// Values are pinned explicitly; 1 is intentionally absent from this list and
// UNKNOWN uses the out-of-band value 999.
struct Encoding {
  enum type {
    PLAIN = 0,
    PLAIN_DICTIONARY = 2,
    RLE = 3,
    BIT_PACKED = 4,
    DELTA_BINARY_PACKED = 5,
    DELTA_LENGTH_BYTE_ARRAY = 6,
    DELTA_BYTE_ARRAY = 7,
    RLE_DICTIONARY = 8,
    UNKNOWN = 999
  };
};
| 458 | |
| 459 | /// \brief Return true if Parquet supports indicated compression type |
| 460 | PARQUET_EXPORT |
| 461 | bool IsCodecSupported(Compression::type codec); |
| 462 | |
| 463 | PARQUET_EXPORT |
| 464 | std::unique_ptr<Codec> GetCodec(Compression::type codec); |
| 465 | |
| 466 | PARQUET_EXPORT |
| 467 | std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level); |
| 468 | |
// Encryption algorithm identifiers.
struct Encryption {
  enum type {
    AES_GCM_V1 = 0,
    AES_GCM_CTR_V1 = 1
  };
};
| 472 | |
// Page kinds. Mirrors parquet::PageType.
struct PageType {
  enum type {
    DATA_PAGE = 0,
    INDEX_PAGE = 1,
    DICTIONARY_PAGE = 2,
    DATA_PAGE_V2 = 3
  };
};
| 477 | |
/// \brief Small value type wrapping a column-order enumerator.
class ColumnOrder {
 public:
  enum type { UNDEFINED, TYPE_DEFINED_ORDER };

  /// \brief Wrap the given column order.
  explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}

  /// \brief Default to Type Defined Order.
  ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}

  /// \brief Return the wrapped column order.
  ///
  /// Const-qualified so that it can be called on const objects and through
  /// const references; it only reads the stored value.
  ColumnOrder::type get_order() const { return column_order_; }

  // Shared instances; only declared here, defined outside this header.
  // NOTE(review): presumably they hold UNDEFINED and TYPE_DEFINED_ORDER
  // respectively — confirm against the definitions.
  static ColumnOrder undefined_;
  static ColumnOrder type_defined_;

 private:
  ColumnOrder::type column_order_;
};
| 492 | |
| 493 | // ---------------------------------------------------------------------- |
| 494 | |
| 495 | struct ByteArray { |
| 496 | ByteArray() : len(0), ptr(NULLPTR) {} |
| 497 | ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} |
| 498 | |
| 499 | ByteArray(::arrow::util::string_view view) // NOLINT implicit conversion |
| 500 | : ByteArray(static_cast<uint32_t>(view.size()), |
| 501 | reinterpret_cast<const uint8_t*>(view.data())) {} |
| 502 | uint32_t len; |
| 503 | const uint8_t* ptr; |
| 504 | }; |
| 505 | |
| 506 | inline bool operator==(const ByteArray& left, const ByteArray& right) { |
| 507 | return left.len == right.len && |
| 508 | (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0); |
| 509 | } |
| 510 | |
| 511 | inline bool operator!=(const ByteArray& left, const ByteArray& right) { |
| 512 | return !(left == right); |
| 513 | } |
| 514 | |
| 515 | struct FixedLenByteArray { |
| 516 | FixedLenByteArray() : ptr(NULLPTR) {} |
| 517 | explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {} |
| 518 | const uint8_t* ptr; |
| 519 | }; |
| 520 | |
| 521 | using FLBA = FixedLenByteArray; |
| 522 | |
// Julian day at unix epoch.
//
// The Julian Day Number (JDN) is the integer assigned to a whole solar day in
// the Julian day count starting from noon Universal time, with Julian day
// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC,
// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian
// calendar).
constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588);

// Length of one day at successively finer resolutions; each constant is
// derived from the previous one.
constexpr int64_t kSecondsPerDay = INT64_C(24) * INT64_C(60) * INT64_C(60);
constexpr int64_t kMillisecondsPerDay = INT64_C(1000) * kSecondsPerDay;
constexpr int64_t kMicrosecondsPerDay = INT64_C(1000) * kMillisecondsPerDay;
constexpr int64_t kNanosecondsPerDay = INT64_C(1000) * kMicrosecondsPerDay;
| 535 | |
| 536 | MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; }; |
| 537 | STRUCT_END(Int96, 12); |
| 538 | |
| 539 | inline bool operator==(const Int96& left, const Int96& right) { |
| 540 | return std::equal(left.value, left.value + 3, right.value); |
| 541 | } |
| 542 | |
| 543 | inline bool operator!=(const Int96& left, const Int96& right) { return !(left == right); } |
| 544 | |
| 545 | static inline std::string ByteArrayToString(const ByteArray& a) { |
| 546 | return std::string(reinterpret_cast<const char*>(a.ptr), a.len); |
| 547 | } |
| 548 | |
| 549 | static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) { |
| 550 | std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds)); |
| 551 | } |
| 552 | |
| 553 | static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) { |
| 554 | int64_t days_since_epoch = i96.value[2] - kJulianToUnixEpochDays; |
| 555 | int64_t nanoseconds = 0; |
| 556 | |
| 557 | memcpy(&nanoseconds, &i96.value, sizeof(int64_t)); |
| 558 | return days_since_epoch * kNanosecondsPerDay + nanoseconds; |
| 559 | } |
| 560 | |
| 561 | static inline std::string Int96ToString(const Int96& a) { |
| 562 | std::ostringstream result; |
| 563 | std::copy(a.value, a.value + 3, std::ostream_iterator<uint32_t>(result, " " )); |
| 564 | return result.str(); |
| 565 | } |
| 566 | |
| 567 | static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) { |
| 568 | std::ostringstream result; |
| 569 | std::copy(a.ptr, a.ptr + len, std::ostream_iterator<uint32_t>(result, " " )); |
| 570 | return result.str(); |
| 571 | } |
| 572 | |
| 573 | template <Type::type TYPE> |
| 574 | struct type_traits {}; |
| 575 | |
| 576 | template <> |
| 577 | struct type_traits<Type::BOOLEAN> { |
| 578 | using value_type = bool; |
| 579 | |
| 580 | static constexpr int value_byte_size = 1; |
| 581 | static constexpr const char* printf_code = "d" ; |
| 582 | }; |
| 583 | |
| 584 | template <> |
| 585 | struct type_traits<Type::INT32> { |
| 586 | using value_type = int32_t; |
| 587 | |
| 588 | static constexpr int value_byte_size = 4; |
| 589 | static constexpr const char* printf_code = "d" ; |
| 590 | }; |
| 591 | |
| 592 | template <> |
| 593 | struct type_traits<Type::INT64> { |
| 594 | using value_type = int64_t; |
| 595 | |
| 596 | static constexpr int value_byte_size = 8; |
| 597 | static constexpr const char* printf_code = "ld" ; |
| 598 | }; |
| 599 | |
| 600 | template <> |
| 601 | struct type_traits<Type::INT96> { |
| 602 | using value_type = Int96; |
| 603 | |
| 604 | static constexpr int value_byte_size = 12; |
| 605 | static constexpr const char* printf_code = "s" ; |
| 606 | }; |
| 607 | |
| 608 | template <> |
| 609 | struct type_traits<Type::FLOAT> { |
| 610 | using value_type = float; |
| 611 | |
| 612 | static constexpr int value_byte_size = 4; |
| 613 | static constexpr const char* printf_code = "f" ; |
| 614 | }; |
| 615 | |
| 616 | template <> |
| 617 | struct type_traits<Type::DOUBLE> { |
| 618 | using value_type = double; |
| 619 | |
| 620 | static constexpr int value_byte_size = 8; |
| 621 | static constexpr const char* printf_code = "lf" ; |
| 622 | }; |
| 623 | |
| 624 | template <> |
| 625 | struct type_traits<Type::BYTE_ARRAY> { |
| 626 | using value_type = ByteArray; |
| 627 | |
| 628 | static constexpr int value_byte_size = sizeof(ByteArray); |
| 629 | static constexpr const char* printf_code = "s" ; |
| 630 | }; |
| 631 | |
| 632 | template <> |
| 633 | struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> { |
| 634 | using value_type = FixedLenByteArray; |
| 635 | |
| 636 | static constexpr int value_byte_size = sizeof(FixedLenByteArray); |
| 637 | static constexpr const char* printf_code = "s" ; |
| 638 | }; |
| 639 | |
| 640 | template <Type::type TYPE> |
| 641 | struct PhysicalType { |
| 642 | using c_type = typename type_traits<TYPE>::value_type; |
| 643 | static constexpr Type::type type_num = TYPE; |
| 644 | }; |
| 645 | |
| 646 | using BooleanType = PhysicalType<Type::BOOLEAN>; |
| 647 | using Int32Type = PhysicalType<Type::INT32>; |
| 648 | using Int64Type = PhysicalType<Type::INT64>; |
| 649 | using Int96Type = PhysicalType<Type::INT96>; |
| 650 | using FloatType = PhysicalType<Type::FLOAT>; |
| 651 | using DoubleType = PhysicalType<Type::DOUBLE>; |
| 652 | using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>; |
| 653 | using FLBAType = PhysicalType<Type::FIXED_LEN_BYTE_ARRAY>; |
| 654 | |
| 655 | template <typename Type> |
| 656 | inline std::string format_fwf(int width) { |
| 657 | std::stringstream ss; |
| 658 | ss << "%-" << width << type_traits<Type::type_num>::printf_code; |
| 659 | return ss.str(); |
| 660 | } |
| 661 | |
| 662 | PARQUET_EXPORT std::string EncodingToString(Encoding::type t); |
| 663 | |
| 664 | PARQUET_EXPORT std::string ConvertedTypeToString(ConvertedType::type t); |
| 665 | |
| 666 | PARQUET_EXPORT std::string TypeToString(Type::type t); |
| 667 | |
| 668 | PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, |
| 669 | const std::string& val); |
| 670 | |
| 671 | /// \deprecated Since 1.5.0 |
| 672 | ARROW_DEPRECATED("Use std::string instead of char* as input" ) |
| 673 | PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const char* val); |
| 674 | |
| 675 | PARQUET_EXPORT int GetTypeByteSize(Type::type t); |
| 676 | |
| 677 | PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive); |
| 678 | |
| 679 | PARQUET_EXPORT SortOrder::type GetSortOrder(ConvertedType::type converted, |
| 680 | Type::type primitive); |
| 681 | |
| 682 | PARQUET_EXPORT SortOrder::type GetSortOrder( |
| 683 | const std::shared_ptr<const LogicalType>& logical_type, Type::type primitive); |
| 684 | |
| 685 | namespace internal { |
| 686 | |
| 687 | PARQUET_EXPORT |
| 688 | int32_t DecimalSize(int32_t precision); |
| 689 | |
| 690 | } // namespace internal |
| 691 | } // namespace parquet |
| 692 | |
| 693 | #endif // PARQUET_TYPES_H |
| 694 | |