// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
17
18#ifndef PARQUET_TYPES_H
19#define PARQUET_TYPES_H
20
#include <algorithm>
#include <cinttypes>
#include <cstdint>
#include <cstring>
#include <memory>
#include <sstream>
#include <string>

#include "parquet/platform.h"
29
// Forward declaration of ::arrow::util::Codec. This header only names the
// class; it does not define it.
namespace arrow {
namespace util {

class Codec;

}  // namespace util
}  // namespace arrow
37
38namespace parquet {
39
// ----------------------------------------------------------------------
// Metadata enums to match Thrift metadata
//
// The reason we maintain our own enums is to avoid transitive dependency on
// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
// public API. After building parquet-cpp, you should not need to include
// Thrift headers in your application. This means some boilerplate to convert
// between our types and Parquet's Thrift types.
//
// We can also add special values like NONE to distinguish between metadata
// values being set and not set. As an example, consider ConvertedType and
// CompressionCodec.
52
// Physical value types. Mirrors parquet::Type.
//
// The numeric values are spelled out explicitly so that they stay fixed
// regardless of declaration order.
struct Type {
  enum type {
    BOOLEAN = 0,
    INT32 = 1,
    INT64 = 2,
    INT96 = 3,
    FLOAT = 4,
    DOUBLE = 5,
    BYTE_ARRAY = 6,
    FIXED_LEN_BYTE_ARRAY = 7,
    // Sentinel; must remain the final enumerator.
    UNDEFINED = 8
  };
};
68
// Legacy converted (annotation) types. Mirrors parquet::ConvertedType.
//
// Every enumerator carries its numeric value explicitly; note the gap between
// INTERVAL (22) and NA (25).
struct ConvertedType {
  enum type {
    NONE = 0,
    UTF8 = 1,
    MAP = 2,
    MAP_KEY_VALUE = 3,
    LIST = 4,
    ENUM = 5,
    DECIMAL = 6,
    DATE = 7,
    TIME_MILLIS = 8,
    TIME_MICROS = 9,
    TIMESTAMP_MILLIS = 10,
    TIMESTAMP_MICROS = 11,
    UINT_8 = 12,
    UINT_16 = 13,
    UINT_32 = 14,
    UINT_64 = 15,
    INT_8 = 16,
    INT_16 = 17,
    INT_32 = 18,
    INT_64 = 19,
    JSON = 20,
    BSON = 21,
    INTERVAL = 22,
    NA = 25,
    // Sentinel; must remain the final enumerator.
    UNDEFINED = 26
  };
};
100
// Forward declaration of the Thrift-side LogicalType so that it can be named
// in the signatures below without being defined in this header.
namespace format {

class LogicalType;

}  // namespace format
107
// Field repetition levels. Mirrors parquet::FieldRepetitionType.
struct Repetition {
  enum type {
    REQUIRED = 0,
    OPTIONAL = 1,
    REPEATED = 2,
    // Sentinel; must remain the final enumerator.
    UNDEFINED = 3
  };
};
112
// Sort order for page and column statistics.
//
// Types are associated with sort orders (e.g., UTF8 columns should use
// UNSIGNED) and column stats are aggregated using a sort order. As of
// parquet-format version 2.3.1, the order used to aggregate stats is always
// SIGNED and is not stored in the Parquet file. These stats are discarded for
// types that need unsigned. See PARQUET-686.
//
// Reference:
// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
// format/converter/ParquetMetadataConverter.java
struct SortOrder {
  enum type {
    SIGNED = 0,
    UNSIGNED = 1,
    UNKNOWN = 2
  };
};
125
namespace schema {

// Decimal parameters that accompany a converted type.
//
// Kept as a plain aggregate (no constructors, no default member initializers)
// because this header brace-initializes it, e.g. `{false, -1, -1}`.
struct DecimalMetadata {
  // Whether scale and precision carry meaningful values.
  bool isset;
  int32_t scale;
  int32_t precision;
};

}  // namespace schema
135
136/// \brief Implementation of parquet.thrift LogicalType types.
137class PARQUET_EXPORT LogicalType {
138 public:
139 struct Type {
140 enum type {
141 UNKNOWN = 0,
142 STRING = 1,
143 MAP,
144 LIST,
145 ENUM,
146 DECIMAL,
147 DATE,
148 TIME,
149 TIMESTAMP,
150 INTERVAL,
151 INT,
152 NIL, // Thrift NullType
153 JSON,
154 BSON,
155 UUID,
156 NONE
157 };
158 };
159
160 struct TimeUnit {
161 enum unit { UNKNOWN = 0, MILLIS = 1, MICROS, NANOS };
162 };
163
164 /// \brief If possible, return a logical type equivalent to the given legacy
165 /// converted type (and decimal metadata if applicable).
166 static std::shared_ptr<const LogicalType> FromConvertedType(
167 const parquet::ConvertedType::type converted_type,
168 const parquet::schema::DecimalMetadata converted_decimal_metadata = {false, -1,
169 -1});
170
171 /// \brief Return the logical type represented by the Thrift intermediary object.
172 static std::shared_ptr<const LogicalType> FromThrift(
173 const parquet::format::LogicalType& thrift_logical_type);
174
175 /// \brief Return the explicitly requested logical type.
176 static std::shared_ptr<const LogicalType> String();
177 static std::shared_ptr<const LogicalType> Map();
178 static std::shared_ptr<const LogicalType> List();
179 static std::shared_ptr<const LogicalType> Enum();
180 static std::shared_ptr<const LogicalType> Decimal(int32_t precision, int32_t scale = 0);
181 static std::shared_ptr<const LogicalType> Date();
182 static std::shared_ptr<const LogicalType> Time(bool is_adjusted_to_utc,
183 LogicalType::TimeUnit::unit time_unit);
184
185 /// \brief Create a Timestamp logical type
186 /// \param[in] is_adjusted_to_utc set true if the data is UTC-normalized
187 /// \param[in] time_unit the resolution of the timestamp
188 /// \param[in] is_from_converted_type if true, the timestamp was generated
189 /// by translating a legacy converted type of TIMESTAMP_MILLIS or
190 /// TIMESTAMP_MICROS. Default is false.
191 /// \param[in] force_set_converted_type if true, always set the
192 /// legacy ConvertedType TIMESTAMP_MICROS and TIMESTAMP_MILLIS
193 /// metadata. Default is false
194 static std::shared_ptr<const LogicalType> Timestamp(
195 bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
196 bool is_from_converted_type = false, bool force_set_converted_type = false);
197
198 static std::shared_ptr<const LogicalType> Interval();
199 static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed);
200 static std::shared_ptr<const LogicalType> Null();
201 static std::shared_ptr<const LogicalType> JSON();
202 static std::shared_ptr<const LogicalType> BSON();
203 static std::shared_ptr<const LogicalType> UUID();
204 static std::shared_ptr<const LogicalType> None();
205 static std::shared_ptr<const LogicalType> Unknown();
206
207 /// \brief Return true if this logical type is consistent with the given underlying
208 /// physical type.
209 bool is_applicable(parquet::Type::type primitive_type,
210 int32_t primitive_length = -1) const;
211
212 /// \brief Return true if this logical type is equivalent to the given legacy converted
213 /// type (and decimal metadata if applicable).
214 bool is_compatible(parquet::ConvertedType::type converted_type,
215 parquet::schema::DecimalMetadata converted_decimal_metadata = {
216 false, -1, -1}) const;
217
218 /// \brief If possible, return the legacy converted type (and decimal metadata if
219 /// applicable) equivalent to this logical type.
220 parquet::ConvertedType::type ToConvertedType(
221 parquet::schema::DecimalMetadata* out_decimal_metadata) const;
222
223 /// \brief Return a printable representation of this logical type.
224 std::string ToString() const;
225
226 /// \brief Return a JSON representation of this logical type.
227 std::string ToJSON() const;
228
229 /// \brief Return a serializable Thrift object for this logical type.
230 parquet::format::LogicalType ToThrift() const;
231
232 /// \brief Return true if the given logical type is equivalent to this logical type.
233 bool Equals(const LogicalType& other) const;
234
235 /// \brief Return the enumerated type of this logical type.
236 LogicalType::Type::type type() const;
237
238 /// \brief Return the appropriate sort order for this logical type.
239 SortOrder::type sort_order() const;
240
241 // Type checks ...
242 bool is_string() const;
243 bool is_map() const;
244 bool is_list() const;
245 bool is_enum() const;
246 bool is_decimal() const;
247 bool is_date() const;
248 bool is_time() const;
249 bool is_timestamp() const;
250 bool is_interval() const;
251 bool is_int() const;
252 bool is_null() const;
253 bool is_JSON() const;
254 bool is_BSON() const;
255 bool is_UUID() const;
256 bool is_none() const;
257 /// \brief Return true if this logical type is of a known type.
258 bool is_valid() const;
259 bool is_invalid() const;
260 /// \brief Return true if this logical type is suitable for a schema GroupNode.
261 bool is_nested() const;
262 bool is_nonnested() const;
263 /// \brief Return true if this logical type is included in the Thrift output for its
264 /// node.
265 bool is_serialized() const;
266
267 LogicalType(const LogicalType&) = delete;
268 LogicalType& operator=(const LogicalType&) = delete;
269 virtual ~LogicalType() noexcept;
270
271 protected:
272 LogicalType();
273
274 class Impl;
275 std::unique_ptr<const Impl> impl_;
276};
277
278/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
279class PARQUET_EXPORT StringLogicalType : public LogicalType {
280 public:
281 static std::shared_ptr<const LogicalType> Make();
282
283 private:
284 StringLogicalType() = default;
285};
286
287/// \brief Allowed for group nodes only.
288class PARQUET_EXPORT MapLogicalType : public LogicalType {
289 public:
290 static std::shared_ptr<const LogicalType> Make();
291
292 private:
293 MapLogicalType() = default;
294};
295
296/// \brief Allowed for group nodes only.
297class PARQUET_EXPORT ListLogicalType : public LogicalType {
298 public:
299 static std::shared_ptr<const LogicalType> Make();
300
301 private:
302 ListLogicalType() = default;
303};
304
305/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
306class PARQUET_EXPORT EnumLogicalType : public LogicalType {
307 public:
308 static std::shared_ptr<const LogicalType> Make();
309
310 private:
311 EnumLogicalType() = default;
312};
313
314/// \brief Allowed for physical type INT32, INT64, FIXED_LEN_BYTE_ARRAY, or BYTE_ARRAY,
315/// depending on the precision.
316class PARQUET_EXPORT DecimalLogicalType : public LogicalType {
317 public:
318 static std::shared_ptr<const LogicalType> Make(int32_t precision, int32_t scale = 0);
319 int32_t precision() const;
320 int32_t scale() const;
321
322 private:
323 DecimalLogicalType() = default;
324};
325
326/// \brief Allowed for physical type INT32.
327class PARQUET_EXPORT DateLogicalType : public LogicalType {
328 public:
329 static std::shared_ptr<const LogicalType> Make();
330
331 private:
332 DateLogicalType() = default;
333};
334
335/// \brief Allowed for physical type INT32 (for MILLIS) or INT64 (for MICROS and NANOS).
336class PARQUET_EXPORT TimeLogicalType : public LogicalType {
337 public:
338 static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
339 LogicalType::TimeUnit::unit time_unit);
340 bool is_adjusted_to_utc() const;
341 LogicalType::TimeUnit::unit time_unit() const;
342
343 private:
344 TimeLogicalType() = default;
345};
346
347/// \brief Allowed for physical type INT64.
348class PARQUET_EXPORT TimestampLogicalType : public LogicalType {
349 public:
350 static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
351 LogicalType::TimeUnit::unit time_unit,
352 bool is_from_converted_type = false,
353 bool force_set_converted_type = false);
354 bool is_adjusted_to_utc() const;
355 LogicalType::TimeUnit::unit time_unit() const;
356
357 /// \brief If true, will not set LogicalType in Thrift metadata
358 bool is_from_converted_type() const;
359
360 /// \brief If true, will set ConvertedType for micros and millis
361 /// resolution in legacy ConvertedType Thrift metadata
362 bool force_set_converted_type() const;
363
364 private:
365 TimestampLogicalType() = default;
366};
367
368/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 12
369class PARQUET_EXPORT IntervalLogicalType : public LogicalType {
370 public:
371 static std::shared_ptr<const LogicalType> Make();
372
373 private:
374 IntervalLogicalType() = default;
375};
376
377/// \brief Allowed for physical type INT32 (for bit widths 8, 16, and 32) and INT64
378/// (for bit width 64).
379class PARQUET_EXPORT IntLogicalType : public LogicalType {
380 public:
381 static std::shared_ptr<const LogicalType> Make(int bit_width, bool is_signed);
382 int bit_width() const;
383 bool is_signed() const;
384
385 private:
386 IntLogicalType() = default;
387};
388
389/// \brief Allowed for any physical type.
390class PARQUET_EXPORT NullLogicalType : public LogicalType {
391 public:
392 static std::shared_ptr<const LogicalType> Make();
393
394 private:
395 NullLogicalType() = default;
396};
397
398/// \brief Allowed for physical type BYTE_ARRAY.
399class PARQUET_EXPORT JSONLogicalType : public LogicalType {
400 public:
401 static std::shared_ptr<const LogicalType> Make();
402
403 private:
404 JSONLogicalType() = default;
405};
406
407/// \brief Allowed for physical type BYTE_ARRAY.
408class PARQUET_EXPORT BSONLogicalType : public LogicalType {
409 public:
410 static std::shared_ptr<const LogicalType> Make();
411
412 private:
413 BSONLogicalType() = default;
414};
415
416/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 16,
417/// must encode raw UUID bytes.
418class PARQUET_EXPORT UUIDLogicalType : public LogicalType {
419 public:
420 static std::shared_ptr<const LogicalType> Make();
421
422 private:
423 UUIDLogicalType() = default;
424};
425
426/// \brief Allowed for any physical type.
427class PARQUET_EXPORT NoLogicalType : public LogicalType {
428 public:
429 static std::shared_ptr<const LogicalType> Make();
430
431 private:
432 NoLogicalType() = default;
433};
434
435/// \brief Allowed for any type.
436class PARQUET_EXPORT UnknownLogicalType : public LogicalType {
437 public:
438 static std::shared_ptr<const LogicalType> Make();
439
440 private:
441 UnknownLogicalType() = default;
442};
443
// Data encodings. Mirrors parquet::Encoding.
//
// Note that the value 1 is not assigned to any enumerator.
struct Encoding {
  enum type {
    PLAIN = 0,
    PLAIN_DICTIONARY = 2,
    RLE = 3,
    BIT_PACKED = 4,
    DELTA_BINARY_PACKED = 5,
    DELTA_LENGTH_BYTE_ARRAY = 6,
    DELTA_BYTE_ARRAY = 7,
    RLE_DICTIONARY = 8,
    // Sentinel, deliberately far from the assigned values above.
    UNKNOWN = 999
  };
};
458
459/// \brief Return true if Parquet supports indicated compression type
460PARQUET_EXPORT
461bool IsCodecSupported(Compression::type codec);
462
463PARQUET_EXPORT
464std::unique_ptr<Codec> GetCodec(Compression::type codec);
465
466PARQUET_EXPORT
467std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level);
468
// Encryption algorithm identifiers.
struct Encryption {
  enum type {
    AES_GCM_V1 = 0,
    AES_GCM_CTR_V1 = 1
  };
};
472
// Page kinds. Mirrors parquet::PageType.
struct PageType {
  enum type {
    DATA_PAGE = 0,
    INDEX_PAGE = 1,
    DICTIONARY_PAGE = 2,
    DATA_PAGE_V2 = 3
  };
};
477
/// \brief Small value type wrapping a column ordering kind.
class ColumnOrder {
 public:
  enum type { UNDEFINED, TYPE_DEFINED_ORDER };

  /// \brief Wrap the given ordering kind.
  explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}

  /// \brief Default to Type Defined Order
  ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}

  /// \brief Return the wrapped ordering kind.
  ///
  /// Declared const so that it can be called on const objects and through
  /// const references; it only reads the stored value.
  ColumnOrder::type get_order() const { return column_order_; }

  // Shared instances; they are only declared here and must be defined in a
  // source file. NOTE(review): presumably they hold UNDEFINED and
  // TYPE_DEFINED_ORDER respectively -- confirm against the definitions.
  static ColumnOrder undefined_;
  static ColumnOrder type_defined_;

 private:
  ColumnOrder::type column_order_;
};
492
493// ----------------------------------------------------------------------
494
495struct ByteArray {
496 ByteArray() : len(0), ptr(NULLPTR) {}
497 ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {}
498
499 ByteArray(::arrow::util::string_view view) // NOLINT implicit conversion
500 : ByteArray(static_cast<uint32_t>(view.size()),
501 reinterpret_cast<const uint8_t*>(view.data())) {}
502 uint32_t len;
503 const uint8_t* ptr;
504};
505
506inline bool operator==(const ByteArray& left, const ByteArray& right) {
507 return left.len == right.len &&
508 (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0);
509}
510
511inline bool operator!=(const ByteArray& left, const ByteArray& right) {
512 return !(left == right);
513}
514
515struct FixedLenByteArray {
516 FixedLenByteArray() : ptr(NULLPTR) {}
517 explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {}
518 const uint8_t* ptr;
519};
520
521using FLBA = FixedLenByteArray;
522
// Julian day at unix epoch.
//
// The Julian Day Number (JDN) is the integer assigned to a whole solar day in
// the Julian day count starting from noon Universal time, with Julian day
// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC,
// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian
// calendar).
constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588);

// Durations of one day in successively finer units.
//
// INT64_C() is only specified for an unsuffixed integer constant, so it is
// applied to a single literal and the rest of the product is then evaluated
// in 64-bit arithmetic.
constexpr int64_t kSecondsPerDay = INT64_C(60) * 60 * 24;
constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000);
constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000);
constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000);
535
536MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; };
537STRUCT_END(Int96, 12);
538
539inline bool operator==(const Int96& left, const Int96& right) {
540 return std::equal(left.value, left.value + 3, right.value);
541}
542
543inline bool operator!=(const Int96& left, const Int96& right) { return !(left == right); }
544
545static inline std::string ByteArrayToString(const ByteArray& a) {
546 return std::string(reinterpret_cast<const char*>(a.ptr), a.len);
547}
548
549static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) {
550 std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds));
551}
552
553static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) {
554 int64_t days_since_epoch = i96.value[2] - kJulianToUnixEpochDays;
555 int64_t nanoseconds = 0;
556
557 memcpy(&nanoseconds, &i96.value, sizeof(int64_t));
558 return days_since_epoch * kNanosecondsPerDay + nanoseconds;
559}
560
561static inline std::string Int96ToString(const Int96& a) {
562 std::ostringstream result;
563 std::copy(a.value, a.value + 3, std::ostream_iterator<uint32_t>(result, " "));
564 return result.str();
565}
566
567static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) {
568 std::ostringstream result;
569 std::copy(a.ptr, a.ptr + len, std::ostream_iterator<uint32_t>(result, " "));
570 return result.str();
571}
572
573template <Type::type TYPE>
574struct type_traits {};
575
576template <>
577struct type_traits<Type::BOOLEAN> {
578 using value_type = bool;
579
580 static constexpr int value_byte_size = 1;
581 static constexpr const char* printf_code = "d";
582};
583
584template <>
585struct type_traits<Type::INT32> {
586 using value_type = int32_t;
587
588 static constexpr int value_byte_size = 4;
589 static constexpr const char* printf_code = "d";
590};
591
592template <>
593struct type_traits<Type::INT64> {
594 using value_type = int64_t;
595
596 static constexpr int value_byte_size = 8;
597 static constexpr const char* printf_code = "ld";
598};
599
600template <>
601struct type_traits<Type::INT96> {
602 using value_type = Int96;
603
604 static constexpr int value_byte_size = 12;
605 static constexpr const char* printf_code = "s";
606};
607
608template <>
609struct type_traits<Type::FLOAT> {
610 using value_type = float;
611
612 static constexpr int value_byte_size = 4;
613 static constexpr const char* printf_code = "f";
614};
615
616template <>
617struct type_traits<Type::DOUBLE> {
618 using value_type = double;
619
620 static constexpr int value_byte_size = 8;
621 static constexpr const char* printf_code = "lf";
622};
623
624template <>
625struct type_traits<Type::BYTE_ARRAY> {
626 using value_type = ByteArray;
627
628 static constexpr int value_byte_size = sizeof(ByteArray);
629 static constexpr const char* printf_code = "s";
630};
631
632template <>
633struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> {
634 using value_type = FixedLenByteArray;
635
636 static constexpr int value_byte_size = sizeof(FixedLenByteArray);
637 static constexpr const char* printf_code = "s";
638};
639
640template <Type::type TYPE>
641struct PhysicalType {
642 using c_type = typename type_traits<TYPE>::value_type;
643 static constexpr Type::type type_num = TYPE;
644};
645
646using BooleanType = PhysicalType<Type::BOOLEAN>;
647using Int32Type = PhysicalType<Type::INT32>;
648using Int64Type = PhysicalType<Type::INT64>;
649using Int96Type = PhysicalType<Type::INT96>;
650using FloatType = PhysicalType<Type::FLOAT>;
651using DoubleType = PhysicalType<Type::DOUBLE>;
652using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
653using FLBAType = PhysicalType<Type::FIXED_LEN_BYTE_ARRAY>;
654
655template <typename Type>
656inline std::string format_fwf(int width) {
657 std::stringstream ss;
658 ss << "%-" << width << type_traits<Type::type_num>::printf_code;
659 return ss.str();
660}
661
662PARQUET_EXPORT std::string EncodingToString(Encoding::type t);
663
664PARQUET_EXPORT std::string ConvertedTypeToString(ConvertedType::type t);
665
666PARQUET_EXPORT std::string TypeToString(Type::type t);
667
668PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
669 const std::string& val);
670
671/// \deprecated Since 1.5.0
672ARROW_DEPRECATED("Use std::string instead of char* as input")
673PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const char* val);
674
675PARQUET_EXPORT int GetTypeByteSize(Type::type t);
676
677PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive);
678
679PARQUET_EXPORT SortOrder::type GetSortOrder(ConvertedType::type converted,
680 Type::type primitive);
681
682PARQUET_EXPORT SortOrder::type GetSortOrder(
683 const std::shared_ptr<const LogicalType>& logical_type, Type::type primitive);
684
685namespace internal {
686
687PARQUET_EXPORT
688int32_t DecimalSize(int32_t precision);
689
690} // namespace internal
691} // namespace parquet
692
693#endif // PARQUET_TYPES_H
694