1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <sstream>
#include <string>
23
24#include "arrow/util/checked_cast.h"
25#include "arrow/util/compression.h"
26#include "arrow/util/logging.h"
27
28#include "parquet/exception.h"
29#include "parquet/parquet_types.h"
30#include "parquet/types.h"
31
32using ::arrow::internal::checked_cast;
33using arrow::util::Codec;
34
35namespace parquet {
36
37bool IsCodecSupported(Compression::type codec) {
38 switch (codec) {
39 case Compression::UNCOMPRESSED:
40 case Compression::SNAPPY:
41 case Compression::GZIP:
42 case Compression::BROTLI:
43 case Compression::ZSTD:
44 case Compression::LZ4:
45 return true;
46 default:
47 return false;
48 }
49}
50
51std::unique_ptr<Codec> GetCodec(Compression::type codec) {
52 return GetCodec(codec, Codec::UseDefaultCompressionLevel());
53}
54
55std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level) {
56 std::unique_ptr<Codec> result;
57 if (!IsCodecSupported(codec)) {
58 std::stringstream ss;
59 ss << "Codec type " << Codec::GetCodecAsString(codec)
60 << " not supported in Parquet format";
61 throw ParquetException(ss.str());
62 }
63
64 PARQUET_THROW_NOT_OK(Codec::Create(codec, compression_level, &result));
65 return result;
66}
67
68std::string FormatStatValue(Type::type parquet_type, const std::string& val) {
69 std::stringstream result;
70 switch (parquet_type) {
71 case Type::BOOLEAN:
72 result << reinterpret_cast<const bool*>(val.c_str())[0];
73 break;
74 case Type::INT32:
75 result << reinterpret_cast<const int32_t*>(val.c_str())[0];
76 break;
77 case Type::INT64:
78 result << reinterpret_cast<const int64_t*>(val.c_str())[0];
79 break;
80 case Type::DOUBLE:
81 result << reinterpret_cast<const double*>(val.c_str())[0];
82 break;
83 case Type::FLOAT:
84 result << reinterpret_cast<const float*>(val.c_str())[0];
85 break;
86 case Type::INT96: {
87 auto const i32_val = reinterpret_cast<const int32_t*>(val.c_str());
88 result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2];
89 break;
90 }
91 case Type::BYTE_ARRAY: {
92 return val;
93 }
94 case Type::FIXED_LEN_BYTE_ARRAY: {
95 return val;
96 }
97 case Type::UNDEFINED:
98 default:
99 break;
100 }
101 return result.str();
102}
103
104std::string FormatStatValue(Type::type parquet_type, const char* val) {
105 std::stringstream result;
106 switch (parquet_type) {
107 case Type::BOOLEAN:
108 result << reinterpret_cast<const bool*>(val)[0];
109 break;
110 case Type::INT32:
111 result << reinterpret_cast<const int32_t*>(val)[0];
112 break;
113 case Type::INT64:
114 result << reinterpret_cast<const int64_t*>(val)[0];
115 break;
116 case Type::DOUBLE:
117 result << reinterpret_cast<const double*>(val)[0];
118 break;
119 case Type::FLOAT:
120 result << reinterpret_cast<const float*>(val)[0];
121 break;
122 case Type::INT96: {
123 auto const i32_val = reinterpret_cast<const int32_t*>(val);
124 result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2];
125 break;
126 }
127 case Type::BYTE_ARRAY: {
128 result << val;
129 break;
130 }
131 case Type::FIXED_LEN_BYTE_ARRAY: {
132 result << val;
133 break;
134 }
135 case Type::UNDEFINED:
136 default:
137 break;
138 }
139 return result.str();
140}
141
142std::string EncodingToString(Encoding::type t) {
143 switch (t) {
144 case Encoding::PLAIN:
145 return "PLAIN";
146 case Encoding::PLAIN_DICTIONARY:
147 return "PLAIN_DICTIONARY";
148 case Encoding::RLE:
149 return "RLE";
150 case Encoding::BIT_PACKED:
151 return "BIT_PACKED";
152 case Encoding::DELTA_BINARY_PACKED:
153 return "DELTA_BINARY_PACKED";
154 case Encoding::DELTA_LENGTH_BYTE_ARRAY:
155 return "DELTA_LENGTH_BYTE_ARRAY";
156 case Encoding::DELTA_BYTE_ARRAY:
157 return "DELTA_BYTE_ARRAY";
158 case Encoding::RLE_DICTIONARY:
159 return "RLE_DICTIONARY";
160 default:
161 return "UNKNOWN";
162 }
163}
164
165std::string TypeToString(Type::type t) {
166 switch (t) {
167 case Type::BOOLEAN:
168 return "BOOLEAN";
169 case Type::INT32:
170 return "INT32";
171 case Type::INT64:
172 return "INT64";
173 case Type::INT96:
174 return "INT96";
175 case Type::FLOAT:
176 return "FLOAT";
177 case Type::DOUBLE:
178 return "DOUBLE";
179 case Type::BYTE_ARRAY:
180 return "BYTE_ARRAY";
181 case Type::FIXED_LEN_BYTE_ARRAY:
182 return "FIXED_LEN_BYTE_ARRAY";
183 case Type::UNDEFINED:
184 default:
185 return "UNKNOWN";
186 }
187}
188
189std::string ConvertedTypeToString(ConvertedType::type t) {
190 switch (t) {
191 case ConvertedType::NONE:
192 return "NONE";
193 case ConvertedType::UTF8:
194 return "UTF8";
195 case ConvertedType::MAP:
196 return "MAP";
197 case ConvertedType::MAP_KEY_VALUE:
198 return "MAP_KEY_VALUE";
199 case ConvertedType::LIST:
200 return "LIST";
201 case ConvertedType::ENUM:
202 return "ENUM";
203 case ConvertedType::DECIMAL:
204 return "DECIMAL";
205 case ConvertedType::DATE:
206 return "DATE";
207 case ConvertedType::TIME_MILLIS:
208 return "TIME_MILLIS";
209 case ConvertedType::TIME_MICROS:
210 return "TIME_MICROS";
211 case ConvertedType::TIMESTAMP_MILLIS:
212 return "TIMESTAMP_MILLIS";
213 case ConvertedType::TIMESTAMP_MICROS:
214 return "TIMESTAMP_MICROS";
215 case ConvertedType::UINT_8:
216 return "UINT_8";
217 case ConvertedType::UINT_16:
218 return "UINT_16";
219 case ConvertedType::UINT_32:
220 return "UINT_32";
221 case ConvertedType::UINT_64:
222 return "UINT_64";
223 case ConvertedType::INT_8:
224 return "INT_8";
225 case ConvertedType::INT_16:
226 return "INT_16";
227 case ConvertedType::INT_32:
228 return "INT_32";
229 case ConvertedType::INT_64:
230 return "INT_64";
231 case ConvertedType::JSON:
232 return "JSON";
233 case ConvertedType::BSON:
234 return "BSON";
235 case ConvertedType::INTERVAL:
236 return "INTERVAL";
237 case ConvertedType::UNDEFINED:
238 default:
239 return "UNKNOWN";
240 }
241}
242
243int GetTypeByteSize(Type::type parquet_type) {
244 switch (parquet_type) {
245 case Type::BOOLEAN:
246 return type_traits<BooleanType::type_num>::value_byte_size;
247 case Type::INT32:
248 return type_traits<Int32Type::type_num>::value_byte_size;
249 case Type::INT64:
250 return type_traits<Int64Type::type_num>::value_byte_size;
251 case Type::INT96:
252 return type_traits<Int96Type::type_num>::value_byte_size;
253 case Type::DOUBLE:
254 return type_traits<DoubleType::type_num>::value_byte_size;
255 case Type::FLOAT:
256 return type_traits<FloatType::type_num>::value_byte_size;
257 case Type::BYTE_ARRAY:
258 return type_traits<ByteArrayType::type_num>::value_byte_size;
259 case Type::FIXED_LEN_BYTE_ARRAY:
260 return type_traits<FLBAType::type_num>::value_byte_size;
261 case Type::UNDEFINED:
262 default:
263 return 0;
264 }
265 return 0;
266}
267
268// Return the Sort Order of the Parquet Physical Types
269SortOrder::type DefaultSortOrder(Type::type primitive) {
270 switch (primitive) {
271 case Type::BOOLEAN:
272 case Type::INT32:
273 case Type::INT64:
274 case Type::FLOAT:
275 case Type::DOUBLE:
276 return SortOrder::SIGNED;
277 case Type::BYTE_ARRAY:
278 case Type::FIXED_LEN_BYTE_ARRAY:
279 return SortOrder::UNSIGNED;
280 case Type::INT96:
281 case Type::UNDEFINED:
282 return SortOrder::UNKNOWN;
283 }
284 return SortOrder::UNKNOWN;
285}
286
287// Return the SortOrder of the Parquet Types using Logical or Physical Types
288SortOrder::type GetSortOrder(ConvertedType::type converted, Type::type primitive) {
289 if (converted == ConvertedType::NONE) return DefaultSortOrder(primitive);
290 switch (converted) {
291 case ConvertedType::INT_8:
292 case ConvertedType::INT_16:
293 case ConvertedType::INT_32:
294 case ConvertedType::INT_64:
295 case ConvertedType::DATE:
296 case ConvertedType::TIME_MICROS:
297 case ConvertedType::TIME_MILLIS:
298 case ConvertedType::TIMESTAMP_MICROS:
299 case ConvertedType::TIMESTAMP_MILLIS:
300 return SortOrder::SIGNED;
301 case ConvertedType::UINT_8:
302 case ConvertedType::UINT_16:
303 case ConvertedType::UINT_32:
304 case ConvertedType::UINT_64:
305 case ConvertedType::ENUM:
306 case ConvertedType::UTF8:
307 case ConvertedType::BSON:
308 case ConvertedType::JSON:
309 return SortOrder::UNSIGNED;
310 case ConvertedType::DECIMAL:
311 case ConvertedType::LIST:
312 case ConvertedType::MAP:
313 case ConvertedType::MAP_KEY_VALUE:
314 case ConvertedType::INTERVAL:
315 case ConvertedType::NONE: // required instead of default
316 case ConvertedType::NA: // required instead of default
317 case ConvertedType::UNDEFINED:
318 return SortOrder::UNKNOWN;
319 }
320 return SortOrder::UNKNOWN;
321}
322
323SortOrder::type GetSortOrder(const std::shared_ptr<const LogicalType>& logical_type,
324 Type::type primitive) {
325 SortOrder::type o = SortOrder::UNKNOWN;
326 if (logical_type && logical_type->is_valid()) {
327 o = (logical_type->is_none() ? DefaultSortOrder(primitive)
328 : logical_type->sort_order());
329 }
330 return o;
331}
332
// Definitions of ColumnOrder's static members: one instance constructed from
// ColumnOrder::UNDEFINED and one from ColumnOrder::TYPE_DEFINED_ORDER.
ColumnOrder ColumnOrder::undefined_ = ColumnOrder(ColumnOrder::UNDEFINED);
ColumnOrder ColumnOrder::type_defined_ = ColumnOrder(ColumnOrder::TYPE_DEFINED_ORDER);
335
336// Static methods for LogicalType class
337
338std::shared_ptr<const LogicalType> LogicalType::FromConvertedType(
339 const ConvertedType::type converted_type,
340 const schema::DecimalMetadata converted_decimal_metadata) {
341 switch (converted_type) {
342 case ConvertedType::UTF8:
343 return StringLogicalType::Make();
344 case ConvertedType::MAP_KEY_VALUE:
345 case ConvertedType::MAP:
346 return MapLogicalType::Make();
347 case ConvertedType::LIST:
348 return ListLogicalType::Make();
349 case ConvertedType::ENUM:
350 return EnumLogicalType::Make();
351 case ConvertedType::DECIMAL:
352 return DecimalLogicalType::Make(converted_decimal_metadata.precision,
353 converted_decimal_metadata.scale);
354 case ConvertedType::DATE:
355 return DateLogicalType::Make();
356 case ConvertedType::TIME_MILLIS:
357 return TimeLogicalType::Make(true, LogicalType::TimeUnit::MILLIS);
358 case ConvertedType::TIME_MICROS:
359 return TimeLogicalType::Make(true, LogicalType::TimeUnit::MICROS);
360 case ConvertedType::TIMESTAMP_MILLIS:
361 return TimestampLogicalType::Make(true, LogicalType::TimeUnit::MILLIS,
362 /*is_from_converted_type=*/true,
363 /*force_set_converted_type=*/false);
364 case ConvertedType::TIMESTAMP_MICROS:
365 return TimestampLogicalType::Make(true, LogicalType::TimeUnit::MICROS,
366 /*is_from_converted_type=*/true,
367 /*force_set_converted_type=*/false);
368 case ConvertedType::INTERVAL:
369 return IntervalLogicalType::Make();
370 case ConvertedType::INT_8:
371 return IntLogicalType::Make(8, true);
372 case ConvertedType::INT_16:
373 return IntLogicalType::Make(16, true);
374 case ConvertedType::INT_32:
375 return IntLogicalType::Make(32, true);
376 case ConvertedType::INT_64:
377 return IntLogicalType::Make(64, true);
378 case ConvertedType::UINT_8:
379 return IntLogicalType::Make(8, false);
380 case ConvertedType::UINT_16:
381 return IntLogicalType::Make(16, false);
382 case ConvertedType::UINT_32:
383 return IntLogicalType::Make(32, false);
384 case ConvertedType::UINT_64:
385 return IntLogicalType::Make(64, false);
386 case ConvertedType::JSON:
387 return JSONLogicalType::Make();
388 case ConvertedType::BSON:
389 return BSONLogicalType::Make();
390 case ConvertedType::NONE:
391 return NoLogicalType::Make();
392 case ConvertedType::NA:
393 case ConvertedType::UNDEFINED:
394 return UnknownLogicalType::Make();
395 }
396 return UnknownLogicalType::Make();
397}
398
399std::shared_ptr<const LogicalType> LogicalType::FromThrift(
400 const format::LogicalType& type) {
401 if (type.__isset.STRING) {
402 return StringLogicalType::Make();
403 } else if (type.__isset.MAP) {
404 return MapLogicalType::Make();
405 } else if (type.__isset.LIST) {
406 return ListLogicalType::Make();
407 } else if (type.__isset.ENUM) {
408 return EnumLogicalType::Make();
409 } else if (type.__isset.DECIMAL) {
410 return DecimalLogicalType::Make(type.DECIMAL.precision, type.DECIMAL.scale);
411 } else if (type.__isset.DATE) {
412 return DateLogicalType::Make();
413 } else if (type.__isset.TIME) {
414 LogicalType::TimeUnit::unit unit;
415 if (type.TIME.unit.__isset.MILLIS) {
416 unit = LogicalType::TimeUnit::MILLIS;
417 } else if (type.TIME.unit.__isset.MICROS) {
418 unit = LogicalType::TimeUnit::MICROS;
419 } else if (type.TIME.unit.__isset.NANOS) {
420 unit = LogicalType::TimeUnit::NANOS;
421 } else {
422 unit = LogicalType::TimeUnit::UNKNOWN;
423 }
424 return TimeLogicalType::Make(type.TIME.isAdjustedToUTC, unit);
425 } else if (type.__isset.TIMESTAMP) {
426 LogicalType::TimeUnit::unit unit;
427 if (type.TIMESTAMP.unit.__isset.MILLIS) {
428 unit = LogicalType::TimeUnit::MILLIS;
429 } else if (type.TIMESTAMP.unit.__isset.MICROS) {
430 unit = LogicalType::TimeUnit::MICROS;
431 } else if (type.TIMESTAMP.unit.__isset.NANOS) {
432 unit = LogicalType::TimeUnit::NANOS;
433 } else {
434 unit = LogicalType::TimeUnit::UNKNOWN;
435 }
436 return TimestampLogicalType::Make(type.TIMESTAMP.isAdjustedToUTC, unit);
437 // TODO(tpboudreau): activate the commented code after parquet.thrift
438 // recognizes IntervalType as a LogicalType
439 //} else if (type.__isset.INTERVAL) {
440 // return IntervalLogicalType::Make();
441 } else if (type.__isset.INTEGER) {
442 return IntLogicalType::Make(static_cast<int>(type.INTEGER.bitWidth),
443 type.INTEGER.isSigned);
444 } else if (type.__isset.UNKNOWN) {
445 return NullLogicalType::Make();
446 } else if (type.__isset.JSON) {
447 return JSONLogicalType::Make();
448 } else if (type.__isset.BSON) {
449 return BSONLogicalType::Make();
450 } else if (type.__isset.UUID) {
451 return UUIDLogicalType::Make();
452 } else {
453 throw ParquetException("Metadata contains Thrift LogicalType that is not recognized");
454 }
455}
456
457std::shared_ptr<const LogicalType> LogicalType::String() {
458 return StringLogicalType::Make();
459}
460
461std::shared_ptr<const LogicalType> LogicalType::Map() { return MapLogicalType::Make(); }
462
463std::shared_ptr<const LogicalType> LogicalType::List() { return ListLogicalType::Make(); }
464
465std::shared_ptr<const LogicalType> LogicalType::Enum() { return EnumLogicalType::Make(); }
466
467std::shared_ptr<const LogicalType> LogicalType::Decimal(int32_t precision,
468 int32_t scale) {
469 return DecimalLogicalType::Make(precision, scale);
470}
471
472std::shared_ptr<const LogicalType> LogicalType::Date() { return DateLogicalType::Make(); }
473
474std::shared_ptr<const LogicalType> LogicalType::Time(
475 bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit) {
476 DCHECK(time_unit != LogicalType::TimeUnit::UNKNOWN);
477 return TimeLogicalType::Make(is_adjusted_to_utc, time_unit);
478}
479
480std::shared_ptr<const LogicalType> LogicalType::Timestamp(
481 bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
482 bool is_from_converted_type, bool force_set_converted_type) {
483 DCHECK(time_unit != LogicalType::TimeUnit::UNKNOWN);
484 return TimestampLogicalType::Make(is_adjusted_to_utc, time_unit, is_from_converted_type,
485 force_set_converted_type);
486}
487
488std::shared_ptr<const LogicalType> LogicalType::Interval() {
489 return IntervalLogicalType::Make();
490}
491
492std::shared_ptr<const LogicalType> LogicalType::Int(int bit_width, bool is_signed) {
493 DCHECK(bit_width == 64 || bit_width == 32 || bit_width == 16 || bit_width == 8);
494 return IntLogicalType::Make(bit_width, is_signed);
495}
496
497std::shared_ptr<const LogicalType> LogicalType::Null() { return NullLogicalType::Make(); }
498
499std::shared_ptr<const LogicalType> LogicalType::JSON() { return JSONLogicalType::Make(); }
500
501std::shared_ptr<const LogicalType> LogicalType::BSON() { return BSONLogicalType::Make(); }
502
503std::shared_ptr<const LogicalType> LogicalType::UUID() { return UUIDLogicalType::Make(); }
504
505std::shared_ptr<const LogicalType> LogicalType::None() { return NoLogicalType::Make(); }
506
507std::shared_ptr<const LogicalType> LogicalType::Unknown() {
508 return UnknownLogicalType::Make();
509}
510
511/*
512 * The logical type implementation classes are built in four layers: (1) the base
513 * layer, which establishes the interface and provides generally reusable implementations
514 * for the ToJSON() and Equals() methods; (2) an intermediate derived layer for the
515 * "compatibility" methods, which provides implementations for is_compatible() and
516 * ToConvertedType(); (3) another intermediate layer for the "applicability" methods
517 * that provides several implementations for the is_applicable() method; and (4) the
518 * final derived classes, one for each logical type, which supply implementations
519 * for those methods that remain virtual (usually just ToString() and ToThrift()) or
520 * otherwise need to be overridden.
521 */
522
523// LogicalTypeImpl base class
524
525class LogicalType::Impl {
526 public:
527 virtual bool is_applicable(parquet::Type::type primitive_type,
528 int32_t primitive_length = -1) const = 0;
529
530 virtual bool is_compatible(ConvertedType::type converted_type,
531 schema::DecimalMetadata converted_decimal_metadata = {
532 false, -1, -1}) const = 0;
533
534 virtual ConvertedType::type ToConvertedType(
535 schema::DecimalMetadata* out_decimal_metadata) const = 0;
536
537 virtual std::string ToString() const = 0;
538
539 virtual bool is_serialized() const {
540 return !(type_ == LogicalType::Type::NONE || type_ == LogicalType::Type::UNKNOWN);
541 }
542
543 virtual std::string ToJSON() const {
544 std::stringstream json;
545 json << R"({"Type": ")" << ToString() << R"("})";
546 return json.str();
547 }
548
549 virtual format::LogicalType ToThrift() const {
550 // logical types inheriting this method should never be serialized
551 std::stringstream ss;
552 ss << "Logical type " << ToString() << " should not be serialized";
553 throw ParquetException(ss.str());
554 }
555
556 virtual bool Equals(const LogicalType& other) const { return other.type() == type_; }
557
558 LogicalType::Type::type type() const { return type_; }
559
560 SortOrder::type sort_order() const { return order_; }
561
562 Impl(const Impl&) = delete;
563 Impl& operator=(const Impl&) = delete;
564 virtual ~Impl() noexcept {}
565
566 class Compatible;
567 class SimpleCompatible;
568 class Incompatible;
569
570 class Applicable;
571 class SimpleApplicable;
572 class TypeLengthApplicable;
573 class UniversalApplicable;
574 class Inapplicable;
575
576 class String;
577 class Map;
578 class List;
579 class Enum;
580 class Decimal;
581 class Date;
582 class Time;
583 class Timestamp;
584 class Interval;
585 class Int;
586 class Null;
587 class JSON;
588 class BSON;
589 class UUID;
590 class No;
591 class Unknown;
592
593 protected:
594 Impl(LogicalType::Type::type t, SortOrder::type o) : type_(t), order_(o) {}
595 Impl() = default;
596
597 private:
598 LogicalType::Type::type type_ = LogicalType::Type::UNKNOWN;
599 SortOrder::type order_ = SortOrder::UNKNOWN;
600};
601
602// Special methods for public LogicalType class
603
// Both special members are defaulted out of line, after LogicalType::Impl has
// been defined in this file.
// NOTE(review): presumably needed so impl_ can be destroyed with Impl as a
// complete type — confirm against the declaration of impl_ in the header.
LogicalType::LogicalType() = default;
LogicalType::~LogicalType() noexcept = default;
606
607// Delegating methods for public LogicalType class
608
609bool LogicalType::is_applicable(parquet::Type::type primitive_type,
610 int32_t primitive_length) const {
611 return impl_->is_applicable(primitive_type, primitive_length);
612}
613
614bool LogicalType::is_compatible(
615 ConvertedType::type converted_type,
616 schema::DecimalMetadata converted_decimal_metadata) const {
617 return impl_->is_compatible(converted_type, converted_decimal_metadata);
618}
619
620ConvertedType::type LogicalType::ToConvertedType(
621 schema::DecimalMetadata* out_decimal_metadata) const {
622 return impl_->ToConvertedType(out_decimal_metadata);
623}
624
625std::string LogicalType::ToString() const { return impl_->ToString(); }
626
627std::string LogicalType::ToJSON() const { return impl_->ToJSON(); }
628
629format::LogicalType LogicalType::ToThrift() const { return impl_->ToThrift(); }
630
631bool LogicalType::Equals(const LogicalType& other) const { return impl_->Equals(other); }
632
633LogicalType::Type::type LogicalType::type() const { return impl_->type(); }
634
635SortOrder::type LogicalType::sort_order() const { return impl_->sort_order(); }
636
637// Type checks for public LogicalType class
638
639bool LogicalType::is_string() const { return impl_->type() == LogicalType::Type::STRING; }
640bool LogicalType::is_map() const { return impl_->type() == LogicalType::Type::MAP; }
641bool LogicalType::is_list() const { return impl_->type() == LogicalType::Type::LIST; }
642bool LogicalType::is_enum() const { return impl_->type() == LogicalType::Type::ENUM; }
643bool LogicalType::is_decimal() const {
644 return impl_->type() == LogicalType::Type::DECIMAL;
645}
646bool LogicalType::is_date() const { return impl_->type() == LogicalType::Type::DATE; }
647bool LogicalType::is_time() const { return impl_->type() == LogicalType::Type::TIME; }
648bool LogicalType::is_timestamp() const {
649 return impl_->type() == LogicalType::Type::TIMESTAMP;
650}
651bool LogicalType::is_interval() const {
652 return impl_->type() == LogicalType::Type::INTERVAL;
653}
654bool LogicalType::is_int() const { return impl_->type() == LogicalType::Type::INT; }
655bool LogicalType::is_null() const { return impl_->type() == LogicalType::Type::NIL; }
656bool LogicalType::is_JSON() const { return impl_->type() == LogicalType::Type::JSON; }
657bool LogicalType::is_BSON() const { return impl_->type() == LogicalType::Type::BSON; }
658bool LogicalType::is_UUID() const { return impl_->type() == LogicalType::Type::UUID; }
659bool LogicalType::is_none() const { return impl_->type() == LogicalType::Type::NONE; }
660bool LogicalType::is_valid() const { return impl_->type() != LogicalType::Type::UNKNOWN; }
661bool LogicalType::is_invalid() const { return !is_valid(); }
662bool LogicalType::is_nested() const {
663 return (impl_->type() == LogicalType::Type::LIST) ||
664 (impl_->type() == LogicalType::Type::MAP);
665}
666bool LogicalType::is_nonnested() const { return !is_nested(); }
667bool LogicalType::is_serialized() const { return impl_->is_serialized(); }
668
669// LogicalTypeImpl intermediate "compatibility" classes
670
671class LogicalType::Impl::Compatible : public virtual LogicalType::Impl {
672 protected:
673 Compatible() = default;
674};
675
// set_decimal_metadata(m, i, p, s): when pointer `m` is non-null, stores `i`
// in m->isset, `s` in m->scale and `p` in m->precision; a null `m` is ignored.
// Note the argument order is (isset, precision, scale).
#define set_decimal_metadata(m___, i___, p___, s___) \
  {                                                  \
    if (m___) {                                      \
      (m___)->isset = (i___);                        \
      (m___)->scale = (s___);                        \
      (m___)->precision = (p___);                    \
    }                                                \
  }

// reset_decimal_metadata(m): marks the metadata at `m` as unset, with
// precision and scale both -1; a null `m` is ignored.
#define reset_decimal_metadata(m___) \
  { set_decimal_metadata(m___, false, -1, -1); }
687
688// For logical types that always translate to the same converted type
689class LogicalType::Impl::SimpleCompatible : public virtual LogicalType::Impl::Compatible {
690 public:
691 bool is_compatible(ConvertedType::type converted_type,
692 schema::DecimalMetadata converted_decimal_metadata) const override {
693 return (converted_type == converted_type_) && !converted_decimal_metadata.isset;
694 }
695
696 ConvertedType::type ToConvertedType(
697 schema::DecimalMetadata* out_decimal_metadata) const override {
698 reset_decimal_metadata(out_decimal_metadata);
699 return converted_type_;
700 }
701
702 protected:
703 explicit SimpleCompatible(ConvertedType::type c) : converted_type_(c) {}
704
705 private:
706 ConvertedType::type converted_type_ = ConvertedType::NA;
707};
708
709// For logical types that have no corresponding converted type
710class LogicalType::Impl::Incompatible : public virtual LogicalType::Impl {
711 public:
712 bool is_compatible(ConvertedType::type converted_type,
713 schema::DecimalMetadata converted_decimal_metadata) const override {
714 return (converted_type == ConvertedType::NONE ||
715 converted_type == ConvertedType::NA) &&
716 !converted_decimal_metadata.isset;
717 }
718
719 ConvertedType::type ToConvertedType(
720 schema::DecimalMetadata* out_decimal_metadata) const override {
721 reset_decimal_metadata(out_decimal_metadata);
722 return ConvertedType::NONE;
723 }
724
725 protected:
726 Incompatible() = default;
727};
728
729// LogicalTypeImpl intermediate "applicability" classes
730
731class LogicalType::Impl::Applicable : public virtual LogicalType::Impl {
732 protected:
733 Applicable() = default;
734};
735
736// For logical types that can apply only to a single
737// physical type
738class LogicalType::Impl::SimpleApplicable : public virtual LogicalType::Impl::Applicable {
739 public:
740 bool is_applicable(parquet::Type::type primitive_type,
741 int32_t primitive_length = -1) const override {
742 return primitive_type == type_;
743 }
744
745 protected:
746 explicit SimpleApplicable(parquet::Type::type t) : type_(t) {}
747
748 private:
749 parquet::Type::type type_;
750};
751
752// For logical types that can apply only to a particular
753// physical type and physical length combination
754class LogicalType::Impl::TypeLengthApplicable
755 : public virtual LogicalType::Impl::Applicable {
756 public:
757 bool is_applicable(parquet::Type::type primitive_type,
758 int32_t primitive_length = -1) const override {
759 return primitive_type == type_ && primitive_length == length_;
760 }
761
762 protected:
763 TypeLengthApplicable(parquet::Type::type t, int32_t l) : type_(t), length_(l) {}
764
765 private:
766 parquet::Type::type type_;
767 int32_t length_;
768};
769
770// For logical types that can apply to any physical type
771class LogicalType::Impl::UniversalApplicable
772 : public virtual LogicalType::Impl::Applicable {
773 public:
774 bool is_applicable(parquet::Type::type primitive_type,
775 int32_t primitive_length = -1) const override {
776 return true;
777 }
778
779 protected:
780 UniversalApplicable() = default;
781};
782
783// For logical types that can never apply to any primitive
784// physical type
785class LogicalType::Impl::Inapplicable : public virtual LogicalType::Impl {
786 public:
787 bool is_applicable(parquet::Type::type primitive_type,
788 int32_t primitive_length = -1) const override {
789 return false;
790 }
791
792 protected:
793 Inapplicable() = default;
794};
795
796// LogicalType implementation final classes
797
// OVERRIDE_TOSTRING(n): defines a ToString() override that returns the macro
// argument spelled out as a string literal (e.g. OVERRIDE_TOSTRING(Map)
// returns "Map").
#define OVERRIDE_TOSTRING(n___) \
  std::string ToString() const override { return #n___; }

// OVERRIDE_TOTHRIFT(t, s): defines a ToThrift() override that
// default-constructs a format::<t> object, installs it in a fresh
// format::LogicalType through __set_<s>(), and returns the result.
#define OVERRIDE_TOTHRIFT(t___, s___)                  \
  format::LogicalType ToThrift() const override {      \
    format::LogicalType type;                          \
    format::t___ subtype;                              \
    type.__set_##s___(subtype);                        \
    return type;                                       \
  }
808
809class LogicalType::Impl::String final : public LogicalType::Impl::SimpleCompatible,
810 public LogicalType::Impl::SimpleApplicable {
811 public:
812 friend class StringLogicalType;
813
814 OVERRIDE_TOSTRING(String)
815 OVERRIDE_TOTHRIFT(StringType, STRING)
816
817 private:
818 String()
819 : LogicalType::Impl(LogicalType::Type::STRING, SortOrder::UNSIGNED),
820 LogicalType::Impl::SimpleCompatible(ConvertedType::UTF8),
821 LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
822};
823
// Each public logical type class's Make() creation method instantiates a corresponding
// LogicalType::Impl::* object and installs that implementation in the logical type
// it returns.
//
// GENERATE_MAKE(a) defines a##LogicalType::Make() for a logical type without
// parameters: it allocates an a##LogicalType, resets its impl_ to a new
// LogicalType::Impl::a, and hands the object to a shared_ptr<const LogicalType>.
// NOTE(review): if the Impl constructor throws, the raw `logical_type` pointer
// is not yet owned by a smart pointer and would leak — confirm whether that
// can happen in practice.

#define GENERATE_MAKE(a___)                                      \
  std::shared_ptr<const LogicalType> a___##LogicalType::Make() { \
    auto* logical_type = new a___##LogicalType();                \
    logical_type->impl_.reset(new LogicalType::Impl::a___());    \
    return std::shared_ptr<const LogicalType>(logical_type);     \
  }

GENERATE_MAKE(String)
836
837class LogicalType::Impl::Map final : public LogicalType::Impl::SimpleCompatible,
838 public LogicalType::Impl::Inapplicable {
839 public:
840 friend class MapLogicalType;
841
842 bool is_compatible(ConvertedType::type converted_type,
843 schema::DecimalMetadata converted_decimal_metadata) const override {
844 return (converted_type == ConvertedType::MAP ||
845 converted_type == ConvertedType::MAP_KEY_VALUE) &&
846 !converted_decimal_metadata.isset;
847 }
848
849 OVERRIDE_TOSTRING(Map)
850 OVERRIDE_TOTHRIFT(MapType, MAP)
851
852 private:
853 Map()
854 : LogicalType::Impl(LogicalType::Type::MAP, SortOrder::UNKNOWN),
855 LogicalType::Impl::SimpleCompatible(ConvertedType::MAP) {}
856};
857
858GENERATE_MAKE(Map)
859
860class LogicalType::Impl::List final : public LogicalType::Impl::SimpleCompatible,
861 public LogicalType::Impl::Inapplicable {
862 public:
863 friend class ListLogicalType;
864
865 OVERRIDE_TOSTRING(List)
866 OVERRIDE_TOTHRIFT(ListType, LIST)
867
868 private:
869 List()
870 : LogicalType::Impl(LogicalType::Type::LIST, SortOrder::UNKNOWN),
871 LogicalType::Impl::SimpleCompatible(ConvertedType::LIST) {}
872};
873
874GENERATE_MAKE(List)
875
876class LogicalType::Impl::Enum final : public LogicalType::Impl::SimpleCompatible,
877 public LogicalType::Impl::SimpleApplicable {
878 public:
879 friend class EnumLogicalType;
880
881 OVERRIDE_TOSTRING(Enum)
882 OVERRIDE_TOTHRIFT(EnumType, ENUM)
883
884 private:
885 Enum()
886 : LogicalType::Impl(LogicalType::Type::ENUM, SortOrder::UNSIGNED),
887 LogicalType::Impl::SimpleCompatible(ConvertedType::ENUM),
888 LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
889};
890
891GENERATE_MAKE(Enum)
892
893// The parameterized logical types (currently Decimal, Time, Timestamp, and Int)
894// generally can't reuse the simple method implementations available in the base and
895// intermediate classes and must (re)implement them all
896
897class LogicalType::Impl::Decimal final : public LogicalType::Impl::Compatible,
898 public LogicalType::Impl::Applicable {
899 public:
900 friend class DecimalLogicalType;
901
902 bool is_applicable(parquet::Type::type primitive_type,
903 int32_t primitive_length = -1) const override;
904 bool is_compatible(ConvertedType::type converted_type,
905 schema::DecimalMetadata converted_decimal_metadata) const override;
906 ConvertedType::type ToConvertedType(
907 schema::DecimalMetadata* out_decimal_metadata) const override;
908 std::string ToString() const override;
909 std::string ToJSON() const override;
910 format::LogicalType ToThrift() const override;
911 bool Equals(const LogicalType& other) const override;
912
913 int32_t precision() const { return precision_; }
914 int32_t scale() const { return scale_; }
915
916 private:
917 Decimal(int32_t p, int32_t s)
918 : LogicalType::Impl(LogicalType::Type::DECIMAL, SortOrder::SIGNED),
919 precision_(p),
920 scale_(s) {}
921 int32_t precision_ = -1;
922 int32_t scale_ = -1;
923};
924
925bool LogicalType::Impl::Decimal::is_applicable(parquet::Type::type primitive_type,
926 int32_t primitive_length) const {
927 bool ok = false;
928 switch (primitive_type) {
929 case parquet::Type::INT32: {
930 ok = (1 <= precision_) && (precision_ <= 9);
931 } break;
932 case parquet::Type::INT64: {
933 ok = (1 <= precision_) && (precision_ <= 18);
934 if (precision_ < 10) {
935 // FIXME(tpb): warn that INT32 could be used
936 }
937 } break;
938 case parquet::Type::FIXED_LEN_BYTE_ARRAY: {
939 ok = precision_ <= static_cast<int32_t>(std::floor(
940 std::log10(std::pow(2.0, (8.0 * primitive_length) - 1.0))));
941 } break;
942 case parquet::Type::BYTE_ARRAY: {
943 ok = true;
944 } break;
945 default: { } break; }
946 return ok;
947}
948
949bool LogicalType::Impl::Decimal::is_compatible(
950 ConvertedType::type converted_type,
951 schema::DecimalMetadata converted_decimal_metadata) const {
952 return converted_type == ConvertedType::DECIMAL &&
953 (converted_decimal_metadata.isset &&
954 converted_decimal_metadata.scale == scale_ &&
955 converted_decimal_metadata.precision == precision_);
956}
957
958ConvertedType::type LogicalType::Impl::Decimal::ToConvertedType(
959 schema::DecimalMetadata* out_decimal_metadata) const {
960 set_decimal_metadata(out_decimal_metadata, true, precision_, scale_);
961 return ConvertedType::DECIMAL;
962}
963
964std::string LogicalType::Impl::Decimal::ToString() const {
965 std::stringstream type;
966 type << "Decimal(precision=" << precision_ << ", scale=" << scale_ << ")";
967 return type.str();
968}
969
970std::string LogicalType::Impl::Decimal::ToJSON() const {
971 std::stringstream json;
972 json << R"({"Type": "Decimal", "precision": )" << precision_ << R"(, "scale": )"
973 << scale_ << "}";
974 return json.str();
975}
976
977format::LogicalType LogicalType::Impl::Decimal::ToThrift() const {
978 format::LogicalType type;
979 format::DecimalType decimal_type;
980 decimal_type.__set_precision(precision_);
981 decimal_type.__set_scale(scale_);
982 type.__set_DECIMAL(decimal_type);
983 return type;
984}
985
986bool LogicalType::Impl::Decimal::Equals(const LogicalType& other) const {
987 bool eq = false;
988 if (other.is_decimal()) {
989 const auto& other_decimal = checked_cast<const DecimalLogicalType&>(other);
990 eq = (precision_ == other_decimal.precision() && scale_ == other_decimal.scale());
991 }
992 return eq;
993}
994
995std::shared_ptr<const LogicalType> DecimalLogicalType::Make(int32_t precision,
996 int32_t scale) {
997 if (precision < 1) {
998 throw ParquetException(
999 "Precision must be greater than or equal to 1 for Decimal logical type");
1000 }
1001 if (scale < 0 || scale > precision) {
1002 throw ParquetException(
1003 "Scale must be a non-negative integer that does not exceed precision for "
1004 "Decimal logical type");
1005 }
1006 auto* logical_type = new DecimalLogicalType();
1007 logical_type->impl_.reset(new LogicalType::Impl::Decimal(precision, scale));
1008 return std::shared_ptr<const LogicalType>(logical_type);
1009}
1010
1011int32_t DecimalLogicalType::precision() const {
1012 return (dynamic_cast<const LogicalType::Impl::Decimal&>(*impl_)).precision();
1013}
1014
1015int32_t DecimalLogicalType::scale() const {
1016 return (dynamic_cast<const LogicalType::Impl::Decimal&>(*impl_)).scale();
1017}
1018
// Implementation of the DATE logical type. The constructor registers it as
// LogicalType::Type::DATE with SortOrder::SIGNED, hands ConvertedType::DATE to
// SimpleCompatible and parquet::Type::INT32 to SimpleApplicable.
class LogicalType::Impl::Date final : public LogicalType::Impl::SimpleCompatible,
                                      public LogicalType::Impl::SimpleApplicable {
 public:
  friend class DateLogicalType;

  // NOTE(review): both macros are defined elsewhere; presumably they supply
  // the ToString() and ToThrift() overrides -- confirm against the macros.
  OVERRIDE_TOSTRING(Date)
  OVERRIDE_TOTHRIFT(DateType, DATE)

 private:
  // Private: DateLogicalType is the only friend of this class.
  Date()
      : LogicalType::Impl(LogicalType::Type::DATE, SortOrder::SIGNED),
        LogicalType::Impl::SimpleCompatible(ConvertedType::DATE),
        LogicalType::Impl::SimpleApplicable(parquet::Type::INT32) {}
};

// NOTE(review): GENERATE_MAKE is defined elsewhere; presumably it emits
// DateLogicalType::Make() -- confirm against the macro definition.
GENERATE_MAKE(Date)
1035
1036#define time_unit_string(u___) \
1037 ((u___) == LogicalType::TimeUnit::MILLIS \
1038 ? "milliseconds" \
1039 : ((u___) == LogicalType::TimeUnit::MICROS \
1040 ? "microseconds" \
1041 : ((u___) == LogicalType::TimeUnit::NANOS ? "nanoseconds" : "unknown")))
1042
1043class LogicalType::Impl::Time final : public LogicalType::Impl::Compatible,
1044 public LogicalType::Impl::Applicable {
1045 public:
1046 friend class TimeLogicalType;
1047
1048 bool is_applicable(parquet::Type::type primitive_type,
1049 int32_t primitive_length = -1) const override;
1050 bool is_compatible(ConvertedType::type converted_type,
1051 schema::DecimalMetadata converted_decimal_metadata) const override;
1052 ConvertedType::type ToConvertedType(
1053 schema::DecimalMetadata* out_decimal_metadata) const override;
1054 std::string ToString() const override;
1055 std::string ToJSON() const override;
1056 format::LogicalType ToThrift() const override;
1057 bool Equals(const LogicalType& other) const override;
1058
1059 bool is_adjusted_to_utc() const { return adjusted_; }
1060 LogicalType::TimeUnit::unit time_unit() const { return unit_; }
1061
1062 private:
1063 Time(bool a, LogicalType::TimeUnit::unit u)
1064 : LogicalType::Impl(LogicalType::Type::TIME, SortOrder::SIGNED),
1065 adjusted_(a),
1066 unit_(u) {}
1067 bool adjusted_ = false;
1068 LogicalType::TimeUnit::unit unit_;
1069};
1070
1071bool LogicalType::Impl::Time::is_applicable(parquet::Type::type primitive_type,
1072 int32_t primitive_length) const {
1073 return (primitive_type == parquet::Type::INT32 &&
1074 unit_ == LogicalType::TimeUnit::MILLIS) ||
1075 (primitive_type == parquet::Type::INT64 &&
1076 (unit_ == LogicalType::TimeUnit::MICROS ||
1077 unit_ == LogicalType::TimeUnit::NANOS));
1078}
1079
1080bool LogicalType::Impl::Time::is_compatible(
1081 ConvertedType::type converted_type,
1082 schema::DecimalMetadata converted_decimal_metadata) const {
1083 if (converted_decimal_metadata.isset) {
1084 return false;
1085 } else if (adjusted_ && unit_ == LogicalType::TimeUnit::MILLIS) {
1086 return converted_type == ConvertedType::TIME_MILLIS;
1087 } else if (adjusted_ && unit_ == LogicalType::TimeUnit::MICROS) {
1088 return converted_type == ConvertedType::TIME_MICROS;
1089 } else {
1090 return (converted_type == ConvertedType::NONE) ||
1091 (converted_type == ConvertedType::NA);
1092 }
1093}
1094
1095ConvertedType::type LogicalType::Impl::Time::ToConvertedType(
1096 schema::DecimalMetadata* out_decimal_metadata) const {
1097 reset_decimal_metadata(out_decimal_metadata);
1098 if (adjusted_) {
1099 if (unit_ == LogicalType::TimeUnit::MILLIS) {
1100 return ConvertedType::TIME_MILLIS;
1101 } else if (unit_ == LogicalType::TimeUnit::MICROS) {
1102 return ConvertedType::TIME_MICROS;
1103 }
1104 }
1105 return ConvertedType::NONE;
1106}
1107
1108std::string LogicalType::Impl::Time::ToString() const {
1109 std::stringstream type;
1110 type << "Time(isAdjustedToUTC=" << std::boolalpha << adjusted_
1111 << ", timeUnit=" << time_unit_string(unit_) << ")";
1112 return type.str();
1113}
1114
1115std::string LogicalType::Impl::Time::ToJSON() const {
1116 std::stringstream json;
1117 json << R"({"Type": "Time", "isAdjustedToUTC": )" << std::boolalpha << adjusted_
1118 << R"(, "timeUnit": ")" << time_unit_string(unit_) << R"("})";
1119 return json.str();
1120}
1121
1122format::LogicalType LogicalType::Impl::Time::ToThrift() const {
1123 format::LogicalType type;
1124 format::TimeType time_type;
1125 format::TimeUnit time_unit;
1126 DCHECK(unit_ != LogicalType::TimeUnit::UNKNOWN);
1127 if (unit_ == LogicalType::TimeUnit::MILLIS) {
1128 format::MilliSeconds millis;
1129 time_unit.__set_MILLIS(millis);
1130 } else if (unit_ == LogicalType::TimeUnit::MICROS) {
1131 format::MicroSeconds micros;
1132 time_unit.__set_MICROS(micros);
1133 } else if (unit_ == LogicalType::TimeUnit::NANOS) {
1134 format::NanoSeconds nanos;
1135 time_unit.__set_NANOS(nanos);
1136 }
1137 time_type.__set_isAdjustedToUTC(adjusted_);
1138 time_type.__set_unit(time_unit);
1139 type.__set_TIME(time_type);
1140 return type;
1141}
1142
1143bool LogicalType::Impl::Time::Equals(const LogicalType& other) const {
1144 bool eq = false;
1145 if (other.is_time()) {
1146 const auto& other_time = checked_cast<const TimeLogicalType&>(other);
1147 eq =
1148 (adjusted_ == other_time.is_adjusted_to_utc() && unit_ == other_time.time_unit());
1149 }
1150 return eq;
1151}
1152
1153std::shared_ptr<const LogicalType> TimeLogicalType::Make(
1154 bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit) {
1155 if (time_unit == LogicalType::TimeUnit::MILLIS ||
1156 time_unit == LogicalType::TimeUnit::MICROS ||
1157 time_unit == LogicalType::TimeUnit::NANOS) {
1158 auto* logical_type = new TimeLogicalType();
1159 logical_type->impl_.reset(new LogicalType::Impl::Time(is_adjusted_to_utc, time_unit));
1160 return std::shared_ptr<const LogicalType>(logical_type);
1161 } else {
1162 throw ParquetException(
1163 "TimeUnit must be one of MILLIS, MICROS, or NANOS for Time logical type");
1164 }
1165}
1166
1167bool TimeLogicalType::is_adjusted_to_utc() const {
1168 return (dynamic_cast<const LogicalType::Impl::Time&>(*impl_)).is_adjusted_to_utc();
1169}
1170
1171LogicalType::TimeUnit::unit TimeLogicalType::time_unit() const {
1172 return (dynamic_cast<const LogicalType::Impl::Time&>(*impl_)).time_unit();
1173}
1174
1175class LogicalType::Impl::Timestamp final : public LogicalType::Impl::Compatible,
1176 public LogicalType::Impl::SimpleApplicable {
1177 public:
1178 friend class TimestampLogicalType;
1179
1180 bool is_serialized() const override;
1181 bool is_compatible(ConvertedType::type converted_type,
1182 schema::DecimalMetadata converted_decimal_metadata) const override;
1183 ConvertedType::type ToConvertedType(
1184 schema::DecimalMetadata* out_decimal_metadata) const override;
1185 std::string ToString() const override;
1186 std::string ToJSON() const override;
1187 format::LogicalType ToThrift() const override;
1188 bool Equals(const LogicalType& other) const override;
1189
1190 bool is_adjusted_to_utc() const { return adjusted_; }
1191 LogicalType::TimeUnit::unit time_unit() const { return unit_; }
1192
1193 bool is_from_converted_type() const { return is_from_converted_type_; }
1194 bool force_set_converted_type() const { return force_set_converted_type_; }
1195
1196 private:
1197 Timestamp(bool adjusted, LogicalType::TimeUnit::unit unit, bool is_from_converted_type,
1198 bool force_set_converted_type)
1199 : LogicalType::Impl(LogicalType::Type::TIMESTAMP, SortOrder::SIGNED),
1200 LogicalType::Impl::SimpleApplicable(parquet::Type::INT64),
1201 adjusted_(adjusted),
1202 unit_(unit),
1203 is_from_converted_type_(is_from_converted_type),
1204 force_set_converted_type_(force_set_converted_type) {}
1205 bool adjusted_ = false;
1206 LogicalType::TimeUnit::unit unit_;
1207 bool is_from_converted_type_ = false;
1208 bool force_set_converted_type_ = false;
1209};
1210
1211bool LogicalType::Impl::Timestamp::is_serialized() const {
1212 return !is_from_converted_type_;
1213}
1214
1215bool LogicalType::Impl::Timestamp::is_compatible(
1216 ConvertedType::type converted_type,
1217 schema::DecimalMetadata converted_decimal_metadata) const {
1218 if (converted_decimal_metadata.isset) {
1219 return false;
1220 } else if (unit_ == LogicalType::TimeUnit::MILLIS) {
1221 if (adjusted_ || force_set_converted_type_) {
1222 return converted_type == ConvertedType::TIMESTAMP_MILLIS;
1223 } else {
1224 return (converted_type == ConvertedType::NONE) ||
1225 (converted_type == ConvertedType::NA);
1226 }
1227 } else if (unit_ == LogicalType::TimeUnit::MICROS) {
1228 if (adjusted_ || force_set_converted_type_) {
1229 return converted_type == ConvertedType::TIMESTAMP_MICROS;
1230 } else {
1231 return (converted_type == ConvertedType::NONE) ||
1232 (converted_type == ConvertedType::NA);
1233 }
1234 } else {
1235 return (converted_type == ConvertedType::NONE) ||
1236 (converted_type == ConvertedType::NA);
1237 }
1238}
1239
1240ConvertedType::type LogicalType::Impl::Timestamp::ToConvertedType(
1241 schema::DecimalMetadata* out_decimal_metadata) const {
1242 reset_decimal_metadata(out_decimal_metadata);
1243 if (adjusted_ || force_set_converted_type_) {
1244 if (unit_ == LogicalType::TimeUnit::MILLIS) {
1245 return ConvertedType::TIMESTAMP_MILLIS;
1246 } else if (unit_ == LogicalType::TimeUnit::MICROS) {
1247 return ConvertedType::TIMESTAMP_MICROS;
1248 }
1249 }
1250 return ConvertedType::NONE;
1251}
1252
1253std::string LogicalType::Impl::Timestamp::ToString() const {
1254 std::stringstream type;
1255 type << "Timestamp(isAdjustedToUTC=" << std::boolalpha << adjusted_
1256 << ", timeUnit=" << time_unit_string(unit_)
1257 << ", is_from_converted_type=" << is_from_converted_type_
1258 << ", force_set_converted_type=" << force_set_converted_type_ << ")";
1259 return type.str();
1260}
1261
1262std::string LogicalType::Impl::Timestamp::ToJSON() const {
1263 std::stringstream json;
1264 json << R"({"Type": "Timestamp", "isAdjustedToUTC": )" << std::boolalpha << adjusted_
1265 << R"(, "timeUnit": ")" << time_unit_string(unit_) << R"(")"
1266 << R"(, "is_from_converted_type": )" << is_from_converted_type_
1267 << R"(, "force_set_converted_type": )" << force_set_converted_type_ << R"(})";
1268 return json.str();
1269}
1270
1271format::LogicalType LogicalType::Impl::Timestamp::ToThrift() const {
1272 format::LogicalType type;
1273 format::TimestampType timestamp_type;
1274 format::TimeUnit time_unit;
1275 DCHECK(unit_ != LogicalType::TimeUnit::UNKNOWN);
1276 if (unit_ == LogicalType::TimeUnit::MILLIS) {
1277 format::MilliSeconds millis;
1278 time_unit.__set_MILLIS(millis);
1279 } else if (unit_ == LogicalType::TimeUnit::MICROS) {
1280 format::MicroSeconds micros;
1281 time_unit.__set_MICROS(micros);
1282 } else if (unit_ == LogicalType::TimeUnit::NANOS) {
1283 format::NanoSeconds nanos;
1284 time_unit.__set_NANOS(nanos);
1285 }
1286 timestamp_type.__set_isAdjustedToUTC(adjusted_);
1287 timestamp_type.__set_unit(time_unit);
1288 type.__set_TIMESTAMP(timestamp_type);
1289 return type;
1290}
1291
1292bool LogicalType::Impl::Timestamp::Equals(const LogicalType& other) const {
1293 bool eq = false;
1294 if (other.is_timestamp()) {
1295 const auto& other_timestamp = checked_cast<const TimestampLogicalType&>(other);
1296 eq = (adjusted_ == other_timestamp.is_adjusted_to_utc() &&
1297 unit_ == other_timestamp.time_unit());
1298 }
1299 return eq;
1300}
1301
1302std::shared_ptr<const LogicalType> TimestampLogicalType::Make(
1303 bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
1304 bool is_from_converted_type, bool force_set_converted_type) {
1305 if (time_unit == LogicalType::TimeUnit::MILLIS ||
1306 time_unit == LogicalType::TimeUnit::MICROS ||
1307 time_unit == LogicalType::TimeUnit::NANOS) {
1308 auto* logical_type = new TimestampLogicalType();
1309 logical_type->impl_.reset(new LogicalType::Impl::Timestamp(
1310 is_adjusted_to_utc, time_unit, is_from_converted_type, force_set_converted_type));
1311 return std::shared_ptr<const LogicalType>(logical_type);
1312 } else {
1313 throw ParquetException(
1314 "TimeUnit must be one of MILLIS, MICROS, or NANOS for Timestamp logical type");
1315 }
1316}
1317
1318bool TimestampLogicalType::is_adjusted_to_utc() const {
1319 return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_)).is_adjusted_to_utc();
1320}
1321
1322LogicalType::TimeUnit::unit TimestampLogicalType::time_unit() const {
1323 return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_)).time_unit();
1324}
1325
1326bool TimestampLogicalType::is_from_converted_type() const {
1327 return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_))
1328 .is_from_converted_type();
1329}
1330
1331bool TimestampLogicalType::force_set_converted_type() const {
1332 return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_))
1333 .force_set_converted_type();
1334}
1335
// Implementation of the INTERVAL logical type. The constructor registers it as
// LogicalType::Type::INTERVAL with SortOrder::UNKNOWN, hands
// ConvertedType::INTERVAL to SimpleCompatible and
// (parquet::Type::FIXED_LEN_BYTE_ARRAY, 12) to TypeLengthApplicable.
class LogicalType::Impl::Interval final : public LogicalType::Impl::SimpleCompatible,
                                          public LogicalType::Impl::TypeLengthApplicable {
 public:
  friend class IntervalLogicalType;

  // NOTE(review): OVERRIDE_TOSTRING is defined elsewhere; presumably it
  // supplies the ToString() override -- confirm against the macro.
  OVERRIDE_TOSTRING(Interval)
  // TODO(tpboudreau): uncomment the following line to enable serialization after
  // parquet.thrift recognizes IntervalType as a ConvertedType
  // OVERRIDE_TOTHRIFT(IntervalType, INTERVAL)

 private:
  // Private: IntervalLogicalType is the only friend of this class.
  Interval()
      : LogicalType::Impl(LogicalType::Type::INTERVAL, SortOrder::UNKNOWN),
        LogicalType::Impl::SimpleCompatible(ConvertedType::INTERVAL),
        LogicalType::Impl::TypeLengthApplicable(parquet::Type::FIXED_LEN_BYTE_ARRAY, 12) {
  }
};

// NOTE(review): GENERATE_MAKE is defined elsewhere; presumably it emits
// IntervalLogicalType::Make() -- confirm against the macro definition.
GENERATE_MAKE(Interval)
1355
1356class LogicalType::Impl::Int final : public LogicalType::Impl::Compatible,
1357 public LogicalType::Impl::Applicable {
1358 public:
1359 friend class IntLogicalType;
1360
1361 bool is_applicable(parquet::Type::type primitive_type,
1362 int32_t primitive_length = -1) const override;
1363 bool is_compatible(ConvertedType::type converted_type,
1364 schema::DecimalMetadata converted_decimal_metadata) const override;
1365 ConvertedType::type ToConvertedType(
1366 schema::DecimalMetadata* out_decimal_metadata) const override;
1367 std::string ToString() const override;
1368 std::string ToJSON() const override;
1369 format::LogicalType ToThrift() const override;
1370 bool Equals(const LogicalType& other) const override;
1371
1372 int bit_width() const { return width_; }
1373 bool is_signed() const { return signed_; }
1374
1375 private:
1376 Int(int w, bool s)
1377 : LogicalType::Impl(LogicalType::Type::INT,
1378 (s ? SortOrder::SIGNED : SortOrder::UNSIGNED)),
1379 width_(w),
1380 signed_(s) {}
1381 int width_ = 0;
1382 bool signed_ = false;
1383};
1384
1385bool LogicalType::Impl::Int::is_applicable(parquet::Type::type primitive_type,
1386 int32_t primitive_length) const {
1387 return (primitive_type == parquet::Type::INT32 && width_ <= 32) ||
1388 (primitive_type == parquet::Type::INT64 && width_ == 64);
1389}
1390
1391bool LogicalType::Impl::Int::is_compatible(
1392 ConvertedType::type converted_type,
1393 schema::DecimalMetadata converted_decimal_metadata) const {
1394 if (converted_decimal_metadata.isset) {
1395 return false;
1396 } else if (signed_ && width_ == 8) {
1397 return converted_type == ConvertedType::INT_8;
1398 } else if (signed_ && width_ == 16) {
1399 return converted_type == ConvertedType::INT_16;
1400 } else if (signed_ && width_ == 32) {
1401 return converted_type == ConvertedType::INT_32;
1402 } else if (signed_ && width_ == 64) {
1403 return converted_type == ConvertedType::INT_64;
1404 } else if (!signed_ && width_ == 8) {
1405 return converted_type == ConvertedType::UINT_8;
1406 } else if (!signed_ && width_ == 16) {
1407 return converted_type == ConvertedType::UINT_16;
1408 } else if (!signed_ && width_ == 32) {
1409 return converted_type == ConvertedType::UINT_32;
1410 } else if (!signed_ && width_ == 64) {
1411 return converted_type == ConvertedType::UINT_64;
1412 } else {
1413 return false;
1414 }
1415}
1416
1417ConvertedType::type LogicalType::Impl::Int::ToConvertedType(
1418 schema::DecimalMetadata* out_decimal_metadata) const {
1419 reset_decimal_metadata(out_decimal_metadata);
1420 if (signed_) {
1421 switch (width_) {
1422 case 8:
1423 return ConvertedType::INT_8;
1424 case 16:
1425 return ConvertedType::INT_16;
1426 case 32:
1427 return ConvertedType::INT_32;
1428 case 64:
1429 return ConvertedType::INT_64;
1430 }
1431 } else { // unsigned
1432 switch (width_) {
1433 case 8:
1434 return ConvertedType::UINT_8;
1435 case 16:
1436 return ConvertedType::UINT_16;
1437 case 32:
1438 return ConvertedType::UINT_32;
1439 case 64:
1440 return ConvertedType::UINT_64;
1441 }
1442 }
1443 return ConvertedType::NONE;
1444}
1445
1446std::string LogicalType::Impl::Int::ToString() const {
1447 std::stringstream type;
1448 type << "Int(bitWidth=" << width_ << ", isSigned=" << std::boolalpha << signed_ << ")";
1449 return type.str();
1450}
1451
1452std::string LogicalType::Impl::Int::ToJSON() const {
1453 std::stringstream json;
1454 json << R"({"Type": "Int", "bitWidth": )" << width_ << R"(, "isSigned": )"
1455 << std::boolalpha << signed_ << "}";
1456 return json.str();
1457}
1458
1459format::LogicalType LogicalType::Impl::Int::ToThrift() const {
1460 format::LogicalType type;
1461 format::IntType int_type;
1462 DCHECK(width_ == 64 || width_ == 32 || width_ == 16 || width_ == 8);
1463 int_type.__set_bitWidth(static_cast<int8_t>(width_));
1464 int_type.__set_isSigned(signed_);
1465 type.__set_INTEGER(int_type);
1466 return type;
1467}
1468
1469bool LogicalType::Impl::Int::Equals(const LogicalType& other) const {
1470 bool eq = false;
1471 if (other.is_int()) {
1472 const auto& other_int = checked_cast<const IntLogicalType&>(other);
1473 eq = (width_ == other_int.bit_width() && signed_ == other_int.is_signed());
1474 }
1475 return eq;
1476}
1477
1478std::shared_ptr<const LogicalType> IntLogicalType::Make(int bit_width, bool is_signed) {
1479 if (bit_width == 8 || bit_width == 16 || bit_width == 32 || bit_width == 64) {
1480 auto* logical_type = new IntLogicalType();
1481 logical_type->impl_.reset(new LogicalType::Impl::Int(bit_width, is_signed));
1482 return std::shared_ptr<const LogicalType>(logical_type);
1483 } else {
1484 throw ParquetException(
1485 "Bit width must be exactly 8, 16, 32, or 64 for Int logical type");
1486 }
1487}
1488
1489int IntLogicalType::bit_width() const {
1490 return (dynamic_cast<const LogicalType::Impl::Int&>(*impl_)).bit_width();
1491}
1492
1493bool IntLogicalType::is_signed() const {
1494 return (dynamic_cast<const LogicalType::Impl::Int&>(*impl_)).is_signed();
1495}
1496
// Implementation of the null logical type. The constructor registers it as
// LogicalType::Type::NIL with SortOrder::UNKNOWN; it derives from Incompatible
// and UniversalApplicable.
class LogicalType::Impl::Null final : public LogicalType::Impl::Incompatible,
                                      public LogicalType::Impl::UniversalApplicable {
 public:
  friend class NullLogicalType;

  // NOTE(review): both macros are defined elsewhere; presumably they supply
  // the ToString() and ToThrift() overrides -- confirm against the macros.
  OVERRIDE_TOSTRING(Null)
  OVERRIDE_TOTHRIFT(NullType, UNKNOWN)

 private:
  // Private: NullLogicalType is the only friend of this class.
  Null() : LogicalType::Impl(LogicalType::Type::NIL, SortOrder::UNKNOWN) {}
};

// NOTE(review): GENERATE_MAKE is defined elsewhere; presumably it emits
// NullLogicalType::Make() -- confirm against the macro definition.
GENERATE_MAKE(Null)
1510
// Implementation of the JSON logical type. The constructor registers it as
// LogicalType::Type::JSON with SortOrder::UNSIGNED, hands ConvertedType::JSON
// to SimpleCompatible and parquet::Type::BYTE_ARRAY to SimpleApplicable.
class LogicalType::Impl::JSON final : public LogicalType::Impl::SimpleCompatible,
                                      public LogicalType::Impl::SimpleApplicable {
 public:
  friend class JSONLogicalType;

  // NOTE(review): both macros are defined elsewhere; presumably they supply
  // the ToString() and ToThrift() overrides -- confirm against the macros.
  OVERRIDE_TOSTRING(JSON)
  OVERRIDE_TOTHRIFT(JsonType, JSON)

 private:
  // Private: JSONLogicalType is the only friend of this class.
  JSON()
      : LogicalType::Impl(LogicalType::Type::JSON, SortOrder::UNSIGNED),
        LogicalType::Impl::SimpleCompatible(ConvertedType::JSON),
        LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
};

// NOTE(review): GENERATE_MAKE is defined elsewhere; presumably it emits
// JSONLogicalType::Make() -- confirm against the macro definition.
GENERATE_MAKE(JSON)
1527
// Implementation of the BSON logical type. The constructor registers it as
// LogicalType::Type::BSON with SortOrder::UNSIGNED, hands ConvertedType::BSON
// to SimpleCompatible and parquet::Type::BYTE_ARRAY to SimpleApplicable.
class LogicalType::Impl::BSON final : public LogicalType::Impl::SimpleCompatible,
                                      public LogicalType::Impl::SimpleApplicable {
 public:
  friend class BSONLogicalType;

  // NOTE(review): both macros are defined elsewhere; presumably they supply
  // the ToString() and ToThrift() overrides -- confirm against the macros.
  OVERRIDE_TOSTRING(BSON)
  OVERRIDE_TOTHRIFT(BsonType, BSON)

 private:
  // Private: BSONLogicalType is the only friend of this class.
  BSON()
      : LogicalType::Impl(LogicalType::Type::BSON, SortOrder::UNSIGNED),
        LogicalType::Impl::SimpleCompatible(ConvertedType::BSON),
        LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
};

// NOTE(review): GENERATE_MAKE is defined elsewhere; presumably it emits
// BSONLogicalType::Make() -- confirm against the macro definition.
GENERATE_MAKE(BSON)
1544
// Implementation of the UUID logical type. The constructor registers it as
// LogicalType::Type::UUID with SortOrder::UNSIGNED and hands
// (parquet::Type::FIXED_LEN_BYTE_ARRAY, 16) to TypeLengthApplicable; it
// derives from Incompatible.
class LogicalType::Impl::UUID final : public LogicalType::Impl::Incompatible,
                                      public LogicalType::Impl::TypeLengthApplicable {
 public:
  friend class UUIDLogicalType;

  // NOTE(review): both macros are defined elsewhere; presumably they supply
  // the ToString() and ToThrift() overrides -- confirm against the macros.
  OVERRIDE_TOSTRING(UUID)
  OVERRIDE_TOTHRIFT(UUIDType, UUID)

 private:
  // Private: UUIDLogicalType is the only friend of this class.
  UUID()
      : LogicalType::Impl(LogicalType::Type::UUID, SortOrder::UNSIGNED),
        LogicalType::Impl::TypeLengthApplicable(parquet::Type::FIXED_LEN_BYTE_ARRAY, 16) {
  }
};

// NOTE(review): GENERATE_MAKE is defined elsewhere; presumably it emits
// UUIDLogicalType::Make() -- confirm against the macro definition.
GENERATE_MAKE(UUID)
1561
// Implementation of the "no logical type" placeholder. The constructor
// registers it as LogicalType::Type::NONE with SortOrder::UNKNOWN and hands
// ConvertedType::NONE to SimpleCompatible; it derives from
// UniversalApplicable.
class LogicalType::Impl::No final : public LogicalType::Impl::SimpleCompatible,
                                    public LogicalType::Impl::UniversalApplicable {
 public:
  friend class NoLogicalType;

  // Note the macro argument is "None", not the class name "No".
  // NOTE(review): OVERRIDE_TOSTRING is defined elsewhere; presumably it
  // supplies the ToString() override -- confirm against the macro.
  OVERRIDE_TOSTRING(None)

 private:
  // Private: NoLogicalType is the only friend of this class.
  No()
      : LogicalType::Impl(LogicalType::Type::NONE, SortOrder::UNKNOWN),
        LogicalType::Impl::SimpleCompatible(ConvertedType::NONE) {}
};

// NOTE(review): GENERATE_MAKE is defined elsewhere; presumably it emits
// NoLogicalType::Make() -- confirm against the macro definition.
GENERATE_MAKE(No)
1576
// Implementation of the "unknown logical type" placeholder. The constructor
// registers it as LogicalType::Type::UNKNOWN with SortOrder::UNKNOWN and hands
// ConvertedType::NA to SimpleCompatible; it derives from UniversalApplicable.
class LogicalType::Impl::Unknown final : public LogicalType::Impl::SimpleCompatible,
                                         public LogicalType::Impl::UniversalApplicable {
 public:
  friend class UnknownLogicalType;

  // NOTE(review): OVERRIDE_TOSTRING is defined elsewhere; presumably it
  // supplies the ToString() override -- confirm against the macro.
  OVERRIDE_TOSTRING(Unknown)

 private:
  // Private: UnknownLogicalType is the only friend of this class.
  Unknown()
      : LogicalType::Impl(LogicalType::Type::UNKNOWN, SortOrder::UNKNOWN),
        LogicalType::Impl::SimpleCompatible(ConvertedType::NA) {}
};

// NOTE(review): GENERATE_MAKE is defined elsewhere; presumably it emits
// UnknownLogicalType::Make() -- confirm against the macro definition.
GENERATE_MAKE(Unknown)
1591
1592namespace internal {
1593
1594/// \brief Compute the number of bytes required to represent a decimal of a
1595/// given precision. Taken from the Apache Impala codebase. The comments next
1596/// to the return values are the maximum value that can be represented in 2's
1597/// complement with the returned number of bytes.
1598int32_t DecimalSize(int32_t precision) {
1599 DCHECK_GE(precision, 1) << "decimal precision must be greater than or equal to 1, got "
1600 << precision;
1601 DCHECK_LE(precision, 38) << "decimal precision must be less than or equal to 38, got "
1602 << precision;
1603
1604 switch (precision) {
1605 case 1:
1606 case 2:
1607 return 1; // 127
1608 case 3:
1609 case 4:
1610 return 2; // 32,767
1611 case 5:
1612 case 6:
1613 return 3; // 8,388,607
1614 case 7:
1615 case 8:
1616 case 9:
1617 return 4; // 2,147,483,427
1618 case 10:
1619 case 11:
1620 return 5; // 549,755,813,887
1621 case 12:
1622 case 13:
1623 case 14:
1624 return 6; // 140,737,488,355,327
1625 case 15:
1626 case 16:
1627 return 7; // 36,028,797,018,963,967
1628 case 17:
1629 case 18:
1630 return 8; // 9,223,372,036,854,775,807
1631 case 19:
1632 case 20:
1633 case 21:
1634 return 9; // 2,361,183,241,434,822,606,847
1635 case 22:
1636 case 23:
1637 return 10; // 604,462,909,807,314,587,353,087
1638 case 24:
1639 case 25:
1640 case 26:
1641 return 11; // 154,742,504,910,672,534,362,390,527
1642 case 27:
1643 case 28:
1644 return 12; // 39,614,081,257,132,168,796,771,975,167
1645 case 29:
1646 case 30:
1647 case 31:
1648 return 13; // 10,141,204,801,825,835,211,973,625,643,007
1649 case 32:
1650 case 33:
1651 return 14; // 2,596,148,429,267,413,814,265,248,164,610,047
1652 case 34:
1653 case 35:
1654 return 15; // 664,613,997,892,457,936,451,903,530,140,172,287
1655 case 36:
1656 case 37:
1657 case 38:
1658 return 16; // 170,141,183,460,469,231,731,687,303,715,884,105,727
1659 default:
1660 break;
1661 }
1662 DCHECK(false);
1663 return -1;
1664}
1665
1666} // namespace internal
1667
1668} // namespace parquet
1669