1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#ifndef PARQUET_TYPES_H
19#define PARQUET_TYPES_H
20
#include <algorithm>
#include <cinttypes>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <sstream>
#include <string>
27
28#include "arrow/util/macros.h"
29
30#include "parquet/util/macros.h"
31#include "parquet/util/visibility.h"
32
33namespace parquet {
34
35// ----------------------------------------------------------------------
36// Metadata enums to match Thrift metadata
37//
38// The reason we maintain our own enums is to avoid transitive dependency on
39// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
40// public API. After building parquet-cpp, you should not need to include
41// Thrift headers in your application. This means some boilerplate to convert
42// between our types and Parquet's Thrift types.
43//
// We can also add special values like NONE to distinguish between metadata
// values being set and not set. As an example, consider ConvertedType and
// CompressionCodec.
47
// Primitive value types. Mirrors parquet::Type.
//
// The enum is nested in a struct so that enumerators are spelled with a
// qualifying prefix (e.g. Type::INT32). Each enumerator has a matching
// type_traits<> specialization later in this header that supplies the C++
// value type used to hold values of that Parquet type.
struct Type {
  enum type {
    BOOLEAN = 0,
    INT32 = 1,
    INT64 = 2,
    INT96 = 3,
    FLOAT = 4,
    DOUBLE = 5,
    BYTE_ARRAY = 6,
    FIXED_LEN_BYTE_ARRAY = 7
  };
};
61
// Logical (converted) type annotations. Mirrors parquet::ConvertedType, with
// an extra NONE value for "no annotation set".
//
// Values are written out explicitly; they are the same values the compiler
// would assign implicitly. Note the gap between INTERVAL (22) and NA (25).
struct LogicalType {
  enum type {
    NONE = 0,
    UTF8 = 1,
    MAP = 2,
    MAP_KEY_VALUE = 3,
    LIST = 4,
    ENUM = 5,
    DECIMAL = 6,
    DATE = 7,
    TIME_MILLIS = 8,
    TIME_MICROS = 9,
    TIMESTAMP_MILLIS = 10,
    TIMESTAMP_MICROS = 11,
    UINT_8 = 12,
    UINT_16 = 13,
    UINT_32 = 14,
    UINT_64 = 15,
    INT_8 = 16,
    INT_16 = 17,
    INT_32 = 18,
    INT_64 = 19,
    JSON = 20,
    BSON = 21,
    INTERVAL = 22,
    NA = 25
  };
};
91
// Field repetition kinds. Mirrors parquet::FieldRepetitionType.
// Enumerator values are assigned explicitly (0, 1, 2).
struct Repetition {
  enum type { REQUIRED = 0, OPTIONAL = 1, REPEATED = 2 };
};
96
// Data encodings. Mirrors parquet::Encoding.
//
// Values are assigned explicitly. Note that no enumerator is given the
// value 1: the sequence jumps from PLAIN (0) to PLAIN_DICTIONARY (2).
struct Encoding {
  enum type {
    PLAIN = 0,
    PLAIN_DICTIONARY = 2,
    RLE = 3,
    BIT_PACKED = 4,
    DELTA_BINARY_PACKED = 5,
    DELTA_LENGTH_BYTE_ARRAY = 6,
    DELTA_BYTE_ARRAY = 7,
    RLE_DICTIONARY = 8
  };
};
110
// Compression codecs. Mirrors parquet::CompressionCodec.
// Values are spelled out; they equal the implicitly assigned ones.
struct Compression {
  enum type {
    UNCOMPRESSED = 0,
    SNAPPY = 1,
    GZIP = 2,
    LZO = 3,
    BROTLI = 4,
    LZ4 = 5,
    ZSTD = 6
  };
};
115
// Encryption algorithm identifiers, with explicitly assigned values.
// NOTE(review): presumably mirrors the encryption algorithm definitions in
// the Parquet Thrift metadata, like the other enums here -- confirm.
struct Encryption {
  enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 };
};
119
// Page kinds. Mirrors parquet::PageType.
// Values are spelled out; they equal the implicitly assigned ones.
struct PageType {
  enum type {
    DATA_PAGE = 0,
    INDEX_PAGE = 1,
    DICTIONARY_PAGE = 2,
    DATA_PAGE_V2 = 3
  };
};
124
125// Reference:
126// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
127// format/converter/ParquetMetadataConverter.java
128// Sort order for page and column statistics. Types are associated with sort
129// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are
130// aggregated using a sort order. As of parquet-format version 2.3.1, the
131// order used to aggregate stats is always SIGNED and is not stored in the
132// Parquet file. These stats are discarded for types that need unsigned.
133// See PARQUET-686.
struct SortOrder {
  enum type {
    SIGNED = 0,
    UNSIGNED = 1,
    UNKNOWN = 2
  };
};
137
// Small value class wrapping a column-order enumerator.
//
// A default-constructed ColumnOrder holds TYPE_DEFINED_ORDER; the explicit
// constructor stores whichever enumerator it is given.
class ColumnOrder {
 public:
  enum type { UNDEFINED, TYPE_DEFINED_ORDER };

  explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}

  // Default to Type Defined Order
  ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}

  // Returns the stored order. Declared const so it can be called on const
  // objects and through const references; it does not modify the object.
  ColumnOrder::type get_order() const { return column_order_; }

  // Shared instances; only declared here, the definitions live outside this
  // header. NOTE(review): presumably they hold UNDEFINED and
  // TYPE_DEFINED_ORDER respectively -- confirm at the definition site.
  static ColumnOrder undefined_;
  static ColumnOrder type_defined_;

 private:
  ColumnOrder::type column_order_;
};
152
153// ----------------------------------------------------------------------
154
155struct ByteArray {
156 ByteArray() : len(0), ptr(NULLPTR) {}
157 ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {}
158 uint32_t len;
159 const uint8_t* ptr;
160};
161
162inline bool operator==(const ByteArray& left, const ByteArray& right) {
163 return left.len == right.len &&
164 (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0);
165}
166
167inline bool operator!=(const ByteArray& left, const ByteArray& right) {
168 return !(left == right);
169}
170
171struct FixedLenByteArray {
172 FixedLenByteArray() : ptr(NULLPTR) {}
173 explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {}
174 const uint8_t* ptr;
175};
176
177using FLBA = FixedLenByteArray;
178
// Julian day at unix epoch.
//
// The Julian Day Number (JDN) is the integer assigned to a whole solar day in
// the Julian day count starting from noon Universal time, with Julian day
// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC,
// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian
// calendar).
constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588);

// Duration of one day in progressively finer units. INT64_C is applied to a
// single integer constant (its argument must not be an expression), which
// makes the whole product a 64-bit computation.
constexpr int64_t kSecondsPerDay = INT64_C(60) * 60 * 24;
constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000);
constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000);
constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000);
191
// 96-bit value stored as three 32-bit words.
//
// Within this header, Int96SetNanoSeconds and Int96GetNanoSeconds treat the
// first 8 bytes (value[0] and value[1]) as a 64-bit nanosecond count, and
// Int96GetNanoSeconds reads value[2] as a Julian day number.
//
// NOTE(review): MANUALLY_ALIGNED_STRUCT and STRUCT_END come from an included
// macro header; presumably they request 1-byte alignment and check that the
// struct is exactly 12 bytes -- confirm against the macro definitions.
MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; };
STRUCT_END(Int96, 12);
194
195inline bool operator==(const Int96& left, const Int96& right) {
196 return std::equal(left.value, left.value + 3, right.value);
197}
198
199inline bool operator!=(const Int96& left, const Int96& right) { return !(left == right); }
200
201static inline std::string ByteArrayToString(const ByteArray& a) {
202 return std::string(reinterpret_cast<const char*>(a.ptr), a.len);
203}
204
205static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) {
206 std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds));
207}
208
209static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) {
210 int64_t days_since_epoch = i96.value[2] - kJulianToUnixEpochDays;
211 int64_t nanoseconds = 0;
212
213 memcpy(&nanoseconds, &i96.value, sizeof(int64_t));
214 return days_since_epoch * kNanosecondsPerDay + nanoseconds;
215}
216
217static inline std::string Int96ToString(const Int96& a) {
218 std::ostringstream result;
219 std::copy(a.value, a.value + 3, std::ostream_iterator<uint32_t>(result, " "));
220 return result.str();
221}
222
223static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) {
224 std::ostringstream result;
225 std::copy(a.ptr, a.ptr + len, std::ostream_iterator<uint32_t>(result, " "));
226 return result.str();
227}
228
229template <Type::type TYPE>
230struct type_traits {};
231
232template <>
233struct type_traits<Type::BOOLEAN> {
234 using value_type = bool;
235
236 static constexpr int value_byte_size = 1;
237 static constexpr const char* printf_code = "d";
238};
239
240template <>
241struct type_traits<Type::INT32> {
242 using value_type = int32_t;
243
244 static constexpr int value_byte_size = 4;
245 static constexpr const char* printf_code = "d";
246};
247
248template <>
249struct type_traits<Type::INT64> {
250 using value_type = int64_t;
251
252 static constexpr int value_byte_size = 8;
253 static constexpr const char* printf_code = "ld";
254};
255
256template <>
257struct type_traits<Type::INT96> {
258 using value_type = Int96;
259
260 static constexpr int value_byte_size = 12;
261 static constexpr const char* printf_code = "s";
262};
263
264template <>
265struct type_traits<Type::FLOAT> {
266 using value_type = float;
267
268 static constexpr int value_byte_size = 4;
269 static constexpr const char* printf_code = "f";
270};
271
272template <>
273struct type_traits<Type::DOUBLE> {
274 using value_type = double;
275
276 static constexpr int value_byte_size = 8;
277 static constexpr const char* printf_code = "lf";
278};
279
280template <>
281struct type_traits<Type::BYTE_ARRAY> {
282 using value_type = ByteArray;
283
284 static constexpr int value_byte_size = sizeof(ByteArray);
285 static constexpr const char* printf_code = "s";
286};
287
288template <>
289struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> {
290 using value_type = FixedLenByteArray;
291
292 static constexpr int value_byte_size = sizeof(FixedLenByteArray);
293 static constexpr const char* printf_code = "s";
294};
295
// Tag type that bundles a Parquet primitive type with its C++ value type.
//
//   c_type   - the value type taken from type_traits<TYPE>::value_type
//   type_num - the Type::type enumerator this tag stands for
template <Type::type TYPE>
struct DataType {
  using c_type = typename type_traits<TYPE>::value_type;
  static constexpr Type::type type_num = TYPE;
};

// One named alias per Parquet primitive type.
using BooleanType = DataType<Type::BOOLEAN>;
using Int32Type = DataType<Type::INT32>;
using Int64Type = DataType<Type::INT64>;
using Int96Type = DataType<Type::INT96>;
using FloatType = DataType<Type::FLOAT>;
using DoubleType = DataType<Type::DOUBLE>;
using ByteArrayType = DataType<Type::BYTE_ARRAY>;
using FLBAType = DataType<Type::FIXED_LEN_BYTE_ARRAY>;
310
311template <typename Type>
312inline std::string format_fwf(int width) {
313 std::stringstream ss;
314 ss << "%-" << width << type_traits<Type::type_num>::printf_code;
315 return ss.str();
316}
317
// Exported helper functions. Only the declarations are visible here; the
// implementations live outside this header, so the one-line descriptions
// below are inferred from names and signatures -- confirm against the
// definitions.

// Presumably returns a printable name for the compression codec `t`.
PARQUET_EXPORT std::string CompressionToString(Compression::type t);

// Presumably returns a printable name for the encoding `t`.
PARQUET_EXPORT std::string EncodingToString(Encoding::type t);

// Presumably returns a printable name for the logical type `t`.
PARQUET_EXPORT std::string LogicalTypeToString(LogicalType::type t);

// Presumably returns a printable name for the primitive type `t`.
PARQUET_EXPORT std::string TypeToString(Type::type t);

// Presumably renders a statistics value, passed as a string `val`, in a
// printable form according to `parquet_type`.
PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
                                           const std::string& val);

/// \deprecated Since 1.5.0
ARROW_DEPRECATED("Use std::string instead of char* as input")
PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const char* val);

// Presumably returns the byte size associated with primitive type `t`.
PARQUET_EXPORT int GetTypeByteSize(Type::type t);

// Presumably returns the sort order used by default for a primitive type.
PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive);

// Presumably returns the sort order for a logical (converted) type combined
// with its underlying primitive type.
PARQUET_EXPORT SortOrder::type GetSortOrder(LogicalType::type converted,
                                            Type::type primitive);
339
340} // namespace parquet
341
342#endif // PARQUET_TYPES_H
343