1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef PARQUET_TYPES_H |
19 | #define PARQUET_TYPES_H |
20 | |
21 | #include <algorithm> |
22 | #include <cstdint> |
23 | #include <cstring> |
24 | #include <iterator> |
25 | #include <sstream> |
26 | #include <string> |
27 | |
28 | #include "arrow/util/macros.h" |
29 | |
30 | #include "parquet/util/macros.h" |
31 | #include "parquet/util/visibility.h" |
32 | |
33 | namespace parquet { |
34 | |
35 | // ---------------------------------------------------------------------- |
36 | // Metadata enums to match Thrift metadata |
37 | // |
38 | // The reason we maintain our own enums is to avoid transitive dependency on |
39 | // the compiled Thrift headers (and thus thrift/Thrift.h) for users of the |
40 | // public API. After building parquet-cpp, you should not need to include |
41 | // Thrift headers in your application. This means some boilerplate to convert |
42 | // between our types and Parquet's Thrift types. |
43 | // |
// We can also add special values like NONE to distinguish between metadata
// values being set and not set. As an example, consider ConvertedType and
// CompressionCodec.
47 | |
// Physical (primitive) storage types. Mirrors parquet::Type.
//
// The enum is wrapped in a struct so that enumerators are spelled
// Type::INT32 etc. and the enum type itself is Type::type.
struct Type {
  enum type {
    BOOLEAN = 0,
    INT32 = 1,
    INT64 = 2,
    INT96 = 3,
    FLOAT = 4,
    DOUBLE = 5,
    BYTE_ARRAY = 6,
    FIXED_LEN_BYTE_ARRAY = 7
  };
};
61 | |
// Logical (converted) type annotations. Mirrors parquet::ConvertedType.
//
// Enumerator values are written out explicitly. They run consecutively from
// NONE (0) through INTERVAL (22); NA is pinned to 25, so 23 and 24 are not
// assigned to any enumerator here.
struct LogicalType {
  enum type {
    NONE = 0,
    UTF8 = 1,
    MAP = 2,
    MAP_KEY_VALUE = 3,
    LIST = 4,
    ENUM = 5,
    DECIMAL = 6,
    DATE = 7,
    TIME_MILLIS = 8,
    TIME_MICROS = 9,
    TIMESTAMP_MILLIS = 10,
    TIMESTAMP_MICROS = 11,
    UINT_8 = 12,
    UINT_16 = 13,
    UINT_32 = 14,
    UINT_64 = 15,
    INT_8 = 16,
    INT_16 = 17,
    INT_32 = 18,
    INT_64 = 19,
    JSON = 20,
    BSON = 21,
    INTERVAL = 22,
    NA = 25
  };
};
91 | |
// Field repetition kinds. Mirrors parquet::FieldRepetitionType.
struct Repetition {
  enum type {
    REQUIRED = 0,
    OPTIONAL = 1,
    REPEATED = 2
  };
};
96 | |
// Data encodings. Mirrors parquet::Encoding.
//
// Note that no enumerator in this list has the value 1.
struct Encoding {
  enum type {
    PLAIN = 0,
    PLAIN_DICTIONARY = 2,
    RLE = 3,
    BIT_PACKED = 4,
    DELTA_BINARY_PACKED = 5,
    DELTA_LENGTH_BYTE_ARRAY = 6,
    DELTA_BYTE_ARRAY = 7,
    RLE_DICTIONARY = 8
  };
};
110 | |
// Compression codecs. Mirrors parquet::CompressionCodec.
struct Compression {
  enum type {
    UNCOMPRESSED = 0,
    SNAPPY = 1,
    GZIP = 2,
    LZO = 3,
    BROTLI = 4,
    LZ4 = 5,
    ZSTD = 6
  };
};
115 | |
// Encryption algorithm identifiers.
struct Encryption {
  enum type {
    AES_GCM_V1 = 0,
    AES_GCM_CTR_V1 = 1
  };
};
119 | |
// Page kinds. Mirrors parquet::PageType.
struct PageType {
  enum type {
    DATA_PAGE = 0,
    INDEX_PAGE = 1,
    DICTIONARY_PAGE = 2,
    DATA_PAGE_V2 = 3
  };
};
124 | |
// Sort order for page and column statistics.
//
// Reference:
// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
// format/converter/ParquetMetadataConverter.java
//
// Types are associated with sort orders (e.g., UTF8 columns should use
// UNSIGNED) and column stats are aggregated using a sort order. As of
// parquet-format version 2.3.1, the order used to aggregate stats is always
// SIGNED and is not stored in the Parquet file. These stats are discarded for
// types that need unsigned. See PARQUET-686.
struct SortOrder {
  enum type {
    SIGNED = 0,
    UNSIGNED = 1,
    UNKNOWN = 2
  };
};
137 | |
// Small value wrapper around a column-order enum.
//
// A default-constructed ColumnOrder holds TYPE_DEFINED_ORDER. Two static
// instances, undefined_ and type_defined_, are declared here; their
// definitions live outside this header.
class ColumnOrder {
 public:
  enum type { UNDEFINED, TYPE_DEFINED_ORDER };

  // Wraps the given order value.
  explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}

  // Default to Type Defined Order
  ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}

  // Returns the wrapped order value. Declared const so it can be called on
  // const objects and through const references; it never modifies state.
  ColumnOrder::type get_order() const { return column_order_; }

  static ColumnOrder undefined_;
  static ColumnOrder type_defined_;

 private:
  ColumnOrder::type column_order_;
};
152 | |
153 | // ---------------------------------------------------------------------- |
154 | |
155 | struct ByteArray { |
156 | ByteArray() : len(0), ptr(NULLPTR) {} |
157 | ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} |
158 | uint32_t len; |
159 | const uint8_t* ptr; |
160 | }; |
161 | |
162 | inline bool operator==(const ByteArray& left, const ByteArray& right) { |
163 | return left.len == right.len && |
164 | (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0); |
165 | } |
166 | |
167 | inline bool operator!=(const ByteArray& left, const ByteArray& right) { |
168 | return !(left == right); |
169 | } |
170 | |
171 | struct FixedLenByteArray { |
172 | FixedLenByteArray() : ptr(NULLPTR) {} |
173 | explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {} |
174 | const uint8_t* ptr; |
175 | }; |
176 | |
177 | using FLBA = FixedLenByteArray; |
178 | |
// Julian day at unix epoch.
//
// The Julian Day Number (JDN) is the integer assigned to a whole solar day in
// the Julian day count starting from noon Universal time, with Julian day
// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC,
// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian
// calendar).
constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588);

// Durations of one day in progressively finer units.
//
// INT64_C is only specified for a single unsuffixed integer constant, so it
// is applied to the first factor alone; the remaining multiplications are
// then carried out in a 64-bit type.
constexpr int64_t kSecondsPerDay = INT64_C(60) * 60 * 24;
constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000);
constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000);
constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000);
191 | |
// 96-bit value stored as three 32-bit words.
//
// The helpers in this header (Int96SetNanoSeconds / Int96GetNanoSeconds)
// treat the first 8 bytes of `value` as a nanosecond count and value[2] as a
// Julian day number.
//
// NOTE(review): MANUALLY_ALIGNED_STRUCT / STRUCT_END are project macros
// (presumably from parquet/util/macros.h). They look like they request 1-byte
// alignment and check that sizeof(Int96) == 12 -- confirm against the macro
// definitions.
MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; };
STRUCT_END(Int96, 12);
194 | |
195 | inline bool operator==(const Int96& left, const Int96& right) { |
196 | return std::equal(left.value, left.value + 3, right.value); |
197 | } |
198 | |
199 | inline bool operator!=(const Int96& left, const Int96& right) { return !(left == right); } |
200 | |
201 | static inline std::string ByteArrayToString(const ByteArray& a) { |
202 | return std::string(reinterpret_cast<const char*>(a.ptr), a.len); |
203 | } |
204 | |
205 | static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) { |
206 | std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds)); |
207 | } |
208 | |
209 | static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) { |
210 | int64_t days_since_epoch = i96.value[2] - kJulianToUnixEpochDays; |
211 | int64_t nanoseconds = 0; |
212 | |
213 | memcpy(&nanoseconds, &i96.value, sizeof(int64_t)); |
214 | return days_since_epoch * kNanosecondsPerDay + nanoseconds; |
215 | } |
216 | |
217 | static inline std::string Int96ToString(const Int96& a) { |
218 | std::ostringstream result; |
219 | std::copy(a.value, a.value + 3, std::ostream_iterator<uint32_t>(result, " " )); |
220 | return result.str(); |
221 | } |
222 | |
223 | static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) { |
224 | std::ostringstream result; |
225 | std::copy(a.ptr, a.ptr + len, std::ostream_iterator<uint32_t>(result, " " )); |
226 | return result.str(); |
227 | } |
228 | |
// Compile-time properties of a physical type, keyed by Type::type.
//
// The primary template is empty; value_type, value_byte_size and printf_code
// are provided by the explicit specializations in this header.
template <Type::type TYPE>
struct type_traits {};
231 | |
232 | template <> |
233 | struct type_traits<Type::BOOLEAN> { |
234 | using value_type = bool; |
235 | |
236 | static constexpr int value_byte_size = 1; |
237 | static constexpr const char* printf_code = "d" ; |
238 | }; |
239 | |
240 | template <> |
241 | struct type_traits<Type::INT32> { |
242 | using value_type = int32_t; |
243 | |
244 | static constexpr int value_byte_size = 4; |
245 | static constexpr const char* printf_code = "d" ; |
246 | }; |
247 | |
248 | template <> |
249 | struct type_traits<Type::INT64> { |
250 | using value_type = int64_t; |
251 | |
252 | static constexpr int value_byte_size = 8; |
253 | static constexpr const char* printf_code = "ld" ; |
254 | }; |
255 | |
256 | template <> |
257 | struct type_traits<Type::INT96> { |
258 | using value_type = Int96; |
259 | |
260 | static constexpr int value_byte_size = 12; |
261 | static constexpr const char* printf_code = "s" ; |
262 | }; |
263 | |
264 | template <> |
265 | struct type_traits<Type::FLOAT> { |
266 | using value_type = float; |
267 | |
268 | static constexpr int value_byte_size = 4; |
269 | static constexpr const char* printf_code = "f" ; |
270 | }; |
271 | |
272 | template <> |
273 | struct type_traits<Type::DOUBLE> { |
274 | using value_type = double; |
275 | |
276 | static constexpr int value_byte_size = 8; |
277 | static constexpr const char* printf_code = "lf" ; |
278 | }; |
279 | |
280 | template <> |
281 | struct type_traits<Type::BYTE_ARRAY> { |
282 | using value_type = ByteArray; |
283 | |
284 | static constexpr int value_byte_size = sizeof(ByteArray); |
285 | static constexpr const char* printf_code = "s" ; |
286 | }; |
287 | |
288 | template <> |
289 | struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> { |
290 | using value_type = FixedLenByteArray; |
291 | |
292 | static constexpr int value_byte_size = sizeof(FixedLenByteArray); |
293 | static constexpr const char* printf_code = "s" ; |
294 | }; |
295 | |
296 | template <Type::type TYPE> |
297 | struct DataType { |
298 | using c_type = typename type_traits<TYPE>::value_type; |
299 | static constexpr Type::type type_num = TYPE; |
300 | }; |
301 | |
302 | using BooleanType = DataType<Type::BOOLEAN>; |
303 | using Int32Type = DataType<Type::INT32>; |
304 | using Int64Type = DataType<Type::INT64>; |
305 | using Int96Type = DataType<Type::INT96>; |
306 | using FloatType = DataType<Type::FLOAT>; |
307 | using DoubleType = DataType<Type::DOUBLE>; |
308 | using ByteArrayType = DataType<Type::BYTE_ARRAY>; |
309 | using FLBAType = DataType<Type::FIXED_LEN_BYTE_ARRAY>; |
310 | |
311 | template <typename Type> |
312 | inline std::string format_fwf(int width) { |
313 | std::stringstream ss; |
314 | ss << "%-" << width << type_traits<Type::type_num>::printf_code; |
315 | return ss.str(); |
316 | } |
317 | |
// The functions below are only declared here; their definitions are not
// visible in this header, so the notes describe the signatures and hedge
// anything about the exact results.

// Returns a string for the given compression codec.
// NOTE(review): presumably the codec's name -- confirm in the implementation.
PARQUET_EXPORT std::string CompressionToString(Compression::type t);

// Returns a string for the given encoding.
// NOTE(review): presumably the encoding's name -- confirm in the implementation.
PARQUET_EXPORT std::string EncodingToString(Encoding::type t);

// Returns a string for the given logical type.
// NOTE(review): presumably the logical type's name -- confirm in the
// implementation.
PARQUET_EXPORT std::string LogicalTypeToString(LogicalType::type t);

// Returns a string for the given physical type.
// NOTE(review): presumably the physical type's name -- confirm in the
// implementation.
PARQUET_EXPORT std::string TypeToString(Type::type t);

// Returns a string rendering of the statistics value `val` for a column of
// physical type `parquet_type`.
// NOTE(review): presumably `val` holds the raw encoded bytes of the value --
// confirm against the implementation.
PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
                                           const std::string& val);

/// \deprecated Since 1.5.0
ARROW_DEPRECATED("Use std::string instead of char* as input" )
PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const char* val);

// Returns a byte size for the given physical type.
// NOTE(review): presumably matches type_traits<>::value_byte_size -- confirm
// in the implementation.
PARQUET_EXPORT int GetTypeByteSize(Type::type t);

// Returns the sort order associated with a physical type on its own.
PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive);

// Returns the sort order for a (logical type, physical type) combination.
// NOTE(review): presumably falls back to DefaultSortOrder(primitive) when
// `converted` carries no ordering information -- confirm in the
// implementation.
PARQUET_EXPORT SortOrder::type GetSortOrder(LogicalType::type converted,
                                            Type::type primitive);
339 | |
340 | } // namespace parquet |
341 | |
342 | #endif // PARQUET_TYPES_H |
343 | |