1 | // Licensed to the Apache Software Foundation (ASF) under one |
---|---|
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include <cstdint> |
19 | #include <cstring> |
20 | #include <sstream> |
21 | #include <string> |
22 | |
23 | #include "parquet/types.h" |
24 | |
25 | namespace parquet { |
26 | |
27 | std::string FormatStatValue(Type::type parquet_type, const std::string& val) { |
28 | std::stringstream result; |
29 | switch (parquet_type) { |
30 | case Type::BOOLEAN: |
31 | result << reinterpret_cast<const bool*>(val.c_str())[0]; |
32 | break; |
33 | case Type::INT32: |
34 | result << reinterpret_cast<const int32_t*>(val.c_str())[0]; |
35 | break; |
36 | case Type::INT64: |
37 | result << reinterpret_cast<const int64_t*>(val.c_str())[0]; |
38 | break; |
39 | case Type::DOUBLE: |
40 | result << reinterpret_cast<const double*>(val.c_str())[0]; |
41 | break; |
42 | case Type::FLOAT: |
43 | result << reinterpret_cast<const float*>(val.c_str())[0]; |
44 | break; |
45 | case Type::INT96: { |
46 | auto const i32_val = reinterpret_cast<const int32_t*>(val.c_str()); |
47 | result << i32_val[0] << " "<< i32_val[1] << " "<< i32_val[2]; |
48 | break; |
49 | } |
50 | case Type::BYTE_ARRAY: { |
51 | return val; |
52 | } |
53 | case Type::FIXED_LEN_BYTE_ARRAY: { |
54 | return val; |
55 | } |
56 | default: |
57 | break; |
58 | } |
59 | return result.str(); |
60 | } |
61 | |
62 | std::string FormatStatValue(Type::type parquet_type, const char* val) { |
63 | std::stringstream result; |
64 | switch (parquet_type) { |
65 | case Type::BOOLEAN: |
66 | result << reinterpret_cast<const bool*>(val)[0]; |
67 | break; |
68 | case Type::INT32: |
69 | result << reinterpret_cast<const int32_t*>(val)[0]; |
70 | break; |
71 | case Type::INT64: |
72 | result << reinterpret_cast<const int64_t*>(val)[0]; |
73 | break; |
74 | case Type::DOUBLE: |
75 | result << reinterpret_cast<const double*>(val)[0]; |
76 | break; |
77 | case Type::FLOAT: |
78 | result << reinterpret_cast<const float*>(val)[0]; |
79 | break; |
80 | case Type::INT96: { |
81 | auto const i32_val = reinterpret_cast<const int32_t*>(val); |
82 | result << i32_val[0] << " "<< i32_val[1] << " "<< i32_val[2]; |
83 | break; |
84 | } |
85 | case Type::BYTE_ARRAY: { |
86 | result << val; |
87 | break; |
88 | } |
89 | case Type::FIXED_LEN_BYTE_ARRAY: { |
90 | result << val; |
91 | break; |
92 | } |
93 | default: |
94 | break; |
95 | } |
96 | return result.str(); |
97 | } |
98 | |
99 | std::string EncodingToString(Encoding::type t) { |
100 | switch (t) { |
101 | case Encoding::PLAIN: |
102 | return "PLAIN"; |
103 | case Encoding::PLAIN_DICTIONARY: |
104 | return "PLAIN_DICTIONARY"; |
105 | case Encoding::RLE: |
106 | return "RLE"; |
107 | case Encoding::BIT_PACKED: |
108 | return "BIT_PACKED"; |
109 | case Encoding::DELTA_BINARY_PACKED: |
110 | return "DELTA_BINARY_PACKED"; |
111 | case Encoding::DELTA_LENGTH_BYTE_ARRAY: |
112 | return "DELTA_LENGTH_BYTE_ARRAY"; |
113 | case Encoding::DELTA_BYTE_ARRAY: |
114 | return "DELTA_BYTE_ARRAY"; |
115 | case Encoding::RLE_DICTIONARY: |
116 | return "RLE_DICTIONARY"; |
117 | default: |
118 | return "UNKNOWN"; |
119 | } |
120 | } |
121 | |
122 | std::string CompressionToString(Compression::type t) { |
123 | switch (t) { |
124 | case Compression::UNCOMPRESSED: |
125 | return "UNCOMPRESSED"; |
126 | case Compression::SNAPPY: |
127 | return "SNAPPY"; |
128 | case Compression::GZIP: |
129 | return "GZIP"; |
130 | case Compression::LZO: |
131 | return "LZO"; |
132 | case Compression::BROTLI: |
133 | return "BROTLI"; |
134 | case Compression::LZ4: |
135 | return "LZ4"; |
136 | case Compression::ZSTD: |
137 | return "ZSTD"; |
138 | default: |
139 | return "UNKNOWN"; |
140 | } |
141 | } |
142 | |
143 | std::string TypeToString(Type::type t) { |
144 | switch (t) { |
145 | case Type::BOOLEAN: |
146 | return "BOOLEAN"; |
147 | case Type::INT32: |
148 | return "INT32"; |
149 | case Type::INT64: |
150 | return "INT64"; |
151 | case Type::INT96: |
152 | return "INT96"; |
153 | case Type::FLOAT: |
154 | return "FLOAT"; |
155 | case Type::DOUBLE: |
156 | return "DOUBLE"; |
157 | case Type::BYTE_ARRAY: |
158 | return "BYTE_ARRAY"; |
159 | case Type::FIXED_LEN_BYTE_ARRAY: |
160 | return "FIXED_LEN_BYTE_ARRAY"; |
161 | default: |
162 | return "UNKNOWN"; |
163 | } |
164 | } |
165 | |
166 | std::string LogicalTypeToString(LogicalType::type t) { |
167 | switch (t) { |
168 | case LogicalType::NONE: |
169 | return "NONE"; |
170 | case LogicalType::UTF8: |
171 | return "UTF8"; |
172 | case LogicalType::MAP_KEY_VALUE: |
173 | return "MAP_KEY_VALUE"; |
174 | case LogicalType::LIST: |
175 | return "LIST"; |
176 | case LogicalType::ENUM: |
177 | return "ENUM"; |
178 | case LogicalType::DECIMAL: |
179 | return "DECIMAL"; |
180 | case LogicalType::DATE: |
181 | return "DATE"; |
182 | case LogicalType::TIME_MILLIS: |
183 | return "TIME_MILLIS"; |
184 | case LogicalType::TIME_MICROS: |
185 | return "TIME_MICROS"; |
186 | case LogicalType::TIMESTAMP_MILLIS: |
187 | return "TIMESTAMP_MILLIS"; |
188 | case LogicalType::TIMESTAMP_MICROS: |
189 | return "TIMESTAMP_MICROS"; |
190 | case LogicalType::UINT_8: |
191 | return "UINT_8"; |
192 | case LogicalType::UINT_16: |
193 | return "UINT_16"; |
194 | case LogicalType::UINT_32: |
195 | return "UINT_32"; |
196 | case LogicalType::UINT_64: |
197 | return "UINT_64"; |
198 | case LogicalType::INT_8: |
199 | return "INT_8"; |
200 | case LogicalType::INT_16: |
201 | return "INT_16"; |
202 | case LogicalType::INT_32: |
203 | return "INT_32"; |
204 | case LogicalType::INT_64: |
205 | return "INT_64"; |
206 | case LogicalType::JSON: |
207 | return "JSON"; |
208 | case LogicalType::BSON: |
209 | return "BSON"; |
210 | case LogicalType::INTERVAL: |
211 | return "INTERVAL"; |
212 | default: |
213 | return "UNKNOWN"; |
214 | } |
215 | } |
216 | |
217 | int GetTypeByteSize(Type::type parquet_type) { |
218 | switch (parquet_type) { |
219 | case Type::BOOLEAN: |
220 | return type_traits<BooleanType::type_num>::value_byte_size; |
221 | case Type::INT32: |
222 | return type_traits<Int32Type::type_num>::value_byte_size; |
223 | case Type::INT64: |
224 | return type_traits<Int64Type::type_num>::value_byte_size; |
225 | case Type::INT96: |
226 | return type_traits<Int96Type::type_num>::value_byte_size; |
227 | case Type::DOUBLE: |
228 | return type_traits<DoubleType::type_num>::value_byte_size; |
229 | case Type::FLOAT: |
230 | return type_traits<FloatType::type_num>::value_byte_size; |
231 | case Type::BYTE_ARRAY: |
232 | return type_traits<ByteArrayType::type_num>::value_byte_size; |
233 | case Type::FIXED_LEN_BYTE_ARRAY: |
234 | return type_traits<FLBAType::type_num>::value_byte_size; |
235 | default: |
236 | return 0; |
237 | } |
238 | return 0; |
239 | } |
240 | |
241 | // Return the Sort Order of the Parquet Physical Types |
242 | SortOrder::type DefaultSortOrder(Type::type primitive) { |
243 | switch (primitive) { |
244 | case Type::BOOLEAN: |
245 | case Type::INT32: |
246 | case Type::INT64: |
247 | case Type::FLOAT: |
248 | case Type::DOUBLE: |
249 | return SortOrder::SIGNED; |
250 | case Type::BYTE_ARRAY: |
251 | case Type::FIXED_LEN_BYTE_ARRAY: |
252 | return SortOrder::UNSIGNED; |
253 | case Type::INT96: |
254 | return SortOrder::UNKNOWN; |
255 | } |
256 | return SortOrder::UNKNOWN; |
257 | } |
258 | |
259 | // Return the SortOrder of the Parquet Types using Logical or Physical Types |
260 | SortOrder::type GetSortOrder(LogicalType::type converted, Type::type primitive) { |
261 | if (converted == LogicalType::NONE) return DefaultSortOrder(primitive); |
262 | switch (converted) { |
263 | case LogicalType::INT_8: |
264 | case LogicalType::INT_16: |
265 | case LogicalType::INT_32: |
266 | case LogicalType::INT_64: |
267 | case LogicalType::DATE: |
268 | case LogicalType::TIME_MICROS: |
269 | case LogicalType::TIME_MILLIS: |
270 | case LogicalType::TIMESTAMP_MICROS: |
271 | case LogicalType::TIMESTAMP_MILLIS: |
272 | return SortOrder::SIGNED; |
273 | case LogicalType::UINT_8: |
274 | case LogicalType::UINT_16: |
275 | case LogicalType::UINT_32: |
276 | case LogicalType::UINT_64: |
277 | case LogicalType::ENUM: |
278 | case LogicalType::UTF8: |
279 | case LogicalType::BSON: |
280 | case LogicalType::JSON: |
281 | return SortOrder::UNSIGNED; |
282 | case LogicalType::DECIMAL: |
283 | case LogicalType::LIST: |
284 | case LogicalType::MAP: |
285 | case LogicalType::MAP_KEY_VALUE: |
286 | case LogicalType::INTERVAL: |
287 | case LogicalType::NONE: // required instead of default |
288 | case LogicalType::NA: // required instead of default |
289 | return SortOrder::UNKNOWN; |
290 | } |
291 | return SortOrder::UNKNOWN; |
292 | } |
293 | |
294 | ColumnOrder ColumnOrder::undefined_ = ColumnOrder(ColumnOrder::UNDEFINED); |
295 | ColumnOrder ColumnOrder::type_defined_ = ColumnOrder(ColumnOrder::TYPE_DEFINED_ORDER); |
296 | |
297 | } // namespace parquet |
298 |