1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | // This is a private header for string-to-number parsing utilities |
19 | |
20 | #ifndef ARROW_UTIL_PARSING_H |
21 | #define ARROW_UTIL_PARSING_H |
22 | |
23 | #include <cassert> |
24 | #include <chrono> |
25 | #include <limits> |
26 | #include <locale> |
27 | #include <memory> |
28 | #include <sstream> |
29 | #include <string> |
30 | #include <type_traits> |
31 | |
32 | #include <double-conversion/double-conversion.h> |
33 | |
34 | #include "arrow/type.h" |
35 | #include "arrow/type_traits.h" |
36 | #include "arrow/util/checked_cast.h" |
37 | #include "arrow/vendored/date.h" |
38 | |
39 | namespace arrow { |
40 | namespace internal { |
41 | |
42 | /// \brief A class providing conversion from strings to some Arrow data types |
43 | /// |
44 | /// Conversion is triggered by calling operator(). It returns true on |
45 | /// success, false on failure. |
46 | /// |
47 | /// The class may have a non-trivial construction cost in some cases, |
48 | /// so it's recommended to use a single instance many times, if doing bulk |
49 | /// conversion. |
50 | /// |
51 | template <typename ARROW_TYPE, typename Enable = void> |
52 | class StringConverter; |
53 | |
54 | template <> |
55 | class StringConverter<BooleanType> { |
56 | public: |
57 | using value_type = bool; |
58 | |
59 | bool operator()(const char* s, size_t length, value_type* out) { |
60 | if (length == 1) { |
61 | // "0" or "1"? |
62 | if (s[0] == '0') { |
63 | *out = false; |
64 | return true; |
65 | } |
66 | if (s[0] == '1') { |
67 | *out = true; |
68 | return true; |
69 | } |
70 | return false; |
71 | } |
72 | if (length == 4) { |
73 | // "true"? |
74 | *out = true; |
75 | return ((s[0] == 't' || s[0] == 'T') && (s[1] == 'r' || s[1] == 'R') && |
76 | (s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E')); |
77 | } |
78 | if (length == 5) { |
79 | // "false"? |
80 | *out = false; |
81 | return ((s[0] == 'f' || s[0] == 'F') && (s[1] == 'a' || s[1] == 'A') && |
82 | (s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') && |
83 | (s[4] == 'e' || s[4] == 'E')); |
84 | } |
85 | return false; |
86 | } |
87 | }; |
88 | |
89 | // Ideas for faster float parsing: |
90 | // - http://rapidjson.org/md_doc_internals.html#ParsingDouble |
91 | // - https://github.com/google/double-conversion [used here] |
92 | // - https://github.com/achan001/dtoa-fast |
93 | |
94 | template <class ARROW_TYPE> |
95 | class StringToFloatConverterMixin { |
96 | public: |
97 | using value_type = typename ARROW_TYPE::c_type; |
98 | |
99 | StringToFloatConverterMixin() |
100 | : main_converter_(flags_, main_junk_value_, main_junk_value_, "inf" , "nan" ), |
101 | fallback_converter_(flags_, fallback_junk_value_, fallback_junk_value_, "inf" , |
102 | "nan" ) {} |
103 | |
104 | bool operator()(const char* s, size_t length, value_type* out) { |
105 | value_type v; |
106 | // double-conversion doesn't give us an error flag but signals parse |
107 | // errors with sentinel values. Since a sentinel value can appear as |
108 | // legitimate input, we fallback on a second converter with a different |
109 | // sentinel to eliminate false errors. |
110 | TryConvert(main_converter_, s, length, &v); |
111 | if (ARROW_PREDICT_FALSE(v == static_cast<value_type>(main_junk_value_))) { |
112 | TryConvert(fallback_converter_, s, length, &v); |
113 | if (ARROW_PREDICT_FALSE(v == static_cast<value_type>(fallback_junk_value_))) { |
114 | return false; |
115 | } |
116 | } |
117 | *out = v; |
118 | return true; |
119 | } |
120 | |
121 | protected: |
122 | static const int flags_ = |
123 | double_conversion::StringToDoubleConverter::ALLOW_CASE_INSENSIBILITY; |
124 | // Two unlikely values to signal a parsing error |
125 | static constexpr double main_junk_value_ = 0.7066424364107089; |
126 | static constexpr double fallback_junk_value_ = 0.40088499148279166; |
127 | |
128 | double_conversion::StringToDoubleConverter main_converter_; |
129 | double_conversion::StringToDoubleConverter fallback_converter_; |
130 | |
131 | inline void TryConvert(double_conversion::StringToDoubleConverter& converter, |
132 | const char* s, size_t length, float* out) { |
133 | int processed_length; |
134 | *out = converter.StringToFloat(s, static_cast<int>(length), &processed_length); |
135 | } |
136 | |
137 | inline void TryConvert(double_conversion::StringToDoubleConverter& converter, |
138 | const char* s, size_t length, double* out) { |
139 | int processed_length; |
140 | *out = converter.StringToDouble(s, static_cast<int>(length), &processed_length); |
141 | } |
142 | }; |
143 | |
144 | template <> |
145 | class StringConverter<FloatType> : public StringToFloatConverterMixin<FloatType> {}; |
146 | |
147 | template <> |
148 | class StringConverter<DoubleType> : public StringToFloatConverterMixin<DoubleType> {}; |
149 | |
150 | // NOTE: HalfFloatType would require a half<->float conversion library |
151 | |
152 | namespace detail { |
153 | |
154 | inline uint8_t ParseDecimalDigit(char c) { return static_cast<uint8_t>(c - '0'); } |
155 | |
156 | #define PARSE_UNSIGNED_ITERATION(C_TYPE) \ |
157 | if (length > 0) { \ |
158 | uint8_t digit = ParseDecimalDigit(*s++); \ |
159 | result = static_cast<C_TYPE>(result * 10U); \ |
160 | length--; \ |
161 | if (ARROW_PREDICT_FALSE(digit > 9U)) { \ |
162 | /* Non-digit */ \ |
163 | return false; \ |
164 | } \ |
165 | result = static_cast<C_TYPE>(result + digit); \ |
166 | } |
167 | |
168 | #define PARSE_UNSIGNED_ITERATION_LAST(C_TYPE) \ |
169 | if (length > 0) { \ |
170 | if (ARROW_PREDICT_FALSE(result > std::numeric_limits<C_TYPE>::max() / 10U)) { \ |
171 | /* Overflow */ \ |
172 | return false; \ |
173 | } \ |
174 | uint8_t digit = ParseDecimalDigit(*s++); \ |
175 | result = static_cast<C_TYPE>(result * 10U); \ |
176 | C_TYPE new_result = static_cast<C_TYPE>(result + digit); \ |
177 | if (ARROW_PREDICT_FALSE(--length > 0)) { \ |
178 | /* Too many digits */ \ |
179 | return false; \ |
180 | } \ |
181 | if (ARROW_PREDICT_FALSE(digit > 9U)) { \ |
182 | /* Non-digit */ \ |
183 | return false; \ |
184 | } \ |
185 | if (ARROW_PREDICT_FALSE(new_result < result)) { \ |
186 | /* Overflow */ \ |
187 | return false; \ |
188 | } \ |
189 | result = new_result; \ |
190 | } |
191 | |
192 | inline bool ParseUnsigned(const char* s, size_t length, uint8_t* out) { |
193 | uint8_t result = 0; |
194 | |
195 | PARSE_UNSIGNED_ITERATION(uint8_t); |
196 | PARSE_UNSIGNED_ITERATION(uint8_t); |
197 | PARSE_UNSIGNED_ITERATION_LAST(uint8_t); |
198 | *out = result; |
199 | return true; |
200 | } |
201 | |
202 | inline bool ParseUnsigned(const char* s, size_t length, uint16_t* out) { |
203 | uint16_t result = 0; |
204 | |
205 | PARSE_UNSIGNED_ITERATION(uint16_t); |
206 | PARSE_UNSIGNED_ITERATION(uint16_t); |
207 | PARSE_UNSIGNED_ITERATION(uint16_t); |
208 | PARSE_UNSIGNED_ITERATION(uint16_t); |
209 | PARSE_UNSIGNED_ITERATION_LAST(uint16_t); |
210 | *out = result; |
211 | return true; |
212 | } |
213 | |
214 | inline bool ParseUnsigned(const char* s, size_t length, uint32_t* out) { |
215 | uint32_t result = 0; |
216 | |
217 | PARSE_UNSIGNED_ITERATION(uint32_t); |
218 | PARSE_UNSIGNED_ITERATION(uint32_t); |
219 | PARSE_UNSIGNED_ITERATION(uint32_t); |
220 | PARSE_UNSIGNED_ITERATION(uint32_t); |
221 | PARSE_UNSIGNED_ITERATION(uint32_t); |
222 | |
223 | PARSE_UNSIGNED_ITERATION(uint32_t); |
224 | PARSE_UNSIGNED_ITERATION(uint32_t); |
225 | PARSE_UNSIGNED_ITERATION(uint32_t); |
226 | PARSE_UNSIGNED_ITERATION(uint32_t); |
227 | |
228 | PARSE_UNSIGNED_ITERATION_LAST(uint32_t); |
229 | *out = result; |
230 | return true; |
231 | } |
232 | |
233 | inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) { |
234 | uint64_t result = 0; |
235 | |
236 | PARSE_UNSIGNED_ITERATION(uint64_t); |
237 | PARSE_UNSIGNED_ITERATION(uint64_t); |
238 | PARSE_UNSIGNED_ITERATION(uint64_t); |
239 | PARSE_UNSIGNED_ITERATION(uint64_t); |
240 | PARSE_UNSIGNED_ITERATION(uint64_t); |
241 | |
242 | PARSE_UNSIGNED_ITERATION(uint64_t); |
243 | PARSE_UNSIGNED_ITERATION(uint64_t); |
244 | PARSE_UNSIGNED_ITERATION(uint64_t); |
245 | PARSE_UNSIGNED_ITERATION(uint64_t); |
246 | PARSE_UNSIGNED_ITERATION(uint64_t); |
247 | |
248 | PARSE_UNSIGNED_ITERATION(uint64_t); |
249 | PARSE_UNSIGNED_ITERATION(uint64_t); |
250 | PARSE_UNSIGNED_ITERATION(uint64_t); |
251 | PARSE_UNSIGNED_ITERATION(uint64_t); |
252 | PARSE_UNSIGNED_ITERATION(uint64_t); |
253 | |
254 | PARSE_UNSIGNED_ITERATION(uint64_t); |
255 | PARSE_UNSIGNED_ITERATION(uint64_t); |
256 | PARSE_UNSIGNED_ITERATION(uint64_t); |
257 | PARSE_UNSIGNED_ITERATION(uint64_t); |
258 | |
259 | PARSE_UNSIGNED_ITERATION_LAST(uint64_t); |
260 | *out = result; |
261 | return true; |
262 | } |
263 | |
264 | #undef PARSE_UNSIGNED_ITERATION |
265 | #undef PARSE_UNSIGNED_ITERATION_LAST |
266 | |
267 | } // namespace detail |
268 | |
269 | template <class ARROW_TYPE> |
270 | class StringToUnsignedIntConverterMixin { |
271 | public: |
272 | using value_type = typename ARROW_TYPE::c_type; |
273 | |
274 | bool operator()(const char* s, size_t length, value_type* out) { |
275 | if (ARROW_PREDICT_FALSE(length == 0)) { |
276 | return false; |
277 | } |
278 | // Skip leading zeros |
279 | while (length > 0 && *s == '0') { |
280 | length--; |
281 | s++; |
282 | } |
283 | return detail::ParseUnsigned(s, length, out); |
284 | } |
285 | }; |
286 | |
287 | template <> |
288 | class StringConverter<UInt8Type> : public StringToUnsignedIntConverterMixin<UInt8Type> {}; |
289 | |
290 | template <> |
291 | class StringConverter<UInt16Type> : public StringToUnsignedIntConverterMixin<UInt16Type> { |
292 | }; |
293 | |
294 | template <> |
295 | class StringConverter<UInt32Type> : public StringToUnsignedIntConverterMixin<UInt32Type> { |
296 | }; |
297 | |
298 | template <> |
299 | class StringConverter<UInt64Type> : public StringToUnsignedIntConverterMixin<UInt64Type> { |
300 | }; |
301 | |
302 | template <class ARROW_TYPE> |
303 | class StringToSignedIntConverterMixin { |
304 | public: |
305 | using value_type = typename ARROW_TYPE::c_type; |
306 | using unsigned_type = typename std::make_unsigned<value_type>::type; |
307 | |
308 | bool operator()(const char* s, size_t length, value_type* out) { |
309 | static constexpr unsigned_type max_positive = |
310 | static_cast<unsigned_type>(std::numeric_limits<value_type>::max()); |
311 | // Assuming two's complement |
312 | static constexpr unsigned_type max_negative = max_positive + 1; |
313 | bool negative = false; |
314 | unsigned_type unsigned_value = 0; |
315 | |
316 | if (ARROW_PREDICT_FALSE(length == 0)) { |
317 | return false; |
318 | } |
319 | if (*s == '-') { |
320 | negative = true; |
321 | s++; |
322 | if (--length == 0) { |
323 | return false; |
324 | } |
325 | } |
326 | // Skip leading zeros |
327 | while (length > 0 && *s == '0') { |
328 | length--; |
329 | s++; |
330 | } |
331 | if (!ARROW_PREDICT_TRUE(detail::ParseUnsigned(s, length, &unsigned_value))) { |
332 | return false; |
333 | } |
334 | if (negative) { |
335 | if (ARROW_PREDICT_FALSE(unsigned_value > max_negative)) { |
336 | return false; |
337 | } |
338 | // To avoid both compiler warnings (with unsigned negation) |
339 | // and undefined behaviour (with signed negation overflow), |
340 | // use the expanded formula for 2's complement negation. |
341 | *out = static_cast<value_type>(~unsigned_value + 1); |
342 | } else { |
343 | if (ARROW_PREDICT_FALSE(unsigned_value > max_positive)) { |
344 | return false; |
345 | } |
346 | *out = static_cast<value_type>(unsigned_value); |
347 | } |
348 | return true; |
349 | } |
350 | }; |
351 | |
352 | template <> |
353 | class StringConverter<Int8Type> : public StringToSignedIntConverterMixin<Int8Type> {}; |
354 | |
355 | template <> |
356 | class StringConverter<Int16Type> : public StringToSignedIntConverterMixin<Int16Type> {}; |
357 | |
358 | template <> |
359 | class StringConverter<Int32Type> : public StringToSignedIntConverterMixin<Int32Type> {}; |
360 | |
361 | template <> |
362 | class StringConverter<Int64Type> : public StringToSignedIntConverterMixin<Int64Type> {}; |
363 | |
364 | template <> |
365 | class StringConverter<TimestampType> { |
366 | public: |
367 | using value_type = TimestampType::c_type; |
368 | |
369 | explicit StringConverter(const std::shared_ptr<DataType>& type) |
370 | : unit_(checked_cast<TimestampType*>(type.get())->unit()) {} |
371 | |
372 | bool operator()(const char* s, size_t length, value_type* out) { |
373 | // We allow the following formats: |
374 | // - "YYYY-MM-DD" |
375 | // - "YYYY-MM-DD[ T]hh:mm:ss" |
376 | // - "YYYY-MM-DD[ T]hh:mm:ssZ" |
377 | // UTC is always assumed, and the DataType's timezone is ignored. |
378 | date::year_month_day ymd; |
379 | if (ARROW_PREDICT_FALSE(length < 10)) { |
380 | return false; |
381 | } |
382 | if (length == 10) { |
383 | if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) { |
384 | return false; |
385 | } |
386 | return ConvertTimePoint(date::sys_days(ymd), out); |
387 | } |
388 | if (ARROW_PREDICT_FALSE(s[10] != ' ') && ARROW_PREDICT_FALSE(s[10] != 'T')) { |
389 | return false; |
390 | } |
391 | if (s[length - 1] == 'Z') { |
392 | --length; |
393 | } |
394 | if (length == 19) { |
395 | if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) { |
396 | return false; |
397 | } |
398 | std::chrono::duration<value_type> seconds; |
399 | if (ARROW_PREDICT_FALSE(!ParseHH_MM_SS(s + 11, &seconds))) { |
400 | return false; |
401 | } |
402 | return ConvertTimePoint(date::sys_days(ymd) + seconds, out); |
403 | } |
404 | return false; |
405 | } |
406 | |
407 | protected: |
408 | template <class TimePoint> |
409 | bool ConvertTimePoint(TimePoint tp, value_type* out) { |
410 | auto duration = tp.time_since_epoch(); |
411 | switch (unit_) { |
412 | case TimeUnit::SECOND: |
413 | *out = std::chrono::duration_cast<std::chrono::seconds>(duration).count(); |
414 | return true; |
415 | case TimeUnit::MILLI: |
416 | *out = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(); |
417 | return true; |
418 | case TimeUnit::MICRO: |
419 | *out = std::chrono::duration_cast<std::chrono::microseconds>(duration).count(); |
420 | return true; |
421 | case TimeUnit::NANO: |
422 | *out = std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count(); |
423 | return true; |
424 | } |
425 | // Unreachable, but suppress compiler warning |
426 | assert(0); |
427 | *out = 0; |
428 | return true; |
429 | } |
430 | |
431 | bool ParseYYYY_MM_DD(const char* s, date::year_month_day* out) { |
432 | uint16_t year; |
433 | uint8_t month, day; |
434 | if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) { |
435 | return false; |
436 | } |
437 | if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 4, &year))) { |
438 | return false; |
439 | } |
440 | if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 5, 2, &month))) { |
441 | return false; |
442 | } |
443 | if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 8, 2, &day))) { |
444 | return false; |
445 | } |
446 | *out = {date::year{year}, date::month{month}, date::day{day}}; |
447 | return out->ok(); |
448 | } |
449 | |
450 | bool ParseHH_MM_SS(const char* s, std::chrono::duration<value_type>* out) { |
451 | uint8_t hours, minutes, seconds; |
452 | if (ARROW_PREDICT_FALSE(s[2] != ':') || ARROW_PREDICT_FALSE(s[5] != ':')) { |
453 | return false; |
454 | } |
455 | if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 2, &hours))) { |
456 | return false; |
457 | } |
458 | if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 3, 2, &minutes))) { |
459 | return false; |
460 | } |
461 | if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 6, 2, &seconds))) { |
462 | return false; |
463 | } |
464 | if (ARROW_PREDICT_FALSE(hours >= 24)) { |
465 | return false; |
466 | } |
467 | if (ARROW_PREDICT_FALSE(minutes >= 60)) { |
468 | return false; |
469 | } |
470 | if (ARROW_PREDICT_FALSE(seconds >= 60)) { |
471 | return false; |
472 | } |
473 | *out = std::chrono::duration<value_type>(3600U * hours + 60U * minutes + seconds); |
474 | return true; |
475 | } |
476 | |
477 | const TimeUnit::type unit_; |
478 | }; |
479 | |
480 | } // namespace internal |
481 | } // namespace arrow |
482 | |
483 | #endif // ARROW_UTIL_PARSING_H |
484 | |