1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
// This is a private header for string-to-number parsing utilities
19
20#ifndef ARROW_UTIL_PARSING_H
21#define ARROW_UTIL_PARSING_H
22
23#include <cassert>
24#include <chrono>
25#include <limits>
26#include <locale>
27#include <memory>
28#include <sstream>
29#include <string>
30#include <type_traits>
31
32#include <double-conversion/double-conversion.h>
33
34#include "arrow/type.h"
35#include "arrow/type_traits.h"
36#include "arrow/util/checked_cast.h"
37#include "arrow/vendored/date.h"
38
39namespace arrow {
40namespace internal {
41
/// \brief A class providing conversion from strings to some Arrow data types
///
/// Conversion is triggered by calling operator(), which takes a pointer to
/// the character data, the number of characters, and a pointer to the
/// output value.  It returns true on success, false on failure.
///
/// The class may have a non-trivial construction cost in some cases,
/// so it's recommended to use a single instance many times, if doing bulk
/// conversion.
///
/// Only the primary template is declared here; it is never defined, so a
/// converter exists only for the types that provide a specialization.
///
/// \tparam ARROW_TYPE the Arrow data type to convert to
/// \tparam Enable extra parameter defaulting to void (presumably a hook for
///         SFINAE-constrained partial specializations -- TODO confirm)
template <typename ARROW_TYPE, typename Enable = void>
class StringConverter;
53
54template <>
55class StringConverter<BooleanType> {
56 public:
57 using value_type = bool;
58
59 bool operator()(const char* s, size_t length, value_type* out) {
60 if (length == 1) {
61 // "0" or "1"?
62 if (s[0] == '0') {
63 *out = false;
64 return true;
65 }
66 if (s[0] == '1') {
67 *out = true;
68 return true;
69 }
70 return false;
71 }
72 if (length == 4) {
73 // "true"?
74 *out = true;
75 return ((s[0] == 't' || s[0] == 'T') && (s[1] == 'r' || s[1] == 'R') &&
76 (s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E'));
77 }
78 if (length == 5) {
79 // "false"?
80 *out = false;
81 return ((s[0] == 'f' || s[0] == 'F') && (s[1] == 'a' || s[1] == 'A') &&
82 (s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') &&
83 (s[4] == 'e' || s[4] == 'E'));
84 }
85 return false;
86 }
87};
88
89// Ideas for faster float parsing:
90// - http://rapidjson.org/md_doc_internals.html#ParsingDouble
91// - https://github.com/google/double-conversion [used here]
92// - https://github.com/achan001/dtoa-fast
93
94template <class ARROW_TYPE>
95class StringToFloatConverterMixin {
96 public:
97 using value_type = typename ARROW_TYPE::c_type;
98
99 StringToFloatConverterMixin()
100 : main_converter_(flags_, main_junk_value_, main_junk_value_, "inf", "nan"),
101 fallback_converter_(flags_, fallback_junk_value_, fallback_junk_value_, "inf",
102 "nan") {}
103
104 bool operator()(const char* s, size_t length, value_type* out) {
105 value_type v;
106 // double-conversion doesn't give us an error flag but signals parse
107 // errors with sentinel values. Since a sentinel value can appear as
108 // legitimate input, we fallback on a second converter with a different
109 // sentinel to eliminate false errors.
110 TryConvert(main_converter_, s, length, &v);
111 if (ARROW_PREDICT_FALSE(v == static_cast<value_type>(main_junk_value_))) {
112 TryConvert(fallback_converter_, s, length, &v);
113 if (ARROW_PREDICT_FALSE(v == static_cast<value_type>(fallback_junk_value_))) {
114 return false;
115 }
116 }
117 *out = v;
118 return true;
119 }
120
121 protected:
122 static const int flags_ =
123 double_conversion::StringToDoubleConverter::ALLOW_CASE_INSENSIBILITY;
124 // Two unlikely values to signal a parsing error
125 static constexpr double main_junk_value_ = 0.7066424364107089;
126 static constexpr double fallback_junk_value_ = 0.40088499148279166;
127
128 double_conversion::StringToDoubleConverter main_converter_;
129 double_conversion::StringToDoubleConverter fallback_converter_;
130
131 inline void TryConvert(double_conversion::StringToDoubleConverter& converter,
132 const char* s, size_t length, float* out) {
133 int processed_length;
134 *out = converter.StringToFloat(s, static_cast<int>(length), &processed_length);
135 }
136
137 inline void TryConvert(double_conversion::StringToDoubleConverter& converter,
138 const char* s, size_t length, double* out) {
139 int processed_length;
140 *out = converter.StringToDouble(s, static_cast<int>(length), &processed_length);
141 }
142};
143
// Single-precision converter: all behavior comes from the mixin, which
// picks `float` as value_type through FloatType::c_type.
template <>
class StringConverter<FloatType> : public StringToFloatConverterMixin<FloatType> {};

// Double-precision converter: all behavior comes from the mixin, which
// picks `double` as value_type through DoubleType::c_type.
template <>
class StringConverter<DoubleType> : public StringToFloatConverterMixin<DoubleType> {};
149
150// NOTE: HalfFloatType would require a half<->float conversion library
151
152namespace detail {
153
// Map an ASCII character to its decimal digit value.
//
// '0'..'9' yield 0..9.  Any other character yields a value greater than 9
// (the subtraction is truncated to 8 bits), so callers detect non-digits
// with a single `digit > 9` comparison.
inline uint8_t ParseDecimalDigit(char c) {
  const int offset_from_zero = c - '0';
  return static_cast<uint8_t>(offset_from_zero);
}
155
// Consume one digit, if any input remains.
//
// Must be expanded inside a function returning bool that declares `s`
// (const char*), `length` (size_t) and `result` (C_TYPE).  When `length`
// is non-zero, the next character is folded into `result` as
// `result * 10 + digit` and `s` / `length` are advanced; a non-digit
// character makes the enclosing function return false.  When `length` is
// zero the expansion does nothing.
//
// No overflow check is made here: the ParseUnsigned overloads below only
// use this macro for digit positions that cannot overflow C_TYPE, and use
// PARSE_UNSIGNED_ITERATION_LAST for the final position.
#define PARSE_UNSIGNED_ITERATION(C_TYPE)          \
  if (length > 0) {                               \
    uint8_t digit = ParseDecimalDigit(*s++);      \
    result = static_cast<C_TYPE>(result * 10U);   \
    length--;                                     \
    if (ARROW_PREDICT_FALSE(digit > 9U)) {        \
      /* Non-digit */                             \
      return false;                               \
    }                                             \
    result = static_cast<C_TYPE>(result + digit); \
  }

// Consume the last digit position, with overflow detection.
//
// Same expansion requirements as PARSE_UNSIGNED_ITERATION.  When `length`
// is non-zero, the enclosing function returns false if:
// - multiplying `result` by 10 would exceed the maximum of C_TYPE,
// - characters remain after this one (too many digits),
// - the character is not a digit, or
// - adding the digit wraps around.
// Otherwise `result` receives the final value.
#define PARSE_UNSIGNED_ITERATION_LAST(C_TYPE)                                     \
  if (length > 0) {                                                               \
    if (ARROW_PREDICT_FALSE(result > std::numeric_limits<C_TYPE>::max() / 10U)) { \
      /* Overflow */                                                              \
      return false;                                                               \
    }                                                                             \
    uint8_t digit = ParseDecimalDigit(*s++);                                      \
    result = static_cast<C_TYPE>(result * 10U);                                   \
    C_TYPE new_result = static_cast<C_TYPE>(result + digit);                      \
    if (ARROW_PREDICT_FALSE(--length > 0)) {                                      \
      /* Too many digits */                                                       \
      return false;                                                               \
    }                                                                             \
    if (ARROW_PREDICT_FALSE(digit > 9U)) {                                        \
      /* Non-digit */                                                             \
      return false;                                                               \
    }                                                                             \
    if (ARROW_PREDICT_FALSE(new_result < result)) {                               \
      /* Overflow */                                                              \
      return false;                                                               \
    }                                                                             \
    result = new_result;                                                          \
  }
191
// Parse an unsigned decimal number of at most 3 digits into a uint8_t.
//
// Returns false, leaving `*out` untouched, on a non-digit character, on
// more than 3 characters, or on a value that does not fit in uint8_t.
// Leading zeros are not skipped here and count towards the digit limit.
// An empty input (length == 0) succeeds and yields 0.
inline bool ParseUnsigned(const char* s, size_t length, uint8_t* out) {
  uint8_t result = 0;

  // Two unchecked digits (at most 99) followed by one overflow-checked digit
  PARSE_UNSIGNED_ITERATION(uint8_t);
  PARSE_UNSIGNED_ITERATION(uint8_t);
  PARSE_UNSIGNED_ITERATION_LAST(uint8_t);
  *out = result;
  return true;
}
201
// Parse an unsigned decimal number of at most 5 digits into a uint16_t.
//
// Returns false, leaving `*out` untouched, on a non-digit character, on
// more than 5 characters, or on a value that does not fit in uint16_t.
// Leading zeros are not skipped here and count towards the digit limit.
// An empty input (length == 0) succeeds and yields 0.
inline bool ParseUnsigned(const char* s, size_t length, uint16_t* out) {
  uint16_t result = 0;

  // Four unchecked digits (at most 9999) followed by one overflow-checked
  // digit
  PARSE_UNSIGNED_ITERATION(uint16_t);
  PARSE_UNSIGNED_ITERATION(uint16_t);
  PARSE_UNSIGNED_ITERATION(uint16_t);
  PARSE_UNSIGNED_ITERATION(uint16_t);
  PARSE_UNSIGNED_ITERATION_LAST(uint16_t);
  *out = result;
  return true;
}
213
// Parse an unsigned decimal number of at most 10 digits into a uint32_t.
//
// Returns false, leaving `*out` untouched, on a non-digit character, on
// more than 10 characters, or on a value that does not fit in uint32_t.
// Leading zeros are not skipped here and count towards the digit limit.
// An empty input (length == 0) succeeds and yields 0.
inline bool ParseUnsigned(const char* s, size_t length, uint32_t* out) {
  uint32_t result = 0;

  // Nine unchecked digits (at most 999999999) ...
  PARSE_UNSIGNED_ITERATION(uint32_t);
  PARSE_UNSIGNED_ITERATION(uint32_t);
  PARSE_UNSIGNED_ITERATION(uint32_t);
  PARSE_UNSIGNED_ITERATION(uint32_t);
  PARSE_UNSIGNED_ITERATION(uint32_t);

  PARSE_UNSIGNED_ITERATION(uint32_t);
  PARSE_UNSIGNED_ITERATION(uint32_t);
  PARSE_UNSIGNED_ITERATION(uint32_t);
  PARSE_UNSIGNED_ITERATION(uint32_t);

  // ... followed by one overflow-checked digit
  PARSE_UNSIGNED_ITERATION_LAST(uint32_t);
  *out = result;
  return true;
}
232
// Parse an unsigned decimal number of at most 20 digits into a uint64_t.
//
// Returns false, leaving `*out` untouched, on a non-digit character, on
// more than 20 characters, or on a value that does not fit in uint64_t.
// Leading zeros are not skipped here and count towards the digit limit.
// An empty input (length == 0) succeeds and yields 0.
inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) {
  uint64_t result = 0;

  // Nineteen unchecked digits (these cannot overflow 64 bits) ...
  PARSE_UNSIGNED_ITERATION(uint64_t);
  PARSE_UNSIGNED_ITERATION(uint64_t);
  PARSE_UNSIGNED_ITERATION(uint64_t);
  PARSE_UNSIGNED_ITERATION(uint64_t);
  PARSE_UNSIGNED_ITERATION(uint64_t);

  PARSE_UNSIGNED_ITERATION(uint64_t);
  PARSE_UNSIGNED_ITERATION(uint64_t);
  PARSE_UNSIGNED_ITERATION(uint64_t);
  PARSE_UNSIGNED_ITERATION(uint64_t);
  PARSE_UNSIGNED_ITERATION(uint64_t);

  PARSE_UNSIGNED_ITERATION(uint64_t);
  PARSE_UNSIGNED_ITERATION(uint64_t);
  PARSE_UNSIGNED_ITERATION(uint64_t);
  PARSE_UNSIGNED_ITERATION(uint64_t);
  PARSE_UNSIGNED_ITERATION(uint64_t);

  PARSE_UNSIGNED_ITERATION(uint64_t);
  PARSE_UNSIGNED_ITERATION(uint64_t);
  PARSE_UNSIGNED_ITERATION(uint64_t);
  PARSE_UNSIGNED_ITERATION(uint64_t);

  // ... followed by one overflow-checked digit
  PARSE_UNSIGNED_ITERATION_LAST(uint64_t);
  *out = result;
  return true;
}
263
264#undef PARSE_UNSIGNED_ITERATION
265#undef PARSE_UNSIGNED_ITERATION_LAST
266
267} // namespace detail
268
269template <class ARROW_TYPE>
270class StringToUnsignedIntConverterMixin {
271 public:
272 using value_type = typename ARROW_TYPE::c_type;
273
274 bool operator()(const char* s, size_t length, value_type* out) {
275 if (ARROW_PREDICT_FALSE(length == 0)) {
276 return false;
277 }
278 // Skip leading zeros
279 while (length > 0 && *s == '0') {
280 length--;
281 s++;
282 }
283 return detail::ParseUnsigned(s, length, out);
284 }
285};
286
// Unsigned integer converters.  Each one is an empty shell around the
// mixin, which selects the matching detail::ParseUnsigned overload through
// the C type of the Arrow type.

// 8-bit unsigned converter
template <>
class StringConverter<UInt8Type> : public StringToUnsignedIntConverterMixin<UInt8Type> {};

// 16-bit unsigned converter
template <>
class StringConverter<UInt16Type> : public StringToUnsignedIntConverterMixin<UInt16Type> {
};

// 32-bit unsigned converter
template <>
class StringConverter<UInt32Type> : public StringToUnsignedIntConverterMixin<UInt32Type> {
};

// 64-bit unsigned converter
template <>
class StringConverter<UInt64Type> : public StringToUnsignedIntConverterMixin<UInt64Type> {
};
301
302template <class ARROW_TYPE>
303class StringToSignedIntConverterMixin {
304 public:
305 using value_type = typename ARROW_TYPE::c_type;
306 using unsigned_type = typename std::make_unsigned<value_type>::type;
307
308 bool operator()(const char* s, size_t length, value_type* out) {
309 static constexpr unsigned_type max_positive =
310 static_cast<unsigned_type>(std::numeric_limits<value_type>::max());
311 // Assuming two's complement
312 static constexpr unsigned_type max_negative = max_positive + 1;
313 bool negative = false;
314 unsigned_type unsigned_value = 0;
315
316 if (ARROW_PREDICT_FALSE(length == 0)) {
317 return false;
318 }
319 if (*s == '-') {
320 negative = true;
321 s++;
322 if (--length == 0) {
323 return false;
324 }
325 }
326 // Skip leading zeros
327 while (length > 0 && *s == '0') {
328 length--;
329 s++;
330 }
331 if (!ARROW_PREDICT_TRUE(detail::ParseUnsigned(s, length, &unsigned_value))) {
332 return false;
333 }
334 if (negative) {
335 if (ARROW_PREDICT_FALSE(unsigned_value > max_negative)) {
336 return false;
337 }
338 // To avoid both compiler warnings (with unsigned negation)
339 // and undefined behaviour (with signed negation overflow),
340 // use the expanded formula for 2's complement negation.
341 *out = static_cast<value_type>(~unsigned_value + 1);
342 } else {
343 if (ARROW_PREDICT_FALSE(unsigned_value > max_positive)) {
344 return false;
345 }
346 *out = static_cast<value_type>(unsigned_value);
347 }
348 return true;
349 }
350};
351
// Signed integer converters.  Each one is an empty shell around the mixin,
// which derives the value type and its unsigned counterpart from the C type
// of the Arrow type.

// 8-bit signed converter
template <>
class StringConverter<Int8Type> : public StringToSignedIntConverterMixin<Int8Type> {};

// 16-bit signed converter
template <>
class StringConverter<Int16Type> : public StringToSignedIntConverterMixin<Int16Type> {};

// 32-bit signed converter
template <>
class StringConverter<Int32Type> : public StringToSignedIntConverterMixin<Int32Type> {};

// 64-bit signed converter
template <>
class StringConverter<Int64Type> : public StringToSignedIntConverterMixin<Int64Type> {};
363
364template <>
365class StringConverter<TimestampType> {
366 public:
367 using value_type = TimestampType::c_type;
368
369 explicit StringConverter(const std::shared_ptr<DataType>& type)
370 : unit_(checked_cast<TimestampType*>(type.get())->unit()) {}
371
372 bool operator()(const char* s, size_t length, value_type* out) {
373 // We allow the following formats:
374 // - "YYYY-MM-DD"
375 // - "YYYY-MM-DD[ T]hh:mm:ss"
376 // - "YYYY-MM-DD[ T]hh:mm:ssZ"
377 // UTC is always assumed, and the DataType's timezone is ignored.
378 date::year_month_day ymd;
379 if (ARROW_PREDICT_FALSE(length < 10)) {
380 return false;
381 }
382 if (length == 10) {
383 if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
384 return false;
385 }
386 return ConvertTimePoint(date::sys_days(ymd), out);
387 }
388 if (ARROW_PREDICT_FALSE(s[10] != ' ') && ARROW_PREDICT_FALSE(s[10] != 'T')) {
389 return false;
390 }
391 if (s[length - 1] == 'Z') {
392 --length;
393 }
394 if (length == 19) {
395 if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
396 return false;
397 }
398 std::chrono::duration<value_type> seconds;
399 if (ARROW_PREDICT_FALSE(!ParseHH_MM_SS(s + 11, &seconds))) {
400 return false;
401 }
402 return ConvertTimePoint(date::sys_days(ymd) + seconds, out);
403 }
404 return false;
405 }
406
407 protected:
408 template <class TimePoint>
409 bool ConvertTimePoint(TimePoint tp, value_type* out) {
410 auto duration = tp.time_since_epoch();
411 switch (unit_) {
412 case TimeUnit::SECOND:
413 *out = std::chrono::duration_cast<std::chrono::seconds>(duration).count();
414 return true;
415 case TimeUnit::MILLI:
416 *out = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
417 return true;
418 case TimeUnit::MICRO:
419 *out = std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
420 return true;
421 case TimeUnit::NANO:
422 *out = std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
423 return true;
424 }
425 // Unreachable, but suppress compiler warning
426 assert(0);
427 *out = 0;
428 return true;
429 }
430
431 bool ParseYYYY_MM_DD(const char* s, date::year_month_day* out) {
432 uint16_t year;
433 uint8_t month, day;
434 if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
435 return false;
436 }
437 if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 4, &year))) {
438 return false;
439 }
440 if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 5, 2, &month))) {
441 return false;
442 }
443 if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 8, 2, &day))) {
444 return false;
445 }
446 *out = {date::year{year}, date::month{month}, date::day{day}};
447 return out->ok();
448 }
449
450 bool ParseHH_MM_SS(const char* s, std::chrono::duration<value_type>* out) {
451 uint8_t hours, minutes, seconds;
452 if (ARROW_PREDICT_FALSE(s[2] != ':') || ARROW_PREDICT_FALSE(s[5] != ':')) {
453 return false;
454 }
455 if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 2, &hours))) {
456 return false;
457 }
458 if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 3, 2, &minutes))) {
459 return false;
460 }
461 if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 6, 2, &seconds))) {
462 return false;
463 }
464 if (ARROW_PREDICT_FALSE(hours >= 24)) {
465 return false;
466 }
467 if (ARROW_PREDICT_FALSE(minutes >= 60)) {
468 return false;
469 }
470 if (ARROW_PREDICT_FALSE(seconds >= 60)) {
471 return false;
472 }
473 *out = std::chrono::duration<value_type>(3600U * hours + 60U * minutes + seconds);
474 return true;
475 }
476
477 const TimeUnit::type unit_;
478};
479
480} // namespace internal
481} // namespace arrow
482
483#endif // ARROW_UTIL_PARSING_H
484