parsing.h source code [arrow/arrow/util/parsing.h]

1	// Licensed to the Apache Software Foundation (ASF) under one
2	// or more contributor license agreements. See the NOTICE file
3	// distributed with this work for additional information
4	// regarding copyright ownership. The ASF licenses this file
5	// to you under the Apache License, Version 2.0 (the
6	// "License"); you may not use this file except in compliance
7	// with the License. You may obtain a copy of the License at
8	//
9	// http://www.apache.org/licenses/LICENSE-2.0
10	//
11	// Unless required by applicable law or agreed to in writing,
12	// software distributed under the License is distributed on an
13	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14	// KIND, either express or implied. See the License for the
15	// specific language governing permissions and limitations
16	// under the License.
17
18	// This is a private header for string-to-number parsing utilities
19
20	#ifndef ARROW_UTIL_PARSING_H
21	#define ARROW_UTIL_PARSING_H
22
23	#include <cassert>
24	#include <chrono>
25	#include <limits>
26	#include <locale>
27	#include <memory>
28	#include <sstream>
29	#include <string>
30	#include <type_traits>
31
32	#include <double-conversion/double-conversion.h>
33
34	#include "arrow/type.h"
35	#include "arrow/type_traits.h"
36	#include "arrow/util/checked_cast.h"
37	#include "arrow/vendored/date.h"
38
39	namespace arrow {
40	namespace internal {
41
/// \brief Functor converting strings to values of some Arrow data types
///
/// A conversion is performed by invoking operator(), which returns true
/// when the string was converted and false when it was not.
///
/// Constructing an instance can be non-trivially expensive in some cases,
/// so bulk conversions should create a single instance and reuse it for
/// every value.
///
template <typename ARROW_TYPE, typename Enable = void>
class StringConverter;
53
54	template <>
55	class StringConverter<BooleanType> {
56	public:
57	using value_type = bool;
58
59	bool operator()(const char* s, size_t length, value_type* out) {
60	if (length == `1`) {
61	// "0" or "1"?
62	if (s[`0`] == `'0'`) {
63	out = false*;
64	return true;
65	}
66	if (s[`0`] == `'1'`) {
67	out = true*;
68	return true;
69	}
70	return false;
71	}
72	if (length == `4`) {
73	// "true"?
74	out = true*;
75	return ((s[`0`] == `'t'` \|\| s[`0`] == `'T'`) && (s[`1`] == `'r'` \|\| s[`1`] == `'R'`) &&
76	(s[`2`] == `'u'` \|\| s[`2`] == `'U'`) && (s[`3`] == `'e'` \|\| s[`3`] == `'E'`));
77	}
78	if (length == `5`) {
79	// "false"?
80	out = false*;
81	return ((s[`0`] == `'f'` \|\| s[`0`] == `'F'`) && (s[`1`] == `'a'` \|\| s[`1`] == `'A'`) &&
82	(s[`2`] == `'l'` \|\| s[`2`] == `'L'`) && (s[`3`] == `'s'` \|\| s[`3`] == `'S'`) &&
83	(s[`4`] == `'e'` \|\| s[`4`] == `'E'`));
84	}
85	return false;
86	}
87	};
88
89	// Ideas for faster float parsing:
90	// - http://rapidjson.org/md_doc_internals.html#ParsingDouble
91	// - https://github.com/google/double-conversion [used here]
92	// - https://github.com/achan001/dtoa-fast
93
94	template <class ARROW_TYPE>
95	class StringToFloatConverterMixin {
96	public:
97	using value_type = typename ARROW_TYPE::c_type;
98
99	StringToFloatConverterMixin()
100	: main_converter_(flags_, main_junk_value_, main_junk_value_, "inf", "nan"),
101	fallback_converter_(flags_, fallback_junk_value_, fallback_junk_value_, "inf",
102	"nan") {}
103
104	bool operator()(const char* s, size_t length, value_type* out) {
105	value_type v;
106	// double-conversion doesn't give us an error flag but signals parse
107	// errors with sentinel values. Since a sentinel value can appear as
108	// legitimate input, we fallback on a second converter with a different
109	// sentinel to eliminate false errors.
110	TryConvert(main_converter_, s, length, &v);
111	if (ARROW_PREDICT_FALSE(v == static_cast<value_type>(main_junk_value_))) {
112	TryConvert(fallback_converter_, s, length, &v);
113	if (ARROW_PREDICT_FALSE(v == static_cast<value_type>(fallback_junk_value_))) {
114	return false;
115	}
116	}
117	*out = v;
118	return true;
119	}
120
121	protected:
122	static const int flags_ =
123	double_conversion::StringToDoubleConverter::ALLOW_CASE_INSENSIBILITY;
124	// Two unlikely values to signal a parsing error
125	static constexpr double main_junk_value_ = `0.7066424364107089`;
126	static constexpr double fallback_junk_value_ = `0.40088499148279166`;
127
128	double_conversion::StringToDoubleConverter main_converter_;
129	double_conversion::StringToDoubleConverter fallback_converter_;
130
131	inline void TryConvert(double_conversion::StringToDoubleConverter& converter,
132	const char* s, size_t length, float* out) {
133	int processed_length;
134	out = converter.StringToFloat(s, static_cast<int*>(length), &processed_length);
135	}
136
137	inline void TryConvert(double_conversion::StringToDoubleConverter& converter,
138	const char* s, size_t length, double* out) {
139	int processed_length;
140	out = converter.StringToDouble(s, static_cast<int*>(length), &processed_length);
141	}
142	};
143
144	template <>
145	class StringConverter<FloatType> : public StringToFloatConverterMixin<FloatType> {};
146
147	template <>
148	class StringConverter<DoubleType> : public StringToFloatConverterMixin<DoubleType> {};
149
150	// NOTE: HalfFloatType would require a half<->float conversion library
151
152	namespace detail {
153
154	inline uint8_t ParseDecimalDigit(char c) { return static_cast<uint8_t>(c - `'0'`); }
155
156	#define PARSE_UNSIGNED_ITERATION(C_TYPE) \
157	if (length > 0) { \
158	uint8_t digit = ParseDecimalDigit(*s++); \
159	result = static_cast<C_TYPE>(result * 10U); \
160	length--; \
161	if (ARROW_PREDICT_FALSE(digit > 9U)) { \
162	/* Non-digit */ \
163	return false; \
164	} \
165	result = static_cast<C_TYPE>(result + digit); \
166	}
167
168	#define PARSE_UNSIGNED_ITERATION_LAST(C_TYPE) \
169	if (length > 0) { \
170	if (ARROW_PREDICT_FALSE(result > std::numeric_limits<C_TYPE>::max() / 10U)) { \
171	/* Overflow */ \
172	return false; \
173	} \
174	uint8_t digit = ParseDecimalDigit(*s++); \
175	result = static_cast<C_TYPE>(result * 10U); \
176	C_TYPE new_result = static_cast<C_TYPE>(result + digit); \
177	if (ARROW_PREDICT_FALSE(--length > 0)) { \
178	/* Too many digits */ \
179	return false; \
180	} \
181	if (ARROW_PREDICT_FALSE(digit > 9U)) { \
182	/* Non-digit */ \
183	return false; \
184	} \
185	if (ARROW_PREDICT_FALSE(new_result < result)) { \
186	/* Overflow */ \
187	return false; \
188	} \
189	result = new_result; \
190	}
191
192	inline bool ParseUnsigned(const char* s, size_t length, uint8_t* out) {
193	uint8_t result = `0`;
194
195	PARSE_UNSIGNED_ITERATION(uint8_t);
196	PARSE_UNSIGNED_ITERATION(uint8_t);
197	PARSE_UNSIGNED_ITERATION_LAST(uint8_t);
198	*out = result;
199	return true;
200	}
201
202	inline bool ParseUnsigned(const char* s, size_t length, uint16_t* out) {
203	uint16_t result = `0`;
204
205	PARSE_UNSIGNED_ITERATION(uint16_t);
206	PARSE_UNSIGNED_ITERATION(uint16_t);
207	PARSE_UNSIGNED_ITERATION(uint16_t);
208	PARSE_UNSIGNED_ITERATION(uint16_t);
209	PARSE_UNSIGNED_ITERATION_LAST(uint16_t);
210	*out = result;
211	return true;
212	}
213
214	inline bool ParseUnsigned(const char* s, size_t length, uint32_t* out) {
215	uint32_t result = `0`;
216
217	PARSE_UNSIGNED_ITERATION(uint32_t);
218	PARSE_UNSIGNED_ITERATION(uint32_t);
219	PARSE_UNSIGNED_ITERATION(uint32_t);
220	PARSE_UNSIGNED_ITERATION(uint32_t);
221	PARSE_UNSIGNED_ITERATION(uint32_t);
222
223	PARSE_UNSIGNED_ITERATION(uint32_t);
224	PARSE_UNSIGNED_ITERATION(uint32_t);
225	PARSE_UNSIGNED_ITERATION(uint32_t);
226	PARSE_UNSIGNED_ITERATION(uint32_t);
227
228	PARSE_UNSIGNED_ITERATION_LAST(uint32_t);
229	*out = result;
230	return true;
231	}
232
233	inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) {
234	uint64_t result = `0`;
235
236	PARSE_UNSIGNED_ITERATION(uint64_t);
237	PARSE_UNSIGNED_ITERATION(uint64_t);
238	PARSE_UNSIGNED_ITERATION(uint64_t);
239	PARSE_UNSIGNED_ITERATION(uint64_t);
240	PARSE_UNSIGNED_ITERATION(uint64_t);
241
242	PARSE_UNSIGNED_ITERATION(uint64_t);
243	PARSE_UNSIGNED_ITERATION(uint64_t);
244	PARSE_UNSIGNED_ITERATION(uint64_t);
245	PARSE_UNSIGNED_ITERATION(uint64_t);
246	PARSE_UNSIGNED_ITERATION(uint64_t);
247
248	PARSE_UNSIGNED_ITERATION(uint64_t);
249	PARSE_UNSIGNED_ITERATION(uint64_t);
250	PARSE_UNSIGNED_ITERATION(uint64_t);
251	PARSE_UNSIGNED_ITERATION(uint64_t);
252	PARSE_UNSIGNED_ITERATION(uint64_t);
253
254	PARSE_UNSIGNED_ITERATION(uint64_t);
255	PARSE_UNSIGNED_ITERATION(uint64_t);
256	PARSE_UNSIGNED_ITERATION(uint64_t);
257	PARSE_UNSIGNED_ITERATION(uint64_t);
258
259	PARSE_UNSIGNED_ITERATION_LAST(uint64_t);
260	*out = result;
261	return true;
262	}
263
264	#undef PARSE_UNSIGNED_ITERATION
265	#undef PARSE_UNSIGNED_ITERATION_LAST
266
267	} // namespace detail
268
269	template <class ARROW_TYPE>
270	class StringToUnsignedIntConverterMixin {
271	public:
272	using value_type = typename ARROW_TYPE::c_type;
273
274	bool operator()(const char* s, size_t length, value_type* out) {
275	if (ARROW_PREDICT_FALSE(length == `0`)) {
276	return false;
277	}
278	// Skip leading zeros
279	while (length > `0` && *s == `'0'`) {
280	length--;
281	s++;
282	}
283	return detail::ParseUnsigned(s, length, out);
284	}
285	};
286
287	template <>
288	class StringConverter<UInt8Type> : public StringToUnsignedIntConverterMixin<UInt8Type> {};
289
290	template <>
291	class StringConverter<UInt16Type> : public StringToUnsignedIntConverterMixin<UInt16Type> {
292	};
293
294	template <>
295	class StringConverter<UInt32Type> : public StringToUnsignedIntConverterMixin<UInt32Type> {
296	};
297
298	template <>
299	class StringConverter<UInt64Type> : public StringToUnsignedIntConverterMixin<UInt64Type> {
300	};
301
302	template <class ARROW_TYPE>
303	class StringToSignedIntConverterMixin {
304	public:
305	using value_type = typename ARROW_TYPE::c_type;
306	using unsigned_type = typename std::make_unsigned<value_type>::type;
307
308	bool operator()(const char* s, size_t length, value_type* out) {
309	static constexpr unsigned_type max_positive =
310	static_cast<unsigned_type>(std::numeric_limits<value_type>::max());
311	// Assuming two's complement
312	static constexpr unsigned_type max_negative = max_positive + `1`;
313	bool negative = false;
314	unsigned_type unsigned_value = `0`;
315
316	if (ARROW_PREDICT_FALSE(length == `0`)) {
317	return false;
318	}
319	if (*s == `'-'`) {
320	negative = true;
321	s++;
322	if (--length == `0`) {
323	return false;
324	}
325	}
326	// Skip leading zeros
327	while (length > `0` && *s == `'0'`) {
328	length--;
329	s++;
330	}
331	if (!ARROW_PREDICT_TRUE(detail::ParseUnsigned(s, length, &unsigned_value))) {
332	return false;
333	}
334	if (negative) {
335	if (ARROW_PREDICT_FALSE(unsigned_value > max_negative)) {
336	return false;
337	}
338	// To avoid both compiler warnings (with unsigned negation)
339	// and undefined behaviour (with signed negation overflow),
340	// use the expanded formula for 2's complement negation.
341	out = static_cast*<value_type>(~unsigned_value + `1`);
342	} else {
343	if (ARROW_PREDICT_FALSE(unsigned_value > max_positive)) {
344	return false;
345	}
346	out = static_cast*<value_type>(unsigned_value);
347	}
348	return true;
349	}
350	};
351
352	template <>
353	class StringConverter<Int8Type> : public StringToSignedIntConverterMixin<Int8Type> {};
354
355	template <>
356	class StringConverter<Int16Type> : public StringToSignedIntConverterMixin<Int16Type> {};
357
358	template <>
359	class StringConverter<Int32Type> : public StringToSignedIntConverterMixin<Int32Type> {};
360
361	template <>
362	class StringConverter<Int64Type> : public StringToSignedIntConverterMixin<Int64Type> {};
363
364	template <>
365	class StringConverter<TimestampType> {
366	public:
367	using value_type = TimestampType::c_type;
368
369	explicit StringConverter(const std::shared_ptr<DataType>& type)
370	: unit_(checked_cast<TimestampType*>(type.get())->unit()) {}
371
372	bool operator()(const char* s, size_t length, value_type* out) {
373	// We allow the following formats:
374	// - "YYYY-MM-DD"
375	// - "YYYY-MM-DD[ T]hh:mm:ss"
376	// - "YYYY-MM-DD[ T]hh:mm:ssZ"
377	// UTC is always assumed, and the DataType's timezone is ignored.
378	date::year_month_day ymd;
379	if (ARROW_PREDICT_FALSE(length < `10`)) {
380	return false;
381	}
382	if (length == `10`) {
383	if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
384	return false;
385	}
386	return ConvertTimePoint(date::sys_days (ymd), out);
387	}
388	if (ARROW_PREDICT_FALSE(s[`10`] != `' '`) && ARROW_PREDICT_FALSE(s[`10`] != `'T'`)) {
389	return false;
390	}
391	if (s[length - `1`] == `'Z'`) {
392	--length;
393	}
394	if (length == `19`) {
395	if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
396	return false;
397	}
398	std::chrono::duration<value_type> seconds;
399	if (ARROW_PREDICT_FALSE(!ParseHH_MM_SS(s + `11`, &seconds))) {
400	return false;
401	}
402	return ConvertTimePoint(date::sys_days (ymd) + seconds, out);
403	}
404	return false;
405	}
406
407	protected:
408	template <class TimePoint>
409	bool ConvertTimePoint(TimePoint tp, value_type* out) {
410	auto duration = tp.time_since_epoch();
411	switch (unit_) {
412	case TimeUnit::SECOND:
413	*out = std::chrono::duration_cast<std::chrono::seconds>(duration).count();
414	return true;
415	case TimeUnit::MILLI:
416	*out = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
417	return true;
418	case TimeUnit::MICRO:
419	*out = std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
420	return true;
421	case TimeUnit::NANO:
422	*out = std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
423	return true;
424	}
425	// Unreachable, but suppress compiler warning
426	assert(`0`);
427	*out = `0`;
428	return true;
429	}
430
431	bool ParseYYYY_MM_DD(const char* s, date::year_month_day* out) {
432	uint16_t year;
433	uint8_t month, day;
434	if (ARROW_PREDICT_FALSE(s[`4`] != `'-'`) \|\| ARROW_PREDICT_FALSE(s[`7`] != `'-'`)) {
435	return false;
436	}
437	if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + `0`, `4`, &year))) {
438	return false;
439	}
440	if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + `5`, `2`, &month))) {
441	return false;
442	}
443	if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + `8`, `2`, &day))) {
444	return false;
445	}
446	*out = {date::year {year}, date::month {month}, date::day {day}};
447	return out->ok();
448	}
449
450	bool ParseHH_MM_SS(const char* s, std::chrono::duration<value_type>* out) {
451	uint8_t hours, minutes, seconds;
452	if (ARROW_PREDICT_FALSE(s[`2`] != `':'`) \|\| ARROW_PREDICT_FALSE(s[`5`] != `':'`)) {
453	return false;
454	}
455	if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + `0`, `2`, &hours))) {
456	return false;
457	}
458	if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + `3`, `2`, &minutes))) {
459	return false;
460	}
461	if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + `6`, `2`, &seconds))) {
462	return false;
463	}
464	if (ARROW_PREDICT_FALSE(hours >= `24`)) {
465	return false;
466	}
467	if (ARROW_PREDICT_FALSE(minutes >= `60`)) {
468	return false;
469	}
470	if (ARROW_PREDICT_FALSE(seconds >= `60`)) {
471	return false;
472	}
473	out = std::chrono::duration<value_type>(`3600U` hours + `60U` * minutes + seconds);
474	return true;
475	}
476
477	const TimeUnit::type unit_;
478	};
479
480	} // namespace internal
481	} // namespace arrow
482
483	#endif // ARROW_UTIL_PARSING_H
484

Browse the source code of arrow/arrow/util/parsing.h