string.c++ source code [ClickHouse/contrib/capnproto/c++/src/kj/string.c++]

1	// Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
2	// Licensed under the MIT License:
3	//
4	// Permission is hereby granted, free of charge, to any person obtaining a copy
5	// of this software and associated documentation files (the "Software"), to deal
6	// in the Software without restriction, including without limitation the rights
7	// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8	// copies of the Software, and to permit persons to whom the Software is
9	// furnished to do so, subject to the following conditions:
10	//
11	// The above copyright notice and this permission notice shall be included in
12	// all copies or substantial portions of the Software.
13	//
14	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17	// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18	// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20	// THE SOFTWARE.
21
22	#include "string.h"
23	#include "debug.h"
24	#include <stdio.h>
25	#include <float.h>
26	#include <errno.h>
27	#include <stdlib.h>
28	#include <stdint.h>
29
30	namespace kj {
31
#if _MSC_VER
// Silence MSVC warning 4996 for the rest of this translation unit.
#pragma warning(disable: 4996)
// Warns that sprintf() is buffer-overrunny. We know that, it's cool.
#endif
36
37	namespace {
// Reports whether `s` begins with a hexadecimal prefix ("0x" or "0X"), optionally preceded by a
// single '-' sign. Only the prefix is examined; the digits that follow are not validated.
//
// `s` must be NUL-terminated. The short-circuit on s[0] guarantees s[1] is only read when s[0] is
// '0', so an empty string (or a lone "-") is handled without reading past the terminator.
bool isHex(const char *s) {
  if (*s == '-') s++;
  return s[0] == '0' && (s[1] == 'x' || s[1] == 'X');
}
42
43	long long parseSigned(const StringPtr& s, long long min, long long max) {
44	KJ_REQUIRE(s != nullptr, "String does not contain valid number", s) { return `0`; }
45	char *endPtr;
46	errno = `0`;
47	auto value = strtoll(s.begin(), &endPtr, isHex(s.cStr()) ? `16` : `10`);
48	KJ_REQUIRE(endPtr == s.end(), "String does not contain valid number", s) { return `0`; }
49	KJ_REQUIRE(errno != ERANGE, "Value out-of-range", s) { return `0`; }
50	KJ_REQUIRE(value >= min && value <= max, "Value out-of-range", value, min, max) { return `0`; }
51	return value;
52	}
53
54	unsigned long long parseUnsigned(const StringPtr& s, unsigned long long max) {
55	KJ_REQUIRE(s != nullptr, "String does not contain valid number", s) { return `0`; }
56	char *endPtr;
57	errno = `0`;
58	auto value = strtoull(s.begin(), &endPtr, isHex(s.cStr()) ? `16` : `10`);
59	KJ_REQUIRE(endPtr == s.end(), "String does not contain valid number", s) { return `0`; }
60	KJ_REQUIRE(errno != ERANGE, "Value out-of-range", s) { return `0`; }
61	KJ_REQUIRE(value <= max, "Value out-of-range", value, max) { return `0`; }
62	//strtoull("-1") does not fail with ERANGE
63	KJ_REQUIRE(s[`0`] != `'-'`, "Value out-of-range", s) { return `0`; }
64	return value;
65	}
66
67	template <typename T>
68	T parseInteger(const StringPtr& s) {
69	if (static_cast<T>(minValue) < `0`) {
70	long long min = static_cast<T>(minValue);
71	long long max = static_cast<T>(maxValue);
72	return static_cast<T>(parseSigned(s, min, max));
73	} else {
74	unsigned long long max = static_cast<T>(maxValue);
75	return static_cast<T>(parseUnsigned(s, max));
76	}
77	}
78
79	double parseDouble(const StringPtr& s) {
80	KJ_REQUIRE(s != nullptr, "String does not contain valid number", s) { return `0`; }
81	char *endPtr;
82	errno = `0`;
83	auto value = strtod(s.begin(), &endPtr);
84	KJ_REQUIRE(endPtr == s.end(), "String does not contain valid floating number", s) { return `0`; }
85	#if _WIN32 \|\| __CYGWIN__ \|\| __BIONIC__
86	// When Windows' strtod() parses "nan", it returns a value with the sign bit set. But, our
87	// preferred canonical value for NaN does not have the sign bit set, and all other platforms
88	// return one without the sign bit set. So, on Windows, detect NaN and return our preferred
89	// version.
90	//
91	// Cygwin seemingly does not try to emulate Linux behavior here, but rather allows Windows'
92	// behavior to leak through. (Conversely, WINE actually produces the Linux behavior despite
93	// trying to behave like Win32...)
94	//
95	// Bionic (Android) failed the unit test and so I added it to the list without investigating
96	// further.
97	if (isNaN(value)) {
98	// NaN
99	return kj::nan();
100	}
101	#endif
102	return value;
103	}
104
105	} // namespace
106
// Explicit specializations of StringPtr::parseAs<T>() for each built-in integer type. The macro
// stamps out one specialization per type, each forwarding to parseInteger<T>(); it is undefined
// again immediately after use.
#define PARSE_AS_INTEGER(T) \
    template <> T StringPtr::parseAs<T>() const { return parseInteger<T>(*this); }
PARSE_AS_INTEGER(char);
PARSE_AS_INTEGER(signed char);
PARSE_AS_INTEGER(unsigned char);
PARSE_AS_INTEGER(short);
PARSE_AS_INTEGER(unsigned short);
PARSE_AS_INTEGER(int);
PARSE_AS_INTEGER(unsigned int);
PARSE_AS_INTEGER(long);
PARSE_AS_INTEGER(unsigned long);
PARSE_AS_INTEGER(long long);
PARSE_AS_INTEGER(unsigned long long);
#undef PARSE_AS_INTEGER
// Both floating-point specializations go through parseDouble(); the float one converts the
// double result to float implicitly on return.
template <> double StringPtr::parseAs<double>() const { return parseDouble(*this); }
template <> float StringPtr::parseAs<float>() const { return parseDouble(*this); }
123
124	String heapString(size_t size) {
125	char* buffer = _::HeapArrayDisposer::allocate<char>(size + `1`);
126	buffer[size] = `'\0'`;
127	return String (buffer, size, _::HeapArrayDisposer::instance);
128	}
129
130	String heapString(const char* value, size_t size) {
131	char* buffer = _::HeapArrayDisposer::allocate<char>(size + `1`);
132	if (size != `0u`) {
133	memcpy(buffer, value, size);
134	}
135	buffer[size] = `'\0'`;
136	return String (buffer, size, _::HeapArrayDisposer::instance);
137	}
138
139	template <typename T>
140	static CappedArray<char, sizeof(T) * `2` + `1`> hexImpl(T i) {
141	// We don't use sprintf() because it's not async-signal-safe (for strPreallocated()).
142	CappedArray<char, sizeof(T) * `2` + `1`> result;
143	uint8_t reverse[sizeof(T) * `2`];
144	uint8_t* p = reverse;
145	if (i == `0`) {
146	*p++ = `0`;
147	} else {
148	while (i > `0`) {
149	*p++ = i % `16`;
150	i /= `16`;
151	}
152	}
153
154	char* p2 = result.begin();
155	while (p > reverse) {
156	p2++ = "0123456789abcdef"[--p];
157	}
158	result.setSize(p2 - result.begin());
159	return result;
160	}
161
// Defines one non-template hex() overload per unsigned integer type, each forwarding to
// hexImpl<type>(). The macro is undefined again once the overloads have been stamped out.
#define HEXIFY_INT(type) \
CappedArray<char, sizeof(type) * 2 + 1> hex(type i) { \
  return hexImpl<type>(i); \
}

HEXIFY_INT(unsigned char);
HEXIFY_INT(unsigned short);
HEXIFY_INT(unsigned int);
HEXIFY_INT(unsigned long);
HEXIFY_INT(unsigned long long);

#undef HEXIFY_INT
174
175	namespace _ { // private
176
177	StringPtr Stringifier::operator(decltype(nullptr)) const* {
178	return "nullptr";
179	}
180
181	StringPtr Stringifier::operator(bool* b) const {
182	return b ? StringPtr ("true") : StringPtr ("false");
183	}
184
185	template <typename T, typename Unsigned>
186	static CappedArray<char, sizeof(T) * `3` + `2`> stringifyImpl(T i) {
187	// We don't use sprintf() because it's not async-signal-safe (for strPreallocated()).
188	CappedArray<char, sizeof(T) * `3` + `2`> result;
189	bool negative = i < `0`;
190	Unsigned u = negative ? -i : i;
191	uint8_t reverse[sizeof(T) * `3` + `1`];
192	uint8_t* p = reverse;
193	if (u == `0`) {
194	*p++ = `0`;
195	} else {
196	while (u > `0`) {
197	*p++ = u % `10`;
198	u /= `10`;
199	}
200	}
201
202	char* p2 = result.begin();
203	if (negative) *p2++ = `'-'`;
204	while (p > reverse) {
205	p2++ = `'0'` + --p;
206	}
207	result.setSize(p2 - result.begin());
208	return result;
209	}
210
// Defines Stringifier::operator*() for each built-in integer type by forwarding to
// stringifyImpl(). The second macro argument names the unsigned type that stringifyImpl() uses
// for the magnitude: `uint` for types up to int, and the same-width unsigned type for long and
// long long. The macro is undefined again once the overloads have been stamped out.
#define STRINGIFY_INT(type, unsigned) \
CappedArray<char, sizeof(type) * 3 + 2> Stringifier::operator*(type i) const { \
  return stringifyImpl<type, unsigned>(i); \
}

STRINGIFY_INT(signed char, uint);
STRINGIFY_INT(unsigned char, uint);
STRINGIFY_INT(short, uint);
STRINGIFY_INT(unsigned short, uint);
STRINGIFY_INT(int, uint);
STRINGIFY_INT(unsigned int, uint);
STRINGIFY_INT(long, unsigned long);
STRINGIFY_INT(unsigned long, unsigned long);
STRINGIFY_INT(long long, unsigned long long);
STRINGIFY_INT(unsigned long long, unsigned long long);

#undef STRINGIFY_INT
228
229	CappedArray<char, sizeof(const void) `2` + `1`> Stringifier::operator(const* void* i) const { \
230	return hexImpl<uintptr_t>(reinterpret_cast<uintptr_t>(i));
231	}
232
233	namespace {
234
235	// ----------------------------------------------------------------------
236	// DoubleToBuffer()
237	// FloatToBuffer()
238	// Copied from Protocol Buffers, (C) Google, BSD license.
239	// Kenton wrote this code originally. The following commentary is
240	// from the original.
241	//
242	// Description: converts a double or float to a string which, if
243	// passed to NoLocaleStrtod(), will produce the exact same original double
244	// (except in case of NaN; all NaNs are considered the same value).
245	// We try to keep the string short but it's not guaranteed to be as
246	// short as possible.
247	//
248	// DoubleToBuffer() and FloatToBuffer() write the text to the given
249	// buffer and return it. The buffer must be at least
250	// kDoubleToBufferSize bytes for doubles and kFloatToBufferSize
251	// bytes for floats. kFastToBufferSize is also guaranteed to be large
252	// enough to hold either.
253	//
254	// We want to print the value without losing precision, but we also do
255	// not want to print more digits than necessary. This turns out to be
256	// trickier than it sounds. Numbers like 0.2 cannot be represented
257	// exactly in binary. If we print 0.2 with a very large precision,
258	// e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
259	// On the other hand, if we set the precision too low, we lose
260	// significant digits when printing numbers that actually need them.
261	// It turns out there is no precision value that does the right thing
262	// for all numbers.
263	//
264	// Our strategy is to first try printing with a precision that is never
265	// over-precise, then parse the result with strtod() to see if it
266	// matches. If not, we print again with a precision that will always
267	// give a precise result, but may use more digits than necessary.
268	//
269	// An arguably better strategy would be to use the algorithm described
270	// in "How to Print Floating-Point Numbers Accurately" by Steele &
271	// White, e.g. as implemented by David M. Gay's dtoa(). It turns out,
272	// however, that the following implementation is about as fast as
273	// DMG's code. Furthermore, DMG's code locks mutexes, which means it
274	// will not scale well on multi-core machines. DMG's code is slightly
275	// more accurate (in that it will never use more digits than
276	// necessary), but this is probably irrelevant for most users.
277	//
278	// Rob Pike and Ken Thompson also have an implementation of dtoa() in
279	// third_party/fmt/fltfmt.cc. Their implementation is similar to this
280	// one in that it makes guesses and then uses strtod() to check them.
281	// Their implementation is faster because they use their own code to
282	// generate the digits in the first place rather than use snprintf(),
283	// thus avoiding format string parsing overhead. However, this makes
284	// it considerably more complicated than the following implementation,
285	// and it is embedded in a larger library. If speed turns out to be
286	// an issue, we could re-implement this in terms of their
287	// implementation.
288	// ----------------------------------------------------------------------
289
#ifdef _WIN32
// MSVC has only _snprintf, not snprintf.
//
// MinGW has both snprintf and _snprintf, but they appear to be different
// functions.  The former is buggy.  When invoked like so:
//   char buffer[32];
//   snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
// it prints "1.23000e+10".  This is plainly wrong:  %g should never print
// trailing zeros after the decimal point.  For some reason this bug only
// occurs with some input values, not all.  In any case, _snprintf does the
// right thing, so we use it.
#define snprintf _snprintf
#endif
303
// Returns true exactly when `value` is a NaN.
inline bool IsNaN(double value) {
  // A NaN is the only value that fails to compare equal to itself.
  const bool equalsItself = (value == value);
  return !equalsItself;
}
308
// In practice, doubles should never need more than 24 bytes and floats
// should never need more than 14 (including null terminators), but we
// overestimate to be safe.
static const int kDoubleToBufferSize = 32;
static const int kFloatToBufferSize = 24;
314
// Returns true if `c` is a decimal digit, an exponent marker ('e' / 'E') or a sign ('+' / '-').
// The radix character is deliberately not accepted, and neither is the NUL terminator.
static inline bool IsValidFloatChar(char c) {
  return ('0' <= c && c <= '9') ||
         c == 'e' || c == 'E' ||
         c == '+' || c == '-';
}
320
321	void DelocalizeRadix(char* buffer) {
322	// Fast check: if the buffer has a normal decimal point, assume no
323	// translation is needed.
324	if (strchr(buffer, `'.'`) != NULL) return;
325
326	// Find the first unknown character.
327	while (IsValidFloatChar(*buffer)) ++buffer;
328
329	if (*buffer == `'\0'`) {
330	// No radix character found.
331	return;
332	}
333
334	// We are now pointing at the locale-specific radix character. Replace it
335	// with '.'.
336	*buffer = `'.'`;
337	++buffer;
338
339	if (!IsValidFloatChar(buffer) && buffer != `'\0'`) {
340	// It appears the radix was a multi-byte character. We need to remove the
341	// extra bytes.
342	char* target = buffer;
343	do { ++buffer; } while (!IsValidFloatChar(buffer) && buffer != `'\0'`);
344	memmove(target, buffer, strlen(buffer) + `1`);
345	}
346	}
347
// Deletes every '+' character from the NUL-terminated string `buffer`, in place.
void RemovePlus(char* buffer) {
  // Remove any + characters because they are redundant and ugly.

  for (;;) {
    buffer = strchr(buffer, '+');
    if (buffer == nullptr) {
      return;
    }
    // Shift the tail (including its terminator) down one byte over the '+'. `buffer` is left
    // pointing at the same position, so the next search resumes from the shifted text.
    memmove(buffer, buffer + 1, strlen(buffer + 1) + 1);
  }
}
359
#if _WIN32
// Strips redundant leading zeros from the exponent of the NUL-terminated number text `buffer`,
// in place (e.g. "1e012" becomes "1e12", "1e-05" becomes "1e-5"). At least one exponent digit
// is always kept, so "1e000" becomes "1e0". Text without an 'e' is left untouched.
void RemoveE0(char* buffer) {
  // Remove redundant leading 0's after an e, e.g. 1e012. Seems to appear on
  // Windows.

  // Find and skip 'e'.
  char* ptr = strchr(buffer, 'e');
  if (ptr == nullptr) return;
  ++ptr;

  // Skip '-'.
  if (*ptr == '-') ++ptr;

  // Skip '0's.
  char* ptr2 = ptr;
  while (*ptr2 == '0') ++ptr2;

  // If we went past the last digit, back up one.
  if (*ptr2 < '0' || *ptr2 > '9') --ptr2;

  // Move bytes backwards.
  if (ptr2 > ptr) {
    memmove(ptr, ptr2, strlen(ptr2) + 1);
  }
}
#endif
386
387	char* DoubleToBuffer(double value, char* buffer) {
388	// DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
389	// platforms these days. Just in case some system exists where DBL_DIG
390	// is significantly larger -- and risks overflowing our buffer -- we have
391	// this assert.
392	static_assert(DBL_DIG < `20`, "DBL_DIG is too big.");
393
394	if (value == inf()) {
395	strcpy(buffer, "inf");
396	return buffer;
397	} else if (value == -inf()) {
398	strcpy(buffer, "-inf");
399	return buffer;
400	} else if (IsNaN(value)) {
401	strcpy(buffer, "nan");
402	return buffer;
403	}
404
405	int snprintf_result KJ_UNUSED =
406	snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value);
407
408	// The snprintf should never overflow because the buffer is significantly
409	// larger than the precision we asked for.
410	KJ_DASSERT(snprintf_result > `0` && snprintf_result < kDoubleToBufferSize);
411
412	// We need to make parsed_value volatile in order to force the compiler to
413	// write it out to the stack. Otherwise, it may keep the value in a
414	// register, and if it does that, it may keep it as a long double instead
415	// of a double. This long double may have extra bits that make it compare
416	// unequal to "value" even though it would be exactly equal if it were
417	// truncated to a double.
418	volatile double parsed_value = strtod(buffer, NULL);
419	if (parsed_value != value) {
420	int snprintf_result2 KJ_UNUSED =
421	snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+`2`, value);
422
423	// Should never overflow; see above.
424	KJ_DASSERT(snprintf_result2 > `0` && snprintf_result2 < kDoubleToBufferSize);
425	}
426
427	DelocalizeRadix(buffer);
428	RemovePlus(buffer);
429	#if _WIN32
430	RemoveE0(buffer);
431	#endif // _WIN32
432	return buffer;
433	}
434
// Parses `str` as a float and stores the result in `*value`.
//
// Returns true only if `str` is non-empty, the whole string was consumed by the parse, and the
// parse did not set errno. `*value` is written in every case, including when false is returned.
// `str` must be a valid NUL-terminated string and `value` must be non-null.
bool safe_strtof(const char* str, float* value) {
  char* endptr;
  errno = 0;  // errno only gets set on errors
#if defined(_WIN32) || defined (__hpux)  // has no strtof()
  *value = static_cast<float>(strtod(str, &endptr));
#else
  *value = strtof(str, &endptr);
#endif
  // *str != 0 rejects the empty string; *endptr == 0 rejects trailing unparsed characters.
  return *str != 0 && *endptr == 0 && errno == 0;
}
445
446	char* FloatToBuffer(float value, char* buffer) {
447	// FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
448	// platforms these days. Just in case some system exists where FLT_DIG
449	// is significantly larger -- and risks overflowing our buffer -- we have
450	// this assert.
451	static_assert(FLT_DIG < `10`, "FLT_DIG is too big");
452
453	if (value == inf()) {
454	strcpy(buffer, "inf");
455	return buffer;
456	} else if (value == -inf()) {
457	strcpy(buffer, "-inf");
458	return buffer;
459	} else if (IsNaN(value)) {
460	strcpy(buffer, "nan");
461	return buffer;
462	}
463
464	int snprintf_result KJ_UNUSED =
465	snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value);
466
467	// The snprintf should never overflow because the buffer is significantly
468	// larger than the precision we asked for.
469	KJ_DASSERT(snprintf_result > `0` && snprintf_result < kFloatToBufferSize);
470
471	float parsed_value;
472	if (!safe_strtof(buffer, &parsed_value) \|\| parsed_value != value) {
473	int snprintf_result2 KJ_UNUSED =
474	snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+`2`, value);
475
476	// Should never overflow; see above.
477	KJ_DASSERT(snprintf_result2 > `0` && snprintf_result2 < kFloatToBufferSize);
478	}
479
480	DelocalizeRadix(buffer);
481	RemovePlus(buffer);
482	#if _WIN32
483	RemoveE0(buffer);
484	#endif // _WIN32
485	return buffer;
486	}
487
488	} // namespace
489
490	CappedArray<char, kFloatToBufferSize> Stringifier::operator(float* f) const {
491	CappedArray<char, kFloatToBufferSize> result;
492	result.setSize(strlen(FloatToBuffer(f, result.begin())));
493	return result;
494	}
495
496	CappedArray<char, kDoubleToBufferSize> Stringifier::operator(double* f) const {
497	CappedArray<char, kDoubleToBufferSize> result;
498	result.setSize(strlen(DoubleToBuffer(f, result.begin())));
499	return result;
500	}
501
502	} // namespace _ (private)
503	} // namespace kj
504

Browse the source code of ClickHouse/contrib/capnproto/c++/src/kj/string.c++