split.cc source code [Aerospike/modules/s2-geometry-library/geometry/strings/split.cc]

1	// Copyright 2008 and onwards Google Inc. All rights reserved.
2
3	#include <limits>
4	using std::numeric_limits;
5
6
7	// #include "base/commandlineflags.h"
8	#include "base/integral_types.h"
9	#include "base/logging.h"
10	#include "base/macros.h"
11	#include "base/strtoint.h"
12	#include "split.h"
13	#include "strutil.h"
14	#include "util/hash/hash_jenkins_lookup2.h"
15
16	static const uint32 MIX32 = `0x12b9b0a1UL`; // pi; an arbitrary number
17	static const uint64 MIX64 = GG_ULONGLONG(`0x2b992ddfa23249d6`); // more of pi
18
19	// ----------------------------------------------------------------------
20	// Hash32StringWithSeed()
21	// Hash64StringWithSeed()
22	// Hash32NumWithSeed()
23	// Hash64NumWithSeed()
24	// These are Bob Jenkins' hash functions, one for 32 bit numbers
25	// and one for 64 bit numbers. Each takes a string as input and
26	// a start seed. Hashing the same string with two different seeds
27	// should give two independent hash values.
28	// The Num() functions just do a single mix, in order to
29	// convert the given number into something random.
30	//
31	// Note that these methods may return any value for the given size, while
32	// the corresponding HashToXX() methods avoids certain reserved values.
33	// ----------------------------------------------------------------------
34
35	// Once again, not included, but copied from:
36	// http://szl.googlecode.com/svn-history/r40/trunk/src/utilities/hashutils.cc
37	// This is an Apache license...make sure this is ok
38	const uint32 kPrimes32[`16`] ={
39	`65537`, `65539`, `65543`, `65551`, `65557`, `65563`, `65579`, `65581`,
40	`65587`, `65599`, `65609`, `65617`, `65629`, `65633`, `65647`, `65651`,
41	};
42	uint32 Hash32StringWithSeed(const char *s, uint32 len, uint32 seed) {
43	uint32 n = seed;
44	size_t prime1 = `0`, prime2 = `8`; // Indices into kPrimes32
45	union {
46	uint16 n;
47	char bytes[sizeof(uint16)];
48	} chunk;
49	for (const char i = s, const end = s + len; i != end; ) {
50	chunk.bytes[`0`] = *i++;
51	chunk.bytes[`1`] = i == end ? `0` : *i++;
52	n = n * kPrimes32[prime1++] ^ chunk.n * kPrimes32[prime2++];
53	prime1 &= `0x0F`;
54	prime2 &= `0x0F`;
55	}
56	return n;
57	}
58	extern uint64 Hash64StringWithSeed(const char *s, uint32 len, uint64 c);
59	extern uint32 Hash32StringWithSeedReferenceImplementation(const char *s,
60	uint32 len, uint32 c);
61	// ----------------------------------------------------------------------
62	// HashTo32()
63	// HashTo16()
64	// HashTo8()
65	// These functions take various types of input (through operator
66	// overloading) and return 32, 16, and 8 bit quantities, respectively.
67	// The basic rule of our hashing is: always mix(). Thus, even for
68	// char outputs we cast to a uint32 and mix with two arbitrary numbers.
69	// As indicated in basictypes.h, there are a few illegal hash
70	// values to watch out for.
71	//
72	// Note that these methods avoid returning certain reserved values, while
73	// the corresponding HashXXStringWithSeed() methdos may return any value.
74	// ----------------------------------------------------------------------
75
76	// This macro defines the HashTo32, To16, and To8 versions all in one go.
77	// It takes the argument list and a command that hashes your number.
78	// (For 16 and 8, we just mod retval before returning it.) Example:
79	// HASH_TO((char c), Hash32NumWithSeed(c, MIX32_1))
80	// evaluates to
81	// uint32 retval;
82	// retval = Hash32NumWithSeed(c, MIX32_1);
83	// return retval == kIllegalHash32 ? retval-1 : retval;
84	//
85
86	inline uint32 Hash32NumWithSeed(uint32 num, uint32 c) {
87	uint32 b = `0x9e3779b9UL`; // the golden ratio; an arbitrary value
88	mix(num, b, c);
89	return c;
90	}
91
92	inline uint64 Hash64NumWithSeed(uint64 num, uint64 c) {
93	uint64 b = GG_ULONGLONG(`0xe08c1d668b756f82`); // more of the golden ratio
94	mix(num, b, c);
95	return c;
96	}
97
98	#define HASH_TO(arglist, command) \
99	inline uint32 HashTo32 arglist { \
100	uint32 retval = command; \
101	return retval == kIllegalHash32 ? retval-1 : retval; \
102	} \
103	inline uint16 HashTo16 arglist { \
104	uint16 retval16 = command >> 16; /* take upper 16 bits */ \
105	return retval16 == kIllegalHash16 ? retval16-1 : retval16; \
106	} \
107	inline unsigned char HashTo8 arglist { \
108	unsigned char retval8 = command >> 24; /* take upper 8 bits */ \
109	return retval8 == kIllegalHash8 ? retval8-1 : retval8; \
110	}
111
112	// This defines:
113	// HashToXX(char s, int slen);*
114	// HashToXX(char c);
115	// etc
116
117	HASH_TO((const char *s, uint32 slen), Hash32StringWithSeed(s, slen, MIX32))
118	HASH_TO((const wchar_t *s, uint32 slen),
119	Hash32StringWithSeed(reinterpret_cast<const char*>(s),
120	sizeof(wchar_t) * slen,
121	MIX32))
122	HASH_TO((char c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32))
123	HASH_TO((schar c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32))
124	HASH_TO((uint16 c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32))
125	HASH_TO((int16 c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32))
126	HASH_TO((uint32 c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32))
127	HASH_TO((int32 c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32))
128	HASH_TO((uint64 c), static_cast<uint32>(Hash64NumWithSeed(c, MIX64) >> `32`))
129	HASH_TO((int64 c), static_cast<uint32>(Hash64NumWithSeed(c, MIX64) >> `32`))
130
131	#undef HASH_TO // clean up the macro space
132
133
134	// HASH_NAMESPACE_DECLARATION_START
135	namespace __gnu_cxx {
136
137	#if defined(__GNUC__)
138	// Use our nice hash function for strings
139	template<class _CharT, class _Traits, class _Alloc>
140	struct hash<basic_string<_CharT, _Traits, _Alloc> > {
141	size_t operator()(const basic_string<_CharT, _Traits, _Alloc>& k) const {
142	return HashTo32(k.data(), static_cast<uint32>(k.length()));
143	}
144	};
145
146	// they don't define a hash for const string at all
147	template<> struct hash<const string> {
148	size_t operator()(const string& k) const {
149	return HashTo32(k.data(), static_cast<uint32>(k.length()));
150	}
151	};
152	#endif // __GNUC__
153
154	// MSVC's STL requires an ever-so slightly different decl
155	#if defined STL_MSVC
156	template<> struct hash<char const*> : PortableHashBase {
157	size_t operator()(char const* const k) const {
158	return HashTo32(k, strlen(k));
159	}
160	// Less than operator:
161	bool operator()(char const* const a, char const* const b) const {
162	return strcmp(a, b) < `0`;
163	}
164	};
165
166	template<> struct hash<string> : PortableHashBase {
167	size_t operator()(const string& k) const {
168	return HashTo32(k.data(), k.length());
169	}
170	// Less than operator:
171	bool operator()(const string& a, const string& b) const {
172	return a < b;
173	}
174	};
175
176	#endif
177
178	} // hash namespace
179
180
181	namespace {
182	// NOTE(user): we have to implement our own interator because
183	// insert_iterator<set<string> > does not instantiate without
184	// errors, perhaps since string != std::string.
185	// This is not a fully functional iterator, but is
186	// sufficient for SplitStringToIteratorUsing().
187	template <typename T>
188	struct simple_insert_iterator {
189	T* t_;
190	simple_insert_iterator(T* t) : t_(t) { }
191
192	simple_insert_iterator<T>& operator=(const typename T::value_type& value) {
193	t_->insert(value);
194	return *this;
195	}
196
197	simple_insert_iterator<T>& operator() { return* *this; }
198	simple_insert_iterator<T>& operator++() { return *this; }
199	simple_insert_iterator<T>& operator++(int) { return *this; }
200	};
201
202	// Used to populate a hash_map out of pairs of consecutive strings in
203	// SplitStringToIterator{Using\|AllowEmpty}().
204	template <typename T>
205	struct simple_hash_map_iterator {
206	typedef hash_map<T, T> hashmap;
207	hashmap* t;
208	bool even;
209	typename hashmap::iterator curr;
210
211	simple_hash_map_iterator(hashmap* t_init) : t(t_init), even(true) {
212	curr = t->begin();
213	}
214
215	simple_hash_map_iterator<T>& operator=(const T& value) {
216	if (even) {
217	curr = t->insert(make_pair(value, T())).first;
218	} else {
219	curr->second = value;
220	}
221	even = !even;
222	return *this;
223	}
224
225	simple_hash_map_iterator<T>& operator() { return* *this; }
226	simple_hash_map_iterator<T>& operator++() { return *this; }
227	simple_hash_map_iterator<T>& operator++(int i) { return *this; }
228	};
229
230	} // anonymous namespace
231
232	// ----------------------------------------------------------------------
233	// SplitStringIntoNPiecesAllowEmpty()
234	// SplitStringToIteratorAllowEmpty()
235	// Split a string using a character delimiter. Append the components
236	// to 'result'. If there are consecutive delimiters, this function
237	// will return corresponding empty strings. The string is split into
238	// at most the specified number of pieces greedily. This means that the
239	// last piece may possibly be split further. To split into as many pieces
240	// as possible, specify 0 as the number of pieces.
241	//
242	// If "full" is the empty string, yields an empty string as the only value.
243	//
244	// If "pieces" is negative for some reason, it returns the whole string
245	// ----------------------------------------------------------------------
246	template <typename StringType, typename ITR>
247	static inline
248	void SplitStringToIteratorAllowEmpty(const StringType& full,
249	const char* delim,
250	int pieces,
251	ITR& result) {
252	string::size_type begin_index, end_index;
253	begin_index = `0`;
254
255	for (int i=`0`; (i < pieces-`1`) \|\| (pieces == `0`); i++) {
256	end_index = full.find_first_of(delim, begin_index);
257	if (end_index == string::npos) {
258	*result++ = full.substr(begin_index);
259	return;
260	}
261	*result++ = full.substr(begin_index, (end_index - begin_index));
262	begin_index = end_index + `1`;
263	}
264	*result++ = full.substr(begin_index);
265	}
266
267	void SplitStringIntoNPiecesAllowEmpty(const string& full,
268	const char* delim,
269	int pieces,
270	vector<string>* result) {
271	back_insert_iterator<vector<string> > it(*result);
272	SplitStringToIteratorAllowEmpty(full, delim, pieces, it);
273	}
274
275	// ----------------------------------------------------------------------
276	// SplitStringAllowEmpty
277	// SplitStringToHashsetAllowEmpty
278	// SplitStringToSetAllowEmpty
279	// SplitStringToHashmapAllowEmpty
280	// Split a string using a character delimiter. Append the components
281	// to 'result'. If there are consecutive delimiters, this function
282	// will return corresponding empty strings.
283	// ----------------------------------------------------------------------
284	void SplitStringAllowEmpty(const string& full, const char* delim,
285	vector<string>* result) {
286	back_insert_iterator<vector<string> > it(*result);
287	SplitStringToIteratorAllowEmpty(full, delim, `0`, it);
288	}
289
290	void SplitStringToHashsetAllowEmpty(const string& full, const char* delim,
291	hash_set<string>* result) {
292	simple_insert_iterator<hash_set<string> > it(result);
293	SplitStringToIteratorAllowEmpty(full, delim, `0`, it);
294	}
295
296	void SplitStringToSetAllowEmpty(const string& full, const char* delim,
297	set<string>* result) {
298	simple_insert_iterator<set<string> > it(result);
299	SplitStringToIteratorAllowEmpty(full, delim, `0`, it);
300	}
301
302	void SplitStringToHashmapAllowEmpty(const string& full, const char* delim,
303	hash_map<string, string>* result) {
304	simple_hash_map_iterator<string> it(result);
305	SplitStringToIteratorAllowEmpty(full, delim, `0`, it);
306	}
307
308	// If we know how much to allocate for a vector of strings, we can
309	// allocate the vector<string> only once and directly to the right size.
310	// This saves in between 33-66 % of memory space needed for the result,
311	// and runs faster in the microbenchmarks.
312	//
313	// The reserve is only implemented for the single character delim.
314	//
315	// The implementation for counting is cut-and-pasted from
316	// SplitStringToIteratorUsing. I could have written my own counting iterator,
317	// and use the existing template function, but probably this is more clear
318	// and more sure to get optimized to reasonable code.
319	static int CalculateReserveForVector(const string& full, const char* delim) {
320	int count = `0`;
321	if (delim[`0`] != `'\0'` && delim[`1`] == `'\0'`) {
322	// Optimize the common case where delim is a single character.
323	char c = delim[`0`];
324	const char* p = full.data();
325	const char* end = p + full.size();
326	while (p != end) {
327	if (p == c) { // This could be optimized with hasless(v,1) trick.*
328	++p;
329	} else {
330	while (++p != end && *p != c) {
331	// Skip to the next occurence of the delimiter.
332	}
333	++count;
334	}
335	}
336	}
337	return count;
338	}
339
340	// ----------------------------------------------------------------------
341	// SplitStringUsing()
342	// SplitStringToHashsetUsing()
343	// SplitStringToSetUsing()
344	// SplitStringToHashmapUsing()
345	// Split a string using a character delimiter. Append the components
346	// to 'result'.
347	//
348	// Note: For multi-character delimiters, this routine will split on ANY* of*
349	// the characters in the string, not the entire string as a single delimiter.
350	// ----------------------------------------------------------------------
351	template <typename StringType, typename ITR>
352	static inline
353	void SplitStringToIteratorUsing(const StringType& full,
354	const char* delim,
355	ITR& result) {
356	// Optimize the common case where delim is a single character.
357	if (delim[`0`] != `'\0'` && delim[`1`] == `'\0'`) {
358	char c = delim[`0`];
359	const char* p = full.data();
360	const char* end = p + full.size();
361	while (p != end) {
362	if (*p == c) {
363	++p;
364	} else {
365	const char* start = p;
366	while (++p != end && *p != c) {
367	// Skip to the next occurence of the delimiter.
368	}
369	*result++ = StringType(start, p - start);
370	}
371	}
372	return;
373	}
374
375	string::size_type begin_index, end_index;
376	begin_index = full.find_first_not_of(delim);
377	while (begin_index != string::npos) {
378	end_index = full.find_first_of(delim, begin_index);
379	if (end_index == string::npos) {
380	*result++ = full.substr(begin_index);
381	return;
382	}
383	*result++ = full.substr(begin_index, (end_index - begin_index));
384	begin_index = full.find_first_not_of(delim, end_index);
385	}
386	}
387
388	void SplitStringUsing(const string& full,
389	const char* delim,
390	vector<string>* result) {
391	result->reserve(result->size() + CalculateReserveForVector(full, delim));
392	back_insert_iterator< vector<string> > it(*result);
393	SplitStringToIteratorUsing(full, delim, it);
394	}
395
396	void SplitStringToHashsetUsing(const string& full, const char* delim,
397	hash_set<string>* result) {
398	simple_insert_iterator<hash_set<string> > it(result);
399	SplitStringToIteratorUsing(full, delim, it);
400	}
401
402	void SplitStringToSetUsing(const string& full, const char* delim,
403	set<string>* result) {
404	simple_insert_iterator<set<string> > it(result);
405	SplitStringToIteratorUsing(full, delim, it);
406	}
407
408	void SplitStringToHashmapUsing(const string& full, const char* delim,
409	hash_map<string, string>* result) {
410	simple_hash_map_iterator<string> it(result);
411	SplitStringToIteratorUsing(full, delim, it);
412	}
413
414	// ----------------------------------------------------------------------
415	// SplitOneIntToken()
416	// SplitOneInt32Token()
417	// SplitOneUint32Token()
418	// SplitOneInt64Token()
419	// SplitOneUint64Token()
420	// SplitOneDoubleToken()
421	// SplitOneFloatToken()
422	// SplitOneDecimalIntToken()
423	// SplitOneDecimalInt32Token()
424	// SplitOneDecimalUint32Token()
425	// SplitOneDecimalInt64Token()
426	// SplitOneDecimalUint64Token()
427	// SplitOneHexUint32Token()
428	// SplitOneHexUint64Token()
429	// Mainly a stringified wrapper around strtol/strtoul/strtod
430	// ----------------------------------------------------------------------
431	// Curried functions for the macro below
432	static inline long strto32_0(const char * source, char ** end) {
433	return strto32(source, end, `0`); }
434	static inline unsigned long strtou32_0(const char * source, char ** end) {
435	return strtou32(source, end, `0`); }
436	static inline int64 strto64_0(const char * source, char ** end) {
437	return strto64(source, end, `0`); }
438	static inline uint64 strtou64_0(const char * source, char ** end) {
439	return strtou64(source, end, `0`); }
440	static inline long strto32_10(const char * source, char ** end) {
441	return strto32(source, end, `10`); }
442	static inline unsigned long strtou32_10(const char * source, char ** end) {
443	return strtou32(source, end, `10`); }
444	static inline int64 strto64_10(const char * source, char ** end) {
445	return strto64(source, end, `10`); }
446	static inline uint64 strtou64_10(const char * source, char ** end) {
447	return strtou64(source, end, `10`); }
448	static inline uint32 strtou32_16(const char * source, char ** end) {
449	return strtou32(source, end, `16`); }
450	static inline uint64 strtou64_16(const char * source, char ** end) {
451	return strtou64(source, end, `16`); }
452
453	#define DEFINE_SPLIT_ONE_NUMBER_TOKEN(name, type, function) \
454	bool SplitOne##name##Token(const char ** source, const char * delim, \
455	type * value) { \
456	assert(source); \
457	assert(delim); \
458	assert(value); \
459	if (!*source) \
460	return false; \
461	/* Parse int */ \
462	char * end; \
463	value = function(source, &end); \
464	if (end == *source) \
465	return false; /* number not present at start of string */ \
466	if (end[0] && !strchr(delim, end[0])) \
467	return false; /* Garbage characters after int */ \
468	/* Advance past token */ \
469	if (*end != '\0') \
470	source = const_cast<const char >(end+1); \
471	else \
472	*source = NULL; \
473	return true; \
474	}
475
476	DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int, int, strto32_0)
477	DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int32, int32, strto32_0)
478	DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint32, uint32, strtou32_0)
479	DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int64, int64, strto64_0)
480	DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint64, uint64, strtou64_0)
481	DEFINE_SPLIT_ONE_NUMBER_TOKEN(Double, double, strtod)
482	#ifdef COMPILER_MSVC // has no strtof()
483	// Note: does an implicit cast to float.
484	DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtod)
485	#else
486	DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtof)
487	#endif
488	DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt, int, strto32_10)
489	DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt32, int32, strto32_10)
490	DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint32, uint32, strtou32_10)
491	DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt64, int64, strto64_10)
492	DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint64, uint64, strtou64_10)
493	DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint32, uint32, strtou32_16)
494	DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint64, uint64, strtou64_16)
495

Browse the source code of Aerospike/modules/s2-geometry-library/geometry/strings/split.cc