1// Copyright 2008 and onwards Google Inc. All rights reserved.
2
3#include <limits>
4using std::numeric_limits;
5
6
7// #include "base/commandlineflags.h"
8#include "base/integral_types.h"
9#include "base/logging.h"
10#include "base/macros.h"
11#include "base/strtoint.h"
12#include "split.h"
13#include "strutil.h"
14#include "util/hash/hash_jenkins_lookup2.h"
15
16static const uint32 MIX32 = 0x12b9b0a1UL; // pi; an arbitrary number
17static const uint64 MIX64 = GG_ULONGLONG(0x2b992ddfa23249d6); // more of pi
18
19// ----------------------------------------------------------------------
20// Hash32StringWithSeed()
21// Hash64StringWithSeed()
22// Hash32NumWithSeed()
23// Hash64NumWithSeed()
24// These are Bob Jenkins' hash functions, one for 32 bit numbers
25// and one for 64 bit numbers. Each takes a string as input and
26// a start seed. Hashing the same string with two different seeds
27// should give two independent hash values.
28// The *Num*() functions just do a single mix, in order to
29// convert the given number into something *random*.
30//
31// Note that these methods may return any value for the given size, while
32// the corresponding HashToXX() methods avoids certain reserved values.
33// ----------------------------------------------------------------------
34
35// Once again, not included, but copied from:
36// http://szl.googlecode.com/svn-history/r40/trunk/src/utilities/hashutils.cc
37// This is an Apache license...make sure this is ok
38const uint32 kPrimes32[16] ={
39 65537, 65539, 65543, 65551, 65557, 65563, 65579, 65581,
40 65587, 65599, 65609, 65617, 65629, 65633, 65647, 65651,
41};
42uint32 Hash32StringWithSeed(const char *s, uint32 len, uint32 seed) {
43 uint32 n = seed;
44 size_t prime1 = 0, prime2 = 8; // Indices into kPrimes32
45 union {
46 uint16 n;
47 char bytes[sizeof(uint16)];
48 } chunk;
49 for (const char *i = s, *const end = s + len; i != end; ) {
50 chunk.bytes[0] = *i++;
51 chunk.bytes[1] = i == end ? 0 : *i++;
52 n = n * kPrimes32[prime1++] ^ chunk.n * kPrimes32[prime2++];
53 prime1 &= 0x0F;
54 prime2 &= 0x0F;
55 }
56 return n;
57}
58extern uint64 Hash64StringWithSeed(const char *s, uint32 len, uint64 c);
59extern uint32 Hash32StringWithSeedReferenceImplementation(const char *s,
60 uint32 len, uint32 c);
61// ----------------------------------------------------------------------
62// HashTo32()
63// HashTo16()
64// HashTo8()
65// These functions take various types of input (through operator
66// overloading) and return 32, 16, and 8 bit quantities, respectively.
67// The basic rule of our hashing is: always mix(). Thus, even for
68// char outputs we cast to a uint32 and mix with two arbitrary numbers.
69// As indicated in basictypes.h, there are a few illegal hash
70// values to watch out for.
71//
72// Note that these methods avoid returning certain reserved values, while
73// the corresponding HashXXStringWithSeed() methdos may return any value.
74// ----------------------------------------------------------------------
75
76// This macro defines the HashTo32, To16, and To8 versions all in one go.
77// It takes the argument list and a command that hashes your number.
78// (For 16 and 8, we just mod retval before returning it.) Example:
79// HASH_TO((char c), Hash32NumWithSeed(c, MIX32_1))
80// evaluates to
81// uint32 retval;
82// retval = Hash32NumWithSeed(c, MIX32_1);
83// return retval == kIllegalHash32 ? retval-1 : retval;
84//
85
86inline uint32 Hash32NumWithSeed(uint32 num, uint32 c) {
87 uint32 b = 0x9e3779b9UL; // the golden ratio; an arbitrary value
88 mix(num, b, c);
89 return c;
90}
91
92inline uint64 Hash64NumWithSeed(uint64 num, uint64 c) {
93 uint64 b = GG_ULONGLONG(0xe08c1d668b756f82); // more of the golden ratio
94 mix(num, b, c);
95 return c;
96}
97
98#define HASH_TO(arglist, command) \
99inline uint32 HashTo32 arglist { \
100 uint32 retval = command; \
101 return retval == kIllegalHash32 ? retval-1 : retval; \
102} \
103inline uint16 HashTo16 arglist { \
104 uint16 retval16 = command >> 16; /* take upper 16 bits */ \
105 return retval16 == kIllegalHash16 ? retval16-1 : retval16; \
106} \
107inline unsigned char HashTo8 arglist { \
108 unsigned char retval8 = command >> 24; /* take upper 8 bits */ \
109 return retval8 == kIllegalHash8 ? retval8-1 : retval8; \
110}
111
112// This defines:
113// HashToXX(char *s, int slen);
114// HashToXX(char c);
115// etc
116
117HASH_TO((const char *s, uint32 slen), Hash32StringWithSeed(s, slen, MIX32))
118HASH_TO((const wchar_t *s, uint32 slen),
119 Hash32StringWithSeed(reinterpret_cast<const char*>(s),
120 sizeof(wchar_t) * slen,
121 MIX32))
122HASH_TO((char c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32))
123HASH_TO((schar c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32))
124HASH_TO((uint16 c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32))
125HASH_TO((int16 c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32))
126HASH_TO((uint32 c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32))
127HASH_TO((int32 c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32))
128HASH_TO((uint64 c), static_cast<uint32>(Hash64NumWithSeed(c, MIX64) >> 32))
129HASH_TO((int64 c), static_cast<uint32>(Hash64NumWithSeed(c, MIX64) >> 32))
130
131#undef HASH_TO // clean up the macro space
132
133
134// HASH_NAMESPACE_DECLARATION_START
135namespace __gnu_cxx {
136
137#if defined(__GNUC__)
138// Use our nice hash function for strings
139template<class _CharT, class _Traits, class _Alloc>
140struct hash<basic_string<_CharT, _Traits, _Alloc> > {
141 size_t operator()(const basic_string<_CharT, _Traits, _Alloc>& k) const {
142 return HashTo32(k.data(), static_cast<uint32>(k.length()));
143 }
144};
145
146// they don't define a hash for const string at all
147template<> struct hash<const string> {
148 size_t operator()(const string& k) const {
149 return HashTo32(k.data(), static_cast<uint32>(k.length()));
150 }
151};
152#endif // __GNUC__
153
154// MSVC's STL requires an ever-so slightly different decl
155#if defined STL_MSVC
156template<> struct hash<char const*> : PortableHashBase {
157 size_t operator()(char const* const k) const {
158 return HashTo32(k, strlen(k));
159 }
160 // Less than operator:
161 bool operator()(char const* const a, char const* const b) const {
162 return strcmp(a, b) < 0;
163 }
164};
165
166template<> struct hash<string> : PortableHashBase {
167 size_t operator()(const string& k) const {
168 return HashTo32(k.data(), k.length());
169 }
170 // Less than operator:
171 bool operator()(const string& a, const string& b) const {
172 return a < b;
173 }
174};
175
176#endif
177
178} // hash namespace
179
180
181namespace {
182// NOTE(user): we have to implement our own interator because
183// insert_iterator<set<string> > does not instantiate without
184// errors, perhaps since string != std::string.
185// This is not a fully functional iterator, but is
186// sufficient for SplitStringToIteratorUsing().
187template <typename T>
188struct simple_insert_iterator {
189 T* t_;
190 simple_insert_iterator(T* t) : t_(t) { }
191
192 simple_insert_iterator<T>& operator=(const typename T::value_type& value) {
193 t_->insert(value);
194 return *this;
195 }
196
197 simple_insert_iterator<T>& operator*() { return *this; }
198 simple_insert_iterator<T>& operator++() { return *this; }
199 simple_insert_iterator<T>& operator++(int) { return *this; }
200};
201
202// Used to populate a hash_map out of pairs of consecutive strings in
203// SplitStringToIterator{Using|AllowEmpty}().
204template <typename T>
205struct simple_hash_map_iterator {
206 typedef hash_map<T, T> hashmap;
207 hashmap* t;
208 bool even;
209 typename hashmap::iterator curr;
210
211 simple_hash_map_iterator(hashmap* t_init) : t(t_init), even(true) {
212 curr = t->begin();
213 }
214
215 simple_hash_map_iterator<T>& operator=(const T& value) {
216 if (even) {
217 curr = t->insert(make_pair(value, T())).first;
218 } else {
219 curr->second = value;
220 }
221 even = !even;
222 return *this;
223 }
224
225 simple_hash_map_iterator<T>& operator*() { return *this; }
226 simple_hash_map_iterator<T>& operator++() { return *this; }
227 simple_hash_map_iterator<T>& operator++(int i) { return *this; }
228};
229
230} // anonymous namespace
231
232// ----------------------------------------------------------------------
233// SplitStringIntoNPiecesAllowEmpty()
234// SplitStringToIteratorAllowEmpty()
235// Split a string using a character delimiter. Append the components
236// to 'result'. If there are consecutive delimiters, this function
237// will return corresponding empty strings. The string is split into
238// at most the specified number of pieces greedily. This means that the
239// last piece may possibly be split further. To split into as many pieces
240// as possible, specify 0 as the number of pieces.
241//
242// If "full" is the empty string, yields an empty string as the only value.
243//
244// If "pieces" is negative for some reason, it returns the whole string
245// ----------------------------------------------------------------------
246template <typename StringType, typename ITR>
247static inline
248void SplitStringToIteratorAllowEmpty(const StringType& full,
249 const char* delim,
250 int pieces,
251 ITR& result) {
252 string::size_type begin_index, end_index;
253 begin_index = 0;
254
255 for (int i=0; (i < pieces-1) || (pieces == 0); i++) {
256 end_index = full.find_first_of(delim, begin_index);
257 if (end_index == string::npos) {
258 *result++ = full.substr(begin_index);
259 return;
260 }
261 *result++ = full.substr(begin_index, (end_index - begin_index));
262 begin_index = end_index + 1;
263 }
264 *result++ = full.substr(begin_index);
265}
266
267void SplitStringIntoNPiecesAllowEmpty(const string& full,
268 const char* delim,
269 int pieces,
270 vector<string>* result) {
271 back_insert_iterator<vector<string> > it(*result);
272 SplitStringToIteratorAllowEmpty(full, delim, pieces, it);
273}
274
275// ----------------------------------------------------------------------
276// SplitStringAllowEmpty
277// SplitStringToHashsetAllowEmpty
278// SplitStringToSetAllowEmpty
279// SplitStringToHashmapAllowEmpty
280// Split a string using a character delimiter. Append the components
281// to 'result'. If there are consecutive delimiters, this function
282// will return corresponding empty strings.
283// ----------------------------------------------------------------------
284void SplitStringAllowEmpty(const string& full, const char* delim,
285 vector<string>* result) {
286 back_insert_iterator<vector<string> > it(*result);
287 SplitStringToIteratorAllowEmpty(full, delim, 0, it);
288}
289
290void SplitStringToHashsetAllowEmpty(const string& full, const char* delim,
291 hash_set<string>* result) {
292 simple_insert_iterator<hash_set<string> > it(result);
293 SplitStringToIteratorAllowEmpty(full, delim, 0, it);
294}
295
296void SplitStringToSetAllowEmpty(const string& full, const char* delim,
297 set<string>* result) {
298 simple_insert_iterator<set<string> > it(result);
299 SplitStringToIteratorAllowEmpty(full, delim, 0, it);
300}
301
302void SplitStringToHashmapAllowEmpty(const string& full, const char* delim,
303 hash_map<string, string>* result) {
304 simple_hash_map_iterator<string> it(result);
305 SplitStringToIteratorAllowEmpty(full, delim, 0, it);
306}
307
308// If we know how much to allocate for a vector of strings, we can
309// allocate the vector<string> only once and directly to the right size.
310// This saves in between 33-66 % of memory space needed for the result,
311// and runs faster in the microbenchmarks.
312//
313// The reserve is only implemented for the single character delim.
314//
315// The implementation for counting is cut-and-pasted from
316// SplitStringToIteratorUsing. I could have written my own counting iterator,
317// and use the existing template function, but probably this is more clear
318// and more sure to get optimized to reasonable code.
319static int CalculateReserveForVector(const string& full, const char* delim) {
320 int count = 0;
321 if (delim[0] != '\0' && delim[1] == '\0') {
322 // Optimize the common case where delim is a single character.
323 char c = delim[0];
324 const char* p = full.data();
325 const char* end = p + full.size();
326 while (p != end) {
327 if (*p == c) { // This could be optimized with hasless(v,1) trick.
328 ++p;
329 } else {
330 while (++p != end && *p != c) {
331 // Skip to the next occurence of the delimiter.
332 }
333 ++count;
334 }
335 }
336 }
337 return count;
338}
339
340// ----------------------------------------------------------------------
341// SplitStringUsing()
342// SplitStringToHashsetUsing()
343// SplitStringToSetUsing()
344// SplitStringToHashmapUsing()
345// Split a string using a character delimiter. Append the components
346// to 'result'.
347//
348// Note: For multi-character delimiters, this routine will split on *ANY* of
349// the characters in the string, not the entire string as a single delimiter.
350// ----------------------------------------------------------------------
351template <typename StringType, typename ITR>
352static inline
353void SplitStringToIteratorUsing(const StringType& full,
354 const char* delim,
355 ITR& result) {
356 // Optimize the common case where delim is a single character.
357 if (delim[0] != '\0' && delim[1] == '\0') {
358 char c = delim[0];
359 const char* p = full.data();
360 const char* end = p + full.size();
361 while (p != end) {
362 if (*p == c) {
363 ++p;
364 } else {
365 const char* start = p;
366 while (++p != end && *p != c) {
367 // Skip to the next occurence of the delimiter.
368 }
369 *result++ = StringType(start, p - start);
370 }
371 }
372 return;
373 }
374
375 string::size_type begin_index, end_index;
376 begin_index = full.find_first_not_of(delim);
377 while (begin_index != string::npos) {
378 end_index = full.find_first_of(delim, begin_index);
379 if (end_index == string::npos) {
380 *result++ = full.substr(begin_index);
381 return;
382 }
383 *result++ = full.substr(begin_index, (end_index - begin_index));
384 begin_index = full.find_first_not_of(delim, end_index);
385 }
386}
387
388void SplitStringUsing(const string& full,
389 const char* delim,
390 vector<string>* result) {
391 result->reserve(result->size() + CalculateReserveForVector(full, delim));
392 back_insert_iterator< vector<string> > it(*result);
393 SplitStringToIteratorUsing(full, delim, it);
394}
395
396void SplitStringToHashsetUsing(const string& full, const char* delim,
397 hash_set<string>* result) {
398 simple_insert_iterator<hash_set<string> > it(result);
399 SplitStringToIteratorUsing(full, delim, it);
400}
401
402void SplitStringToSetUsing(const string& full, const char* delim,
403 set<string>* result) {
404 simple_insert_iterator<set<string> > it(result);
405 SplitStringToIteratorUsing(full, delim, it);
406}
407
408void SplitStringToHashmapUsing(const string& full, const char* delim,
409 hash_map<string, string>* result) {
410 simple_hash_map_iterator<string> it(result);
411 SplitStringToIteratorUsing(full, delim, it);
412}
413
414// ----------------------------------------------------------------------
415// SplitOneIntToken()
416// SplitOneInt32Token()
417// SplitOneUint32Token()
418// SplitOneInt64Token()
419// SplitOneUint64Token()
420// SplitOneDoubleToken()
421// SplitOneFloatToken()
422// SplitOneDecimalIntToken()
423// SplitOneDecimalInt32Token()
424// SplitOneDecimalUint32Token()
425// SplitOneDecimalInt64Token()
426// SplitOneDecimalUint64Token()
427// SplitOneHexUint32Token()
428// SplitOneHexUint64Token()
429// Mainly a stringified wrapper around strtol/strtoul/strtod
430// ----------------------------------------------------------------------
431// Curried functions for the macro below
432static inline long strto32_0(const char * source, char ** end) {
433 return strto32(source, end, 0); }
434static inline unsigned long strtou32_0(const char * source, char ** end) {
435 return strtou32(source, end, 0); }
436static inline int64 strto64_0(const char * source, char ** end) {
437 return strto64(source, end, 0); }
438static inline uint64 strtou64_0(const char * source, char ** end) {
439 return strtou64(source, end, 0); }
440static inline long strto32_10(const char * source, char ** end) {
441 return strto32(source, end, 10); }
442static inline unsigned long strtou32_10(const char * source, char ** end) {
443 return strtou32(source, end, 10); }
444static inline int64 strto64_10(const char * source, char ** end) {
445 return strto64(source, end, 10); }
446static inline uint64 strtou64_10(const char * source, char ** end) {
447 return strtou64(source, end, 10); }
448static inline uint32 strtou32_16(const char * source, char ** end) {
449 return strtou32(source, end, 16); }
450static inline uint64 strtou64_16(const char * source, char ** end) {
451 return strtou64(source, end, 16); }
452
453#define DEFINE_SPLIT_ONE_NUMBER_TOKEN(name, type, function) \
454bool SplitOne##name##Token(const char ** source, const char * delim, \
455 type * value) { \
456 assert(source); \
457 assert(delim); \
458 assert(value); \
459 if (!*source) \
460 return false; \
461 /* Parse int */ \
462 char * end; \
463 *value = function(*source, &end); \
464 if (end == *source) \
465 return false; /* number not present at start of string */ \
466 if (end[0] && !strchr(delim, end[0])) \
467 return false; /* Garbage characters after int */ \
468 /* Advance past token */ \
469 if (*end != '\0') \
470 *source = const_cast<const char *>(end+1); \
471 else \
472 *source = NULL; \
473 return true; \
474}
475
476DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int, int, strto32_0)
477DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int32, int32, strto32_0)
478DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint32, uint32, strtou32_0)
479DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int64, int64, strto64_0)
480DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint64, uint64, strtou64_0)
481DEFINE_SPLIT_ONE_NUMBER_TOKEN(Double, double, strtod)
482#ifdef COMPILER_MSVC // has no strtof()
483// Note: does an implicit cast to float.
484DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtod)
485#else
486DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtof)
487#endif
488DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt, int, strto32_10)
489DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt32, int32, strto32_10)
490DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint32, uint32, strtou32_10)
491DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt64, int64, strto64_10)
492DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint64, uint64, strtou64_10)
493DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint32, uint32, strtou32_16)
494DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint64, uint64, strtou64_16)
495