1 | // Copyright 2008 and onwards Google Inc. All rights reserved. |
2 | |
3 | #include <limits> |
4 | using std::numeric_limits; |
5 | |
6 | |
7 | // #include "base/commandlineflags.h" |
8 | #include "base/integral_types.h" |
9 | #include "base/logging.h" |
10 | #include "base/macros.h" |
11 | #include "base/strtoint.h" |
12 | #include "split.h" |
13 | #include "strutil.h" |
14 | #include "util/hash/hash_jenkins_lookup2.h" |
15 | |
16 | static const uint32 MIX32 = 0x12b9b0a1UL; // pi; an arbitrary number |
17 | static const uint64 MIX64 = GG_ULONGLONG(0x2b992ddfa23249d6); // more of pi |
18 | |
19 | // ---------------------------------------------------------------------- |
20 | // Hash32StringWithSeed() |
21 | // Hash64StringWithSeed() |
22 | // Hash32NumWithSeed() |
23 | // Hash64NumWithSeed() |
24 | // These are Bob Jenkins' hash functions, one for 32 bit numbers |
25 | // and one for 64 bit numbers. Each takes a string as input and |
26 | // a start seed. Hashing the same string with two different seeds |
27 | // should give two independent hash values. |
28 | // The *Num*() functions just do a single mix, in order to |
29 | // convert the given number into something *random*. |
30 | // |
31 | // Note that these methods may return any value for the given size, while |
32 | // the corresponding HashToXX() methods avoids certain reserved values. |
33 | // ---------------------------------------------------------------------- |
34 | |
35 | // Once again, not included, but copied from: |
36 | // http://szl.googlecode.com/svn-history/r40/trunk/src/utilities/hashutils.cc |
37 | // This is an Apache license...make sure this is ok |
38 | const uint32 kPrimes32[16] ={ |
39 | 65537, 65539, 65543, 65551, 65557, 65563, 65579, 65581, |
40 | 65587, 65599, 65609, 65617, 65629, 65633, 65647, 65651, |
41 | }; |
42 | uint32 Hash32StringWithSeed(const char *s, uint32 len, uint32 seed) { |
43 | uint32 n = seed; |
44 | size_t prime1 = 0, prime2 = 8; // Indices into kPrimes32 |
45 | union { |
46 | uint16 n; |
47 | char bytes[sizeof(uint16)]; |
48 | } chunk; |
49 | for (const char *i = s, *const end = s + len; i != end; ) { |
50 | chunk.bytes[0] = *i++; |
51 | chunk.bytes[1] = i == end ? 0 : *i++; |
52 | n = n * kPrimes32[prime1++] ^ chunk.n * kPrimes32[prime2++]; |
53 | prime1 &= 0x0F; |
54 | prime2 &= 0x0F; |
55 | } |
56 | return n; |
57 | } |
58 | extern uint64 Hash64StringWithSeed(const char *s, uint32 len, uint64 c); |
59 | extern uint32 Hash32StringWithSeedReferenceImplementation(const char *s, |
60 | uint32 len, uint32 c); |
61 | // ---------------------------------------------------------------------- |
62 | // HashTo32() |
63 | // HashTo16() |
64 | // HashTo8() |
65 | // These functions take various types of input (through operator |
66 | // overloading) and return 32, 16, and 8 bit quantities, respectively. |
67 | // The basic rule of our hashing is: always mix(). Thus, even for |
68 | // char outputs we cast to a uint32 and mix with two arbitrary numbers. |
69 | // As indicated in basictypes.h, there are a few illegal hash |
70 | // values to watch out for. |
71 | // |
72 | // Note that these methods avoid returning certain reserved values, while |
73 | // the corresponding HashXXStringWithSeed() methdos may return any value. |
74 | // ---------------------------------------------------------------------- |
75 | |
76 | // This macro defines the HashTo32, To16, and To8 versions all in one go. |
77 | // It takes the argument list and a command that hashes your number. |
78 | // (For 16 and 8, we just mod retval before returning it.) Example: |
79 | // HASH_TO((char c), Hash32NumWithSeed(c, MIX32_1)) |
80 | // evaluates to |
81 | // uint32 retval; |
82 | // retval = Hash32NumWithSeed(c, MIX32_1); |
83 | // return retval == kIllegalHash32 ? retval-1 : retval; |
84 | // |
85 | |
86 | inline uint32 Hash32NumWithSeed(uint32 num, uint32 c) { |
87 | uint32 b = 0x9e3779b9UL; // the golden ratio; an arbitrary value |
88 | mix(num, b, c); |
89 | return c; |
90 | } |
91 | |
92 | inline uint64 Hash64NumWithSeed(uint64 num, uint64 c) { |
93 | uint64 b = GG_ULONGLONG(0xe08c1d668b756f82); // more of the golden ratio |
94 | mix(num, b, c); |
95 | return c; |
96 | } |
97 | |
98 | #define HASH_TO(arglist, command) \ |
99 | inline uint32 HashTo32 arglist { \ |
100 | uint32 retval = command; \ |
101 | return retval == kIllegalHash32 ? retval-1 : retval; \ |
102 | } \ |
103 | inline uint16 HashTo16 arglist { \ |
104 | uint16 retval16 = command >> 16; /* take upper 16 bits */ \ |
105 | return retval16 == kIllegalHash16 ? retval16-1 : retval16; \ |
106 | } \ |
107 | inline unsigned char HashTo8 arglist { \ |
108 | unsigned char retval8 = command >> 24; /* take upper 8 bits */ \ |
109 | return retval8 == kIllegalHash8 ? retval8-1 : retval8; \ |
110 | } |
111 | |
112 | // This defines: |
113 | // HashToXX(char *s, int slen); |
114 | // HashToXX(char c); |
115 | // etc |
116 | |
117 | HASH_TO((const char *s, uint32 slen), Hash32StringWithSeed(s, slen, MIX32)) |
118 | HASH_TO((const wchar_t *s, uint32 slen), |
119 | Hash32StringWithSeed(reinterpret_cast<const char*>(s), |
120 | sizeof(wchar_t) * slen, |
121 | MIX32)) |
122 | HASH_TO((char c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32)) |
123 | HASH_TO((schar c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32)) |
124 | HASH_TO((uint16 c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32)) |
125 | HASH_TO((int16 c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32)) |
126 | HASH_TO((uint32 c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32)) |
127 | HASH_TO((int32 c), Hash32NumWithSeed(static_cast<uint32>(c), MIX32)) |
128 | HASH_TO((uint64 c), static_cast<uint32>(Hash64NumWithSeed(c, MIX64) >> 32)) |
129 | HASH_TO((int64 c), static_cast<uint32>(Hash64NumWithSeed(c, MIX64) >> 32)) |
130 | |
131 | #undef HASH_TO // clean up the macro space |
132 | |
133 | |
134 | // HASH_NAMESPACE_DECLARATION_START |
135 | namespace __gnu_cxx { |
136 | |
137 | #if defined(__GNUC__) |
138 | // Use our nice hash function for strings |
139 | template<class _CharT, class _Traits, class _Alloc> |
140 | struct hash<basic_string<_CharT, _Traits, _Alloc> > { |
141 | size_t operator()(const basic_string<_CharT, _Traits, _Alloc>& k) const { |
142 | return HashTo32(k.data(), static_cast<uint32>(k.length())); |
143 | } |
144 | }; |
145 | |
146 | // they don't define a hash for const string at all |
147 | template<> struct hash<const string> { |
148 | size_t operator()(const string& k) const { |
149 | return HashTo32(k.data(), static_cast<uint32>(k.length())); |
150 | } |
151 | }; |
152 | #endif // __GNUC__ |
153 | |
154 | // MSVC's STL requires an ever-so slightly different decl |
155 | #if defined STL_MSVC |
156 | template<> struct hash<char const*> : PortableHashBase { |
157 | size_t operator()(char const* const k) const { |
158 | return HashTo32(k, strlen(k)); |
159 | } |
160 | // Less than operator: |
161 | bool operator()(char const* const a, char const* const b) const { |
162 | return strcmp(a, b) < 0; |
163 | } |
164 | }; |
165 | |
166 | template<> struct hash<string> : PortableHashBase { |
167 | size_t operator()(const string& k) const { |
168 | return HashTo32(k.data(), k.length()); |
169 | } |
170 | // Less than operator: |
171 | bool operator()(const string& a, const string& b) const { |
172 | return a < b; |
173 | } |
174 | }; |
175 | |
176 | #endif |
177 | |
178 | } // hash namespace |
179 | |
180 | |
181 | namespace { |
182 | // NOTE(user): we have to implement our own interator because |
183 | // insert_iterator<set<string> > does not instantiate without |
184 | // errors, perhaps since string != std::string. |
185 | // This is not a fully functional iterator, but is |
186 | // sufficient for SplitStringToIteratorUsing(). |
187 | template <typename T> |
188 | struct simple_insert_iterator { |
189 | T* t_; |
190 | simple_insert_iterator(T* t) : t_(t) { } |
191 | |
192 | simple_insert_iterator<T>& operator=(const typename T::value_type& value) { |
193 | t_->insert(value); |
194 | return *this; |
195 | } |
196 | |
197 | simple_insert_iterator<T>& operator*() { return *this; } |
198 | simple_insert_iterator<T>& operator++() { return *this; } |
199 | simple_insert_iterator<T>& operator++(int) { return *this; } |
200 | }; |
201 | |
202 | // Used to populate a hash_map out of pairs of consecutive strings in |
203 | // SplitStringToIterator{Using|AllowEmpty}(). |
204 | template <typename T> |
205 | struct simple_hash_map_iterator { |
206 | typedef hash_map<T, T> hashmap; |
207 | hashmap* t; |
208 | bool even; |
209 | typename hashmap::iterator curr; |
210 | |
211 | simple_hash_map_iterator(hashmap* t_init) : t(t_init), even(true) { |
212 | curr = t->begin(); |
213 | } |
214 | |
215 | simple_hash_map_iterator<T>& operator=(const T& value) { |
216 | if (even) { |
217 | curr = t->insert(make_pair(value, T())).first; |
218 | } else { |
219 | curr->second = value; |
220 | } |
221 | even = !even; |
222 | return *this; |
223 | } |
224 | |
225 | simple_hash_map_iterator<T>& operator*() { return *this; } |
226 | simple_hash_map_iterator<T>& operator++() { return *this; } |
227 | simple_hash_map_iterator<T>& operator++(int i) { return *this; } |
228 | }; |
229 | |
230 | } // anonymous namespace |
231 | |
232 | // ---------------------------------------------------------------------- |
233 | // SplitStringIntoNPiecesAllowEmpty() |
234 | // SplitStringToIteratorAllowEmpty() |
235 | // Split a string using a character delimiter. Append the components |
236 | // to 'result'. If there are consecutive delimiters, this function |
237 | // will return corresponding empty strings. The string is split into |
238 | // at most the specified number of pieces greedily. This means that the |
239 | // last piece may possibly be split further. To split into as many pieces |
240 | // as possible, specify 0 as the number of pieces. |
241 | // |
242 | // If "full" is the empty string, yields an empty string as the only value. |
243 | // |
244 | // If "pieces" is negative for some reason, it returns the whole string |
245 | // ---------------------------------------------------------------------- |
246 | template <typename StringType, typename ITR> |
247 | static inline |
248 | void SplitStringToIteratorAllowEmpty(const StringType& full, |
249 | const char* delim, |
250 | int pieces, |
251 | ITR& result) { |
252 | string::size_type begin_index, end_index; |
253 | begin_index = 0; |
254 | |
255 | for (int i=0; (i < pieces-1) || (pieces == 0); i++) { |
256 | end_index = full.find_first_of(delim, begin_index); |
257 | if (end_index == string::npos) { |
258 | *result++ = full.substr(begin_index); |
259 | return; |
260 | } |
261 | *result++ = full.substr(begin_index, (end_index - begin_index)); |
262 | begin_index = end_index + 1; |
263 | } |
264 | *result++ = full.substr(begin_index); |
265 | } |
266 | |
267 | void SplitStringIntoNPiecesAllowEmpty(const string& full, |
268 | const char* delim, |
269 | int pieces, |
270 | vector<string>* result) { |
271 | back_insert_iterator<vector<string> > it(*result); |
272 | SplitStringToIteratorAllowEmpty(full, delim, pieces, it); |
273 | } |
274 | |
275 | // ---------------------------------------------------------------------- |
276 | // SplitStringAllowEmpty |
277 | // SplitStringToHashsetAllowEmpty |
278 | // SplitStringToSetAllowEmpty |
279 | // SplitStringToHashmapAllowEmpty |
280 | // Split a string using a character delimiter. Append the components |
281 | // to 'result'. If there are consecutive delimiters, this function |
282 | // will return corresponding empty strings. |
283 | // ---------------------------------------------------------------------- |
284 | void SplitStringAllowEmpty(const string& full, const char* delim, |
285 | vector<string>* result) { |
286 | back_insert_iterator<vector<string> > it(*result); |
287 | SplitStringToIteratorAllowEmpty(full, delim, 0, it); |
288 | } |
289 | |
290 | void SplitStringToHashsetAllowEmpty(const string& full, const char* delim, |
291 | hash_set<string>* result) { |
292 | simple_insert_iterator<hash_set<string> > it(result); |
293 | SplitStringToIteratorAllowEmpty(full, delim, 0, it); |
294 | } |
295 | |
296 | void SplitStringToSetAllowEmpty(const string& full, const char* delim, |
297 | set<string>* result) { |
298 | simple_insert_iterator<set<string> > it(result); |
299 | SplitStringToIteratorAllowEmpty(full, delim, 0, it); |
300 | } |
301 | |
302 | void SplitStringToHashmapAllowEmpty(const string& full, const char* delim, |
303 | hash_map<string, string>* result) { |
304 | simple_hash_map_iterator<string> it(result); |
305 | SplitStringToIteratorAllowEmpty(full, delim, 0, it); |
306 | } |
307 | |
308 | // If we know how much to allocate for a vector of strings, we can |
309 | // allocate the vector<string> only once and directly to the right size. |
310 | // This saves in between 33-66 % of memory space needed for the result, |
311 | // and runs faster in the microbenchmarks. |
312 | // |
313 | // The reserve is only implemented for the single character delim. |
314 | // |
315 | // The implementation for counting is cut-and-pasted from |
316 | // SplitStringToIteratorUsing. I could have written my own counting iterator, |
317 | // and use the existing template function, but probably this is more clear |
318 | // and more sure to get optimized to reasonable code. |
319 | static int CalculateReserveForVector(const string& full, const char* delim) { |
320 | int count = 0; |
321 | if (delim[0] != '\0' && delim[1] == '\0') { |
322 | // Optimize the common case where delim is a single character. |
323 | char c = delim[0]; |
324 | const char* p = full.data(); |
325 | const char* end = p + full.size(); |
326 | while (p != end) { |
327 | if (*p == c) { // This could be optimized with hasless(v,1) trick. |
328 | ++p; |
329 | } else { |
330 | while (++p != end && *p != c) { |
331 | // Skip to the next occurence of the delimiter. |
332 | } |
333 | ++count; |
334 | } |
335 | } |
336 | } |
337 | return count; |
338 | } |
339 | |
340 | // ---------------------------------------------------------------------- |
341 | // SplitStringUsing() |
342 | // SplitStringToHashsetUsing() |
343 | // SplitStringToSetUsing() |
344 | // SplitStringToHashmapUsing() |
345 | // Split a string using a character delimiter. Append the components |
346 | // to 'result'. |
347 | // |
348 | // Note: For multi-character delimiters, this routine will split on *ANY* of |
349 | // the characters in the string, not the entire string as a single delimiter. |
350 | // ---------------------------------------------------------------------- |
351 | template <typename StringType, typename ITR> |
352 | static inline |
353 | void SplitStringToIteratorUsing(const StringType& full, |
354 | const char* delim, |
355 | ITR& result) { |
356 | // Optimize the common case where delim is a single character. |
357 | if (delim[0] != '\0' && delim[1] == '\0') { |
358 | char c = delim[0]; |
359 | const char* p = full.data(); |
360 | const char* end = p + full.size(); |
361 | while (p != end) { |
362 | if (*p == c) { |
363 | ++p; |
364 | } else { |
365 | const char* start = p; |
366 | while (++p != end && *p != c) { |
367 | // Skip to the next occurence of the delimiter. |
368 | } |
369 | *result++ = StringType(start, p - start); |
370 | } |
371 | } |
372 | return; |
373 | } |
374 | |
375 | string::size_type begin_index, end_index; |
376 | begin_index = full.find_first_not_of(delim); |
377 | while (begin_index != string::npos) { |
378 | end_index = full.find_first_of(delim, begin_index); |
379 | if (end_index == string::npos) { |
380 | *result++ = full.substr(begin_index); |
381 | return; |
382 | } |
383 | *result++ = full.substr(begin_index, (end_index - begin_index)); |
384 | begin_index = full.find_first_not_of(delim, end_index); |
385 | } |
386 | } |
387 | |
388 | void SplitStringUsing(const string& full, |
389 | const char* delim, |
390 | vector<string>* result) { |
391 | result->reserve(result->size() + CalculateReserveForVector(full, delim)); |
392 | back_insert_iterator< vector<string> > it(*result); |
393 | SplitStringToIteratorUsing(full, delim, it); |
394 | } |
395 | |
396 | void SplitStringToHashsetUsing(const string& full, const char* delim, |
397 | hash_set<string>* result) { |
398 | simple_insert_iterator<hash_set<string> > it(result); |
399 | SplitStringToIteratorUsing(full, delim, it); |
400 | } |
401 | |
402 | void SplitStringToSetUsing(const string& full, const char* delim, |
403 | set<string>* result) { |
404 | simple_insert_iterator<set<string> > it(result); |
405 | SplitStringToIteratorUsing(full, delim, it); |
406 | } |
407 | |
408 | void SplitStringToHashmapUsing(const string& full, const char* delim, |
409 | hash_map<string, string>* result) { |
410 | simple_hash_map_iterator<string> it(result); |
411 | SplitStringToIteratorUsing(full, delim, it); |
412 | } |
413 | |
414 | // ---------------------------------------------------------------------- |
415 | // SplitOneIntToken() |
416 | // SplitOneInt32Token() |
417 | // SplitOneUint32Token() |
418 | // SplitOneInt64Token() |
419 | // SplitOneUint64Token() |
420 | // SplitOneDoubleToken() |
421 | // SplitOneFloatToken() |
422 | // SplitOneDecimalIntToken() |
423 | // SplitOneDecimalInt32Token() |
424 | // SplitOneDecimalUint32Token() |
425 | // SplitOneDecimalInt64Token() |
426 | // SplitOneDecimalUint64Token() |
427 | // SplitOneHexUint32Token() |
428 | // SplitOneHexUint64Token() |
429 | // Mainly a stringified wrapper around strtol/strtoul/strtod |
430 | // ---------------------------------------------------------------------- |
431 | // Curried functions for the macro below |
432 | static inline long strto32_0(const char * source, char ** end) { |
433 | return strto32(source, end, 0); } |
434 | static inline unsigned long strtou32_0(const char * source, char ** end) { |
435 | return strtou32(source, end, 0); } |
436 | static inline int64 strto64_0(const char * source, char ** end) { |
437 | return strto64(source, end, 0); } |
438 | static inline uint64 strtou64_0(const char * source, char ** end) { |
439 | return strtou64(source, end, 0); } |
440 | static inline long strto32_10(const char * source, char ** end) { |
441 | return strto32(source, end, 10); } |
442 | static inline unsigned long strtou32_10(const char * source, char ** end) { |
443 | return strtou32(source, end, 10); } |
444 | static inline int64 strto64_10(const char * source, char ** end) { |
445 | return strto64(source, end, 10); } |
446 | static inline uint64 strtou64_10(const char * source, char ** end) { |
447 | return strtou64(source, end, 10); } |
448 | static inline uint32 strtou32_16(const char * source, char ** end) { |
449 | return strtou32(source, end, 16); } |
450 | static inline uint64 strtou64_16(const char * source, char ** end) { |
451 | return strtou64(source, end, 16); } |
452 | |
453 | #define DEFINE_SPLIT_ONE_NUMBER_TOKEN(name, type, function) \ |
454 | bool SplitOne##name##Token(const char ** source, const char * delim, \ |
455 | type * value) { \ |
456 | assert(source); \ |
457 | assert(delim); \ |
458 | assert(value); \ |
459 | if (!*source) \ |
460 | return false; \ |
461 | /* Parse int */ \ |
462 | char * end; \ |
463 | *value = function(*source, &end); \ |
464 | if (end == *source) \ |
465 | return false; /* number not present at start of string */ \ |
466 | if (end[0] && !strchr(delim, end[0])) \ |
467 | return false; /* Garbage characters after int */ \ |
468 | /* Advance past token */ \ |
469 | if (*end != '\0') \ |
470 | *source = const_cast<const char *>(end+1); \ |
471 | else \ |
472 | *source = NULL; \ |
473 | return true; \ |
474 | } |
475 | |
476 | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int, int, strto32_0) |
477 | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int32, int32, strto32_0) |
478 | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint32, uint32, strtou32_0) |
479 | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int64, int64, strto64_0) |
480 | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint64, uint64, strtou64_0) |
481 | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Double, double, strtod) |
482 | #ifdef COMPILER_MSVC // has no strtof() |
483 | // Note: does an implicit cast to float. |
484 | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtod) |
485 | #else |
486 | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtof) |
487 | #endif |
488 | DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt, int, strto32_10) |
489 | DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt32, int32, strto32_10) |
490 | DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint32, uint32, strtou32_10) |
491 | DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt64, int64, strto64_10) |
492 | DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint64, uint64, strtou64_10) |
493 | DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint32, uint32, strtou32_16) |
494 | DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint64, uint64, strtou64_16) |
495 | |