1 | // Protocol Buffers - Google's data interchange format |
2 | // Copyright 2008 Google Inc. All rights reserved. |
3 | // https://developers.google.com/protocol-buffers/ |
4 | // |
5 | // Redistribution and use in source and binary forms, with or without |
6 | // modification, are permitted provided that the following conditions are |
7 | // met: |
8 | // |
9 | // * Redistributions of source code must retain the above copyright |
10 | // notice, this list of conditions and the following disclaimer. |
11 | // * Redistributions in binary form must reproduce the above |
12 | // copyright notice, this list of conditions and the following disclaimer |
13 | // in the documentation and/or other materials provided with the |
14 | // distribution. |
15 | // * Neither the name of Google Inc. nor the names of its |
16 | // contributors may be used to endorse or promote products derived from |
17 | // this software without specific prior written permission. |
18 | // |
19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
30 | |
31 | // from google3/strings/strutil.cc |
32 | |
33 | #include <google/protobuf/stubs/strutil.h> |
34 | |
35 | #include <errno.h> |
36 | #include <float.h> // FLT_DIG and DBL_DIG |
37 | #include <limits.h> |
38 | #include <stdio.h> |
39 | #include <cmath> |
40 | #include <iterator> |
41 | #include <limits> |
42 | |
43 | #include <google/protobuf/stubs/logging.h> |
44 | #include <google/protobuf/stubs/stl_util.h> |
45 | |
46 | #ifdef _WIN32 |
47 | // MSVC has only _snprintf, not snprintf. |
48 | // |
49 | // MinGW has both snprintf and _snprintf, but they appear to be different |
50 | // functions. The former is buggy. When invoked like so: |
51 | // char buffer[32]; |
52 | // snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f); |
53 | // it prints "1.23000e+10". This is plainly wrong: %g should never print |
54 | // trailing zeros after the decimal point. For some reason this bug only |
55 | // occurs with some input values, not all. In any case, _snprintf does the |
56 | // right thing, so we use it. |
57 | #define snprintf _snprintf |
58 | #endif |
59 | |
60 | namespace google { |
61 | namespace protobuf { |
62 | |
63 | // These are defined as macros on some platforms. #undef them so that we can |
64 | // redefine them. |
65 | #undef isxdigit |
66 | #undef isprint |
67 | |
68 | // The definitions of these in ctype.h change based on locale. Since our |
69 | // string manipulation is all in relation to the protocol buffer and C++ |
70 | // languages, we always want to use the C locale. So, we re-define these |
71 | // exactly as we want them. |
// Locale-independent check for an ASCII hexadecimal digit: 0-9, a-f or A-F.
inline bool isxdigit(char c) {
  if (c >= '0' && c <= '9') return true;
  if (c >= 'a' && c <= 'f') return true;
  return c >= 'A' && c <= 'F';
}
77 | |
// Locale-independent check for a printable ASCII character, i.e. anything
// from space (0x20) through tilde (0x7E) inclusive.
inline bool isprint(char c) {
  const bool below_range = c < 0x20;
  const bool above_range = c > 0x7E;
  return !(below_range || above_range);
}
81 | |
// ----------------------------------------------------------------------
// ReplaceCharacters
//    Overwrites, in place, every character of '*s' that occurs in the
//    NUL-terminated set 'remove' with the character 'replacewith'.
//    The entire string is scanned, including bytes that follow an
//    embedded '\0'.
// ----------------------------------------------------------------------
void ReplaceCharacters(std::string *s, const char *remove, char replacewith) {
  // Search by index with find_first_of() rather than strpbrk() on c_str():
  // a C-string scan stops at the first embedded NUL and would silently leave
  // the rest of the string untouched.
  for (std::string::size_type pos = s->find_first_of(remove);
       pos != std::string::npos;
       pos = s->find_first_of(remove, pos + 1)) {
    (*s)[pos] = replacewith;
  }
}
96 | |
97 | void StripWhitespace(std::string *str) { |
98 | int str_length = str->length(); |
99 | |
100 | // Strip off leading whitespace. |
101 | int first = 0; |
102 | while (first < str_length && ascii_isspace(c: str->at(n: first))) { |
103 | ++first; |
104 | } |
105 | // If entire string is white space. |
106 | if (first == str_length) { |
107 | str->clear(); |
108 | return; |
109 | } |
110 | if (first > 0) { |
111 | str->erase(pos: 0, n: first); |
112 | str_length -= first; |
113 | } |
114 | |
115 | // Strip off trailing whitespace. |
116 | int last = str_length - 1; |
117 | while (last >= 0 && ascii_isspace(c: str->at(n: last))) { |
118 | --last; |
119 | } |
120 | if (last != (str_length - 1) && last >= 0) { |
121 | str->erase(pos: last + 1, n: std::string::npos); |
122 | } |
123 | } |
124 | |
// ----------------------------------------------------------------------
// StringReplace()
//    Copies 's' onto the end of '*res', substituting 'newsub' for
//    occurrences of 'oldsub'.  When replace_all is false only the first
//    occurrence is substituted.  An empty 'oldsub' matches nothing, so
//    's' is appended unchanged.
// ----------------------------------------------------------------------

void StringReplace(const std::string &s, const std::string &oldsub,
                   const std::string &newsub, bool replace_all,
                   std::string *res) {
  if (oldsub.empty()) {
    res->append(s);  // nothing to search for; append the input as-is.
    return;
  }

  // 'cursor' marks the first character of 's' not yet copied to '*res'.
  std::string::size_type cursor = 0;
  for (;;) {
    const std::string::size_type hit = s.find(oldsub, cursor);
    if (hit == std::string::npos) {
      break;
    }
    res->append(s, cursor, hit - cursor);  // text before the match
    res->append(newsub);
    cursor = hit + oldsub.size();  // resume searching after the match
    if (!replace_all) {
      break;
    }
  }
  // Whatever follows the last replacement (or all of 's' if none was made).
  res->append(s, cursor, s.length() - cursor);
}
153 | |
154 | // ---------------------------------------------------------------------- |
155 | // StringReplace() |
156 | // Give me a string and two patterns "old" and "new", and I replace |
157 | // the first instance of "old" in the string with "new", if it |
158 | // exists. If "global" is true; call this repeatedly until it |
159 | // fails. RETURN a new string, regardless of whether the replacement |
160 | // happened or not. |
161 | // ---------------------------------------------------------------------- |
162 | |
163 | std::string StringReplace(const std::string &s, const std::string &oldsub, |
164 | const std::string &newsub, bool replace_all) { |
165 | std::string ret; |
166 | StringReplace(s, oldsub, newsub, replace_all, res: &ret); |
167 | return ret; |
168 | } |
169 | |
170 | // ---------------------------------------------------------------------- |
171 | // SplitStringUsing() |
172 | // Split a string using a character delimiter. Append the components |
173 | // to 'result'. |
174 | // |
175 | // Note: For multi-character delimiters, this routine will split on *ANY* of |
176 | // the characters in the string, not the entire string as a single delimiter. |
177 | // ---------------------------------------------------------------------- |
178 | template <typename ITR> |
179 | static inline void SplitStringToIteratorUsing(StringPiece full, |
180 | const char *delim, ITR &result) { |
181 | // Optimize the common case where delim is a single character. |
182 | if (delim[0] != '\0' && delim[1] == '\0') { |
183 | char c = delim[0]; |
184 | const char* p = full.data(); |
185 | const char* end = p + full.size(); |
186 | while (p != end) { |
187 | if (*p == c) { |
188 | ++p; |
189 | } else { |
190 | const char* start = p; |
191 | while (++p != end && *p != c); |
192 | *result++ = std::string(start, p - start); |
193 | } |
194 | } |
195 | return; |
196 | } |
197 | |
198 | std::string::size_type begin_index, end_index; |
199 | begin_index = full.find_first_not_of(s: delim); |
200 | while (begin_index != std::string::npos) { |
201 | end_index = full.find_first_of(s: delim, pos: begin_index); |
202 | if (end_index == std::string::npos) { |
203 | *result++ = std::string(full.substr(pos: begin_index)); |
204 | return; |
205 | } |
206 | *result++ = |
207 | std::string(full.substr(pos: begin_index, n: (end_index - begin_index))); |
208 | begin_index = full.find_first_not_of(s: delim, pos: end_index); |
209 | } |
210 | } |
211 | |
212 | void SplitStringUsing(StringPiece full, const char *delim, |
213 | std::vector<std::string> *result) { |
214 | std::back_insert_iterator<std::vector<std::string> > it(*result); |
215 | SplitStringToIteratorUsing(full, delim, result&: it); |
216 | } |
217 | |
218 | // Split a string using a character delimiter. Append the components |
219 | // to 'result'. If there are consecutive delimiters, this function |
220 | // will return corresponding empty strings. The string is split into |
221 | // at most the specified number of pieces greedily. This means that the |
222 | // last piece may possibly be split further. To split into as many pieces |
223 | // as possible, specify 0 as the number of pieces. |
224 | // |
225 | // If "full" is the empty string, yields an empty string as the only value. |
226 | // |
227 | // If "pieces" is negative for some reason, it returns the whole string |
228 | // ---------------------------------------------------------------------- |
229 | template <typename ITR> |
230 | static inline void SplitStringToIteratorAllowEmpty(StringPiece full, |
231 | const char *delim, |
232 | int pieces, ITR &result) { |
233 | std::string::size_type begin_index, end_index; |
234 | begin_index = 0; |
235 | |
236 | for (int i = 0; (i < pieces-1) || (pieces == 0); i++) { |
237 | end_index = full.find_first_of(s: delim, pos: begin_index); |
238 | if (end_index == std::string::npos) { |
239 | *result++ = std::string(full.substr(pos: begin_index)); |
240 | return; |
241 | } |
242 | *result++ = |
243 | std::string(full.substr(pos: begin_index, n: (end_index - begin_index))); |
244 | begin_index = end_index + 1; |
245 | } |
246 | *result++ = std::string(full.substr(pos: begin_index)); |
247 | } |
248 | |
249 | void SplitStringAllowEmpty(StringPiece full, const char *delim, |
250 | std::vector<std::string> *result) { |
251 | std::back_insert_iterator<std::vector<std::string> > it(*result); |
252 | SplitStringToIteratorAllowEmpty(full, delim, pieces: 0, result&: it); |
253 | } |
254 | |
255 | // ---------------------------------------------------------------------- |
256 | // JoinStrings() |
257 | // This merges a vector of string components with delim inserted |
258 | // as separaters between components. |
259 | // |
260 | // ---------------------------------------------------------------------- |
261 | template <class ITERATOR> |
262 | static void JoinStringsIterator(const ITERATOR &start, const ITERATOR &end, |
263 | const char *delim, std::string *result) { |
264 | GOOGLE_CHECK(result != nullptr); |
265 | result->clear(); |
266 | int delim_length = strlen(s: delim); |
267 | |
268 | // Precompute resulting length so we can reserve() memory in one shot. |
269 | int length = 0; |
270 | for (ITERATOR iter = start; iter != end; ++iter) { |
271 | if (iter != start) { |
272 | length += delim_length; |
273 | } |
274 | length += iter->size(); |
275 | } |
276 | result->reserve(res_arg: length); |
277 | |
278 | // Now combine everything. |
279 | for (ITERATOR iter = start; iter != end; ++iter) { |
280 | if (iter != start) { |
281 | result->append(s: delim, n: delim_length); |
282 | } |
283 | result->append(iter->data(), iter->size()); |
284 | } |
285 | } |
286 | |
287 | void JoinStrings(const std::vector<std::string> &components, const char *delim, |
288 | std::string *result) { |
289 | JoinStringsIterator(start: components.begin(), end: components.end(), delim, result); |
290 | } |
291 | |
292 | // ---------------------------------------------------------------------- |
293 | // UnescapeCEscapeSequences() |
294 | // This does all the unescaping that C does: \ooo, \r, \n, etc |
295 | // Returns length of resulting string. |
296 | // The implementation of \x parses any positive number of hex digits, |
297 | // but it is an error if the value requires more than 8 bits, and the |
298 | // result is truncated to 8 bits. |
299 | // |
300 | // The second call stores its errors in a supplied string vector. |
301 | // If the string vector pointer is nullptr, it reports the errors with LOG(). |
302 | // ---------------------------------------------------------------------- |
303 | |
304 | #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7')) |
305 | |
306 | // Protocol buffers doesn't ever care about errors, but I don't want to remove |
307 | // the code. |
308 | #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false) |
309 | |
310 | int UnescapeCEscapeSequences(const char* source, char* dest) { |
311 | return UnescapeCEscapeSequences(source, dest, errors: nullptr); |
312 | } |
313 | |
314 | int UnescapeCEscapeSequences(const char *source, char *dest, |
315 | std::vector<std::string> *errors) { |
316 | GOOGLE_DCHECK(errors == nullptr) << "Error reporting not implemented." ; |
317 | |
318 | char* d = dest; |
319 | const char* p = source; |
320 | |
321 | // Small optimization for case where source = dest and there's no escaping |
322 | while ( p == d && *p != '\0' && *p != '\\' ) |
323 | p++, d++; |
324 | |
325 | while (*p != '\0') { |
326 | if (*p != '\\') { |
327 | *d++ = *p++; |
328 | } else { |
329 | switch ( *++p ) { // skip past the '\\' |
330 | case '\0': |
331 | LOG_STRING(ERROR, errors) << "String cannot end with \\" ; |
332 | *d = '\0'; |
333 | return d - dest; // we're done with p |
334 | case 'a': *d++ = '\a'; break; |
335 | case 'b': *d++ = '\b'; break; |
336 | case 'f': *d++ = '\f'; break; |
337 | case 'n': *d++ = '\n'; break; |
338 | case 'r': *d++ = '\r'; break; |
339 | case 't': *d++ = '\t'; break; |
340 | case 'v': *d++ = '\v'; break; |
341 | case '\\': *d++ = '\\'; break; |
342 | case '?': *d++ = '\?'; break; // \? Who knew? |
343 | case '\'': *d++ = '\''; break; |
344 | case '"': *d++ = '\"'; break; |
345 | case '0': case '1': case '2': case '3': // octal digit: 1 to 3 digits |
346 | case '4': case '5': case '6': case '7': { |
347 | char ch = *p - '0'; |
348 | if ( IS_OCTAL_DIGIT(p[1]) ) |
349 | ch = ch * 8 + *++p - '0'; |
350 | if ( IS_OCTAL_DIGIT(p[1]) ) // safe (and easy) to do this twice |
351 | ch = ch * 8 + *++p - '0'; // now points at last digit |
352 | *d++ = ch; |
353 | break; |
354 | } |
355 | case 'x': case 'X': { |
356 | if (!isxdigit(c: p[1])) { |
357 | if (p[1] == '\0') { |
358 | LOG_STRING(ERROR, errors) << "String cannot end with \\x" ; |
359 | } else { |
360 | LOG_STRING(ERROR, errors) << |
361 | "\\x cannot be followed by non-hex digit: \\" << *p << p[1]; |
362 | } |
363 | break; |
364 | } |
365 | unsigned int ch = 0; |
366 | const char *hex_start = p; |
367 | while (isxdigit(c: p[1])) // arbitrarily many hex digits |
368 | ch = (ch << 4) + hex_digit_to_int(c: *++p); |
369 | if (ch > 0xFF) |
370 | LOG_STRING(ERROR, errors) |
371 | << "Value of " |
372 | << "\\" << std::string(hex_start, p + 1 - hex_start) |
373 | << " exceeds 8 bits" ; |
374 | *d++ = ch; |
375 | break; |
376 | } |
377 | #if 0 // TODO(kenton): Support \u and \U? Requires runetochar(). |
378 | case 'u': { |
379 | // \uhhhh => convert 4 hex digits to UTF-8 |
380 | char32 rune = 0; |
381 | const char *hex_start = p; |
382 | for (int i = 0; i < 4; ++i) { |
383 | if (isxdigit(p[1])) { // Look one char ahead. |
384 | rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p. |
385 | } else { |
386 | LOG_STRING(ERROR, errors) |
387 | << "\\u must be followed by 4 hex digits: \\" |
388 | << std::string(hex_start, p+1-hex_start); |
389 | break; |
390 | } |
391 | } |
392 | d += runetochar(d, &rune); |
393 | break; |
394 | } |
395 | case 'U': { |
396 | // \Uhhhhhhhh => convert 8 hex digits to UTF-8 |
397 | char32 rune = 0; |
398 | const char *hex_start = p; |
399 | for (int i = 0; i < 8; ++i) { |
400 | if (isxdigit(p[1])) { // Look one char ahead. |
401 | // Don't change rune until we're sure this |
402 | // is within the Unicode limit, but do advance p. |
403 | char32 newrune = (rune << 4) + hex_digit_to_int(*++p); |
404 | if (newrune > 0x10FFFF) { |
405 | LOG_STRING(ERROR, errors) |
406 | << "Value of \\" |
407 | << std::string(hex_start, p + 1 - hex_start) |
408 | << " exceeds Unicode limit (0x10FFFF)" ; |
409 | break; |
410 | } else { |
411 | rune = newrune; |
412 | } |
413 | } else { |
414 | LOG_STRING(ERROR, errors) |
415 | << "\\U must be followed by 8 hex digits: \\" |
416 | << std::string(hex_start, p+1-hex_start); |
417 | break; |
418 | } |
419 | } |
420 | d += runetochar(d, &rune); |
421 | break; |
422 | } |
423 | #endif |
424 | default: |
425 | LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p; |
426 | } |
427 | p++; // read past letter we escaped |
428 | } |
429 | } |
430 | *d = '\0'; |
431 | return d - dest; |
432 | } |
433 | |
434 | // ---------------------------------------------------------------------- |
435 | // UnescapeCEscapeString() |
436 | // This does the same thing as UnescapeCEscapeSequences, but creates |
437 | // a new string. The caller does not need to worry about allocating |
438 | // a dest buffer. This should be used for non performance critical |
439 | // tasks such as printing debug messages. It is safe for src and dest |
440 | // to be the same. |
441 | // |
442 | // The second call stores its errors in a supplied string vector. |
443 | // If the string vector pointer is nullptr, it reports the errors with LOG(). |
444 | // |
//    In the first and second calls, the length of dest is returned. In
//    the third call, the new string is returned.
447 | // ---------------------------------------------------------------------- |
448 | int UnescapeCEscapeString(const std::string &src, std::string *dest) { |
449 | return UnescapeCEscapeString(src, dest, errors: nullptr); |
450 | } |
451 | |
452 | int UnescapeCEscapeString(const std::string &src, std::string *dest, |
453 | std::vector<std::string> *errors) { |
454 | std::unique_ptr<char[]> unescaped(new char[src.size() + 1]); |
455 | int len = UnescapeCEscapeSequences(source: src.c_str(), dest: unescaped.get(), errors); |
456 | GOOGLE_CHECK(dest); |
457 | dest->assign(s: unescaped.get(), n: len); |
458 | return len; |
459 | } |
460 | |
461 | std::string UnescapeCEscapeString(const std::string &src) { |
462 | std::unique_ptr<char[]> unescaped(new char[src.size() + 1]); |
463 | int len = UnescapeCEscapeSequences(source: src.c_str(), dest: unescaped.get(), errors: nullptr); |
464 | return std::string(unescaped.get(), len); |
465 | } |
466 | |
467 | // ---------------------------------------------------------------------- |
468 | // CEscapeString() |
469 | // CHexEscapeString() |
470 | // Copies 'src' to 'dest', escaping dangerous characters using |
471 | // C-style escape sequences. This is very useful for preparing query |
472 | // flags. 'src' and 'dest' should not overlap. The 'Hex' version uses |
473 | // hexadecimal rather than octal sequences. |
474 | // Returns the number of bytes written to 'dest' (not including the \0) |
475 | // or -1 if there was insufficient space. |
476 | // |
477 | // Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped. |
478 | // ---------------------------------------------------------------------- |
// Writes the C-escaped form of the 'src_len' bytes at 'src' into 'dest',
// which has room for 'dest_len' bytes, and NUL-terminates it.
//
//   use_hex    emit non-printable bytes as \xNN instead of octal \NNN.
//   utf8_safe  copy bytes >= 0x80 through unescaped.
//
// Returns the number of bytes written (not counting the NUL), or -1 if
// 'dest' is too small; in that case 'dest' may hold a partial result.
// At least two free bytes are required before each input byte is processed,
// even when that byte is copied through as a single character.
// Nothing is ever written at or beyond dest[dest_len].
int CEscapeInternal(const char* src, int src_len, char* dest,
                    int dest_len, bool use_hex, bool utf8_safe) {
  static const char kHexDigits[] = "0123456789abcdef";

  const char* src_end = src + src_len;
  int used = 0;
  bool last_hex_escape = false;  // true if last output char was \xNN

  for (; src < src_end; src++) {
    if (dest_len - used < 2)  // Need space for two letter escape
      return -1;

    bool is_hex_escape = false;
    switch (*src) {
      case '\n': dest[used++] = '\\'; dest[used++] = 'n';  break;
      case '\r': dest[used++] = '\\'; dest[used++] = 'r';  break;
      case '\t': dest[used++] = '\\'; dest[used++] = 't';  break;
      case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
      case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
      case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
      default:
        // Note that if we emit \xNN and the src character after that is a hex
        // digit then that digit must be escaped too to prevent it being
        // interpreted as part of the character code by C.
        if ((!utf8_safe || static_cast<uint8_t>(*src) < 0x80) &&
            (!isprint(*src) ||
             (last_hex_escape && isxdigit(*src)))) {
          if (dest_len - used < 4)  // need space for 4 letter escape
            return -1;
          // Emit the four characters by hand.  sprintf() would also store a
          // terminating NUL as a fifth byte, which lands outside the buffer
          // when exactly four bytes remain.
          const unsigned int byte = static_cast<uint8_t>(*src);
          dest[used++] = '\\';
          if (use_hex) {
            dest[used++] = 'x';
            dest[used++] = kHexDigits[byte >> 4];
            dest[used++] = kHexDigits[byte & 0xF];
          } else {
            dest[used++] = static_cast<char>('0' + (byte >> 6));
            dest[used++] = static_cast<char>('0' + ((byte >> 3) & 7));
            dest[used++] = static_cast<char>('0' + (byte & 7));
          }
          is_hex_escape = use_hex;
        } else {
          dest[used++] = *src;
        }
        break;
    }
    last_hex_escape = is_hex_escape;
  }

  if (dest_len - used < 1)  // make sure that there is room for \0
    return -1;

  dest[used] = '\0';  // doesn't count towards return value though
  return used;
}
523 | |
524 | // Calculates the length of the C-style escaped version of 'src'. |
525 | // Assumes that non-printable characters are escaped using octal sequences, and |
526 | // that UTF-8 bytes are not handled specially. |
527 | static inline size_t CEscapedLength(StringPiece src) { |
528 | static char c_escaped_len[256] = { |
529 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4, // \t, \n, \r |
530 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
531 | 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // ", ' |
532 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // '0'..'9' |
533 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'A'..'O' |
534 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, // 'P'..'Z', '\' |
535 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'a'..'o' |
536 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, // 'p'..'z', DEL |
537 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
538 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
539 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
540 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
541 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
542 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
543 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
544 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
545 | }; |
546 | |
547 | size_t escaped_len = 0; |
548 | for (StringPiece::size_type i = 0; i < src.size(); ++i) { |
549 | unsigned char c = static_cast<unsigned char>(src[i]); |
550 | escaped_len += c_escaped_len[c]; |
551 | } |
552 | return escaped_len; |
553 | } |
554 | |
555 | // ---------------------------------------------------------------------- |
556 | // Escapes 'src' using C-style escape sequences, and appends the escaped string |
557 | // to 'dest'. This version is faster than calling CEscapeInternal as it computes |
558 | // the required space using a lookup table, and also does not do any special |
559 | // handling for Hex or UTF-8 characters. |
560 | // ---------------------------------------------------------------------- |
561 | void CEscapeAndAppend(StringPiece src, std::string *dest) { |
562 | size_t escaped_len = CEscapedLength(src); |
563 | if (escaped_len == src.size()) { |
564 | dest->append(s: src.data(), n: src.size()); |
565 | return; |
566 | } |
567 | |
568 | size_t cur_dest_len = dest->size(); |
569 | dest->resize(n: cur_dest_len + escaped_len); |
570 | char* append_ptr = &(*dest)[cur_dest_len]; |
571 | |
572 | for (StringPiece::size_type i = 0; i < src.size(); ++i) { |
573 | unsigned char c = static_cast<unsigned char>(src[i]); |
574 | switch (c) { |
575 | case '\n': *append_ptr++ = '\\'; *append_ptr++ = 'n'; break; |
576 | case '\r': *append_ptr++ = '\\'; *append_ptr++ = 'r'; break; |
577 | case '\t': *append_ptr++ = '\\'; *append_ptr++ = 't'; break; |
578 | case '\"': *append_ptr++ = '\\'; *append_ptr++ = '\"'; break; |
579 | case '\'': *append_ptr++ = '\\'; *append_ptr++ = '\''; break; |
580 | case '\\': *append_ptr++ = '\\'; *append_ptr++ = '\\'; break; |
581 | default: |
582 | if (!isprint(c)) { |
583 | *append_ptr++ = '\\'; |
584 | *append_ptr++ = '0' + c / 64; |
585 | *append_ptr++ = '0' + (c % 64) / 8; |
586 | *append_ptr++ = '0' + c % 8; |
587 | } else { |
588 | *append_ptr++ = c; |
589 | } |
590 | break; |
591 | } |
592 | } |
593 | } |
594 | |
595 | std::string CEscape(const std::string &src) { |
596 | std::string dest; |
597 | CEscapeAndAppend(src, dest: &dest); |
598 | return dest; |
599 | } |
600 | |
601 | namespace strings { |
602 | |
603 | std::string Utf8SafeCEscape(const std::string &src) { |
604 | const int dest_length = src.size() * 4 + 1; // Maximum possible expansion |
605 | std::unique_ptr<char[]> dest(new char[dest_length]); |
606 | const int len = CEscapeInternal(src: src.data(), src_len: src.size(), |
607 | dest: dest.get(), dest_len: dest_length, use_hex: false, utf8_safe: true); |
608 | GOOGLE_DCHECK_GE(len, 0); |
609 | return std::string(dest.get(), len); |
610 | } |
611 | |
612 | std::string CHexEscape(const std::string &src) { |
613 | const int dest_length = src.size() * 4 + 1; // Maximum possible expansion |
614 | std::unique_ptr<char[]> dest(new char[dest_length]); |
615 | const int len = CEscapeInternal(src: src.data(), src_len: src.size(), |
616 | dest: dest.get(), dest_len: dest_length, use_hex: true, utf8_safe: false); |
617 | GOOGLE_DCHECK_GE(len, 0); |
618 | return std::string(dest.get(), len); |
619 | } |
620 | |
621 | } // namespace strings |
622 | |
623 | // ---------------------------------------------------------------------- |
624 | // strto32_adaptor() |
625 | // strtou32_adaptor() |
626 | // Implementation of strto[u]l replacements that have identical |
627 | // overflow and underflow characteristics for both ILP-32 and LP-64 |
628 | // platforms, including errno preservation in error-free calls. |
629 | // ---------------------------------------------------------------------- |
630 | |
// strtol() replacement whose result is clamped to the int32_t range.
// Out-of-range input yields the nearest int32_t limit with errno set to
// ERANGE.  When parsing raises no error, errno is restored to the value it
// had on entry.
int32_t strto32_adaptor(const char *nptr, char **endptr, int base) {
  const int32_t lowest = std::numeric_limits<int32_t>::min();
  const int32_t highest = std::numeric_limits<int32_t>::max();

  const int caller_errno = errno;
  errno = 0;  // so that an error raised by strtol() can be detected
  const long parsed = strtol(nptr, endptr, base);
  const int parse_errno = errno;

  if (parse_errno == ERANGE) {
    // strtol() itself overflowed 'long'; errno stays ERANGE.
    if (parsed == LONG_MIN) return lowest;
    if (parsed == LONG_MAX) return highest;
  } else if (parse_errno == 0) {
    // Fits in 'long' but may still be outside the 32-bit range.
    if (parsed < lowest) {
      errno = ERANGE;
      return lowest;
    }
    if (parsed > highest) {
      errno = ERANGE;
      return highest;
    }
    errno = caller_errno;  // clean parse: leave errno as the caller had it
  }
  return static_cast<int32_t>(parsed);
}
650 | |
// strtoul() replacement whose result is clamped to the uint32_t range.
// Out-of-range input yields the uint32_t maximum with errno set to ERANGE.
// When parsing raises no error, errno is restored to the value it had on
// entry.
uint32_t strtou32_adaptor(const char *nptr, char **endptr, int base) {
  const uint32_t highest = std::numeric_limits<uint32_t>::max();

  const int caller_errno = errno;
  errno = 0;  // so that an error raised by strtoul() can be detected
  const unsigned long parsed = strtoul(nptr, endptr, base);
  const int parse_errno = errno;

  if (parse_errno == ERANGE) {
    // strtoul() itself overflowed 'unsigned long'; errno stays ERANGE.
    if (parsed == ULONG_MAX) return highest;
  } else if (parse_errno == 0) {
    // Fits in 'unsigned long' but may still exceed 32 bits.
    if (parsed > highest) {
      errno = ERANGE;
      return highest;
    }
    errno = caller_errno;  // clean parse: leave errno as the caller had it
  }
  return static_cast<uint32_t>(parsed);
}
665 | |
// Trims space characters (' ' only) from both ends of '*text', consumes one
// optional leading '+' or '-', and stores what is left back into '*text'.
// '*negative_ptr' is set to true iff the sign was '-'.
// Returns false, leaving '*text' unchanged, if nothing remains after the
// spaces or after the sign.  In the first of those cases '*negative_ptr' is
// not written at all.
inline bool safe_parse_sign(std::string *text /*inout*/,
                            bool *negative_ptr /*output*/) {
  const char* const origin = text->data();
  const char* head = origin;
  const char* tail = origin + text->size();

  // Drop surrounding spaces.
  for (; head < tail && *head == ' '; ++head) {
  }
  for (; head < tail && tail[-1] == ' '; --tail) {
  }
  if (head >= tail) {
    return false;
  }

  // Consume the sign, if there is one.
  const bool is_negative = (*head == '-');
  *negative_ptr = is_negative;
  if (is_negative || *head == '+') {
    ++head;
    if (head >= tail) {
      return false;  // a sign with nothing after it
    }
  }
  *text = text->substr(head - origin, tail - head);
  return true;
}
693 | |
// Parses 'text' as an unsigned run of decimal digits into '*value_p'.
// Returns true if every character is a digit and the value fits IntType.
// On a non-digit, returns false with '*value_p' holding the value parsed so
// far; on overflow, returns false with '*value_p' set to IntType's maximum.
// An empty 'text' yields 0 and true.
template <typename IntType>
bool safe_parse_positive_int(std::string text, IntType *value_p) {
  int base = 10;
  const IntType vmax = std::numeric_limits<IntType>::max();
  assert(vmax > 0);
  assert(vmax >= base);
  // Largest accumulator that can still be multiplied by 'base' safely.
  const IntType vmax_over_base = vmax / base;

  IntType accum = 0;
  for (std::string::size_type idx = 0; idx < text.size(); ++idx) {
    const unsigned char c = static_cast<unsigned char>(text[idx]);
    const int digit = c - '0';
    if (digit < 0 || digit >= base) {
      *value_p = accum;  // stopped at a non-digit
      return false;
    }
    if (accum > vmax_over_base) {
      *value_p = vmax;  // the multiplication would overflow
      return false;
    }
    accum *= base;
    if (accum > vmax - digit) {
      *value_p = vmax;  // the addition would overflow
      return false;
    }
    accum += digit;
  }
  *value_p = accum;
  return true;
}
726 | |
// Parses 'text', a run of decimal digits with the leading '-' already
// removed, as a negative number into '*value_p'.  The value is accumulated
// downwards from zero so that IntType's minimum can be represented.
// Returns true if every character is a digit and the value fits IntType.
// On a non-digit, returns false with '*value_p' holding the value parsed so
// far; on underflow, returns false with '*value_p' set to IntType's minimum.
// An empty 'text' yields 0 and true.
template <typename IntType>
bool safe_parse_negative_int(const std::string &text, IntType *value_p) {
  int base = 10;
  const IntType vmin = std::numeric_limits<IntType>::min();
  assert(vmin < 0);
  assert(vmin <= 0 - base);
  // Smallest accumulator that can still be multiplied by 'base' safely.
  IntType vmin_over_base = vmin / base;
  // 2003 c++ standard [expr.mul]
  // "... the sign of the remainder is implementation-defined."
  // Although (vmin/base)*base + vmin%base is always vmin.
  // 2011 c++ standard tightens the spec but we cannot rely on it.
  if (vmin % base > 0) {
    vmin_over_base += 1;
  }

  IntType accum = 0;
  for (std::string::size_type idx = 0; idx < text.size(); ++idx) {
    const unsigned char c = static_cast<unsigned char>(text[idx]);
    const int digit = c - '0';
    if (digit < 0 || digit >= base) {
      *value_p = accum;  // stopped at a non-digit
      return false;
    }
    if (accum < vmin_over_base) {
      *value_p = vmin;  // the multiplication would underflow
      return false;
    }
    accum *= base;
    if (accum < vmin + digit) {
      *value_p = vmin;  // the subtraction would underflow
      return false;
    }
    accum -= digit;
  }
  *value_p = accum;
  return true;
}
766 | |
767 | template <typename IntType> |
768 | bool safe_int_internal(std::string text, IntType *value_p) { |
769 | *value_p = 0; |
770 | bool negative; |
771 | if (!safe_parse_sign(text: &text, negative_ptr: &negative)) { |
772 | return false; |
773 | } |
774 | if (!negative) { |
775 | return safe_parse_positive_int(text, value_p); |
776 | } else { |
777 | return safe_parse_negative_int(text, value_p); |
778 | } |
779 | } |
780 | |
781 | template <typename IntType> |
782 | bool safe_uint_internal(std::string text, IntType *value_p) { |
783 | *value_p = 0; |
784 | bool negative; |
785 | if (!safe_parse_sign(text: &text, negative_ptr: &negative) || negative) { |
786 | return false; |
787 | } |
788 | return safe_parse_positive_int(text, value_p); |
789 | } |
790 | |
791 | // ---------------------------------------------------------------------- |
792 | // FastIntToBuffer() |
793 | // FastInt64ToBuffer() |
794 | // FastHexToBuffer() |
795 | // FastHex64ToBuffer() |
796 | // FastHex32ToBuffer() |
797 | // ---------------------------------------------------------------------- |
798 | |
// Index within the caller's buffer at which FastInt64ToBuffer stores the
// terminating NUL character.
static const int kFastInt64ToBufferOffset = 21;

// Writes the decimal form of `i` right-aligned in `buffer`, so that the
// terminating NUL lands at buffer[kFastInt64ToBufferOffset], and returns a
// pointer to the first character written (the '-' sign for negative input).
// The buffer therefore needs at least 22 bytes, which is enough for the
// longest possible output, "-9223372036854775808".
char *FastInt64ToBuffer(int64_t i, char* buffer) {
  char* cursor = buffer + kFastInt64ToBufferOffset;
  *cursor = '\0';

  // Convert the magnitude as an unsigned number. Subtracting from zero in
  // unsigned arithmetic is well defined even for the most negative value,
  // and it keeps signed division/modulo of negative numbers out of the loop.
  const bool negative = i < 0;
  uint64_t magnitude = static_cast<uint64_t>(i);
  if (negative) {
    magnitude = static_cast<uint64_t>(0) - magnitude;
  }

  // Emit digits from least to most significant, walking backwards.
  do {
    *--cursor = static_cast<char>('0' + magnitude % 10);
    magnitude /= 10;
  } while (magnitude != 0);

  if (negative) {
    *--cursor = '-';
  }
  return cursor;
}
840 | |
// Index within the caller's buffer at which FastInt32ToBuffer stores the
// terminating NUL character.
static const int kFastInt32ToBufferOffset = 11;

// 32-bit counterpart of FastInt64ToBuffer. It is kept as a separate
// function (rather than forwarding to the 64-bit one) so that all of the
// arithmetic stays 32 bits wide.
//
// Writes the decimal form of `i` right-aligned in `buffer`, so that the
// terminating NUL lands at buffer[kFastInt32ToBufferOffset], and returns a
// pointer to the first character written (the '-' sign for negative input).
// The buffer needs at least 12 bytes; the longest output is "-2147483648".
char *FastInt32ToBuffer(int32_t i, char* buffer) {
  char* cursor = buffer + kFastInt32ToBufferOffset;
  *cursor = '\0';

  // Convert the magnitude as an unsigned number; unsigned negation is well
  // defined even for the most negative value.
  const bool negative = i < 0;
  uint32_t magnitude = static_cast<uint32_t>(i);
  if (negative) {
    magnitude = static_cast<uint32_t>(0u - magnitude);
  }

  // Emit digits from least to most significant, walking backwards.
  do {
    *--cursor = static_cast<char>('0' + magnitude % 10);
    magnitude /= 10;
  } while (magnitude != 0);

  if (negative) {
    *--cursor = '-';
  }
  return cursor;
}
885 | |
886 | char *FastHexToBuffer(int i, char* buffer) { |
887 | GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i; |
888 | |
889 | static const char *hexdigits = "0123456789abcdef" ; |
890 | char *p = buffer + 21; |
891 | *p-- = '\0'; |
892 | do { |
893 | *p-- = hexdigits[i & 15]; // mod by 16 |
894 | i >>= 4; // divide by 16 |
895 | } while (i > 0); |
896 | return p + 1; |
897 | } |
898 | |
// Writes the low `num_byte` hexadecimal digits of `value` into
// buffer[0 .. num_byte-1] (lowercase, zero-padded on the left), stores a NUL
// at buffer[num_byte] and returns `buffer`. Despite its name, `num_byte` is
// the number of hex characters produced, not a byte count of `value`.
char *InternalFastHexToBuffer(uint64_t value, char* buffer, int num_byte) {
  static const char kHexDigits[] = "0123456789abcdef";
  buffer[num_byte] = '\0';
  // Fill from the last digit towards the first, consuming one nibble each
  // step.
  for (int pos = num_byte; pos > 0; --pos) {
#ifdef _M_X64
    // MSVC x64 platform has a bug optimizing the uint32(value) in the #else
    // block. Given that the uint32 cast was to improve performance on 32-bit
    // platforms, we use 64-bit '&' directly.
    buffer[pos - 1] = kHexDigits[value & 0xf];
#else
    buffer[pos - 1] = kHexDigits[uint32_t(value) & 0xf];
#endif
    value >>= 4;
  }
  return buffer;
}
915 | |
916 | char *FastHex64ToBuffer(uint64_t value, char* buffer) { |
917 | return InternalFastHexToBuffer(value, buffer, num_byte: 16); |
918 | } |
919 | |
920 | char *FastHex32ToBuffer(uint32_t value, char* buffer) { |
921 | return InternalFastHexToBuffer(value, buffer, num_byte: 8); |
922 | } |
923 | |
924 | // ---------------------------------------------------------------------- |
925 | // FastInt32ToBufferLeft() |
926 | // FastUInt32ToBufferLeft() |
927 | // FastInt64ToBufferLeft() |
928 | // FastUInt64ToBufferLeft() |
929 | // |
930 | // Like the Fast*ToBuffer() functions above, these are intended for speed. |
931 | // Unlike the Fast*ToBuffer() functions, however, these functions write |
932 | // their output to the beginning of the buffer (hence the name, as the |
933 | // output is left-aligned). The caller is responsible for ensuring that |
934 | // the buffer has enough space to hold the output. |
935 | // |
936 | // Returns a pointer to the end of the string (i.e. the null character |
937 | // terminating the string). |
938 | // ---------------------------------------------------------------------- |
939 | |
// Lookup table: two_ASCII_digits[n], for n in [0, 99], holds the two
// characters of n written as a zero-padded decimal number (tens digit first).
// The entries are character pairs, not NUL-terminated strings. One row per
// tens digit.
static const char two_ASCII_digits[100][2] = {
  {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'}, {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
  {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
  {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
  {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
  {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
  {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
  {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
  {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
  {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
  {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
};
962 | |
963 | char* FastUInt32ToBufferLeft(uint32_t u, char* buffer) { |
964 | uint32_t digits; |
965 | const char *ASCII_digits = nullptr; |
966 | // The idea of this implementation is to trim the number of divides to as few |
967 | // as possible by using multiplication and subtraction rather than mod (%), |
968 | // and by outputting two digits at a time rather than one. |
969 | // The huge-number case is first, in the hopes that the compiler will output |
970 | // that case in one branch-free block of code, and only output conditional |
971 | // branches into it from below. |
972 | if (u >= 1000000000) { // >= 1,000,000,000 |
973 | digits = u / 100000000; // 100,000,000 |
974 | ASCII_digits = two_ASCII_digits[digits]; |
975 | buffer[0] = ASCII_digits[0]; |
976 | buffer[1] = ASCII_digits[1]; |
977 | buffer += 2; |
978 | sublt100_000_000: |
979 | u -= digits * 100000000; // 100,000,000 |
980 | lt100_000_000: |
981 | digits = u / 1000000; // 1,000,000 |
982 | ASCII_digits = two_ASCII_digits[digits]; |
983 | buffer[0] = ASCII_digits[0]; |
984 | buffer[1] = ASCII_digits[1]; |
985 | buffer += 2; |
986 | sublt1_000_000: |
987 | u -= digits * 1000000; // 1,000,000 |
988 | lt1_000_000: |
989 | digits = u / 10000; // 10,000 |
990 | ASCII_digits = two_ASCII_digits[digits]; |
991 | buffer[0] = ASCII_digits[0]; |
992 | buffer[1] = ASCII_digits[1]; |
993 | buffer += 2; |
994 | sublt10_000: |
995 | u -= digits * 10000; // 10,000 |
996 | lt10_000: |
997 | digits = u / 100; |
998 | ASCII_digits = two_ASCII_digits[digits]; |
999 | buffer[0] = ASCII_digits[0]; |
1000 | buffer[1] = ASCII_digits[1]; |
1001 | buffer += 2; |
1002 | sublt100: |
1003 | u -= digits * 100; |
1004 | lt100: |
1005 | digits = u; |
1006 | ASCII_digits = two_ASCII_digits[digits]; |
1007 | buffer[0] = ASCII_digits[0]; |
1008 | buffer[1] = ASCII_digits[1]; |
1009 | buffer += 2; |
1010 | done: |
1011 | *buffer = 0; |
1012 | return buffer; |
1013 | } |
1014 | |
1015 | if (u < 100) { |
1016 | digits = u; |
1017 | if (u >= 10) goto lt100; |
1018 | *buffer++ = '0' + digits; |
1019 | goto done; |
1020 | } |
1021 | if (u < 10000) { // 10,000 |
1022 | if (u >= 1000) goto lt10_000; |
1023 | digits = u / 100; |
1024 | *buffer++ = '0' + digits; |
1025 | goto sublt100; |
1026 | } |
1027 | if (u < 1000000) { // 1,000,000 |
1028 | if (u >= 100000) goto lt1_000_000; |
1029 | digits = u / 10000; // 10,000 |
1030 | *buffer++ = '0' + digits; |
1031 | goto sublt10_000; |
1032 | } |
1033 | if (u < 100000000) { // 100,000,000 |
1034 | if (u >= 10000000) goto lt100_000_000; |
1035 | digits = u / 1000000; // 1,000,000 |
1036 | *buffer++ = '0' + digits; |
1037 | goto sublt1_000_000; |
1038 | } |
1039 | // we already know that u < 1,000,000,000 |
1040 | digits = u / 100000000; // 100,000,000 |
1041 | *buffer++ = '0' + digits; |
1042 | goto sublt100_000_000; |
1043 | } |
1044 | |
1045 | char* FastInt32ToBufferLeft(int32_t i, char* buffer) { |
1046 | uint32_t u = 0; |
1047 | if (i < 0) { |
1048 | *buffer++ = '-'; |
1049 | u -= i; |
1050 | } else { |
1051 | u = i; |
1052 | } |
1053 | return FastUInt32ToBufferLeft(u, buffer); |
1054 | } |
1055 | |
1056 | char* FastUInt64ToBufferLeft(uint64_t u64, char* buffer) { |
1057 | int digits; |
1058 | const char *ASCII_digits = nullptr; |
1059 | |
1060 | uint32_t u = static_cast<uint32_t>(u64); |
1061 | if (u == u64) return FastUInt32ToBufferLeft(u, buffer); |
1062 | |
1063 | uint64_t top_11_digits = u64 / 1000000000; |
1064 | buffer = FastUInt64ToBufferLeft(u64: top_11_digits, buffer); |
1065 | u = u64 - (top_11_digits * 1000000000); |
1066 | |
1067 | digits = u / 10000000; // 10,000,000 |
1068 | GOOGLE_DCHECK_LT(digits, 100); |
1069 | ASCII_digits = two_ASCII_digits[digits]; |
1070 | buffer[0] = ASCII_digits[0]; |
1071 | buffer[1] = ASCII_digits[1]; |
1072 | buffer += 2; |
1073 | u -= digits * 10000000; // 10,000,000 |
1074 | digits = u / 100000; // 100,000 |
1075 | ASCII_digits = two_ASCII_digits[digits]; |
1076 | buffer[0] = ASCII_digits[0]; |
1077 | buffer[1] = ASCII_digits[1]; |
1078 | buffer += 2; |
1079 | u -= digits * 100000; // 100,000 |
1080 | digits = u / 1000; // 1,000 |
1081 | ASCII_digits = two_ASCII_digits[digits]; |
1082 | buffer[0] = ASCII_digits[0]; |
1083 | buffer[1] = ASCII_digits[1]; |
1084 | buffer += 2; |
1085 | u -= digits * 1000; // 1,000 |
1086 | digits = u / 10; |
1087 | ASCII_digits = two_ASCII_digits[digits]; |
1088 | buffer[0] = ASCII_digits[0]; |
1089 | buffer[1] = ASCII_digits[1]; |
1090 | buffer += 2; |
1091 | u -= digits * 10; |
1092 | digits = u; |
1093 | *buffer++ = '0' + digits; |
1094 | *buffer = 0; |
1095 | return buffer; |
1096 | } |
1097 | |
1098 | char* FastInt64ToBufferLeft(int64_t i, char* buffer) { |
1099 | uint64_t u = 0; |
1100 | if (i < 0) { |
1101 | *buffer++ = '-'; |
1102 | u -= i; |
1103 | } else { |
1104 | u = i; |
1105 | } |
1106 | return FastUInt64ToBufferLeft(u64: u, buffer); |
1107 | } |
1108 | |
1109 | // ---------------------------------------------------------------------- |
1110 | // SimpleItoa() |
1111 | // Description: converts an integer to a string. |
1112 | // |
1113 | // Return value: string |
1114 | // ---------------------------------------------------------------------- |
1115 | |
1116 | std::string SimpleItoa(int i) { |
1117 | char buffer[kFastToBufferSize]; |
1118 | return (sizeof(i) == 4) ? |
1119 | FastInt32ToBuffer(i, buffer) : |
1120 | FastInt64ToBuffer(i, buffer); |
1121 | } |
1122 | |
1123 | std::string SimpleItoa(unsigned int i) { |
1124 | char buffer[kFastToBufferSize]; |
1125 | return std::string(buffer, (sizeof(i) == 4) |
1126 | ? FastUInt32ToBufferLeft(u: i, buffer) |
1127 | : FastUInt64ToBufferLeft(u64: i, buffer)); |
1128 | } |
1129 | |
1130 | std::string SimpleItoa(long i) { |
1131 | char buffer[kFastToBufferSize]; |
1132 | return (sizeof(i) == 4) ? |
1133 | FastInt32ToBuffer(i, buffer) : |
1134 | FastInt64ToBuffer(i, buffer); |
1135 | } |
1136 | |
1137 | std::string SimpleItoa(unsigned long i) { |
1138 | char buffer[kFastToBufferSize]; |
1139 | return std::string(buffer, (sizeof(i) == 4) |
1140 | ? FastUInt32ToBufferLeft(u: i, buffer) |
1141 | : FastUInt64ToBufferLeft(u64: i, buffer)); |
1142 | } |
1143 | |
1144 | std::string SimpleItoa(long long i) { |
1145 | char buffer[kFastToBufferSize]; |
1146 | return (sizeof(i) == 4) ? |
1147 | FastInt32ToBuffer(i, buffer) : |
1148 | FastInt64ToBuffer(i, buffer); |
1149 | } |
1150 | |
1151 | std::string SimpleItoa(unsigned long long i) { |
1152 | char buffer[kFastToBufferSize]; |
1153 | return std::string(buffer, (sizeof(i) == 4) |
1154 | ? FastUInt32ToBufferLeft(u: i, buffer) |
1155 | : FastUInt64ToBufferLeft(u64: i, buffer)); |
1156 | } |
1157 | |
1158 | // ---------------------------------------------------------------------- |
1159 | // SimpleDtoa() |
1160 | // SimpleFtoa() |
1161 | // DoubleToBuffer() |
1162 | // FloatToBuffer() |
1163 | // We want to print the value without losing precision, but we also do |
1164 | // not want to print more digits than necessary. This turns out to be |
1165 | // trickier than it sounds. Numbers like 0.2 cannot be represented |
1166 | // exactly in binary. If we print 0.2 with a very large precision, |
1167 | // e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167". |
1168 | // On the other hand, if we set the precision too low, we lose |
1169 | // significant digits when printing numbers that actually need them. |
1170 | // It turns out there is no precision value that does the right thing |
1171 | // for all numbers. |
1172 | // |
1173 | // Our strategy is to first try printing with a precision that is never |
1174 | // over-precise, then parse the result with strtod() to see if it |
1175 | // matches. If not, we print again with a precision that will always |
1176 | // give a precise result, but may use more digits than necessary. |
1177 | // |
1178 | // An arguably better strategy would be to use the algorithm described |
1179 | // in "How to Print Floating-Point Numbers Accurately" by Steele & |
1180 | // White, e.g. as implemented by David M. Gay's dtoa(). It turns out, |
1181 | // however, that the following implementation is about as fast as |
1182 | // DMG's code. Furthermore, DMG's code locks mutexes, which means it |
1183 | // will not scale well on multi-core machines. DMG's code is slightly |
1184 | // more accurate (in that it will never use more digits than |
1185 | // necessary), but this is probably irrelevant for most users. |
1186 | // |
1187 | // Rob Pike and Ken Thompson also have an implementation of dtoa() in |
1188 | // third_party/fmt/fltfmt.cc. Their implementation is similar to this |
1189 | // one in that it makes guesses and then uses strtod() to check them. |
1190 | // Their implementation is faster because they use their own code to |
1191 | // generate the digits in the first place rather than use snprintf(), |
1192 | // thus avoiding format string parsing overhead. However, this makes |
1193 | // it considerably more complicated than the following implementation, |
1194 | // and it is embedded in a larger library. If speed turns out to be |
1195 | // an issue, we could re-implement this in terms of their |
1196 | // implementation. |
1197 | // ---------------------------------------------------------------------- |
1198 | |
1199 | std::string SimpleDtoa(double value) { |
1200 | char buffer[kDoubleToBufferSize]; |
1201 | return DoubleToBuffer(i: value, buffer); |
1202 | } |
1203 | |
1204 | std::string SimpleFtoa(float value) { |
1205 | char buffer[kFloatToBufferSize]; |
1206 | return FloatToBuffer(i: value, buffer); |
1207 | } |
1208 | |
// True for the characters accepted here as part of a formatted float apart
// from the radix point: decimal digits, the exponent markers 'e'/'E', and
// the signs '+'/'-'. Everything else, including '.' and NUL, yields false.
static inline bool IsValidFloatChar(char c) {
  switch (c) {
    case 'e':
    case 'E':
    case '+':
    case '-':
      return true;
    default:
      return c >= '0' && c <= '9';
  }
}
1214 | |
1215 | void DelocalizeRadix(char* buffer) { |
1216 | // Fast check: if the buffer has a normal decimal point, assume no |
1217 | // translation is needed. |
1218 | if (strchr(s: buffer, c: '.') != nullptr) return; |
1219 | |
1220 | // Find the first unknown character. |
1221 | while (IsValidFloatChar(c: *buffer)) ++buffer; |
1222 | |
1223 | if (*buffer == '\0') { |
1224 | // No radix character found. |
1225 | return; |
1226 | } |
1227 | |
1228 | // We are now pointing at the locale-specific radix character. Replace it |
1229 | // with '.'. |
1230 | *buffer = '.'; |
1231 | ++buffer; |
1232 | |
1233 | if (!IsValidFloatChar(c: *buffer) && *buffer != '\0') { |
1234 | // It appears the radix was a multi-byte character. We need to remove the |
1235 | // extra bytes. |
1236 | char* target = buffer; |
1237 | do { ++buffer; } while (!IsValidFloatChar(c: *buffer) && *buffer != '\0'); |
1238 | memmove(dest: target, src: buffer, n: strlen(s: buffer) + 1); |
1239 | } |
1240 | } |
1241 | |
1242 | char* DoubleToBuffer(double value, char* buffer) { |
1243 | // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all |
1244 | // platforms these days. Just in case some system exists where DBL_DIG |
1245 | // is significantly larger -- and risks overflowing our buffer -- we have |
1246 | // this assert. |
1247 | static_assert(DBL_DIG < 20, "DBL_DIG_is_too_big" ); |
1248 | |
1249 | if (value == std::numeric_limits<double>::infinity()) { |
1250 | strcpy(dest: buffer, src: "inf" ); |
1251 | return buffer; |
1252 | } else if (value == -std::numeric_limits<double>::infinity()) { |
1253 | strcpy(dest: buffer, src: "-inf" ); |
1254 | return buffer; |
1255 | } else if (std::isnan(x: value)) { |
1256 | strcpy(dest: buffer, src: "nan" ); |
1257 | return buffer; |
1258 | } |
1259 | |
1260 | int snprintf_result = |
1261 | snprintf(s: buffer, maxlen: kDoubleToBufferSize, format: "%.*g" , DBL_DIG, value); |
1262 | |
1263 | // The snprintf should never overflow because the buffer is significantly |
1264 | // larger than the precision we asked for. |
1265 | GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize); |
1266 | |
1267 | // We need to make parsed_value volatile in order to force the compiler to |
1268 | // write it out to the stack. Otherwise, it may keep the value in a |
1269 | // register, and if it does that, it may keep it as a long double instead |
1270 | // of a double. This long double may have extra bits that make it compare |
1271 | // unequal to "value" even though it would be exactly equal if it were |
1272 | // truncated to a double. |
1273 | volatile double parsed_value = internal::NoLocaleStrtod(str: buffer, endptr: nullptr); |
1274 | if (parsed_value != value) { |
1275 | snprintf_result = |
1276 | snprintf(s: buffer, maxlen: kDoubleToBufferSize, format: "%.*g" , DBL_DIG + 2, value); |
1277 | |
1278 | // Should never overflow; see above. |
1279 | GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize); |
1280 | } |
1281 | |
1282 | DelocalizeRadix(buffer); |
1283 | return buffer; |
1284 | } |
1285 | |
1286 | static int memcasecmp(const char *s1, const char *s2, size_t len) { |
1287 | const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1); |
1288 | const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2); |
1289 | |
1290 | for (size_t i = 0; i < len; i++) { |
1291 | const int diff = |
1292 | static_cast<int>(static_cast<unsigned char>(ascii_tolower(c: us1[i]))) - |
1293 | static_cast<int>(static_cast<unsigned char>(ascii_tolower(c: us2[i]))); |
1294 | if (diff != 0) return diff; |
1295 | } |
1296 | return 0; |
1297 | } |
1298 | |
1299 | inline bool CaseEqual(StringPiece s1, StringPiece s2) { |
1300 | if (s1.size() != s2.size()) return false; |
1301 | return memcasecmp(s1: s1.data(), s2: s2.data(), len: s1.size()) == 0; |
1302 | } |
1303 | |
1304 | bool safe_strtob(StringPiece str, bool* value) { |
1305 | GOOGLE_CHECK(value != nullptr) << "nullptr output boolean given." ; |
1306 | if (CaseEqual(s1: str, s2: "true" ) || CaseEqual(s1: str, s2: "t" ) || |
1307 | CaseEqual(s1: str, s2: "yes" ) || CaseEqual(s1: str, s2: "y" ) || |
1308 | CaseEqual(s1: str, s2: "1" )) { |
1309 | *value = true; |
1310 | return true; |
1311 | } |
1312 | if (CaseEqual(s1: str, s2: "false" ) || CaseEqual(s1: str, s2: "f" ) || |
1313 | CaseEqual(s1: str, s2: "no" ) || CaseEqual(s1: str, s2: "n" ) || |
1314 | CaseEqual(s1: str, s2: "0" )) { |
1315 | *value = false; |
1316 | return true; |
1317 | } |
1318 | return false; |
1319 | } |
1320 | |
// Parses the NUL-terminated string `str` as a float and stores the result
// in *value. Returns true only if the input is non-empty, the parser
// consumed it entirely, and no error was reported through errno (which is
// how range errors are signalled). *value is written in every case.
bool safe_strtof(const char* str, float* value) {
  errno = 0;  // errno only gets set on errors, so start from a clean slate
  char* parse_end;
#if defined(_WIN32) || defined (__hpux)  // has no strtof()
  *value = internal::NoLocaleStrtod(str, &parse_end);
#else
  *value = strtof(str, &parse_end);
#endif
  if (*str == 0) return false;        // empty input
  if (*parse_end != 0) return false;  // unparsed characters remain
  return errno == 0;
}
1331 | |
1332 | bool safe_strtod(const char* str, double* value) { |
1333 | char* endptr; |
1334 | *value = internal::NoLocaleStrtod(str, endptr: &endptr); |
1335 | if (endptr != str) { |
1336 | while (ascii_isspace(c: *endptr)) ++endptr; |
1337 | } |
1338 | // Ignore range errors from strtod. The values it |
1339 | // returns on underflow and overflow are the right |
1340 | // fallback in a robust setting. |
1341 | return *str != '\0' && *endptr == '\0'; |
1342 | } |
1343 | |
1344 | bool safe_strto32(const std::string &str, int32_t *value) { |
1345 | return safe_int_internal(text: str, value_p: value); |
1346 | } |
1347 | |
1348 | bool safe_strtou32(const std::string &str, uint32_t *value) { |
1349 | return safe_uint_internal(text: str, value_p: value); |
1350 | } |
1351 | |
1352 | bool safe_strto64(const std::string &str, int64_t *value) { |
1353 | return safe_int_internal(text: str, value_p: value); |
1354 | } |
1355 | |
1356 | bool safe_strtou64(const std::string &str, uint64_t *value) { |
1357 | return safe_uint_internal(text: str, value_p: value); |
1358 | } |
1359 | |
1360 | char* FloatToBuffer(float value, char* buffer) { |
1361 | // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all |
1362 | // platforms these days. Just in case some system exists where FLT_DIG |
1363 | // is significantly larger -- and risks overflowing our buffer -- we have |
1364 | // this assert. |
1365 | static_assert(FLT_DIG < 10, "FLT_DIG_is_too_big" ); |
1366 | |
1367 | if (value == std::numeric_limits<double>::infinity()) { |
1368 | strcpy(dest: buffer, src: "inf" ); |
1369 | return buffer; |
1370 | } else if (value == -std::numeric_limits<double>::infinity()) { |
1371 | strcpy(dest: buffer, src: "-inf" ); |
1372 | return buffer; |
1373 | } else if (std::isnan(x: value)) { |
1374 | strcpy(dest: buffer, src: "nan" ); |
1375 | return buffer; |
1376 | } |
1377 | |
1378 | int snprintf_result = |
1379 | snprintf(s: buffer, maxlen: kFloatToBufferSize, format: "%.*g" , FLT_DIG, value); |
1380 | |
1381 | // The snprintf should never overflow because the buffer is significantly |
1382 | // larger than the precision we asked for. |
1383 | GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize); |
1384 | |
1385 | float parsed_value; |
1386 | if (!safe_strtof(str: buffer, value: &parsed_value) || parsed_value != value) { |
1387 | snprintf_result = |
1388 | snprintf(s: buffer, maxlen: kFloatToBufferSize, format: "%.*g" , FLT_DIG + 3, value); |
1389 | |
1390 | // Should never overflow; see above. |
1391 | GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize); |
1392 | } |
1393 | |
1394 | DelocalizeRadix(buffer); |
1395 | return buffer; |
1396 | } |
1397 | |
1398 | namespace strings { |
1399 | |
1400 | AlphaNum::AlphaNum(strings::Hex hex) { |
1401 | char *const end = &digits[kFastToBufferSize]; |
1402 | char *writer = end; |
1403 | uint64_t value = hex.value; |
1404 | uint64_t width = hex.spec; |
1405 | // We accomplish minimum width by OR'ing in 0x10000 to the user's value, |
1406 | // where 0x10000 is the smallest hex number that is as wide as the user |
1407 | // asked for. |
1408 | uint64_t mask = (static_cast<uint64_t>(1) << ((width - 1) * 4)) | value; |
1409 | static const char hexdigits[] = "0123456789abcdef" ; |
1410 | do { |
1411 | *--writer = hexdigits[value & 0xF]; |
1412 | value >>= 4; |
1413 | mask >>= 4; |
1414 | } while (mask != 0); |
1415 | piece_data_ = writer; |
1416 | piece_size_ = end - writer; |
1417 | } |
1418 | |
1419 | } // namespace strings |
1420 | |
1421 | // ---------------------------------------------------------------------- |
1422 | // StrCat() |
1423 | // This merges the given strings or integers, with no delimiter. This |
1424 | // is designed to be the fastest possible way to construct a string out |
1425 | // of a mix of raw C strings, C++ strings, and integer values. |
1426 | // ---------------------------------------------------------------------- |
1427 | |
1428 | // Append is merely a version of memcpy that returns the address of the byte |
1429 | // after the area just overwritten. It comes in multiple flavors to minimize |
1430 | // call overhead. |
1431 | static char *Append1(char *out, const AlphaNum &x) { |
1432 | if (x.size() > 0) { |
1433 | memcpy(dest: out, src: x.data(), n: x.size()); |
1434 | out += x.size(); |
1435 | } |
1436 | return out; |
1437 | } |
1438 | |
1439 | static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) { |
1440 | if (x1.size() > 0) { |
1441 | memcpy(dest: out, src: x1.data(), n: x1.size()); |
1442 | out += x1.size(); |
1443 | } |
1444 | if (x2.size() > 0) { |
1445 | memcpy(dest: out, src: x2.data(), n: x2.size()); |
1446 | out += x2.size(); |
1447 | } |
1448 | return out; |
1449 | } |
1450 | |
1451 | static char *Append4(char *out, const AlphaNum &x1, const AlphaNum &x2, |
1452 | const AlphaNum &x3, const AlphaNum &x4) { |
1453 | if (x1.size() > 0) { |
1454 | memcpy(dest: out, src: x1.data(), n: x1.size()); |
1455 | out += x1.size(); |
1456 | } |
1457 | if (x2.size() > 0) { |
1458 | memcpy(dest: out, src: x2.data(), n: x2.size()); |
1459 | out += x2.size(); |
1460 | } |
1461 | if (x3.size() > 0) { |
1462 | memcpy(dest: out, src: x3.data(), n: x3.size()); |
1463 | out += x3.size(); |
1464 | } |
1465 | if (x4.size() > 0) { |
1466 | memcpy(dest: out, src: x4.data(), n: x4.size()); |
1467 | out += x4.size(); |
1468 | } |
1469 | return out; |
1470 | } |
1471 | |
1472 | std::string StrCat(const AlphaNum &a, const AlphaNum &b) { |
1473 | std::string result; |
1474 | result.resize(n: a.size() + b.size()); |
1475 | char *const begin = &*result.begin(); |
1476 | char *out = Append2(out: begin, x1: a, x2: b); |
1477 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1478 | return result; |
1479 | } |
1480 | |
1481 | std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) { |
1482 | std::string result; |
1483 | result.resize(n: a.size() + b.size() + c.size()); |
1484 | char *const begin = &*result.begin(); |
1485 | char *out = Append2(out: begin, x1: a, x2: b); |
1486 | out = Append1(out, x: c); |
1487 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1488 | return result; |
1489 | } |
1490 | |
1491 | std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, |
1492 | const AlphaNum &d) { |
1493 | std::string result; |
1494 | result.resize(n: a.size() + b.size() + c.size() + d.size()); |
1495 | char *const begin = &*result.begin(); |
1496 | char *out = Append4(out: begin, x1: a, x2: b, x3: c, x4: d); |
1497 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1498 | return result; |
1499 | } |
1500 | |
1501 | std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, |
1502 | const AlphaNum &d, const AlphaNum &e) { |
1503 | std::string result; |
1504 | result.resize(n: a.size() + b.size() + c.size() + d.size() + e.size()); |
1505 | char *const begin = &*result.begin(); |
1506 | char *out = Append4(out: begin, x1: a, x2: b, x3: c, x4: d); |
1507 | out = Append1(out, x: e); |
1508 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1509 | return result; |
1510 | } |
1511 | |
1512 | std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, |
1513 | const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) { |
1514 | std::string result; |
1515 | result.resize(n: a.size() + b.size() + c.size() + d.size() + e.size() + |
1516 | f.size()); |
1517 | char *const begin = &*result.begin(); |
1518 | char *out = Append4(out: begin, x1: a, x2: b, x3: c, x4: d); |
1519 | out = Append2(out, x1: e, x2: f); |
1520 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1521 | return result; |
1522 | } |
1523 | |
1524 | std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, |
1525 | const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, |
1526 | const AlphaNum &g) { |
1527 | std::string result; |
1528 | result.resize(n: a.size() + b.size() + c.size() + d.size() + e.size() + |
1529 | f.size() + g.size()); |
1530 | char *const begin = &*result.begin(); |
1531 | char *out = Append4(out: begin, x1: a, x2: b, x3: c, x4: d); |
1532 | out = Append2(out, x1: e, x2: f); |
1533 | out = Append1(out, x: g); |
1534 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1535 | return result; |
1536 | } |
1537 | |
1538 | std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, |
1539 | const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, |
1540 | const AlphaNum &g, const AlphaNum &h) { |
1541 | std::string result; |
1542 | result.resize(n: a.size() + b.size() + c.size() + d.size() + e.size() + |
1543 | f.size() + g.size() + h.size()); |
1544 | char *const begin = &*result.begin(); |
1545 | char *out = Append4(out: begin, x1: a, x2: b, x3: c, x4: d); |
1546 | out = Append4(out, x1: e, x2: f, x3: g, x4: h); |
1547 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1548 | return result; |
1549 | } |
1550 | |
1551 | std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, |
1552 | const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, |
1553 | const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) { |
1554 | std::string result; |
1555 | result.resize(n: a.size() + b.size() + c.size() + d.size() + e.size() + |
1556 | f.size() + g.size() + h.size() + i.size()); |
1557 | char *const begin = &*result.begin(); |
1558 | char *out = Append4(out: begin, x1: a, x2: b, x3: c, x4: d); |
1559 | out = Append4(out, x1: e, x2: f, x3: g, x4: h); |
1560 | out = Append1(out, x: i); |
1561 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1562 | return result; |
1563 | } |
1564 | |
// Debug-mode guard against calling StrAppend with a piece that points
// partway into the very string being appended to (e.g. a char * into
// *result). The results of such a call are unpredictable, so it is checked
// for in debug builds.
//
// The pointer difference is converted to an unsigned integer so that a
// single comparison suffices: a `src` that starts before `dest` wraps around
// to a very large value and passes, while a `src` that starts at or after
// dest.data() passes only if its offset is greater than dest.size().
#define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
    GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \
                     uintptr_t((dest).size()))
1572 | |
1573 | void StrAppend(std::string *result, const AlphaNum &a) { |
1574 | GOOGLE_DCHECK_NO_OVERLAP(*result, a); |
1575 | result->append(s: a.data(), n: a.size()); |
1576 | } |
1577 | |
1578 | void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b) { |
1579 | GOOGLE_DCHECK_NO_OVERLAP(*result, a); |
1580 | GOOGLE_DCHECK_NO_OVERLAP(*result, b); |
1581 | std::string::size_type old_size = result->size(); |
1582 | result->resize(n: old_size + a.size() + b.size()); |
1583 | char *const begin = &*result->begin(); |
1584 | char *out = Append2(out: begin + old_size, x1: a, x2: b); |
1585 | GOOGLE_DCHECK_EQ(out, begin + result->size()); |
1586 | } |
1587 | |
1588 | void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b, |
1589 | const AlphaNum &c) { |
1590 | GOOGLE_DCHECK_NO_OVERLAP(*result, a); |
1591 | GOOGLE_DCHECK_NO_OVERLAP(*result, b); |
1592 | GOOGLE_DCHECK_NO_OVERLAP(*result, c); |
1593 | std::string::size_type old_size = result->size(); |
1594 | result->resize(n: old_size + a.size() + b.size() + c.size()); |
1595 | char *const begin = &*result->begin(); |
1596 | char *out = Append2(out: begin + old_size, x1: a, x2: b); |
1597 | out = Append1(out, x: c); |
1598 | GOOGLE_DCHECK_EQ(out, begin + result->size()); |
1599 | } |
1600 | |
1601 | void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b, |
1602 | const AlphaNum &c, const AlphaNum &d) { |
1603 | GOOGLE_DCHECK_NO_OVERLAP(*result, a); |
1604 | GOOGLE_DCHECK_NO_OVERLAP(*result, b); |
1605 | GOOGLE_DCHECK_NO_OVERLAP(*result, c); |
1606 | GOOGLE_DCHECK_NO_OVERLAP(*result, d); |
1607 | std::string::size_type old_size = result->size(); |
1608 | result->resize(n: old_size + a.size() + b.size() + c.size() + d.size()); |
1609 | char *const begin = &*result->begin(); |
1610 | char *out = Append4(out: begin + old_size, x1: a, x2: b, x3: c, x4: d); |
1611 | GOOGLE_DCHECK_EQ(out, begin + result->size()); |
1612 | } |
1613 | |
1614 | int GlobalReplaceSubstring(const std::string &substring, |
1615 | const std::string &replacement, std::string *s) { |
1616 | GOOGLE_CHECK(s != nullptr); |
1617 | if (s->empty() || substring.empty()) |
1618 | return 0; |
1619 | std::string tmp; |
1620 | int num_replacements = 0; |
1621 | int pos = 0; |
1622 | for (StringPiece::size_type match_pos = |
1623 | s->find(s: substring.data(), pos: pos, n: substring.length()); |
1624 | match_pos != std::string::npos; pos = match_pos + substring.length(), |
1625 | match_pos = s->find(s: substring.data(), pos: pos, |
1626 | n: substring.length())) { |
1627 | ++num_replacements; |
1628 | // Append the original content before the match. |
1629 | tmp.append(str: *s, pos: pos, n: match_pos - pos); |
1630 | // Append the replacement for the match. |
1631 | tmp.append(first: replacement.begin(), last: replacement.end()); |
1632 | } |
1633 | // Append the content after the last match. If no replacements were made, the |
1634 | // original string is left untouched. |
1635 | if (num_replacements > 0) { |
1636 | tmp.append(str: *s, pos: pos, n: s->length() - pos); |
1637 | s->swap(s&: tmp); |
1638 | } |
1639 | return num_replacements; |
1640 | } |
1641 | |
// Returns the number of characters needed to base64-encode `input_len` bytes,
// with or without trailing '=' padding.
//
// Per http://tools.ietf.org/html/rfc3548, input is encoded in 24-bit quanta
// (three bytes -> four characters). A final quantum with fewer than 24 bits
// is zero-extended to a whole number of 6-bit groups, giving three cases:
//   (1) input is a multiple of 3 bytes: no extra characters, no padding;
//   (2) one byte left over: two characters plus two '=' pad characters;
//   (3) two bytes left over: three characters plus one '=' pad character.
int CalculateBase64EscapedLen(int input_len, bool do_padding) {
  // Every complete three-byte group becomes four output characters.
  int len = (input_len / 3) * 4;

  switch (input_len % 3) {
    case 0:
      // Case (1): nothing left over.
      break;
    case 1:
      // Case (2): two data characters, optionally rounded up to four.
      len += do_padding ? 4 : 2;
      break;
    default:
      // Case (3): three data characters, optionally rounded up to four.
      len += do_padding ? 4 : 3;
      break;
  }

  assert(len >= input_len);  // make sure we didn't overflow
  return len;
}
1688 | |
1689 | // Base64Escape does padding, so this calculation includes padding. |
1690 | int CalculateBase64EscapedLen(int input_len) { |
1691 | return CalculateBase64EscapedLen(input_len, do_padding: true); |
1692 | } |
1693 | |
1694 | // ---------------------------------------------------------------------- |
1695 | // int Base64Unescape() - base64 decoder |
1696 | // int Base64Escape() - base64 encoder |
1697 | // int WebSafeBase64Unescape() - Google's variation of base64 decoder |
1698 | // int WebSafeBase64Escape() - Google's variation of base64 encoder |
1699 | // |
1700 | // Check out |
1701 | // http://tools.ietf.org/html/rfc2045 for formal description, but what we |
1702 | // care about is that... |
1703 | // Take the encoded stuff in groups of 4 characters and turn each |
1704 | // character into a code 0 to 63 thus: |
1705 | // A-Z map to 0 to 25 |
1706 | // a-z map to 26 to 51 |
1707 | // 0-9 map to 52 to 61 |
1708 | // +(- for WebSafe) maps to 62 |
1709 | // /(_ for WebSafe) maps to 63 |
1710 | // There will be four numbers, all less than 64 which can be represented |
1711 | // by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively). |
1712 | // Arrange the 6 digit binary numbers into three bytes as such: |
1713 | // aaaaaabb bbbbcccc ccdddddd |
1714 | // Equals signs (one or two) are used at the end of the encoded block to |
1715 | // indicate that the text was not an integer multiple of three bytes long. |
1716 | // ---------------------------------------------------------------------- |
1717 | |
// Decodes base64 text from src_param (at most szsrc bytes) using the lookup
// table unbase64, which is indexed by byte value and yields the 6-bit code
// of a data character or a negative value for any other byte.
//
// If dest is non-null, decoded bytes are stored in dest, which has room for
// szdest bytes. If dest is null, nothing is stored and the input is only
// checked and measured.
//
// Returns the number of decoded bytes, or -1 if the input contains an
// illegal character, ends with a dangling single 6-bit group, carries a
// number of pad characters that is neither 0 nor the expected count, or
// would not fit in szdest.
1718 | int Base64UnescapeInternal(const char *src_param, int szsrc, |
1719 | char *dest, int szdest, |
1720 | const signed char* unbase64) { |
1721 | static const char kPad64Equals = '='; |
1722 | static const char kPad64Dot = '.'; |
1723 | |
// decode:  table value of the most recently read input byte.
// destidx: number of output bytes produced (or counted) so far.
// state:   number of 6-bit groups currently accumulated in temp.
// ch:      the most recently read input byte.
1724 | int decode = 0; |
1725 | int destidx = 0; |
1726 | int state = 0; |
1727 | unsigned int ch = 0; |
1728 | unsigned int temp = 0; |
1729 | |
1730 | // If "char" is signed by default, using *src as an array index results in |
1731 | // accessing negative array elements. Treat the input as a pointer to |
1732 | // unsigned char to avoid this. |
1733 | const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param); |
1734 | |
1735 | // The GET_INPUT macro gets the next input character, skipping |
1736 | // over any whitespace, and stopping when we reach the end of the |
1737 | // string or when we read any non-data character. The arguments are |
1738 | // an arbitrary identifier (used as a label for goto) and the number |
1739 | // of data bytes that must remain in the input to avoid aborting the |
1740 | // loop. |
1741 | #define GET_INPUT(label, remain) \ |
1742 | label: \ |
1743 | --szsrc; \ |
1744 | ch = *src++; \ |
1745 | decode = unbase64[ch]; \ |
1746 | if (decode < 0) { \ |
1747 | if (ascii_isspace(ch) && szsrc >= remain) \ |
1748 | goto label; \ |
1749 | state = 4 - remain; \ |
1750 | break; \ |
1751 | } |
1752 | |
1753 | // if dest is null, we're just checking to see if it's legal input |
1754 | // rather than producing output. (I suspect this could just be done |
1755 | // with a regexp...). We duplicate the loop so this test can be |
1756 | // outside it instead of in every iteration. |
1757 | |
1758 | if (dest) { |
1759 | // This loop consumes 4 input bytes and produces 3 output bytes |
1760 | // per iteration. We can't know at the start that there is enough |
1761 | // data left in the string for a full iteration, so the loop may |
1762 | // break out in the middle; if so 'state' will be set to the |
1763 | // number of input bytes read. |
1764 | |
1765 | while (szsrc >= 4) { |
1766 | // We'll start by optimistically assuming that the next four |
1767 | // bytes of the string (src[0..3]) are four good data bytes |
1768 | // (that is, no nulls, whitespace, padding chars, or illegal |
1769 | // chars). We need to test src[0..2] for nulls individually |
1770 | // before constructing temp to preserve the property that we |
1771 | // never read past a null in the string (no matter how long |
1772 | // szsrc claims the string is). |
1773 | |
1774 | if (!src[0] || !src[1] || !src[2] || |
1775 | (temp = ((unsigned(unbase64[src[0]]) << 18) | |
1776 | (unsigned(unbase64[src[1]]) << 12) | |
1777 | (unsigned(unbase64[src[2]]) << 6) | |
1778 | (unsigned(unbase64[src[3]])))) & 0x80000000) { |
1779 | // Iff any of those four characters was bad (null, illegal, |
1780 | // whitespace, padding), then temp's high bit will be set |
1781 | // (because unbase64[] is -1 for all bad characters). |
1782 | // |
1783 | // We'll back up and resort to the slower decoder, which knows |
1784 | // how to handle those cases. |
1785 | |
1786 | GET_INPUT(first, 4); |
1787 | temp = decode; |
1788 | GET_INPUT(second, 3); |
1789 | temp = (temp << 6) | decode; |
1790 | GET_INPUT(third, 2); |
1791 | temp = (temp << 6) | decode; |
1792 | GET_INPUT(fourth, 1); |
1793 | temp = (temp << 6) | decode; |
1794 | } else { |
1795 | // We really did have four good data bytes, so advance four |
1796 | // characters in the string. |
1797 | |
1798 | szsrc -= 4; |
1799 | src += 4; |
// With ch == '\0' both the bad-character test and the pad-character
// test that follow the loop are skipped.
1800 | decode = -1; |
1801 | ch = '\0'; |
1802 | } |
1803 | |
1804 | // temp has 24 bits of input, so write that out as three bytes. |
1805 | |
// Refuse to store past the szdest bytes available in dest.
1806 | if (destidx+3 > szdest) return -1; |
1807 | dest[destidx+2] = temp; |
1808 | temp >>= 8; |
1809 | dest[destidx+1] = temp; |
1810 | temp >>= 8; |
1811 | dest[destidx] = temp; |
1812 | destidx += 3; |
1813 | } |
1814 | } else { |
// Same loop as above for the dest == null case: the input is consumed
// and checked the same way, but nothing is stored.
1815 | while (szsrc >= 4) { |
1816 | if (!src[0] || !src[1] || !src[2] || |
1817 | (temp = ((unsigned(unbase64[src[0]]) << 18) | |
1818 | (unsigned(unbase64[src[1]]) << 12) | |
1819 | (unsigned(unbase64[src[2]]) << 6) | |
1820 | (unsigned(unbase64[src[3]])))) & 0x80000000) { |
1821 | GET_INPUT(first_no_dest, 4); |
1822 | GET_INPUT(second_no_dest, 3); |
1823 | GET_INPUT(third_no_dest, 2); |
1824 | GET_INPUT(fourth_no_dest, 1); |
1825 | } else { |
1826 | szsrc -= 4; |
1827 | src += 4; |
1828 | decode = -1; |
1829 | ch = '\0'; |
1830 | } |
// Count the three bytes this group would have produced.
1831 | destidx += 3; |
1832 | } |
1833 | } |
1834 | |
1835 | #undef GET_INPUT |
1836 | |
1837 | // if the loop terminated because we read a bad character, return |
1838 | // now. |
1839 | if (decode < 0 && ch != '\0' && |
1840 | ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(c: ch)) |
1841 | return -1; |
1842 | |
1843 | if (ch == kPad64Equals || ch == kPad64Dot) { |
1844 | // if we stopped by hitting an '=' or '.', un-read that character -- we'll |
1845 | // look at it again when we count to check for the proper number of |
1846 | // equals signs at the end. |
1847 | ++szsrc; |
1848 | --src; |
1849 | } else { |
1850 | // This loop consumes 1 input byte per iteration. It's used to |
1851 | // clean up the 0-3 input bytes remaining when the first, faster |
1852 | // loop finishes. 'temp' contains the data from 'state' input |
1853 | // characters read by the first loop. |
1854 | while (szsrc > 0) { |
1855 | --szsrc; |
1856 | ch = *src++; |
1857 | decode = unbase64[ch]; |
1858 | if (decode < 0) { |
1859 | if (ascii_isspace(c: ch)) { |
1860 | continue; |
1861 | } else if (ch == '\0') { |
1862 | break; |
1863 | } else if (ch == kPad64Equals || ch == kPad64Dot) { |
1864 | // back up one character; we'll read it again when we check |
1865 | // for the correct number of pad characters at the end. |
1866 | ++szsrc; |
1867 | --src; |
1868 | break; |
1869 | } else { |
1870 | return -1; |
1871 | } |
1872 | } |
1873 | |
1874 | // Each input character gives us six bits of output. |
1875 | temp = (temp << 6) | decode; |
1876 | ++state; |
1877 | if (state == 4) { |
1878 | // If we've accumulated 24 bits of output, write that out as |
1879 | // three bytes. |
1880 | if (dest) { |
1881 | if (destidx+3 > szdest) return -1; |
1882 | dest[destidx+2] = temp; |
1883 | temp >>= 8; |
1884 | dest[destidx+1] = temp; |
1885 | temp >>= 8; |
1886 | dest[destidx] = temp; |
1887 | } |
1888 | destidx += 3; |
1889 | state = 0; |
1890 | temp = 0; |
1891 | } |
1892 | } |
1893 | } |
1894 | |
1895 | // Process the leftover data contained in 'temp' at the end of the input. |
1896 | int expected_equals = 0; |
1897 | switch (state) { |
1898 | case 0: |
1899 | // Nothing left over; output is a multiple of 3 bytes. |
1900 | break; |
1901 | |
1902 | case 1: |
1903 | // Bad input; we have 6 bits left over. |
1904 | return -1; |
1905 | |
1906 | case 2: |
1907 | // Produce one more output byte from the 12 input bits we have left. |
1908 | if (dest) { |
1909 | if (destidx+1 > szdest) return -1; |
1910 | temp >>= 4; |
1911 | dest[destidx] = temp; |
1912 | } |
1913 | ++destidx; |
1914 | expected_equals = 2; |
1915 | break; |
1916 | |
1917 | case 3: |
1918 | // Produce two more output bytes from the 18 input bits we have left. |
1919 | if (dest) { |
1920 | if (destidx+2 > szdest) return -1; |
1921 | temp >>= 2; |
1922 | dest[destidx+1] = temp; |
1923 | temp >>= 8; |
1924 | dest[destidx] = temp; |
1925 | } |
1926 | destidx += 2; |
1927 | expected_equals = 1; |
1928 | break; |
1929 | |
1930 | default: |
1931 | // state should have no other values at this point. |
1932 | GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state; |
1933 | } |
1934 | |
1935 | // The remainder of the string should be all whitespace, mixed with |
1936 | // exactly 0 equals signs, or exactly 'expected_equals' equals |
1937 | // signs. (Always accepting 0 equals signs is a google extension |
1938 | // not covered in the RFC, as is accepting dot as the pad character.) |
1939 | |
1940 | int equals = 0; |
1941 | while (szsrc > 0 && *src) { |
1942 | if (*src == kPad64Equals || *src == kPad64Dot) |
1943 | ++equals; |
1944 | else if (!ascii_isspace(c: *src)) |
1945 | return -1; |
1946 | --szsrc; |
1947 | ++src; |
1948 | } |
1949 | |
1950 | return (equals == 0 || equals == expected_equals) ? destidx : -1; |
1951 | } |
1952 | |
1953 | // The arrays below were generated by the following code |
1954 | // #include <sys/time.h> |
1955 | // #include <stdlib.h> |
1956 | // #include <string.h> |
1957 | // #include <stdio.h> |
1958 | // main() |
1959 | // { |
1960 | // static const char Base64[] = |
1961 | // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; |
1962 | // const char *pos; |
1963 | // int idx, i, j; |
1964 | // printf(" "); |
1965 | // for (i = 0; i < 255; i += 8) { |
1966 | // for (j = i; j < i + 8; j++) { |
1967 | // pos = strchr(Base64, j); |
1968 | // if ((pos == nullptr) || (j == 0)) |
1969 | // idx = -1; |
1970 | // else |
1971 | // idx = pos - Base64; |
1972 | // if (idx == -1) |
1973 | // printf(" %2d, ", idx); |
1974 | // else |
1975 | // printf(" %2d/""*%c*""/,", idx, j); |
1976 | // } |
1977 | // printf("\n "); |
1978 | // } |
1979 | // } |
1980 | // |
1981 | // where the value of "Base64[]" was replaced by one of the base-64 conversion |
1982 | // tables from the functions below. |
// Decode table for the standard base64 alphabet, indexed by byte value.
// Alphabet characters map to their 6-bit code; every other byte maps to -1.
static const signed char kUnBase64[] = {
    // 0x00 - 0x0F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0x10 - 0x1F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0x20 - 0x2F: '+' (0x2B) -> 62, '/' (0x2F) -> 63
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
    // 0x30 - 0x3F: '0'..'9' -> 52..61
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
    // 0x40 - 0x4F: 'A'..'O' -> 0..14
    -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
    // 0x50 - 0x5F: 'P'..'Z' -> 15..25
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
    // 0x60 - 0x6F: 'a'..'o' -> 26..40
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    // 0x70 - 0x7F: 'p'..'z' -> 41..51
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
    // 0x80 - 0xFF: no data characters
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
// Decode table for the web-safe base64 alphabet ('-' and '_' take the place
// of '+' and '/'), indexed by byte value. Alphabet characters map to their
// 6-bit code; every other byte maps to -1.
static const signed char kUnWebSafeBase64[] = {
    // 0x00 - 0x0F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0x10 - 0x1F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0x20 - 0x2F: '-' (0x2D) -> 62
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1,
    // 0x30 - 0x3F: '0'..'9' -> 52..61
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
    // 0x40 - 0x4F: 'A'..'O' -> 0..14
    -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
    // 0x50 - 0x5F: 'P'..'Z' -> 15..25, '_' (0x5F) -> 63
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, 63,
    // 0x60 - 0x6F: 'a'..'o' -> 26..40
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    // 0x70 - 0x7F: 'p'..'z' -> 41..51
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
    // 0x80 - 0xFF: no data characters
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
2051 | |
2052 | int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) { |
2053 | return Base64UnescapeInternal(src_param: src, szsrc, dest, szdest, unbase64: kUnWebSafeBase64); |
2054 | } |
2055 | |
2056 | static bool Base64UnescapeInternal(const char *src, int slen, std::string *dest, |
2057 | const signed char *unbase64) { |
2058 | // Determine the size of the output string. Base64 encodes every 3 bytes into |
2059 | // 4 characters. any leftover chars are added directly for good measure. |
2060 | // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548 |
2061 | const int dest_len = 3 * (slen / 4) + (slen % 4); |
2062 | |
2063 | dest->resize(n: dest_len); |
2064 | |
2065 | // We are getting the destination buffer by getting the beginning of the |
2066 | // string and converting it into a char *. |
2067 | const int len = Base64UnescapeInternal(src_param: src, szsrc: slen, dest: string_as_array(str: dest), |
2068 | szdest: dest_len, unbase64); |
2069 | if (len < 0) { |
2070 | dest->clear(); |
2071 | return false; |
2072 | } |
2073 | |
2074 | // could be shorter if there was padding |
2075 | GOOGLE_DCHECK_LE(len, dest_len); |
2076 | dest->erase(pos: len); |
2077 | |
2078 | return true; |
2079 | } |
2080 | |
2081 | bool Base64Unescape(StringPiece src, std::string *dest) { |
2082 | return Base64UnescapeInternal(src: src.data(), slen: src.size(), dest, unbase64: kUnBase64); |
2083 | } |
2084 | |
2085 | bool WebSafeBase64Unescape(StringPiece src, std::string *dest) { |
2086 | return Base64UnescapeInternal(src: src.data(), slen: src.size(), dest, unbase64: kUnWebSafeBase64); |
2087 | } |
2088 | |
// Base64-encodes the `szsrc` bytes at `src` into `dest`, which has room for
// `szdest` characters, using the 64-character alphabet `base64`. When
// `do_padding` is true, a trailing partial group is padded with '=' so that
// the output length is a multiple of four.
//
// Returns the number of characters written to `dest`; no terminating NUL is
// added. Returns 0 if `szsrc` <= 0 or if `dest` is too small; in the latter
// case part of `dest` may already have been overwritten.
int Base64EscapeInternal(const unsigned char *src, int szsrc,
                         char *dest, int szdest, const char *base64,
                         bool do_padding) {
  static const char kPad64 = '=';

  if (szsrc <= 0) return 0;

  // Three input bytes become four output characters. The products are formed
  // in 64 bits so that large sizes cannot overflow int.
  if (static_cast<int64_t>(szsrc) * 4 > static_cast<int64_t>(szdest) * 3) {
    return 0;
  }

  char *cur_dest = dest;
  const unsigned char *cur_src = src;
  int src_left = szsrc;

  // Encode every complete three-byte group. The capacity check above
  // guarantees four free output characters for each of them. The group is
  // assembled byte by byte, so nothing beyond the input is ever read.
  while (src_left >= 3) {
    const uint32_t in = (static_cast<uint32_t>(cur_src[0]) << 16) |
                        (static_cast<uint32_t>(cur_src[1]) << 8) |
                        static_cast<uint32_t>(cur_src[2]);
    cur_dest[0] = base64[in >> 18];
    cur_dest[1] = base64[(in >> 12) & 0x3F];
    cur_dest[2] = base64[(in >> 6) & 0x3F];
    cur_dest[3] = base64[in & 0x3F];
    cur_dest += 4;
    cur_src += 3;
    src_left -= 3;
  }

  // Room still available in dest for the tail.
  int dest_left = szdest - static_cast<int>(cur_dest - dest);

  if (src_left == 1) {
    // One byte left: two characters, plus (optionally) two pad characters
    // to round out the four-character group.
    if ((dest_left -= 2) < 0) return 0;
    const uint32_t in = cur_src[0];
    cur_dest[0] = base64[in >> 2];
    cur_dest[1] = base64[(in & 0x3) << 4];
    cur_dest += 2;
    if (do_padding) {
      if ((dest_left -= 2) < 0) return 0;
      cur_dest[0] = kPad64;
      cur_dest[1] = kPad64;
      cur_dest += 2;
    }
  } else if (src_left == 2) {
    // Two bytes left: three characters, plus (optionally) one pad character
    // to round out the four-character group.
    if ((dest_left -= 3) < 0) return 0;
    const uint32_t in = (static_cast<uint32_t>(cur_src[0]) << 8) |
                        static_cast<uint32_t>(cur_src[1]);
    cur_dest[0] = base64[in >> 10];
    cur_dest[1] = base64[(in >> 4) & 0x3F];
    cur_dest[2] = base64[(in & 0xF) << 2];
    cur_dest += 3;
    if (do_padding) {
      if ((dest_left -= 1) < 0) return 0;
      cur_dest[0] = kPad64;
      cur_dest += 1;
    }
  }

  return static_cast<int>(cur_dest - dest);
}
2188 | |
// Standard base64 alphabet: the character at index i encodes the 6-bit
// value i.
static const char kBase64Chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";
2191 | |
// Web-safe base64 alphabet: same as the standard one except that '-' and '_'
// stand in for '+' and '/'.
static const char kWebSafeBase64Chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-_";
2194 | |
2195 | int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) { |
2196 | return Base64EscapeInternal(src, szsrc, dest, szdest, base64: kBase64Chars, do_padding: true); |
2197 | } |
2198 | int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest, |
2199 | int szdest, bool do_padding) { |
2200 | return Base64EscapeInternal(src, szsrc, dest, szdest, |
2201 | base64: kWebSafeBase64Chars, do_padding); |
2202 | } |
2203 | |
2204 | void Base64EscapeInternal(const unsigned char *src, int szsrc, |
2205 | std::string *dest, bool do_padding, |
2206 | const char *base64_chars) { |
2207 | const int calc_escaped_size = |
2208 | CalculateBase64EscapedLen(input_len: szsrc, do_padding); |
2209 | dest->resize(n: calc_escaped_size); |
2210 | const int escaped_len = Base64EscapeInternal(src, szsrc, |
2211 | dest: string_as_array(str: dest), |
2212 | szdest: dest->size(), |
2213 | base64: base64_chars, |
2214 | do_padding); |
2215 | GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len); |
2216 | dest->erase(pos: escaped_len); |
2217 | } |
2218 | |
2219 | void Base64Escape(const unsigned char *src, int szsrc, std::string *dest, |
2220 | bool do_padding) { |
2221 | Base64EscapeInternal(src, szsrc, dest, do_padding, base64_chars: kBase64Chars); |
2222 | } |
2223 | |
2224 | void WebSafeBase64Escape(const unsigned char *src, int szsrc, std::string *dest, |
2225 | bool do_padding) { |
2226 | Base64EscapeInternal(src, szsrc, dest, do_padding, base64_chars: kWebSafeBase64Chars); |
2227 | } |
2228 | |
2229 | void Base64Escape(StringPiece src, std::string *dest) { |
2230 | Base64Escape(src: reinterpret_cast<const unsigned char*>(src.data()), |
2231 | szsrc: src.size(), dest, do_padding: true); |
2232 | } |
2233 | |
2234 | void WebSafeBase64Escape(StringPiece src, std::string *dest) { |
2235 | WebSafeBase64Escape(src: reinterpret_cast<const unsigned char*>(src.data()), |
2236 | szsrc: src.size(), dest, do_padding: false); |
2237 | } |
2238 | |
2239 | void WebSafeBase64EscapeWithPadding(StringPiece src, std::string *dest) { |
2240 | WebSafeBase64Escape(src: reinterpret_cast<const unsigned char*>(src.data()), |
2241 | szsrc: src.size(), dest, do_padding: true); |
2242 | } |
2243 | |
2244 | // Helper to append a Unicode code point to a string as UTF8, without bringing |
2245 | // in any external dependencies. |
2246 | int EncodeAsUTF8Char(uint32_t code_point, char* output) { |
2247 | uint32_t tmp = 0; |
2248 | int len = 0; |
2249 | if (code_point <= 0x7f) { |
2250 | tmp = code_point; |
2251 | len = 1; |
2252 | } else if (code_point <= 0x07ff) { |
2253 | tmp = 0x0000c080 | |
2254 | ((code_point & 0x07c0) << 2) | |
2255 | (code_point & 0x003f); |
2256 | len = 2; |
2257 | } else if (code_point <= 0xffff) { |
2258 | tmp = 0x00e08080 | |
2259 | ((code_point & 0xf000) << 4) | |
2260 | ((code_point & 0x0fc0) << 2) | |
2261 | (code_point & 0x003f); |
2262 | len = 3; |
2263 | } else { |
2264 | // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is |
2265 | // normally only defined up to there as well. |
2266 | tmp = 0xf0808080 | |
2267 | ((code_point & 0x1c0000) << 6) | |
2268 | ((code_point & 0x03f000) << 4) | |
2269 | ((code_point & 0x000fc0) << 2) | |
2270 | (code_point & 0x003f); |
2271 | len = 4; |
2272 | } |
2273 | tmp = ghtonl(x: tmp); |
2274 | memcpy(dest: output, src: reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, n: len); |
2275 | return len; |
2276 | } |
2277 | |
// Length of a UTF-8 character, indexed by the value of its first byte.
static const unsigned char kUTF8LenTbl[256] = {
    // 0x00 - 0x7F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x80 - 0xBF
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0xC0 - 0xDF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // 0xE0 - 0xEF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    // 0xF0 - 0xFF
    4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2293 | |
2294 | // Return length of a single UTF-8 source character |
2295 | int UTF8FirstLetterNumBytes(const char* src, int len) { |
2296 | if (len == 0) { |
2297 | return 0; |
2298 | } |
2299 | return kUTF8LenTbl[*reinterpret_cast<const uint8_t*>(src)]; |
2300 | } |
2301 | |
2302 | // ---------------------------------------------------------------------- |
2303 | // CleanStringLineEndings() |
2304 | // Clean up a multi-line string to conform to Unix line endings. |
2305 | // Reads from src and appends to dst, so usually dst should be empty. |
2306 | // |
2307 | // If there is no line ending at the end of a non-empty string, it can |
2308 | // be added automatically. |
2309 | // |
2310 | // Four different types of input are correctly handled: |
2311 | // |
2312 | // - Unix/Linux files: line ending is LF: pass through unchanged |
2313 | // |
2314 | // - DOS/Windows files: line ending is CRLF: convert to LF |
2315 | // |
2316 | // - Legacy Mac files: line ending is CR: convert to LF |
2317 | // |
2318 | // - Garbled files: random line endings: convert gracefully |
2319 | // lonely CR, lonely LF, CRLF: convert to LF |
2320 | // |
2321 | // @param src The multi-line string to convert |
2322 | // @param dst The converted string is appended to this string |
2323 | // @param auto_end_last_line Automatically terminate the last line |
2324 | // |
2325 | // Limitations: |
2326 | // |
2327 | // This does not do the right thing for CRCRLF files created by |
2328 | // broken programs that do another Unix->DOS conversion on files |
2329 | // that are already in CRLF format. For this, a two-pass approach |
2330 | // brute-force would be needed that |
2331 | // |
2332 | // (1) determines the presence of LF (first one is ok) |
2333 | // (2) if yes, removes any CR, else convert every CR to LF |
2334 | |
2335 | void CleanStringLineEndings(const std::string &src, std::string *dst, |
2336 | bool auto_end_last_line) { |
2337 | if (dst->empty()) { |
2338 | dst->append(str: src); |
2339 | CleanStringLineEndings(str: dst, auto_end_last_line); |
2340 | } else { |
2341 | std::string tmp = src; |
2342 | CleanStringLineEndings(str: &tmp, auto_end_last_line); |
2343 | dst->append(str: tmp); |
2344 | } |
2345 | } |
2346 | |
2347 | void CleanStringLineEndings(std::string *str, bool auto_end_last_line) { |
2348 | ptrdiff_t output_pos = 0; |
2349 | bool r_seen = false; |
2350 | ptrdiff_t len = str->size(); |
2351 | |
2352 | char *p = &(*str)[0]; |
2353 | |
2354 | for (ptrdiff_t input_pos = 0; input_pos < len;) { |
2355 | if (!r_seen && input_pos + 8 < len) { |
2356 | uint64_t v = GOOGLE_UNALIGNED_LOAD64(p: p + input_pos); |
2357 | // Loop over groups of 8 bytes at a time until we come across |
2358 | // a word that has a byte whose value is less than or equal to |
2359 | // '\r' (i.e. could contain a \n (0x0a) or a \r (0x0d) ). |
2360 | // |
2361 | // We use a has_less macro that quickly tests a whole 64-bit |
2362 | // word to see if any of the bytes has a value < N. |
2363 | // |
2364 | // For more details, see: |
2365 | // http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord |
2366 | #define has_less(x, n) (((x) - ~0ULL / 255 * (n)) & ~(x) & ~0ULL / 255 * 128) |
2367 | if (!has_less(v, '\r' + 1)) { |
2368 | #undef has_less |
2369 | // No byte in this word has a value that could be a \r or a \n |
2370 | if (output_pos != input_pos) { |
2371 | GOOGLE_UNALIGNED_STORE64(p: p + output_pos, v); |
2372 | } |
2373 | input_pos += 8; |
2374 | output_pos += 8; |
2375 | continue; |
2376 | } |
2377 | } |
2378 | std::string::const_reference in = p[input_pos]; |
2379 | if (in == '\r') { |
2380 | if (r_seen) p[output_pos++] = '\n'; |
2381 | r_seen = true; |
2382 | } else if (in == '\n') { |
2383 | if (input_pos != output_pos) |
2384 | p[output_pos++] = '\n'; |
2385 | else |
2386 | output_pos++; |
2387 | r_seen = false; |
2388 | } else { |
2389 | if (r_seen) p[output_pos++] = '\n'; |
2390 | r_seen = false; |
2391 | if (input_pos != output_pos) |
2392 | p[output_pos++] = in; |
2393 | else |
2394 | output_pos++; |
2395 | } |
2396 | input_pos++; |
2397 | } |
2398 | if (r_seen || |
2399 | (auto_end_last_line && output_pos > 0 && p[output_pos - 1] != '\n')) { |
2400 | str->resize(n: output_pos + 1); |
2401 | str->operator[](pos: output_pos) = '\n'; |
2402 | } else if (output_pos < len) { |
2403 | str->resize(n: output_pos); |
2404 | } |
2405 | } |
2406 | |
2407 | namespace internal { |
2408 | |
2409 | // ---------------------------------------------------------------------- |
2410 | // NoLocaleStrtod() |
2411 | // This code will make you cry. |
2412 | // ---------------------------------------------------------------------- |
2413 | |
2414 | namespace { |
2415 | |
2416 | // Returns a string identical to *input except that the character pointed to |
2417 | // by radix_pos (which should be '.') is replaced with the locale-specific |
2418 | // radix character. |
2419 | std::string LocalizeRadix(const char *input, const char *radix_pos) { |
2420 | // Determine the locale-specific radix character by calling sprintf() to |
2421 | // print the number 1.5, then stripping off the digits. As far as I can |
2422 | // tell, this is the only portable, thread-safe way to get the C library |
2423 | // to divuldge the locale's radix character. No, localeconv() is NOT |
2424 | // thread-safe. |
2425 | char temp[16]; |
2426 | int size = snprintf(s: temp, maxlen: sizeof(temp), format: "%.1f" , 1.5); |
2427 | GOOGLE_CHECK_EQ(temp[0], '1'); |
2428 | GOOGLE_CHECK_EQ(temp[size - 1], '5'); |
2429 | GOOGLE_CHECK_LE(size, 6); |
2430 | |
2431 | // Now replace the '.' in the input with it. |
2432 | std::string result; |
2433 | result.reserve(res_arg: strlen(s: input) + size - 3); |
2434 | result.append(first: input, last: radix_pos); |
2435 | result.append(s: temp + 1, n: size - 2); |
2436 | result.append(s: radix_pos + 1); |
2437 | return result; |
2438 | } |
2439 | |
2440 | } // namespace |
2441 | |
2442 | double NoLocaleStrtod(const char *str, char **endptr) { |
2443 | // We cannot simply set the locale to "C" temporarily with setlocale() |
2444 | // as this is not thread-safe. Instead, we try to parse in the current |
2445 | // locale first. If parsing stops at a '.' character, then this is a |
2446 | // pretty good hint that we're actually in some other locale in which |
2447 | // '.' is not the radix character. |
2448 | |
2449 | char *temp_endptr; |
2450 | double result = strtod(nptr: str, endptr: &temp_endptr); |
2451 | if (endptr != NULL) *endptr = temp_endptr; |
2452 | if (*temp_endptr != '.') return result; |
2453 | |
2454 | // Parsing halted on a '.'. Perhaps we're in a different locale? Let's |
2455 | // try to replace the '.' with a locale-specific radix character and |
2456 | // try again. |
2457 | std::string localized = LocalizeRadix(input: str, radix_pos: temp_endptr); |
2458 | const char *localized_cstr = localized.c_str(); |
2459 | char *localized_endptr; |
2460 | result = strtod(nptr: localized_cstr, endptr: &localized_endptr); |
2461 | if ((localized_endptr - localized_cstr) > (temp_endptr - str)) { |
2462 | // This attempt got further, so replacing the decimal must have helped. |
2463 | // Update endptr to point at the right location. |
2464 | if (endptr != NULL) { |
2465 | // size_diff is non-zero if the localized radix has multiple bytes. |
2466 | int size_diff = localized.size() - strlen(s: str); |
2467 | // const_cast is necessary to match the strtod() interface. |
2468 | *endptr = const_cast<char *>( |
2469 | str + (localized_endptr - localized_cstr - size_diff)); |
2470 | } |
2471 | } |
2472 | |
2473 | return result; |
2474 | } |
2475 | |
2476 | } // namespace internal |
2477 | |
2478 | } // namespace protobuf |
2479 | } // namespace google |
2480 | |