1// Copyright 2017 The Abseil Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include "absl/strings/escaping.h"
16
17#include <algorithm>
18#include <cassert>
19#include <cstdint>
20#include <cstring>
21#include <iterator>
22#include <limits>
23#include <string>
24
25#include "absl/base/internal/endian.h"
26#include "absl/base/internal/raw_logging.h"
27#include "absl/base/internal/unaligned_access.h"
28#include "absl/strings/internal/char_map.h"
29#include "absl/strings/internal/resize_uninitialized.h"
30#include "absl/strings/internal/utf8.h"
31#include "absl/strings/str_cat.h"
32#include "absl/strings/str_join.h"
33#include "absl/strings/string_view.h"
34
35namespace absl {
36namespace {
37
// Digit-conversion tables.

// Lowercase hexadecimal digits indexed by nibble value (0-15). The octal
// escaper in this file also indexes it, using only entries 0-7.
constexpr char kHexChar[] = "0123456789abcdef";

// Two-character lowercase hex spelling of every byte value: the spelling of
// byte b is stored at kHexTable[2 * b] and kHexTable[2 * b + 1]. The array
// holds 512 digits plus the string literal's terminating NUL.
constexpr char kHexTable[513] =
    "000102030405060708090a0b0c0d0e0f"
    "101112131415161718191a1b1c1d1e1f"
    "202122232425262728292a2b2c2d2e2f"
    "303132333435363738393a3b3c3d3e3f"
    "404142434445464748494a4b4c4d4e4f"
    "505152535455565758595a5b5c5d5e5f"
    "606162636465666768696a6b6c6d6e6f"
    "707172737475767778797a7b7c7d7e7f"
    "808182838485868788898a8b8c8d8e8f"
    "909192939495969798999a9b9c9d9e9f"
    "a0a1a2a3a4a5a6a7a8a9aaabacadaeaf"
    "b0b1b2b3b4b5b6b7b8b9babbbcbdbebf"
    "c0c1c2c3c4c5c6c7c8c9cacbcccdcecf"
    "d0d1d2d3d4d5d6d7d8d9dadbdcdddedf"
    "e0e1e2e3e4e5e6e7e8e9eaebecedeeef"
    "f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff";

// Value passed as the leave_nulls_escaped argument of CUnescapeInternal().
constexpr bool kUnescapeNulls = false;
61
// Returns true iff `c` is one of the octal digits '0' through '7'.
inline bool is_octal_digit(char c) { return c >= '0' && c <= '7'; }
63
64inline int hex_digit_to_int(char c) {
65 static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61,
66 "Character set must be ASCII.");
67 assert(absl::ascii_isxdigit(c));
68 int x = static_cast<unsigned char>(c);
69 if (x > '9') {
70 x += 9;
71 }
72 return x & 0xf;
73}
74
75inline bool IsSurrogate(char32_t c, absl::string_view src, std::string* error) {
76 if (c >= 0xD800 && c <= 0xDFFF) {
77 if (error) {
78 *error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\",
79 src);
80 }
81 return true;
82 }
83 return false;
84}
85
86// ----------------------------------------------------------------------
87// CUnescapeInternal()
88// Implements both CUnescape() and CUnescapeForNullTerminatedString().
89//
90// Unescapes C escape sequences and is the reverse of CEscape().
91//
92// If 'source' is valid, stores the unescaped string and its size in
93// 'dest' and 'dest_len' respectively, and returns true. Otherwise
94// returns false and optionally stores the error description in
95// 'error'. Set 'error' to nullptr to disable error reporting.
96//
97// 'dest' should point to a buffer that is at least as big as 'source'.
98// 'source' and 'dest' may be the same.
99//
100// NOTE: any changes to this function must also be reflected in the older
101// UnescapeCEscapeSequences().
102// ----------------------------------------------------------------------
103bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
104 char* dest, ptrdiff_t* dest_len, std::string* error) {
105 char* d = dest;
106 const char* p = source.data();
107 const char* end = p + source.size();
108 const char* last_byte = end - 1;
109
110 // Small optimization for case where source = dest and there's no escaping
111 while (p == d && p < end && *p != '\\') p++, d++;
112
113 while (p < end) {
114 if (*p != '\\') {
115 *d++ = *p++;
116 } else {
117 if (++p > last_byte) { // skip past the '\\'
118 if (error) *error = "String cannot end with \\";
119 return false;
120 }
121 switch (*p) {
122 case 'a': *d++ = '\a'; break;
123 case 'b': *d++ = '\b'; break;
124 case 'f': *d++ = '\f'; break;
125 case 'n': *d++ = '\n'; break;
126 case 'r': *d++ = '\r'; break;
127 case 't': *d++ = '\t'; break;
128 case 'v': *d++ = '\v'; break;
129 case '\\': *d++ = '\\'; break;
130 case '?': *d++ = '\?'; break; // \? Who knew?
131 case '\'': *d++ = '\''; break;
132 case '"': *d++ = '\"'; break;
133 case '0':
134 case '1':
135 case '2':
136 case '3':
137 case '4':
138 case '5':
139 case '6':
140 case '7': {
141 // octal digit: 1 to 3 digits
142 const char* octal_start = p;
143 unsigned int ch = *p - '0';
144 if (p < last_byte && is_octal_digit(p[1])) ch = ch * 8 + *++p - '0';
145 if (p < last_byte && is_octal_digit(p[1]))
146 ch = ch * 8 + *++p - '0'; // now points at last digit
147 if (ch > 0xff) {
148 if (error) {
149 *error = "Value of \\" +
150 std::string(octal_start, p + 1 - octal_start) +
151 " exceeds 0xff";
152 }
153 return false;
154 }
155 if ((ch == 0) && leave_nulls_escaped) {
156 // Copy the escape sequence for the null character
157 const ptrdiff_t octal_size = p + 1 - octal_start;
158 *d++ = '\\';
159 memcpy(d, octal_start, octal_size);
160 d += octal_size;
161 break;
162 }
163 *d++ = ch;
164 break;
165 }
166 case 'x':
167 case 'X': {
168 if (p >= last_byte) {
169 if (error) *error = "String cannot end with \\x";
170 return false;
171 } else if (!absl::ascii_isxdigit(p[1])) {
172 if (error) *error = "\\x cannot be followed by a non-hex digit";
173 return false;
174 }
175 unsigned int ch = 0;
176 const char* hex_start = p;
177 while (p < last_byte && absl::ascii_isxdigit(p[1]))
178 // Arbitrarily many hex digits
179 ch = (ch << 4) + hex_digit_to_int(*++p);
180 if (ch > 0xFF) {
181 if (error) {
182 *error = "Value of \\" +
183 std::string(hex_start, p + 1 - hex_start) +
184 " exceeds 0xff";
185 }
186 return false;
187 }
188 if ((ch == 0) && leave_nulls_escaped) {
189 // Copy the escape sequence for the null character
190 const ptrdiff_t hex_size = p + 1 - hex_start;
191 *d++ = '\\';
192 memcpy(d, hex_start, hex_size);
193 d += hex_size;
194 break;
195 }
196 *d++ = ch;
197 break;
198 }
199 case 'u': {
200 // \uhhhh => convert 4 hex digits to UTF-8
201 char32_t rune = 0;
202 const char* hex_start = p;
203 if (p + 4 >= end) {
204 if (error) {
205 *error = "\\u must be followed by 4 hex digits: \\" +
206 std::string(hex_start, p + 1 - hex_start);
207 }
208 return false;
209 }
210 for (int i = 0; i < 4; ++i) {
211 // Look one char ahead.
212 if (absl::ascii_isxdigit(p[1])) {
213 rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
214 } else {
215 if (error) {
216 *error = "\\u must be followed by 4 hex digits: \\" +
217 std::string(hex_start, p + 1 - hex_start);
218 }
219 return false;
220 }
221 }
222 if ((rune == 0) && leave_nulls_escaped) {
223 // Copy the escape sequence for the null character
224 *d++ = '\\';
225 memcpy(d, hex_start, 5); // u0000
226 d += 5;
227 break;
228 }
229 if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
230 return false;
231 }
232 d += strings_internal::EncodeUTF8Char(d, rune);
233 break;
234 }
235 case 'U': {
236 // \Uhhhhhhhh => convert 8 hex digits to UTF-8
237 char32_t rune = 0;
238 const char* hex_start = p;
239 if (p + 8 >= end) {
240 if (error) {
241 *error = "\\U must be followed by 8 hex digits: \\" +
242 std::string(hex_start, p + 1 - hex_start);
243 }
244 return false;
245 }
246 for (int i = 0; i < 8; ++i) {
247 // Look one char ahead.
248 if (absl::ascii_isxdigit(p[1])) {
249 // Don't change rune until we're sure this
250 // is within the Unicode limit, but do advance p.
251 uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
252 if (newrune > 0x10FFFF) {
253 if (error) {
254 *error = "Value of \\" +
255 std::string(hex_start, p + 1 - hex_start) +
256 " exceeds Unicode limit (0x10FFFF)";
257 }
258 return false;
259 } else {
260 rune = newrune;
261 }
262 } else {
263 if (error) {
264 *error = "\\U must be followed by 8 hex digits: \\" +
265 std::string(hex_start, p + 1 - hex_start);
266 }
267 return false;
268 }
269 }
270 if ((rune == 0) && leave_nulls_escaped) {
271 // Copy the escape sequence for the null character
272 *d++ = '\\';
273 memcpy(d, hex_start, 9); // U00000000
274 d += 9;
275 break;
276 }
277 if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
278 return false;
279 }
280 d += strings_internal::EncodeUTF8Char(d, rune);
281 break;
282 }
283 default: {
284 if (error) *error = std::string("Unknown escape sequence: \\") + *p;
285 return false;
286 }
287 }
288 p++; // read past letter we escaped
289 }
290 }
291 *dest_len = d - dest;
292 return true;
293}
294
295// ----------------------------------------------------------------------
296// CUnescapeInternal()
297//
298// Same as above but uses a std::string for output. 'source' and 'dest'
299// may be the same.
300// ----------------------------------------------------------------------
301bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
302 std::string* dest, std::string* error) {
303 strings_internal::STLStringResizeUninitialized(dest, source.size());
304
305 ptrdiff_t dest_size;
306 if (!CUnescapeInternal(source,
307 leave_nulls_escaped,
308 &(*dest)[0],
309 &dest_size,
310 error)) {
311 return false;
312 }
313 dest->erase(dest_size);
314 return true;
315}
316
317// ----------------------------------------------------------------------
318// CEscape()
319// CHexEscape()
320// Utf8SafeCEscape()
321// Utf8SafeCHexEscape()
322// Escapes 'src' using C-style escape sequences. This is useful for
323// preparing query flags. The 'Hex' version uses hexadecimal rather than
324// octal sequences. The 'Utf8Safe' version does not touch UTF-8 bytes.
325//
326// Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
327// ----------------------------------------------------------------------
328std::string CEscapeInternal(absl::string_view src, bool use_hex,
329 bool utf8_safe) {
330 std::string dest;
331 bool last_hex_escape = false; // true if last output char was \xNN.
332
333 for (unsigned char c : src) {
334 bool is_hex_escape = false;
335 switch (c) {
336 case '\n': dest.append("\\" "n"); break;
337 case '\r': dest.append("\\" "r"); break;
338 case '\t': dest.append("\\" "t"); break;
339 case '\"': dest.append("\\" "\""); break;
340 case '\'': dest.append("\\" "'"); break;
341 case '\\': dest.append("\\" "\\"); break;
342 default:
343 // Note that if we emit \xNN and the src character after that is a hex
344 // digit then that digit must be escaped too to prevent it being
345 // interpreted as part of the character code by C.
346 if ((!utf8_safe || c < 0x80) &&
347 (!absl::ascii_isprint(c) ||
348 (last_hex_escape && absl::ascii_isxdigit(c)))) {
349 if (use_hex) {
350 dest.append("\\" "x");
351 dest.push_back(kHexChar[c / 16]);
352 dest.push_back(kHexChar[c % 16]);
353 is_hex_escape = true;
354 } else {
355 dest.append("\\");
356 dest.push_back(kHexChar[c / 64]);
357 dest.push_back(kHexChar[(c % 64) / 8]);
358 dest.push_back(kHexChar[c % 8]);
359 }
360 } else {
361 dest.push_back(c);
362 break;
363 }
364 }
365 last_hex_escape = is_hex_escape;
366 }
367
368 return dest;
369}
370
371/* clang-format off */
// Number of output characters the octal C-escaper produces for each byte
// value: 1 = copied as is, 2 = backslash + one character, 4 = backslash +
// three octal digits.
constexpr char c_escaped_len[256] = {
    4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // 0x00: \t, \n, \r
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0x10
    1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x20: ", '
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x30: '0'..'9'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x40: 'A'..'O'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 0x50: 'P'..'Z', '\'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x60: 'a'..'o'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 0x70: 'p'..'z', DEL
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0x80
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0x90
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0xA0
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0xB0
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0xC0
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0xD0
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0xE0
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0xF0
};
390/* clang-format on */
391
392// Calculates the length of the C-style escaped version of 'src'.
393// Assumes that non-printable characters are escaped using octal sequences, and
394// that UTF-8 bytes are not handled specially.
395inline size_t CEscapedLength(absl::string_view src) {
396 size_t escaped_len = 0;
397 for (unsigned char c : src) escaped_len += c_escaped_len[c];
398 return escaped_len;
399}
400
401void CEscapeAndAppendInternal(absl::string_view src, std::string* dest) {
402 size_t escaped_len = CEscapedLength(src);
403 if (escaped_len == src.size()) {
404 dest->append(src.data(), src.size());
405 return;
406 }
407
408 size_t cur_dest_len = dest->size();
409 strings_internal::STLStringResizeUninitialized(dest,
410 cur_dest_len + escaped_len);
411 char* append_ptr = &(*dest)[cur_dest_len];
412
413 for (unsigned char c : src) {
414 int char_len = c_escaped_len[c];
415 if (char_len == 1) {
416 *append_ptr++ = c;
417 } else if (char_len == 2) {
418 switch (c) {
419 case '\n':
420 *append_ptr++ = '\\';
421 *append_ptr++ = 'n';
422 break;
423 case '\r':
424 *append_ptr++ = '\\';
425 *append_ptr++ = 'r';
426 break;
427 case '\t':
428 *append_ptr++ = '\\';
429 *append_ptr++ = 't';
430 break;
431 case '\"':
432 *append_ptr++ = '\\';
433 *append_ptr++ = '\"';
434 break;
435 case '\'':
436 *append_ptr++ = '\\';
437 *append_ptr++ = '\'';
438 break;
439 case '\\':
440 *append_ptr++ = '\\';
441 *append_ptr++ = '\\';
442 break;
443 }
444 } else {
445 *append_ptr++ = '\\';
446 *append_ptr++ = '0' + c / 64;
447 *append_ptr++ = '0' + (c % 64) / 8;
448 *append_ptr++ = '0' + c % 8;
449 }
450 }
451}
452
453bool Base64UnescapeInternal(const char* src_param, size_t szsrc, char* dest,
454 size_t szdest, const signed char* unbase64,
455 size_t* len) {
456 static const char kPad64Equals = '=';
457 static const char kPad64Dot = '.';
458
459 size_t destidx = 0;
460 int decode = 0;
461 int state = 0;
462 unsigned int ch = 0;
463 unsigned int temp = 0;
464
465 // If "char" is signed by default, using *src as an array index results in
466 // accessing negative array elements. Treat the input as a pointer to
467 // unsigned char to avoid this.
468 const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);
469
470 // The GET_INPUT macro gets the next input character, skipping
471 // over any whitespace, and stopping when we reach the end of the
472 // std::string or when we read any non-data character. The arguments are
473 // an arbitrary identifier (used as a label for goto) and the number
474 // of data bytes that must remain in the input to avoid aborting the
475 // loop.
476#define GET_INPUT(label, remain) \
477 label: \
478 --szsrc; \
479 ch = *src++; \
480 decode = unbase64[ch]; \
481 if (decode < 0) { \
482 if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
483 state = 4 - remain; \
484 break; \
485 }
486
487 // if dest is null, we're just checking to see if it's legal input
488 // rather than producing output. (I suspect this could just be done
489 // with a regexp...). We duplicate the loop so this test can be
490 // outside it instead of in every iteration.
491
492 if (dest) {
493 // This loop consumes 4 input bytes and produces 3 output bytes
494 // per iteration. We can't know at the start that there is enough
495 // data left in the std::string for a full iteration, so the loop may
496 // break out in the middle; if so 'state' will be set to the
497 // number of input bytes read.
498
499 while (szsrc >= 4) {
500 // We'll start by optimistically assuming that the next four
501 // bytes of the std::string (src[0..3]) are four good data bytes
502 // (that is, no nulls, whitespace, padding chars, or illegal
503 // chars). We need to test src[0..2] for nulls individually
504 // before constructing temp to preserve the property that we
505 // never read past a null in the std::string (no matter how long
506 // szsrc claims the std::string is).
507
508 if (!src[0] || !src[1] || !src[2] ||
509 ((temp = ((unsigned(unbase64[src[0]]) << 18) |
510 (unsigned(unbase64[src[1]]) << 12) |
511 (unsigned(unbase64[src[2]]) << 6) |
512 (unsigned(unbase64[src[3]])))) &
513 0x80000000)) {
514 // Iff any of those four characters was bad (null, illegal,
515 // whitespace, padding), then temp's high bit will be set
516 // (because unbase64[] is -1 for all bad characters).
517 //
518 // We'll back up and resort to the slower decoder, which knows
519 // how to handle those cases.
520
521 GET_INPUT(first, 4);
522 temp = decode;
523 GET_INPUT(second, 3);
524 temp = (temp << 6) | decode;
525 GET_INPUT(third, 2);
526 temp = (temp << 6) | decode;
527 GET_INPUT(fourth, 1);
528 temp = (temp << 6) | decode;
529 } else {
530 // We really did have four good data bytes, so advance four
531 // characters in the std::string.
532
533 szsrc -= 4;
534 src += 4;
535 }
536
537 // temp has 24 bits of input, so write that out as three bytes.
538
539 if (destidx + 3 > szdest) return false;
540 dest[destidx + 2] = temp;
541 temp >>= 8;
542 dest[destidx + 1] = temp;
543 temp >>= 8;
544 dest[destidx] = temp;
545 destidx += 3;
546 }
547 } else {
548 while (szsrc >= 4) {
549 if (!src[0] || !src[1] || !src[2] ||
550 ((temp = ((unsigned(unbase64[src[0]]) << 18) |
551 (unsigned(unbase64[src[1]]) << 12) |
552 (unsigned(unbase64[src[2]]) << 6) |
553 (unsigned(unbase64[src[3]])))) &
554 0x80000000)) {
555 GET_INPUT(first_no_dest, 4);
556 GET_INPUT(second_no_dest, 3);
557 GET_INPUT(third_no_dest, 2);
558 GET_INPUT(fourth_no_dest, 1);
559 } else {
560 szsrc -= 4;
561 src += 4;
562 }
563 destidx += 3;
564 }
565 }
566
567#undef GET_INPUT
568
569 // if the loop terminated because we read a bad character, return
570 // now.
571 if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
572 !absl::ascii_isspace(ch))
573 return false;
574
575 if (ch == kPad64Equals || ch == kPad64Dot) {
576 // if we stopped by hitting an '=' or '.', un-read that character -- we'll
577 // look at it again when we count to check for the proper number of
578 // equals signs at the end.
579 ++szsrc;
580 --src;
581 } else {
582 // This loop consumes 1 input byte per iteration. It's used to
583 // clean up the 0-3 input bytes remaining when the first, faster
584 // loop finishes. 'temp' contains the data from 'state' input
585 // characters read by the first loop.
586 while (szsrc > 0) {
587 --szsrc;
588 ch = *src++;
589 decode = unbase64[ch];
590 if (decode < 0) {
591 if (absl::ascii_isspace(ch)) {
592 continue;
593 } else if (ch == kPad64Equals || ch == kPad64Dot) {
594 // back up one character; we'll read it again when we check
595 // for the correct number of pad characters at the end.
596 ++szsrc;
597 --src;
598 break;
599 } else {
600 return false;
601 }
602 }
603
604 // Each input character gives us six bits of output.
605 temp = (temp << 6) | decode;
606 ++state;
607 if (state == 4) {
608 // If we've accumulated 24 bits of output, write that out as
609 // three bytes.
610 if (dest) {
611 if (destidx + 3 > szdest) return false;
612 dest[destidx + 2] = temp;
613 temp >>= 8;
614 dest[destidx + 1] = temp;
615 temp >>= 8;
616 dest[destidx] = temp;
617 }
618 destidx += 3;
619 state = 0;
620 temp = 0;
621 }
622 }
623 }
624
625 // Process the leftover data contained in 'temp' at the end of the input.
626 int expected_equals = 0;
627 switch (state) {
628 case 0:
629 // Nothing left over; output is a multiple of 3 bytes.
630 break;
631
632 case 1:
633 // Bad input; we have 6 bits left over.
634 return false;
635
636 case 2:
637 // Produce one more output byte from the 12 input bits we have left.
638 if (dest) {
639 if (destidx + 1 > szdest) return false;
640 temp >>= 4;
641 dest[destidx] = temp;
642 }
643 ++destidx;
644 expected_equals = 2;
645 break;
646
647 case 3:
648 // Produce two more output bytes from the 18 input bits we have left.
649 if (dest) {
650 if (destidx + 2 > szdest) return false;
651 temp >>= 2;
652 dest[destidx + 1] = temp;
653 temp >>= 8;
654 dest[destidx] = temp;
655 }
656 destidx += 2;
657 expected_equals = 1;
658 break;
659
660 default:
661 // state should have no other values at this point.
662 ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
663 state);
664 }
665
666 // The remainder of the std::string should be all whitespace, mixed with
667 // exactly 0 equals signs, or exactly 'expected_equals' equals
668 // signs. (Always accepting 0 equals signs is an Abseil extension
669 // not covered in the RFC, as is accepting dot as the pad character.)
670
671 int equals = 0;
672 while (szsrc > 0) {
673 if (*src == kPad64Equals || *src == kPad64Dot)
674 ++equals;
675 else if (!absl::ascii_isspace(*src))
676 return false;
677 --szsrc;
678 ++src;
679 }
680
681 const bool ok = (equals == 0 || equals == expected_equals);
682 if (ok) *len = destidx;
683 return ok;
684}
685
686// The arrays below were generated by the following code
687// #include <sys/time.h>
688// #include <stdlib.h>
689// #include <string.h>
690// main()
691// {
692// static const char Base64[] =
693// "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
694// char* pos;
695// int idx, i, j;
696// printf(" ");
697// for (i = 0; i < 255; i += 8) {
698// for (j = i; j < i + 8; j++) {
699// pos = strchr(Base64, j);
700// if ((pos == nullptr) || (j == 0))
701// idx = -1;
702// else
703// idx = pos - Base64;
704// if (idx == -1)
705// printf(" %2d, ", idx);
706// else
707// printf(" %2d/*%c*/,", idx, j);
708// }
709// printf("\n ");
710// }
711// }
712//
713// where the value of "Base64[]" was replaced by one of the base-64 conversion
714// tables from the functions below.
715/* clang-format off */
// Reverse lookup table for the standard base64 alphabet
// ("A-Za-z0-9+/"): entry [c] is the 6-bit value of character c, or -1 if c
// is not part of the alphabet.
constexpr signed char kUnBase64[] = {
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       62/*+*/,  -1,       -1,       -1,       63/*/ */,
    52/*0*/,  53/*1*/,  54/*2*/,  55/*3*/,  56/*4*/,  57/*5*/,  58/*6*/,  59/*7*/,
    60/*8*/,  61/*9*/,  -1,       -1,       -1,       -1,       -1,       -1,
    -1,        0/*A*/,   1/*B*/,   2/*C*/,   3/*D*/,   4/*E*/,   5/*F*/,   6/*G*/,
     7/*H*/,   8/*I*/,   9/*J*/,  10/*K*/,  11/*L*/,  12/*M*/,  13/*N*/,  14/*O*/,
    15/*P*/,  16/*Q*/,  17/*R*/,  18/*S*/,  19/*T*/,  20/*U*/,  21/*V*/,  22/*W*/,
    23/*X*/,  24/*Y*/,  25/*Z*/,  -1,       -1,       -1,       -1,       -1,
    -1,       26/*a*/,  27/*b*/,  28/*c*/,  29/*d*/,  30/*e*/,  31/*f*/,  32/*g*/,
    33/*h*/,  34/*i*/,  35/*j*/,  36/*k*/,  37/*l*/,  38/*m*/,  39/*n*/,  40/*o*/,
    41/*p*/,  42/*q*/,  43/*r*/,  44/*s*/,  45/*t*/,  46/*u*/,  47/*v*/,  48/*w*/,
    49/*x*/,  50/*y*/,  51/*z*/,  -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1
};
// The table is indexed with arbitrary byte values, so a missing entry would
// turn into an out-of-bounds read. Catch that at compile time.
static_assert(sizeof(kUnBase64) == 256,
              "kUnBase64 must have one entry per byte value");
750
// Reverse lookup table for the web-safe base64 alphabet
// ("A-Za-z0-9-_"): entry [c] is the 6-bit value of character c, or -1 if c
// is not part of the alphabet.
constexpr signed char kUnWebSafeBase64[] = {
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       62/*-*/,  -1,       -1,
    52/*0*/,  53/*1*/,  54/*2*/,  55/*3*/,  56/*4*/,  57/*5*/,  58/*6*/,  59/*7*/,
    60/*8*/,  61/*9*/,  -1,       -1,       -1,       -1,       -1,       -1,
    -1,        0/*A*/,   1/*B*/,   2/*C*/,   3/*D*/,   4/*E*/,   5/*F*/,   6/*G*/,
     7/*H*/,   8/*I*/,   9/*J*/,  10/*K*/,  11/*L*/,  12/*M*/,  13/*N*/,  14/*O*/,
    15/*P*/,  16/*Q*/,  17/*R*/,  18/*S*/,  19/*T*/,  20/*U*/,  21/*V*/,  22/*W*/,
    23/*X*/,  24/*Y*/,  25/*Z*/,  -1,       -1,       -1,       -1,       63/*_*/,
    -1,       26/*a*/,  27/*b*/,  28/*c*/,  29/*d*/,  30/*e*/,  31/*f*/,  32/*g*/,
    33/*h*/,  34/*i*/,  35/*j*/,  36/*k*/,  37/*l*/,  38/*m*/,  39/*n*/,  40/*o*/,
    41/*p*/,  42/*q*/,  43/*r*/,  44/*s*/,  45/*t*/,  46/*u*/,  47/*v*/,  48/*w*/,
    49/*x*/,  50/*y*/,  51/*z*/,  -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1
};
// The table is indexed with arbitrary byte values, so a missing entry would
// turn into an out-of-bounds read. Catch that at compile time.
static_assert(sizeof(kUnWebSafeBase64) == 256,
              "kUnWebSafeBase64 must have one entry per byte value");
785/* clang-format on */
786
// Returns the number of characters needed to base64-encode 'input_len' bytes,
// with ('do_padding' true) or without the trailing "=" padding.
//
// Base64 encodes three bytes of input at a time into four output characters.
// Per https://tools.ietf.org/html/rfc3548, when the input is not a multiple
// of three bytes the final group is handled as follows:
//   (1) remainder 0: no extra output and no padding;
//   (2) remainder 1 (8 bits): two characters, plus two "=" when padding;
//   (3) remainder 2 (16 bits): three characters, plus one "=" when padding.
size_t CalculateBase64EscapedLenInternal(size_t input_len, bool do_padding) {
  // Complete three-byte groups.
  size_t len = (input_len / 3) * 4;

  // Trailing partial group, if any.
  switch (input_len % 3) {
    case 0:
      break;
    case 1:
      len += do_padding ? 4 : 2;
      break;
    default:  // input_len % 3 == 2
      len += do_padding ? 4 : 3;
      break;
  }

  assert(len >= input_len);  // make sure we didn't overflow
  return len;
}
832
833size_t Base64EscapeInternal(const unsigned char* src, size_t szsrc, char* dest,
834 size_t szdest, const char* base64,
835 bool do_padding) {
836 static const char kPad64 = '=';
837
838 if (szsrc * 4 > szdest * 3) return 0;
839
840 char* cur_dest = dest;
841 const unsigned char* cur_src = src;
842
843 char* const limit_dest = dest + szdest;
844 const unsigned char* const limit_src = src + szsrc;
845
846 // Three bytes of data encodes to four characters of cyphertext.
847 // So we can pump through three-byte chunks atomically.
848 if (szsrc >= 3) { // "limit_src - 3" is UB if szsrc < 3.
849 while (cur_src < limit_src - 3) { // While we have >= 32 bits.
850 uint32_t in = absl::big_endian::Load32(cur_src) >> 8;
851
852 cur_dest[0] = base64[in >> 18];
853 in &= 0x3FFFF;
854 cur_dest[1] = base64[in >> 12];
855 in &= 0xFFF;
856 cur_dest[2] = base64[in >> 6];
857 in &= 0x3F;
858 cur_dest[3] = base64[in];
859
860 cur_dest += 4;
861 cur_src += 3;
862 }
863 }
864 // To save time, we didn't update szdest or szsrc in the loop. So do it now.
865 szdest = limit_dest - cur_dest;
866 szsrc = limit_src - cur_src;
867
868 /* now deal with the tail (<=3 bytes) */
869 switch (szsrc) {
870 case 0:
871 // Nothing left; nothing more to do.
872 break;
873 case 1: {
874 // One byte left: this encodes to two characters, and (optionally)
875 // two pad characters to round out the four-character cypherblock.
876 if (szdest < 2) return 0;
877 uint32_t in = cur_src[0];
878 cur_dest[0] = base64[in >> 2];
879 in &= 0x3;
880 cur_dest[1] = base64[in << 4];
881 cur_dest += 2;
882 szdest -= 2;
883 if (do_padding) {
884 if (szdest < 2) return 0;
885 cur_dest[0] = kPad64;
886 cur_dest[1] = kPad64;
887 cur_dest += 2;
888 szdest -= 2;
889 }
890 break;
891 }
892 case 2: {
893 // Two bytes left: this encodes to three characters, and (optionally)
894 // one pad character to round out the four-character cypherblock.
895 if (szdest < 3) return 0;
896 uint32_t in = absl::big_endian::Load16(cur_src);
897 cur_dest[0] = base64[in >> 10];
898 in &= 0x3FF;
899 cur_dest[1] = base64[in >> 4];
900 in &= 0x00F;
901 cur_dest[2] = base64[in << 2];
902 cur_dest += 3;
903 szdest -= 3;
904 if (do_padding) {
905 if (szdest < 1) return 0;
906 cur_dest[0] = kPad64;
907 cur_dest += 1;
908 szdest -= 1;
909 }
910 break;
911 }
912 case 3: {
913 // Three bytes left: same as in the big loop above. We can't do this in
914 // the loop because the loop above always reads 4 bytes, and the fourth
915 // byte is past the end of the input.
916 if (szdest < 4) return 0;
917 uint32_t in = (cur_src[0] << 16) + absl::big_endian::Load16(cur_src + 1);
918 cur_dest[0] = base64[in >> 18];
919 in &= 0x3FFFF;
920 cur_dest[1] = base64[in >> 12];
921 in &= 0xFFF;
922 cur_dest[2] = base64[in >> 6];
923 in &= 0x3F;
924 cur_dest[3] = base64[in];
925 cur_dest += 4;
926 szdest -= 4;
927 break;
928 }
929 default:
930 // Should not be reached: blocks of 4 bytes are handled
931 // in the while loop before this switch statement.
932 ABSL_RAW_LOG(FATAL, "Logic problem? szsrc = %zu", szsrc);
933 break;
934 }
935 return (cur_dest - dest);
936}
937
// Standard base64 alphabet; the character at index i encodes the 6-bit
// value i.
constexpr char kBase64Chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";

// Web-safe base64 alphabet: identical except that '-' and '_' stand in for
// '+' and '/'.
constexpr char kWebSafeBase64Chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-_";
943
944template <typename String>
945void Base64EscapeInternal(const unsigned char* src, size_t szsrc, String* dest,
946 bool do_padding, const char* base64_chars) {
947 const size_t calc_escaped_size =
948 CalculateBase64EscapedLenInternal(szsrc, do_padding);
949 strings_internal::STLStringResizeUninitialized(dest, calc_escaped_size);
950
951 const size_t escaped_len = Base64EscapeInternal(
952 src, szsrc, &(*dest)[0], dest->size(), base64_chars, do_padding);
953 assert(calc_escaped_size == escaped_len);
954 dest->erase(escaped_len);
955}
956
957template <typename String>
958bool Base64UnescapeInternal(const char* src, size_t slen, String* dest,
959 const signed char* unbase64) {
960 // Determine the size of the output std::string. Base64 encodes every 3 bytes into
961 // 4 characters. any leftover chars are added directly for good measure.
962 // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
963 const size_t dest_len = 3 * (slen / 4) + (slen % 4);
964
965 strings_internal::STLStringResizeUninitialized(dest, dest_len);
966
967 // We are getting the destination buffer by getting the beginning of the
968 // std::string and converting it into a char *.
969 size_t len;
970 const bool ok =
971 Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
972 if (!ok) {
973 dest->clear();
974 return false;
975 }
976
977 // could be shorter if there was padding
978 assert(len <= dest_len);
979 dest->erase(len);
980
981 return true;
982}
983
984/* clang-format off */
// Numeric value of each byte when read as a hexadecimal digit. Every byte
// that is not a hex digit maps to 0, so lookups cannot distinguish '0' from
// an invalid character.
constexpr char kHexValue[256] = {
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20
    0,  1,  2,  3,  4,  5,  6, 7, 8, 9, 0, 0, 0, 0, 0, 0,  // 0x30: '0'..'9'
    0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40: 'A'..'F'
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50
    0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60: 'a'..'f'
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xA0
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xB0
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xC0
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xD0
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xE0
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // 0xF0
};
1003/* clang-format on */
1004
1005// This is a templated function so that T can be either a char*
1006// or a string. This works because we use the [] operator to access
1007// individual characters at a time.
1008template <typename T>
1009void HexStringToBytesInternal(const char* from, T to, ptrdiff_t num) {
1010 for (int i = 0; i < num; i++) {
1011 to[i] = (kHexValue[from[i * 2] & 0xFF] << 4) +
1012 (kHexValue[from[i * 2 + 1] & 0xFF]);
1013 }
1014}
1015
1016// This is a templated function so that T can be either a char* or a
1017// std::string.
1018template <typename T>
1019void BytesToHexStringInternal(const unsigned char* src, T dest, ptrdiff_t num) {
1020 auto dest_ptr = &dest[0];
1021 for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
1022 const char* hex_p = &kHexTable[*src_ptr * 2];
1023 std::copy(hex_p, hex_p + 2, dest_ptr);
1024 }
1025}
1026
1027} // namespace
1028
1029// ----------------------------------------------------------------------
1030// CUnescape()
1031//
1032// See CUnescapeInternal() for implementation details.
1033// ----------------------------------------------------------------------
1034bool CUnescape(absl::string_view source, std::string* dest,
1035 std::string* error) {
1036 return CUnescapeInternal(source, kUnescapeNulls, dest, error);
1037}
1038
1039std::string CEscape(absl::string_view src) {
1040 std::string dest;
1041 CEscapeAndAppendInternal(src, &dest);
1042 return dest;
1043}
1044
1045std::string CHexEscape(absl::string_view src) {
1046 return CEscapeInternal(src, true, false);
1047}
1048
1049std::string Utf8SafeCEscape(absl::string_view src) {
1050 return CEscapeInternal(src, false, true);
1051}
1052
1053std::string Utf8SafeCHexEscape(absl::string_view src) {
1054 return CEscapeInternal(src, true, true);
1055}
1056
1057// ----------------------------------------------------------------------
1058// Base64Unescape() - base64 decoder
1059// Base64Escape() - base64 encoder
1060// WebSafeBase64Unescape() - Google's variation of base64 decoder
1061// WebSafeBase64Escape() - Google's variation of base64 encoder
1062//
1063// Check out
1064// http://tools.ietf.org/html/rfc2045 for formal description, but what we
1065// care about is that...
1066// Take the encoded stuff in groups of 4 characters and turn each
1067// character into a code 0 to 63 thus:
1068// A-Z map to 0 to 25
1069// a-z map to 26 to 51
1070// 0-9 map to 52 to 61
1071// +(- for WebSafe) maps to 62
1072// /(_ for WebSafe) maps to 63
//   Each group therefore yields four values, all less than 64, so each one
//   fits in 6 bits (aaaaaa, bbbbbb, cccccc, dddddd respectively).
//   Pack those four 6-bit values into three bytes as follows:
//   aaaaaabb bbbbcccc ccdddddd
1077// Equals signs (one or two) are used at the end of the encoded block to
1078// indicate that the text was not an integer multiple of three bytes long.
1079// ----------------------------------------------------------------------
1080
1081bool Base64Unescape(absl::string_view src, std::string* dest) {
1082 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
1083}
1084
1085bool WebSafeBase64Unescape(absl::string_view src, std::string* dest) {
1086 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
1087}
1088
1089void Base64Escape(absl::string_view src, std::string* dest) {
1090 Base64EscapeInternal(reinterpret_cast<const unsigned char*>(src.data()),
1091 src.size(), dest, true, kBase64Chars);
1092}
1093
1094void WebSafeBase64Escape(absl::string_view src, std::string* dest) {
1095 Base64EscapeInternal(reinterpret_cast<const unsigned char*>(src.data()),
1096 src.size(), dest, false, kWebSafeBase64Chars);
1097}
1098
1099std::string Base64Escape(absl::string_view src) {
1100 std::string dest;
1101 Base64EscapeInternal(reinterpret_cast<const unsigned char*>(src.data()),
1102 src.size(), &dest, true, kBase64Chars);
1103 return dest;
1104}
1105
1106std::string WebSafeBase64Escape(absl::string_view src) {
1107 std::string dest;
1108 Base64EscapeInternal(reinterpret_cast<const unsigned char*>(src.data()),
1109 src.size(), &dest, false, kWebSafeBase64Chars);
1110 return dest;
1111}
1112
1113std::string HexStringToBytes(absl::string_view from) {
1114 std::string result;
1115 const auto num = from.size() / 2;
1116 strings_internal::STLStringResizeUninitialized(&result, num);
1117 absl::HexStringToBytesInternal<std::string&>(from.data(), result, num);
1118 return result;
1119}
1120
1121std::string BytesToHexString(absl::string_view from) {
1122 std::string result;
1123 strings_internal::STLStringResizeUninitialized(&result, 2 * from.size());
1124 absl::BytesToHexStringInternal<std::string&>(
1125 reinterpret_cast<const unsigned char*>(from.data()), result, from.size());
1126 return result;
1127}
1128
1129} // namespace absl
1130