1/*
2 * Copyright 2011-present Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <folly/Unicode.h>
18#include <folly/Conv.h>
19
20namespace folly {
21
22//////////////////////////////////////////////////////////////////////
23
// Encodes a single code point as a UTF-8 byte sequence of one to four
// bytes. Values above 0x10FFFF cannot be represented and produce an
// empty string.
std::string codePointToUtf8(char32_t cp) {
  // Layout follows the table at http://en.wikipedia.org/wiki/UTF-8:
  // a lead byte carrying the length marker plus the top payload bits,
  // followed by continuation bytes of the form 10xxxxxx.
  char bytes[4];
  std::string::size_type length = 0;

  // Builds the continuation byte holding the six payload bits that
  // start at bit position `shift`.
  const auto continuation = [cp](unsigned int shift) {
    return static_cast<char>(0x80 | (0x3f & (cp >> shift)));
  };

  if (cp <= 0x7f) {
    // Plain ASCII: the byte is the code point itself.
    bytes[0] = static_cast<char>(cp);
    length = 1;
  } else if (cp <= 0x7FF) {
    bytes[0] = static_cast<char>(0xC0 | (cp >> 6));
    bytes[1] = continuation(0);
    length = 2;
  } else if (cp <= 0xFFFF) {
    bytes[0] = static_cast<char>(0xE0 | (cp >> 12));
    bytes[1] = continuation(6);
    bytes[2] = continuation(0);
    length = 3;
  } else if (cp <= 0x10FFFF) {
    bytes[0] = static_cast<char>(0xF0 | (cp >> 18));
    bytes[1] = continuation(12);
    bytes[2] = continuation(6);
    bytes[3] = continuation(0);
    length = 4;
  }

  // length stays 0 for out-of-range input, yielding an empty string.
  return std::string(bytes, length);
}
51
52char32_t utf8ToCodePoint(
53 const unsigned char*& p,
54 const unsigned char* const e,
55 bool skipOnError) {
56 /* The following encodings are valid, except for the 5 and 6 byte
57 * combinations:
58 * 0xxxxxxx
59 * 110xxxxx 10xxxxxx
60 * 1110xxxx 10xxxxxx 10xxxxxx
61 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
62 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
63 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
64 */
65
66 const auto skip = [&] {
67 ++p;
68 return U'\ufffd';
69 };
70
71 if (p >= e) {
72 if (skipOnError) {
73 return skip();
74 }
75 throw std::runtime_error("folly::utf8ToCodePoint empty/invalid string");
76 }
77
78 unsigned char fst = *p;
79 if (!(fst & 0x80)) {
80 // trivial case
81 return *p++;
82 }
83
84 static const uint32_t bitMask[] = {
85 (1 << 7) - 1,
86 (1 << 11) - 1,
87 (1 << 16) - 1,
88 (1 << 21) - 1,
89 };
90
91 // upper control bits are masked out later
92 uint32_t d = fst;
93
94 if ((fst & 0xC0) != 0xC0) {
95 if (skipOnError) {
96 return skip();
97 }
98 throw std::runtime_error(
99 to<std::string>("folly::utf8ToCodePoint i=0 d=", d));
100 }
101
102 fst <<= 1;
103
104 for (unsigned int i = 1; i != 4 && p + i < e; ++i) {
105 const unsigned char tmp = p[i];
106
107 if ((tmp & 0xC0) != 0x80) {
108 if (skipOnError) {
109 return skip();
110 }
111 throw std::runtime_error(to<std::string>(
112 "folly::utf8ToCodePoint i=", i, " tmp=", (uint32_t)tmp));
113 }
114
115 d = (d << 6) | (tmp & 0x3F);
116 fst <<= 1;
117
118 if (!(fst & 0x80)) {
119 d &= bitMask[i];
120
121 // overlong, could have been encoded with i bytes
122 if ((d & ~bitMask[i - 1]) == 0) {
123 if (skipOnError) {
124 return skip();
125 }
126 throw std::runtime_error(
127 to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
128 }
129
130 // check for surrogates only needed for 3 bytes
131 if (i == 2) {
132 if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) {
133 if (skipOnError) {
134 return skip();
135 }
136 throw std::runtime_error(
137 to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
138 }
139 }
140
141 p += i + 1;
142 return d;
143 }
144 }
145
146 if (skipOnError) {
147 return skip();
148 }
149 throw std::runtime_error("folly::utf8ToCodePoint encoding length maxed out");
150}
151
152//////////////////////////////////////////////////////////////////////
153
154} // namespace folly
155