1 | /* Copyright (c) 2011, 2018 Ben Noordhuis <info@bnoordhuis.nl> |
2 | * |
3 | * Permission to use, copy, modify, and/or distribute this software for any |
4 | * purpose with or without fee is hereby granted, provided that the above |
5 | * copyright notice and this permission notice appear in all copies. |
6 | * |
7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
8 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
9 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
10 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
11 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
12 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
13 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
14 | */ |
15 | |
16 | /* Derived from https://github.com/bnoordhuis/punycode |
17 | * but updated to support IDNA 2008. |
18 | */ |
19 | |
20 | #include "uv.h" |
21 | #include "idna.h" |
22 | #include <string.h> |
23 | |
/* Decode the continuation bytes of a multi-byte UTF-8 sequence.
 *
 * |p|  in/out cursor; on entry it points at the byte following the lead
 *      byte, on return it has been advanced past every byte consumed.
 * |pe| one past the last readable input byte.
 * |a|  the lead byte (expected to be >= 0x80).
 *
 * Returns the decoded code point, or (unsigned) -1 when the lead byte is
 * invalid, the sequence is truncated, a continuation byte is malformed, or
 * the result is overlong, a surrogate, or greater than U+10FFFF.
 *
 * Nothing is read at or beyond |pe|: a truncated sequence is rejected
 * before any of its continuation bytes are consumed.
 */
static unsigned uv__utf8_decode1_slow(const char** p,
                                      const char* pe,
                                      unsigned a) {
  unsigned b;
  unsigned c;
  unsigned d;
  unsigned min;
  int need;

  if (a > 0xF7)
    return -1;  /* 0xF8..0xFF never start a sequence. */

  /* Number of continuation bytes implied by the lead byte. */
  if (a > 0xEF)
    need = 3;
  else if (a > 0xDF)
    need = 2;
  else if (a > 0xBF)
    need = 1;
  else
    return -1;  /* Continuation byte in lead position. */

  /* Signed comparison on purpose: it also rejects a cursor that is
   * already at or past |pe|.
   */
  if (pe - *p < need)
    return -1;  /* Truncated sequence. */

  /* Shorter sequences are padded with synthetic 0x80-tagged bytes so that
   * the validation and assembly below are shared by all three lengths.
   */
  if (need == 3) {
    min = 0x10000;
    a = a & 7;
    b = (unsigned char) *(*p)++;
    c = (unsigned char) *(*p)++;
    d = (unsigned char) *(*p)++;
  } else if (need == 2) {
    min = 0x800;
    b = 0x80 | (a & 15);
    c = (unsigned char) *(*p)++;
    d = (unsigned char) *(*p)++;
    a = 0;
  } else {
    min = 0x80;
    b = 0x80;
    c = 0x80 | (a & 31);
    d = (unsigned char) *(*p)++;
    a = 0;
  }

  /* Every continuation byte must look like 10xxxxxx. Each byte is tested
   * on its own; XOR-ing them together would let pairs of bad bytes cancel
   * each other out.
   */
  if ((b & 0xC0) != 0x80 || (c & 0xC0) != 0x80 || (d & 0xC0) != 0x80)
    return -1;  /* Invalid sequence. */

  b &= 63;
  c &= 63;
  d &= 63;
  a = (a << 18) | (b << 12) | (c << 6) | d;

  if (a < min)
    return -1;  /* Overlong sequence. */

  if (a > 0x10FFFF)
    return -1;  /* Four-byte sequence > U+10FFFF. */

  if (a >= 0xD800 && a <= 0xDFFF)
    return -1;  /* Surrogate code point. */

  return a;
}
87 | |
/* Decode one UTF-8 code point starting at |*p| and advance |*p| past it.
 *
 * The first byte is read unconditionally, so the caller must make sure
 * |*p| points at a readable byte. ASCII bytes are returned directly;
 * anything else is handed to uv__utf8_decode1_slow() together with |pe|.
 *
 * Returns the code point, or whatever uv__utf8_decode1_slow() reports for
 * a multi-byte sequence ((unsigned) -1 on a decoding error).
 */
unsigned uv__utf8_decode1(const char** p, const char* pe) {
  const char* cursor;
  unsigned lead;

  cursor = *p;
  lead = (unsigned char) *cursor;
  *p = cursor + 1;  /* The lead byte is always consumed. */

  if (lead >= 128)
    return uv__utf8_decode1_slow(p, pe, lead);

  return lead;  /* ASCII, common case. */
}
98 | |
99 | #define foreach_codepoint(c, p, pe) \ |
100 | for (; (void) (*p <= pe && (c = uv__utf8_decode1(p, pe))), *p <= pe;) |
101 | |
102 | static int uv__idna_toascii_label(const char* s, const char* se, |
103 | char** d, char* de) { |
104 | static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789" ; |
105 | const char* ss; |
106 | unsigned c; |
107 | unsigned h; |
108 | unsigned k; |
109 | unsigned n; |
110 | unsigned m; |
111 | unsigned q; |
112 | unsigned t; |
113 | unsigned x; |
114 | unsigned y; |
115 | unsigned bias; |
116 | unsigned delta; |
117 | unsigned todo; |
118 | int first; |
119 | |
120 | h = 0; |
121 | ss = s; |
122 | todo = 0; |
123 | |
124 | foreach_codepoint(c, &s, se) { |
125 | if (c < 128) |
126 | h++; |
127 | else if (c == (unsigned) -1) |
128 | return UV_EINVAL; |
129 | else |
130 | todo++; |
131 | } |
132 | |
133 | if (todo > 0) { |
134 | if (*d < de) *(*d)++ = 'x'; |
135 | if (*d < de) *(*d)++ = 'n'; |
136 | if (*d < de) *(*d)++ = '-'; |
137 | if (*d < de) *(*d)++ = '-'; |
138 | } |
139 | |
140 | x = 0; |
141 | s = ss; |
142 | foreach_codepoint(c, &s, se) { |
143 | if (c > 127) |
144 | continue; |
145 | |
146 | if (*d < de) |
147 | *(*d)++ = c; |
148 | |
149 | if (++x == h) |
150 | break; /* Visited all ASCII characters. */ |
151 | } |
152 | |
153 | if (todo == 0) |
154 | return h; |
155 | |
156 | /* Only write separator when we've written ASCII characters first. */ |
157 | if (h > 0) |
158 | if (*d < de) |
159 | *(*d)++ = '-'; |
160 | |
161 | n = 128; |
162 | bias = 72; |
163 | delta = 0; |
164 | first = 1; |
165 | |
166 | while (todo > 0) { |
167 | m = -1; |
168 | s = ss; |
169 | foreach_codepoint(c, &s, se) |
170 | if (c >= n) |
171 | if (c < m) |
172 | m = c; |
173 | |
174 | x = m - n; |
175 | y = h + 1; |
176 | |
177 | if (x > ~delta / y) |
178 | return UV_E2BIG; /* Overflow. */ |
179 | |
180 | delta += x * y; |
181 | n = m; |
182 | |
183 | s = ss; |
184 | foreach_codepoint(c, &s, se) { |
185 | if (c < n) |
186 | if (++delta == 0) |
187 | return UV_E2BIG; /* Overflow. */ |
188 | |
189 | if (c != n) |
190 | continue; |
191 | |
192 | for (k = 36, q = delta; /* empty */; k += 36) { |
193 | t = 1; |
194 | |
195 | if (k > bias) |
196 | t = k - bias; |
197 | |
198 | if (t > 26) |
199 | t = 26; |
200 | |
201 | if (q < t) |
202 | break; |
203 | |
204 | /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore |
205 | * 10 <= y <= 35, we can optimize the long division |
206 | * into a table-based reciprocal multiplication. |
207 | */ |
208 | x = q - t; |
209 | y = 36 - t; /* 10 <= y <= 35 since 1 <= t <= 26. */ |
210 | q = x / y; |
211 | t = t + x % y; /* 1 <= t <= 35 because of y. */ |
212 | |
213 | if (*d < de) |
214 | *(*d)++ = alphabet[t]; |
215 | } |
216 | |
217 | if (*d < de) |
218 | *(*d)++ = alphabet[q]; |
219 | |
220 | delta /= 2; |
221 | |
222 | if (first) { |
223 | delta /= 350; |
224 | first = 0; |
225 | } |
226 | |
227 | /* No overflow check is needed because |delta| was just |
228 | * divided by 2 and |delta+delta >= delta + delta/h|. |
229 | */ |
230 | h++; |
231 | delta += delta / h; |
232 | |
233 | for (bias = 0; delta > 35 * 26 / 2; bias += 36) |
234 | delta /= 35; |
235 | |
236 | bias += 36 * delta / (delta + 38); |
237 | delta = 0; |
238 | todo--; |
239 | } |
240 | |
241 | delta++; |
242 | n++; |
243 | } |
244 | |
245 | return 0; |
246 | } |
247 | |
248 | #undef foreach_codepoint |
249 | |
250 | long uv__idna_toascii(const char* s, const char* se, char* d, char* de) { |
251 | const char* si; |
252 | const char* st; |
253 | unsigned c; |
254 | char* ds; |
255 | int rc; |
256 | |
257 | ds = d; |
258 | |
259 | for (si = s; si < se; /* empty */) { |
260 | st = si; |
261 | c = uv__utf8_decode1(&si, se); |
262 | |
263 | if (c != '.') |
264 | if (c != 0x3002) /* 。 */ |
265 | if (c != 0xFF0E) /* . */ |
266 | if (c != 0xFF61) /* 。 */ |
267 | continue; |
268 | |
269 | rc = uv__idna_toascii_label(s, st, &d, de); |
270 | |
271 | if (rc < 0) |
272 | return rc; |
273 | |
274 | if (d < de) |
275 | *d++ = '.'; |
276 | |
277 | s = si; |
278 | } |
279 | |
280 | if (s < se) { |
281 | rc = uv__idna_toascii_label(s, se, &d, de); |
282 | |
283 | if (rc < 0) |
284 | return rc; |
285 | } |
286 | |
287 | if (d < de) |
288 | *d++ = '\0'; |
289 | |
290 | return d - ds; /* Number of bytes written. */ |
291 | } |
292 | |