1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * Copyright (C) 2010-2015, International Business Machines |
6 | * Corporation and others. All Rights Reserved. |
7 | ******************************************************************************* |
8 | * file name: uts46.cpp |
9 | * encoding: UTF-8 |
10 | * tab size: 8 (not used) |
11 | * indentation:4 |
12 | * |
13 | * created on: 2010mar09 |
14 | * created by: Markus W. Scherer |
15 | */ |
16 | |
17 | #include "unicode/utypes.h" |
18 | |
19 | #if !UCONFIG_NO_IDNA |
20 | |
21 | #include "unicode/idna.h" |
22 | #include "unicode/normalizer2.h" |
23 | #include "unicode/uscript.h" |
24 | #include "unicode/ustring.h" |
25 | #include "unicode/utf16.h" |
26 | #include "cmemory.h" |
27 | #include "cstring.h" |
28 | #include "punycode.h" |
29 | #include "ubidi_props.h" |
30 | #include "ustr_imp.h" |
31 | |
32 | // Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG: |
33 | // |
34 | // The domain name length limit is 255 octets in an internal DNS representation |
35 | // where the last ("root") label is the empty label |
36 | // represented by length byte 0 alone. |
37 | // In a conventional string, this translates to 253 characters, or 254 |
38 | // if there is a trailing dot for the root label. |
39 | |
40 | U_NAMESPACE_BEGIN |
41 | |
42 | // Severe errors which usually result in a U+FFFD replacement character in the result string. |
43 | const uint32_t severeErrors= |
44 | UIDNA_ERROR_LEADING_COMBINING_MARK| |
45 | UIDNA_ERROR_DISALLOWED| |
46 | UIDNA_ERROR_PUNYCODE| |
47 | UIDNA_ERROR_LABEL_HAS_DOT| |
48 | UIDNA_ERROR_INVALID_ACE_LABEL; |
49 | |
50 | static inline UBool |
51 | isASCIIString(const UnicodeString &dest) { |
52 | const char16_t *s=dest.getBuffer(); |
53 | const char16_t *limit=s+dest.length(); |
54 | while(s<limit) { |
55 | if(*s++>0x7f) { |
56 | return false; |
57 | } |
58 | } |
59 | return true; |
60 | } |
61 | |
// Forward declaration of the UTF-16 overload; the definition is elsewhere in this file.
// UTS46::process() calls it on the leading ASCII-only part of its result
// (the first labelStart code units of dest).
static UBool
isASCIIOkBiDi(const char16_t *s, int32_t length);

// Forward declaration of the char (UTF-8 bytes) overload; the definition is elsewhere in this file.
// UTS46::processUTF8() calls it on the leading ASCII-only part of its input
// (the first labelStart bytes of the source).
static UBool
isASCIIOkBiDi(const char *s, int32_t length);
67 | |
68 | // IDNA class default implementations -------------------------------------- *** |
69 | |
70 | IDNA::~IDNA() {} |
71 | |
72 | void |
73 | IDNA::labelToASCII_UTF8(StringPiece label, ByteSink &dest, |
74 | IDNAInfo &info, UErrorCode &errorCode) const { |
75 | if(U_SUCCESS(errorCode)) { |
76 | UnicodeString destString; |
77 | labelToASCII(UnicodeString::fromUTF8(label), destString, |
78 | info, errorCode).toUTF8(dest); |
79 | } |
80 | } |
81 | |
82 | void |
83 | IDNA::labelToUnicodeUTF8(StringPiece label, ByteSink &dest, |
84 | IDNAInfo &info, UErrorCode &errorCode) const { |
85 | if(U_SUCCESS(errorCode)) { |
86 | UnicodeString destString; |
87 | labelToUnicode(UnicodeString::fromUTF8(label), destString, |
88 | info, errorCode).toUTF8(dest); |
89 | } |
90 | } |
91 | |
92 | void |
93 | IDNA::nameToASCII_UTF8(StringPiece name, ByteSink &dest, |
94 | IDNAInfo &info, UErrorCode &errorCode) const { |
95 | if(U_SUCCESS(errorCode)) { |
96 | UnicodeString destString; |
97 | nameToASCII(UnicodeString::fromUTF8(name), destString, |
98 | info, errorCode).toUTF8(dest); |
99 | } |
100 | } |
101 | |
102 | void |
103 | IDNA::nameToUnicodeUTF8(StringPiece name, ByteSink &dest, |
104 | IDNAInfo &info, UErrorCode &errorCode) const { |
105 | if(U_SUCCESS(errorCode)) { |
106 | UnicodeString destString; |
107 | nameToUnicode(UnicodeString::fromUTF8(name), destString, |
108 | info, errorCode).toUTF8(dest); |
109 | } |
110 | } |
111 | |
112 | // UTS46 class declaration ------------------------------------------------- *** |
113 | |
114 | class UTS46 : public IDNA { |
115 | public: |
116 | UTS46(uint32_t options, UErrorCode &errorCode); |
117 | virtual ~UTS46(); |
118 | |
119 | virtual UnicodeString & |
120 | labelToASCII(const UnicodeString &label, UnicodeString &dest, |
121 | IDNAInfo &info, UErrorCode &errorCode) const override; |
122 | |
123 | virtual UnicodeString & |
124 | labelToUnicode(const UnicodeString &label, UnicodeString &dest, |
125 | IDNAInfo &info, UErrorCode &errorCode) const override; |
126 | |
127 | virtual UnicodeString & |
128 | nameToASCII(const UnicodeString &name, UnicodeString &dest, |
129 | IDNAInfo &info, UErrorCode &errorCode) const override; |
130 | |
131 | virtual UnicodeString & |
132 | nameToUnicode(const UnicodeString &name, UnicodeString &dest, |
133 | IDNAInfo &info, UErrorCode &errorCode) const override; |
134 | |
135 | virtual void |
136 | labelToASCII_UTF8(StringPiece label, ByteSink &dest, |
137 | IDNAInfo &info, UErrorCode &errorCode) const override; |
138 | |
139 | virtual void |
140 | labelToUnicodeUTF8(StringPiece label, ByteSink &dest, |
141 | IDNAInfo &info, UErrorCode &errorCode) const override; |
142 | |
143 | virtual void |
144 | nameToASCII_UTF8(StringPiece name, ByteSink &dest, |
145 | IDNAInfo &info, UErrorCode &errorCode) const override; |
146 | |
147 | virtual void |
148 | nameToUnicodeUTF8(StringPiece name, ByteSink &dest, |
149 | IDNAInfo &info, UErrorCode &errorCode) const override; |
150 | |
151 | private: |
152 | UnicodeString & |
153 | process(const UnicodeString &src, |
154 | UBool isLabel, UBool toASCII, |
155 | UnicodeString &dest, |
156 | IDNAInfo &info, UErrorCode &errorCode) const; |
157 | |
158 | void |
159 | processUTF8(StringPiece src, |
160 | UBool isLabel, UBool toASCII, |
161 | ByteSink &dest, |
162 | IDNAInfo &info, UErrorCode &errorCode) const; |
163 | |
164 | UnicodeString & |
165 | processUnicode(const UnicodeString &src, |
166 | int32_t labelStart, int32_t mappingStart, |
167 | UBool isLabel, UBool toASCII, |
168 | UnicodeString &dest, |
169 | IDNAInfo &info, UErrorCode &errorCode) const; |
170 | |
171 | // returns the new dest.length() |
172 | int32_t |
173 | mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart, |
174 | UErrorCode &errorCode) const; |
175 | |
176 | // returns the new label length |
177 | int32_t |
178 | processLabel(UnicodeString &dest, |
179 | int32_t labelStart, int32_t labelLength, |
180 | UBool toASCII, |
181 | IDNAInfo &info, UErrorCode &errorCode) const; |
182 | int32_t |
183 | markBadACELabel(UnicodeString &dest, |
184 | int32_t labelStart, int32_t labelLength, |
185 | UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const; |
186 | |
187 | void |
188 | checkLabelBiDi(const char16_t *label, int32_t labelLength, IDNAInfo &info) const; |
189 | |
190 | UBool |
191 | isLabelOkContextJ(const char16_t *label, int32_t labelLength) const; |
192 | |
193 | void |
194 | checkLabelContextO(const char16_t *label, int32_t labelLength, IDNAInfo &info) const; |
195 | |
196 | const Normalizer2 &uts46Norm2; // uts46.nrm |
197 | uint32_t options; |
198 | }; |
199 | |
200 | IDNA * |
201 | IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) { |
202 | if(U_SUCCESS(errorCode)) { |
203 | IDNA *idna=new UTS46(options, errorCode); |
204 | if(idna==nullptr) { |
205 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
206 | } else if(U_FAILURE(errorCode)) { |
207 | delete idna; |
208 | idna=nullptr; |
209 | } |
210 | return idna; |
211 | } else { |
212 | return nullptr; |
213 | } |
214 | } |
215 | |
216 | // UTS46 implementation ---------------------------------------------------- *** |
217 | |
218 | UTS46::UTS46(uint32_t opt, UErrorCode &errorCode) |
219 | : uts46Norm2(*Normalizer2::getInstance(nullptr, "uts46" , UNORM2_COMPOSE, errorCode)), |
220 | options(opt) {} |
221 | |
222 | UTS46::~UTS46() {} |
223 | |
224 | UnicodeString & |
225 | UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest, |
226 | IDNAInfo &info, UErrorCode &errorCode) const { |
227 | return process(label, true, true, dest, info, errorCode); |
228 | } |
229 | |
230 | UnicodeString & |
231 | UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest, |
232 | IDNAInfo &info, UErrorCode &errorCode) const { |
233 | return process(label, true, false, dest, info, errorCode); |
234 | } |
235 | |
236 | UnicodeString & |
237 | UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest, |
238 | IDNAInfo &info, UErrorCode &errorCode) const { |
239 | process(name, false, true, dest, info, errorCode); |
240 | if( dest.length()>=254 && (info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==0 && |
241 | isASCIIString(dest) && |
242 | (dest.length()>254 || dest[253]!=0x2e) |
243 | ) { |
244 | info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; |
245 | } |
246 | return dest; |
247 | } |
248 | |
249 | UnicodeString & |
250 | UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest, |
251 | IDNAInfo &info, UErrorCode &errorCode) const { |
252 | return process(name, false, false, dest, info, errorCode); |
253 | } |
254 | |
255 | void |
256 | UTS46::labelToASCII_UTF8(StringPiece label, ByteSink &dest, |
257 | IDNAInfo &info, UErrorCode &errorCode) const { |
258 | processUTF8(label, true, true, dest, info, errorCode); |
259 | } |
260 | |
261 | void |
262 | UTS46::labelToUnicodeUTF8(StringPiece label, ByteSink &dest, |
263 | IDNAInfo &info, UErrorCode &errorCode) const { |
264 | processUTF8(label, true, false, dest, info, errorCode); |
265 | } |
266 | |
267 | void |
268 | UTS46::nameToASCII_UTF8(StringPiece name, ByteSink &dest, |
269 | IDNAInfo &info, UErrorCode &errorCode) const { |
270 | processUTF8(name, false, true, dest, info, errorCode); |
271 | } |
272 | |
273 | void |
274 | UTS46::nameToUnicodeUTF8(StringPiece name, ByteSink &dest, |
275 | IDNAInfo &info, UErrorCode &errorCode) const { |
276 | processUTF8(name, false, false, dest, info, errorCode); |
277 | } |
278 | |
// UTS #46 status of each ASCII character, indexed by code point 0..0x7F.
// The normalizer (using uts46.nrm) lowercases uppercase ASCII letters and
// passes all other ASCII characters through unchanged.
// This table is what disallows non-LDH characters when UIDNA_USE_STD3_RULES
// is set, and it also drives the ASCII fastpath.
// Values: -1=disallowed  0==valid  1==mapped (lowercase)
static constexpr int8_t asciiData[128]={
    // 0000..000F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0010..001F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0020..002F -- 002D..002E; valid  # HYPHEN-MINUS..FULL STOP
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
    // 0030..003F -- 0030..0039; valid  # DIGIT ZERO..DIGIT NINE
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
    // 0040..004F -- 0041..005A; mapped  # LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
    -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    // 0050..005F
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
    // 0060..006F -- 0061..007A; valid  # LATIN SMALL LETTER A..LATIN SMALL LETTER Z
    -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    // 0070..007F
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
};
300 | |
// Common UTF-16 implementation behind labelToASCII(), labelToUnicode(),
// nameToASCII() and nameToUnicode().
//
// First runs an ASCII fastpath that copies (and lowercases) src into dest.
// If the whole input can be handled that way, the function returns from inside
// the loop. Otherwise the loop is left via break and processUnicode() handles
// the rest, starting at the label that contains the offending character.
//
// src       input; must not be the same object as dest, and src.getBuffer()
//           must not be nullptr (else U_ILLEGAL_ARGUMENT_ERROR)
// isLabel   true: src is one label, a dot is not treated as a label separator;
//           false: src is a domain name with dot-separated labels
// toASCII   true: also check the label and domain name length limits in the fastpath
// dest      output string; set to bogus on an argument error or if errorCode
//           is already a failure on entry
// info      reset once the arguments are accepted, then receives the error flags
// errorCode in/out error code
// Returns dest.
UnicodeString &
UTS46::process(const UnicodeString &src,
               UBool isLabel, UBool toASCII,
               UnicodeString &dest,
               IDNAInfo &info, UErrorCode &errorCode) const {
    // uts46Norm2.normalize() would do all of this error checking and setup,
    // but with the ASCII fastpath we do not always call it, and do not
    // call it first.
    if(U_FAILURE(errorCode)) {
        dest.setToBogus();
        return dest;
    }
    const char16_t *srcArray=src.getBuffer();
    if(&dest==&src || srcArray==nullptr) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        dest.setToBogus();
        return dest;
    }
    // Arguments are fine, reset output values.
    dest.remove();
    info.reset();
    int32_t srcLength=src.length();
    if(srcLength==0) {
        info.errors|=UIDNA_ERROR_EMPTY_LABEL;
        return dest;
    }
    // Open dest for direct writing; the fastpath writes at most srcLength code units.
    // Every exit below that follows this call goes through dest.releaseBuffer().
    char16_t *destArray=dest.getBuffer(srcLength);
    if(destArray==nullptr) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return dest;
    }
    // ASCII fastpath
    UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
    int32_t labelStart=0;  // index of the first code unit of the current label
    int32_t i;             // after a break: number of code units already copied to dest
    for(i=0;; ++i) {
        if(i==srcLength) {
            // The whole input was handled by the fastpath: finish here.
            if(toASCII) {
                if((i-labelStart)>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
                // There is a trailing dot if labelStart==i.
                if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
                    info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
                }
            }
            info.errors|=info.labelErrors;
            dest.releaseBuffer(i);
            return dest;
        }
        char16_t c=srcArray[i];
        if(c>0x7f) {
            // Non-ASCII: leave the fastpath.
            break;
        }
        int cData=asciiData[c];  // c<=0x7f, so the index is within the table
        if(cData>0) {
            destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
        } else if(cData<0 && disallowNonLDHDot) {
            break;  // Replacing with U+FFFD can be complicated for toASCII.
        } else {
            destArray[i]=c;
            if(c==0x2d) {  // hyphen
                if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
                    // "??--..." is Punycode or forbidden.
                    ++i;  // '-' was copied to dest already
                    break;
                }
                if(i==labelStart) {
                    // label starts with "-"
                    info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
                }
                if((i+1)==srcLength || srcArray[i+1]==0x2e) {
                    // label ends with "-"
                    info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
                }
            } else if(c==0x2e) {  // dot
                if(isLabel) {
                    // Replacing with U+FFFD can be complicated for toASCII.
                    ++i;  // '.' was copied to dest already
                    break;
                }
                if(i==labelStart) {
                    info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
                }
                if(toASCII && (i-labelStart)>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
                // End of a label: fold its errors into the overall errors
                // and start the next label after the dot.
                info.errors|=info.labelErrors;
                info.labelErrors=0;
                labelStart=i+1;
            }
        }
    }
    // Slow path: dest holds the first i processed code units;
    // processUnicode() maps src from index i and processes labels from labelStart.
    info.errors|=info.labelErrors;
    dest.releaseBuffer(i);
    processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode);
    // processUnicode() only sees the labels from labelStart on; if one of them is BiDi,
    // then the earlier ASCII-only labels (the first labelStart units of dest) are checked here.
    if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
        (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), labelStart)))
    ) {
        info.errors|=UIDNA_ERROR_BIDI;
    }
    return dest;
}
404 | |
// Common UTF-8 implementation behind labelToASCII_UTF8(), labelToUnicodeUTF8(),
// nameToASCII_UTF8() and nameToUnicodeUTF8().
//
// For input of at most 256 bytes, an ASCII fastpath copies (and lowercases) the
// bytes. If that handles the whole input, the result is appended to dest and the
// function returns from inside the loop. Otherwise the already finished labels
// are appended to dest as they are, and the rest of src, starting with the
// current label, is converted to UTF-16 and handled by processUnicode().
// Longer input is converted to UTF-16 and handled by processUnicode() as a whole.
//
// src       UTF-8 input; data()==nullptr with a non-zero length is an
//           U_ILLEGAL_ARGUMENT_ERROR
// isLabel   true: src is one label; false: src is a domain name with dot-separated labels
// toASCII   true: also check the label and domain name length limits
// dest      sink which receives the UTF-8 result
// info      reset once the arguments are accepted, then receives the error flags
// errorCode in/out error code; nothing is done if it is a failure on entry
void
UTS46::processUTF8(StringPiece src,
                   UBool isLabel, UBool toASCII,
                   ByteSink &dest,
                   IDNAInfo &info, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return;
    }
    const char *srcArray=src.data();
    int32_t srcLength=src.length();
    if(srcArray==nullptr && srcLength!=0) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    // Arguments are fine, reset output values.
    info.reset();
    if(srcLength==0) {
        info.errors|=UIDNA_ERROR_EMPTY_LABEL;
        dest.Flush();
        return;
    }
    UnicodeString destString;  // UTF-16 result of the non-fastpath processing
    int32_t labelStart=0;      // byte index of the first byte of the current label
    if(srcLength<=256) {  // length of stackArray[]
        // ASCII fastpath
        char stackArray[256];
        int32_t destCapacity;  // output of GetAppendBuffer(); not used otherwise
        // NOTE(review): destArray is presumably either sink-provided storage or
        // stackArray, with room for at least srcLength bytes -- confirm against ByteSink.
        char *destArray=dest.GetAppendBuffer(srcLength, srcLength+20,
                                             stackArray, UPRV_LENGTHOF(stackArray), &destCapacity);
        UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
        int32_t i;
        for(i=0;; ++i) {
            if(i==srcLength) {
                // The whole input was handled by the fastpath: finish here.
                if(toASCII) {
                    if((i-labelStart)>63) {
                        info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                    }
                    // There is a trailing dot if labelStart==i.
                    if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
                        info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
                    }
                }
                info.errors|=info.labelErrors;
                dest.Append(destArray, i);
                dest.Flush();
                return;
            }
            char c=srcArray[i];
            if((int8_t)c<0) {  // (uint8_t)c>0x7f
                // Non-ASCII byte: leave the fastpath.
                break;
            }
            int cData=asciiData[(int)c];  // Cast: gcc warns about indexing with a char.
            if(cData>0) {
                destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
            } else if(cData<0 && disallowNonLDHDot) {
                break;  // Replacing with U+FFFD can be complicated for toASCII.
            } else {
                destArray[i]=c;
                if(c==0x2d) {  // hyphen
                    if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
                        // "??--..." is Punycode or forbidden.
                        // i is not incremented: the '-' at i is mapped again by processUnicode().
                        break;
                    }
                    if(i==labelStart) {
                        // label starts with "-"
                        info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
                    }
                    if((i+1)==srcLength || srcArray[i+1]==0x2e) {
                        // label ends with "-"
                        info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
                    }
                } else if(c==0x2e) {  // dot
                    if(isLabel) {
                        break;  // Replacing with U+FFFD can be complicated for toASCII.
                    }
                    if(i==labelStart) {
                        info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
                    }
                    if(toASCII && (i-labelStart)>63) {
                        info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                    }
                    // End of a label: fold its errors into the overall errors
                    // and start the next label after the dot.
                    info.errors|=info.labelErrors;
                    info.labelErrors=0;
                    labelStart=i+1;
                }
            }
        }
        info.errors|=info.labelErrors;
        // Convert the processed ASCII prefix of the current label to UTF-16.
        int32_t mappingStart=i-labelStart;
        destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, mappingStart));
        // Output the previous ASCII labels and process the rest of src in UTF-16.
        dest.Append(destArray, labelStart);
        // The prefix is ASCII, so mappingStart is the same offset in UTF-8 bytes
        // and in UTF-16 code units.
        processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0, mappingStart,
                       isLabel, toASCII,
                       destString, info, errorCode);
    } else {
        // src is too long for the ASCII fastpath implementation.
        processUnicode(UnicodeString::fromUTF8(src), 0, 0,
                       isLabel, toASCII,
                       destString, info, errorCode);
    }
    destString.toUTF8(dest);  // calls dest.Flush()
    if(toASCII && !isLabel) {
        // length==labelStart==254 means that there is a trailing dot (ok) and
        // destString is empty (do not index at 253-labelStart).
        // length counts the labels already written to dest plus destString.
        int32_t length=labelStart+destString.length();
        if( length>=254 && isASCIIString(destString) &&
            (length>254 ||
                (labelStart<254 && destString[253-labelStart]!=0x2e))
        ) {
            info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
        }
    }
    // If a label handled by processUnicode() is BiDi, then the earlier ASCII-only
    // labels (the first labelStart bytes of the source) are checked here.
    if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
        (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart)))
    ) {
        info.errors|=UIDNA_ERROR_BIDI;
    }
}
525 | |
// Maps/normalizes src with the UTS #46 normalizer into dest and then walks dest,
// handing each label to processLabel().
//
// src          UTF-16 input
// labelStart   index in dest of the first label that still has to be processed
// mappingStart index in src from which on the text still has to be normalized;
//              0: dest is overwritten with the normalized src;
//              otherwise the normalized rest of src is appended to what dest holds already
// isLabel      true: a dot does not separate labels here, dest is processed as one label
// toASCII      passed on to processLabel(); also selects which
//              UIDNA_NONTRANSITIONAL_... option controls deviation character mapping
// dest         in/out result string
// info         receives the error flags and isTransDiff
// errorCode    in/out error code
// Returns dest.
UnicodeString &
UTS46::processUnicode(const UnicodeString &src,
                      int32_t labelStart, int32_t mappingStart,
                      UBool isLabel, UBool toASCII,
                      UnicodeString &dest,
                      IDNAInfo &info, UErrorCode &errorCode) const {
    if(mappingStart==0) {
        uts46Norm2.normalize(src, dest, errorCode);
    } else {
        uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode);
    }
    if(U_FAILURE(errorCode)) {
        return dest;
    }
    // Deviation characters are mapped unless the nontransitional option
    // for this direction is set.
    UBool doMapDevChars=
        toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0 :
                  (options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0;
    // destArray and destLength are cached; they are refreshed after each call
    // that modifies dest.
    const char16_t *destArray=dest.getBuffer();
    int32_t destLength=dest.length();
    int32_t labelLimit=labelStart;  // scan position; labelStart..labelLimit is the current label so far
    while(labelLimit<destLength) {
        char16_t c=destArray[labelLimit];
        if(c==0x2e && !isLabel) {
            // A dot ends the current label of a domain name.
            int32_t labelLength=labelLimit-labelStart;
            int32_t newLength=processLabel(dest, labelStart, labelLength,
                                            toASCII, info, errorCode);
            info.errors|=info.labelErrors;
            info.labelErrors=0;
            if(U_FAILURE(errorCode)) {
                return dest;
            }
            destArray=dest.getBuffer();
            // processLabel() may have changed the length of the label.
            destLength+=newLength-labelLength;
            // Continue after the processed label and its dot.
            labelLimit=labelStart+=newLength+1;
            continue;
        } else if(c<0xdf) {
            // pass
        } else if(c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
            // Deviation character: U+00DF, U+03C2, U+200C or U+200D.
            info.isTransDiff=true;
            if(doMapDevChars) {
                destLength=mapDevChars(dest, labelStart, labelLimit, errorCode);
                if(U_FAILURE(errorCode)) {
                    return dest;
                }
                destArray=dest.getBuffer();
                // All deviation characters have been mapped, no need to check for them again.
                doMapDevChars=false;
                // Do not increment labelLimit in case c was removed.
                continue;
            }
        } else if(U16_IS_SURROGATE(c)) {
            // Unpaired: a lead surrogate without a following trail surrogate,
            // or a trail surrogate without a preceding lead surrogate in this label.
            if(U16_IS_SURROGATE_LEAD(c) ?
                    (labelLimit+1)==destLength || !U16_IS_TRAIL(destArray[labelLimit+1]) :
                    labelLimit==labelStart || !U16_IS_LEAD(destArray[labelLimit-1])) {
                // Map an unpaired surrogate to U+FFFD before normalization so that when
                // that removes characters we do not turn two unpaired ones into a pair.
                info.labelErrors|=UIDNA_ERROR_DISALLOWED;
                dest.setCharAt(labelLimit, 0xfffd);
                destArray=dest.getBuffer();
            }
        }
        ++labelLimit;
    }
    // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
    // but not an empty label elsewhere nor a completely empty domain name.
    // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
    if(0==labelStart || labelStart<labelLimit) {
        processLabel(dest, labelStart, labelLimit-labelStart,
                      toASCII, info, errorCode);
        info.errors|=info.labelErrors;
    }
    return dest;
}
599 | |
// Maps the deviation characters in dest in place, from index mappingStart to the end:
// sharp s (U+00DF) becomes "ss", final sigma (U+03C2) becomes sigma (U+03C3),
// and ZWNJ/ZWJ (U+200C/U+200D) are removed.
// If anything was mapped, the text from labelStart to the end of dest is
// normalized again with the UTS #46 normalizer.
//
// dest         string to modify
// labelStart   start index of the current label; re-normalization starts here
// mappingStart index of the first deviation character; mapping starts here
// errorCode    in/out error code
// Returns the new dest.length(), or 0 if errorCode was a failure on entry.
int32_t
UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
                   UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    int32_t length=dest.length();
    // If the first character to map is a sharp s, then the string grows by at
    // least one unit at that point, so ask for the extra capacity right away.
    char16_t *s=dest.getBuffer(dest[mappingStart]==0xdf ? length+1 : length);
    if(s==nullptr) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return length;
    }
    int32_t capacity=dest.getCapacity();
    UBool didMapDevChars=false;
    // readIndex runs ahead of writeIndex once a character has been removed.
    // length is kept up to date with every insertion and removal,
    // so the loop ends when the whole output has been written.
    int32_t readIndex=mappingStart, writeIndex=mappingStart;
    do {
        char16_t c=s[readIndex++];
        switch(c) {
        case 0xdf:
            // Map sharp s to ss.
            didMapDevChars=true;
            s[writeIndex++]=0x73;  // Replace sharp s with first s.
            // Insert second s and account for possible buffer reallocation.
            if(writeIndex==readIndex) {
                // No free slot from an earlier removal: the unread tail
                // has to move up by one unit to make room.
                if(length==capacity) {
                    // The buffer is full: reopen it with room for one more unit.
                    dest.releaseBuffer(length);
                    s=dest.getBuffer(length+1);
                    if(s==nullptr) {
                        errorCode=U_MEMORY_ALLOCATION_ERROR;
                        return length;
                    }
                    capacity=dest.getCapacity();
                }
                u_memmove(s+writeIndex+1, s+writeIndex, length-writeIndex);
                ++readIndex;
            }
            s[writeIndex++]=0x73;
            ++length;
            break;
        case 0x3c2:  // Map final sigma to nonfinal sigma.
            didMapDevChars=true;
            s[writeIndex++]=0x3c3;
            break;
        case 0x200c:  // Ignore/remove ZWNJ.
        case 0x200d:  // Ignore/remove ZWJ.
            didMapDevChars=true;
            --length;
            break;
        default:
            // Only really necessary if writeIndex was different from readIndex.
            s[writeIndex++]=c;
            break;
        }
    } while(writeIndex<length);
    dest.releaseBuffer(length);
    if(didMapDevChars) {
        // Mapping deviation characters might have resulted in an un-NFC string.
        // We could use either the NFC or the UTS #46 normalizer.
        // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
        UnicodeString normalized;
        uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCode);
        if(U_SUCCESS(errorCode)) {
            // Replace everything from labelStart to the end of dest.
            dest.replace(labelStart, 0x7fffffff, normalized);
            if(dest.isBogus()) {
                errorCode=U_MEMORY_ALLOCATION_ERROR;
            }
            return dest.length();
        }
    }
    return length;
}
671 | |
672 | // Some non-ASCII characters are equivalent to sequences with |
673 | // non-LDH ASCII characters. To find them: |
674 | // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt) |
675 | static inline UBool |
676 | isNonASCIIDisallowedSTD3Valid(UChar32 c) { |
677 | return c==0x2260 || c==0x226E || c==0x226F; |
678 | } |
679 | |
680 | // Replace the label in dest with the label string, if the label was modified. |
681 | // If &label==&dest then the label was modified in-place and labelLength |
682 | // is the new label length, different from label.length(). |
683 | // If &label!=&dest then labelLength==label.length(). |
684 | // Returns labelLength (= the new label length). |
685 | static int32_t |
686 | replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLength, |
687 | const UnicodeString &label, int32_t labelLength, UErrorCode &errorCode) { |
688 | if(U_FAILURE(errorCode)) { |
689 | return 0; |
690 | } |
691 | if(&label!=&dest) { |
692 | dest.replace(destLabelStart, destLabelLength, label); |
693 | if(dest.isBogus()) { |
694 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
695 | return 0; |
696 | } |
697 | } |
698 | return labelLength; |
699 | } |
700 | |
701 | int32_t |
702 | UTS46::processLabel(UnicodeString &dest, |
703 | int32_t labelStart, int32_t labelLength, |
704 | UBool toASCII, |
705 | IDNAInfo &info, UErrorCode &errorCode) const { |
706 | if(U_FAILURE(errorCode)) { |
707 | return 0; |
708 | } |
709 | UnicodeString fromPunycode; |
710 | UnicodeString *labelString; |
711 | const char16_t *label=dest.getBuffer()+labelStart; |
712 | int32_t destLabelStart=labelStart; |
713 | int32_t destLabelLength=labelLength; |
714 | UBool wasPunycode; |
715 | if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) { |
716 | // Label starts with "xn--", try to un-Punycode it. |
717 | // In IDNA2008, labels like "xn--" (decodes to an empty string) and |
718 | // "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from |
719 | // comparing the ToUnicode input with the back-to-ToASCII output. |
720 | // They are alternate encodings of the respective ASCII labels. |
721 | // Ignore "xn---" here: It will fail Punycode.decode() which logically comes before |
722 | // the round-trip verification. |
723 | if(labelLength==4 || (labelLength>5 && label[labelLength-1]==u'-')) { |
724 | info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL; |
725 | return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode); |
726 | } |
727 | wasPunycode=true; |
728 | char16_t *unicodeBuffer=fromPunycode.getBuffer(-1); // capacity==-1: most labels should fit |
729 | if(unicodeBuffer==nullptr) { |
730 | // Should never occur if we used capacity==-1 which uses the internal buffer. |
731 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
732 | return labelLength; |
733 | } |
734 | UErrorCode punycodeErrorCode=U_ZERO_ERROR; |
735 | int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4, |
736 | unicodeBuffer, fromPunycode.getCapacity(), |
737 | nullptr, &punycodeErrorCode); |
738 | if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) { |
739 | fromPunycode.releaseBuffer(0); |
740 | unicodeBuffer=fromPunycode.getBuffer(unicodeLength); |
741 | if(unicodeBuffer==nullptr) { |
742 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
743 | return labelLength; |
744 | } |
745 | punycodeErrorCode=U_ZERO_ERROR; |
746 | unicodeLength=u_strFromPunycode(label+4, labelLength-4, |
747 | unicodeBuffer, fromPunycode.getCapacity(), |
748 | nullptr, &punycodeErrorCode); |
749 | } |
750 | fromPunycode.releaseBuffer(unicodeLength); |
751 | if(U_FAILURE(punycodeErrorCode)) { |
752 | info.labelErrors|=UIDNA_ERROR_PUNYCODE; |
753 | return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode); |
754 | } |
755 | // Check for NFC, and for characters that are not |
756 | // valid or deviation characters according to the normalizer. |
757 | // If there is something wrong, then the string will change. |
758 | // Note that the normalizer passes through non-LDH ASCII and deviation characters. |
759 | // Deviation characters are ok in Punycode even in transitional processing. |
760 | // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES |
761 | // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too. |
762 | UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode); |
763 | if(U_FAILURE(errorCode)) { |
764 | return labelLength; |
765 | } |
766 | if(!isValid) { |
767 | info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL; |
768 | return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode); |
769 | } |
770 | labelString=&fromPunycode; |
771 | label=fromPunycode.getBuffer(); |
772 | labelStart=0; |
773 | labelLength=fromPunycode.length(); |
774 | } else { |
775 | wasPunycode=false; |
776 | labelString=&dest; |
777 | } |
778 | // Validity check |
779 | if(labelLength==0) { |
780 | info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL; |
781 | return replaceLabel(dest, destLabelStart, destLabelLength, |
782 | *labelString, labelLength, errorCode); |
783 | } |
784 | // labelLength>0 |
785 | if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) { |
786 | // label starts with "??--" |
787 | info.labelErrors|=UIDNA_ERROR_HYPHEN_3_4; |
788 | } |
789 | if(label[0]==0x2d) { |
790 | // label starts with "-" |
791 | info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN; |
792 | } |
793 | if(label[labelLength-1]==0x2d) { |
794 | // label ends with "-" |
795 | info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN; |
796 | } |
797 | // If the label was not a Punycode label, then it was the result of |
798 | // mapping, normalization and label segmentation. |
799 | // If the label was in Punycode, then we mapped it again above |
800 | // and checked its validity. |
801 | // Now we handle the STD3 restriction to LDH characters (if set) |
802 | // and we look for U+FFFD which indicates disallowed characters |
803 | // in a non-Punycode label or U+FFFD itself in a Punycode label. |
804 | // We also check for dots which can come from the input to a single-label function. |
805 | // Ok to cast away const because we own the UnicodeString. |
806 | char16_t *s=(char16_t *)label; |
807 | const char16_t *limit=label+labelLength; |
808 | char16_t oredChars=0; |
809 | // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed. |
810 | UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; |
811 | do { |
812 | char16_t c=*s; |
813 | if(c<=0x7f) { |
814 | if(c==0x2e) { |
815 | info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT; |
816 | *s=0xfffd; |
817 | } else if(disallowNonLDHDot && asciiData[c]<0) { |
818 | info.labelErrors|=UIDNA_ERROR_DISALLOWED; |
819 | *s=0xfffd; |
820 | } |
821 | } else { |
822 | oredChars|=c; |
823 | if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) { |
824 | info.labelErrors|=UIDNA_ERROR_DISALLOWED; |
825 | *s=0xfffd; |
826 | } else if(c==0xfffd) { |
827 | info.labelErrors|=UIDNA_ERROR_DISALLOWED; |
828 | } |
829 | } |
830 | ++s; |
831 | } while(s<limit); |
832 | // Check for a leading combining mark after other validity checks |
833 | // so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here. |
834 | UChar32 c; |
835 | int32_t cpLength=0; |
836 | // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD. |
837 | U16_NEXT_UNSAFE(label, cpLength, c); |
838 | if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) { |
839 | info.labelErrors|=UIDNA_ERROR_LEADING_COMBINING_MARK; |
840 | labelString->replace(labelStart, cpLength, (char16_t)0xfffd); |
841 | label=labelString->getBuffer()+labelStart; |
842 | labelLength+=1-cpLength; |
843 | if(labelString==&dest) { |
844 | destLabelLength=labelLength; |
845 | } |
846 | } |
847 | if((info.labelErrors&severeErrors)==0) { |
848 | // Do contextual checks only if we do not have U+FFFD from a severe error |
849 | // because U+FFFD can make these checks fail. |
850 | if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi || info.isOkBiDi)) { |
851 | checkLabelBiDi(label, labelLength, info); |
852 | } |
853 | if( (options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c && |
854 | !isLabelOkContextJ(label, labelLength) |
855 | ) { |
856 | info.labelErrors|=UIDNA_ERROR_CONTEXTJ; |
857 | } |
858 | if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) { |
859 | checkLabelContextO(label, labelLength, info); |
860 | } |
861 | if(toASCII) { |
862 | if(wasPunycode) { |
863 | // Leave a Punycode label unchanged if it has no severe errors. |
864 | if(destLabelLength>63) { |
865 | info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
866 | } |
867 | return destLabelLength; |
868 | } else if(oredChars>=0x80) { |
869 | // Contains non-ASCII characters. |
870 | UnicodeString punycode; |
871 | char16_t *buffer=punycode.getBuffer(63); // 63==maximum DNS label length |
872 | if(buffer==nullptr) { |
873 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
874 | return destLabelLength; |
875 | } |
876 | buffer[0]=0x78; // Write "xn--". |
877 | buffer[1]=0x6e; |
878 | buffer[2]=0x2d; |
879 | buffer[3]=0x2d; |
880 | int32_t punycodeLength=u_strToPunycode(label, labelLength, |
881 | buffer+4, punycode.getCapacity()-4, |
882 | nullptr, &errorCode); |
883 | if(errorCode==U_BUFFER_OVERFLOW_ERROR) { |
884 | errorCode=U_ZERO_ERROR; |
885 | punycode.releaseBuffer(4); |
886 | buffer=punycode.getBuffer(4+punycodeLength); |
887 | if(buffer==nullptr) { |
888 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
889 | return destLabelLength; |
890 | } |
891 | punycodeLength=u_strToPunycode(label, labelLength, |
892 | buffer+4, punycode.getCapacity()-4, |
893 | nullptr, &errorCode); |
894 | } |
895 | punycodeLength+=4; |
896 | punycode.releaseBuffer(punycodeLength); |
897 | if(U_FAILURE(errorCode)) { |
898 | return destLabelLength; |
899 | } |
900 | if(punycodeLength>63) { |
901 | info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
902 | } |
903 | return replaceLabel(dest, destLabelStart, destLabelLength, |
904 | punycode, punycodeLength, errorCode); |
905 | } else { |
906 | // all-ASCII label |
907 | if(labelLength>63) { |
908 | info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
909 | } |
910 | } |
911 | } |
912 | } else { |
913 | // If a Punycode label has severe errors, |
914 | // then leave it but make sure it does not look valid. |
915 | if(wasPunycode) { |
916 | info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL; |
917 | return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info, errorCode); |
918 | } |
919 | } |
920 | return replaceLabel(dest, destLabelStart, destLabelLength, |
921 | *labelString, labelLength, errorCode); |
922 | } |
923 | |
924 | // Make sure an ACE label does not look valid. |
925 | // Append U+FFFD if the label has only LDH characters. |
926 | // If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD. |
927 | int32_t |
928 | UTS46::markBadACELabel(UnicodeString &dest, |
929 | int32_t labelStart, int32_t labelLength, |
930 | UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const { |
931 | if(U_FAILURE(errorCode)) { |
932 | return 0; |
933 | } |
934 | UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; |
935 | UBool isASCII=true; |
936 | UBool onlyLDH=true; |
937 | const char16_t *label=dest.getBuffer()+labelStart; |
938 | const char16_t *limit=label+labelLength; |
939 | // Start after the initial "xn--". |
940 | // Ok to cast away const because we own the UnicodeString. |
941 | for(char16_t *s=const_cast<char16_t *>(label+4); s<limit; ++s) { |
942 | char16_t c=*s; |
943 | if(c<=0x7f) { |
944 | if(c==0x2e) { |
945 | info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT; |
946 | *s=0xfffd; |
947 | isASCII=onlyLDH=false; |
948 | } else if(asciiData[c]<0) { |
949 | onlyLDH=false; |
950 | if(disallowNonLDHDot) { |
951 | *s=0xfffd; |
952 | isASCII=false; |
953 | } |
954 | } |
955 | } else { |
956 | isASCII=onlyLDH=false; |
957 | } |
958 | } |
959 | if(onlyLDH) { |
960 | dest.insert(labelStart+labelLength, (char16_t)0xfffd); |
961 | if(dest.isBogus()) { |
962 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
963 | return 0; |
964 | } |
965 | ++labelLength; |
966 | } else { |
967 | if(toASCII && isASCII && labelLength>63) { |
968 | info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
969 | } |
970 | } |
971 | return labelLength; |
972 | } |
973 | |
974 | const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT); |
975 | const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)|U_MASK(U_RIGHT_TO_LEFT_ARABIC); |
976 | const uint32_t L_R_AL_MASK=L_MASK|R_AL_MASK; |
977 | |
978 | const uint32_t R_AL_AN_MASK=R_AL_MASK|U_MASK(U_ARABIC_NUMBER); |
979 | |
980 | const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)|U_MASK(U_ARABIC_NUMBER); |
981 | const uint32_t R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK; |
982 | const uint32_t L_EN_MASK=L_MASK|U_MASK(U_EUROPEAN_NUMBER); |
983 | |
984 | const uint32_t ES_CS_ET_ON_BN_NSM_MASK= |
985 | U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)| |
986 | U_MASK(U_COMMON_NUMBER_SEPARATOR)| |
987 | U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)| |
988 | U_MASK(U_OTHER_NEUTRAL)| |
989 | U_MASK(U_BOUNDARY_NEUTRAL)| |
990 | U_MASK(U_DIR_NON_SPACING_MARK); |
991 | const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK; |
992 | const uint32_t R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK; |
993 | |
994 | // We scan the whole label and check both for whether it contains RTL characters |
995 | // and whether it passes the BiDi Rule. |
996 | // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find |
997 | // that a domain name is a BiDi domain name (has an RTL label) only after |
998 | // processing several earlier labels. |
999 | void |
1000 | UTS46::checkLabelBiDi(const char16_t *label, int32_t labelLength, IDNAInfo &info) const { |
1001 | // IDNA2008 BiDi rule |
1002 | // Get the directionality of the first character. |
1003 | UChar32 c; |
1004 | int32_t i=0; |
1005 | U16_NEXT_UNSAFE(label, i, c); |
1006 | uint32_t firstMask=U_MASK(u_charDirection(c)); |
1007 | // 1. The first character must be a character with BIDI property L, R |
1008 | // or AL. If it has the R or AL property, it is an RTL label; if it |
1009 | // has the L property, it is an LTR label. |
1010 | if((firstMask&~L_R_AL_MASK)!=0) { |
1011 | info.isOkBiDi=false; |
1012 | } |
1013 | // Get the directionality of the last non-NSM character. |
1014 | uint32_t lastMask; |
1015 | for(;;) { |
1016 | if(i>=labelLength) { |
1017 | lastMask=firstMask; |
1018 | break; |
1019 | } |
1020 | U16_PREV_UNSAFE(label, labelLength, c); |
1021 | UCharDirection dir=u_charDirection(c); |
1022 | if(dir!=U_DIR_NON_SPACING_MARK) { |
1023 | lastMask=U_MASK(dir); |
1024 | break; |
1025 | } |
1026 | } |
1027 | // 3. In an RTL label, the end of the label must be a character with |
1028 | // BIDI property R, AL, EN or AN, followed by zero or more |
1029 | // characters with BIDI property NSM. |
1030 | // 6. In an LTR label, the end of the label must be a character with |
1031 | // BIDI property L or EN, followed by zero or more characters with |
1032 | // BIDI property NSM. |
1033 | if( (firstMask&L_MASK)!=0 ? |
1034 | (lastMask&~L_EN_MASK)!=0 : |
1035 | (lastMask&~R_AL_EN_AN_MASK)!=0 |
1036 | ) { |
1037 | info.isOkBiDi=false; |
1038 | } |
1039 | // Add the directionalities of the intervening characters. |
1040 | uint32_t mask=firstMask|lastMask; |
1041 | while(i<labelLength) { |
1042 | U16_NEXT_UNSAFE(label, i, c); |
1043 | mask|=U_MASK(u_charDirection(c)); |
1044 | } |
1045 | if(firstMask&L_MASK) { |
1046 | // 5. In an LTR label, only characters with the BIDI properties L, EN, |
1047 | // ES, CS, ET, ON, BN and NSM are allowed. |
1048 | if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { |
1049 | info.isOkBiDi=false; |
1050 | } |
1051 | } else { |
1052 | // 2. In an RTL label, only characters with the BIDI properties R, AL, |
1053 | // AN, EN, ES, CS, ET, ON, BN and NSM are allowed. |
1054 | if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { |
1055 | info.isOkBiDi=false; |
1056 | } |
1057 | // 4. In an RTL label, if an EN is present, no AN may be present, and |
1058 | // vice versa. |
1059 | if((mask&EN_AN_MASK)==EN_AN_MASK) { |
1060 | info.isOkBiDi=false; |
1061 | } |
1062 | } |
1063 | // An RTL label is a label that contains at least one character of type |
1064 | // R, AL or AN. [...] |
1065 | // A "BIDI domain name" is a domain name that contains at least one RTL |
1066 | // label. [...] |
1067 | // The following rule, consisting of six conditions, applies to labels |
1068 | // in BIDI domain names. |
1069 | if((mask&R_AL_AN_MASK)!=0) { |
1070 | info.isBiDi=true; |
1071 | } |
1072 | } |
1073 | |
1074 | // Special code for the ASCII prefix of a BiDi domain name. |
1075 | // The ASCII prefix is all-LTR. |
1076 | |
1077 | // IDNA2008 BiDi rule, parts relevant to ASCII labels: |
1078 | // 1. The first character must be a character with BIDI property L [...] |
1079 | // 5. In an LTR label, only characters with the BIDI properties L, EN, |
1080 | // ES, CS, ET, ON, BN and NSM are allowed. |
1081 | // 6. In an LTR label, the end of the label must be a character with |
1082 | // BIDI property L or EN [...] |
1083 | |
1084 | // UTF-16 version, called for mapped ASCII prefix. |
1085 | // Cannot contain uppercase A-Z. |
1086 | // s[length-1] must be the trailing dot. |
1087 | static UBool |
1088 | isASCIIOkBiDi(const char16_t *s, int32_t length) { |
1089 | int32_t labelStart=0; |
1090 | for(int32_t i=0; i<length; ++i) { |
1091 | char16_t c=s[i]; |
1092 | if(c==0x2e) { // dot |
1093 | if(i>labelStart) { |
1094 | c=s[i-1]; |
1095 | if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) { |
1096 | // Last character in the label is not an L or EN. |
1097 | return false; |
1098 | } |
1099 | } |
1100 | labelStart=i+1; |
1101 | } else if(i==labelStart) { |
1102 | if(!(0x61<=c && c<=0x7a)) { |
1103 | // First character in the label is not an L. |
1104 | return false; |
1105 | } |
1106 | } else { |
1107 | if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { |
1108 | // Intermediate character in the label is a B, S or WS. |
1109 | return false; |
1110 | } |
1111 | } |
1112 | } |
1113 | return true; |
1114 | } |
1115 | |
1116 | // UTF-8 version, called for source ASCII prefix. |
1117 | // Can contain uppercase A-Z. |
1118 | // s[length-1] must be the trailing dot. |
1119 | static UBool |
1120 | isASCIIOkBiDi(const char *s, int32_t length) { |
1121 | int32_t labelStart=0; |
1122 | for(int32_t i=0; i<length; ++i) { |
1123 | char c=s[i]; |
1124 | if(c==0x2e) { // dot |
1125 | if(i>labelStart) { |
1126 | c=s[i-1]; |
1127 | if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c && c<=0x39)) { |
1128 | // Last character in the label is not an L or EN. |
1129 | return false; |
1130 | } |
1131 | } |
1132 | labelStart=i+1; |
1133 | } else if(i==labelStart) { |
1134 | if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) { |
1135 | // First character in the label is not an L. |
1136 | return false; |
1137 | } |
1138 | } else { |
1139 | if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { |
1140 | // Intermediate character in the label is a B, S or WS. |
1141 | return false; |
1142 | } |
1143 | } |
1144 | } |
1145 | return true; |
1146 | } |
1147 | |
1148 | UBool |
1149 | UTS46::isLabelOkContextJ(const char16_t *label, int32_t labelLength) const { |
1150 | // [IDNA2008-Tables] |
1151 | // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER |
1152 | for(int32_t i=0; i<labelLength; ++i) { |
1153 | if(label[i]==0x200c) { |
1154 | // Appendix A.1. ZERO WIDTH NON-JOINER |
1155 | // Rule Set: |
1156 | // False; |
1157 | // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; |
1158 | // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C |
1159 | // (Joining_Type:T)*(Joining_Type:{R,D})) Then True; |
1160 | if(i==0) { |
1161 | return false; |
1162 | } |
1163 | UChar32 c; |
1164 | int32_t j=i; |
1165 | U16_PREV_UNSAFE(label, j, c); |
1166 | if(uts46Norm2.getCombiningClass(c)==9) { |
1167 | continue; |
1168 | } |
1169 | // check precontext (Joining_Type:{L,D})(Joining_Type:T)* |
1170 | for(;;) { |
1171 | UJoiningType type=ubidi_getJoiningType(c); |
1172 | if(type==U_JT_TRANSPARENT) { |
1173 | if(j==0) { |
1174 | return false; |
1175 | } |
1176 | U16_PREV_UNSAFE(label, j, c); |
1177 | } else if(type==U_JT_LEFT_JOINING || type==U_JT_DUAL_JOINING) { |
1178 | break; // precontext fulfilled |
1179 | } else { |
1180 | return false; |
1181 | } |
1182 | } |
1183 | // check postcontext (Joining_Type:T)*(Joining_Type:{R,D}) |
1184 | for(j=i+1;;) { |
1185 | if(j==labelLength) { |
1186 | return false; |
1187 | } |
1188 | U16_NEXT_UNSAFE(label, j, c); |
1189 | UJoiningType type=ubidi_getJoiningType(c); |
1190 | if(type==U_JT_TRANSPARENT) { |
1191 | // just skip this character |
1192 | } else if(type==U_JT_RIGHT_JOINING || type==U_JT_DUAL_JOINING) { |
1193 | break; // postcontext fulfilled |
1194 | } else { |
1195 | return false; |
1196 | } |
1197 | } |
1198 | } else if(label[i]==0x200d) { |
1199 | // Appendix A.2. ZERO WIDTH JOINER (U+200D) |
1200 | // Rule Set: |
1201 | // False; |
1202 | // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; |
1203 | if(i==0) { |
1204 | return false; |
1205 | } |
1206 | UChar32 c; |
1207 | int32_t j=i; |
1208 | U16_PREV_UNSAFE(label, j, c); |
1209 | if(uts46Norm2.getCombiningClass(c)!=9) { |
1210 | return false; |
1211 | } |
1212 | } |
1213 | } |
1214 | return true; |
1215 | } |
1216 | |
1217 | void |
1218 | UTS46::checkLabelContextO(const char16_t *label, int32_t labelLength, IDNAInfo &info) const { |
1219 | int32_t labelEnd=labelLength-1; // inclusive |
1220 | int32_t arabicDigits=0; // -1 for 066x, +1 for 06Fx |
1221 | for(int32_t i=0; i<=labelEnd; ++i) { |
1222 | UChar32 c=label[i]; |
1223 | if(c<0xb7) { |
1224 | // ASCII fastpath |
1225 | } else if(c<=0x6f9) { |
1226 | if(c==0xb7) { |
1227 | // Appendix A.3. MIDDLE DOT (U+00B7) |
1228 | // Rule Set: |
1229 | // False; |
1230 | // If Before(cp) .eq. U+006C And |
1231 | // After(cp) .eq. U+006C Then True; |
1232 | if(!(0<i && label[i-1]==0x6c && |
1233 | i<labelEnd && label[i+1]==0x6c)) { |
1234 | info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; |
1235 | } |
1236 | } else if(c==0x375) { |
1237 | // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375) |
1238 | // Rule Set: |
1239 | // False; |
1240 | // If Script(After(cp)) .eq. Greek Then True; |
1241 | UScriptCode script=USCRIPT_INVALID_CODE; |
1242 | if(i<labelEnd) { |
1243 | UErrorCode errorCode=U_ZERO_ERROR; |
1244 | int32_t j=i+1; |
1245 | U16_NEXT(label, j, labelLength, c); |
1246 | script=uscript_getScript(c, &errorCode); |
1247 | } |
1248 | if(script!=USCRIPT_GREEK) { |
1249 | info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; |
1250 | } |
1251 | } else if(c==0x5f3 || c==0x5f4) { |
1252 | // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3) |
1253 | // Rule Set: |
1254 | // False; |
1255 | // If Script(Before(cp)) .eq. Hebrew Then True; |
1256 | // |
1257 | // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4) |
1258 | // Rule Set: |
1259 | // False; |
1260 | // If Script(Before(cp)) .eq. Hebrew Then True; |
1261 | UScriptCode script=USCRIPT_INVALID_CODE; |
1262 | if(0<i) { |
1263 | UErrorCode errorCode=U_ZERO_ERROR; |
1264 | int32_t j=i; |
1265 | U16_PREV(label, 0, j, c); |
1266 | script=uscript_getScript(c, &errorCode); |
1267 | } |
1268 | if(script!=USCRIPT_HEBREW) { |
1269 | info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; |
1270 | } |
1271 | } else if(0x660<=c /* && c<=0x6f9 */) { |
1272 | // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669) |
1273 | // Rule Set: |
1274 | // True; |
1275 | // For All Characters: |
1276 | // If cp .in. 06F0..06F9 Then False; |
1277 | // End For; |
1278 | // |
1279 | // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9) |
1280 | // Rule Set: |
1281 | // True; |
1282 | // For All Characters: |
1283 | // If cp .in. 0660..0669 Then False; |
1284 | // End For; |
1285 | if(c<=0x669) { |
1286 | if(arabicDigits>0) { |
1287 | info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS; |
1288 | } |
1289 | arabicDigits=-1; |
1290 | } else if(0x6f0<=c) { |
1291 | if(arabicDigits<0) { |
1292 | info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS; |
1293 | } |
1294 | arabicDigits=1; |
1295 | } |
1296 | } |
1297 | } else if(c==0x30fb) { |
1298 | // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB) |
1299 | // Rule Set: |
1300 | // False; |
1301 | // For All Characters: |
1302 | // If Script(cp) .in. {Hiragana, Katakana, Han} Then True; |
1303 | // End For; |
1304 | UErrorCode errorCode=U_ZERO_ERROR; |
1305 | for(int j=0;;) { |
1306 | if(j>labelEnd) { |
1307 | info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; |
1308 | break; |
1309 | } |
1310 | U16_NEXT(label, j, labelLength, c); |
1311 | UScriptCode script=uscript_getScript(c, &errorCode); |
1312 | if(script==USCRIPT_HIRAGANA || script==USCRIPT_KATAKANA || script==USCRIPT_HAN) { |
1313 | break; |
1314 | } |
1315 | } |
1316 | } |
1317 | } |
1318 | } |
1319 | |
1320 | U_NAMESPACE_END |
1321 | |
1322 | // C API ------------------------------------------------------------------- *** |
1323 | |
1324 | U_NAMESPACE_USE |
1325 | |
1326 | U_CAPI UIDNA * U_EXPORT2 |
1327 | uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) { |
1328 | return reinterpret_cast<UIDNA *>(IDNA::createUTS46Instance(options, *pErrorCode)); |
1329 | } |
1330 | |
1331 | U_CAPI void U_EXPORT2 |
1332 | uidna_close(UIDNA *idna) { |
1333 | delete reinterpret_cast<IDNA *>(idna); |
1334 | } |
1335 | |
1336 | static UBool |
1337 | checkArgs(const void *label, int32_t length, |
1338 | void *dest, int32_t capacity, |
1339 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
1340 | if(U_FAILURE(*pErrorCode)) { |
1341 | return false; |
1342 | } |
1343 | // sizeof(UIDNAInfo)=16 in the first API version. |
1344 | if(pInfo==nullptr || pInfo->size<16) { |
1345 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
1346 | return false; |
1347 | } |
1348 | if( (label==nullptr ? length!=0 : length<-1) || |
1349 | (dest==nullptr ? capacity!=0 : capacity<0) || |
1350 | (dest==label && label!=nullptr) |
1351 | ) { |
1352 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
1353 | return false; |
1354 | } |
1355 | // Set all *pInfo bytes to 0 except for the size field itself. |
1356 | uprv_memset(&pInfo->size+1, 0, pInfo->size-sizeof(pInfo->size)); |
1357 | return true; |
1358 | } |
1359 | |
1360 | static void |
1361 | idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) { |
1362 | pInfo->isTransitionalDifferent=info.isTransitionalDifferent(); |
1363 | pInfo->errors=info.getErrors(); |
1364 | } |
1365 | |
1366 | U_CAPI int32_t U_EXPORT2 |
1367 | uidna_labelToASCII(const UIDNA *idna, |
1368 | const char16_t *label, int32_t length, |
1369 | char16_t *dest, int32_t capacity, |
1370 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
1371 | if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { |
1372 | return 0; |
1373 | } |
1374 | UnicodeString src((UBool)(length<0), label, length); |
1375 | UnicodeString destString(dest, 0, capacity); |
1376 | IDNAInfo info; |
1377 | reinterpret_cast<const IDNA *>(idna)->labelToASCII(src, destString, info, *pErrorCode); |
1378 | idnaInfoToStruct(info, pInfo); |
1379 | return destString.extract(dest, capacity, *pErrorCode); |
1380 | } |
1381 | |
1382 | U_CAPI int32_t U_EXPORT2 |
1383 | uidna_labelToUnicode(const UIDNA *idna, |
1384 | const char16_t *label, int32_t length, |
1385 | char16_t *dest, int32_t capacity, |
1386 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
1387 | if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { |
1388 | return 0; |
1389 | } |
1390 | UnicodeString src((UBool)(length<0), label, length); |
1391 | UnicodeString destString(dest, 0, capacity); |
1392 | IDNAInfo info; |
1393 | reinterpret_cast<const IDNA *>(idna)->labelToUnicode(src, destString, info, *pErrorCode); |
1394 | idnaInfoToStruct(info, pInfo); |
1395 | return destString.extract(dest, capacity, *pErrorCode); |
1396 | } |
1397 | |
1398 | U_CAPI int32_t U_EXPORT2 |
1399 | uidna_nameToASCII(const UIDNA *idna, |
1400 | const char16_t *name, int32_t length, |
1401 | char16_t *dest, int32_t capacity, |
1402 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
1403 | if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { |
1404 | return 0; |
1405 | } |
1406 | UnicodeString src((UBool)(length<0), name, length); |
1407 | UnicodeString destString(dest, 0, capacity); |
1408 | IDNAInfo info; |
1409 | reinterpret_cast<const IDNA *>(idna)->nameToASCII(src, destString, info, *pErrorCode); |
1410 | idnaInfoToStruct(info, pInfo); |
1411 | return destString.extract(dest, capacity, *pErrorCode); |
1412 | } |
1413 | |
1414 | U_CAPI int32_t U_EXPORT2 |
1415 | uidna_nameToUnicode(const UIDNA *idna, |
1416 | const char16_t *name, int32_t length, |
1417 | char16_t *dest, int32_t capacity, |
1418 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
1419 | if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { |
1420 | return 0; |
1421 | } |
1422 | UnicodeString src((UBool)(length<0), name, length); |
1423 | UnicodeString destString(dest, 0, capacity); |
1424 | IDNAInfo info; |
1425 | reinterpret_cast<const IDNA *>(idna)->nameToUnicode(src, destString, info, *pErrorCode); |
1426 | idnaInfoToStruct(info, pInfo); |
1427 | return destString.extract(dest, capacity, *pErrorCode); |
1428 | } |
1429 | |
1430 | U_CAPI int32_t U_EXPORT2 |
1431 | uidna_labelToASCII_UTF8(const UIDNA *idna, |
1432 | const char *label, int32_t length, |
1433 | char *dest, int32_t capacity, |
1434 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
1435 | if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { |
1436 | return 0; |
1437 | } |
1438 | StringPiece src(label, length<0 ? static_cast<int32_t>(uprv_strlen(label)) : length); |
1439 | CheckedArrayByteSink sink(dest, capacity); |
1440 | IDNAInfo info; |
1441 | reinterpret_cast<const IDNA *>(idna)->labelToASCII_UTF8(src, sink, info, *pErrorCode); |
1442 | idnaInfoToStruct(info, pInfo); |
1443 | return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); |
1444 | } |
1445 | |
1446 | U_CAPI int32_t U_EXPORT2 |
1447 | uidna_labelToUnicodeUTF8(const UIDNA *idna, |
1448 | const char *label, int32_t length, |
1449 | char *dest, int32_t capacity, |
1450 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
1451 | if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { |
1452 | return 0; |
1453 | } |
1454 | StringPiece src(label, length<0 ? static_cast<int32_t>(uprv_strlen(label)) : length); |
1455 | CheckedArrayByteSink sink(dest, capacity); |
1456 | IDNAInfo info; |
1457 | reinterpret_cast<const IDNA *>(idna)->labelToUnicodeUTF8(src, sink, info, *pErrorCode); |
1458 | idnaInfoToStruct(info, pInfo); |
1459 | return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); |
1460 | } |
1461 | |
1462 | U_CAPI int32_t U_EXPORT2 |
1463 | uidna_nameToASCII_UTF8(const UIDNA *idna, |
1464 | const char *name, int32_t length, |
1465 | char *dest, int32_t capacity, |
1466 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
1467 | if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { |
1468 | return 0; |
1469 | } |
1470 | StringPiece src(name, length<0 ? static_cast<int32_t>(uprv_strlen(name)) : length); |
1471 | CheckedArrayByteSink sink(dest, capacity); |
1472 | IDNAInfo info; |
1473 | reinterpret_cast<const IDNA *>(idna)->nameToASCII_UTF8(src, sink, info, *pErrorCode); |
1474 | idnaInfoToStruct(info, pInfo); |
1475 | return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); |
1476 | } |
1477 | |
1478 | U_CAPI int32_t U_EXPORT2 |
1479 | uidna_nameToUnicodeUTF8(const UIDNA *idna, |
1480 | const char *name, int32_t length, |
1481 | char *dest, int32_t capacity, |
1482 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
1483 | if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { |
1484 | return 0; |
1485 | } |
1486 | StringPiece src(name, length<0 ? static_cast<int32_t>(uprv_strlen(name)) : length); |
1487 | CheckedArrayByteSink sink(dest, capacity); |
1488 | IDNAInfo info; |
1489 | reinterpret_cast<const IDNA *>(idna)->nameToUnicodeUTF8(src, sink, info, *pErrorCode); |
1490 | idnaInfoToStruct(info, pInfo); |
1491 | return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); |
1492 | } |
1493 | |
1494 | #endif // UCONFIG_NO_IDNA |
1495 | |