uts46.cpp source code [Godot/thirdparty/icu4c/common/uts46.cpp]

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: uts46.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010mar09
* created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_IDNA

#include "unicode/idna.h"
#include "unicode/normalizer2.h"
#include "unicode/uscript.h"
#include "unicode/ustring.h"
#include "unicode/utf16.h"
#include "cmemory.h"
#include "cstring.h"
#include "punycode.h"
#include "ubidi_props.h"
#include "ustr_imp.h"

// Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG:
//
// The domain name length limit is 255 octets in an internal DNS representation
// where the last ("root") label is the empty label
// represented by length byte 0 alone.
// In a conventional string, this translates to 253 characters, or 254
// if there is a trailing dot for the root label.

40	U_NAMESPACE_BEGIN
41
42	// Severe errors which usually result in a U+FFFD replacement character in the result string.
43	const uint32_t severeErrors=
44	UIDNA_ERROR_LEADING_COMBINING_MARK\|
45	UIDNA_ERROR_DISALLOWED\|
46	UIDNA_ERROR_PUNYCODE\|
47	UIDNA_ERROR_LABEL_HAS_DOT\|
48	UIDNA_ERROR_INVALID_ACE_LABEL;
49
50	static inline UBool
51	isASCIIString(const UnicodeString &dest) {
52	const char16_t *s=dest.getBuffer();
53	const char16_t *limit=s+dest.length();
54	while(s<limit) {
55	if(*s++>`0x7f`) {
56	return false;
57	}
58	}
59	return true;
60	}
61
62	static UBool
63	isASCIIOkBiDi(const char16_t *s, int32_t length);
64
65	static UBool
66	isASCIIOkBiDi(const char *s, int32_t length);
67
68	// IDNA class default implementations -------------------------------------- ***
69
70	IDNA::~IDNA() {}
71
72	void
73	IDNA::labelToASCII_UTF8(StringPiece label, ByteSink &dest,
74	IDNAInfo &info, UErrorCode &errorCode) const {
75	if(U_SUCCESS(errorCode)) {
76	UnicodeString destString;
77	labelToASCII(UnicodeString::fromUTF8(label), destString,
78	info, errorCode).toUTF8(dest);
79	}
80	}
81
82	void
83	IDNA::labelToUnicodeUTF8(StringPiece label, ByteSink &dest,
84	IDNAInfo &info, UErrorCode &errorCode) const {
85	if(U_SUCCESS(errorCode)) {
86	UnicodeString destString;
87	labelToUnicode(UnicodeString::fromUTF8(label), destString,
88	info, errorCode).toUTF8(dest);
89	}
90	}
91
92	void
93	IDNA::nameToASCII_UTF8(StringPiece name, ByteSink &dest,
94	IDNAInfo &info, UErrorCode &errorCode) const {
95	if(U_SUCCESS(errorCode)) {
96	UnicodeString destString;
97	nameToASCII(UnicodeString::fromUTF8(name), destString,
98	info, errorCode).toUTF8(dest);
99	}
100	}
101
102	void
103	IDNA::nameToUnicodeUTF8(StringPiece name, ByteSink &dest,
104	IDNAInfo &info, UErrorCode &errorCode) const {
105	if(U_SUCCESS(errorCode)) {
106	UnicodeString destString;
107	nameToUnicode(UnicodeString::fromUTF8(name), destString,
108	info, errorCode).toUTF8(dest);
109	}
110	}
111
112	// UTS46 class declaration ------------------------------------------------- ***
113
114	class UTS46 : public IDNA {
115	public:
116	UTS46(uint32_t options, UErrorCode &errorCode);
117	virtual ~UTS46();
118
119	virtual UnicodeString &
120	labelToASCII(const UnicodeString &label, UnicodeString &dest,
121	IDNAInfo &info, UErrorCode &errorCode) const override;
122
123	virtual UnicodeString &
124	labelToUnicode(const UnicodeString &label, UnicodeString &dest,
125	IDNAInfo &info, UErrorCode &errorCode) const override;
126
127	virtual UnicodeString &
128	nameToASCII(const UnicodeString &name, UnicodeString &dest,
129	IDNAInfo &info, UErrorCode &errorCode) const override;
130
131	virtual UnicodeString &
132	nameToUnicode(const UnicodeString &name, UnicodeString &dest,
133	IDNAInfo &info, UErrorCode &errorCode) const override;
134
135	virtual void
136	labelToASCII_UTF8(StringPiece label, ByteSink &dest,
137	IDNAInfo &info, UErrorCode &errorCode) const override;
138
139	virtual void
140	labelToUnicodeUTF8(StringPiece label, ByteSink &dest,
141	IDNAInfo &info, UErrorCode &errorCode) const override;
142
143	virtual void
144	nameToASCII_UTF8(StringPiece name, ByteSink &dest,
145	IDNAInfo &info, UErrorCode &errorCode) const override;
146
147	virtual void
148	nameToUnicodeUTF8(StringPiece name, ByteSink &dest,
149	IDNAInfo &info, UErrorCode &errorCode) const override;
150
151	private:
152	UnicodeString &
153	process(const UnicodeString &src,
154	UBool isLabel, UBool toASCII,
155	UnicodeString &dest,
156	IDNAInfo &info, UErrorCode &errorCode) const;
157
158	void
159	processUTF8(StringPiece src,
160	UBool isLabel, UBool toASCII,
161	ByteSink &dest,
162	IDNAInfo &info, UErrorCode &errorCode) const;
163
164	UnicodeString &
165	processUnicode(const UnicodeString &src,
166	int32_t labelStart, int32_t mappingStart,
167	UBool isLabel, UBool toASCII,
168	UnicodeString &dest,
169	IDNAInfo &info, UErrorCode &errorCode) const;
170
171	// returns the new dest.length()
172	int32_t
173	mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
174	UErrorCode &errorCode) const;
175
176	// returns the new label length
177	int32_t
178	processLabel(UnicodeString &dest,
179	int32_t labelStart, int32_t labelLength,
180	UBool toASCII,
181	IDNAInfo &info, UErrorCode &errorCode) const;
182	int32_t
183	markBadACELabel(UnicodeString &dest,
184	int32_t labelStart, int32_t labelLength,
185	UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const;
186
187	void
188	checkLabelBiDi(const char16_t label, int32_t labelLength, IDNAInfo &info) const*;
189
190	UBool
191	isLabelOkContextJ(const char16_t label, int32_t labelLength) const*;
192
193	void
194	checkLabelContextO(const char16_t label, int32_t labelLength, IDNAInfo &info) const*;
195
196	const Normalizer2 &uts46Norm2; // uts46.nrm
197	uint32_t options;
198	};
199
200	IDNA *
201	IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) {
202	if(U_SUCCESS(errorCode)) {
203	IDNA idna=new* UTS46(options, errorCode);
204	if(idna==nullptr) {
205	errorCode=U_MEMORY_ALLOCATION_ERROR;
206	} else if(U_FAILURE(errorCode)) {
207	delete idna;
208	idna=nullptr;
209	}
210	return idna;
211	} else {
212	return nullptr;
213	}
214	}
215
216	// UTS46 implementation ---------------------------------------------------- ***
217
218	UTS46::UTS46(uint32_t opt, UErrorCode &errorCode)
219	: uts46Norm2(Normalizer2::getInstance(nullptr*, "uts46", UNORM2_COMPOSE, errorCode)),
220	options(opt) {}
221
222	UTS46::~UTS46() {}
223
224	UnicodeString &
225	UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest,
226	IDNAInfo &info, UErrorCode &errorCode) const {
227	return process(label, true, true, dest, info, errorCode);
228	}
229
230	UnicodeString &
231	UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest,
232	IDNAInfo &info, UErrorCode &errorCode) const {
233	return process(label, true, false, dest, info, errorCode);
234	}
235
236	UnicodeString &
237	UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest,
238	IDNAInfo &info, UErrorCode &errorCode) const {
239	process(name, false, true, dest, info, errorCode);
240	if( dest.length()>=`254` && (info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==`0` &&
241	isASCIIString(dest) &&
242	(dest.length()>`254` \|\| dest[`253`]!=`0x2e`)
243	) {
244	info.errors\|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
245	}
246	return dest;
247	}
248
249	UnicodeString &
250	UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest,
251	IDNAInfo &info, UErrorCode &errorCode) const {
252	return process(name, false, false, dest, info, errorCode);
253	}
254
255	void
256	UTS46::labelToASCII_UTF8(StringPiece label, ByteSink &dest,
257	IDNAInfo &info, UErrorCode &errorCode) const {
258	processUTF8(label, true, true, dest, info, errorCode);
259	}
260
261	void
262	UTS46::labelToUnicodeUTF8(StringPiece label, ByteSink &dest,
263	IDNAInfo &info, UErrorCode &errorCode) const {
264	processUTF8(label, true, false, dest, info, errorCode);
265	}
266
267	void
268	UTS46::nameToASCII_UTF8(StringPiece name, ByteSink &dest,
269	IDNAInfo &info, UErrorCode &errorCode) const {
270	processUTF8(name, false, true, dest, info, errorCode);
271	}
272
273	void
274	UTS46::nameToUnicodeUTF8(StringPiece name, ByteSink &dest,
275	IDNAInfo &info, UErrorCode &errorCode) const {
276	processUTF8(name, false, false, dest, info, errorCode);
277	}
278
// UTS #46 data for ASCII characters.
// The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
// and passes through all other ASCII characters.
// If UIDNA_USE_STD3_RULES is set, then non-LDH characters are disallowed
// using this data.
// The ASCII fastpath also uses this data.
// Values: -1=disallowed 0==valid 1==mapped (lowercase)
// Indexed by the ASCII code point 0..0x7f; each row below covers 16 code points.
static const int8_t asciiData[128]={
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 002D..002E; valid  #  HYPHEN-MINUS..FULL STOP
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
    // 0030..0039; valid  #  DIGIT ZERO..DIGIT NINE
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
    // 0041..005A; mapped  #  LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
    -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
    // 0061..007A; valid  #  LATIN SMALL LETTER A..LATIN SMALL LETTER Z
    -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
};

301	UnicodeString &
302	UTS46::process(const UnicodeString &src,
303	UBool isLabel, UBool toASCII,
304	UnicodeString &dest,
305	IDNAInfo &info, UErrorCode &errorCode) const {
306	// uts46Norm2.normalize() would do all of this error checking and setup,
307	// but with the ASCII fastpath we do not always call it, and do not
308	// call it first.
309	if(U_FAILURE(errorCode)) {
310	dest.setToBogus();
311	return dest;
312	}
313	const char16_t *srcArray=src.getBuffer();
314	if(&dest==&src \|\| srcArray==nullptr) {
315	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
316	dest.setToBogus();
317	return dest;
318	}
319	// Arguments are fine, reset output values.
320	dest.remove();
321	info.reset();
322	int32_t srcLength=src.length();
323	if(srcLength==`0`) {
324	info.errors\|=UIDNA_ERROR_EMPTY_LABEL;
325	return dest;
326	}
327	char16_t *destArray=dest.getBuffer(srcLength);
328	if(destArray==nullptr) {
329	errorCode=U_MEMORY_ALLOCATION_ERROR;
330	return dest;
331	}
332	// ASCII fastpath
333	UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=`0`;
334	int32_t labelStart=`0`;
335	int32_t i;
336	for(i=`0`;; ++i) {
337	if(i==srcLength) {
338	if(toASCII) {
339	if((i-labelStart)>`63`) {
340	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
341	}
342	// There is a trailing dot if labelStart==i.
343	if(!isLabel && i>=`254` && (i>`254` \|\| labelStart<i)) {
344	info.errors\|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
345	}
346	}
347	info.errors\|=info.labelErrors;
348	dest.releaseBuffer(i);
349	return dest;
350	}
351	char16_t c=srcArray[i];
352	if(c>`0x7f`) {
353	break;
354	}
355	int cData=asciiData[c];
356	if(cData>`0`) {
357	destArray[i]=c+`0x20`; // Lowercase an uppercase ASCII letter.
358	} else if(cData<`0` && disallowNonLDHDot) {
359	break; // Replacing with U+FFFD can be complicated for toASCII.
360	} else {
361	destArray[i]=c;
362	if(c==`0x2d`) { // hyphen
363	if(i==(labelStart+`3`) && srcArray[i-`1`]==`0x2d`) {
364	// "??--..." is Punycode or forbidden.
365	++i; // '-' was copied to dest already
366	break;
367	}
368	if(i==labelStart) {
369	// label starts with "-"
370	info.labelErrors\|=UIDNA_ERROR_LEADING_HYPHEN;
371	}
372	if((i+`1`)==srcLength \|\| srcArray[i+`1`]==`0x2e`) {
373	// label ends with "-"
374	info.labelErrors\|=UIDNA_ERROR_TRAILING_HYPHEN;
375	}
376	} else if(c==`0x2e`) { // dot
377	if(isLabel) {
378	// Replacing with U+FFFD can be complicated for toASCII.
379	++i; // '.' was copied to dest already
380	break;
381	}
382	if(i==labelStart) {
383	info.labelErrors\|=UIDNA_ERROR_EMPTY_LABEL;
384	}
385	if(toASCII && (i-labelStart)>`63`) {
386	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
387	}
388	info.errors\|=info.labelErrors;
389	info.labelErrors=`0`;
390	labelStart=i+`1`;
391	}
392	}
393	}
394	info.errors\|=info.labelErrors;
395	dest.releaseBuffer(i);
396	processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode);
397	if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==`0` &&
398	(!info.isOkBiDi \|\| (labelStart>`0` && !isASCIIOkBiDi(dest.getBuffer(), labelStart)))
399	) {
400	info.errors\|=UIDNA_ERROR_BIDI;
401	}
402	return dest;
403	}
404
405	void
406	UTS46::processUTF8(StringPiece src,
407	UBool isLabel, UBool toASCII,
408	ByteSink &dest,
409	IDNAInfo &info, UErrorCode &errorCode) const {
410	if(U_FAILURE(errorCode)) {
411	return;
412	}
413	const char *srcArray=src.data();
414	int32_t srcLength=src.length();
415	if(srcArray==nullptr && srcLength!=`0`) {
416	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
417	return;
418	}
419	// Arguments are fine, reset output values.
420	info.reset();
421	if(srcLength==`0`) {
422	info.errors\|=UIDNA_ERROR_EMPTY_LABEL;
423	dest.Flush();
424	return;
425	}
426	UnicodeString destString;
427	int32_t labelStart=`0`;
428	if(srcLength<=`256`) { // length of stackArray[]
429	// ASCII fastpath
430	char stackArray[`256`];
431	int32_t destCapacity;
432	char *destArray=dest.GetAppendBuffer(srcLength, srcLength+`20`,
433	stackArray, UPRV_LENGTHOF(stackArray), &destCapacity);
434	UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=`0`;
435	int32_t i;
436	for(i=`0`;; ++i) {
437	if(i==srcLength) {
438	if(toASCII) {
439	if((i-labelStart)>`63`) {
440	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
441	}
442	// There is a trailing dot if labelStart==i.
443	if(!isLabel && i>=`254` && (i>`254` \|\| labelStart<i)) {
444	info.errors\|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
445	}
446	}
447	info.errors\|=info.labelErrors;
448	dest.Append(destArray, i);
449	dest.Flush();
450	return;
451	}
452	char c=srcArray[i];
453	if((int8_t)c<`0`) { // (uint8_t)c>0x7f
454	break;
455	}
456	int cData=asciiData[(int)c]; // Cast: gcc warns about indexing with a char.
457	if(cData>`0`) {
458	destArray[i]=c+`0x20`; // Lowercase an uppercase ASCII letter.
459	} else if(cData<`0` && disallowNonLDHDot) {
460	break; // Replacing with U+FFFD can be complicated for toASCII.
461	} else {
462	destArray[i]=c;
463	if(c==`0x2d`) { // hyphen
464	if(i==(labelStart+`3`) && srcArray[i-`1`]==`0x2d`) {
465	// "??--..." is Punycode or forbidden.
466	break;
467	}
468	if(i==labelStart) {
469	// label starts with "-"
470	info.labelErrors\|=UIDNA_ERROR_LEADING_HYPHEN;
471	}
472	if((i+`1`)==srcLength \|\| srcArray[i+`1`]==`0x2e`) {
473	// label ends with "-"
474	info.labelErrors\|=UIDNA_ERROR_TRAILING_HYPHEN;
475	}
476	} else if(c==`0x2e`) { // dot
477	if(isLabel) {
478	break; // Replacing with U+FFFD can be complicated for toASCII.
479	}
480	if(i==labelStart) {
481	info.labelErrors\|=UIDNA_ERROR_EMPTY_LABEL;
482	}
483	if(toASCII && (i-labelStart)>`63`) {
484	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
485	}
486	info.errors\|=info.labelErrors;
487	info.labelErrors=`0`;
488	labelStart=i+`1`;
489	}
490	}
491	}
492	info.errors\|=info.labelErrors;
493	// Convert the processed ASCII prefix of the current label to UTF-16.
494	int32_t mappingStart=i-labelStart;
495	destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, mappingStart));
496	// Output the previous ASCII labels and process the rest of src in UTF-16.
497	dest.Append(destArray, labelStart);
498	processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), `0`, mappingStart,
499	isLabel, toASCII,
500	destString, info, errorCode);
501	} else {
502	// src is too long for the ASCII fastpath implementation.
503	processUnicode(UnicodeString::fromUTF8(src), `0`, `0`,
504	isLabel, toASCII,
505	destString, info, errorCode);
506	}
507	destString.toUTF8(dest); // calls dest.Flush()
508	if(toASCII && !isLabel) {
509	// length==labelStart==254 means that there is a trailing dot (ok) and
510	// destString is empty (do not index at 253-labelStart).
511	int32_t length=labelStart+destString.length();
512	if( length>=`254` && isASCIIString(destString) &&
513	(length>`254` \|\|
514	(labelStart<`254` && destString[`253`-labelStart]!=`0x2e`))
515	) {
516	info.errors\|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
517	}
518	}
519	if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==`0` &&
520	(!info.isOkBiDi \|\| (labelStart>`0` && !isASCIIOkBiDi(srcArray, labelStart)))
521	) {
522	info.errors\|=UIDNA_ERROR_BIDI;
523	}
524	}
525
526	UnicodeString &
527	UTS46::processUnicode(const UnicodeString &src,
528	int32_t labelStart, int32_t mappingStart,
529	UBool isLabel, UBool toASCII,
530	UnicodeString &dest,
531	IDNAInfo &info, UErrorCode &errorCode) const {
532	if(mappingStart==`0`) {
533	uts46Norm2.normalize(src, dest, errorCode);
534	} else {
535	uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode);
536	}
537	if(U_FAILURE(errorCode)) {
538	return dest;
539	}
540	UBool doMapDevChars=
541	toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==`0` :
542	(options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==`0`;
543	const char16_t *destArray=dest.getBuffer();
544	int32_t destLength=dest.length();
545	int32_t labelLimit=labelStart;
546	while(labelLimit<destLength) {
547	char16_t c=destArray[labelLimit];
548	if(c==`0x2e` && !isLabel) {
549	int32_t labelLength=labelLimit-labelStart;
550	int32_t newLength=processLabel(dest, labelStart, labelLength,
551	toASCII, info, errorCode);
552	info.errors\|=info.labelErrors;
553	info.labelErrors=`0`;
554	if(U_FAILURE(errorCode)) {
555	return dest;
556	}
557	destArray=dest.getBuffer();
558	destLength+=newLength-labelLength;
559	labelLimit=labelStart+=newLength+`1`;
560	continue;
561	} else if(c<`0xdf`) {
562	// pass
563	} else if(c<=`0x200d` && (c==`0xdf` \|\| c==`0x3c2` \|\| c>=`0x200c`)) {
564	info.isTransDiff=true;
565	if(doMapDevChars) {
566	destLength=mapDevChars(dest, labelStart, labelLimit, errorCode);
567	if(U_FAILURE(errorCode)) {
568	return dest;
569	}
570	destArray=dest.getBuffer();
571	// All deviation characters have been mapped, no need to check for them again.
572	doMapDevChars=false;
573	// Do not increment labelLimit in case c was removed.
574	continue;
575	}
576	} else if(U16_IS_SURROGATE(c)) {
577	if(U16_IS_SURROGATE_LEAD(c) ?
578	(labelLimit+`1`)==destLength \|\| !U16_IS_TRAIL(destArray[labelLimit+`1`]) :
579	labelLimit==labelStart \|\| !U16_IS_LEAD(destArray[labelLimit-`1`])) {
580	// Map an unpaired surrogate to U+FFFD before normalization so that when
581	// that removes characters we do not turn two unpaired ones into a pair.
582	info.labelErrors\|=UIDNA_ERROR_DISALLOWED;
583	dest.setCharAt(labelLimit, `0xfffd`);
584	destArray=dest.getBuffer();
585	}
586	}
587	++labelLimit;
588	}
589	// Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
590	// but not an empty label elsewhere nor a completely empty domain name.
591	// processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
592	if(`0`==labelStart \|\| labelStart<labelLimit) {
593	processLabel(dest, labelStart, labelLimit-labelStart,
594	toASCII, info, errorCode);
595	info.errors\|=info.labelErrors;
596	}
597	return dest;
598	}
599
600	int32_t
601	UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
602	UErrorCode &errorCode) const {
603	if(U_FAILURE(errorCode)) {
604	return `0`;
605	}
606	int32_t length=dest.length();
607	char16_t *s=dest.getBuffer(dest[mappingStart]==`0xdf` ? length+`1` : length);
608	if(s==nullptr) {
609	errorCode=U_MEMORY_ALLOCATION_ERROR;
610	return length;
611	}
612	int32_t capacity=dest.getCapacity();
613	UBool didMapDevChars=false;
614	int32_t readIndex=mappingStart, writeIndex=mappingStart;
615	do {
616	char16_t c=s[readIndex++];
617	switch(c) {
618	case `0xdf`:
619	// Map sharp s to ss.
620	didMapDevChars=true;
621	s[writeIndex++]=`0x73`; // Replace sharp s with first s.
622	// Insert second s and account for possible buffer reallocation.
623	if(writeIndex==readIndex) {
624	if(length==capacity) {
625	dest.releaseBuffer(length);
626	s=dest.getBuffer(length+`1`);
627	if(s==nullptr) {
628	errorCode=U_MEMORY_ALLOCATION_ERROR;
629	return length;
630	}
631	capacity=dest.getCapacity();
632	}
633	u_memmove(s+writeIndex+`1`, s+writeIndex, length-writeIndex);
634	++readIndex;
635	}
636	s[writeIndex++]=`0x73`;
637	++length;
638	break;
639	case `0x3c2`: // Map final sigma to nonfinal sigma.
640	didMapDevChars=true;
641	s[writeIndex++]=`0x3c3`;
642	break;
643	case `0x200c`: // Ignore/remove ZWNJ.
644	case `0x200d`: // Ignore/remove ZWJ.
645	didMapDevChars=true;
646	--length;
647	break;
648	default:
649	// Only really necessary if writeIndex was different from readIndex.
650	s[writeIndex++]=c;
651	break;
652	}
653	} while(writeIndex<length);
654	dest.releaseBuffer(length);
655	if(didMapDevChars) {
656	// Mapping deviation characters might have resulted in an un-NFC string.
657	// We could use either the NFC or the UTS #46 normalizer.
658	// By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
659	UnicodeString normalized;
660	uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCode);
661	if(U_SUCCESS(errorCode)) {
662	dest.replace(labelStart, `0x7fffffff`, normalized);
663	if(dest.isBogus()) {
664	errorCode=U_MEMORY_ALLOCATION_ERROR;
665	}
666	return dest.length();
667	}
668	}
669	return length;
670	}
671
672	// Some non-ASCII characters are equivalent to sequences with
673	// non-LDH ASCII characters. To find them:
674	// grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)
675	static inline UBool
676	isNonASCIIDisallowedSTD3Valid(UChar32 c) {
677	return c==`0x2260` \|\| c==`0x226E` \|\| c==`0x226F`;
678	}
679
680	// Replace the label in dest with the label string, if the label was modified.
681	// If &label==&dest then the label was modified in-place and labelLength
682	// is the new label length, different from label.length().
683	// If &label!=&dest then labelLength==label.length().
684	// Returns labelLength (= the new label length).
685	static int32_t
686	replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLength,
687	const UnicodeString &label, int32_t labelLength, UErrorCode &errorCode) {
688	if(U_FAILURE(errorCode)) {
689	return `0`;
690	}
691	if(&label!=&dest) {
692	dest.replace(destLabelStart, destLabelLength, label);
693	if(dest.isBogus()) {
694	errorCode=U_MEMORY_ALLOCATION_ERROR;
695	return `0`;
696	}
697	}
698	return labelLength;
699	}
700
701	int32_t
702	UTS46::processLabel(UnicodeString &dest,
703	int32_t labelStart, int32_t labelLength,
704	UBool toASCII,
705	IDNAInfo &info, UErrorCode &errorCode) const {
706	if(U_FAILURE(errorCode)) {
707	return `0`;
708	}
709	UnicodeString fromPunycode;
710	UnicodeString *labelString;
711	const char16_t *label=dest.getBuffer()+labelStart;
712	int32_t destLabelStart=labelStart;
713	int32_t destLabelLength=labelLength;
714	UBool wasPunycode;
715	if(labelLength>=`4` && label[`0`]==`0x78` && label[`1`]==`0x6e` && label[`2`]==`0x2d` && label[`3`]==`0x2d`) {
716	// Label starts with "xn--", try to un-Punycode it.
717	// In IDNA2008, labels like "xn--" (decodes to an empty string) and
718	// "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from
719	// comparing the ToUnicode input with the back-to-ToASCII output.
720	// They are alternate encodings of the respective ASCII labels.
721	// Ignore "xn---" here: It will fail Punycode.decode() which logically comes before
722	// the round-trip verification.
723	if(labelLength==`4` \|\| (labelLength>`5` && label[labelLength-`1`]==u`'-'`)) {
724	info.labelErrors\|=UIDNA_ERROR_INVALID_ACE_LABEL;
725	return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
726	}
727	wasPunycode=true;
728	char16_t unicodeBuffer=fromPunycode.getBuffer(-`1`); // capacity==-1: most labels should fit*
729	if(unicodeBuffer==nullptr) {
730	// Should never occur if we used capacity==-1 which uses the internal buffer.
731	errorCode=U_MEMORY_ALLOCATION_ERROR;
732	return labelLength;
733	}
734	UErrorCode punycodeErrorCode=U_ZERO_ERROR;
735	int32_t unicodeLength=u_strFromPunycode(label+`4`, labelLength-`4`,
736	unicodeBuffer, fromPunycode.getCapacity(),
737	nullptr, &punycodeErrorCode);
738	if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) {
739	fromPunycode.releaseBuffer(`0`);
740	unicodeBuffer=fromPunycode.getBuffer(unicodeLength);
741	if(unicodeBuffer==nullptr) {
742	errorCode=U_MEMORY_ALLOCATION_ERROR;
743	return labelLength;
744	}
745	punycodeErrorCode=U_ZERO_ERROR;
746	unicodeLength=u_strFromPunycode(label+`4`, labelLength-`4`,
747	unicodeBuffer, fromPunycode.getCapacity(),
748	nullptr, &punycodeErrorCode);
749	}
750	fromPunycode.releaseBuffer(unicodeLength);
751	if(U_FAILURE(punycodeErrorCode)) {
752	info.labelErrors\|=UIDNA_ERROR_PUNYCODE;
753	return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
754	}
755	// Check for NFC, and for characters that are not
756	// valid or deviation characters according to the normalizer.
757	// If there is something wrong, then the string will change.
758	// Note that the normalizer passes through non-LDH ASCII and deviation characters.
759	// Deviation characters are ok in Punycode even in transitional processing.
760	// In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
761	// then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
762	UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode);
763	if(U_FAILURE(errorCode)) {
764	return labelLength;
765	}
766	if(!isValid) {
767	info.labelErrors\|=UIDNA_ERROR_INVALID_ACE_LABEL;
768	return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
769	}
770	labelString=&fromPunycode;
771	label=fromPunycode.getBuffer();
772	labelStart=`0`;
773	labelLength=fromPunycode.length();
774	} else {
775	wasPunycode=false;
776	labelString=&dest;
777	}
778	// Validity check
779	if(labelLength==`0`) {
780	info.labelErrors\|=UIDNA_ERROR_EMPTY_LABEL;
781	return replaceLabel(dest, destLabelStart, destLabelLength,
782	*labelString, labelLength, errorCode);
783	}
784	// labelLength>0
785	if(labelLength>=`4` && label[`2`]==`0x2d` && label[`3`]==`0x2d`) {
786	// label starts with "??--"
787	info.labelErrors\|=UIDNA_ERROR_HYPHEN_3_4;
788	}
789	if(label[`0`]==`0x2d`) {
790	// label starts with "-"
791	info.labelErrors\|=UIDNA_ERROR_LEADING_HYPHEN;
792	}
793	if(label[labelLength-`1`]==`0x2d`) {
794	// label ends with "-"
795	info.labelErrors\|=UIDNA_ERROR_TRAILING_HYPHEN;
796	}
797	// If the label was not a Punycode label, then it was the result of
798	// mapping, normalization and label segmentation.
799	// If the label was in Punycode, then we mapped it again above
800	// and checked its validity.
801	// Now we handle the STD3 restriction to LDH characters (if set)
802	// and we look for U+FFFD which indicates disallowed characters
803	// in a non-Punycode label or U+FFFD itself in a Punycode label.
804	// We also check for dots which can come from the input to a single-label function.
805	// Ok to cast away const because we own the UnicodeString.
806	char16_t s=(char16_t* *)label;
807	const char16_t *limit=label+labelLength;
808	char16_t oredChars=`0`;
809	// If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
810	UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=`0`;
811	do {
812	char16_t c=*s;
813	if(c<=`0x7f`) {
814	if(c==`0x2e`) {
815	info.labelErrors\|=UIDNA_ERROR_LABEL_HAS_DOT;
816	*s=`0xfffd`;
817	} else if(disallowNonLDHDot && asciiData[c]<`0`) {
818	info.labelErrors\|=UIDNA_ERROR_DISALLOWED;
819	*s=`0xfffd`;
820	}
821	} else {
822	oredChars\|=c;
823	if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
824	info.labelErrors\|=UIDNA_ERROR_DISALLOWED;
825	*s=`0xfffd`;
826	} else if(c==`0xfffd`) {
827	info.labelErrors\|=UIDNA_ERROR_DISALLOWED;
828	}
829	}
830	++s;
831	} while(s<limit);
832	// Check for a leading combining mark after other validity checks
833	// so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here.
834	UChar32 c;
835	int32_t cpLength=`0`;
836	// "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
837	U16_NEXT_UNSAFE(label, cpLength, c);
838	if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=`0`) {
839	info.labelErrors\|=UIDNA_ERROR_LEADING_COMBINING_MARK;
840	labelString->replace(labelStart, cpLength, (char16_t)`0xfffd`);
841	label=labelString->getBuffer()+labelStart;
842	labelLength+=`1`-cpLength;
843	if(labelString==&dest) {
844	destLabelLength=labelLength;
845	}
846	}
847	if((info.labelErrors&severeErrors)==`0`) {
848	// Do contextual checks only if we do not have U+FFFD from a severe error
849	// because U+FFFD can make these checks fail.
850	if((options&UIDNA_CHECK_BIDI)!=`0` && (!info.isBiDi \|\| info.isOkBiDi)) {
851	checkLabelBiDi(label, labelLength, info);
852	}
853	if( (options&UIDNA_CHECK_CONTEXTJ)!=`0` && (oredChars&`0x200c`)==`0x200c` &&
854	!isLabelOkContextJ(label, labelLength)
855	) {
856	info.labelErrors\|=UIDNA_ERROR_CONTEXTJ;
857	}
858	if((options&UIDNA_CHECK_CONTEXTO)!=`0` && oredChars>=`0xb7`) {
859	checkLabelContextO(label, labelLength, info);
860	}
861	if(toASCII) {
862	if(wasPunycode) {
863	// Leave a Punycode label unchanged if it has no severe errors.
864	if(destLabelLength>`63`) {
865	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
866	}
867	return destLabelLength;
868	} else if(oredChars>=`0x80`) {
869	// Contains non-ASCII characters.
870	UnicodeString punycode;
871	char16_t buffer=punycode.getBuffer(`63`); // 63==maximum DNS label length*
872	if(buffer==nullptr) {
873	errorCode=U_MEMORY_ALLOCATION_ERROR;
874	return destLabelLength;
875	}
876	buffer[`0`]=`0x78`; // Write "xn--".
877	buffer[`1`]=`0x6e`;
878	buffer[`2`]=`0x2d`;
879	buffer[`3`]=`0x2d`;
880	int32_t punycodeLength=u_strToPunycode(label, labelLength,
881	buffer+`4`, punycode.getCapacity()-`4`,
882	nullptr, &errorCode);
883	if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
884	errorCode=U_ZERO_ERROR;
885	punycode.releaseBuffer(`4`);
886	buffer=punycode.getBuffer(`4`+punycodeLength);
887	if(buffer==nullptr) {
888	errorCode=U_MEMORY_ALLOCATION_ERROR;
889	return destLabelLength;
890	}
891	punycodeLength=u_strToPunycode(label, labelLength,
892	buffer+`4`, punycode.getCapacity()-`4`,
893	nullptr, &errorCode);
894	}
895	punycodeLength+=`4`;
896	punycode.releaseBuffer(punycodeLength);
897	if(U_FAILURE(errorCode)) {
898	return destLabelLength;
899	}
900	if(punycodeLength>`63`) {
901	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
902	}
903	return replaceLabel(dest, destLabelStart, destLabelLength,
904	punycode, punycodeLength, errorCode);
905	} else {
906	// all-ASCII label
907	if(labelLength>`63`) {
908	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
909	}
910	}
911	}
912	} else {
913	// If a Punycode label has severe errors,
914	// then leave it but make sure it does not look valid.
915	if(wasPunycode) {
916	info.labelErrors\|=UIDNA_ERROR_INVALID_ACE_LABEL;
917	return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info, errorCode);
918	}
919	}
920	return replaceLabel(dest, destLabelStart, destLabelLength,
921	*labelString, labelLength, errorCode);
922	}
923
924	// Make sure an ACE label does not look valid.
925	// Append U+FFFD if the label has only LDH characters.
926	// If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD.
927	int32_t
928	UTS46::markBadACELabel(UnicodeString &dest,
929	int32_t labelStart, int32_t labelLength,
930	UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const {
931	if(U_FAILURE(errorCode)) {
932	return `0`;
933	}
934	UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=`0`;
935	UBool isASCII=true;
936	UBool onlyLDH=true;
937	const char16_t *label=dest.getBuffer()+labelStart;
938	const char16_t *limit=label+labelLength;
939	// Start after the initial "xn--".
940	// Ok to cast away const because we own the UnicodeString.
941	for(char16_t s=const_cast<char16_t* *>(label+`4`); s<limit; ++s) {
942	char16_t c=*s;
943	if(c<=`0x7f`) {
944	if(c==`0x2e`) {
945	info.labelErrors\|=UIDNA_ERROR_LABEL_HAS_DOT;
946	*s=`0xfffd`;
947	isASCII=onlyLDH=false;
948	} else if(asciiData[c]<`0`) {
949	onlyLDH=false;
950	if(disallowNonLDHDot) {
951	*s=`0xfffd`;
952	isASCII=false;
953	}
954	}
955	} else {
956	isASCII=onlyLDH=false;
957	}
958	}
959	if(onlyLDH) {
960	dest.insert(labelStart+labelLength, (char16_t)`0xfffd`);
961	if(dest.isBogus()) {
962	errorCode=U_MEMORY_ALLOCATION_ERROR;
963	return `0`;
964	}
965	++labelLength;
966	} else {
967	if(toASCII && isASCII && labelLength>`63`) {
968	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
969	}
970	}
971	return labelLength;
972	}
973
974	const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT);
975	const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)\|U_MASK(U_RIGHT_TO_LEFT_ARABIC);
976	const uint32_t L_R_AL_MASK=L_MASK\|R_AL_MASK;
977
978	const uint32_t R_AL_AN_MASK=R_AL_MASK\|U_MASK(U_ARABIC_NUMBER);
979
980	const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)\|U_MASK(U_ARABIC_NUMBER);
981	const uint32_t R_AL_EN_AN_MASK=R_AL_MASK\|EN_AN_MASK;
982	const uint32_t L_EN_MASK=L_MASK\|U_MASK(U_EUROPEAN_NUMBER);
983
984	const uint32_t ES_CS_ET_ON_BN_NSM_MASK=
985	U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)\|
986	U_MASK(U_COMMON_NUMBER_SEPARATOR)\|
987	U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)\|
988	U_MASK(U_OTHER_NEUTRAL)\|
989	U_MASK(U_BOUNDARY_NEUTRAL)\|
990	U_MASK(U_DIR_NON_SPACING_MARK);
991	const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK\|ES_CS_ET_ON_BN_NSM_MASK;
992	const uint32_t R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK\|EN_AN_MASK\|ES_CS_ET_ON_BN_NSM_MASK;
993
994	// We scan the whole label and check both for whether it contains RTL characters
995	// and whether it passes the BiDi Rule.
996	// In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
997	// that a domain name is a BiDi domain name (has an RTL label) only after
998	// processing several earlier labels.
999	void
1000	UTS46::checkLabelBiDi(const char16_t label, int32_t labelLength, IDNAInfo &info) const* {
1001	// IDNA2008 BiDi rule
1002	// Get the directionality of the first character.
1003	UChar32 c;
1004	int32_t i=`0`;
1005	U16_NEXT_UNSAFE(label, i, c);
1006	uint32_t firstMask=U_MASK(u_charDirection(c));
1007	// 1. The first character must be a character with BIDI property L, R
1008	// or AL. If it has the R or AL property, it is an RTL label; if it
1009	// has the L property, it is an LTR label.
1010	if((firstMask&~L_R_AL_MASK)!=`0`) {
1011	info.isOkBiDi=false;
1012	}
1013	// Get the directionality of the last non-NSM character.
1014	uint32_t lastMask;
1015	for(;;) {
1016	if(i>=labelLength) {
1017	lastMask=firstMask;
1018	break;
1019	}
1020	U16_PREV_UNSAFE(label, labelLength, c);
1021	UCharDirection dir=u_charDirection(c);
1022	if(dir!=U_DIR_NON_SPACING_MARK) {
1023	lastMask=U_MASK(dir);
1024	break;
1025	}
1026	}
1027	// 3. In an RTL label, the end of the label must be a character with
1028	// BIDI property R, AL, EN or AN, followed by zero or more
1029	// characters with BIDI property NSM.
1030	// 6. In an LTR label, the end of the label must be a character with
1031	// BIDI property L or EN, followed by zero or more characters with
1032	// BIDI property NSM.
1033	if( (firstMask&L_MASK)!=`0` ?
1034	(lastMask&~L_EN_MASK)!=`0` :
1035	(lastMask&~R_AL_EN_AN_MASK)!=`0`
1036	) {
1037	info.isOkBiDi=false;
1038	}
1039	// Add the directionalities of the intervening characters.
1040	uint32_t mask=firstMask\|lastMask;
1041	while(i<labelLength) {
1042	U16_NEXT_UNSAFE(label, i, c);
1043	mask\|=U_MASK(u_charDirection(c));
1044	}
1045	if(firstMask&L_MASK) {
1046	// 5. In an LTR label, only characters with the BIDI properties L, EN,
1047	// ES, CS, ET, ON, BN and NSM are allowed.
1048	if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=`0`) {
1049	info.isOkBiDi=false;
1050	}
1051	} else {
1052	// 2. In an RTL label, only characters with the BIDI properties R, AL,
1053	// AN, EN, ES, CS, ET, ON, BN and NSM are allowed.
1054	if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=`0`) {
1055	info.isOkBiDi=false;
1056	}
1057	// 4. In an RTL label, if an EN is present, no AN may be present, and
1058	// vice versa.
1059	if((mask&EN_AN_MASK)==EN_AN_MASK) {
1060	info.isOkBiDi=false;
1061	}
1062	}
1063	// An RTL label is a label that contains at least one character of type
1064	// R, AL or AN. [...]
1065	// A "BIDI domain name" is a domain name that contains at least one RTL
1066	// label. [...]
1067	// The following rule, consisting of six conditions, applies to labels
1068	// in BIDI domain names.
1069	if((mask&R_AL_AN_MASK)!=`0`) {
1070	info.isBiDi=true;
1071	}
1072	}
1073
1074	// Special code for the ASCII prefix of a BiDi domain name.
1075	// The ASCII prefix is all-LTR.
1076
1077	// IDNA2008 BiDi rule, parts relevant to ASCII labels:
1078	// 1. The first character must be a character with BIDI property L [...]
1079	// 5. In an LTR label, only characters with the BIDI properties L, EN,
1080	// ES, CS, ET, ON, BN and NSM are allowed.
1081	// 6. In an LTR label, the end of the label must be a character with
1082	// BIDI property L or EN [...]
1083
// UTF-16 version, called for mapped ASCII prefix.
// Cannot contain uppercase A-Z.
// s[length-1] must be the trailing dot.
//
// Checks the ASCII-relevant parts of the BiDi Rule for each dot-terminated
// label in s[0..length):
// - the first character must be a lowercase letter a-z (an L),
// - the last character must be a-z or a digit 0-9 (an L or EN),
// - characters in between must not be U+0009..U+000D or U+001C..U+0020
//   (B, S or WS).
// The end-of-label check only runs when a dot is seen, which is why the
// caller must include the trailing dot. Empty labels are accepted.
// Returns true if all labels pass, false at the first violation.
static UBool
isASCIIOkBiDi(const char16_t *s, int32_t length) {
    int32_t labelStart=0;
    for(int32_t i=0; i<length; ++i) {
        char16_t c=s[i];
        if(c==0x2e) {  // dot
            if(i>labelStart) {
                c=s[i-1];
                if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) {
                    // Last character in the label is not an L or EN.
                    return false;
                }
            }
            labelStart=i+1;
        } else if(i==labelStart) {
            if(!(0x61<=c && c<=0x7a)) {
                // First character in the label is not an L.
                return false;
            }
        } else {
            if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
                // Intermediate character in the label is a B, S or WS.
                return false;
            }
        }
    }
    return true;
}
1115
// UTF-8 version, called for source ASCII prefix.
// Can contain uppercase A-Z.
// s[length-1] must be the trailing dot.
//
// Checks the ASCII-relevant parts of the BiDi Rule for each dot-terminated
// label in s[0..length):
// - the first character must be a letter a-z or A-Z (an L),
// - the last character must be a letter or a digit 0-9 (an L or EN),
// - characters in between must not be 0x09..0x0D or 0x1C..0x20 (B, S or WS).
// The end-of-label check only runs when a dot is seen, which is why the
// caller must include the trailing dot. Empty labels are accepted.
// Returns true if all labels pass, false at the first violation.
static UBool
isASCIIOkBiDi(const char *s, int32_t length) {
    int32_t labelStart=0;
    for(int32_t i=0; i<length; ++i) {
        char c=s[i];
        if(c==0x2e) {  // dot
            if(i>labelStart) {
                c=s[i-1];
                if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c && c<=0x39)) {
                    // Last character in the label is not an L or EN.
                    return false;
                }
            }
            labelStart=i+1;
        } else if(i==labelStart) {
            if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) {
                // First character in the label is not an L.
                return false;
            }
        } else {
            if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
                // Intermediate character in the label is a B, S or WS.
                return false;
            }
        }
    }
    return true;
}
1147
1148	UBool
1149	UTS46::isLabelOkContextJ(const char16_t label, int32_t labelLength) const* {
1150	// [IDNA2008-Tables]
1151	// 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
1152	for(int32_t i=`0`; i<labelLength; ++i) {
1153	if(label[i]==`0x200c`) {
1154	// Appendix A.1. ZERO WIDTH NON-JOINER
1155	// Rule Set:
1156	// False;
1157	// If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
1158	// If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)\u200C*
1159	// (Joining_Type:T)(Joining_Type:{R,D})) Then True;*
1160	if(i==`0`) {
1161	return false;
1162	}
1163	UChar32 c;
1164	int32_t j=i;
1165	U16_PREV_UNSAFE(label, j, c);
1166	if(uts46Norm2.getCombiningClass(c)==`9`) {
1167	continue;
1168	}
1169	// check precontext (Joining_Type:{L,D})(Joining_Type:T)*
1170	for(;;) {
1171	UJoiningType type=ubidi_getJoiningType(c);
1172	if(type==U_JT_TRANSPARENT) {
1173	if(j==`0`) {
1174	return false;
1175	}
1176	U16_PREV_UNSAFE(label, j, c);
1177	} else if(type==U_JT_LEFT_JOINING \|\| type==U_JT_DUAL_JOINING) {
1178	break; // precontext fulfilled
1179	} else {
1180	return false;
1181	}
1182	}
1183	// check postcontext (Joining_Type:T)(Joining_Type:{R,D})*
1184	for(j=i+`1`;;) {
1185	if(j==labelLength) {
1186	return false;
1187	}
1188	U16_NEXT_UNSAFE(label, j, c);
1189	UJoiningType type=ubidi_getJoiningType(c);
1190	if(type==U_JT_TRANSPARENT) {
1191	// just skip this character
1192	} else if(type==U_JT_RIGHT_JOINING \|\| type==U_JT_DUAL_JOINING) {
1193	break; // postcontext fulfilled
1194	} else {
1195	return false;
1196	}
1197	}
1198	} else if(label[i]==`0x200d`) {
1199	// Appendix A.2. ZERO WIDTH JOINER (U+200D)
1200	// Rule Set:
1201	// False;
1202	// If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
1203	if(i==`0`) {
1204	return false;
1205	}
1206	UChar32 c;
1207	int32_t j=i;
1208	U16_PREV_UNSAFE(label, j, c);
1209	if(uts46Norm2.getCombiningClass(c)!=`9`) {
1210	return false;
1211	}
1212	}
1213	}
1214	return true;
1215	}
1216
1217	void
1218	UTS46::checkLabelContextO(const char16_t label, int32_t labelLength, IDNAInfo &info) const* {
1219	int32_t labelEnd=labelLength-`1`; // inclusive
1220	int32_t arabicDigits=`0`; // -1 for 066x, +1 for 06Fx
1221	for(int32_t i=`0`; i<=labelEnd; ++i) {
1222	UChar32 c=label[i];
1223	if(c<`0xb7`) {
1224	// ASCII fastpath
1225	} else if(c<=`0x6f9`) {
1226	if(c==`0xb7`) {
1227	// Appendix A.3. MIDDLE DOT (U+00B7)
1228	// Rule Set:
1229	// False;
1230	// If Before(cp) .eq. U+006C And
1231	// After(cp) .eq. U+006C Then True;
1232	if(!(`0`<i && label[i-`1`]==`0x6c` &&
1233	i<labelEnd && label[i+`1`]==`0x6c`)) {
1234	info.labelErrors\|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1235	}
1236	} else if(c==`0x375`) {
1237	// Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
1238	// Rule Set:
1239	// False;
1240	// If Script(After(cp)) .eq. Greek Then True;
1241	UScriptCode script=USCRIPT_INVALID_CODE;
1242	if(i<labelEnd) {
1243	UErrorCode errorCode=U_ZERO_ERROR;
1244	int32_t j=i+`1`;
1245	U16_NEXT(label, j, labelLength, c);
1246	script=uscript_getScript(c, &errorCode);
1247	}
1248	if(script!=USCRIPT_GREEK) {
1249	info.labelErrors\|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1250	}
1251	} else if(c==`0x5f3` \|\| c==`0x5f4`) {
1252	// Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
1253	// Rule Set:
1254	// False;
1255	// If Script(Before(cp)) .eq. Hebrew Then True;
1256	//
1257	// Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
1258	// Rule Set:
1259	// False;
1260	// If Script(Before(cp)) .eq. Hebrew Then True;
1261	UScriptCode script=USCRIPT_INVALID_CODE;
1262	if(`0`<i) {
1263	UErrorCode errorCode=U_ZERO_ERROR;
1264	int32_t j=i;
1265	U16_PREV(label, `0`, j, c);
1266	script=uscript_getScript(c, &errorCode);
1267	}
1268	if(script!=USCRIPT_HEBREW) {
1269	info.labelErrors\|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1270	}
1271	} else if(`0x660`<=c / && c<=0x6f9 /) {
1272	// Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
1273	// Rule Set:
1274	// True;
1275	// For All Characters:
1276	// If cp .in. 06F0..06F9 Then False;
1277	// End For;
1278	//
1279	// Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
1280	// Rule Set:
1281	// True;
1282	// For All Characters:
1283	// If cp .in. 0660..0669 Then False;
1284	// End For;
1285	if(c<=`0x669`) {
1286	if(arabicDigits>`0`) {
1287	info.labelErrors\|=UIDNA_ERROR_CONTEXTO_DIGITS;
1288	}
1289	arabicDigits=-`1`;
1290	} else if(`0x6f0`<=c) {
1291	if(arabicDigits<`0`) {
1292	info.labelErrors\|=UIDNA_ERROR_CONTEXTO_DIGITS;
1293	}
1294	arabicDigits=`1`;
1295	}
1296	}
1297	} else if(c==`0x30fb`) {
1298	// Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
1299	// Rule Set:
1300	// False;
1301	// For All Characters:
1302	// If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
1303	// End For;
1304	UErrorCode errorCode=U_ZERO_ERROR;
1305	for(int j=`0`;;) {
1306	if(j>labelEnd) {
1307	info.labelErrors\|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1308	break;
1309	}
1310	U16_NEXT(label, j, labelLength, c);
1311	UScriptCode script=uscript_getScript(c, &errorCode);
1312	if(script==USCRIPT_HIRAGANA \|\| script==USCRIPT_KATAKANA \|\| script==USCRIPT_HAN) {
1313	break;
1314	}
1315	}
1316	}
1317	}
1318	}
1319
1320	U_NAMESPACE_END
1321
1322	// C API ------------------------------------------------------------------- ***
1323
1324	U_NAMESPACE_USE
1325
1326	U_CAPI UIDNA * U_EXPORT2
1327	uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) {
1328	return reinterpret_cast<UIDNA >(IDNA::createUTS46Instance(options, pErrorCode));
1329	}
1330
1331	U_CAPI void U_EXPORT2
1332	uidna_close(UIDNA *idna) {
1333	delete reinterpret_cast<IDNA *>(idna);
1334	}
1335
1336	static UBool
1337	checkArgs(const void *label, int32_t length,
1338	void *dest, int32_t capacity,
1339	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1340	if(U_FAILURE(*pErrorCode)) {
1341	return false;
1342	}
1343	// sizeof(UIDNAInfo)=16 in the first API version.
1344	if(pInfo==nullptr \|\| pInfo->size<`16`) {
1345	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1346	return false;
1347	}
1348	if( (label==nullptr ? length!=`0` : length<-`1`) \|\|
1349	(dest==nullptr ? capacity!=`0` : capacity<`0`) \|\|
1350	(dest==label && label!=nullptr)
1351	) {
1352	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1353	return false;
1354	}
1355	// Set all pInfo bytes to 0 except for the size field itself.*
1356	uprv_memset(&pInfo->size+`1`, `0`, pInfo->size-sizeof(pInfo->size));
1357	return true;
1358	}
1359
1360	static void
1361	idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) {
1362	pInfo->isTransitionalDifferent=info.isTransitionalDifferent();
1363	pInfo->errors=info.getErrors();
1364	}
1365
1366	U_CAPI int32_t U_EXPORT2
1367	uidna_labelToASCII(const UIDNA *idna,
1368	const char16_t *label, int32_t length,
1369	char16_t *dest, int32_t capacity,
1370	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1371	if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1372	return `0`;
1373	}
1374	UnicodeString src((UBool)(length<`0`), label, length);
1375	UnicodeString destString(dest, `0`, capacity);
1376	IDNAInfo info;
1377	reinterpret_cast<const IDNA >(idna)->labelToASCII(src, destString, info, pErrorCode);
1378	idnaInfoToStruct(info, pInfo);
1379	return destString.extract(dest, capacity, *pErrorCode);
1380	}
1381
1382	U_CAPI int32_t U_EXPORT2
1383	uidna_labelToUnicode(const UIDNA *idna,
1384	const char16_t *label, int32_t length,
1385	char16_t *dest, int32_t capacity,
1386	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1387	if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1388	return `0`;
1389	}
1390	UnicodeString src((UBool)(length<`0`), label, length);
1391	UnicodeString destString(dest, `0`, capacity);
1392	IDNAInfo info;
1393	reinterpret_cast<const IDNA >(idna)->labelToUnicode(src, destString, info, pErrorCode);
1394	idnaInfoToStruct(info, pInfo);
1395	return destString.extract(dest, capacity, *pErrorCode);
1396	}
1397
1398	U_CAPI int32_t U_EXPORT2
1399	uidna_nameToASCII(const UIDNA *idna,
1400	const char16_t *name, int32_t length,
1401	char16_t *dest, int32_t capacity,
1402	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1403	if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1404	return `0`;
1405	}
1406	UnicodeString src((UBool)(length<`0`), name, length);
1407	UnicodeString destString(dest, `0`, capacity);
1408	IDNAInfo info;
1409	reinterpret_cast<const IDNA >(idna)->nameToASCII(src, destString, info, pErrorCode);
1410	idnaInfoToStruct(info, pInfo);
1411	return destString.extract(dest, capacity, *pErrorCode);
1412	}
1413
1414	U_CAPI int32_t U_EXPORT2
1415	uidna_nameToUnicode(const UIDNA *idna,
1416	const char16_t *name, int32_t length,
1417	char16_t *dest, int32_t capacity,
1418	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1419	if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1420	return `0`;
1421	}
1422	UnicodeString src((UBool)(length<`0`), name, length);
1423	UnicodeString destString(dest, `0`, capacity);
1424	IDNAInfo info;
1425	reinterpret_cast<const IDNA >(idna)->nameToUnicode(src, destString, info, pErrorCode);
1426	idnaInfoToStruct(info, pInfo);
1427	return destString.extract(dest, capacity, *pErrorCode);
1428	}
1429
1430	U_CAPI int32_t U_EXPORT2
1431	uidna_labelToASCII_UTF8(const UIDNA *idna,
1432	const char *label, int32_t length,
1433	char *dest, int32_t capacity,
1434	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1435	if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1436	return `0`;
1437	}
1438	StringPiece src(label, length<`0` ? static_cast<int32_t>(uprv_strlen(label)) : length);
1439	CheckedArrayByteSink sink(dest, capacity);
1440	IDNAInfo info;
1441	reinterpret_cast<const IDNA >(idna)->labelToASCII_UTF8(src, sink, info, pErrorCode);
1442	idnaInfoToStruct(info, pInfo);
1443	return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1444	}
1445
1446	U_CAPI int32_t U_EXPORT2
1447	uidna_labelToUnicodeUTF8(const UIDNA *idna,
1448	const char *label, int32_t length,
1449	char *dest, int32_t capacity,
1450	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1451	if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1452	return `0`;
1453	}
1454	StringPiece src(label, length<`0` ? static_cast<int32_t>(uprv_strlen(label)) : length);
1455	CheckedArrayByteSink sink(dest, capacity);
1456	IDNAInfo info;
1457	reinterpret_cast<const IDNA >(idna)->labelToUnicodeUTF8(src, sink, info, pErrorCode);
1458	idnaInfoToStruct(info, pInfo);
1459	return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1460	}
1461
1462	U_CAPI int32_t U_EXPORT2
1463	uidna_nameToASCII_UTF8(const UIDNA *idna,
1464	const char *name, int32_t length,
1465	char *dest, int32_t capacity,
1466	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1467	if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1468	return `0`;
1469	}
1470	StringPiece src(name, length<`0` ? static_cast<int32_t>(uprv_strlen(name)) : length);
1471	CheckedArrayByteSink sink(dest, capacity);
1472	IDNAInfo info;
1473	reinterpret_cast<const IDNA >(idna)->nameToASCII_UTF8(src, sink, info, pErrorCode);
1474	idnaInfoToStruct(info, pInfo);
1475	return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1476	}
1477
1478	U_CAPI int32_t U_EXPORT2
1479	uidna_nameToUnicodeUTF8(const UIDNA *idna,
1480	const char *name, int32_t length,
1481	char *dest, int32_t capacity,
1482	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1483	if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1484	return `0`;
1485	}
1486	StringPiece src(name, length<`0` ? static_cast<int32_t>(uprv_strlen(name)) : length);
1487	CheckedArrayByteSink sink(dest, capacity);
1488	IDNAInfo info;
1489	reinterpret_cast<const IDNA >(idna)->nameToUnicodeUTF8(src, sink, info, pErrorCode);
1490	idnaInfoToStruct(info, pInfo);
1491	return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1492	}
1493
1494	#endif // UCONFIG_NO_IDNA
1495

Browse the source code of Godot/thirdparty/icu4c/common/uts46.cpp