// uts46.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/uts46.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	* Copyright (C) 2010-2015, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	*******************************************************************************
8	* file name: uts46.cpp
9	* encoding: UTF-8
10	* tab size: 8 (not used)
11	* indentation:4
12	*
13	* created on: 2010mar09
14	* created by: Markus W. Scherer
15	*/
16
17	#include "unicode/utypes.h"
18
19	#if !UCONFIG_NO_IDNA
20
21	#include "unicode/idna.h"
22	#include "unicode/normalizer2.h"
23	#include "unicode/uscript.h"
24	#include "unicode/ustring.h"
25	#include "unicode/utf16.h"
26	#include "cmemory.h"
27	#include "cstring.h"
28	#include "punycode.h"
29	#include "ubidi_props.h"
30	#include "ustr_imp.h"
31
32	// Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG:
33	//
34	// The domain name length limit is 255 octets in an internal DNS representation
35	// where the last ("root") label is the empty label
36	// represented by length byte 0 alone.
37	// In a conventional string, this translates to 253 characters, or 254
38	// if there is a trailing dot for the root label.
39
40	U_NAMESPACE_BEGIN
41
42	// Severe errors which usually result in a U+FFFD replacement character in the result string.
43	const uint32_t severeErrors=
44	UIDNA_ERROR_LEADING_COMBINING_MARK\|
45	UIDNA_ERROR_DISALLOWED\|
46	UIDNA_ERROR_PUNYCODE\|
47	UIDNA_ERROR_LABEL_HAS_DOT\|
48	UIDNA_ERROR_INVALID_ACE_LABEL;
49
50	static inline UBool
51	isASCIIString(const UnicodeString &dest) {
52	const UChar *s=dest.getBuffer();
53	const UChar *limit=s+dest.length();
54	while(s<limit) {
55	if(*s++>`0x7f`) {
56	return FALSE;
57	}
58	}
59	return TRUE;
60	}
61
62	static UBool
63	isASCIIOkBiDi(const UChar *s, int32_t length);
64
65	static UBool
66	isASCIIOkBiDi(const char *s, int32_t length);
67
68	// IDNA class default implementations -------------------------------------- ***
69
70	IDNA::~IDNA() {}
71
72	void
73	IDNA::labelToASCII_UTF8(StringPiece label, ByteSink &dest,
74	IDNAInfo &info, UErrorCode &errorCode) const {
75	if(U_SUCCESS(errorCode)) {
76	UnicodeString destString;
77	labelToASCII(UnicodeString::fromUTF8(label), destString,
78	info, errorCode).toUTF8(dest);
79	}
80	}
81
82	void
83	IDNA::labelToUnicodeUTF8(StringPiece label, ByteSink &dest,
84	IDNAInfo &info, UErrorCode &errorCode) const {
85	if(U_SUCCESS(errorCode)) {
86	UnicodeString destString;
87	labelToUnicode(UnicodeString::fromUTF8(label), destString,
88	info, errorCode).toUTF8(dest);
89	}
90	}
91
92	void
93	IDNA::nameToASCII_UTF8(StringPiece name, ByteSink &dest,
94	IDNAInfo &info, UErrorCode &errorCode) const {
95	if(U_SUCCESS(errorCode)) {
96	UnicodeString destString;
97	nameToASCII(UnicodeString::fromUTF8(name), destString,
98	info, errorCode).toUTF8(dest);
99	}
100	}
101
102	void
103	IDNA::nameToUnicodeUTF8(StringPiece name, ByteSink &dest,
104	IDNAInfo &info, UErrorCode &errorCode) const {
105	if(U_SUCCESS(errorCode)) {
106	UnicodeString destString;
107	nameToUnicode(UnicodeString::fromUTF8(name), destString,
108	info, errorCode).toUTF8(dest);
109	}
110	}
111
112	// UTS46 class declaration ------------------------------------------------- ***
113
114	class UTS46 : public IDNA {
115	public:
116	UTS46(uint32_t options, UErrorCode &errorCode);
117	virtual ~UTS46();
118
119	virtual UnicodeString &
120	labelToASCII(const UnicodeString &label, UnicodeString &dest,
121	IDNAInfo &info, UErrorCode &errorCode) const;
122
123	virtual UnicodeString &
124	labelToUnicode(const UnicodeString &label, UnicodeString &dest,
125	IDNAInfo &info, UErrorCode &errorCode) const;
126
127	virtual UnicodeString &
128	nameToASCII(const UnicodeString &name, UnicodeString &dest,
129	IDNAInfo &info, UErrorCode &errorCode) const;
130
131	virtual UnicodeString &
132	nameToUnicode(const UnicodeString &name, UnicodeString &dest,
133	IDNAInfo &info, UErrorCode &errorCode) const;
134
135	virtual void
136	labelToASCII_UTF8(StringPiece label, ByteSink &dest,
137	IDNAInfo &info, UErrorCode &errorCode) const;
138
139	virtual void
140	labelToUnicodeUTF8(StringPiece label, ByteSink &dest,
141	IDNAInfo &info, UErrorCode &errorCode) const;
142
143	virtual void
144	nameToASCII_UTF8(StringPiece name, ByteSink &dest,
145	IDNAInfo &info, UErrorCode &errorCode) const;
146
147	virtual void
148	nameToUnicodeUTF8(StringPiece name, ByteSink &dest,
149	IDNAInfo &info, UErrorCode &errorCode) const;
150
151	private:
152	UnicodeString &
153	process(const UnicodeString &src,
154	UBool isLabel, UBool toASCII,
155	UnicodeString &dest,
156	IDNAInfo &info, UErrorCode &errorCode) const;
157
158	void
159	processUTF8(StringPiece src,
160	UBool isLabel, UBool toASCII,
161	ByteSink &dest,
162	IDNAInfo &info, UErrorCode &errorCode) const;
163
164	UnicodeString &
165	processUnicode(const UnicodeString &src,
166	int32_t labelStart, int32_t mappingStart,
167	UBool isLabel, UBool toASCII,
168	UnicodeString &dest,
169	IDNAInfo &info, UErrorCode &errorCode) const;
170
171	// returns the new dest.length()
172	int32_t
173	mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
174	UErrorCode &errorCode) const;
175
176	// returns the new label length
177	int32_t
178	processLabel(UnicodeString &dest,
179	int32_t labelStart, int32_t labelLength,
180	UBool toASCII,
181	IDNAInfo &info, UErrorCode &errorCode) const;
182	int32_t
183	markBadACELabel(UnicodeString &dest,
184	int32_t labelStart, int32_t labelLength,
185	UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const;
186
187	void
188	checkLabelBiDi(const UChar label, int32_t labelLength, IDNAInfo &info) const*;
189
190	UBool
191	isLabelOkContextJ(const UChar label, int32_t labelLength) const*;
192
193	void
194	checkLabelContextO(const UChar label, int32_t labelLength, IDNAInfo &info) const*;
195
196	const Normalizer2 &uts46Norm2; // uts46.nrm
197	uint32_t options;
198	};
199
200	IDNA *
201	IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) {
202	if(U_SUCCESS(errorCode)) {
203	IDNA idna=new* UTS46 (options, errorCode);
204	if(idna==NULL) {
205	errorCode=U_MEMORY_ALLOCATION_ERROR;
206	} else if(U_FAILURE(errorCode)) {
207	delete idna;
208	idna=NULL;
209	}
210	return idna;
211	} else {
212	return NULL;
213	}
214	}
215
216	// UTS46 implementation ---------------------------------------------------- ***
217
218	UTS46::UTS46(uint32_t opt, UErrorCode &errorCode)
219	: uts46Norm2(*Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, errorCode)),
220	options(opt) {}
221
222	UTS46::~UTS46() {}
223
224	UnicodeString &
225	UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest,
226	IDNAInfo &info, UErrorCode &errorCode) const {
227	return process(label, TRUE, TRUE, dest, info, errorCode);
228	}
229
230	UnicodeString &
231	UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest,
232	IDNAInfo &info, UErrorCode &errorCode) const {
233	return process(label, TRUE, FALSE, dest, info, errorCode);
234	}
235
236	UnicodeString &
237	UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest,
238	IDNAInfo &info, UErrorCode &errorCode) const {
239	process(name, FALSE, TRUE, dest, info, errorCode);
240	if( dest.length()>=`254` && (info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==`0` &&
241	isASCIIString(dest) &&
242	(dest.length()>`254` \|\| dest [`253`]!=`0x2e`)
243	) {
244	info.errors\|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
245	}
246	return dest;
247	}
248
249	UnicodeString &
250	UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest,
251	IDNAInfo &info, UErrorCode &errorCode) const {
252	return process(name, FALSE, FALSE, dest, info, errorCode);
253	}
254
255	void
256	UTS46::labelToASCII_UTF8(StringPiece label, ByteSink &dest,
257	IDNAInfo &info, UErrorCode &errorCode) const {
258	processUTF8(label, TRUE, TRUE, dest, info, errorCode);
259	}
260
261	void
262	UTS46::labelToUnicodeUTF8(StringPiece label, ByteSink &dest,
263	IDNAInfo &info, UErrorCode &errorCode) const {
264	processUTF8(label, TRUE, FALSE, dest, info, errorCode);
265	}
266
267	void
268	UTS46::nameToASCII_UTF8(StringPiece name, ByteSink &dest,
269	IDNAInfo &info, UErrorCode &errorCode) const {
270	processUTF8(name, FALSE, TRUE, dest, info, errorCode);
271	}
272
273	void
274	UTS46::nameToUnicodeUTF8(StringPiece name, ByteSink &dest,
275	IDNAInfo &info, UErrorCode &errorCode) const {
276	processUTF8(name, FALSE, FALSE, dest, info, errorCode);
277	}
278
// UTS #46 data for ASCII characters.
// The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
// and passes through all other ASCII characters.
// If UIDNA_USE_STD3_RULES is set, then non-LDH characters are disallowed
// using this data.
// The ASCII fastpath also uses this data.
// Values: -1=disallowed 0==valid 1==mapped (lowercase)
// The table is indexed by the code point value 0..0x7f, 16 entries per row.
static const int8_t asciiData[128]={
    // 0000..001F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 002D..002E; valid  #  HYPHEN-MINUS..FULL STOP
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
    // 0030..0039; valid  #  DIGIT ZERO..DIGIT NINE
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
    // 0041..005A; mapped  #  LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
    -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
    // 0061..007A; valid  #  LATIN SMALL LETTER A..LATIN SMALL LETTER Z
    -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
};
300
301	UnicodeString &
302	UTS46::process(const UnicodeString &src,
303	UBool isLabel, UBool toASCII,
304	UnicodeString &dest,
305	IDNAInfo &info, UErrorCode &errorCode) const {
306	// uts46Norm2.normalize() would do all of this error checking and setup,
307	// but with the ASCII fastpath we do not always call it, and do not
308	// call it first.
309	if(U_FAILURE(errorCode)) {
310	dest.setToBogus();
311	return dest;
312	}
313	const UChar *srcArray=src.getBuffer();
314	if(&dest==&src \|\| srcArray==NULL) {
315	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
316	dest.setToBogus();
317	return dest;
318	}
319	// Arguments are fine, reset output values.
320	dest.remove();
321	info.reset();
322	int32_t srcLength=src.length();
323	if(srcLength==`0`) {
324	info.errors\|=UIDNA_ERROR_EMPTY_LABEL;
325	return dest;
326	}
327	UChar *destArray=dest.getBuffer(srcLength);
328	if(destArray==NULL) {
329	errorCode=U_MEMORY_ALLOCATION_ERROR;
330	return dest;
331	}
332	// ASCII fastpath
333	UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=`0`;
334	int32_t labelStart=`0`;
335	int32_t i;
336	for(i=`0`;; ++i) {
337	if(i==srcLength) {
338	if(toASCII) {
339	if((i-labelStart)>`63`) {
340	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
341	}
342	// There is a trailing dot if labelStart==i.
343	if(!isLabel && i>=`254` && (i>`254` \|\| labelStart<i)) {
344	info.errors\|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
345	}
346	}
347	info.errors\|=info.labelErrors;
348	dest.releaseBuffer(i);
349	return dest;
350	}
351	UChar c=srcArray[i];
352	if(c>`0x7f`) {
353	break;
354	}
355	int cData=asciiData[c];
356	if(cData>`0`) {
357	destArray[i]=c+`0x20`; // Lowercase an uppercase ASCII letter.
358	} else if(cData<`0` && disallowNonLDHDot) {
359	break; // Replacing with U+FFFD can be complicated for toASCII.
360	} else {
361	destArray[i]=c;
362	if(c==`0x2d`) { // hyphen
363	if(i==(labelStart+`3`) && srcArray[i-`1`]==`0x2d`) {
364	// "??--..." is Punycode or forbidden.
365	++i; // '-' was copied to dest already
366	break;
367	}
368	if(i==labelStart) {
369	// label starts with "-"
370	info.labelErrors\|=UIDNA_ERROR_LEADING_HYPHEN;
371	}
372	if((i+`1`)==srcLength \|\| srcArray[i+`1`]==`0x2e`) {
373	// label ends with "-"
374	info.labelErrors\|=UIDNA_ERROR_TRAILING_HYPHEN;
375	}
376	} else if(c==`0x2e`) { // dot
377	if(isLabel) {
378	// Replacing with U+FFFD can be complicated for toASCII.
379	++i; // '.' was copied to dest already
380	break;
381	}
382	if(i==labelStart) {
383	info.labelErrors\|=UIDNA_ERROR_EMPTY_LABEL;
384	}
385	if(toASCII && (i-labelStart)>`63`) {
386	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
387	}
388	info.errors\|=info.labelErrors;
389	info.labelErrors=`0`;
390	labelStart=i+`1`;
391	}
392	}
393	}
394	info.errors\|=info.labelErrors;
395	dest.releaseBuffer(i);
396	processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode);
397	if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==`0` &&
398	(!info.isOkBiDi \|\| (labelStart>`0` && !isASCIIOkBiDi(dest.getBuffer(), labelStart)))
399	) {
400	info.errors\|=UIDNA_ERROR_BIDI;
401	}
402	return dest;
403	}
404
405	void
406	UTS46::processUTF8(StringPiece src,
407	UBool isLabel, UBool toASCII,
408	ByteSink &dest,
409	IDNAInfo &info, UErrorCode &errorCode) const {
410	if(U_FAILURE(errorCode)) {
411	return;
412	}
413	const char *srcArray=src.data();
414	int32_t srcLength=src.length();
415	if(srcArray==NULL && srcLength!=`0`) {
416	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
417	return;
418	}
419	// Arguments are fine, reset output values.
420	info.reset();
421	if(srcLength==`0`) {
422	info.errors\|=UIDNA_ERROR_EMPTY_LABEL;
423	dest.Flush();
424	return;
425	}
426	UnicodeString destString;
427	int32_t labelStart=`0`;
428	if(srcLength<=`256`) { // length of stackArray[]
429	// ASCII fastpath
430	char stackArray[`256`];
431	int32_t destCapacity;
432	char *destArray=dest.GetAppendBuffer(srcLength, srcLength+`20`,
433	stackArray, UPRV_LENGTHOF(stackArray), &destCapacity);
434	UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=`0`;
435	int32_t i;
436	for(i=`0`;; ++i) {
437	if(i==srcLength) {
438	if(toASCII) {
439	if((i-labelStart)>`63`) {
440	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
441	}
442	// There is a trailing dot if labelStart==i.
443	if(!isLabel && i>=`254` && (i>`254` \|\| labelStart<i)) {
444	info.errors\|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
445	}
446	}
447	info.errors\|=info.labelErrors;
448	dest.Append(destArray, i);
449	dest.Flush();
450	return;
451	}
452	char c=srcArray[i];
453	if((int8_t)c<`0`) { // (uint8_t)c>0x7f
454	break;
455	}
456	int cData=asciiData[(int)c]; // Cast: gcc warns about indexing with a char.
457	if(cData>`0`) {
458	destArray[i]=c+`0x20`; // Lowercase an uppercase ASCII letter.
459	} else if(cData<`0` && disallowNonLDHDot) {
460	break; // Replacing with U+FFFD can be complicated for toASCII.
461	} else {
462	destArray[i]=c;
463	if(c==`0x2d`) { // hyphen
464	if(i==(labelStart+`3`) && srcArray[i-`1`]==`0x2d`) {
465	// "??--..." is Punycode or forbidden.
466	break;
467	}
468	if(i==labelStart) {
469	// label starts with "-"
470	info.labelErrors\|=UIDNA_ERROR_LEADING_HYPHEN;
471	}
472	if((i+`1`)==srcLength \|\| srcArray[i+`1`]==`0x2e`) {
473	// label ends with "-"
474	info.labelErrors\|=UIDNA_ERROR_TRAILING_HYPHEN;
475	}
476	} else if(c==`0x2e`) { // dot
477	if(isLabel) {
478	break; // Replacing with U+FFFD can be complicated for toASCII.
479	}
480	if(i==labelStart) {
481	info.labelErrors\|=UIDNA_ERROR_EMPTY_LABEL;
482	}
483	if(toASCII && (i-labelStart)>`63`) {
484	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
485	}
486	info.errors\|=info.labelErrors;
487	info.labelErrors=`0`;
488	labelStart=i+`1`;
489	}
490	}
491	}
492	info.errors\|=info.labelErrors;
493	// Convert the processed ASCII prefix of the current label to UTF-16.
494	int32_t mappingStart=i-labelStart;
495	destString =UnicodeString::fromUTF8(StringPiece (destArray+labelStart, mappingStart));
496	// Output the previous ASCII labels and process the rest of src in UTF-16.
497	dest.Append(destArray, labelStart);
498	processUnicode(UnicodeString::fromUTF8(StringPiece (src, labelStart)), `0`, mappingStart,
499	isLabel, toASCII,
500	destString, info, errorCode);
501	} else {
502	// src is too long for the ASCII fastpath implementation.
503	processUnicode(UnicodeString::fromUTF8(src), `0`, `0`,
504	isLabel, toASCII,
505	destString, info, errorCode);
506	}
507	destString.toUTF8(dest); // calls dest.Flush()
508	if(toASCII && !isLabel) {
509	// length==labelStart==254 means that there is a trailing dot (ok) and
510	// destString is empty (do not index at 253-labelStart).
511	int32_t length=labelStart+destString.length();
512	if( length>=`254` && isASCIIString(destString) &&
513	(length>`254` \|\|
514	(labelStart<`254` && destString [`253`-labelStart]!=`0x2e`))
515	) {
516	info.errors\|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
517	}
518	}
519	if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==`0` &&
520	(!info.isOkBiDi \|\| (labelStart>`0` && !isASCIIOkBiDi(srcArray, labelStart)))
521	) {
522	info.errors\|=UIDNA_ERROR_BIDI;
523	}
524	}
525
526	UnicodeString &
527	UTS46::processUnicode(const UnicodeString &src,
528	int32_t labelStart, int32_t mappingStart,
529	UBool isLabel, UBool toASCII,
530	UnicodeString &dest,
531	IDNAInfo &info, UErrorCode &errorCode) const {
532	if(mappingStart==`0`) {
533	uts46Norm2.normalize(src, dest, errorCode);
534	} else {
535	uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode);
536	}
537	if(U_FAILURE(errorCode)) {
538	return dest;
539	}
540	UBool doMapDevChars=
541	toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==`0` :
542	(options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==`0`;
543	const UChar *destArray=dest.getBuffer();
544	int32_t destLength=dest.length();
545	int32_t labelLimit=labelStart;
546	while(labelLimit<destLength) {
547	UChar c=destArray[labelLimit];
548	if(c==`0x2e` && !isLabel) {
549	int32_t labelLength=labelLimit-labelStart;
550	int32_t newLength=processLabel(dest, labelStart, labelLength,
551	toASCII, info, errorCode);
552	info.errors\|=info.labelErrors;
553	info.labelErrors=`0`;
554	if(U_FAILURE(errorCode)) {
555	return dest;
556	}
557	destArray=dest.getBuffer();
558	destLength+=newLength-labelLength;
559	labelLimit=labelStart+=newLength+`1`;
560	continue;
561	} else if(c<`0xdf`) {
562	// pass
563	} else if(c<=`0x200d` && (c==`0xdf` \|\| c==`0x3c2` \|\| c>=`0x200c`)) {
564	info.isTransDiff=TRUE;
565	if(doMapDevChars) {
566	destLength=mapDevChars(dest, labelStart, labelLimit, errorCode);
567	if(U_FAILURE(errorCode)) {
568	return dest;
569	}
570	destArray=dest.getBuffer();
571	// All deviation characters have been mapped, no need to check for them again.
572	doMapDevChars=FALSE;
573	// Do not increment labelLimit in case c was removed.
574	continue;
575	}
576	} else if(U16_IS_SURROGATE(c)) {
577	if(U16_IS_SURROGATE_LEAD(c) ?
578	(labelLimit+`1`)==destLength \|\| !U16_IS_TRAIL(destArray[labelLimit+`1`]) :
579	labelLimit==labelStart \|\| !U16_IS_LEAD(destArray[labelLimit-`1`])) {
580	// Map an unpaired surrogate to U+FFFD before normalization so that when
581	// that removes characters we do not turn two unpaired ones into a pair.
582	info.labelErrors\|=UIDNA_ERROR_DISALLOWED;
583	dest.setCharAt(labelLimit, `0xfffd`);
584	destArray=dest.getBuffer();
585	}
586	}
587	++labelLimit;
588	}
589	// Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
590	// but not an empty label elsewhere nor a completely empty domain name.
591	// processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
592	if(`0`==labelStart \|\| labelStart<labelLimit) {
593	processLabel(dest, labelStart, labelLimit-labelStart,
594	toASCII, info, errorCode);
595	info.errors\|=info.labelErrors;
596	}
597	return dest;
598	}
599
600	int32_t
601	UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
602	UErrorCode &errorCode) const {
603	if(U_FAILURE(errorCode)) {
604	return `0`;
605	}
606	int32_t length=dest.length();
607	UChar *s=dest.getBuffer(dest [mappingStart]==`0xdf` ? length+`1` : length);
608	if(s==NULL) {
609	errorCode=U_MEMORY_ALLOCATION_ERROR;
610	return length;
611	}
612	int32_t capacity=dest.getCapacity();
613	UBool didMapDevChars=FALSE;
614	int32_t readIndex=mappingStart, writeIndex=mappingStart;
615	do {
616	UChar c=s[readIndex++];
617	switch(c) {
618	case `0xdf`:
619	// Map sharp s to ss.
620	didMapDevChars=TRUE;
621	s[writeIndex++]=`0x73`; // Replace sharp s with first s.
622	// Insert second s and account for possible buffer reallocation.
623	if(writeIndex==readIndex) {
624	if(length==capacity) {
625	dest.releaseBuffer(length);
626	s=dest.getBuffer(length+`1`);
627	if(s==NULL) {
628	errorCode=U_MEMORY_ALLOCATION_ERROR;
629	return length;
630	}
631	capacity=dest.getCapacity();
632	}
633	u_memmove(s+writeIndex+`1`, s+writeIndex, length-writeIndex);
634	++readIndex;
635	}
636	s[writeIndex++]=`0x73`;
637	++length;
638	break;
639	case `0x3c2`: // Map final sigma to nonfinal sigma.
640	didMapDevChars=TRUE;
641	s[writeIndex++]=`0x3c3`;
642	break;
643	case `0x200c`: // Ignore/remove ZWNJ.
644	case `0x200d`: // Ignore/remove ZWJ.
645	didMapDevChars=TRUE;
646	--length;
647	break;
648	default:
649	// Only really necessary if writeIndex was different from readIndex.
650	s[writeIndex++]=c;
651	break;
652	}
653	} while(writeIndex<length);
654	dest.releaseBuffer(length);
655	if(didMapDevChars) {
656	// Mapping deviation characters might have resulted in an un-NFC string.
657	// We could use either the NFC or the UTS #46 normalizer.
658	// By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
659	UnicodeString normalized;
660	uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCode);
661	if(U_SUCCESS(errorCode)) {
662	dest.replace(labelStart, `0x7fffffff`, normalized);
663	if(dest.isBogus()) {
664	errorCode=U_MEMORY_ALLOCATION_ERROR;
665	}
666	return dest.length();
667	}
668	}
669	return length;
670	}
671
672	// Some non-ASCII characters are equivalent to sequences with
673	// non-LDH ASCII characters. To find them:
674	// grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)
675	static inline UBool
676	isNonASCIIDisallowedSTD3Valid(UChar32 c) {
677	return c==`0x2260` \|\| c==`0x226E` \|\| c==`0x226F`;
678	}
679
680	// Replace the label in dest with the label string, if the label was modified.
681	// If &label==&dest then the label was modified in-place and labelLength
682	// is the new label length, different from label.length().
683	// If &label!=&dest then labelLength==label.length().
684	// Returns labelLength (= the new label length).
685	static int32_t
686	replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLength,
687	const UnicodeString &label, int32_t labelLength, UErrorCode &errorCode) {
688	if(U_FAILURE(errorCode)) {
689	return `0`;
690	}
691	if(&label!=&dest) {
692	dest.replace(destLabelStart, destLabelLength, label);
693	if(dest.isBogus()) {
694	errorCode=U_MEMORY_ALLOCATION_ERROR;
695	return `0`;
696	}
697	}
698	return labelLength;
699	}
700
701	int32_t
702	UTS46::processLabel(UnicodeString &dest,
703	int32_t labelStart, int32_t labelLength,
704	UBool toASCII,
705	IDNAInfo &info, UErrorCode &errorCode) const {
706	if(U_FAILURE(errorCode)) {
707	return `0`;
708	}
709	UnicodeString fromPunycode;
710	UnicodeString *labelString;
711	const UChar *label=dest.getBuffer()+labelStart;
712	int32_t destLabelStart=labelStart;
713	int32_t destLabelLength=labelLength;
714	UBool wasPunycode;
715	if(labelLength>=`4` && label[`0`]==`0x78` && label[`1`]==`0x6e` && label[`2`]==`0x2d` && label[`3`]==`0x2d`) {
716	// Label starts with "xn--", try to un-Punycode it.
717	wasPunycode=TRUE;
718	UChar unicodeBuffer=fromPunycode.getBuffer(-`1`); // capacity==-1: most labels should fit*
719	if(unicodeBuffer==NULL) {
720	// Should never occur if we used capacity==-1 which uses the internal buffer.
721	errorCode=U_MEMORY_ALLOCATION_ERROR;
722	return labelLength;
723	}
724	UErrorCode punycodeErrorCode=U_ZERO_ERROR;
725	int32_t unicodeLength=u_strFromPunycode(label+`4`, labelLength-`4`,
726	unicodeBuffer, fromPunycode.getCapacity(),
727	NULL, &punycodeErrorCode);
728	if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) {
729	fromPunycode.releaseBuffer(`0`);
730	unicodeBuffer=fromPunycode.getBuffer(unicodeLength);
731	if(unicodeBuffer==NULL) {
732	errorCode=U_MEMORY_ALLOCATION_ERROR;
733	return labelLength;
734	}
735	punycodeErrorCode=U_ZERO_ERROR;
736	unicodeLength=u_strFromPunycode(label+`4`, labelLength-`4`,
737	unicodeBuffer, fromPunycode.getCapacity(),
738	NULL, &punycodeErrorCode);
739	}
740	fromPunycode.releaseBuffer(unicodeLength);
741	if(U_FAILURE(punycodeErrorCode)) {
742	info.labelErrors\|=UIDNA_ERROR_PUNYCODE;
743	return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
744	}
745	// Check for NFC, and for characters that are not
746	// valid or deviation characters according to the normalizer.
747	// If there is something wrong, then the string will change.
748	// Note that the normalizer passes through non-LDH ASCII and deviation characters.
749	// Deviation characters are ok in Punycode even in transitional processing.
750	// In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
751	// then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
752	UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode);
753	if(U_FAILURE(errorCode)) {
754	return labelLength;
755	}
756	if(!isValid) {
757	info.labelErrors\|=UIDNA_ERROR_INVALID_ACE_LABEL;
758	return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
759	}
760	labelString=&fromPunycode;
761	label=fromPunycode.getBuffer();
762	labelStart=`0`;
763	labelLength=fromPunycode.length();
764	} else {
765	wasPunycode=FALSE;
766	labelString=&dest;
767	}
768	// Validity check
769	if(labelLength==`0`) {
770	info.labelErrors\|=UIDNA_ERROR_EMPTY_LABEL;
771	return replaceLabel(dest, destLabelStart, destLabelLength,
772	*labelString, labelLength, errorCode);
773	}
774	// labelLength>0
775	if(labelLength>=`4` && label[`2`]==`0x2d` && label[`3`]==`0x2d`) {
776	// label starts with "??--"
777	info.labelErrors\|=UIDNA_ERROR_HYPHEN_3_4;
778	}
779	if(label[`0`]==`0x2d`) {
780	// label starts with "-"
781	info.labelErrors\|=UIDNA_ERROR_LEADING_HYPHEN;
782	}
783	if(label[labelLength-`1`]==`0x2d`) {
784	// label ends with "-"
785	info.labelErrors\|=UIDNA_ERROR_TRAILING_HYPHEN;
786	}
787	// If the label was not a Punycode label, then it was the result of
788	// mapping, normalization and label segmentation.
789	// If the label was in Punycode, then we mapped it again above
790	// and checked its validity.
791	// Now we handle the STD3 restriction to LDH characters (if set)
792	// and we look for U+FFFD which indicates disallowed characters
793	// in a non-Punycode label or U+FFFD itself in a Punycode label.
794	// We also check for dots which can come from the input to a single-label function.
795	// Ok to cast away const because we own the UnicodeString.
796	UChar s=(UChar )label;
797	const UChar *limit=label+labelLength;
798	UChar oredChars=`0`;
799	// If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
800	UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=`0`;
801	do {
802	UChar c=*s;
803	if(c<=`0x7f`) {
804	if(c==`0x2e`) {
805	info.labelErrors\|=UIDNA_ERROR_LABEL_HAS_DOT;
806	*s=`0xfffd`;
807	} else if(disallowNonLDHDot && asciiData[c]<`0`) {
808	info.labelErrors\|=UIDNA_ERROR_DISALLOWED;
809	*s=`0xfffd`;
810	}
811	} else {
812	oredChars\|=c;
813	if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
814	info.labelErrors\|=UIDNA_ERROR_DISALLOWED;
815	*s=`0xfffd`;
816	} else if(c==`0xfffd`) {
817	info.labelErrors\|=UIDNA_ERROR_DISALLOWED;
818	}
819	}
820	++s;
821	} while(s<limit);
822	// Check for a leading combining mark after other validity checks
823	// so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here.
824	UChar32 c;
825	int32_t cpLength=`0`;
826	// "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
827	U16_NEXT_UNSAFE(label, cpLength, c);
828	if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=`0`) {
829	info.labelErrors\|=UIDNA_ERROR_LEADING_COMBINING_MARK;
830	labelString->replace(labelStart, cpLength, (UChar)`0xfffd`);
831	label=labelString->getBuffer()+labelStart;
832	labelLength+=`1`-cpLength;
833	if(labelString==&dest) {
834	destLabelLength=labelLength;
835	}
836	}
837	if((info.labelErrors&severeErrors)==`0`) {
838	// Do contextual checks only if we do not have U+FFFD from a severe error
839	// because U+FFFD can make these checks fail.
840	if((options&UIDNA_CHECK_BIDI)!=`0` && (!info.isBiDi \|\| info.isOkBiDi)) {
841	checkLabelBiDi(label, labelLength, info);
842	}
843	if( (options&UIDNA_CHECK_CONTEXTJ)!=`0` && (oredChars&`0x200c`)==`0x200c` &&
844	!isLabelOkContextJ(label, labelLength)
845	) {
846	info.labelErrors\|=UIDNA_ERROR_CONTEXTJ;
847	}
848	if((options&UIDNA_CHECK_CONTEXTO)!=`0` && oredChars>=`0xb7`) {
849	checkLabelContextO(label, labelLength, info);
850	}
851	if(toASCII) {
852	if(wasPunycode) {
853	// Leave a Punycode label unchanged if it has no severe errors.
854	if(destLabelLength>`63`) {
855	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
856	}
857	return destLabelLength;
858	} else if(oredChars>=`0x80`) {
859	// Contains non-ASCII characters.
860	UnicodeString punycode;
861	UChar buffer=punycode.getBuffer(`63`); // 63==maximum DNS label length*
862	if(buffer==NULL) {
863	errorCode=U_MEMORY_ALLOCATION_ERROR;
864	return destLabelLength;
865	}
866	buffer[`0`]=`0x78`; // Write "xn--".
867	buffer[`1`]=`0x6e`;
868	buffer[`2`]=`0x2d`;
869	buffer[`3`]=`0x2d`;
870	int32_t punycodeLength=u_strToPunycode(label, labelLength,
871	buffer+`4`, punycode.getCapacity()-`4`,
872	NULL, &errorCode);
873	if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
874	errorCode=U_ZERO_ERROR;
875	punycode.releaseBuffer(`4`);
876	buffer=punycode.getBuffer(`4`+punycodeLength);
877	if(buffer==NULL) {
878	errorCode=U_MEMORY_ALLOCATION_ERROR;
879	return destLabelLength;
880	}
881	punycodeLength=u_strToPunycode(label, labelLength,
882	buffer+`4`, punycode.getCapacity()-`4`,
883	NULL, &errorCode);
884	}
885	punycodeLength+=`4`;
886	punycode.releaseBuffer(punycodeLength);
887	if(U_FAILURE(errorCode)) {
888	return destLabelLength;
889	}
890	if(punycodeLength>`63`) {
891	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
892	}
893	return replaceLabel(dest, destLabelStart, destLabelLength,
894	punycode, punycodeLength, errorCode);
895	} else {
896	// all-ASCII label
897	if(labelLength>`63`) {
898	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
899	}
900	}
901	}
902	} else {
903	// If a Punycode label has severe errors,
904	// then leave it but make sure it does not look valid.
905	if(wasPunycode) {
906	info.labelErrors\|=UIDNA_ERROR_INVALID_ACE_LABEL;
907	return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info, errorCode);
908	}
909	}
910	return replaceLabel(dest, destLabelStart, destLabelLength,
911	*labelString, labelLength, errorCode);
912	}
913
914	// Make sure an ACE label does not look valid.
915	// Append U+FFFD if the label has only LDH characters.
916	// If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD.
917	int32_t
918	UTS46::markBadACELabel(UnicodeString &dest,
919	int32_t labelStart, int32_t labelLength,
920	UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const {
921	if(U_FAILURE(errorCode)) {
922	return `0`;
923	}
924	UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=`0`;
925	UBool isASCII=TRUE;
926	UBool onlyLDH=TRUE;
927	const UChar *label=dest.getBuffer()+labelStart;
928	// Ok to cast away const because we own the UnicodeString.
929	UChar s=(UChar )label+`4`; // After the initial "xn--".
930	const UChar *limit=label+labelLength;
931	do {
932	UChar c=*s;
933	if(c<=`0x7f`) {
934	if(c==`0x2e`) {
935	info.labelErrors\|=UIDNA_ERROR_LABEL_HAS_DOT;
936	*s=`0xfffd`;
937	isASCII=onlyLDH=FALSE;
938	} else if(asciiData[c]<`0`) {
939	onlyLDH=FALSE;
940	if(disallowNonLDHDot) {
941	*s=`0xfffd`;
942	isASCII=FALSE;
943	}
944	}
945	} else {
946	isASCII=onlyLDH=FALSE;
947	}
948	} while(++s<limit);
949	if(onlyLDH) {
950	dest.insert(labelStart+labelLength, (UChar)`0xfffd`);
951	if(dest.isBogus()) {
952	errorCode=U_MEMORY_ALLOCATION_ERROR;
953	return `0`;
954	}
955	++labelLength;
956	} else {
957	if(toASCII && isASCII && labelLength>`63`) {
958	info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;
959	}
960	}
961	return labelLength;
962	}
963
964	const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT);
965	const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)\|U_MASK(U_RIGHT_TO_LEFT_ARABIC);
966	const uint32_t L_R_AL_MASK=L_MASK\|R_AL_MASK;
967
968	const uint32_t R_AL_AN_MASK=R_AL_MASK\|U_MASK(U_ARABIC_NUMBER);
969
970	const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)\|U_MASK(U_ARABIC_NUMBER);
971	const uint32_t R_AL_EN_AN_MASK=R_AL_MASK\|EN_AN_MASK;
972	const uint32_t L_EN_MASK=L_MASK\|U_MASK(U_EUROPEAN_NUMBER);
973
974	const uint32_t ES_CS_ET_ON_BN_NSM_MASK=
975	U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)\|
976	U_MASK(U_COMMON_NUMBER_SEPARATOR)\|
977	U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)\|
978	U_MASK(U_OTHER_NEUTRAL)\|
979	U_MASK(U_BOUNDARY_NEUTRAL)\|
980	U_MASK(U_DIR_NON_SPACING_MARK);
981	const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK\|ES_CS_ET_ON_BN_NSM_MASK;
982	const uint32_t R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK\|EN_AN_MASK\|ES_CS_ET_ON_BN_NSM_MASK;
983
984	// We scan the whole label and check both for whether it contains RTL characters
985	// and whether it passes the BiDi Rule.
986	// In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
987	// that a domain name is a BiDi domain name (has an RTL label) only after
988	// processing several earlier labels.
989	void
990	UTS46::checkLabelBiDi(const UChar label, int32_t labelLength, IDNAInfo &info) const* {
991	// IDNA2008 BiDi rule
992	// Get the directionality of the first character.
993	UChar32 c;
994	int32_t i=`0`;
995	U16_NEXT_UNSAFE(label, i, c);
996	uint32_t firstMask=U_MASK(u_charDirection(c));
997	// 1. The first character must be a character with BIDI property L, R
998	// or AL. If it has the R or AL property, it is an RTL label; if it
999	// has the L property, it is an LTR label.
1000	if((firstMask&~L_R_AL_MASK)!=`0`) {
1001	info.isOkBiDi=FALSE;
1002	}
1003	// Get the directionality of the last non-NSM character.
1004	uint32_t lastMask;
1005	for(;;) {
1006	if(i>=labelLength) {
1007	lastMask=firstMask;
1008	break;
1009	}
1010	U16_PREV_UNSAFE(label, labelLength, c);
1011	UCharDirection dir=u_charDirection(c);
1012	if(dir!=U_DIR_NON_SPACING_MARK) {
1013	lastMask=U_MASK(dir);
1014	break;
1015	}
1016	}
1017	// 3. In an RTL label, the end of the label must be a character with
1018	// BIDI property R, AL, EN or AN, followed by zero or more
1019	// characters with BIDI property NSM.
1020	// 6. In an LTR label, the end of the label must be a character with
1021	// BIDI property L or EN, followed by zero or more characters with
1022	// BIDI property NSM.
1023	if( (firstMask&L_MASK)!=`0` ?
1024	(lastMask&~L_EN_MASK)!=`0` :
1025	(lastMask&~R_AL_EN_AN_MASK)!=`0`
1026	) {
1027	info.isOkBiDi=FALSE;
1028	}
1029	// Add the directionalities of the intervening characters.
1030	uint32_t mask=firstMask\|lastMask;
1031	while(i<labelLength) {
1032	U16_NEXT_UNSAFE(label, i, c);
1033	mask\|=U_MASK(u_charDirection(c));
1034	}
1035	if(firstMask&L_MASK) {
1036	// 5. In an LTR label, only characters with the BIDI properties L, EN,
1037	// ES, CS, ET, ON, BN and NSM are allowed.
1038	if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=`0`) {
1039	info.isOkBiDi=FALSE;
1040	}
1041	} else {
1042	// 2. In an RTL label, only characters with the BIDI properties R, AL,
1043	// AN, EN, ES, CS, ET, ON, BN and NSM are allowed.
1044	if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=`0`) {
1045	info.isOkBiDi=FALSE;
1046	}
1047	// 4. In an RTL label, if an EN is present, no AN may be present, and
1048	// vice versa.
1049	if((mask&EN_AN_MASK)==EN_AN_MASK) {
1050	info.isOkBiDi=FALSE;
1051	}
1052	}
1053	// An RTL label is a label that contains at least one character of type
1054	// R, AL or AN. [...]
1055	// A "BIDI domain name" is a domain name that contains at least one RTL
1056	// label. [...]
1057	// The following rule, consisting of six conditions, applies to labels
1058	// in BIDI domain names.
1059	if((mask&R_AL_AN_MASK)!=`0`) {
1060	info.isBiDi=TRUE;
1061	}
1062	}
1063
1064	// Special code for the ASCII prefix of a BiDi domain name.
1065	// The ASCII prefix is all-LTR.
1066
1067	// IDNA2008 BiDi rule, parts relevant to ASCII labels:
1068	// 1. The first character must be a character with BIDI property L [...]
1069	// 5. In an LTR label, only characters with the BIDI properties L, EN,
1070	// ES, CS, ET, ON, BN and NSM are allowed.
1071	// 6. In an LTR label, the end of the label must be a character with
1072	// BIDI property L or EN [...]
1073
1074	// UTF-16 version, called for mapped ASCII prefix.
1075	// Cannot contain uppercase A-Z.
1076	// s[length-1] must be the trailing dot.
1077	static UBool
1078	isASCIIOkBiDi(const UChar *s, int32_t length) {
1079	int32_t labelStart=`0`;
1080	for(int32_t i=`0`; i<length; ++i) {
1081	UChar c=s[i];
1082	if(c==`0x2e`) { // dot
1083	if(i>labelStart) {
1084	c=s[i-`1`];
1085	if(!(`0x61`<=c && c<=`0x7a`) && !(`0x30`<=c && c<=`0x39`)) {
1086	// Last character in the label is not an L or EN.
1087	return FALSE;
1088	}
1089	}
1090	labelStart=i+`1`;
1091	} else if(i==labelStart) {
1092	if(!(`0x61`<=c && c<=`0x7a`)) {
1093	// First character in the label is not an L.
1094	return FALSE;
1095	}
1096	} else {
1097	if(c<=`0x20` && (c>=`0x1c` \|\| (`9`<=c && c<=`0xd`))) {
1098	// Intermediate character in the label is a B, S or WS.
1099	return FALSE;
1100	}
1101	}
1102	}
1103	return TRUE;
1104	}
1105
1106	// UTF-8 version, called for source ASCII prefix.
1107	// Can contain uppercase A-Z.
1108	// s[length-1] must be the trailing dot.
1109	static UBool
1110	isASCIIOkBiDi(const char *s, int32_t length) {
1111	int32_t labelStart=`0`;
1112	for(int32_t i=`0`; i<length; ++i) {
1113	char c=s[i];
1114	if(c==`0x2e`) { // dot
1115	if(i>labelStart) {
1116	c=s[i-`1`];
1117	if(!(`0x61`<=c && c<=`0x7a`) && !(`0x41`<=c && c<=`0x5a`) && !(`0x30`<=c && c<=`0x39`)) {
1118	// Last character in the label is not an L or EN.
1119	return FALSE;
1120	}
1121	}
1122	labelStart=i+`1`;
1123	} else if(i==labelStart) {
1124	if(!(`0x61`<=c && c<=`0x7a`) && !(`0x41`<=c && c<=`0x5a`)) {
1125	// First character in the label is not an L.
1126	return FALSE;
1127	}
1128	} else {
1129	if(c<=`0x20` && (c>=`0x1c` \|\| (`9`<=c && c<=`0xd`))) {
1130	// Intermediate character in the label is a B, S or WS.
1131	return FALSE;
1132	}
1133	}
1134	}
1135	return TRUE;
1136	}
1137
1138	UBool
1139	UTS46::isLabelOkContextJ(const UChar label, int32_t labelLength) const* {
1140	// [IDNA2008-Tables]
1141	// 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
1142	for(int32_t i=`0`; i<labelLength; ++i) {
1143	if(label[i]==`0x200c`) {
1144	// Appendix A.1. ZERO WIDTH NON-JOINER
1145	// Rule Set:
1146	// False;
1147	// If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
1148	// If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)\u200C*
1149	// (Joining_Type:T)(Joining_Type:{R,D})) Then True;*
1150	if(i==`0`) {
1151	return FALSE;
1152	}
1153	UChar32 c;
1154	int32_t j=i;
1155	U16_PREV_UNSAFE(label, j, c);
1156	if(uts46Norm2.getCombiningClass(c)==`9`) {
1157	continue;
1158	}
1159	// check precontext (Joining_Type:{L,D})(Joining_Type:T)*
1160	for(;;) {
1161	UJoiningType type=ubidi_getJoiningType(c);
1162	if(type==U_JT_TRANSPARENT) {
1163	if(j==`0`) {
1164	return FALSE;
1165	}
1166	U16_PREV_UNSAFE(label, j, c);
1167	} else if(type==U_JT_LEFT_JOINING \|\| type==U_JT_DUAL_JOINING) {
1168	break; // precontext fulfilled
1169	} else {
1170	return FALSE;
1171	}
1172	}
1173	// check postcontext (Joining_Type:T)(Joining_Type:{R,D})*
1174	for(j=i+`1`;;) {
1175	if(j==labelLength) {
1176	return FALSE;
1177	}
1178	U16_NEXT_UNSAFE(label, j, c);
1179	UJoiningType type=ubidi_getJoiningType(c);
1180	if(type==U_JT_TRANSPARENT) {
1181	// just skip this character
1182	} else if(type==U_JT_RIGHT_JOINING \|\| type==U_JT_DUAL_JOINING) {
1183	break; // postcontext fulfilled
1184	} else {
1185	return FALSE;
1186	}
1187	}
1188	} else if(label[i]==`0x200d`) {
1189	// Appendix A.2. ZERO WIDTH JOINER (U+200D)
1190	// Rule Set:
1191	// False;
1192	// If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
1193	if(i==`0`) {
1194	return FALSE;
1195	}
1196	UChar32 c;
1197	int32_t j=i;
1198	U16_PREV_UNSAFE(label, j, c);
1199	if(uts46Norm2.getCombiningClass(c)!=`9`) {
1200	return FALSE;
1201	}
1202	}
1203	}
1204	return TRUE;
1205	}
1206
1207	void
1208	UTS46::checkLabelContextO(const UChar label, int32_t labelLength, IDNAInfo &info) const* {
1209	int32_t labelEnd=labelLength-`1`; // inclusive
1210	int32_t arabicDigits=`0`; // -1 for 066x, +1 for 06Fx
1211	for(int32_t i=`0`; i<=labelEnd; ++i) {
1212	UChar32 c=label[i];
1213	if(c<`0xb7`) {
1214	// ASCII fastpath
1215	} else if(c<=`0x6f9`) {
1216	if(c==`0xb7`) {
1217	// Appendix A.3. MIDDLE DOT (U+00B7)
1218	// Rule Set:
1219	// False;
1220	// If Before(cp) .eq. U+006C And
1221	// After(cp) .eq. U+006C Then True;
1222	if(!(`0`<i && label[i-`1`]==`0x6c` &&
1223	i<labelEnd && label[i+`1`]==`0x6c`)) {
1224	info.labelErrors\|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1225	}
1226	} else if(c==`0x375`) {
1227	// Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
1228	// Rule Set:
1229	// False;
1230	// If Script(After(cp)) .eq. Greek Then True;
1231	UScriptCode script=USCRIPT_INVALID_CODE;
1232	if(i<labelEnd) {
1233	UErrorCode errorCode=U_ZERO_ERROR;
1234	int32_t j=i+`1`;
1235	U16_NEXT(label, j, labelLength, c);
1236	script=uscript_getScript(c, &errorCode);
1237	}
1238	if(script!=USCRIPT_GREEK) {
1239	info.labelErrors\|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1240	}
1241	} else if(c==`0x5f3` \|\| c==`0x5f4`) {
1242	// Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
1243	// Rule Set:
1244	// False;
1245	// If Script(Before(cp)) .eq. Hebrew Then True;
1246	//
1247	// Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
1248	// Rule Set:
1249	// False;
1250	// If Script(Before(cp)) .eq. Hebrew Then True;
1251	UScriptCode script=USCRIPT_INVALID_CODE;
1252	if(`0`<i) {
1253	UErrorCode errorCode=U_ZERO_ERROR;
1254	int32_t j=i;
1255	U16_PREV(label, `0`, j, c);
1256	script=uscript_getScript(c, &errorCode);
1257	}
1258	if(script!=USCRIPT_HEBREW) {
1259	info.labelErrors\|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1260	}
1261	} else if(`0x660`<=c / && c<=0x6f9 /) {
1262	// Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
1263	// Rule Set:
1264	// True;
1265	// For All Characters:
1266	// If cp .in. 06F0..06F9 Then False;
1267	// End For;
1268	//
1269	// Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
1270	// Rule Set:
1271	// True;
1272	// For All Characters:
1273	// If cp .in. 0660..0669 Then False;
1274	// End For;
1275	if(c<=`0x669`) {
1276	if(arabicDigits>`0`) {
1277	info.labelErrors\|=UIDNA_ERROR_CONTEXTO_DIGITS;
1278	}
1279	arabicDigits=-`1`;
1280	} else if(`0x6f0`<=c) {
1281	if(arabicDigits<`0`) {
1282	info.labelErrors\|=UIDNA_ERROR_CONTEXTO_DIGITS;
1283	}
1284	arabicDigits=`1`;
1285	}
1286	}
1287	} else if(c==`0x30fb`) {
1288	// Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
1289	// Rule Set:
1290	// False;
1291	// For All Characters:
1292	// If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
1293	// End For;
1294	UErrorCode errorCode=U_ZERO_ERROR;
1295	for(int j=`0`;;) {
1296	if(j>labelEnd) {
1297	info.labelErrors\|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1298	break;
1299	}
1300	U16_NEXT(label, j, labelLength, c);
1301	UScriptCode script=uscript_getScript(c, &errorCode);
1302	if(script==USCRIPT_HIRAGANA \|\| script==USCRIPT_KATAKANA \|\| script==USCRIPT_HAN) {
1303	break;
1304	}
1305	}
1306	}
1307	}
1308	}
1309
1310	U_NAMESPACE_END
1311
1312	// C API ------------------------------------------------------------------- ***
1313
1314	U_NAMESPACE_USE
1315
1316	U_CAPI UIDNA * U_EXPORT2
1317	uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) {
1318	return reinterpret_cast<UIDNA >(IDNA::createUTS46Instance(options, pErrorCode));
1319	}
1320
1321	U_CAPI void U_EXPORT2
1322	uidna_close(UIDNA *idna) {
1323	delete reinterpret_cast<IDNA *>(idna);
1324	}
1325
1326	static UBool
1327	checkArgs(const void *label, int32_t length,
1328	void *dest, int32_t capacity,
1329	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1330	if(U_FAILURE(*pErrorCode)) {
1331	return FALSE;
1332	}
1333	// sizeof(UIDNAInfo)=16 in the first API version.
1334	if(pInfo==NULL \|\| pInfo->size<`16`) {
1335	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1336	return FALSE;
1337	}
1338	if( (label==NULL ? length!=`0` : length<-`1`) \|\|
1339	(dest==NULL ? capacity!=`0` : capacity<`0`) \|\|
1340	(dest==label && label!=NULL)
1341	) {
1342	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1343	return FALSE;
1344	}
1345	// Set all pInfo bytes to 0 except for the size field itself.*
1346	uprv_memset(&pInfo->size+`1`, `0`, pInfo->size-sizeof(pInfo->size));
1347	return TRUE;
1348	}
1349
1350	static void
1351	idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) {
1352	pInfo->isTransitionalDifferent=info.isTransitionalDifferent();
1353	pInfo->errors=info.getErrors();
1354	}
1355
1356	U_CAPI int32_t U_EXPORT2
1357	uidna_labelToASCII(const UIDNA *idna,
1358	const UChar *label, int32_t length,
1359	UChar *dest, int32_t capacity,
1360	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1361	if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1362	return `0`;
1363	}
1364	UnicodeString src((UBool)(length<`0`), label, length);
1365	UnicodeString destString(dest, `0`, capacity);
1366	IDNAInfo info;
1367	reinterpret_cast<const IDNA >(idna)->labelToASCII(src, destString, info, pErrorCode);
1368	idnaInfoToStruct(info, pInfo);
1369	return destString.extract(dest, capacity, *pErrorCode);
1370	}
1371
1372	U_CAPI int32_t U_EXPORT2
1373	uidna_labelToUnicode(const UIDNA *idna,
1374	const UChar *label, int32_t length,
1375	UChar *dest, int32_t capacity,
1376	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1377	if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1378	return `0`;
1379	}
1380	UnicodeString src((UBool)(length<`0`), label, length);
1381	UnicodeString destString(dest, `0`, capacity);
1382	IDNAInfo info;
1383	reinterpret_cast<const IDNA >(idna)->labelToUnicode(src, destString, info, pErrorCode);
1384	idnaInfoToStruct(info, pInfo);
1385	return destString.extract(dest, capacity, *pErrorCode);
1386	}
1387
1388	U_CAPI int32_t U_EXPORT2
1389	uidna_nameToASCII(const UIDNA *idna,
1390	const UChar *name, int32_t length,
1391	UChar *dest, int32_t capacity,
1392	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1393	if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1394	return `0`;
1395	}
1396	UnicodeString src((UBool)(length<`0`), name, length);
1397	UnicodeString destString(dest, `0`, capacity);
1398	IDNAInfo info;
1399	reinterpret_cast<const IDNA >(idna)->nameToASCII(src, destString, info, pErrorCode);
1400	idnaInfoToStruct(info, pInfo);
1401	return destString.extract(dest, capacity, *pErrorCode);
1402	}
1403
1404	U_CAPI int32_t U_EXPORT2
1405	uidna_nameToUnicode(const UIDNA *idna,
1406	const UChar *name, int32_t length,
1407	UChar *dest, int32_t capacity,
1408	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1409	if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1410	return `0`;
1411	}
1412	UnicodeString src((UBool)(length<`0`), name, length);
1413	UnicodeString destString(dest, `0`, capacity);
1414	IDNAInfo info;
1415	reinterpret_cast<const IDNA >(idna)->nameToUnicode(src, destString, info, pErrorCode);
1416	idnaInfoToStruct(info, pInfo);
1417	return destString.extract(dest, capacity, *pErrorCode);
1418	}
1419
1420	U_CAPI int32_t U_EXPORT2
1421	uidna_labelToASCII_UTF8(const UIDNA *idna,
1422	const char *label, int32_t length,
1423	char *dest, int32_t capacity,
1424	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1425	if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1426	return `0`;
1427	}
1428	StringPiece src(label, length<`0` ? static_cast<int32_t>(uprv_strlen(label)) : length);
1429	CheckedArrayByteSink sink(dest, capacity);
1430	IDNAInfo info;
1431	reinterpret_cast<const IDNA >(idna)->labelToASCII_UTF8(src, sink, info, pErrorCode);
1432	idnaInfoToStruct(info, pInfo);
1433	return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1434	}
1435
1436	U_CAPI int32_t U_EXPORT2
1437	uidna_labelToUnicodeUTF8(const UIDNA *idna,
1438	const char *label, int32_t length,
1439	char *dest, int32_t capacity,
1440	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1441	if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1442	return `0`;
1443	}
1444	StringPiece src(label, length<`0` ? static_cast<int32_t>(uprv_strlen(label)) : length);
1445	CheckedArrayByteSink sink(dest, capacity);
1446	IDNAInfo info;
1447	reinterpret_cast<const IDNA >(idna)->labelToUnicodeUTF8(src, sink, info, pErrorCode);
1448	idnaInfoToStruct(info, pInfo);
1449	return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1450	}
1451
1452	U_CAPI int32_t U_EXPORT2
1453	uidna_nameToASCII_UTF8(const UIDNA *idna,
1454	const char *name, int32_t length,
1455	char *dest, int32_t capacity,
1456	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1457	if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1458	return `0`;
1459	}
1460	StringPiece src(name, length<`0` ? static_cast<int32_t>(uprv_strlen(name)) : length);
1461	CheckedArrayByteSink sink(dest, capacity);
1462	IDNAInfo info;
1463	reinterpret_cast<const IDNA >(idna)->nameToASCII_UTF8(src, sink, info, pErrorCode);
1464	idnaInfoToStruct(info, pInfo);
1465	return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1466	}
1467
1468	U_CAPI int32_t U_EXPORT2
1469	uidna_nameToUnicodeUTF8(const UIDNA *idna,
1470	const char *name, int32_t length,
1471	char *dest, int32_t capacity,
1472	UIDNAInfo pInfo, UErrorCode pErrorCode) {
1473	if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1474	return `0`;
1475	}
1476	StringPiece src(name, length<`0` ? static_cast<int32_t>(uprv_strlen(name)) : length);
1477	CheckedArrayByteSink sink(dest, capacity);
1478	IDNAInfo info;
1479	reinterpret_cast<const IDNA >(idna)->nameToUnicodeUTF8(src, sink, info, pErrorCode);
1480	idnaInfoToStruct(info, pInfo);
1481	return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1482	}
1483
1484	#endif // UCONFIG_NO_IDNA
1485

// Source: ClickHouse/contrib/icu/icu4c/source/common/uts46.cpp