punycode.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/punycode.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	*
6	* Copyright (C) 2002-2011, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*******************************************************************************
10	* file name: punycode.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2002jan31
16	* created by: Markus W. Scherer
17	*/
18
19
20	/ This ICU code derived from: /
21	/*
22	punycode.c 0.4.0 (2001-Nov-17-Sat)
23	http://www.cs.berkeley.edu/~amc/idn/
24	Adam M. Costello
25	http://www.nicemice.net/amc/
26
27	Disclaimer and license
28
29	Regarding this entire document or any portion of it (including
30	the pseudocode and C code), the author makes no guarantees and
31	is not responsible for any damage resulting from its use. The
32	author grants irrevocable permission to anyone to use, modify,
33	and distribute it in any way that does not diminish the rights
34	of anyone else to use, modify, and distribute it, provided that
35	redistributed derivative works do not contain misleading author or
36	version information. Derivative works need not be licensed under
37	similar terms.
38	*/
39	/*
40	* ICU modifications:
41	* - ICU data types and coding conventions
42	* - ICU string buffer handling with implicit source lengths
43	* and destination preflighting
44	* - UTF-16 handling
45	*/
46
47	#include "unicode/utypes.h"
48
49	#if !UCONFIG_NO_IDNA
50
51	#include "unicode/ustring.h"
52	#include "unicode/utf.h"
53	#include "unicode/utf16.h"
54	#include "ustr_imp.h"
55	#include "cstring.h"
56	#include "cmemory.h"
57	#include "punycode.h"
58	#include "uassert.h"
59
60
61	/ Punycode ----------------------------------------------------------------- /
62
63	/ Punycode parameters for Bootstring /
64	#define BASE 36
65	#define TMIN 1
66	#define TMAX 26
67	#define SKEW 38
68	#define DAMP 700
69	#define INITIAL_BIAS 72
70	#define INITIAL_N 0x80
71
72	/ "Basic" Unicode/ASCII code points /
73	#define _HYPHEN 0X2d
74	#define DELIMITER _HYPHEN
75
76	#define _ZERO_ 0X30
77	#define _NINE 0x39
78
79	#define _SMALL_A 0X61
80	#define _SMALL_Z 0X7a
81
82	#define _CAPITAL_A 0X41
83	#define _CAPITAL_Z 0X5a
84
85	#define IS_BASIC(c) ((c)<0x80)
86	#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
87
88	/**
89	* digitToBasic() returns the basic code point whose value
90	* (when used for representing integers) is d, which must be in the
91	* range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
92	* nonzero, in which case the uppercase form is used.
93	*/
94	static inline char
95	digitToBasic(int32_t digit, UBool uppercase) {
96	/ 0..25 map to ASCII a..z or A..Z /
97	/ 26..35 map to ASCII 0..9 /
98	if(digit<`26`) {
99	if(uppercase) {
100	return (char)(_CAPITAL_A+digit);
101	} else {
102	return (char)(_SMALL_A+digit);
103	}
104	} else {
105	return (char)((_ZERO_-`26`)+digit);
106	}
107	}
108
109	/**
110	* basicToDigit[] contains the numeric value of a basic code
111	* point (for use in representing integers) in the range 0 to
112	* BASE-1, or -1 if b is does not represent a value.
113	*/
114	static const int8_t
115	basicToDigit[`256`]={
116	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
117	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
118
119	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
120	`26`, `27`, `28`, `29`, `30`, `31`, `32`, `33`, `34`, `35`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
121
122	-`1`, `0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `10`, `11`, `12`, `13`, `14`,
123	`15`, `16`, `17`, `18`, `19`, `20`, `21`, `22`, `23`, `24`, `25`, -`1`, -`1`, -`1`, -`1`, -`1`,
124
125	-`1`, `0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `10`, `11`, `12`, `13`, `14`,
126	`15`, `16`, `17`, `18`, `19`, `20`, `21`, `22`, `23`, `24`, `25`, -`1`, -`1`, -`1`, -`1`, -`1`,
127
128	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
129	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
130
131	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
132	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
133
134	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
135	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
136
137	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
138	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`
139	};
140
141	static inline char
142	asciiCaseMap(char b, UBool uppercase) {
143	if(uppercase) {
144	if(_SMALL_A<=b && b<=_SMALL_Z) {
145	b-=(_SMALL_A-_CAPITAL_A);
146	}
147	} else {
148	if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
149	b+=(_SMALL_A-_CAPITAL_A);
150	}
151	}
152	return b;
153	}
154
155	/ Punycode-specific Bootstring code ---------------------------------------- /
156
157	/*
158	* The following code omits the {parts} of the pseudo-algorithm in the spec
159	* that are not used with the Punycode parameter set.
160	*/
161
162	/ Bias adaptation function. /
163	static int32_t
164	adaptBias(int32_t delta, int32_t length, UBool firstTime) {
165	int32_t count;
166
167	if(firstTime) {
168	delta/=DAMP;
169	} else {
170	delta/=`2`;
171	}
172
173	delta+=delta/length;
174	for(count=`0`; delta>((BASE-TMIN)*TMAX)/`2`; count+=BASE) {
175	delta/=(BASE-TMIN);
176	}
177
178	return count+(((BASE-TMIN+`1`)*delta)/(delta+SKEW));
179	}
180
181	#define MAX_CP_COUNT 200
182
183	U_CFUNC int32_t
184	u_strToPunycode(const UChar *src, int32_t srcLength,
185	UChar *dest, int32_t destCapacity,
186	const UBool *caseFlags,
187	UErrorCode *pErrorCode) {
188
189	int32_t cpBuffer[MAX_CP_COUNT];
190	int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
191	UChar c, c2;
192
193	/ argument checking /
194	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
195	return `0`;
196	}
197
198	if(src==NULL \|\| srcLength<-`1` \|\| (dest==NULL && destCapacity!=`0`)) {
199	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
200	return `0`;
201	}
202
203	/*
204	* Handle the basic code points and
205	* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
206	*/
207	srcCPCount=destLength=`0`;
208	if(srcLength==-`1`) {
209	/ NUL-terminated input /
210	for(j=`0`; / no condition /; ++j) {
211	if((c=src[j])==`0`) {
212	break;
213	}
214	if(srcCPCount==MAX_CP_COUNT) {
215	/ too many input code points /
216	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
217	return `0`;
218	}
219	if(IS_BASIC(c)) {
220	cpBuffer[srcCPCount++]=`0`;
221	if(destLength<destCapacity) {
222	dest[destLength]=
223	caseFlags!=NULL ?
224	asciiCaseMap((char)c, caseFlags[j]) :
225	(char)c;
226	}
227	++destLength;
228	} else {
229	n=(caseFlags!=NULL && caseFlags[j])<<`31L`;
230	if(U16_IS_SINGLE(c)) {
231	n\|=c;
232	} else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+`1`])) {
233	++j;
234	n\|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
235	} else {
236	/ error: unmatched surrogate /
237	*pErrorCode=U_INVALID_CHAR_FOUND;
238	return `0`;
239	}
240	cpBuffer[srcCPCount++]=n;
241	}
242	}
243	} else {
244	/ length-specified input /
245	for(j=`0`; j<srcLength; ++j) {
246	if(srcCPCount==MAX_CP_COUNT) {
247	/ too many input code points /
248	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
249	return `0`;
250	}
251	c=src[j];
252	if(IS_BASIC(c)) {
253	cpBuffer[srcCPCount++]=`0`;
254	if(destLength<destCapacity) {
255	dest[destLength]=
256	caseFlags!=NULL ?
257	asciiCaseMap((char)c, caseFlags[j]) :
258	(char)c;
259	}
260	++destLength;
261	} else {
262	n=(caseFlags!=NULL && caseFlags[j])<<`31L`;
263	if(U16_IS_SINGLE(c)) {
264	n\|=c;
265	} else if(U16_IS_LEAD(c) && (j+`1`)<srcLength && U16_IS_TRAIL(c2=src[j+`1`])) {
266	++j;
267	n\|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
268	} else {
269	/ error: unmatched surrogate /
270	*pErrorCode=U_INVALID_CHAR_FOUND;
271	return `0`;
272	}
273	cpBuffer[srcCPCount++]=n;
274	}
275	}
276	}
277
278	/ Finish the basic string - if it is not empty - with a delimiter. /
279	basicLength=destLength;
280	if(basicLength>`0`) {
281	if(destLength<destCapacity) {
282	dest[destLength]=DELIMITER;
283	}
284	++destLength;
285	}
286
287	/*
288	* handledCPCount is the number of code points that have been handled
289	* basicLength is the number of basic code points
290	* destLength is the number of chars that have been output
291	*/
292
293	/ Initialize the state: /
294	n=INITIAL_N;
295	delta=`0`;
296	bias=INITIAL_BIAS;
297
298	/ Main encoding loop: /
299	for(handledCPCount=basicLength; handledCPCount<srcCPCount; / no op /) {
300	/*
301	* All non-basic code points < n have been handled already.
302	* Find the next larger one:
303	*/
304	for(m=`0x7fffffff`, j=`0`; j<srcCPCount; ++j) {
305	q=cpBuffer[j]&`0x7fffffff`; / remove case flag from the sign bit /
306	if(n<=q && q<m) {
307	m=q;
308	}
309	}
310
311	/*
312	* Increase delta enough to advance the decoder's
313	* <n,i> state to <m,0>, but guard against overflow:
314	*/
315	if(m-n>(`0x7fffffff`-MAX_CP_COUNT-delta)/(handledCPCount+`1`)) {
316	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
317	return `0`;
318	}
319	delta+=(m-n)*(handledCPCount+`1`);
320	n=m;
321
322	/ Encode a sequence of same code points n /
323	for(j=`0`; j<srcCPCount; ++j) {
324	q=cpBuffer[j]&`0x7fffffff`; / remove case flag from the sign bit /
325	if(q<n) {
326	++delta;
327	} else if(q==n) {
328	/ Represent delta as a generalized variable-length integer: /
329	for(q=delta, k=BASE; / no condition /; k+=BASE) {
330
331	/* RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt*
332
333	t=k-bias;
334	if(t<TMIN) {
335	t=TMIN;
336	} else if(t>TMAX) {
337	t=TMAX;
338	}
339	*/
340
341	t=k-bias;
342	if(t<TMIN) {
343	t=TMIN;
344	} else if(k>=(bias+TMAX)) {
345	t=TMAX;
346	}
347
348	if(q<t) {
349	break;
350	}
351
352	if(destLength<destCapacity) {
353	dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), `0`);
354	}
355	++destLength;
356	q=(q-t)/(BASE-t);
357	}
358
359	if(destLength<destCapacity) {
360	dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<`0`));
361	}
362	++destLength;
363	bias=adaptBias(delta, handledCPCount+`1`, (UBool)(handledCPCount==basicLength));
364	delta=`0`;
365	++handledCPCount;
366	}
367	}
368
369	++delta;
370	++n;
371	}
372
373	return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
374	}
375
376	U_CFUNC int32_t
377	u_strFromPunycode(const UChar *src, int32_t srcLength,
378	UChar *dest, int32_t destCapacity,
379	UBool *caseFlags,
380	UErrorCode *pErrorCode) {
381	int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
382	destCPCount, firstSupplementaryIndex, cpLength;
383	UChar b;
384
385	/ argument checking /
386	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
387	return `0`;
388	}
389
390	if(src==NULL \|\| srcLength<-`1` \|\| (dest==NULL && destCapacity!=`0`)) {
391	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
392	return `0`;
393	}
394
395	if(srcLength==-`1`) {
396	srcLength=u_strlen(src);
397	}
398
399	/*
400	* Handle the basic code points:
401	* Let basicLength be the number of input code points
402	* before the last delimiter, or 0 if there is none,
403	* then copy the first basicLength code points to the output.
404	*
405	* The two following loops iterate backward.
406	*/
407	for(j=srcLength; j>`0`;) {
408	if(src[--j]==DELIMITER) {
409	break;
410	}
411	}
412	destLength=basicLength=destCPCount=j;
413	U_ASSERT(destLength>=`0`);
414
415	while(j>`0`) {
416	b=src[--j];
417	if(!IS_BASIC(b)) {
418	*pErrorCode=U_INVALID_CHAR_FOUND;
419	return `0`;
420	}
421
422	if(j<destCapacity) {
423	dest[j]=(UChar)b;
424
425	if(caseFlags!=NULL) {
426	caseFlags[j]=IS_BASIC_UPPERCASE(b);
427	}
428	}
429	}
430
431	/ Initialize the state: /
432	n=INITIAL_N;
433	i=`0`;
434	bias=INITIAL_BIAS;
435	firstSupplementaryIndex=`1000000000`;
436
437	/*
438	* Main decoding loop:
439	* Start just after the last delimiter if any
440	* basic code points were copied; start at the beginning otherwise.
441	*/
442	for(in=basicLength>`0` ? basicLength+`1` : `0`; in<srcLength; / no op /) {
443	/*
444	* in is the index of the next character to be consumed, and
445	* destCPCount is the number of code points in the output array.
446	*
447	* Decode a generalized variable-length integer into delta,
448	* which gets added to i. The overflow checking is easier
449	* if we increase i as we go, then subtract off its starting
450	* value at the end to obtain delta.
451	*/
452	for(oldi=i, w=`1`, k=BASE; / no condition /; k+=BASE) {
453	if(in>=srcLength) {
454	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
455	return `0`;
456	}
457
458	digit=basicToDigit[(uint8_t)src[in++]];
459	if(digit<`0`) {
460	*pErrorCode=U_INVALID_CHAR_FOUND;
461	return `0`;
462	}
463	if(digit>(`0x7fffffff`-i)/w) {
464	/ integer overflow /
465	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
466	return `0`;
467	}
468
469	i+=digit*w;
470	/* RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt*
471	t=k-bias;
472	if(t<TMIN) {
473	t=TMIN;
474	} else if(t>TMAX) {
475	t=TMAX;
476	}
477	*/
478	t=k-bias;
479	if(t<TMIN) {
480	t=TMIN;
481	} else if(k>=(bias+TMAX)) {
482	t=TMAX;
483	}
484	if(digit<t) {
485	break;
486	}
487
488	if(w>`0x7fffffff`/(BASE-t)) {
489	/ integer overflow /
490	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
491	return `0`;
492	}
493	w*=BASE-t;
494	}
495
496	/*
497	* Modification from sample code:
498	* Increments destCPCount here,
499	* where needed instead of in for() loop tail.
500	*/
501	++destCPCount;
502	bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==`0`));
503
504	/*
505	* i was supposed to wrap around from (incremented) destCPCount to 0,
506	* incrementing n each time, so we'll fix that now:
507	*/
508	if(i/destCPCount>(`0x7fffffff`-n)) {
509	/ integer overflow /
510	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
511	return `0`;
512	}
513
514	n+=i/destCPCount;
515	i%=destCPCount;
516	/ not needed for Punycode: /
517	/ if (decode_digit(n) <= BASE) return punycode_invalid_input; /
518
519	if(n>`0x10ffff` \|\| U_IS_SURROGATE(n)) {
520	/ Unicode code point overflow /
521	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
522	return `0`;
523	}
524
525	/ Insert n at position i of the output: /
526	cpLength=U16_LENGTH(n);
527	if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
528	int32_t codeUnitIndex;
529
530	/*
531	* Handle indexes when supplementary code points are present.
532	*
533	* In almost all cases, there will be only BMP code points before i
534	* and even in the entire string.
535	* This is handled with the same efficiency as with UTF-32.
536	*
537	* Only the rare cases with supplementary code points are handled
538	* more slowly - but not too bad since this is an insertion anyway.
539	*/
540	if(i<=firstSupplementaryIndex) {
541	codeUnitIndex=i;
542	if(cpLength>`1`) {
543	firstSupplementaryIndex=codeUnitIndex;
544	} else {
545	++firstSupplementaryIndex;
546	}
547	} else {
548	codeUnitIndex=firstSupplementaryIndex;
549	U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
550	}
551
552	/ use the UChar index codeUnitIndex instead of the code point index i /
553	if(codeUnitIndex<destLength) {
554	uprv_memmove(dest+codeUnitIndex+cpLength,
555	dest+codeUnitIndex,
556	(destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
557	if(caseFlags!=NULL) {
558	uprv_memmove(caseFlags+codeUnitIndex+cpLength,
559	caseFlags+codeUnitIndex,
560	destLength-codeUnitIndex);
561	}
562	}
563	if(cpLength==`1`) {
564	/ BMP, insert one code unit /
565	dest[codeUnitIndex]=(UChar)n;
566	} else {
567	/ supplementary character, insert two code units /
568	dest[codeUnitIndex]=U16_LEAD(n);
569	dest[codeUnitIndex+`1`]=U16_TRAIL(n);
570	}
571	if(caseFlags!=NULL) {
572	/ Case of last character determines uppercase flag: /
573	caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-`1`]);
574	if(cpLength==`2`) {
575	caseFlags[codeUnitIndex+`1`]=FALSE;
576	}
577	}
578	}
579	destLength+=cpLength;
580	U_ASSERT(destLength>=`0`);
581	++i;
582	}
583
584	return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
585	}
586
587	/ ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? /
588
589	#endif /* #if !UCONFIG_NO_IDNA */
590

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/punycode.cpp