punycode.cpp source code [Godot/thirdparty/icu4c/common/punycode.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	*
6	* Copyright (C) 2002-2011, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*******************************************************************************
10	* file name: punycode.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2002jan31
16	* created by: Markus W. Scherer
17	*/
18
19
20	/ This ICU code derived from: /
21	/*
22	punycode.c 0.4.0 (2001-Nov-17-Sat)
23	http://www.cs.berkeley.edu/~amc/idn/
24	Adam M. Costello
25	http://www.nicemice.net/amc/
26
27	Disclaimer and license
28
29	Regarding this entire document or any portion of it (including
30	the pseudocode and C code), the author makes no guarantees and
31	is not responsible for any damage resulting from its use. The
32	author grants irrevocable permission to anyone to use, modify,
33	and distribute it in any way that does not diminish the rights
34	of anyone else to use, modify, and distribute it, provided that
35	redistributed derivative works do not contain misleading author or
36	version information. Derivative works need not be licensed under
37	similar terms.
38	*/
39	/*
40	* ICU modifications:
41	* - ICU data types and coding conventions
42	* - ICU string buffer handling with implicit source lengths
43	* and destination preflighting
44	* - UTF-16 handling
45	*/
46
47	#include "unicode/utypes.h"
48
49	#if !UCONFIG_NO_IDNA
50
51	#include "unicode/ustring.h"
52	#include "unicode/utf.h"
53	#include "unicode/utf16.h"
54	#include "ustr_imp.h"
55	#include "cstring.h"
56	#include "cmemory.h"
57	#include "punycode.h"
58	#include "uassert.h"
59
60
61	/ Punycode ----------------------------------------------------------------- /
62
63	/ Punycode parameters for Bootstring /
64	#define BASE 36
65	#define TMIN 1
66	#define TMAX 26
67	#define SKEW 38
68	#define DAMP 700
69	#define INITIAL_BIAS 72
70	#define INITIAL_N 0x80
71
72	/ "Basic" Unicode/ASCII code points /
73	#define _HYPHEN 0X2d
74	#define DELIMITER _HYPHEN
75
76	#define _ZERO_ 0X30
77	#define _NINE 0x39
78
79	#define _SMALL_A 0X61
80	#define _SMALL_Z 0X7a
81
82	#define _CAPITAL_A 0X41
83	#define _CAPITAL_Z 0X5a
84
85	#define IS_BASIC(c) ((c)<0x80)
86	#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
87
88	/**
89	* digitToBasic() returns the basic code point whose value
90	* (when used for representing integers) is d, which must be in the
91	* range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
92	* nonzero, in which case the uppercase form is used.
93	*/
94	static inline char
95	digitToBasic(int32_t digit, UBool uppercase) {
96	/ 0..25 map to ASCII a..z or A..Z /
97	/ 26..35 map to ASCII 0..9 /
98	if(digit<`26`) {
99	if(uppercase) {
100	return (char)(_CAPITAL_A+digit);
101	} else {
102	return (char)(_SMALL_A+digit);
103	}
104	} else {
105	return (char)((_ZERO_-`26`)+digit);
106	}
107	}
108
109	/**
110	* @return the numeric value of a basic code point (for use in representing integers)
111	* in the range 0 to BASE-1, or a negative value if cp is invalid.
112	*/
113	static int32_t decodeDigit(int32_t cp) {
114	if(cp<=u`'Z'`) {
115	if(cp<=u`'9'`) {
116	if(cp<u`'0'`) {
117	return -`1`;
118	} else {
119	return cp-u`'0'`+`26`; // 0..9 -> 26..35
120	}
121	} else {
122	return cp-u`'A'`; // A-Z -> 0..25
123	}
124	} else if(cp<=u`'z'`) {
125	return cp-`'a'`; // a..z -> 0..25
126	} else {
127	return -`1`;
128	}
129	}
130
131	static inline char
132	asciiCaseMap(char b, UBool uppercase) {
133	if(uppercase) {
134	if(_SMALL_A<=b && b<=_SMALL_Z) {
135	b-=(_SMALL_A-_CAPITAL_A);
136	}
137	} else {
138	if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
139	b+=(_SMALL_A-_CAPITAL_A);
140	}
141	}
142	return b;
143	}
144
145	/ Punycode-specific Bootstring code ---------------------------------------- /
146
147	/*
148	* The following code omits the {parts} of the pseudo-algorithm in the spec
149	* that are not used with the Punycode parameter set.
150	*/
151
152	/ Bias adaptation function. /
153	static int32_t
154	adaptBias(int32_t delta, int32_t length, UBool firstTime) {
155	int32_t count;
156
157	if(firstTime) {
158	delta/=DAMP;
159	} else {
160	delta/=`2`;
161	}
162
163	delta+=delta/length;
164	for(count=`0`; delta>((BASE-TMIN)*TMAX)/`2`; count+=BASE) {
165	delta/=(BASE-TMIN);
166	}
167
168	return count+(((BASE-TMIN+`1`)*delta)/(delta+SKEW));
169	}
170
171	namespace {
172
173	// ICU-13727: Limit input length for n^2 algorithm
174	// where well-formed strings are at most 59 characters long.
175	constexpr int32_t ENCODE_MAX_CODE_UNITS=`1000`;
176	constexpr int32_t DECODE_MAX_CHARS=`2000`;
177
178	} // namespace
179
180	// encode
181	U_CAPI int32_t
182	u_strToPunycode(const char16_t *src, int32_t srcLength,
183	char16_t *dest, int32_t destCapacity,
184	const UBool *caseFlags,
185	UErrorCode *pErrorCode) {
186
187	int32_t cpBuffer[ENCODE_MAX_CODE_UNITS];
188	int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
189	char16_t c, c2;
190
191	/ argument checking /
192	if(pErrorCode==nullptr \|\| U_FAILURE(*pErrorCode)) {
193	return `0`;
194	}
195
196	if(src==nullptr \|\| srcLength<-`1` \|\| (dest==nullptr && destCapacity!=`0`)) {
197	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
198	return `0`;
199	}
200	if (srcLength>ENCODE_MAX_CODE_UNITS) {
201	*pErrorCode=U_INPUT_TOO_LONG_ERROR;
202	return `0`;
203	}
204
205	/*
206	* Handle the basic code points and
207	* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
208	*/
209	srcCPCount=destLength=`0`;
210	if(srcLength==-`1`) {
211	/ NUL-terminated input /
212	for(j=`0`; / no condition /; ++j) {
213	if((c=src[j])==`0`) {
214	break;
215	}
216	if(j>=ENCODE_MAX_CODE_UNITS) {
217	*pErrorCode=U_INPUT_TOO_LONG_ERROR;
218	return `0`;
219	}
220	if(IS_BASIC(c)) {
221	cpBuffer[srcCPCount++]=`0`;
222	if(destLength<destCapacity) {
223	dest[destLength]=
224	caseFlags!=nullptr ?
225	asciiCaseMap((char)c, caseFlags[j]) :
226	(char)c;
227	}
228	++destLength;
229	} else {
230	n=(caseFlags!=nullptr && caseFlags[j])<<`31L`;
231	if(U16_IS_SINGLE(c)) {
232	n\|=c;
233	} else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+`1`])) {
234	++j;
235	n\|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
236	} else {
237	/ error: unmatched surrogate /
238	*pErrorCode=U_INVALID_CHAR_FOUND;
239	return `0`;
240	}
241	cpBuffer[srcCPCount++]=n;
242	}
243	}
244	} else {
245	/ length-specified input /
246	for(j=`0`; j<srcLength; ++j) {
247	c=src[j];
248	if(IS_BASIC(c)) {
249	cpBuffer[srcCPCount++]=`0`;
250	if(destLength<destCapacity) {
251	dest[destLength]=
252	caseFlags!=nullptr ?
253	asciiCaseMap((char)c, caseFlags[j]) :
254	(char)c;
255	}
256	++destLength;
257	} else {
258	n=(caseFlags!=nullptr && caseFlags[j])<<`31L`;
259	if(U16_IS_SINGLE(c)) {
260	n\|=c;
261	} else if(U16_IS_LEAD(c) && (j+`1`)<srcLength && U16_IS_TRAIL(c2=src[j+`1`])) {
262	++j;
263	n\|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
264	} else {
265	/ error: unmatched surrogate /
266	*pErrorCode=U_INVALID_CHAR_FOUND;
267	return `0`;
268	}
269	cpBuffer[srcCPCount++]=n;
270	}
271	}
272	}
273
274	/ Finish the basic string - if it is not empty - with a delimiter. /
275	basicLength=destLength;
276	if(basicLength>`0`) {
277	if(destLength<destCapacity) {
278	dest[destLength]=DELIMITER;
279	}
280	++destLength;
281	}
282
283	/*
284	* handledCPCount is the number of code points that have been handled
285	* basicLength is the number of basic code points
286	* destLength is the number of chars that have been output
287	*/
288
289	/ Initialize the state: /
290	n=INITIAL_N;
291	delta=`0`;
292	bias=INITIAL_BIAS;
293
294	/ Main encoding loop: /
295	for(handledCPCount=basicLength; handledCPCount<srcCPCount; / no op /) {
296	/*
297	* All non-basic code points < n have been handled already.
298	* Find the next larger one:
299	*/
300	for(m=`0x7fffffff`, j=`0`; j<srcCPCount; ++j) {
301	q=cpBuffer[j]&`0x7fffffff`; / remove case flag from the sign bit /
302	if(n<=q && q<m) {
303	m=q;
304	}
305	}
306
307	/*
308	* Increase delta enough to advance the decoder's
309	* <n,i> state to <m,0>, but guard against overflow:
310	*/
311	if(m-n>(`0x7fffffff`-handledCPCount-delta)/(handledCPCount+`1`)) {
312	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
313	return `0`;
314	}
315	delta+=(m-n)*(handledCPCount+`1`);
316	n=m;
317
318	/ Encode a sequence of same code points n /
319	for(j=`0`; j<srcCPCount; ++j) {
320	q=cpBuffer[j]&`0x7fffffff`; / remove case flag from the sign bit /
321	if(q<n) {
322	++delta;
323	} else if(q==n) {
324	/ Represent delta as a generalized variable-length integer: /
325	for(q=delta, k=BASE; / no condition /; k+=BASE) {
326
327	/* RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt*
328
329	t=k-bias;
330	if(t<TMIN) {
331	t=TMIN;
332	} else if(t>TMAX) {
333	t=TMAX;
334	}
335	*/
336
337	t=k-bias;
338	if(t<TMIN) {
339	t=TMIN;
340	} else if(k>=(bias+TMAX)) {
341	t=TMAX;
342	}
343
344	if(q<t) {
345	break;
346	}
347
348	if(destLength<destCapacity) {
349	dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), `0`);
350	}
351	++destLength;
352	q=(q-t)/(BASE-t);
353	}
354
355	if(destLength<destCapacity) {
356	dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<`0`));
357	}
358	++destLength;
359	bias=adaptBias(delta, handledCPCount+`1`, (UBool)(handledCPCount==basicLength));
360	delta=`0`;
361	++handledCPCount;
362	}
363	}
364
365	++delta;
366	++n;
367	}
368
369	return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
370	}
371
372	// decode
373	U_CAPI int32_t
374	u_strFromPunycode(const char16_t *src, int32_t srcLength,
375	char16_t *dest, int32_t destCapacity,
376	UBool *caseFlags,
377	UErrorCode *pErrorCode) {
378	int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
379	destCPCount, firstSupplementaryIndex, cpLength;
380	char16_t b;
381
382	/ argument checking /
383	if(pErrorCode==nullptr \|\| U_FAILURE(*pErrorCode)) {
384	return `0`;
385	}
386
387	if(src==nullptr \|\| srcLength<-`1` \|\| (dest==nullptr && destCapacity!=`0`)) {
388	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
389	return `0`;
390	}
391
392	if(srcLength==-`1`) {
393	srcLength=u_strlen(src);
394	}
395	if (srcLength>DECODE_MAX_CHARS) {
396	*pErrorCode=U_INPUT_TOO_LONG_ERROR;
397	return `0`;
398	}
399
400	/*
401	* Handle the basic code points:
402	* Let basicLength be the number of input code points
403	* before the last delimiter, or 0 if there is none,
404	* then copy the first basicLength code points to the output.
405	*
406	* The two following loops iterate backward.
407	*/
408	for(j=srcLength; j>`0`;) {
409	if(src[--j]==DELIMITER) {
410	break;
411	}
412	}
413	destLength=basicLength=destCPCount=j;
414	U_ASSERT(destLength>=`0`);
415
416	while(j>`0`) {
417	b=src[--j];
418	if(!IS_BASIC(b)) {
419	*pErrorCode=U_INVALID_CHAR_FOUND;
420	return `0`;
421	}
422
423	if(j<destCapacity) {
424	dest[j]=(char16_t)b;
425
426	if(caseFlags!=nullptr) {
427	caseFlags[j]=IS_BASIC_UPPERCASE(b);
428	}
429	}
430	}
431
432	/ Initialize the state: /
433	n=INITIAL_N;
434	i=`0`;
435	bias=INITIAL_BIAS;
436	firstSupplementaryIndex=`1000000000`;
437
438	/*
439	* Main decoding loop:
440	* Start just after the last delimiter if any
441	* basic code points were copied; start at the beginning otherwise.
442	*/
443	for(in=basicLength>`0` ? basicLength+`1` : `0`; in<srcLength; / no op /) {
444	/*
445	* in is the index of the next character to be consumed, and
446	* destCPCount is the number of code points in the output array.
447	*
448	* Decode a generalized variable-length integer into delta,
449	* which gets added to i. The overflow checking is easier
450	* if we increase i as we go, then subtract off its starting
451	* value at the end to obtain delta.
452	*/
453	for(oldi=i, w=`1`, k=BASE; / no condition /; k+=BASE) {
454	if(in>=srcLength) {
455	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
456	return `0`;
457	}
458
459	digit=decodeDigit(src[in++]);
460	if(digit<`0`) {
461	*pErrorCode=U_INVALID_CHAR_FOUND;
462	return `0`;
463	}
464	if(digit>(`0x7fffffff`-i)/w) {
465	/ integer overflow /
466	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
467	return `0`;
468	}
469
470	i+=digit*w;
471	/* RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt*
472	t=k-bias;
473	if(t<TMIN) {
474	t=TMIN;
475	} else if(t>TMAX) {
476	t=TMAX;
477	}
478	*/
479	t=k-bias;
480	if(t<TMIN) {
481	t=TMIN;
482	} else if(k>=(bias+TMAX)) {
483	t=TMAX;
484	}
485	if(digit<t) {
486	break;
487	}
488
489	if(w>`0x7fffffff`/(BASE-t)) {
490	/ integer overflow /
491	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
492	return `0`;
493	}
494	w*=BASE-t;
495	}
496
497	/*
498	* Modification from sample code:
499	* Increments destCPCount here,
500	* where needed instead of in for() loop tail.
501	*/
502	++destCPCount;
503	bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==`0`));
504
505	/*
506	* i was supposed to wrap around from (incremented) destCPCount to 0,
507	* incrementing n each time, so we'll fix that now:
508	*/
509	if(i/destCPCount>(`0x7fffffff`-n)) {
510	/ integer overflow /
511	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
512	return `0`;
513	}
514
515	n+=i/destCPCount;
516	i%=destCPCount;
517	/ not needed for Punycode: /
518	/ if (decode_digit(n) <= BASE) return punycode_invalid_input; /
519
520	if(n>`0x10ffff` \|\| U_IS_SURROGATE(n)) {
521	/ Unicode code point overflow /
522	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
523	return `0`;
524	}
525
526	/ Insert n at position i of the output: /
527	cpLength=U16_LENGTH(n);
528	if(dest!=nullptr && ((destLength+cpLength)<=destCapacity)) {
529	int32_t codeUnitIndex;
530
531	/*
532	* Handle indexes when supplementary code points are present.
533	*
534	* In almost all cases, there will be only BMP code points before i
535	* and even in the entire string.
536	* This is handled with the same efficiency as with UTF-32.
537	*
538	* Only the rare cases with supplementary code points are handled
539	* more slowly - but not too bad since this is an insertion anyway.
540	*/
541	if(i<=firstSupplementaryIndex) {
542	codeUnitIndex=i;
543	if(cpLength>`1`) {
544	firstSupplementaryIndex=codeUnitIndex;
545	} else {
546	++firstSupplementaryIndex;
547	}
548	} else {
549	codeUnitIndex=firstSupplementaryIndex;
550	U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
551	}
552
553	/ use the char16_t index codeUnitIndex instead of the code point index i /
554	if(codeUnitIndex<destLength) {
555	uprv_memmove(dest+codeUnitIndex+cpLength,
556	dest+codeUnitIndex,
557	(destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
558	if(caseFlags!=nullptr) {
559	uprv_memmove(caseFlags+codeUnitIndex+cpLength,
560	caseFlags+codeUnitIndex,
561	destLength-codeUnitIndex);
562	}
563	}
564	if(cpLength==`1`) {
565	/ BMP, insert one code unit /
566	dest[codeUnitIndex]=(char16_t)n;
567	} else {
568	/ supplementary character, insert two code units /
569	dest[codeUnitIndex]=U16_LEAD(n);
570	dest[codeUnitIndex+`1`]=U16_TRAIL(n);
571	}
572	if(caseFlags!=nullptr) {
573	/ Case of last character determines uppercase flag: /
574	caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-`1`]);
575	if(cpLength==`2`) {
576	caseFlags[codeUnitIndex+`1`]=false;
577	}
578	}
579	}
580	destLength+=cpLength;
581	U_ASSERT(destLength>=`0`);
582	++i;
583	}
584
585	return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
586	}
587
588	/ ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? /
589
590	#endif /* #if !UCONFIG_NO_IDNA */
591

Browse the source code of Godot/thirdparty/icu4c/common/punycode.cpp