physfs_unicode.c source code [LOVE/libraries/physfs/physfs_unicode.c]

1	#define __PHYSICSFS_INTERNAL__
2	#include "physfs_internal.h"
3
4	#include "physfs_casefolding.h"
5
6
/*
 * From rfc3629, the UTF-8 spec:
 *  https://www.ietf.org/rfc/rfc3629.txt
 *
 *   Char. number range  |        UTF-8 octet sequence
 *      (hexadecimal)    |              (binary)
 *   --------------------+---------------------------------------------
 *   0000 0000-0000 007F | 0xxxxxxx
 *   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
 *   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 *   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 */
19
20
/*
 * This may not be the best value, but it's one that isn't represented
 *  in Unicode (0x10FFFF is the largest codepoint value). We return this
 *  value from __PHYSFS_utf8codepoint() if there are bogus bits in the
 *  stream. The conversion routines (PHYSFS_utf8ToUcs4() and friends) will
 *  turn this value into something reasonable (like a question mark), for
 *  text that wants to try to recover, whereas a validator such as
 *  utf8valid() can use the value to determine if a string has bad bits.
 */
30	#define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
31
/*
 * This is the codepoint we currently return when there were bogus bits in a
 *  UTF-8 string. May not fly in Asian locales?
 */
36	#define UNICODE_BOGUS_CHAR_CODEPOINT '?'
37
38	PHYSFS_uint32 __PHYSFS_utf8codepoint(const char **_str)
39	{
40	const char str = _str;
41	PHYSFS_uint32 retval = `0`;
42	PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str);
43	PHYSFS_uint32 octet2, octet3, octet4;
44
45	if (octet == `0`) / null terminator, end of string. /
46	return `0`;
47
48	else if (octet < `128`) / one octet char: 0 to 127 /
49	{
50	(_str)++; /* skip to next possible start of codepoint. /
51	return octet;
52	} / else if /
53
54	else if ((octet > `127`) && (octet < `192`)) / bad (starts with 10xxxxxx). /
55	{
56	/*
57	* Apparently each of these is supposed to be flagged as a bogus
58	* char, instead of just resyncing to the next valid codepoint.
59	*/
60	(_str)++; /* skip to next possible start of codepoint. /
61	return UNICODE_BOGUS_CHAR_VALUE;
62	} / else if /
63
64	else if (octet < `224`) / two octets /
65	{
66	(_str)++; /* advance at least one byte in case of an error /
67	octet -= (`128`+`64`);
68	octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
69	if ((octet2 & (`128`+`64`)) != `128`) / Format isn't 10xxxxxx? /
70	return UNICODE_BOGUS_CHAR_VALUE;
71
72	_str += `1`; /* skip to next possible start of codepoint. /
73	retval = ((octet << `6`) \| (octet2 - `128`));
74	if ((retval >= `0x80`) && (retval <= `0x7FF`))
75	return retval;
76	} / else if /
77
78	else if (octet < `240`) / three octets /
79	{
80	(_str)++; /* advance at least one byte in case of an error /
81	octet -= (`128`+`64`+`32`);
82	octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
83	if ((octet2 & (`128`+`64`)) != `128`) / Format isn't 10xxxxxx? /
84	return UNICODE_BOGUS_CHAR_VALUE;
85
86	octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
87	if ((octet3 & (`128`+`64`)) != `128`) / Format isn't 10xxxxxx? /
88	return UNICODE_BOGUS_CHAR_VALUE;
89
90	_str += `2`; /* skip to next possible start of codepoint. /
91	retval = ( ((octet << `12`)) \| ((octet2-`128`) << `6`) \| ((octet3-`128`)) );
92
93	/ There are seven "UTF-16 surrogates" that are illegal in UTF-8. /
94	switch (retval)
95	{
96	case `0xD800`:
97	case `0xDB7F`:
98	case `0xDB80`:
99	case `0xDBFF`:
100	case `0xDC00`:
101	case `0xDF80`:
102	case `0xDFFF`:
103	return UNICODE_BOGUS_CHAR_VALUE;
104	} / switch /
105
106	/ 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge. /
107	if ((retval >= `0x800`) && (retval <= `0xFFFD`))
108	return retval;
109	} / else if /
110
111	else if (octet < `248`) / four octets /
112	{
113	(_str)++; /* advance at least one byte in case of an error /
114	octet -= (`128`+`64`+`32`+`16`);
115	octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
116	if ((octet2 & (`128`+`64`)) != `128`) / Format isn't 10xxxxxx? /
117	return UNICODE_BOGUS_CHAR_VALUE;
118
119	octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
120	if ((octet3 & (`128`+`64`)) != `128`) / Format isn't 10xxxxxx? /
121	return UNICODE_BOGUS_CHAR_VALUE;
122
123	octet4 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
124	if ((octet4 & (`128`+`64`)) != `128`) / Format isn't 10xxxxxx? /
125	return UNICODE_BOGUS_CHAR_VALUE;
126
127	_str += `3`; /* skip to next possible start of codepoint. /
128	retval = ( ((octet << `18`)) \| ((octet2 - `128`) << `12`) \|
129	((octet3 - `128`) << `6`) \| ((octet4 - `128`)) );
130	if ((retval >= `0x10000`) && (retval <= `0x10FFFF`))
131	return retval;
132	} / else if /
133
134	/*
135	* Five and six octet sequences became illegal in rfc3629.
136	* We throw the codepoint away, but parse them to make sure we move
137	* ahead the right number of bytes and don't overflow the buffer.
138	*/
139
140	else if (octet < `252`) / five octets /
141	{
142	(_str)++; /* advance at least one byte in case of an error /
143	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
144	if ((octet & (`128`+`64`)) != `128`) / Format isn't 10xxxxxx? /
145	return UNICODE_BOGUS_CHAR_VALUE;
146
147	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
148	if ((octet & (`128`+`64`)) != `128`) / Format isn't 10xxxxxx? /
149	return UNICODE_BOGUS_CHAR_VALUE;
150
151	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
152	if ((octet & (`128`+`64`)) != `128`) / Format isn't 10xxxxxx? /
153	return UNICODE_BOGUS_CHAR_VALUE;
154
155	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
156	if ((octet & (`128`+`64`)) != `128`) / Format isn't 10xxxxxx? /
157	return UNICODE_BOGUS_CHAR_VALUE;
158
159	_str += `4`; /* skip to next possible start of codepoint. /
160	return UNICODE_BOGUS_CHAR_VALUE;
161	} / else if /
162
163	else / six octets /
164	{
165	(_str)++; /* advance at least one byte in case of an error /
166	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
167	if ((octet & (`128`+`64`)) != `128`) / Format isn't 10xxxxxx? /
168	return UNICODE_BOGUS_CHAR_VALUE;
169
170	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
171	if ((octet & (`128`+`64`)) != `128`) / Format isn't 10xxxxxx? /
172	return UNICODE_BOGUS_CHAR_VALUE;
173
174	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
175	if ((octet & (`128`+`64`)) != `128`) / Format isn't 10xxxxxx? /
176	return UNICODE_BOGUS_CHAR_VALUE;
177
178	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
179	if ((octet & (`128`+`64`)) != `128`) / Format isn't 10xxxxxx? /
180	return UNICODE_BOGUS_CHAR_VALUE;
181
182	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
183	if ((octet & (`128`+`64`)) != `128`) / Format isn't 10xxxxxx? /
184	return UNICODE_BOGUS_CHAR_VALUE;
185
186	_str += `6`; /* skip to next possible start of codepoint. /
187	return UNICODE_BOGUS_CHAR_VALUE;
188	} / else if /
189
190	return UNICODE_BOGUS_CHAR_VALUE;
191	} / __PHYSFS_utf8codepoint /
192
193	static inline PHYSFS_uint32 utf8codepoint(const char **_str)
194	{
195	return __PHYSFS_utf8codepoint(_str);
196	} / utf8codepoint /
197
198	static PHYSFS_uint32 utf16codepoint(const PHYSFS_uint16 **_str)
199	{
200	const PHYSFS_uint16 src = _str;
201	PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++);
202
203	if (cp == `0`) / null terminator, end of string. /
204	return `0`;
205	/ Orphaned second half of surrogate pair? /
206	else if ((cp >= `0xDC00`) && (cp <= `0xDFFF`))
207	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
208	else if ((cp >= `0xD800`) && (cp <= `0xDBFF`)) / start surrogate pair! /
209	{
210	const PHYSFS_uint32 pair = (PHYSFS_uint32) *src;
211	if (pair == `0`)
212	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
213	else if ((pair < `0xDC00`) \|\| (pair > `0xDFFF`))
214	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
215	else
216	{
217	src++; / eat the other surrogate. /
218	cp = `0x10000` + (((cp - `0xD800`) << `10`) \| (pair - `0xDC00`));
219	} / else /
220	} / else if /
221
222	*_str = src;
223	return cp;
224	} / utf16codepoint /
225
226	static PHYSFS_uint32 utf32codepoint(const PHYSFS_uint32 **_str)
227	{
228	const PHYSFS_uint32 src = _str;
229	PHYSFS_uint32 cp = *(src++);
230
231	if (cp == `0`) / null terminator, end of string. /
232	return `0`;
233	else if (cp > `0x10FFF`)
234	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
235
236	*_str = src;
237	return cp;
238	} / utf32codepoint /
239
240
241	void PHYSFS_utf8ToUcs4(const char src, PHYSFS_uint32 dst, PHYSFS_uint64 len)
242	{
243	len -= sizeof (PHYSFS_uint32); / save room for null char. /
244	while (len >= sizeof (PHYSFS_uint32))
245	{
246	PHYSFS_uint32 cp = __PHYSFS_utf8codepoint(&src);
247	if (cp == `0`)
248	break;
249	else if (cp == UNICODE_BOGUS_CHAR_VALUE)
250	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
251	*(dst++) = cp;
252	len -= sizeof (PHYSFS_uint32);
253	} / while /
254
255	*dst = `0`;
256	} / PHYSFS_utf8ToUcs4 /
257
258
259	void PHYSFS_utf8ToUcs2(const char src, PHYSFS_uint16 dst, PHYSFS_uint64 len)
260	{
261	len -= sizeof (PHYSFS_uint16); / save room for null char. /
262	while (len >= sizeof (PHYSFS_uint16))
263	{
264	PHYSFS_uint32 cp = __PHYSFS_utf8codepoint(&src);
265	if (cp == `0`)
266	break;
267	else if (cp == UNICODE_BOGUS_CHAR_VALUE)
268	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
269
270	if (cp > `0xFFFF`) / UTF-16 surrogates (bogus chars in UCS-2) /
271	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
272
273	*(dst++) = cp;
274	len -= sizeof (PHYSFS_uint16);
275	} / while /
276
277	*dst = `0`;
278	} / PHYSFS_utf8ToUcs2 /
279
280
281	void PHYSFS_utf8ToUtf16(const char src, PHYSFS_uint16 dst, PHYSFS_uint64 len)
282	{
283	len -= sizeof (PHYSFS_uint16); / save room for null char. /
284	while (len >= sizeof (PHYSFS_uint16))
285	{
286	PHYSFS_uint32 cp = __PHYSFS_utf8codepoint(&src);
287	if (cp == `0`)
288	break;
289	else if (cp == UNICODE_BOGUS_CHAR_VALUE)
290	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
291
292	if (cp > `0xFFFF`) / encode as surrogate pair /
293	{
294	if (len < (sizeof (PHYSFS_uint16) * `2`))
295	break; / not enough room for the pair, stop now. /
296
297	cp -= `0x10000`; / Make this a 20-bit value /
298
299	*(dst++) = `0xD800` + ((cp >> `10`) & `0x3FF`);
300	len -= sizeof (PHYSFS_uint16);
301
302	cp = `0xDC00` + (cp & `0x3FF`);
303	} / if /
304
305	*(dst++) = cp;
306	len -= sizeof (PHYSFS_uint16);
307	} / while /
308
309	*dst = `0`;
310	} / PHYSFS_utf8ToUtf16 /
311
312	static void utf8fromcodepoint(PHYSFS_uint32 cp, char *_dst, PHYSFS_uint64 _len)
313	{
314	char dst = _dst;
315	PHYSFS_uint64 len = *_len;
316
317	if (len == `0`)
318	return;
319
320	if (cp > `0x10FFFF`)
321	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
322	else if ((cp == `0xFFFE`) \|\| (cp == `0xFFFF`)) / illegal values. /
323	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
324	else
325	{
326	/ There are seven "UTF-16 surrogates" that are illegal in UTF-8. /
327	switch (cp)
328	{
329	case `0xD800`:
330	case `0xDB7F`:
331	case `0xDB80`:
332	case `0xDBFF`:
333	case `0xDC00`:
334	case `0xDF80`:
335	case `0xDFFF`:
336	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
337	} / switch /
338	} / else /
339
340	/ Do the encoding... /
341	if (cp < `0x80`)
342	{
343	(dst++) = (char*) cp;
344	len--;
345	} / if /
346
347	else if (cp < `0x800`)
348	{
349	if (len < `2`)
350	len = `0`;
351	else
352	{
353	(dst++) = (char*) ((cp >> `6`) \| `128` \| `64`);
354	(dst++) = (char*) (cp & `0x3F`) \| `128`;
355	len -= `2`;
356	} / else /
357	} / else if /
358
359	else if (cp < `0x10000`)
360	{
361	if (len < `3`)
362	len = `0`;
363	else
364	{
365	(dst++) = (char*) ((cp >> `12`) \| `128` \| `64` \| `32`);
366	(dst++) = (char*) ((cp >> `6`) & `0x3F`) \| `128`;
367	(dst++) = (char*) (cp & `0x3F`) \| `128`;
368	len -= `3`;
369	} / else /
370	} / else if /
371
372	else
373	{
374	if (len < `4`)
375	len = `0`;
376	else
377	{
378	(dst++) = (char*) ((cp >> `18`) \| `128` \| `64` \| `32` \| `16`);
379	(dst++) = (char*) ((cp >> `12`) & `0x3F`) \| `128`;
380	(dst++) = (char*) ((cp >> `6`) & `0x3F`) \| `128`;
381	(dst++) = (char*) (cp & `0x3F`) \| `128`;
382	len -= `4`;
383	} / else if /
384	} / else /
385
386	*_dst = dst;
387	*_len = len;
388	} / utf8fromcodepoint /
389
390	#define UTF8FROMTYPE(typ, src, dst, len) \
391	if (len == 0) return; \
392	len--; \
393	while (len) \
394	{ \
395	const PHYSFS_uint32 cp = (PHYSFS_uint32) ((typ) (*(src++))); \
396	if (cp == 0) break; \
397	utf8fromcodepoint(cp, &dst, &len); \
398	} \
399	*dst = '\0'; \
400
401	void PHYSFS_utf8FromUcs4(const PHYSFS_uint32 src, char* *dst, PHYSFS_uint64 len)
402	{
403	UTF8FROMTYPE(PHYSFS_uint32, src, dst, len);
404	} / PHYSFS_utf8FromUcs4 /
405
406	void PHYSFS_utf8FromUcs2(const PHYSFS_uint16 src, char* *dst, PHYSFS_uint64 len)
407	{
408	UTF8FROMTYPE(PHYSFS_uint64, src, dst, len);
409	} / PHYSFS_utf8FromUcs2 /
410
411	/ latin1 maps to unicode codepoints directly, we just utf-8 encode it. /
412	void PHYSFS_utf8FromLatin1(const char src, char* *dst, PHYSFS_uint64 len)
413	{
414	UTF8FROMTYPE(PHYSFS_uint8, src, dst, len);
415	} / PHYSFS_utf8FromLatin1 /
416
417	#undef UTF8FROMTYPE
418
419
420	void PHYSFS_utf8FromUtf16(const PHYSFS_uint16 src, char* *dst, PHYSFS_uint64 len)
421	{
422	if (len == `0`)
423	return;
424
425	len--;
426	while (len)
427	{
428	const PHYSFS_uint32 cp = utf16codepoint(&src);
429	if (!cp)
430	break;
431	utf8fromcodepoint(cp, &dst, &len);
432	} / while /
433
434	*dst = `'\0'`;
435	} / PHYSFS_utf8FromUtf16 /
436
437
438	int PHYSFS_caseFold(const PHYSFS_uint32 from, PHYSFS_uint32 *to)
439	{
440	int i;
441
442	if (from < `128`) / low-ASCII, easy! /
443	{
444	if ((from >= `'A'`) && (from <= `'Z'`))
445	*to = from - (`'A'` - `'a'`);
446	else
447	*to = from;
448	return `1`;
449	} / if /
450
451	else if (from <= `0xFFFF`)
452	{
453	const PHYSFS_uint8 hash = ((from ^ (from >> `8`)) & `0xFF`);
454	const PHYSFS_uint16 from16 = (PHYSFS_uint16) from;
455
456	{
457	const CaseFoldHashBucket1_16 *bucket = &case_fold_hash1_16[hash];
458	const int count = (int) bucket->count;
459	for (i = `0`; i < count; i++)
460	{
461	const CaseFoldMapping1_16 *mapping = &bucket->list[i];
462	if (mapping->from == from16)
463	{
464	*to = mapping->to0;
465	return `1`;
466	} / if /
467	} / for /
468	}
469
470	{
471	const CaseFoldHashBucket2_16 *bucket = &case_fold_hash2_16[hash & `15`];
472	const int count = (int) bucket->count;
473	for (i = `0`; i < count; i++)
474	{
475	const CaseFoldMapping2_16 *mapping = &bucket->list[i];
476	if (mapping->from == from16)
477	{
478	to[`0`] = mapping->to0;
479	to[`1`] = mapping->to1;
480	return `2`;
481	} / if /
482	} / for /
483	}
484
485	{
486	const CaseFoldHashBucket3_16 *bucket = &case_fold_hash3_16[hash & `3`];
487	const int count = (int) bucket->count;
488	for (i = `0`; i < count; i++)
489	{
490	const CaseFoldMapping3_16 *mapping = &bucket->list[i];
491	if (mapping->from == from16)
492	{
493	to[`0`] = mapping->to0;
494	to[`1`] = mapping->to1;
495	to[`2`] = mapping->to2;
496	return `3`;
497	} / if /
498	} / for /
499	}
500	} / else if /
501
502	else / codepoint that doesn't fit in 16 bits. /
503	{
504	const PHYSFS_uint8 hash = ((from ^ (from >> `8`)) & `0xFF`);
505	const CaseFoldHashBucket1_32 *bucket = &case_fold_hash1_32[hash & `15`];
506	const int count = (int) bucket->count;
507	for (i = `0`; i < count; i++)
508	{
509	const CaseFoldMapping1_32 *mapping = &bucket->list[i];
510	if (mapping->from == from)
511	{
512	*to = mapping->to0;
513	return `1`;
514	} / if /
515	} / for /
516	} / else /
517
518
519	/ Not found...there's no remapping for this codepoint. /
520	*to = from;
521	return `1`;
522	} / PHYSFS_caseFold /
523
524
525	#define UTFSTRICMP(bits) \
526	PHYSFS_uint32 folded1[3], folded2[3]; \
527	int head1 = 0, tail1 = 0, head2 = 0, tail2 = 0; \
528	while (1) { \
529	PHYSFS_uint32 cp1, cp2; \
530	if (head1 != tail1) { \
531	cp1 = folded1[tail1++]; \
532	} else { \
533	head1 = PHYSFS_caseFold(utf##bits##codepoint(&str1), folded1); \
534	cp1 = folded1[0]; \
535	tail1 = 1; \
536	} \
537	if (head2 != tail2) { \
538	cp2 = folded2[tail2++]; \
539	} else { \
540	head2 = PHYSFS_caseFold(utf##bits##codepoint(&str2), folded2); \
541	cp2 = folded2[0]; \
542	tail2 = 1; \
543	} \
544	if (cp1 < cp2) { \
545	return -1; \
546	} else if (cp1 > cp2) { \
547	return 1; \
548	} else if (cp1 == 0) { \
549	break; /* complete match. */ \
550	} \
551	} \
552	return 0
553
554	int PHYSFS_utf8stricmp(const char str1, const* char *str2)
555	{
556	UTFSTRICMP(`8`);
557	} / PHYSFS_utf8stricmp /
558
559	int PHYSFS_utf16stricmp(const PHYSFS_uint16 str1, const* PHYSFS_uint16 *str2)
560	{
561	UTFSTRICMP(`16`);
562	} / PHYSFS_utf16stricmp /
563
564	int PHYSFS_ucs4stricmp(const PHYSFS_uint32 str1, const* PHYSFS_uint32 *str2)
565	{
566	UTFSTRICMP(`32`);
567	} / PHYSFS_ucs4stricmp /
568
569	#undef UTFSTRICMP
570
571	/ end of physfs_unicode.c ... /
572
573

Browse the source code of LOVE/libraries/physfs/physfs_unicode.c