1#define __PHYSICSFS_INTERNAL__
2#include "physfs_internal.h"
3
4#include "physfs_casefolding.h"
5
6
7/*
8 * From rfc3629, the UTF-8 spec:
9 * https://www.ietf.org/rfc/rfc3629.txt
10 *
11 * Char. number range | UTF-8 octet sequence
12 * (hexadecimal) | (binary)
13 * --------------------+---------------------------------------------
14 * 0000 0000-0000 007F | 0xxxxxxx
15 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
16 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
17 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
18 */
19
20
21/*
 * This may not be the best value, but it's one that isn't represented
 * in Unicode (0x10FFFF is the largest codepoint value). We return this
 * value from __PHYSFS_utf8codepoint() if there are bogus bits in the
 * stream. Callers that want to recover (PHYSFS_utf8ToUcs4() and friends)
 * turn this value into something reasonable (like a question mark),
 * whereas a validator such as utf8valid() can use the value to determine
 * if a string has bad bits.
29 */
30#define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
31
32/*
 * This is the codepoint we currently substitute when there were bogus bits
 * in an input string, or when a codepoint can't be encoded. May not fly in
 * Asian locales?
35 */
36#define UNICODE_BOGUS_CHAR_CODEPOINT '?'
37
38PHYSFS_uint32 __PHYSFS_utf8codepoint(const char **_str)
39{
40 const char *str = *_str;
41 PHYSFS_uint32 retval = 0;
42 PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str);
43 PHYSFS_uint32 octet2, octet3, octet4;
44
45 if (octet == 0) /* null terminator, end of string. */
46 return 0;
47
48 else if (octet < 128) /* one octet char: 0 to 127 */
49 {
50 (*_str)++; /* skip to next possible start of codepoint. */
51 return octet;
52 } /* else if */
53
54 else if ((octet > 127) && (octet < 192)) /* bad (starts with 10xxxxxx). */
55 {
56 /*
57 * Apparently each of these is supposed to be flagged as a bogus
58 * char, instead of just resyncing to the next valid codepoint.
59 */
60 (*_str)++; /* skip to next possible start of codepoint. */
61 return UNICODE_BOGUS_CHAR_VALUE;
62 } /* else if */
63
64 else if (octet < 224) /* two octets */
65 {
66 (*_str)++; /* advance at least one byte in case of an error */
67 octet -= (128+64);
68 octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
69 if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
70 return UNICODE_BOGUS_CHAR_VALUE;
71
72 *_str += 1; /* skip to next possible start of codepoint. */
73 retval = ((octet << 6) | (octet2 - 128));
74 if ((retval >= 0x80) && (retval <= 0x7FF))
75 return retval;
76 } /* else if */
77
78 else if (octet < 240) /* three octets */
79 {
80 (*_str)++; /* advance at least one byte in case of an error */
81 octet -= (128+64+32);
82 octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
83 if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
84 return UNICODE_BOGUS_CHAR_VALUE;
85
86 octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
87 if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
88 return UNICODE_BOGUS_CHAR_VALUE;
89
90 *_str += 2; /* skip to next possible start of codepoint. */
91 retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
92
93 /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
94 switch (retval)
95 {
96 case 0xD800:
97 case 0xDB7F:
98 case 0xDB80:
99 case 0xDBFF:
100 case 0xDC00:
101 case 0xDF80:
102 case 0xDFFF:
103 return UNICODE_BOGUS_CHAR_VALUE;
104 } /* switch */
105
106 /* 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge. */
107 if ((retval >= 0x800) && (retval <= 0xFFFD))
108 return retval;
109 } /* else if */
110
111 else if (octet < 248) /* four octets */
112 {
113 (*_str)++; /* advance at least one byte in case of an error */
114 octet -= (128+64+32+16);
115 octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
116 if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
117 return UNICODE_BOGUS_CHAR_VALUE;
118
119 octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
120 if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
121 return UNICODE_BOGUS_CHAR_VALUE;
122
123 octet4 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
124 if ((octet4 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
125 return UNICODE_BOGUS_CHAR_VALUE;
126
127 *_str += 3; /* skip to next possible start of codepoint. */
128 retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
129 ((octet3 - 128) << 6) | ((octet4 - 128)) );
130 if ((retval >= 0x10000) && (retval <= 0x10FFFF))
131 return retval;
132 } /* else if */
133
134 /*
135 * Five and six octet sequences became illegal in rfc3629.
136 * We throw the codepoint away, but parse them to make sure we move
137 * ahead the right number of bytes and don't overflow the buffer.
138 */
139
140 else if (octet < 252) /* five octets */
141 {
142 (*_str)++; /* advance at least one byte in case of an error */
143 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
144 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
145 return UNICODE_BOGUS_CHAR_VALUE;
146
147 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
148 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
149 return UNICODE_BOGUS_CHAR_VALUE;
150
151 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
152 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
153 return UNICODE_BOGUS_CHAR_VALUE;
154
155 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
156 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
157 return UNICODE_BOGUS_CHAR_VALUE;
158
159 *_str += 4; /* skip to next possible start of codepoint. */
160 return UNICODE_BOGUS_CHAR_VALUE;
161 } /* else if */
162
163 else /* six octets */
164 {
165 (*_str)++; /* advance at least one byte in case of an error */
166 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
167 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
168 return UNICODE_BOGUS_CHAR_VALUE;
169
170 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
171 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
172 return UNICODE_BOGUS_CHAR_VALUE;
173
174 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
175 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
176 return UNICODE_BOGUS_CHAR_VALUE;
177
178 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
179 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
180 return UNICODE_BOGUS_CHAR_VALUE;
181
182 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
183 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
184 return UNICODE_BOGUS_CHAR_VALUE;
185
186 *_str += 6; /* skip to next possible start of codepoint. */
187 return UNICODE_BOGUS_CHAR_VALUE;
188 } /* else if */
189
190 return UNICODE_BOGUS_CHAR_VALUE;
191} /* __PHYSFS_utf8codepoint */
192
193static inline PHYSFS_uint32 utf8codepoint(const char **_str)
194{
195 return __PHYSFS_utf8codepoint(_str);
196} /* utf8codepoint */
197
198static PHYSFS_uint32 utf16codepoint(const PHYSFS_uint16 **_str)
199{
200 const PHYSFS_uint16 *src = *_str;
201 PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++);
202
203 if (cp == 0) /* null terminator, end of string. */
204 return 0;
205 /* Orphaned second half of surrogate pair? */
206 else if ((cp >= 0xDC00) && (cp <= 0xDFFF))
207 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
208 else if ((cp >= 0xD800) && (cp <= 0xDBFF)) /* start surrogate pair! */
209 {
210 const PHYSFS_uint32 pair = (PHYSFS_uint32) *src;
211 if (pair == 0)
212 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
213 else if ((pair < 0xDC00) || (pair > 0xDFFF))
214 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
215 else
216 {
217 src++; /* eat the other surrogate. */
218 cp = 0x10000 + (((cp - 0xD800) << 10) | (pair - 0xDC00));
219 } /* else */
220 } /* else if */
221
222 *_str = src;
223 return cp;
224} /* utf16codepoint */
225
226static PHYSFS_uint32 utf32codepoint(const PHYSFS_uint32 **_str)
227{
228 const PHYSFS_uint32 *src = *_str;
229 PHYSFS_uint32 cp = *(src++);
230
231 if (cp == 0) /* null terminator, end of string. */
232 return 0;
233 else if (cp > 0x10FFF)
234 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
235
236 *_str = src;
237 return cp;
238} /* utf32codepoint */
239
240
241void PHYSFS_utf8ToUcs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
242{
243 len -= sizeof (PHYSFS_uint32); /* save room for null char. */
244 while (len >= sizeof (PHYSFS_uint32))
245 {
246 PHYSFS_uint32 cp = __PHYSFS_utf8codepoint(&src);
247 if (cp == 0)
248 break;
249 else if (cp == UNICODE_BOGUS_CHAR_VALUE)
250 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
251 *(dst++) = cp;
252 len -= sizeof (PHYSFS_uint32);
253 } /* while */
254
255 *dst = 0;
256} /* PHYSFS_utf8ToUcs4 */
257
258
259void PHYSFS_utf8ToUcs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
260{
261 len -= sizeof (PHYSFS_uint16); /* save room for null char. */
262 while (len >= sizeof (PHYSFS_uint16))
263 {
264 PHYSFS_uint32 cp = __PHYSFS_utf8codepoint(&src);
265 if (cp == 0)
266 break;
267 else if (cp == UNICODE_BOGUS_CHAR_VALUE)
268 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
269
270 if (cp > 0xFFFF) /* UTF-16 surrogates (bogus chars in UCS-2) */
271 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
272
273 *(dst++) = cp;
274 len -= sizeof (PHYSFS_uint16);
275 } /* while */
276
277 *dst = 0;
278} /* PHYSFS_utf8ToUcs2 */
279
280
281void PHYSFS_utf8ToUtf16(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
282{
283 len -= sizeof (PHYSFS_uint16); /* save room for null char. */
284 while (len >= sizeof (PHYSFS_uint16))
285 {
286 PHYSFS_uint32 cp = __PHYSFS_utf8codepoint(&src);
287 if (cp == 0)
288 break;
289 else if (cp == UNICODE_BOGUS_CHAR_VALUE)
290 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
291
292 if (cp > 0xFFFF) /* encode as surrogate pair */
293 {
294 if (len < (sizeof (PHYSFS_uint16) * 2))
295 break; /* not enough room for the pair, stop now. */
296
297 cp -= 0x10000; /* Make this a 20-bit value */
298
299 *(dst++) = 0xD800 + ((cp >> 10) & 0x3FF);
300 len -= sizeof (PHYSFS_uint16);
301
302 cp = 0xDC00 + (cp & 0x3FF);
303 } /* if */
304
305 *(dst++) = cp;
306 len -= sizeof (PHYSFS_uint16);
307 } /* while */
308
309 *dst = 0;
310} /* PHYSFS_utf8ToUtf16 */
311
312static void utf8fromcodepoint(PHYSFS_uint32 cp, char **_dst, PHYSFS_uint64 *_len)
313{
314 char *dst = *_dst;
315 PHYSFS_uint64 len = *_len;
316
317 if (len == 0)
318 return;
319
320 if (cp > 0x10FFFF)
321 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
322 else if ((cp == 0xFFFE) || (cp == 0xFFFF)) /* illegal values. */
323 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
324 else
325 {
326 /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
327 switch (cp)
328 {
329 case 0xD800:
330 case 0xDB7F:
331 case 0xDB80:
332 case 0xDBFF:
333 case 0xDC00:
334 case 0xDF80:
335 case 0xDFFF:
336 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
337 } /* switch */
338 } /* else */
339
340 /* Do the encoding... */
341 if (cp < 0x80)
342 {
343 *(dst++) = (char) cp;
344 len--;
345 } /* if */
346
347 else if (cp < 0x800)
348 {
349 if (len < 2)
350 len = 0;
351 else
352 {
353 *(dst++) = (char) ((cp >> 6) | 128 | 64);
354 *(dst++) = (char) (cp & 0x3F) | 128;
355 len -= 2;
356 } /* else */
357 } /* else if */
358
359 else if (cp < 0x10000)
360 {
361 if (len < 3)
362 len = 0;
363 else
364 {
365 *(dst++) = (char) ((cp >> 12) | 128 | 64 | 32);
366 *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
367 *(dst++) = (char) (cp & 0x3F) | 128;
368 len -= 3;
369 } /* else */
370 } /* else if */
371
372 else
373 {
374 if (len < 4)
375 len = 0;
376 else
377 {
378 *(dst++) = (char) ((cp >> 18) | 128 | 64 | 32 | 16);
379 *(dst++) = (char) ((cp >> 12) & 0x3F) | 128;
380 *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
381 *(dst++) = (char) (cp & 0x3F) | 128;
382 len -= 4;
383 } /* else if */
384 } /* else */
385
386 *_dst = dst;
387 *_len = len;
388} /* utf8fromcodepoint */
389
390#define UTF8FROMTYPE(typ, src, dst, len) \
391 if (len == 0) return; \
392 len--; \
393 while (len) \
394 { \
395 const PHYSFS_uint32 cp = (PHYSFS_uint32) ((typ) (*(src++))); \
396 if (cp == 0) break; \
397 utf8fromcodepoint(cp, &dst, &len); \
398 } \
399 *dst = '\0'; \
400
401void PHYSFS_utf8FromUcs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
402{
403 UTF8FROMTYPE(PHYSFS_uint32, src, dst, len);
404} /* PHYSFS_utf8FromUcs4 */
405
406void PHYSFS_utf8FromUcs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
407{
408 UTF8FROMTYPE(PHYSFS_uint64, src, dst, len);
409} /* PHYSFS_utf8FromUcs2 */
410
411/* latin1 maps to unicode codepoints directly, we just utf-8 encode it. */
412void PHYSFS_utf8FromLatin1(const char *src, char *dst, PHYSFS_uint64 len)
413{
414 UTF8FROMTYPE(PHYSFS_uint8, src, dst, len);
415} /* PHYSFS_utf8FromLatin1 */
416
417#undef UTF8FROMTYPE
418
419
420void PHYSFS_utf8FromUtf16(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
421{
422 if (len == 0)
423 return;
424
425 len--;
426 while (len)
427 {
428 const PHYSFS_uint32 cp = utf16codepoint(&src);
429 if (!cp)
430 break;
431 utf8fromcodepoint(cp, &dst, &len);
432 } /* while */
433
434 *dst = '\0';
435} /* PHYSFS_utf8FromUtf16 */
436
437
438int PHYSFS_caseFold(const PHYSFS_uint32 from, PHYSFS_uint32 *to)
439{
440 int i;
441
442 if (from < 128) /* low-ASCII, easy! */
443 {
444 if ((from >= 'A') && (from <= 'Z'))
445 *to = from - ('A' - 'a');
446 else
447 *to = from;
448 return 1;
449 } /* if */
450
451 else if (from <= 0xFFFF)
452 {
453 const PHYSFS_uint8 hash = ((from ^ (from >> 8)) & 0xFF);
454 const PHYSFS_uint16 from16 = (PHYSFS_uint16) from;
455
456 {
457 const CaseFoldHashBucket1_16 *bucket = &case_fold_hash1_16[hash];
458 const int count = (int) bucket->count;
459 for (i = 0; i < count; i++)
460 {
461 const CaseFoldMapping1_16 *mapping = &bucket->list[i];
462 if (mapping->from == from16)
463 {
464 *to = mapping->to0;
465 return 1;
466 } /* if */
467 } /* for */
468 }
469
470 {
471 const CaseFoldHashBucket2_16 *bucket = &case_fold_hash2_16[hash & 15];
472 const int count = (int) bucket->count;
473 for (i = 0; i < count; i++)
474 {
475 const CaseFoldMapping2_16 *mapping = &bucket->list[i];
476 if (mapping->from == from16)
477 {
478 to[0] = mapping->to0;
479 to[1] = mapping->to1;
480 return 2;
481 } /* if */
482 } /* for */
483 }
484
485 {
486 const CaseFoldHashBucket3_16 *bucket = &case_fold_hash3_16[hash & 3];
487 const int count = (int) bucket->count;
488 for (i = 0; i < count; i++)
489 {
490 const CaseFoldMapping3_16 *mapping = &bucket->list[i];
491 if (mapping->from == from16)
492 {
493 to[0] = mapping->to0;
494 to[1] = mapping->to1;
495 to[2] = mapping->to2;
496 return 3;
497 } /* if */
498 } /* for */
499 }
500 } /* else if */
501
502 else /* codepoint that doesn't fit in 16 bits. */
503 {
504 const PHYSFS_uint8 hash = ((from ^ (from >> 8)) & 0xFF);
505 const CaseFoldHashBucket1_32 *bucket = &case_fold_hash1_32[hash & 15];
506 const int count = (int) bucket->count;
507 for (i = 0; i < count; i++)
508 {
509 const CaseFoldMapping1_32 *mapping = &bucket->list[i];
510 if (mapping->from == from)
511 {
512 *to = mapping->to0;
513 return 1;
514 } /* if */
515 } /* for */
516 } /* else */
517
518
519 /* Not found...there's no remapping for this codepoint. */
520 *to = from;
521 return 1;
522} /* PHYSFS_caseFold */
523
524
525#define UTFSTRICMP(bits) \
526 PHYSFS_uint32 folded1[3], folded2[3]; \
527 int head1 = 0, tail1 = 0, head2 = 0, tail2 = 0; \
528 while (1) { \
529 PHYSFS_uint32 cp1, cp2; \
530 if (head1 != tail1) { \
531 cp1 = folded1[tail1++]; \
532 } else { \
533 head1 = PHYSFS_caseFold(utf##bits##codepoint(&str1), folded1); \
534 cp1 = folded1[0]; \
535 tail1 = 1; \
536 } \
537 if (head2 != tail2) { \
538 cp2 = folded2[tail2++]; \
539 } else { \
540 head2 = PHYSFS_caseFold(utf##bits##codepoint(&str2), folded2); \
541 cp2 = folded2[0]; \
542 tail2 = 1; \
543 } \
544 if (cp1 < cp2) { \
545 return -1; \
546 } else if (cp1 > cp2) { \
547 return 1; \
548 } else if (cp1 == 0) { \
549 break; /* complete match. */ \
550 } \
551 } \
552 return 0
553
554int PHYSFS_utf8stricmp(const char *str1, const char *str2)
555{
556 UTFSTRICMP(8);
557} /* PHYSFS_utf8stricmp */
558
559int PHYSFS_utf16stricmp(const PHYSFS_uint16 *str1, const PHYSFS_uint16 *str2)
560{
561 UTFSTRICMP(16);
562} /* PHYSFS_utf16stricmp */
563
564int PHYSFS_ucs4stricmp(const PHYSFS_uint32 *str1, const PHYSFS_uint32 *str2)
565{
566 UTFSTRICMP(32);
567} /* PHYSFS_ucs4stricmp */
568
569#undef UTFSTRICMP
570
571/* end of physfs_unicode.c ... */
572
573