1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ****************************************************************************** |
5 | * |
6 | * Copyright (C) 2003-2016, International Business Machines |
7 | * Corporation and others. All Rights Reserved. |
8 | * |
9 | ****************************************************************************** |
10 | * file name: ucnv_ext.cpp |
11 | * encoding: UTF-8 |
12 | * tab size: 8 (not used) |
13 | * indentation:4 |
14 | * |
15 | * created on: 2003jun13 |
16 | * created by: Markus W. Scherer |
17 | * |
18 | * Conversion extensions |
19 | */ |
20 | |
21 | #include "unicode/utypes.h" |
22 | |
23 | #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION |
24 | |
25 | #include "unicode/uset.h" |
26 | #include "unicode/ustring.h" |
27 | #include "ucnv_bld.h" |
28 | #include "ucnv_cnv.h" |
29 | #include "ucnv_ext.h" |
30 | #include "cmemory.h" |
31 | #include "uassert.h" |
32 | |
33 | /* to Unicode --------------------------------------------------------------- */ |
34 | |
35 | /* |
36 | * @return lookup value for the byte, if found; else 0 |
37 | */ |
38 | static inline uint32_t |
39 | ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) { |
40 | uint32_t word0, word; |
41 | int32_t i, start, limit; |
42 | |
43 | /* check the input byte against the lowest and highest section bytes */ |
44 | start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]); |
45 | limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]); |
46 | if(byte<start || limit<byte) { |
47 | return 0; /* the byte is out of range */ |
48 | } |
49 | |
50 | if(length==((limit-start)+1)) { |
51 | /* direct access on a linear array */ |
52 | return UCNV_EXT_TO_U_GET_VALUE(toUSection[byte-start]); /* could be 0 */ |
53 | } |
54 | |
55 | /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */ |
56 | word0=UCNV_EXT_TO_U_MAKE_WORD(byte, 0); |
57 | |
58 | /* |
59 | * Shift byte once instead of each section word and add 0xffffff. |
60 | * We will compare the shifted/added byte (bbffffff) against |
61 | * section words which have byte values in the same bit position. |
62 | * If and only if byte bb < section byte ss then bbffffff<ssvvvvvv |
63 | * for all v=0..f |
64 | * so we need not mask off the lower 24 bits of each section word. |
65 | */ |
66 | word=word0|UCNV_EXT_TO_U_VALUE_MASK; |
67 | |
68 | /* binary search */ |
69 | start=0; |
70 | limit=length; |
71 | for(;;) { |
72 | i=limit-start; |
73 | if(i<=1) { |
74 | break; /* done */ |
75 | } |
76 | /* start<limit-1 */ |
77 | |
78 | if(i<=4) { |
79 | /* linear search for the last part */ |
80 | if(word0<=toUSection[start]) { |
81 | break; |
82 | } |
83 | if(++start<limit && word0<=toUSection[start]) { |
84 | break; |
85 | } |
86 | if(++start<limit && word0<=toUSection[start]) { |
87 | break; |
88 | } |
89 | /* always break at start==limit-1 */ |
90 | ++start; |
91 | break; |
92 | } |
93 | |
94 | i=(start+limit)/2; |
95 | if(word<toUSection[i]) { |
96 | limit=i; |
97 | } else { |
98 | start=i; |
99 | } |
100 | } |
101 | |
102 | /* did we really find it? */ |
103 | if(start<limit && byte==UCNV_EXT_TO_U_GET_BYTE(word=toUSection[start])) { |
104 | return UCNV_EXT_TO_U_GET_VALUE(word); /* never 0 */ |
105 | } else { |
106 | return 0; /* not found */ |
107 | } |
108 | } |
109 | |
/*
 * true if not an SI/SO stateful converter (sisoState<0),
 * or if the match length fits with the current converter state:
 * state 0 requires a match length of exactly 1,
 * any other non-negative state requires a match length other than 1.
 *
 * Both arguments are parenthesized so that arbitrary expressions can be
 * passed safely. sisoState may be evaluated twice; match at most once.
 */
#define UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, match) \
    ((sisoState)<0 || ((sisoState)==0) == ((match)==1))
116 | |
/*
 * Match input bytes against the toUnicode extension table.
 *
 * this works like ucnv_extMatchFromU() except
 * - the first character is in pre
 * - no trie is used
 * - the returned matchLength is not offset by 2
 *
 * @param cx          pointer to extension data; if nullptr, or if
 *                    cx[UCNV_EXT_TO_U_LENGTH]<=0, returns 0
 * @param sisoState   0: only a single byte is considered and flush is forced;
 *                    also passed to UCNV_EXT_TO_U_VERIFY_SISO_MATCH() to
 *                    accept or reject candidate match lengths
 * @param pre         bytes that are matched first
 * @param preLength   length of pre
 * @param src         bytes that are matched after pre[] is exhausted
 * @param srcLength   length of src
 * @param pMatchValue [out] written only when the return value is >0:
 *                    the match value after UCNV_EXT_TO_U_MASK_ROUNDTRIP()
 * @param flush       true if the end of the input stream is reached
 * @return >0: matched, number of input bytes (pre+src) in the longest match
 *          0: no match
 *         <0: partial match, negative number of bytes consumed so far;
 *             only returned when flush is false and that number does not
 *             exceed UCNV_EXT_MAX_BYTES
 *
 * NOTE(review): the useFallback parameter name is commented out although
 * TO_U_USE_FALLBACK(useFallback) is used below; this presumably relies on
 * that macro discarding its argument - confirm in ucnv_cnv.h.
 */
static int32_t
ucnv_extMatchToU(const int32_t *cx, int8_t sisoState,
                 const char *pre, int32_t preLength,
                 const char *src, int32_t srcLength,
                 uint32_t *pMatchValue,
                 UBool /*useFallback*/, UBool flush) {
    const uint32_t *toUTable, *toUSection;

    uint32_t value, matchValue;
    int32_t i, j, idx, length, matchLength;
    uint8_t b;

    if(cx==nullptr || cx[UCNV_EXT_TO_U_LENGTH]<=0) {
        return 0; /* no extension data, no match */
    }

    /* initialize: start at the first section (index 0) of the toU table */
    toUTable=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_INDEX, uint32_t);
    idx=0;

    /* i counts bytes consumed from pre[], j counts bytes consumed from src[] */
    matchValue=0;
    i=j=matchLength=0;

    if(sisoState==0) {
        /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */
        if(preLength>1) {
            return 0; /* no match of a DBCS sequence in SBCS mode */
        } else if(preLength==1) {
            srcLength=0;
        } else /* preLength==0 */ {
            if(srcLength>1) {
                srcLength=1;
            }
        }
        /* with flush set, the loop below never returns a partial match */
        flush=true;
    }

    /* we must not remember fallback matches when not using fallbacks */

    /* match input units until there is a full match or the input is consumed */
    for(;;) {
        /* go to the next section */
        toUSection=toUTable+idx;

        /*
         * read first pair of the section:
         * its byte field holds the number of following words in the section,
         * its value field holds the result for the input matched so far (0 if none)
         */
        value=*toUSection++;
        length=UCNV_EXT_TO_U_GET_BYTE(value);
        value=UCNV_EXT_TO_U_GET_VALUE(value);
        if( value!=0 &&
            (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
             TO_U_USE_FALLBACK(useFallback)) &&
            UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
        ) {
            /* remember longest match so far */
            matchValue=value;
            matchLength=i+j;
        }

        /* match pre[] then src[] */
        if(i<preLength) {
            b=(uint8_t)pre[i++];
        } else if(j<srcLength) {
            b=(uint8_t)src[j++];
        } else {
            /* all input consumed, partial match */
            /* note: length is reused here for the number of bytes consumed so far */
            if(flush || (length=(i+j))>UCNV_EXT_MAX_BYTES) {
                /*
                 * end of the entire input stream, stop with the longest match so far
                 * or: partial match must not be longer than UCNV_EXT_MAX_BYTES
                 * because it must fit into state buffers
                 */
                break;
            } else {
                /* continue with more input next time */
                return -length;
            }
        }

        /* search for the current byte */
        value=ucnv_extFindToU(toUSection, length, b);
        if(value==0) {
            /* no match here, stop with the longest match so far */
            break;
        } else {
            if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
                /* partial match, continue with the section that the value points to */
                idx=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value);
            } else {
                if( (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
                     TO_U_USE_FALLBACK(useFallback)) &&
                    UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
                ) {
                    /* full match, stop with result */
                    matchValue=value;
                    matchLength=i+j;
                } else {
                    /* full match on fallback not taken, stop with the longest match so far */
                }
                break;
            }
        }
    }

    if(matchLength==0) {
        /* no match at all */
        return 0;
    }

    /* return result */
    *pMatchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue);
    return matchLength;
}
235 | |
236 | static inline void |
237 | ucnv_extWriteToU(UConverter *cnv, const int32_t *cx, |
238 | uint32_t value, |
239 | char16_t **target, const char16_t *targetLimit, |
240 | int32_t **offsets, int32_t srcIndex, |
241 | UErrorCode *pErrorCode) { |
242 | /* output the result */ |
243 | if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) { |
244 | /* output a single code point */ |
245 | ucnv_toUWriteCodePoint( |
246 | cnv, UCNV_EXT_TO_U_GET_CODE_POINT(value), |
247 | target, targetLimit, |
248 | offsets, srcIndex, |
249 | pErrorCode); |
250 | } else { |
251 | /* output a string - with correct data we have resultLength>0 */ |
252 | ucnv_toUWriteUChars( |
253 | cnv, |
254 | UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, char16_t)+ |
255 | UCNV_EXT_TO_U_GET_INDEX(value), |
256 | UCNV_EXT_TO_U_GET_LENGTH(value), |
257 | target, targetLimit, |
258 | offsets, srcIndex, |
259 | pErrorCode); |
260 | } |
261 | } |
262 | |
/*
 * Compute the SI/SO toUnicode state of a converter:
 * - outputType MBCS_OUTPUT_2_SISO:    cnv->mode cast to int8_t
 *                                     (state 0 is for SBCS, 1 for DBCS)
 * - outputType MBCS_OUTPUT_DBCS_ONLY: always 1
 * - any other outputType:             -1 (not SI/SO stateful)
 *
 * Note: For SI/SO stateful converters getting here,
 * cnv->mode==0 is equivalent to firstLength==1.
 *
 * The cnv argument is evaluated more than once.
 */
#define UCNV_SISO_STATE(cnv) \
    ((cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO ? \
        (int8_t)(cnv)->mode : \
        ((cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 1 : -1))
274 | |
275 | /* |
276 | * target<targetLimit; set error code for overflow |
277 | */ |
278 | U_CFUNC UBool |
279 | ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx, |
280 | int32_t firstLength, |
281 | const char **src, const char *srcLimit, |
282 | char16_t **target, const char16_t *targetLimit, |
283 | int32_t **offsets, int32_t srcIndex, |
284 | UBool flush, |
285 | UErrorCode *pErrorCode) { |
286 | uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ |
287 | int32_t match; |
288 | |
289 | /* try to match */ |
290 | match=ucnv_extMatchToU(cx, (int8_t)UCNV_SISO_STATE(cnv), |
291 | (const char *)cnv->toUBytes, firstLength, |
292 | *src, (int32_t)(srcLimit-*src), |
293 | &value, |
294 | cnv->useFallback, flush); |
295 | if(match>0) { |
296 | /* advance src pointer for the consumed input */ |
297 | *src+=match-firstLength; |
298 | |
299 | /* write result to target */ |
300 | ucnv_extWriteToU(cnv, cx, |
301 | value, |
302 | target, targetLimit, |
303 | offsets, srcIndex, |
304 | pErrorCode); |
305 | return true; |
306 | } else if(match<0) { |
307 | /* save state for partial match */ |
308 | const char *s; |
309 | int32_t j; |
310 | |
311 | /* copy the first code point */ |
312 | s=(const char *)cnv->toUBytes; |
313 | cnv->preToUFirstLength=(int8_t)firstLength; |
314 | for(j=0; j<firstLength; ++j) { |
315 | cnv->preToU[j]=*s++; |
316 | } |
317 | |
318 | /* now copy the newly consumed input */ |
319 | s=*src; |
320 | match=-match; |
321 | for(; j<match; ++j) { |
322 | cnv->preToU[j]=*s++; |
323 | } |
324 | *src=s; /* same as *src=srcLimit; because we reached the end of input */ |
325 | cnv->preToULength=(int8_t)match; |
326 | return true; |
327 | } else /* match==0 no match */ { |
328 | return false; |
329 | } |
330 | } |
331 | |
332 | U_CFUNC UChar32 |
333 | ucnv_extSimpleMatchToU(const int32_t *cx, |
334 | const char *source, int32_t length, |
335 | UBool useFallback) { |
336 | uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ |
337 | int32_t match; |
338 | |
339 | if(length<=0) { |
340 | return 0xffff; |
341 | } |
342 | |
343 | /* try to match */ |
344 | match=ucnv_extMatchToU(cx, -1, |
345 | source, length, |
346 | nullptr, 0, |
347 | &value, |
348 | useFallback, true); |
349 | if(match==length) { |
350 | /* write result for simple, single-character conversion */ |
351 | if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) { |
352 | return UCNV_EXT_TO_U_GET_CODE_POINT(value); |
353 | } |
354 | } |
355 | |
356 | /* |
357 | * return no match because |
358 | * - match>0 && value points to string: simple conversion cannot handle multiple code points |
359 | * - match>0 && match!=length: not all input consumed, forbidden for this function |
360 | * - match==0: no match found in the first place |
361 | * - match<0: partial match, not supported for simple conversion (and flush==true) |
362 | */ |
363 | return 0xfffe; |
364 | } |
365 | |
/*
 * continue partial match with new input
 * never called for simple, single-character conversion
 *
 * Matches the bytes saved in cnv->preToU[0..preToULength[ followed by the
 * new input in pArgs->source against the toUnicode extension table.
 *
 * Outcomes:
 * - full match: the result is written to the target; either the source is
 *   advanced and preToULength is reset to 0, or the unused tail of preToU[]
 *   is moved to the front and preToULength is set to its negative length
 * - partial match: the newly consumed input is appended to preToU[]
 * - no match: the first character is moved to cnv->toUBytes, the rest of
 *   preToU[] is marked for replay (negative preToULength), and
 *   *pErrorCode is set to U_INVALID_CHAR_FOUND
 */
U_CFUNC void
ucnv_extContinueMatchToU(UConverter *cnv,
                         UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
                         UErrorCode *pErrorCode) {
    uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */
    int32_t match, length;

    match=ucnv_extMatchToU(cnv->sharedData->mbcs.extIndexes, (int8_t)UCNV_SISO_STATE(cnv),
                           cnv->preToU, cnv->preToULength,
                           pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
                           &value,
                           cnv->useFallback, pArgs->flush);
    if(match>0) {
        if(match>=cnv->preToULength) {
            /* advance src pointer for the consumed input */
            pArgs->source+=match-cnv->preToULength;
            cnv->preToULength=0;
        } else {
            /* the match did not use all of preToU[] - keep the rest for replay */
            length=cnv->preToULength-match;
            uprv_memmove(cnv->preToU, cnv->preToU+match, length);
            /* a negative length marks the remaining bytes for replay */
            cnv->preToULength=(int8_t)-length;
        }

        /* write result */
        ucnv_extWriteToU(cnv, cnv->sharedData->mbcs.extIndexes,
                         value,
                         &pArgs->target, pArgs->targetLimit,
                         &pArgs->offsets, srcIndex,
                         pErrorCode);
    } else if(match<0) {
        /* save state for partial match */
        const char *s;
        int32_t j;

        /* just _append_ the newly consumed input to preToU[] */
        s=pArgs->source;
        match=-match; /* total number of bytes in the partial match */
        for(j=cnv->preToULength; j<match; ++j) {
            cnv->preToU[j]=*s++;
        }
        pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
        cnv->preToULength=(int8_t)match;
    } else /* match==0 */ {
        /*
         * no match
         *
         * We need to split the previous input into two parts:
         *
         * 1. The first codepage character is unmappable - that's how we got into
         *    trying the extension data in the first place.
         *    We need to move it from the preToU buffer
         *    to the error buffer, set an error code,
         *    and prepare the rest of the previous input for 2.
         *
         * 2. The rest of the previous input must be converted once we
         *    come back from the callback for the first character.
         *    At that time, we have to try again from scratch to convert
         *    these input characters.
         *    The replay will be handled by the ucnv.c conversion code.
         */

        /* move the first codepage character to the error field */
        uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength);
        cnv->toULength=cnv->preToUFirstLength;

        /* move the rest up inside the buffer */
        length=cnv->preToULength-cnv->preToUFirstLength;
        if(length>0) {
            uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length);
        }

        /* mark preToU for replay */
        cnv->preToULength=(int8_t)-length;

        /* set the error code for unassigned */
        *pErrorCode=U_INVALID_CHAR_FOUND;
    }
}
449 | |
450 | /* from Unicode ------------------------------------------------------------- */ |
451 | |
452 | // Use roundtrips, "good one-way" mappings, and some normal fallbacks. |
453 | static inline UBool |
454 | extFromUUseMapping(UBool useFallback, uint32_t value, UChar32 firstCP) { |
455 | return |
456 | ((value&UCNV_EXT_FROM_U_STATUS_MASK)!=0 || |
457 | FROM_U_USE_FALLBACK(useFallback, firstCP)) && |
458 | (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0; |
459 | } |
460 | |
/*
 * Search a fromUnicode section for one code unit.
 * The search assumes that fromUSection[0..length[ is sorted in ascending order.
 *
 * @param fromUSection the code units of the section
 * @param length       number of code units in the section; may be 0
 * @param u            the code unit to find
 * @return index of the char16_t, if found; else <0
 */
static inline int32_t
ucnv_extFindFromU(const char16_t *fromUSection, int32_t length, char16_t u) {
    int32_t lo=0;
    int32_t hi=length;

    /* bisect while more than 4 candidates remain */
    while(hi-lo>4) {
        const int32_t mid=(lo+hi)/2;
        if(u<fromUSection[mid]) {
            hi=mid;
        } else {
            lo=mid;
        }
    }

    /*
     * Finish 2..4 candidates with a linear scan of at most 3 probes.
     * When all probes fail, lo has advanced by 3, which is either hi-1
     * (the last candidate, tested below) or at/after hi (not found).
     */
    if(hi-lo>1) {
        for(int32_t probes=0; probes<3; ++probes) {
            if(lo<hi && u<=fromUSection[lo]) {
                break;
            }
            ++lo;
        }
    }

    /* did we really find it? */
    return (lo<hi && u==fromUSection[lo]) ? lo : -1;
}
509 | |
510 | /* |
511 | * @param cx pointer to extension data; if nullptr, returns 0 |
512 | * @param firstCP the first code point before all the other UChars |
513 | * @param pre UChars that must match; !initialMatch: partial match with them |
514 | * @param preLength length of pre, >=0 |
515 | * @param src UChars that can be used to complete a match |
516 | * @param srcLength length of src, >=0 |
517 | * @param pMatchValue [out] output result value for the match from the data structure |
518 | * @param useFallback "use fallback" flag, usually from cnv->useFallback |
519 | * @param flush true if the end of the input stream is reached |
520 | * @return >1: matched, return value=total match length (number of input units matched) |
521 | * 1: matched, no mapping but request for <subchar1> |
522 | * (only for the first code point) |
523 | * 0: no match |
524 | * <0: partial match, return value=negative total match length |
525 | * (partial matches are never returned for flush==true) |
526 | * (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS) |
527 | * the matchLength is 2 if only firstCP matched, and >2 if firstCP and |
528 | * further code units matched |
529 | */ |
530 | static int32_t |
531 | ucnv_extMatchFromU(const int32_t *cx, |
532 | UChar32 firstCP, |
533 | const char16_t *pre, int32_t preLength, |
534 | const char16_t *src, int32_t srcLength, |
535 | uint32_t *pMatchValue, |
536 | UBool useFallback, UBool flush) { |
537 | const uint16_t *stage12, *stage3; |
538 | const uint32_t *stage3b; |
539 | |
540 | const char16_t *fromUTableUChars, *fromUSectionUChars; |
541 | const uint32_t *fromUTableValues, *fromUSectionValues; |
542 | |
543 | uint32_t value, matchValue; |
544 | int32_t i, j, idx, length, matchLength; |
545 | char16_t c; |
546 | |
547 | if(cx==nullptr) { |
548 | return 0; /* no extension data, no match */ |
549 | } |
550 | |
551 | /* trie lookup of firstCP */ |
552 | idx=firstCP>>10; /* stage 1 index */ |
553 | if(idx>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) { |
554 | return 0; /* the first code point is outside the trie */ |
555 | } |
556 | |
557 | stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t); |
558 | stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t); |
559 | idx=UCNV_EXT_FROM_U(stage12, stage3, idx, firstCP); |
560 | |
561 | stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t); |
562 | value=stage3b[idx]; |
563 | if(value==0) { |
564 | return 0; |
565 | } |
566 | |
567 | /* |
568 | * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0: |
569 | * Do not interpret values with reserved bits used, for forward compatibility, |
570 | * and do not even remember intermediate results with reserved bits used. |
571 | */ |
572 | |
573 | if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { |
574 | /* partial match, enter the loop below */ |
575 | idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); |
576 | |
577 | /* initialize */ |
578 | fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, char16_t); |
579 | fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t); |
580 | |
581 | matchValue=0; |
582 | i=j=matchLength=0; |
583 | |
584 | /* we must not remember fallback matches when not using fallbacks */ |
585 | |
586 | /* match input units until there is a full match or the input is consumed */ |
587 | for(;;) { |
588 | /* go to the next section */ |
589 | fromUSectionUChars=fromUTableUChars+idx; |
590 | fromUSectionValues=fromUTableValues+idx; |
591 | |
592 | /* read first pair of the section */ |
593 | length=*fromUSectionUChars++; |
594 | value=*fromUSectionValues++; |
595 | if(value!=0 && extFromUUseMapping(useFallback, value, firstCP)) { |
596 | /* remember longest match so far */ |
597 | matchValue=value; |
598 | matchLength=2+i+j; |
599 | } |
600 | |
601 | /* match pre[] then src[] */ |
602 | if(i<preLength) { |
603 | c=pre[i++]; |
604 | } else if(j<srcLength) { |
605 | c=src[j++]; |
606 | } else { |
607 | /* all input consumed, partial match */ |
608 | if(flush || (length=(i+j))>UCNV_EXT_MAX_UCHARS) { |
609 | /* |
610 | * end of the entire input stream, stop with the longest match so far |
611 | * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS |
612 | * because it must fit into state buffers |
613 | */ |
614 | break; |
615 | } else { |
616 | /* continue with more input next time */ |
617 | return -(2+length); |
618 | } |
619 | } |
620 | |
621 | /* search for the current char16_t */ |
622 | idx=ucnv_extFindFromU(fromUSectionUChars, length, c); |
623 | if(idx<0) { |
624 | /* no match here, stop with the longest match so far */ |
625 | break; |
626 | } else { |
627 | value=fromUSectionValues[idx]; |
628 | if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { |
629 | /* partial match, continue */ |
630 | idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); |
631 | } else { |
632 | if(extFromUUseMapping(useFallback, value, firstCP)) { |
633 | /* full match, stop with result */ |
634 | matchValue=value; |
635 | matchLength=2+i+j; |
636 | } else { |
637 | /* full match on fallback not taken, stop with the longest match so far */ |
638 | } |
639 | break; |
640 | } |
641 | } |
642 | } |
643 | |
644 | if(matchLength==0) { |
645 | /* no match at all */ |
646 | return 0; |
647 | } |
648 | } else /* result from firstCP trie lookup */ { |
649 | if(extFromUUseMapping(useFallback, value, firstCP)) { |
650 | /* full match, stop with result */ |
651 | matchValue=value; |
652 | matchLength=2; |
653 | } else { |
654 | /* fallback not taken */ |
655 | return 0; |
656 | } |
657 | } |
658 | |
659 | /* return result */ |
660 | if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) { |
661 | return 1; /* assert matchLength==2 */ |
662 | } |
663 | |
664 | *pMatchValue=matchValue; |
665 | return matchLength; |
666 | } |
667 | |
668 | /* |
669 | * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits |
670 | */ |
671 | static inline void |
672 | ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx, |
673 | uint32_t value, |
674 | char **target, const char *targetLimit, |
675 | int32_t **offsets, int32_t srcIndex, |
676 | UErrorCode *pErrorCode) { |
677 | uint8_t buffer[1+UCNV_EXT_MAX_BYTES]; |
678 | const uint8_t *result; |
679 | int32_t length, prevLength; |
680 | |
681 | length=UCNV_EXT_FROM_U_GET_LENGTH(value); |
682 | value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); |
683 | |
684 | /* output the result */ |
685 | if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { |
686 | /* |
687 | * Generate a byte array and then write it below. |
688 | * This is not the fastest possible way, but it should be ok for |
689 | * extension mappings, and it is much simpler. |
690 | * Offset and overflow handling are only done once this way. |
691 | */ |
692 | uint8_t *p=buffer+1; /* reserve buffer[0] for shiftByte below */ |
693 | switch(length) { |
694 | case 3: |
695 | *p++=(uint8_t)(value>>16); |
696 | U_FALLTHROUGH; |
697 | case 2: |
698 | *p++=(uint8_t)(value>>8); |
699 | U_FALLTHROUGH; |
700 | case 1: |
701 | *p++=(uint8_t)value; |
702 | U_FALLTHROUGH; |
703 | default: |
704 | break; /* will never occur */ |
705 | } |
706 | result=buffer+1; |
707 | } else { |
708 | result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value; |
709 | } |
710 | |
711 | /* with correct data we have length>0 */ |
712 | |
713 | if((prevLength=cnv->fromUnicodeStatus)!=0) { |
714 | /* handle SI/SO stateful output */ |
715 | uint8_t shiftByte; |
716 | |
717 | if(prevLength>1 && length==1) { |
718 | /* change from double-byte mode to single-byte */ |
719 | shiftByte=(uint8_t)UCNV_SI; |
720 | cnv->fromUnicodeStatus=1; |
721 | } else if(prevLength==1 && length>1) { |
722 | /* change from single-byte mode to double-byte */ |
723 | shiftByte=(uint8_t)UCNV_SO; |
724 | cnv->fromUnicodeStatus=2; |
725 | } else { |
726 | shiftByte=0; |
727 | } |
728 | |
729 | if(shiftByte!=0) { |
730 | /* prepend the shift byte to the result bytes */ |
731 | buffer[0]=shiftByte; |
732 | if(result!=buffer+1) { |
733 | uprv_memcpy(buffer+1, result, length); |
734 | } |
735 | result=buffer; |
736 | ++length; |
737 | } |
738 | } |
739 | |
740 | ucnv_fromUWriteBytes(cnv, (const char *)result, length, |
741 | target, targetLimit, |
742 | offsets, srcIndex, |
743 | pErrorCode); |
744 | } |
745 | |
746 | /* |
747 | * target<targetLimit; set error code for overflow |
748 | */ |
749 | U_CFUNC UBool |
750 | ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx, |
751 | UChar32 cp, |
752 | const char16_t **src, const char16_t *srcLimit, |
753 | char **target, const char *targetLimit, |
754 | int32_t **offsets, int32_t srcIndex, |
755 | UBool flush, |
756 | UErrorCode *pErrorCode) { |
757 | uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ |
758 | int32_t match; |
759 | |
760 | /* try to match */ |
761 | match=ucnv_extMatchFromU(cx, cp, |
762 | nullptr, 0, |
763 | *src, (int32_t)(srcLimit-*src), |
764 | &value, |
765 | cnv->useFallback, flush); |
766 | |
767 | /* reject a match if the result is a single byte for DBCS-only */ |
768 | if( match>=2 && |
769 | !(UCNV_EXT_FROM_U_GET_LENGTH(value)==1 && |
770 | cnv->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) |
771 | ) { |
772 | /* advance src pointer for the consumed input */ |
773 | *src+=match-2; /* remove 2 for the initial code point */ |
774 | |
775 | /* write result to target */ |
776 | ucnv_extWriteFromU(cnv, cx, |
777 | value, |
778 | target, targetLimit, |
779 | offsets, srcIndex, |
780 | pErrorCode); |
781 | return true; |
782 | } else if(match<0) { |
783 | /* save state for partial match */ |
784 | const char16_t *s; |
785 | int32_t j; |
786 | |
787 | /* copy the first code point */ |
788 | cnv->preFromUFirstCP=cp; |
789 | |
790 | /* now copy the newly consumed input */ |
791 | s=*src; |
792 | match=-match-2; /* remove 2 for the initial code point */ |
793 | for(j=0; j<match; ++j) { |
794 | cnv->preFromU[j]=*s++; |
795 | } |
796 | *src=s; /* same as *src=srcLimit; because we reached the end of input */ |
797 | cnv->preFromULength=(int8_t)match; |
798 | return true; |
799 | } else if(match==1) { |
800 | /* matched, no mapping but request for <subchar1> */ |
801 | cnv->useSubChar1=true; |
802 | return false; |
803 | } else /* match==0 no match */ { |
804 | return false; |
805 | } |
806 | } |
807 | |
808 | /* |
809 | * Used by ISO 2022 implementation. |
810 | * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping |
811 | */ |
812 | U_CFUNC int32_t |
813 | ucnv_extSimpleMatchFromU(const int32_t *cx, |
814 | UChar32 cp, uint32_t *pValue, |
815 | UBool useFallback) { |
816 | uint32_t value; |
817 | int32_t match; |
818 | |
819 | /* try to match */ |
820 | match=ucnv_extMatchFromU(cx, |
821 | cp, |
822 | nullptr, 0, |
823 | nullptr, 0, |
824 | &value, |
825 | useFallback, true); |
826 | if(match>=2) { |
827 | /* write result for simple, single-character conversion */ |
828 | int32_t length; |
829 | int isRoundtrip; |
830 | |
831 | isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value); |
832 | length=UCNV_EXT_FROM_U_GET_LENGTH(value); |
833 | value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); |
834 | |
835 | if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { |
836 | *pValue=value; |
837 | return isRoundtrip ? length : -length; |
838 | #if 0 /* not currently used */ |
839 | } else if(length==4) { |
840 | /* de-serialize a 4-byte result */ |
841 | const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value; |
842 | *pValue= |
843 | ((uint32_t)result[0]<<24)| |
844 | ((uint32_t)result[1]<<16)| |
845 | ((uint32_t)result[2]<<8)| |
846 | result[3]; |
847 | return isRoundtrip ? 4 : -4; |
848 | #endif |
849 | } |
850 | } |
851 | |
852 | /* |
853 | * return no match because |
854 | * - match>1 && resultLength>4: result too long for simple conversion |
855 | * - match==1: no match found, <subchar1> preferred |
856 | * - match==0: no match found in the first place |
857 | * - match<0: partial match, not supported for simple conversion (and flush==true) |
858 | */ |
859 | return 0; |
860 | } |
861 | |
/*
 * continue partial match with new input, requires cnv->preFromUFirstCP>=0
 * never called for simple, single-character conversion
 *
 * Matches cnv->preFromUFirstCP, the code units saved in
 * cnv->preFromU[0..preFromULength[, and the new input in pArgs->source
 * against the fromUnicode extension data.
 *
 * Outcomes:
 * - full match: the result is written to the target and preFromUFirstCP is
 *   reset to U_SENTINEL; either the source is advanced and preFromULength
 *   is reset to 0, or the unused tail of preFromU[] is moved to the front
 *   and preFromULength is set to its negative length
 * - partial match: the newly consumed input is appended to preFromU[]
 * - no match: the first code point is moved to cnv->fromUChar32,
 *   preFromU[] is marked for replay (negated preFromULength), and
 *   *pErrorCode is set to U_INVALID_CHAR_FOUND
 */
U_CFUNC void
ucnv_extContinueMatchFromU(UConverter *cnv,
                           UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
                           UErrorCode *pErrorCode) {
    uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */
    int32_t match;

    match=ucnv_extMatchFromU(cnv->sharedData->mbcs.extIndexes,
                             cnv->preFromUFirstCP,
                             cnv->preFromU, cnv->preFromULength,
                             pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
                             &value,
                             cnv->useFallback, pArgs->flush);
    if(match>=2) {
        match-=2; /* remove 2 for the initial code point */

        if(match>=cnv->preFromULength) {
            /* advance src pointer for the consumed input */
            pArgs->source+=match-cnv->preFromULength;
            cnv->preFromULength=0;
        } else {
            /* the match did not use all of preFromU[] - keep the rest for replay */
            int32_t length=cnv->preFromULength-match;
            u_memmove(cnv->preFromU, cnv->preFromU+match, length);
            /* a negative length marks the remaining units for replay */
            cnv->preFromULength=(int8_t)-length;
        }

        /* finish the partial match */
        cnv->preFromUFirstCP=U_SENTINEL;

        /* write result */
        ucnv_extWriteFromU(cnv, cnv->sharedData->mbcs.extIndexes,
                           value,
                           &pArgs->target, pArgs->targetLimit,
                           &pArgs->offsets, srcIndex,
                           pErrorCode);
    } else if(match<0) {
        /* save state for partial match */
        const char16_t *s;
        int32_t j;

        /* just _append_ the newly consumed input to preFromU[] */
        s=pArgs->source;
        match=-match-2; /* remove 2 for the initial code point */
        for(j=cnv->preFromULength; j<match; ++j) {
            U_ASSERT(j>=0);
            cnv->preFromU[j]=*s++;
        }
        pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
        cnv->preFromULength=(int8_t)match;
    } else /* match==0 or 1 */ {
        /*
         * no match
         *
         * We need to split the previous input into two parts:
         *
         * 1. The first code point is unmappable - that's how we got into
         *    trying the extension data in the first place.
         *    We need to move it from the preFromU buffer
         *    to the error buffer, set an error code,
         *    and prepare the rest of the previous input for 2.
         *
         * 2. The rest of the previous input must be converted once we
         *    come back from the callback for the first code point.
         *    At that time, we have to try again from scratch to convert
         *    these input characters.
         *    The replay will be handled by the ucnv.c conversion code.
         */

        if(match==1) {
            /* matched, no mapping but request for <subchar1> */
            cnv->useSubChar1=true;
        }

        /* move the first code point to the error field */
        cnv->fromUChar32=cnv->preFromUFirstCP;
        cnv->preFromUFirstCP=U_SENTINEL;

        /* mark preFromU for replay */
        cnv->preFromULength=-cnv->preFromULength;

        /* set the error code for unassigned */
        *pErrorCode=U_INVALID_CHAR_FOUND;
    }
}
951 | |
952 | static UBool |
953 | extSetUseMapping(UConverterUnicodeSet which, int32_t minLength, uint32_t value) { |
954 | if(which==UCNV_ROUNDTRIP_SET) { |
955 | // Add only code points for which the roundtrip flag is set. |
956 | // Do not add any fallbacks, even if ucnv_fromUnicode() would use them |
957 | // (fallbacks from PUA). See the API docs for ucnv_getUnicodeSet(). |
958 | // |
959 | // By analogy, also do not add "good one-way" mappings. |
960 | // |
961 | // Do not add entries with reserved bits set. |
962 | if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))!= |
963 | UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) { |
964 | return false; |
965 | } |
966 | } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { |
967 | // Do not add entries with reserved bits set. |
968 | if((value&UCNV_EXT_FROM_U_RESERVED_MASK)!=0) { |
969 | return false; |
970 | } |
971 | } |
972 | // Do not add <subchar1> entries or other (future?) pseudo-entries |
973 | // with an output length of 0. |
974 | return UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength; |
975 | } |
976 | |
977 | static void |
978 | ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData, |
979 | const int32_t *cx, |
980 | const USetAdder *sa, |
981 | UConverterUnicodeSet which, |
982 | int32_t minLength, |
983 | UChar32 firstCP, |
984 | char16_t s[UCNV_EXT_MAX_UCHARS], int32_t length, |
985 | int32_t sectionIndex, |
986 | UErrorCode *pErrorCode) { |
987 | const char16_t *fromUSectionUChars; |
988 | const uint32_t *fromUSectionValues; |
989 | |
990 | uint32_t value; |
991 | int32_t i, count; |
992 | |
993 | fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, char16_t)+sectionIndex; |
994 | fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex; |
995 | |
996 | /* read first pair of the section */ |
997 | count=*fromUSectionUChars++; |
998 | value=*fromUSectionValues++; |
999 | |
1000 | if(extSetUseMapping(which, minLength, value)) { |
1001 | if(length==U16_LENGTH(firstCP)) { |
1002 | /* add the initial code point */ |
1003 | sa->add(sa->set, firstCP); |
1004 | } else { |
1005 | /* add the string so far */ |
1006 | sa->addString(sa->set, s, length); |
1007 | } |
1008 | } |
1009 | |
1010 | for(i=0; i<count; ++i) { |
1011 | /* append this code unit and recurse or add the string */ |
1012 | s[length]=fromUSectionUChars[i]; |
1013 | value=fromUSectionValues[i]; |
1014 | |
1015 | if(value==0) { |
1016 | /* no mapping, do nothing */ |
1017 | } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { |
1018 | ucnv_extGetUnicodeSetString( |
1019 | sharedData, cx, sa, which, minLength, |
1020 | firstCP, s, length+1, |
1021 | (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), |
1022 | pErrorCode); |
1023 | } else if(extSetUseMapping(which, minLength, value)) { |
1024 | sa->addString(sa->set, s, length+1); |
1025 | } |
1026 | } |
1027 | } |
1028 | |
1029 | U_CFUNC void |
1030 | ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, |
1031 | const USetAdder *sa, |
1032 | UConverterUnicodeSet which, |
1033 | UConverterSetFilter filter, |
1034 | UErrorCode *pErrorCode) { |
1035 | const int32_t *cx; |
1036 | const uint16_t *stage12, *stage3, *ps2, *ps3; |
1037 | const uint32_t *stage3b; |
1038 | |
1039 | uint32_t value; |
1040 | int32_t st1, stage1Length, st2, st3, minLength; |
1041 | |
1042 | char16_t s[UCNV_EXT_MAX_UCHARS]; |
1043 | UChar32 c; |
1044 | int32_t length; |
1045 | |
1046 | cx=sharedData->mbcs.extIndexes; |
1047 | if(cx==nullptr) { |
1048 | return; |
1049 | } |
1050 | |
1051 | stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t); |
1052 | stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t); |
1053 | stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t); |
1054 | |
1055 | stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]; |
1056 | |
1057 | /* enumerate the from-Unicode trie table */ |
1058 | c=0; /* keep track of the current code point while enumerating */ |
1059 | |
1060 | if(filter==UCNV_SET_FILTER_2022_CN) { |
1061 | minLength=3; |
1062 | } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || |
1063 | filter!=UCNV_SET_FILTER_NONE |
1064 | ) { |
1065 | /* DBCS-only, ignore single-byte results */ |
1066 | minLength=2; |
1067 | } else { |
1068 | minLength=1; |
1069 | } |
1070 | |
1071 | /* |
1072 | * the trie enumeration is almost the same as |
1073 | * in MBCSGetUnicodeSet() for MBCS_OUTPUT_1 |
1074 | */ |
1075 | for(st1=0; st1<stage1Length; ++st1) { |
1076 | st2=stage12[st1]; |
1077 | if(st2>stage1Length) { |
1078 | ps2=stage12+st2; |
1079 | for(st2=0; st2<64; ++st2) { |
1080 | if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) { |
1081 | /* read the stage 3 block */ |
1082 | ps3=stage3+st3; |
1083 | |
1084 | do { |
1085 | value=stage3b[*ps3++]; |
1086 | if(value==0) { |
1087 | /* no mapping, do nothing */ |
1088 | } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { |
1089 | // Recurse for partial results. |
1090 | length=0; |
1091 | U16_APPEND_UNSAFE(s, length, c); |
1092 | ucnv_extGetUnicodeSetString( |
1093 | sharedData, cx, sa, which, minLength, |
1094 | c, s, length, |
1095 | (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), |
1096 | pErrorCode); |
1097 | } else if(extSetUseMapping(which, minLength, value)) { |
1098 | switch(filter) { |
1099 | case UCNV_SET_FILTER_2022_CN: |
1100 | if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) { |
1101 | continue; |
1102 | } |
1103 | break; |
1104 | case UCNV_SET_FILTER_SJIS: |
1105 | if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) { |
1106 | continue; |
1107 | } |
1108 | break; |
1109 | case UCNV_SET_FILTER_GR94DBCS: |
1110 | if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && |
1111 | (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) && |
1112 | (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { |
1113 | continue; |
1114 | } |
1115 | break; |
1116 | case UCNV_SET_FILTER_HZ: |
1117 | if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && |
1118 | (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) && |
1119 | (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { |
1120 | continue; |
1121 | } |
1122 | break; |
1123 | default: |
1124 | /* |
1125 | * UCNV_SET_FILTER_NONE, |
1126 | * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength |
1127 | */ |
1128 | break; |
1129 | } |
1130 | sa->add(sa->set, c); |
1131 | } |
1132 | } while((++c&0xf)!=0); |
1133 | } else { |
1134 | c+=16; /* empty stage 3 block */ |
1135 | } |
1136 | } |
1137 | } else { |
1138 | c+=1024; /* empty stage 2 block */ |
1139 | } |
1140 | } |
1141 | } |
1142 | |
1143 | #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ |
1144 | |