1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ********************************************************************** |
5 | * Copyright (C) 2000-2015, International Business Machines |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** |
8 | * file name: ucnvlat1.cpp |
9 | * encoding: UTF-8 |
10 | * tab size: 8 (not used) |
11 | * indentation:4 |
12 | * |
13 | * created on: 2000feb07 |
14 | * created by: Markus W. Scherer |
15 | */ |
16 | |
17 | #include "unicode/utypes.h" |
18 | |
19 | #if !UCONFIG_NO_CONVERSION |
20 | |
21 | #include "unicode/ucnv.h" |
22 | #include "unicode/uset.h" |
23 | #include "unicode/utf8.h" |
24 | #include "ucnv_bld.h" |
25 | #include "ucnv_cnv.h" |
26 | #include "ustr_imp.h" |
27 | |
/*
 * Platform-tuning switch: when non-zero, _Latin1FromUnicodeWithOffsets()
 * is compiled with its 16-way unrolled fast path; when zero, only the
 * simple one-code-unit-at-a-time loop of that function is built.
 */
#define LATIN1_UNROLL_FROM_UNICODE 1
30 | |
31 | /* ISO 8859-1 --------------------------------------------------------------- */ |
32 | |
33 | /* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets(). */ |
34 | U_CDECL_BEGIN |
35 | static void U_CALLCONV |
36 | _Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
37 | UErrorCode *pErrorCode) { |
38 | const uint8_t *source; |
39 | char16_t *target; |
40 | int32_t targetCapacity, length; |
41 | int32_t *offsets; |
42 | |
43 | int32_t sourceIndex; |
44 | |
45 | /* set up the local pointers */ |
46 | source=(const uint8_t *)pArgs->source; |
47 | target=pArgs->target; |
48 | targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
49 | offsets=pArgs->offsets; |
50 | |
51 | sourceIndex=0; |
52 | |
53 | /* |
54 | * since the conversion here is 1:1 char16_t:uint8_t, we need only one counter |
55 | * for the minimum of the sourceLength and targetCapacity |
56 | */ |
57 | length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); |
58 | if(length<=targetCapacity) { |
59 | targetCapacity=length; |
60 | } else { |
61 | /* target will be full */ |
62 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
63 | length=targetCapacity; |
64 | } |
65 | |
66 | if(targetCapacity>=8) { |
67 | /* This loop is unrolled for speed and improved pipelining. */ |
68 | int32_t count, loops; |
69 | |
70 | loops=count=targetCapacity>>3; |
71 | length=targetCapacity&=0x7; |
72 | do { |
73 | target[0]=source[0]; |
74 | target[1]=source[1]; |
75 | target[2]=source[2]; |
76 | target[3]=source[3]; |
77 | target[4]=source[4]; |
78 | target[5]=source[5]; |
79 | target[6]=source[6]; |
80 | target[7]=source[7]; |
81 | target+=8; |
82 | source+=8; |
83 | } while(--count>0); |
84 | |
85 | if(offsets!=nullptr) { |
86 | do { |
87 | offsets[0]=sourceIndex++; |
88 | offsets[1]=sourceIndex++; |
89 | offsets[2]=sourceIndex++; |
90 | offsets[3]=sourceIndex++; |
91 | offsets[4]=sourceIndex++; |
92 | offsets[5]=sourceIndex++; |
93 | offsets[6]=sourceIndex++; |
94 | offsets[7]=sourceIndex++; |
95 | offsets+=8; |
96 | } while(--loops>0); |
97 | } |
98 | } |
99 | |
100 | /* conversion loop */ |
101 | while(targetCapacity>0) { |
102 | *target++=*source++; |
103 | --targetCapacity; |
104 | } |
105 | |
106 | /* write back the updated pointers */ |
107 | pArgs->source=(const char *)source; |
108 | pArgs->target=target; |
109 | |
110 | /* set offsets */ |
111 | if(offsets!=nullptr) { |
112 | while(length>0) { |
113 | *offsets++=sourceIndex++; |
114 | --length; |
115 | } |
116 | pArgs->offsets=offsets; |
117 | } |
118 | } |
119 | |
120 | /* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar(). */ |
121 | static UChar32 U_CALLCONV |
122 | _Latin1GetNextUChar(UConverterToUnicodeArgs *pArgs, |
123 | UErrorCode *pErrorCode) { |
124 | const uint8_t *source=(const uint8_t *)pArgs->source; |
125 | if(source<(const uint8_t *)pArgs->sourceLimit) { |
126 | pArgs->source=(const char *)(source+1); |
127 | return *source; |
128 | } |
129 | |
130 | /* no output because of empty input */ |
131 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
132 | return 0xffff; |
133 | } |
134 | |
/*
 * This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets().
 *
 * Converts UTF-16 code units to single bytes. The function serves two
 * converters: code units up to 0xff are accepted when cnv->sharedData is
 * &_Latin1Data, and only up to 0x7f otherwise (US-ASCII).
 *
 * pArgs      source/sourceLimit delimit the UTF-16 input, target/targetLimit
 *            the byte output; source, target and offsets are written back
 *            advanced. If pArgs->offsets is not nullptr, one source index is
 *            stored per output byte.
 * pErrorCode U_INVALID_CHAR_FOUND for a code point above the converter's
 *            maximum (including a supplementary code point assembled from a
 *            surrogate pair), U_ILLEGAL_CHAR_FOUND for an unpaired surrogate,
 *            U_BUFFER_OVERFLOW_ERROR when input remains and the target is
 *            full. In the first two cases the offending code point is stored
 *            in cnv->fromUChar32.
 *
 * A lead surrogate that is the last unit of the input is stored in
 * cnv->fromUChar32 without an error; a later call that finds a non-zero
 * cnv->fromUChar32 continues at getTrail.
 */
static void U_CALLCONV
_Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                              UErrorCode *pErrorCode) {
    UConverter *cnv;
    const char16_t *source, *sourceLimit;
    uint8_t *target, *oldTarget;
    int32_t targetCapacity, length;
    int32_t *offsets;

    UChar32 cp;
    char16_t c, max;

    int32_t sourceIndex;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    /* oldTarget marks the first output byte whose offset has not been written yet */
    target=oldTarget=(uint8_t *)pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;

    /* highest code unit that converts directly to a byte */
    if(cnv->sharedData==&_Latin1Data) {
        max=0xff; /* Latin-1 */
    } else {
        max=0x7f; /* US-ASCII */
    }

    /* get the converter state from UConverter */
    cp=cnv->fromUChar32;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex= cp==0 ? 0 : -1;

    /*
     * since the conversion here is 1:1 char16_t:uint8_t, we need only one counter
     * for the minimum of the sourceLength and targetCapacity
     */
    length=(int32_t)(sourceLimit-source);
    if(length<targetCapacity) {
        targetCapacity=length;
    }

    /* a lead surrogate is pending from an earlier call: go look for its trail */
    if(cp!=0 && targetCapacity>0) {
        goto getTrail;
    }

#if LATIN1_UNROLL_FROM_UNICODE
    /* unroll the loop with the most common case */
    if(targetCapacity>=16) {
        int32_t count, loops;
        char16_t u, oredChars;

        loops=count=targetCapacity>>4;
        do {
            /*
             * Copy 16 units unconditionally while OR-ing them together.
             * max is 0xff or 0x7f, so the OR exceeds max exactly when at
             * least one of the 16 units exceeds max.
             */
            oredChars=u=*source++;
            *target++=(uint8_t)u;
            oredChars|=u=*source++;
            *target++=(uint8_t)u;
            oredChars|=u=*source++;
            *target++=(uint8_t)u;
            oredChars|=u=*source++;
            *target++=(uint8_t)u;
            oredChars|=u=*source++;
            *target++=(uint8_t)u;
            oredChars|=u=*source++;
            *target++=(uint8_t)u;
            oredChars|=u=*source++;
            *target++=(uint8_t)u;
            oredChars|=u=*source++;
            *target++=(uint8_t)u;
            oredChars|=u=*source++;
            *target++=(uint8_t)u;
            oredChars|=u=*source++;
            *target++=(uint8_t)u;
            oredChars|=u=*source++;
            *target++=(uint8_t)u;
            oredChars|=u=*source++;
            *target++=(uint8_t)u;
            oredChars|=u=*source++;
            *target++=(uint8_t)u;
            oredChars|=u=*source++;
            *target++=(uint8_t)u;
            oredChars|=u=*source++;
            *target++=(uint8_t)u;
            oredChars|=u=*source++;
            *target++=(uint8_t)u;

            /* were all 16 entries really valid? */
            if(oredChars>max) {
                /*
                 * no, return to the first of these 16;
                 * the simple loop below redoes them and stops at the offending unit
                 */
                source-=16;
                target-=16;
                break;
            }
        } while(--count>0);
        /* count is not decremented on break, so this is the number of complete groups */
        count=loops-count;
        targetCapacity-=16*count;

        if(offsets!=nullptr) {
            /* offsets for these groups are written here, so exclude them from the final offsets loop */
            oldTarget+=16*count;
            while(count>0) {
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                --count;
            }
        }
    }
#endif

    /* conversion loop; a unit above max is consumed from source but not written */
    c=0;
    while(targetCapacity>0 && (c=*source++)<=max) {
        /* convert the Unicode code point */
        *target++=(uint8_t)c;
        --targetCapacity;
    }

    if(c>max) {
        cp=c;
        if(!U_IS_SURROGATE(cp)) {
            /* callback(unassigned) */
        } else if(U_IS_SURROGATE_LEAD(cp)) {
getTrail:
            /* also entered directly from above when a lead surrogate was pending */
            if(source<sourceLimit) {
                /* test the following code unit */
                char16_t trail=*source;
                if(U16_IS_TRAIL(trail)) {
                    ++source;
                    cp=U16_GET_SUPPLEMENTARY(cp, trail);
                    /* this codepage does not map supplementary code points */
                    /* callback(unassigned) */
                } else {
                    /* this is an unmatched lead code unit (1st surrogate) */
                    /* callback(illegal) */
                }
            } else {
                /* no more input: remember the lead surrogate and leave without an error */
                cnv->fromUChar32=cp;
                goto noMoreInput;
            }
        } else {
            /* this is an unmatched trail code unit (2nd surrogate) */
            /* callback(illegal) */
        }

        /* cp is still a surrogate only if it was unpaired; otherwise it is merely unmappable */
        *pErrorCode= U_IS_SURROGATE(cp) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND;
        cnv->fromUChar32=cp;
    }
noMoreInput:

    /* set offsets since the start (or since the end of the unrolled groups) */
    if(offsets!=nullptr) {
        size_t count=target-oldTarget;
        while(count>0) {
            *offsets++=sourceIndex++;
            --count;
        }
    }

    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
        /* target is full */
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
    pArgs->offsets=offsets;
}
321 | |
322 | /* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8(). */ |
323 | static void U_CALLCONV |
324 | ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, |
325 | UConverterToUnicodeArgs *pToUArgs, |
326 | UErrorCode *pErrorCode) { |
327 | UConverter *utf8; |
328 | const uint8_t *source, *sourceLimit; |
329 | uint8_t *target; |
330 | int32_t targetCapacity; |
331 | |
332 | UChar32 c; |
333 | uint8_t b, t1; |
334 | |
335 | /* set up the local pointers */ |
336 | utf8=pToUArgs->converter; |
337 | source=(uint8_t *)pToUArgs->source; |
338 | sourceLimit=(uint8_t *)pToUArgs->sourceLimit; |
339 | target=(uint8_t *)pFromUArgs->target; |
340 | targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); |
341 | |
342 | /* get the converter state from the UTF-8 UConverter */ |
343 | if (utf8->toULength > 0) { |
344 | c=(UChar32)utf8->toUnicodeStatus; |
345 | } else { |
346 | c = 0; |
347 | } |
348 | if(c!=0 && source<sourceLimit) { |
349 | if(targetCapacity==0) { |
350 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
351 | return; |
352 | } else if(c>=0xc2 && c<=0xc3 && (t1=(uint8_t)(*source-0x80)) <= 0x3f) { |
353 | ++source; |
354 | *target++=(uint8_t)(((c&3)<<6)|t1); |
355 | --targetCapacity; |
356 | |
357 | utf8->toUnicodeStatus=0; |
358 | utf8->toULength=0; |
359 | } else { |
360 | /* complicated, illegal or unmappable input: fall back to the pivoting implementation */ |
361 | *pErrorCode=U_USING_DEFAULT_WARNING; |
362 | return; |
363 | } |
364 | } |
365 | |
366 | /* |
367 | * Make sure that the last byte sequence before sourceLimit is complete |
368 | * or runs into a lead byte. |
369 | * In the conversion loop compare source with sourceLimit only once |
370 | * per multi-byte character. |
371 | * For Latin-1, adjust sourceLimit only for 1 trail byte because |
372 | * the conversion loop handles at most 2-byte sequences. |
373 | */ |
374 | if(source<sourceLimit && U8_IS_LEAD(*(sourceLimit-1))) { |
375 | --sourceLimit; |
376 | } |
377 | |
378 | /* conversion loop */ |
379 | while(source<sourceLimit) { |
380 | if(targetCapacity>0) { |
381 | b=*source++; |
382 | if(U8_IS_SINGLE(b)) { |
383 | /* convert ASCII */ |
384 | *target++=(uint8_t)b; |
385 | --targetCapacity; |
386 | } else if( /* handle U+0080..U+00FF inline */ |
387 | b>=0xc2 && b<=0xc3 && |
388 | (t1=(uint8_t)(*source-0x80)) <= 0x3f |
389 | ) { |
390 | ++source; |
391 | *target++=(uint8_t)(((b&3)<<6)|t1); |
392 | --targetCapacity; |
393 | } else { |
394 | /* complicated, illegal or unmappable input: fall back to the pivoting implementation */ |
395 | pToUArgs->source=(char *)(source-1); |
396 | pFromUArgs->target=(char *)target; |
397 | *pErrorCode=U_USING_DEFAULT_WARNING; |
398 | return; |
399 | } |
400 | } else { |
401 | /* target is full */ |
402 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
403 | break; |
404 | } |
405 | } |
406 | |
407 | /* |
408 | * The sourceLimit may have been adjusted before the conversion loop |
409 | * to stop before a truncated sequence. |
410 | * If so, then collect the truncated sequence now. |
411 | * For Latin-1, there is at most exactly one lead byte because of the |
412 | * smaller sourceLimit adjustment logic. |
413 | */ |
414 | if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { |
415 | utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++; |
416 | utf8->toULength=1; |
417 | utf8->mode=U8_COUNT_BYTES(b); |
418 | } |
419 | |
420 | /* write back the updated pointers */ |
421 | pToUArgs->source=(char *)source; |
422 | pFromUArgs->target=(char *)target; |
423 | } |
424 | |
425 | static void U_CALLCONV |
426 | _Latin1GetUnicodeSet(const UConverter *cnv, |
427 | const USetAdder *sa, |
428 | UConverterUnicodeSet which, |
429 | UErrorCode *pErrorCode) { |
430 | (void)cnv; |
431 | (void)which; |
432 | (void)pErrorCode; |
433 | sa->addRange(sa->set, 0, 0xff); |
434 | } |
435 | U_CDECL_END |
436 | |
437 | |
/*
 * Implementation table for the ISO-8859-1 converter.
 * The initializer is positional: what each slot means is defined by the
 * UConverterImpl declaration, which is not part of this file. nullptr marks
 * a slot for which this converter supplies no function.
 * NOTE(review): each direction's function appears in two adjacent slots,
 * presumably the without-offsets and with-offsets entry points -- confirm
 * against UConverterImpl.
 */
static const UConverterImpl _Latin1Impl={
    UCNV_LATIN_1,

    nullptr,
    nullptr,

    nullptr,
    nullptr,
    nullptr,

    /* bytes -> UTF-16 (two slots), UTF-16 -> bytes (two slots), next code point */
    _Latin1ToUnicodeWithOffsets,
    _Latin1ToUnicodeWithOffsets,
    _Latin1FromUnicodeWithOffsets,
    _Latin1FromUnicodeWithOffsets,
    _Latin1GetNextUChar,

    nullptr,
    nullptr,
    nullptr,
    nullptr,
    _Latin1GetUnicodeSet,

    /* direct UTF-8 -> Latin-1 conversion occupies the last slot */
    nullptr,
    ucnv_Latin1FromUTF8
};
463 | |
/*
 * Static description of the ISO-8859-1 converter.
 * The initializer is positional; field meanings come from the
 * UConverterStaticData declaration, which is not part of this file.
 * NOTE(review): 819 next to UCNV_IBM is presumably the IBM code page number,
 * "1, 1" presumably the minimum/maximum bytes per character, and
 * "{ 0x1a, 0, 0, 0 }, 1" presumably the substitution byte sequence and its
 * length -- confirm against the struct declaration.
 */
static const UConverterStaticData _Latin1StaticData={
    sizeof(UConverterStaticData),
    "ISO-8859-1" ,
    819, UCNV_IBM, UCNV_LATIN_1, 1, 1,
    { 0x1a, 0, 0, 0 }, 1, false, false,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
473 | |
/*
 * Shared data object for ISO-8859-1, combining the static description and
 * the implementation table above. _Latin1FromUnicodeWithOffsets() compares
 * cnv->sharedData against the address of this object to decide between the
 * Latin-1 limit (0xff) and the US-ASCII limit (0x7f).
 * NOTE(review): it is referenced before this definition, so a declaration
 * presumably lives in one of the included headers -- confirm.
 */
const UConverterSharedData _Latin1Data=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Latin1StaticData, &_Latin1Impl);
476 | |
477 | /* US-ASCII ----------------------------------------------------------------- */ |
478 | |
479 | U_CDECL_BEGIN |
480 | /* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets(). */ |
481 | static void U_CALLCONV |
482 | _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
483 | UErrorCode *pErrorCode) { |
484 | const uint8_t *source, *sourceLimit; |
485 | char16_t *target, *oldTarget; |
486 | int32_t targetCapacity, length; |
487 | int32_t *offsets; |
488 | |
489 | int32_t sourceIndex; |
490 | |
491 | uint8_t c; |
492 | |
493 | /* set up the local pointers */ |
494 | source=(const uint8_t *)pArgs->source; |
495 | sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
496 | target=oldTarget=pArgs->target; |
497 | targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
498 | offsets=pArgs->offsets; |
499 | |
500 | /* sourceIndex=-1 if the current character began in the previous buffer */ |
501 | sourceIndex=0; |
502 | |
503 | /* |
504 | * since the conversion here is 1:1 char16_t:uint8_t, we need only one counter |
505 | * for the minimum of the sourceLength and targetCapacity |
506 | */ |
507 | length=(int32_t)(sourceLimit-source); |
508 | if(length<targetCapacity) { |
509 | targetCapacity=length; |
510 | } |
511 | |
512 | if(targetCapacity>=8) { |
513 | /* This loop is unrolled for speed and improved pipelining. */ |
514 | int32_t count, loops; |
515 | char16_t oredChars; |
516 | |
517 | loops=count=targetCapacity>>3; |
518 | do { |
519 | oredChars=target[0]=source[0]; |
520 | oredChars|=target[1]=source[1]; |
521 | oredChars|=target[2]=source[2]; |
522 | oredChars|=target[3]=source[3]; |
523 | oredChars|=target[4]=source[4]; |
524 | oredChars|=target[5]=source[5]; |
525 | oredChars|=target[6]=source[6]; |
526 | oredChars|=target[7]=source[7]; |
527 | |
528 | /* were all 16 entries really valid? */ |
529 | if(oredChars>0x7f) { |
530 | /* no, return to the first of these 16 */ |
531 | break; |
532 | } |
533 | source+=8; |
534 | target+=8; |
535 | } while(--count>0); |
536 | count=loops-count; |
537 | targetCapacity-=count*8; |
538 | |
539 | if(offsets!=nullptr) { |
540 | oldTarget+=count*8; |
541 | while(count>0) { |
542 | offsets[0]=sourceIndex++; |
543 | offsets[1]=sourceIndex++; |
544 | offsets[2]=sourceIndex++; |
545 | offsets[3]=sourceIndex++; |
546 | offsets[4]=sourceIndex++; |
547 | offsets[5]=sourceIndex++; |
548 | offsets[6]=sourceIndex++; |
549 | offsets[7]=sourceIndex++; |
550 | offsets+=8; |
551 | --count; |
552 | } |
553 | } |
554 | } |
555 | |
556 | /* conversion loop */ |
557 | c=0; |
558 | while(targetCapacity>0 && (c=*source++)<=0x7f) { |
559 | *target++=c; |
560 | --targetCapacity; |
561 | } |
562 | |
563 | if(c>0x7f) { |
564 | /* callback(illegal); copy the current bytes to toUBytes[] */ |
565 | UConverter *cnv=pArgs->converter; |
566 | cnv->toUBytes[0]=c; |
567 | cnv->toULength=1; |
568 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
569 | } else if(source<sourceLimit && target>=pArgs->targetLimit) { |
570 | /* target is full */ |
571 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
572 | } |
573 | |
574 | /* set offsets since the start */ |
575 | if(offsets!=nullptr) { |
576 | size_t count=target-oldTarget; |
577 | while(count>0) { |
578 | *offsets++=sourceIndex++; |
579 | --count; |
580 | } |
581 | } |
582 | |
583 | /* write back the updated pointers */ |
584 | pArgs->source=(const char *)source; |
585 | pArgs->target=target; |
586 | pArgs->offsets=offsets; |
587 | } |
588 | |
589 | /* This is a table-less version of ucnv_MBCSSingleGetNextUChar(). */ |
590 | static UChar32 U_CALLCONV |
591 | _ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs, |
592 | UErrorCode *pErrorCode) { |
593 | const uint8_t *source; |
594 | uint8_t b; |
595 | |
596 | source=(const uint8_t *)pArgs->source; |
597 | if(source<(const uint8_t *)pArgs->sourceLimit) { |
598 | b=*source++; |
599 | pArgs->source=(const char *)source; |
600 | if(b<=0x7f) { |
601 | return b; |
602 | } else { |
603 | UConverter *cnv=pArgs->converter; |
604 | cnv->toUBytes[0]=b; |
605 | cnv->toULength=1; |
606 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
607 | return 0xffff; |
608 | } |
609 | } |
610 | |
611 | /* no output because of empty input */ |
612 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
613 | return 0xffff; |
614 | } |
615 | |
616 | /* "Convert" UTF-8 to US-ASCII: Validate and copy. */ |
617 | static void U_CALLCONV |
618 | ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, |
619 | UConverterToUnicodeArgs *pToUArgs, |
620 | UErrorCode *pErrorCode) { |
621 | const uint8_t *source, *sourceLimit; |
622 | uint8_t *target; |
623 | int32_t targetCapacity, length; |
624 | |
625 | uint8_t c; |
626 | |
627 | if(pToUArgs->converter->toULength > 0) { |
628 | /* no handling of partial UTF-8 characters here, fall back to pivoting */ |
629 | *pErrorCode=U_USING_DEFAULT_WARNING; |
630 | return; |
631 | } |
632 | |
633 | /* set up the local pointers */ |
634 | source=(const uint8_t *)pToUArgs->source; |
635 | sourceLimit=(const uint8_t *)pToUArgs->sourceLimit; |
636 | target=(uint8_t *)pFromUArgs->target; |
637 | targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); |
638 | |
639 | /* |
640 | * since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter |
641 | * for the minimum of the sourceLength and targetCapacity |
642 | */ |
643 | length=(int32_t)(sourceLimit-source); |
644 | if(length<targetCapacity) { |
645 | targetCapacity=length; |
646 | } |
647 | |
648 | /* unroll the loop with the most common case */ |
649 | if(targetCapacity>=16) { |
650 | int32_t count, loops; |
651 | uint8_t oredChars; |
652 | |
653 | loops=count=targetCapacity>>4; |
654 | do { |
655 | oredChars=*target++=*source++; |
656 | oredChars|=*target++=*source++; |
657 | oredChars|=*target++=*source++; |
658 | oredChars|=*target++=*source++; |
659 | oredChars|=*target++=*source++; |
660 | oredChars|=*target++=*source++; |
661 | oredChars|=*target++=*source++; |
662 | oredChars|=*target++=*source++; |
663 | oredChars|=*target++=*source++; |
664 | oredChars|=*target++=*source++; |
665 | oredChars|=*target++=*source++; |
666 | oredChars|=*target++=*source++; |
667 | oredChars|=*target++=*source++; |
668 | oredChars|=*target++=*source++; |
669 | oredChars|=*target++=*source++; |
670 | oredChars|=*target++=*source++; |
671 | |
672 | /* were all 16 entries really valid? */ |
673 | if(oredChars>0x7f) { |
674 | /* no, return to the first of these 16 */ |
675 | source-=16; |
676 | target-=16; |
677 | break; |
678 | } |
679 | } while(--count>0); |
680 | count=loops-count; |
681 | targetCapacity-=16*count; |
682 | } |
683 | |
684 | /* conversion loop */ |
685 | c=0; |
686 | while(targetCapacity>0 && (c=*source)<=0x7f) { |
687 | ++source; |
688 | *target++=c; |
689 | --targetCapacity; |
690 | } |
691 | |
692 | if(c>0x7f) { |
693 | /* non-ASCII character, handle in standard converter */ |
694 | *pErrorCode=U_USING_DEFAULT_WARNING; |
695 | } else if(source<sourceLimit && target>=(const uint8_t *)pFromUArgs->targetLimit) { |
696 | /* target is full */ |
697 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
698 | } |
699 | |
700 | /* write back the updated pointers */ |
701 | pToUArgs->source=(const char *)source; |
702 | pFromUArgs->target=(char *)target; |
703 | } |
704 | |
705 | static void U_CALLCONV |
706 | _ASCIIGetUnicodeSet(const UConverter *cnv, |
707 | const USetAdder *sa, |
708 | UConverterUnicodeSet which, |
709 | UErrorCode *pErrorCode) { |
710 | (void)cnv; |
711 | (void)which; |
712 | (void)pErrorCode; |
713 | sa->addRange(sa->set, 0, 0x7f); |
714 | } |
715 | U_CDECL_END |
716 | |
/*
 * Implementation table for the US-ASCII converter.
 * The initializer is positional: what each slot means is defined by the
 * UConverterImpl declaration, which is not part of this file. nullptr marks
 * a slot for which this converter supplies no function.
 * NOTE(review): each direction's function appears in two adjacent slots,
 * presumably the without-offsets and with-offsets entry points -- confirm
 * against UConverterImpl.
 */
static const UConverterImpl _ASCIIImpl={
    UCNV_US_ASCII,

    nullptr,
    nullptr,

    nullptr,
    nullptr,
    nullptr,

    /*
     * The from-Unicode direction reuses the Latin-1 function, which limits
     * itself to 0x7f when the converter's sharedData is not &_Latin1Data.
     */
    _ASCIIToUnicodeWithOffsets,
    _ASCIIToUnicodeWithOffsets,
    _Latin1FromUnicodeWithOffsets,
    _Latin1FromUnicodeWithOffsets,
    _ASCIIGetNextUChar,

    nullptr,
    nullptr,
    nullptr,
    nullptr,
    _ASCIIGetUnicodeSet,

    /* direct UTF-8 -> US-ASCII conversion occupies the last slot */
    nullptr,
    ucnv_ASCIIFromUTF8
};
742 | |
/*
 * Static description of the US-ASCII converter.
 * The initializer is positional; field meanings come from the
 * UConverterStaticData declaration, which is not part of this file.
 * NOTE(review): 367 next to UCNV_IBM is presumably the IBM code page number,
 * "1, 1" presumably the minimum/maximum bytes per character, and
 * "{ 0x1a, 0, 0, 0 }, 1" presumably the substitution byte sequence and its
 * length -- confirm against the struct declaration.
 */
static const UConverterStaticData _ASCIIStaticData={
    sizeof(UConverterStaticData),
    "US-ASCII" ,
    367, UCNV_IBM, UCNV_US_ASCII, 1, 1,
    { 0x1a, 0, 0, 0 }, 1, false, false,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
752 | |
/*
 * Shared data object for US-ASCII, combining the static description and the
 * implementation table above. A converter using this object makes
 * _Latin1FromUnicodeWithOffsets() apply the 0x7f limit, because its
 * sharedData is not &_Latin1Data.
 */
const UConverterSharedData _ASCIIData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ASCIIStaticData, &_ASCIIImpl);
755 | |
756 | #endif |
757 | |