convert_UTF.cc source code [breakpad/common/convert_UTF.cc]

1	/*
2	* Copyright © 1991-2015 Unicode, Inc. All rights reserved.
3	* Distributed under the Terms of Use in
4	* http://www.unicode.org/copyright.html.
5	*
6	* Permission is hereby granted, free of charge, to any person obtaining
7	* a copy of the Unicode data files and any associated documentation
8	* (the "Data Files") or Unicode software and any associated documentation
9	* (the "Software") to deal in the Data Files or Software
10	* without restriction, including without limitation the rights to use,
11	* copy, modify, merge, publish, distribute, and/or sell copies of
12	* the Data Files or Software, and to permit persons to whom the Data Files
13	* or Software are furnished to do so, provided that
14	* (a) this copyright and permission notice appear with all copies
15	* of the Data Files or Software,
16	* (b) this copyright and permission notice appear in associated
17	* documentation, and
18	* (c) there is clear notice in each modified Data File or in the Software
19	* as well as in the documentation associated with the Data File(s) or
20	* Software that the data or software has been modified.
21	*
22	* THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
23	* ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
24	* WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
25	* NONINFRINGEMENT OF THIRD PARTY RIGHTS.
26	* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
27	* NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
28	* DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
29	* DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
30	* TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
31	* PERFORMANCE OF THE DATA FILES OR SOFTWARE.
32	*
33	* Except as contained in this notice, the name of a copyright holder
34	* shall not be used in advertising or otherwise to promote the sale,
35	* use or other dealings in these Data Files or Software without prior
36	* written authorization of the copyright holder.
37	*/
38
/* ---------------------------------------------------------------------

    Conversions between UTF-32, UTF-16, and UTF-8. Source code file.
    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
    Sept 2001: fixed const & error conditions per
        mods suggested by S. Parent & A. Lillich.
    June 2002: Tim Dodd added detection and handling of incomplete
        source sequences, enhanced error detection, added casts
        to eliminate compiler warnings.
    July 2003: slight mods to back out aggressive FFFE detection.
    Jan 2004: updated switches in from-UTF8 conversions.
    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.

    See the header file "convert_UTF.h" for complete documentation.

------------------------------------------------------------------------ */
56
57
58	#include "convert_UTF.h"
59	#ifdef CVTUTF_DEBUG
60	#include <stdio.h>
61	#endif
62
63	#include "common/macros.h"
64
65	namespace google_breakpad {
66
67	namespace {
68
69	const int halfShift = `10`; / used for shifting by 10 bits /
70
71	const UTF32 halfBase = `0x0010000UL`;
72	const UTF32 halfMask = `0x3FFUL`;
73
74	} // namespace
75
/*
 * Bounds of the UTF-16 surrogate ranges.  Each expansion is fully
 * parenthesized so the cast cannot bind unexpectedly inside a larger
 * expression.
 */
#define UNI_SUR_HIGH_START  ((UTF32)0xD800)
#define UNI_SUR_HIGH_END    ((UTF32)0xDBFF)
#define UNI_SUR_LOW_START   ((UTF32)0xDC00)
#define UNI_SUR_LOW_END     ((UTF32)0xDFFF)
80
81	/ --------------------------------------------------------------------- /
82
83	ConversionResult ConvertUTF32toUTF16 (const UTF32** sourceStart, const UTF32* sourceEnd,
84	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
85	ConversionResult result = conversionOK;
86	const UTF32* source = *sourceStart;
87	UTF16* target = *targetStart;
88	while (source < sourceEnd) {
89	UTF32 ch;
90	if (target >= targetEnd) {
91	result = targetExhausted; break;
92	}
93	ch = *source++;
94	if (ch <= UNI_MAX_BMP) { / Target is a character <= 0xFFFF /
95	/ UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values /
96	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
97	if (flags == strictConversion) {
98	--source; / return to the illegal value itself /
99	result = sourceIllegal;
100	break;
101	} else {
102	*target++ = UNI_REPLACEMENT_CHAR;
103	}
104	} else {
105	target++ = (UTF16)ch; /* normal case /
106	}
107	} else if (ch > UNI_MAX_LEGAL_UTF32) {
108	if (flags == strictConversion) {
109	result = sourceIllegal;
110	} else {
111	*target++ = UNI_REPLACEMENT_CHAR;
112	}
113	} else {
114	/ target is a character in range 0xFFFF - 0x10FFFF. /
115	if (target + `1` >= targetEnd) {
116	--source; / Back up source pointer! /
117	result = targetExhausted; break;
118	}
119	ch -= halfBase;
120	*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
121	*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
122	}
123	}
124	*sourceStart = source;
125	*targetStart = target;
126	return result;
127	}
128
129	/ --------------------------------------------------------------------- /
130
131	ConversionResult ConvertUTF16toUTF32 (const UTF16** sourceStart, const UTF16* sourceEnd,
132	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
133	ConversionResult result = conversionOK;
134	const UTF16* source = *sourceStart;
135	UTF32* target = *targetStart;
136	UTF32 ch, ch2;
137	while (source < sourceEnd) {
138	const UTF16* oldSource = source; / In case we have to back up because of target overflow. /
139	ch = *source++;
140	/ If we have a surrogate pair, convert to UTF32 first. /
141	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
142	/ If the 16 bits following the high surrogate are in the source buffer... /
143	if (source < sourceEnd) {
144	ch2 = *source;
145	/ If it's a low surrogate, convert to UTF32. /
146	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
147	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
148	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
149	++source;
150	} else if (flags == strictConversion) { / it's an unpaired high surrogate /
151	--source; / return to the illegal value itself /
152	result = sourceIllegal;
153	break;
154	}
155	} else { / We don't have the 16 bits following the high surrogate. /
156	--source; / return to the high surrogate /
157	result = sourceExhausted;
158	break;
159	}
160	} else if (flags == strictConversion) {
161	/ UTF-16 surrogate values are illegal in UTF-32 /
162	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
163	--source; / return to the illegal value itself /
164	result = sourceIllegal;
165	break;
166	}
167	}
168	if (target >= targetEnd) {
169	source = oldSource; / Back up source pointer! /
170	result = targetExhausted; break;
171	}
172	*target++ = ch;
173	}
174	*sourceStart = source;
175	*targetStart = target;
176	#ifdef CVTUTF_DEBUG
177	if (result == sourceIllegal) {
178	fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
179	fflush(stderr);
180	}
181	#endif
182	return result;
183	}
184
185	/ --------------------------------------------------------------------- /
186
187	namespace {
188
189	/*
190	* Index into the table below with the first byte of a UTF-8 sequence to
191	* get the number of trailing bytes that are supposed to follow it.
192	* Note that legal UTF-8 values can't have 4 or 5-bytes. The table is
193	* left as-is for anyone who may want to do such conversion, which was
194	* allowed in earlier algorithms.
195	*/
196	const char trailingBytesForUTF8[`256`] = {
197	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`, `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,
198	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`, `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,
199	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`, `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,
200	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`, `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,
201	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`, `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,
202	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`, `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,
203	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`, `1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,
204	`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`, `3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`4`,`4`,`4`,`4`,`5`,`5`,`5`,`5`
205	};
206
207	/*
208	* Magic values subtracted from a buffer value during UTF8 conversion.
209	* This table contains as many values as there might be trailing bytes
210	* in a UTF-8 sequence.
211	*/
212	const UTF32 offsetsFromUTF8[`6`] = { `0x00000000UL`, `0x00003080UL`, `0x000E2080UL`,
213	`0x03C82080UL`, `0xFA082080UL`, `0x82082080UL` };
214
215	/*
216	* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
217	* into the first byte, depending on how many bytes follow. There are
218	* as many entries in this table as there are UTF-8 sequence types.
219	* (I.e., one byte sequence, two byte... etc.). Remember that sequencs
220	* for legal UTF-8 will be 4 or fewer bytes total.
221	*/
222	const UTF8 firstByteMark[`7`] = { `0x00`, `0x00`, `0xC0`, `0xE0`, `0xF0`, `0xF8`, `0xFC` };
223
224	/ --------------------------------------------------------------------- /
225
226	/ The interface converts a whole buffer to avoid function-call overhead.*
227	* Constants have been gathered. Loops & conditionals have been removed as
228	* much as possible for efficiency, in favor of drop-through switches.
229	* (See "Note A" at the bottom of the file for equivalent code.)
230	* If your compiler supports it, the "isLegalUTF8" call can be turned
231	* into an inline function.
232	*/
233
234	} // namespace
235
236	/ --------------------------------------------------------------------- /
237
238	ConversionResult ConvertUTF16toUTF8 (const UTF16** sourceStart, const UTF16* sourceEnd,
239	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
240	ConversionResult result = conversionOK;
241	const UTF16* source = *sourceStart;
242	UTF8* target = *targetStart;
243	while (source < sourceEnd) {
244	UTF32 ch;
245	unsigned short bytesToWrite = `0`;
246	const UTF32 byteMask = `0xBF`;
247	const UTF32 byteMark = `0x80`;
248	const UTF16* oldSource = source; / In case we have to back up because of target overflow. /
249	ch = *source++;
250	/ If we have a surrogate pair, convert to UTF32 first. /
251	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
252	/ If the 16 bits following the high surrogate are in the source buffer... /
253	if (source < sourceEnd) {
254	UTF32 ch2 = *source;
255	/ If it's a low surrogate, convert to UTF32. /
256	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
257	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
258	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
259	++source;
260	} else if (flags == strictConversion) { / it's an unpaired high surrogate /
261	--source; / return to the illegal value itself /
262	result = sourceIllegal;
263	break;
264	}
265	} else { / We don't have the 16 bits following the high surrogate. /
266	--source; / return to the high surrogate /
267	result = sourceExhausted;
268	break;
269	}
270	} else if (flags == strictConversion) {
271	/ UTF-16 surrogate values are illegal in UTF-32 /
272	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
273	--source; / return to the illegal value itself /
274	result = sourceIllegal;
275	break;
276	}
277	}
278	/ Figure out how many bytes the result will require /
279	if (ch < (UTF32)`0x80`) { bytesToWrite = `1`;
280	} else if (ch < (UTF32)`0x800`) { bytesToWrite = `2`;
281	} else if (ch < (UTF32)`0x10000`) { bytesToWrite = `3`;
282	} else if (ch < (UTF32)`0x110000`) { bytesToWrite = `4`;
283	} else { bytesToWrite = `3`;
284	ch = UNI_REPLACEMENT_CHAR;
285	}
286
287	target += bytesToWrite;
288	if (target > targetEnd) {
289	source = oldSource; / Back up source pointer! /
290	target -= bytesToWrite; result = targetExhausted; break;
291	}
292	switch (bytesToWrite) { / note: everything falls through. /
293	case `4`:
294	*--target = (UTF8)((ch \| byteMark) & byteMask);
295	ch >>= `6`;
296	BP_FALLTHROUGH;
297	case `3`:
298	*--target = (UTF8)((ch \| byteMark) & byteMask);
299	ch >>= `6`;
300	BP_FALLTHROUGH;
301	case `2`:
302	*--target = (UTF8)((ch \| byteMark) & byteMask);
303	ch >>= `6`;
304	BP_FALLTHROUGH;
305	case `1`:
306	*--target = (UTF8)(ch \| firstByteMark[bytesToWrite]);
307	}
308	target += bytesToWrite;
309	}
310	*sourceStart = source;
311	*targetStart = target;
312	return result;
313	}
314
315	/ --------------------------------------------------------------------- /
316
317	namespace {
318
319	/*
320	* Utility routine to tell whether a sequence of bytes is legal UTF-8.
321	* This must be called with the length pre-determined by the first byte.
322	* If not calling this from ConvertUTF8to*, then the length can be set by:
323	* length = trailingBytesForUTF8[*source]+1;
324	* and the sequence is illegal right away if there aren't that many bytes
325	* available.
326	* If presented with a length > 4, this returns false. The Unicode
327	* definition of UTF-8 goes up to 4-byte sequences.
328	*/
329	Boolean isLegalUTF8(const UTF8 source, int* length) {
330	UTF8 a;
331	const UTF8 *srcptr = source+length;
332	switch (length) {
333	default: return false;
334	/ Everything else falls through when "true"... /
335	case `4`:
336	if ((a = (--srcptr)) < `0x80` \|\| a > `0xBF`) return* false;
337	BP_FALLTHROUGH;
338	case `3`:
339	if ((a = (--srcptr)) < `0x80` \|\| a > `0xBF`) return* false;
340	BP_FALLTHROUGH;
341	case `2`:
342	if ((a = (--srcptr)) > `0xBF`) return* false;
343
344	switch (*source) {
345	/ no fall-through in this inner switch /
346	case `0xE0`: if (a < `0xA0`) return false; break;
347	case `0xED`: if (a > `0x9F`) return false; break;
348	case `0xF0`: if (a < `0x90`) return false; break;
349	case `0xF4`: if (a > `0x8F`) return false; break;
350	default: if (a < `0x80`) return false;
351	}
352	BP_FALLTHROUGH;
353	case `1`: if (source >= `0x80` && source < `0xC2`) return false;
354	}
355	if (source > `0xF4`) return* false;
356	return true;
357	}
358
359	} // namespace
360
361	/ --------------------------------------------------------------------- /
362
363	/*
364	* Exported function to return whether a UTF-8 sequence is legal or not.
365	* This is not used here; it's just exported.
366	*/
367	Boolean isLegalUTF8Sequence(const UTF8 source, const* UTF8 *sourceEnd) {
368	int length = trailingBytesForUTF8[*source]+`1`;
369	if (source+length > sourceEnd) {
370	return false;
371	}
372	return isLegalUTF8(source, length);
373	}
374
375	/ --------------------------------------------------------------------- /
376
377	ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd,
378	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
379	ConversionResult result = conversionOK;
380	const UTF8* source = *sourceStart;
381	UTF16* target = *targetStart;
382	while (source < sourceEnd) {
383	UTF32 ch = `0`;
384	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
385	if (source + extraBytesToRead >= sourceEnd) {
386	result = sourceExhausted; break;
387	}
388	/ Do this check whether lenient or strict /
389	if (! isLegalUTF8(source, extraBytesToRead+`1`)) {
390	result = sourceIllegal;
391	break;
392	}
393	/*
394	* The cases all fall through. See "Note A" below.
395	*/
396	switch (extraBytesToRead) {
397	/ remember, illegal UTF-8 /
398	case `5`: ch += *source++; ch <<= `6`; BP_FALLTHROUGH;
399	/ remember, illegal UTF-8 /
400	case `4`: ch += *source++; ch <<= `6`; BP_FALLTHROUGH;
401	case `3`: ch += *source++; ch <<= `6`; BP_FALLTHROUGH;
402	case `2`: ch += *source++; ch <<= `6`; BP_FALLTHROUGH;
403	case `1`: ch += *source++; ch <<= `6`; BP_FALLTHROUGH;
404	case `0`: ch += *source++;
405	}
406	ch -= offsetsFromUTF8[extraBytesToRead];
407
408	if (target >= targetEnd) {
409	source -= (extraBytesToRead+`1`); / Back up source pointer! /
410	result = targetExhausted; break;
411	}
412	if (ch <= UNI_MAX_BMP) { / Target is a character <= 0xFFFF /
413	/ UTF-16 surrogate values are illegal in UTF-32 /
414	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
415	if (flags == strictConversion) {
416	source -= (extraBytesToRead+`1`); / return to the illegal value itself /
417	result = sourceIllegal;
418	break;
419	} else {
420	*target++ = UNI_REPLACEMENT_CHAR;
421	}
422	} else {
423	target++ = (UTF16)ch; /* normal case /
424	}
425	} else if (ch > UNI_MAX_UTF16) {
426	if (flags == strictConversion) {
427	result = sourceIllegal;
428	source -= (extraBytesToRead+`1`); / return to the start /
429	break; / Bail out; shouldn't continue /
430	} else {
431	*target++ = UNI_REPLACEMENT_CHAR;
432	}
433	} else {
434	/ target is a character in range 0xFFFF - 0x10FFFF. /
435	if (target + `1` >= targetEnd) {
436	source -= (extraBytesToRead+`1`); / Back up source pointer! /
437	result = targetExhausted; break;
438	}
439	ch -= halfBase;
440	*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
441	*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
442	}
443	}
444	*sourceStart = source;
445	*targetStart = target;
446	return result;
447	}
448
449	/ --------------------------------------------------------------------- /
450
451	ConversionResult ConvertUTF32toUTF8 (const UTF32** sourceStart, const UTF32* sourceEnd,
452	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
453	ConversionResult result = conversionOK;
454	const UTF32* source = *sourceStart;
455	UTF8* target = *targetStart;
456	while (source < sourceEnd) {
457	UTF32 ch;
458	unsigned short bytesToWrite = `0`;
459	const UTF32 byteMask = `0xBF`;
460	const UTF32 byteMark = `0x80`;
461	ch = *source++;
462	if (flags == strictConversion ) {
463	/ UTF-16 surrogate values are illegal in UTF-32 /
464	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
465	--source; / return to the illegal value itself /
466	result = sourceIllegal;
467	break;
468	}
469	}
470	/*
471	* Figure out how many bytes the result will require. Turn any
472	* illegally large UTF32 things (> Plane 17) into replacement chars.
473	*/
474	if (ch < (UTF32)`0x80`) { bytesToWrite = `1`;
475	} else if (ch < (UTF32)`0x800`) { bytesToWrite = `2`;
476	} else if (ch < (UTF32)`0x10000`) { bytesToWrite = `3`;
477	} else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = `4`;
478	} else { bytesToWrite = `3`;
479	ch = UNI_REPLACEMENT_CHAR;
480	result = sourceIllegal;
481	}
482
483	target += bytesToWrite;
484	if (target > targetEnd) {
485	--source; / Back up source pointer! /
486	target -= bytesToWrite; result = targetExhausted; break;
487	}
488	switch (bytesToWrite) { / note: everything falls through. /
489	case `4`:
490	*--target = (UTF8)((ch \| byteMark) & byteMask);
491	ch >>= `6`;
492	BP_FALLTHROUGH;
493	case `3`:
494	*--target = (UTF8)((ch \| byteMark) & byteMask);
495	ch >>= `6`;
496	BP_FALLTHROUGH;
497	case `2`:
498	*--target = (UTF8)((ch \| byteMark) & byteMask);
499	ch >>= `6`;
500	BP_FALLTHROUGH;
501	case `1`:
502	*--target = (UTF8) (ch \| firstByteMark[bytesToWrite]);
503	}
504	target += bytesToWrite;
505	}
506	*sourceStart = source;
507	*targetStart = target;
508	return result;
509	}
510
511	/ --------------------------------------------------------------------- /
512
513	ConversionResult ConvertUTF8toUTF32 (const UTF8** sourceStart, const UTF8* sourceEnd,
514	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
515	ConversionResult result = conversionOK;
516	const UTF8* source = *sourceStart;
517	UTF32* target = *targetStart;
518	while (source < sourceEnd) {
519	UTF32 ch = `0`;
520	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
521	if (source + extraBytesToRead >= sourceEnd) {
522	result = sourceExhausted; break;
523	}
524	/ Do this check whether lenient or strict /
525	if (! isLegalUTF8(source, extraBytesToRead+`1`)) {
526	result = sourceIllegal;
527	break;
528	}
529	/*
530	* The cases all fall through. See "Note A" below.
531	*/
532	switch (extraBytesToRead) {
533	case `5`: ch += *source++; ch <<= `6`; BP_FALLTHROUGH;
534	case `4`: ch += *source++; ch <<= `6`; BP_FALLTHROUGH;
535	case `3`: ch += *source++; ch <<= `6`; BP_FALLTHROUGH;
536	case `2`: ch += *source++; ch <<= `6`; BP_FALLTHROUGH;
537	case `1`: ch += *source++; ch <<= `6`; BP_FALLTHROUGH;
538	case `0`: ch += *source++;
539	}
540	ch -= offsetsFromUTF8[extraBytesToRead];
541
542	if (target >= targetEnd) {
543	source -= (extraBytesToRead+`1`); / Back up the source pointer! /
544	result = targetExhausted; break;
545	}
546	if (ch <= UNI_MAX_LEGAL_UTF32) {
547	/*
548	* UTF-16 surrogate values are illegal in UTF-32, and anything
549	* over Plane 17 (> 0x10FFFF) is illegal.
550	*/
551	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
552	if (flags == strictConversion) {
553	source -= (extraBytesToRead+`1`); / return to the illegal value itself /
554	result = sourceIllegal;
555	break;
556	} else {
557	*target++ = UNI_REPLACEMENT_CHAR;
558	}
559	} else {
560	*target++ = ch;
561	}
562	} else { / i.e., ch > UNI_MAX_LEGAL_UTF32 /
563	result = sourceIllegal;
564	*target++ = UNI_REPLACEMENT_CHAR;
565	}
566	}
567	*sourceStart = source;
568	*targetStart = target;
569	return result;
570	}
571
/* ---------------------------------------------------------------------

    Note A.
    The fall-through switches in UTF-8 reading code save a
    temp variable, some decrements & conditionals.  The switches
    are equivalent to the following loop:
        {
            int tmpBytesToRead = extraBytesToRead+1;
            do {
                ch += *source++;
                --tmpBytesToRead;
                if (tmpBytesToRead) ch <<= 6;
            } while (tmpBytesToRead > 0);
        }
    In UTF-8 writing code, the switches on "bytesToWrite" are
    similarly unrolled loops.

   --------------------------------------------------------------------- */
590
591	} // namespace google_breakpad
592

Browse the source code of breakpad/common/convert_UTF.cc