ucnvmbcs.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/ucnvmbcs.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	******************************************************************************
5	*
6	* Copyright (C) 2000-2016, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	******************************************************************************
10	* file name: ucnvmbcs.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2000jul03
16	* created by: Markus W. Scherer
17	*
18	* The current code in this file replaces the previous implementation
19	* of conversion code from multi-byte codepages to Unicode and back.
20	* This implementation supports the following:
21	* - legacy variable-length codepages with up to 4 bytes per character
22	* - all Unicode code points (up to 0x10ffff)
23	* - efficient distinction of unassigned vs. illegal byte sequences
24	* - it is possible in fromUnicode() to directly deal with simple
25	* stateful encodings (used for EBCDIC_STATEFUL)
26	* - it is possible to convert Unicode code points
27	* to a single zero byte (but not as a fallback except for SBCS)
28	*
29	* Remaining limitations in fromUnicode:
30	* - byte sequences must not have leading zero bytes
31	* - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
32	* - limitation to up to 4 bytes per character
33	*
34	* ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
35	* limitations and adds m:n character mappings and other features.
36	* See ucnv_ext.h for details.
37	*
38	* Change history:
39	*
40	* 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
41	* MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
42	* macros to ucnvmbcs.h file
43	*/
44
45	#include "unicode/utypes.h"
46
47	#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
48
49	#include "unicode/ucnv.h"
50	#include "unicode/ucnv_cb.h"
51	#include "unicode/udata.h"
52	#include "unicode/uset.h"
53	#include "unicode/utf8.h"
54	#include "unicode/utf16.h"
55	#include "ucnv_bld.h"
56	#include "ucnvmbcs.h"
57	#include "ucnv_ext.h"
58	#include "ucnv_cnv.h"
59	#include "cmemory.h"
60	#include "cstring.h"
61	#include "umutex.h"
62	#include "ustr_imp.h"
63
/* control optimizations according to the platform */
65	#define MBCS_UNROLL_SINGLE_TO_BMP 1
66	#define MBCS_UNROLL_SINGLE_FROM_BMP 0
67
68	/*
69	* _MBCSHeader versions 5.3 & 4.3
70	* (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
71	*
72	* This version is optional. Version 5 is used for incompatible data format changes.
73	* makeconv will continue to generate version 4 files if possible.
74	*
75	* Changes from version 4:
76	*
77	* The main difference is an additional _MBCSHeader field with
78	* - the length (number of uint32_t) of the _MBCSHeader
79	* - flags for further incompatible data format changes
80	* - flags for further, backward compatible data format changes
81	*
82	* The MBCS_OPT_FROM_U flag indicates that most of the fromUnicode data is omitted from
83	* the file and needs to be reconstituted at load time.
84	* This requires a utf8Friendly format with an additional mbcsIndex table for fast
85	* (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
86	* (For details about these structures see below, and see ucnvmbcs.h.)
87	*
88	* utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
89	* of the Unicode code points. (This requires that the .ucm file has the \|0 etc.
90	* precision markers for all mappings.)
91	*
92	* All fallbacks have been moved to the extension table, leaving only roundtrips in the
93	* omitted data that can be reconstituted from the toUnicode data.
94	*
95	* Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
96	* With only roundtrip mappings in the base fromUnicode data, this part is fully
97	* redundant with the mbcsIndex and will be reconstituted from that (also using the
98	* stage 1 table which contains the information about how stage 2 was compacted).
99	*
100	* The rest of the stage 2 table, the part for code points above maxFastUChar,
101	* is stored in the file and will be appended to the reconstituted part.
102	*
 * The entire fromUBytes array is omitted from the file and will be reconstituted.
104	* This is done by enumerating all toUnicode roundtrip mappings, performing
105	* each mapping (using the stage 1 and reconstituted stage 2 tables) and
106	* writing instead of reading the byte values.
107	*
108	* _MBCSHeader version 4.3
109	*
110	* Change from version 4.2:
111	* - Optional utf8Friendly data structures, with 64-entry stage 3 block
112	* allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
113	* files which can be used instead of stages 1 & 2.
114	* Faster lookups for roundtrips from most commonly used characters,
115	* and lookups from UTF-8 byte sequences with a natural bit distribution.
116	* See ucnvmbcs.h for more details.
117	*
118	* Change from version 4.1:
119	* - Added an optional extension table structure at the end of the .cnv file.
120	* It is present if the upper bits of the header flags field contains a non-zero
121	* byte offset to it.
122	* Files that contain only a conversion table and no base table
123	* use the special outputType MBCS_OUTPUT_EXT_ONLY.
124	* These contain the base table name between the MBCS header and the extension
125	* data.
126	*
127	* Change from version 4.0:
128	* - Replace header.reserved with header.fromUBytesLength so that all
129	* fields in the data have length.
130	*
131	* Changes from version 3 (for performance improvements):
132	* - new bit distribution for state table entries
133	* - reordered action codes
134	* - new data structure for single-byte fromUnicode
135	* + stage 2 only contains indexes
136	* + stage 3 stores 16 bits per character with classification bits 15..8
137	* - no multiplier for stage 1 entries
138	* - stage 2 for non-single-byte codepages contains the index and the flags in
139	* one 32-bit value
140	* - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
141	*
142	* For more details about old versions of the MBCS data structure, see
143	* the corresponding versions of this file.
144	*
145	* Converting stateless codepage data ---------------------------------------***
146	* (or codepage data with simple states) to Unicode.
147	*
148	* Data structure and algorithm for converting from complex legacy codepages
149	* to Unicode. (Designed before 2000-may-22.)
150	*
151	* The basic idea is that the structure of legacy codepages can be described
152	* with state tables.
153	* When reading a byte stream, each input byte causes a state transition.
154	* Some transitions result in the output of a code point, some result in
155	* "unassigned" or "illegal" output.
156	* This is used here for character conversion.
157	*
158	* The data structure begins with a state table consisting of a row
159	* per state, with 256 entries (columns) per row for each possible input
160	* byte value.
161	* Each entry is 32 bits wide, with two formats distinguished by
162	* the sign bit (bit 31):
163	*
164	* One format for transitional entries (bit 31 not set) for non-final bytes, and
165	* one format for final entries (bit 31 set).
166	* Both formats contain the number of the next state in the same bit
167	* positions.
168	* State 0 is the initial state.
169	*
170	* Most of the time, the offset values of subsequent states are added
171	* up to a scalar value. This value will eventually be the index of
172	* the Unicode code point in a table that follows the state table.
173	* The effect is that the code points for final state table rows
174	* are contiguous. The code points of final state rows follow each other
175	* in the order of the references to those final states by previous
176	* states, etc.
177	*
 * For some terminal states, the offset is itself the output Unicode
 * code point (16 bits for a BMP code point or 20 bits for a supplementary
 * code point, stored as code point minus 0x10000 so that 20 bits are enough).
181	* For others, the code point in the Unicode table is stored with either
182	* one or two code units: one for BMP code points, two for a pair of
183	* surrogates.
184	* All code points for a final state entry take up the same number of code
185	* units, regardless of whether they all actually _use_ the same number
186	* of code units. This is necessary for simple array access.
187	*
188	* An additional feature comes in with what in ICU is called "fallback"
189	* mappings:
190	*
191	* In addition to round-trippable, precise, 1:1 mappings, there are often
192	* mappings defined between similar, though not the same, characters.
193	* Typically, such mappings occur only in fromUnicode mapping tables because
194	* Unicode has a superset repertoire of most other codepages. However, it
195	* is possible to provide such mappings in the toUnicode tables, too.
196	* In this case, the fallback mappings are partly integrated into the
197	* general state tables because the structure of the encoding includes their
198	* byte sequences.
199	* For final entries in an initial state, fallback mappings are stored in
200	* the entry itself like with roundtrip mappings.
201	* For other final entries, they are stored in the code units table if
202	* the entry is for a pair of code units.
203	* For single-unit results in the code units table, there is no space to
204	* alternatively hold a fallback mapping; in this case, the code unit
205	* is stored as U+fffe (unassigned), and the fallback mapping needs to
206	* be looked up by the scalar offset value in a separate table.
207	*
208	* "Unassigned" state entries really mean "structurally unassigned",
209	* i.e., such a byte sequence will never have a mapping result.
210	*
211	* The interpretation of the bits in each entry is as follows:
212	*
213	* Bit 31 not set, not a terminal entry ("transitional"):
214	* 30..24 next state
215	* 23..0 offset delta, to be added up
216	*
217	* Bit 31 set, terminal ("final") entry:
218	* 30..24 next state (regardless of action code)
219	* 23..20 action code:
220	* action codes 0 and 1 result in precise-mapping Unicode code points
221	* 0 valid byte sequence
222	* 19..16 not used, 0
223	* 15..0 16-bit Unicode BMP code point
224	* never U+fffe or U+ffff
225	* 1 valid byte sequence
226	* 19..0 20-bit Unicode supplementary code point
227	* never U+fffe or U+ffff
228	*
229	* action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
230	* 2 valid byte sequence (fallback)
231	* 19..16 not used, 0
232	* 15..0 16-bit Unicode BMP code point as fallback result
233	* 3 valid byte sequence (fallback)
234	* 19..0 20-bit Unicode supplementary code point as fallback result
235	*
236	* action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
237	* depending on the code units they result in
238	* 4 valid byte sequence
239	* 19..9 not used, 0
240	* 8..0 final offset delta
241	* pointing to one 16-bit code unit which may be
242	* fffe unassigned -- look for a fallback for this offset
243	* ffff illegal
244	* 5 valid byte sequence
245	* 19..9 not used, 0
246	* 8..0 final offset delta
247	* pointing to two 16-bit code units
248	* (typically UTF-16 surrogates)
249	* the result depends on the first code unit as follows:
250	* 0000..d7ff roundtrip BMP code point (1st alone)
251	* d800..dbff roundtrip surrogate pair (1st, 2nd)
252	* dc00..dfff fallback surrogate pair (1st-400, 2nd)
253	* e000 roundtrip BMP code point (2nd alone)
254	* e001 fallback BMP code point (2nd alone)
255	* fffe unassigned
256	* ffff illegal
257	* (the final offset deltas are at most 255 * 2,
258	* times 2 because of storing code unit pairs)
259	*
260	* 6 unassigned byte sequence
261	* 19..16 not used, 0
262	* 15..0 16-bit Unicode BMP code point U+fffe (new with version 2)
263	* this does not contain a final offset delta because the main
264	* purpose of this action code is to save scalar offset values;
265	* therefore, fallback values cannot be assigned to byte
266	* sequences that result in this action code
267	* 7 illegal byte sequence
268	* 19..16 not used, 0
269	* 15..0 16-bit Unicode BMP code point U+ffff (new with version 2)
270	* 8 state change only
271	* 19..0 not used, 0
272	* useful for state changes in simple stateful encodings,
273	* at Shift-In/Shift-Out codes
274	*
275	*
276	* 9..15 reserved for future use
277	* current implementations will only perform a state change
278	* and ignore bits 19..0
279	*
280	* An encoding with contiguous ranges of unassigned byte sequences, like
281	* Shift-JIS and especially EUC-TW, can be stored efficiently by having
282	* at least two states for the trail bytes:
283	* One trail byte state that results in code points, and one that only
284	* has "unassigned" and "illegal" terminal states.
285	*
286	* Note: partly by accident, this data structure supports simple stateful
287	* encodings without any additional logic.
288	* Currently, only simple Shift-In/Shift-Out schemes are handled with
289	* appropriate state tables (especially EBCDIC_STATEFUL!).
290	*
291	* MBCS version 2 added:
292	* unassigned and illegal action codes have U+fffe and U+ffff
293	* instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
294	*
295	* Converting from Unicode to codepage bytes --------------------------------***
296	*
297	* The conversion data structure for fromUnicode is designed for the known
298	* structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
299	* a sequence of 1..4 bytes, in addition to a flag that indicates if there is
300	* a roundtrip mapping.
301	*
302	* The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
303	* like in the character properties table.
304	* The beginning of the trie is at offsetFromUTable, the beginning of stage 3
305	* with the resulting bytes is at offsetFromUBytes.
306	*
307	* Beginning with version 4, single-byte codepages have a significantly different
308	* trie compared to other codepages.
309	* In all cases, the entry in stage 1 is directly the index of the block of
310	* 64 entries in stage 2.
311	*
312	* Single-byte lookup:
313	*
314	* Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
315	* Stage 3 contains one 16-bit word per result:
316	* Bits 15..8 indicate the kind of result:
317	* f roundtrip result
318	* c fallback result from private-use code point
319	* 8 fallback result from other code points
320	* 0 unassigned
321	* Bits 7..0 contain the codepage byte. A zero byte is always possible.
322	*
323	* In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly
324	* file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup
325	* becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
326	* ASCII code points can be looked up with a linear array access into stage 3.
327	* See maxFastUChar and other details in ucnvmbcs.h.
328	*
329	* Multi-byte lookup:
330	*
331	* Stage 2 contains a 32-bit word for each 16-block in stage 3:
332	* Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
333	* test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
334	* If this test is false, then a non-zero result will be interpreted as
335	* a fallback mapping.
336	* Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char)
337	*
338	* Stage 3 contains 2, 3, or 4 bytes per result.
339	* 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
340	* while 3 bytes are stored as bytes in big-endian order.
341	* Leading zero bytes are ignored, and the number of bytes is counted.
342	* A zero byte mapping result is possible as a roundtrip result.
343	* For some output types, the actual result is processed from this;
344	* see ucnv_MBCSFromUnicodeWithOffsets().
345	*
346	* Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
347	* or (version 3 and up) for BMP-only codepages, it contains 64 entries.
348	*
349	* In version 4.3, a utf8Friendly file contains an mbcsIndex table.
350	* For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup
351	* becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
352	* ASCII code points can be looked up with a linear array access into stage 3.
353	* See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.
354	*
355	* In version 3, stage 2 blocks may overlap by multiples of the multiplier
356	* for compaction.
357	* In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
358	* may overlap by any number of entries.
359	*
360	* MBCS version 2 added:
361	* the converter checks for known output types, which allows
362	* adding new ones without crashing an unaware converter
363	*/
364
365	/**
366	* Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
367	* consecutive sequences of bytes, starting from the one encoded in value,
368	* to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
369	* Does not currently support m:n mappings or reverse fallbacks.
370	* This function will not be called for sequences of bytes with leading zeros.
371	*
372	* @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
373	* @param value contains 1..4 bytes of the first byte sequence, right-aligned
374	* @param codePoints resulting Unicode code points, or negative if a byte sequence does
375	* not map to anything
376	* @return TRUE to continue enumeration, FALSE to stop
377	*/
378	typedef UBool U_CALLCONV
379	UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[`32`]);
380
381	static void U_CALLCONV
382	ucnv_MBCSLoad(UConverterSharedData *sharedData,
383	UConverterLoadArgs *pArgs,
384	const uint8_t *raw,
385	UErrorCode *pErrorCode);
386
387	static void U_CALLCONV
388	ucnv_MBCSUnload(UConverterSharedData *sharedData);
389
390	static void U_CALLCONV
391	ucnv_MBCSOpen(UConverter *cnv,
392	UConverterLoadArgs *pArgs,
393	UErrorCode *pErrorCode);
394
395	static UChar32 U_CALLCONV
396	ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
397	UErrorCode *pErrorCode);
398
399	static void U_CALLCONV
400	ucnv_MBCSGetStarters(const UConverter* cnv,
401	UBool starters[`256`],
402	UErrorCode *pErrorCode);
403
404	U_CDECL_BEGIN
405	static const char* U_CALLCONV
406	ucnv_MBCSGetName(const UConverter *cnv);
407	U_CDECL_END
408
409	static void U_CALLCONV
410	ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
411	int32_t offsetIndex,
412	UErrorCode *pErrorCode);
413
414	static UChar32 U_CALLCONV
415	ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
416	UErrorCode *pErrorCode);
417
418	static void U_CALLCONV
419	ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
420	UConverterToUnicodeArgs *pToUArgs,
421	UErrorCode *pErrorCode);
422
423	static void U_CALLCONV
424	ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
425	const USetAdder *sa,
426	UConverterUnicodeSet which,
427	UErrorCode *pErrorCode);
428
429	static void U_CALLCONV
430	ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
431	UConverterToUnicodeArgs *pToUArgs,
432	UErrorCode *pErrorCode);
433
/*
 * Converter implementation table whose last entry is ucnv_SBCSFromUTF8.
 * All other entries are the same functions (and NULLs) as in _MBCSImpl.
 * NOTE(review): the meaning of each positional slot is defined by the
 * UConverterImpl struct, which is declared outside this file -- confirm
 * against that declaration before reordering or adding entries.
 */
static const UConverterImpl _SBCSUTF8Impl={
    UCNV_MBCS,

    ucnv_MBCSLoad,
    ucnv_MBCSUnload,

    ucnv_MBCSOpen,
    NULL,
    NULL,

    ucnv_MBCSToUnicodeWithOffsets,
    ucnv_MBCSToUnicodeWithOffsets,
    ucnv_MBCSFromUnicodeWithOffsets,
    ucnv_MBCSFromUnicodeWithOffsets,
    ucnv_MBCSGetNextUChar,

    ucnv_MBCSGetStarters,
    ucnv_MBCSGetName,
    ucnv_MBCSWriteSub,
    NULL,
    ucnv_MBCSGetUnicodeSet,

    NULL,
    ucnv_SBCSFromUTF8
};
459
/*
 * Converter implementation table whose last entry is ucnv_DBCSFromUTF8.
 * All other entries are the same functions (and NULLs) as in _MBCSImpl.
 * NOTE(review): the meaning of each positional slot is defined by the
 * UConverterImpl struct, which is declared outside this file -- confirm
 * against that declaration before reordering or adding entries.
 */
static const UConverterImpl _DBCSUTF8Impl={
    UCNV_MBCS,

    ucnv_MBCSLoad,
    ucnv_MBCSUnload,

    ucnv_MBCSOpen,
    NULL,
    NULL,

    ucnv_MBCSToUnicodeWithOffsets,
    ucnv_MBCSToUnicodeWithOffsets,
    ucnv_MBCSFromUnicodeWithOffsets,
    ucnv_MBCSFromUnicodeWithOffsets,
    ucnv_MBCSGetNextUChar,

    ucnv_MBCSGetStarters,
    ucnv_MBCSGetName,
    ucnv_MBCSWriteSub,
    NULL,
    ucnv_MBCSGetUnicodeSet,

    NULL,
    ucnv_DBCSFromUTF8
};
485
/*
 * General converter implementation table; this is the one referenced by
 * _MBCSData. Its last two entries are NULL, whereas _SBCSUTF8Impl and
 * _DBCSUTF8Impl supply a from-UTF-8 function in the final entry.
 * NOTE(review): the meaning of each positional slot is defined by the
 * UConverterImpl struct, which is declared outside this file -- confirm
 * against that declaration before reordering or adding entries.
 */
static const UConverterImpl _MBCSImpl={
    UCNV_MBCS,

    ucnv_MBCSLoad,
    ucnv_MBCSUnload,

    ucnv_MBCSOpen,
    NULL,
    NULL,

    ucnv_MBCSToUnicodeWithOffsets,
    ucnv_MBCSToUnicodeWithOffsets,
    ucnv_MBCSFromUnicodeWithOffsets,
    ucnv_MBCSFromUnicodeWithOffsets,
    ucnv_MBCSGetNextUChar,

    ucnv_MBCSGetStarters,
    ucnv_MBCSGetName,
    ucnv_MBCSWriteSub,
    NULL,
    ucnv_MBCSGetUnicodeSet,
    NULL,
    NULL
};
510
511	/ Static data is in tools/makeconv/ucnvstat.c for data-based*
512	* converters. Be sure to update it as well.
513	*/
514
515	const UConverterSharedData _MBCSData={
516	sizeof(UConverterSharedData), `1`,
517	NULL, NULL, FALSE, TRUE, &_MBCSImpl,
518	`0`, UCNV_MBCS_TABLE_INITIALIZER
519	};
520
521
/* GB 18030 data ------------------------------------------------------------ */
523
/*
 * Helper macros for linear values for GB 18030 four-byte sequences.
 *
 * LINEAR_18030(a, b, c, d) folds the four bytes into one integer as
 * ((a*10+b)*126+c)*10+d, so that consecutive four-byte sequences yield
 * consecutive integers. The L suffixes force the arithmetic to at least long.
 */
#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))

/* linear value of the four-byte sequence 81 30 81 30 */
#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)

/*
 * Same as LINEAR_18030() but takes the four bytes packed big-endian into one
 * 32-bit value. The argument is parenthesized so that expressions such as
 * (hi | lo) are split into bytes correctly.
 */
#define LINEAR(x) LINEAR_18030((x)>>24, ((x)>>16)&0xff, ((x)>>8)&0xff, (x)&0xff)
530
531	/*
532	* Some ranges of GB 18030 where both the Unicode code points and the
533	* GB four-byte sequences are contiguous and are handled algorithmically by
534	* the special callback functions below.
535	* The values are start & end of Unicode & GB codes.
536	*
537	* Note that single surrogates are not mapped by GB 18030
538	* as of the re-released mapping tables from 2000-nov-30.
539	*/
540	static const uint32_t
541	gb18030Ranges[`14`][`4`]={
542	{`0x10000`, `0x10FFFF`, LINEAR(`0x90308130`), LINEAR(`0xE3329A35`)},
543	{`0x9FA6`, `0xD7FF`, LINEAR(`0x82358F33`), LINEAR(`0x8336C738`)},
544	{`0x0452`, `0x1E3E`, LINEAR(`0x8130D330`), LINEAR(`0x8135F436`)},
545	{`0x1E40`, `0x200F`, LINEAR(`0x8135F438`), LINEAR(`0x8136A531`)},
546	{`0xE865`, `0xF92B`, LINEAR(`0x8336D030`), LINEAR(`0x84308534`)},
547	{`0x2643`, `0x2E80`, LINEAR(`0x8137A839`), LINEAR(`0x8138FD38`)},
548	{`0xFA2A`, `0xFE2F`, LINEAR(`0x84309C38`), LINEAR(`0x84318537`)},
549	{`0x3CE1`, `0x4055`, LINEAR(`0x8231D438`), LINEAR(`0x8232AF32`)},
550	{`0x361B`, `0x3917`, LINEAR(`0x8230A633`), LINEAR(`0x8230F237`)},
551	{`0x49B8`, `0x4C76`, LINEAR(`0x8234A131`), LINEAR(`0x8234E733`)},
552	{`0x4160`, `0x4336`, LINEAR(`0x8232C937`), LINEAR(`0x8232F837`)},
553	{`0x478E`, `0x4946`, LINEAR(`0x8233E838`), LINEAR(`0x82349638`)},
554	{`0x44D7`, `0x464B`, LINEAR(`0x8233A339`), LINEAR(`0x8233C931`)},
555	{`0xFFE6`, `0xFFFF`, LINEAR(`0x8431A234`), LINEAR(`0x8431A439`)}
556	};
557
/* bit flag for UConverter.options indicating GB 18030 special handling */
#define _MBCS_OPTION_GB18030 0x8000

/* bit flags for UConverter.options indicating KEIS, JEF, JIPS special handling */
#define _MBCS_OPTION_KEIS 0x01000
#define _MBCS_OPTION_JEF 0x02000
#define _MBCS_OPTION_JIPS 0x04000

/* KEIS: two-byte Shift-Out and Shift-In sequences */
#define KEIS_SO_CHAR_1 0x0A
#define KEIS_SO_CHAR_2 0x42
#define KEIS_SI_CHAR_1 0x0A
#define KEIS_SI_CHAR_2 0x41

/* JEF: single-byte Shift-Out and Shift-In */
#define JEF_SO_CHAR 0x28
#define JEF_SI_CHAR 0x29

/* JIPS: two-byte Shift-Out and Shift-In sequences */
#define JIPS_SO_CHAR_1 0x1A
#define JIPS_SO_CHAR_2 0x70
#define JIPS_SI_CHAR_1 0x1A
#define JIPS_SI_CHAR_2 0x71
578
579	enum SISO_Option {
580	SI,
581	SO
582	};
583	typedef enum SISO_Option SISO_Option;
584
585	static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) {
586	int32_t SISOLength = `0`;
587
588	switch (option) {
589	case SI:
590	if ((cnvOption&_MBCS_OPTION_KEIS)!=`0`) {
591	value[`0`] = KEIS_SI_CHAR_1;
592	value[`1`] = KEIS_SI_CHAR_2;
593	SISOLength = `2`;
594	} else if ((cnvOption&_MBCS_OPTION_JEF)!=`0`) {
595	value[`0`] = JEF_SI_CHAR;
596	SISOLength = `1`;
597	} else if ((cnvOption&_MBCS_OPTION_JIPS)!=`0`) {
598	value[`0`] = JIPS_SI_CHAR_1;
599	value[`1`] = JIPS_SI_CHAR_2;
600	SISOLength = `2`;
601	} else {
602	value[`0`] = UCNV_SI;
603	SISOLength = `1`;
604	}
605	break;
606	case SO:
607	if ((cnvOption&_MBCS_OPTION_KEIS)!=`0`) {
608	value[`0`] = KEIS_SO_CHAR_1;
609	value[`1`] = KEIS_SO_CHAR_2;
610	SISOLength = `2`;
611	} else if ((cnvOption&_MBCS_OPTION_JEF)!=`0`) {
612	value[`0`] = JEF_SO_CHAR;
613	SISOLength = `1`;
614	} else if ((cnvOption&_MBCS_OPTION_JIPS)!=`0`) {
615	value[`0`] = JIPS_SO_CHAR_1;
616	value[`1`] = JIPS_SO_CHAR_2;
617	SISOLength = `2`;
618	} else {
619	value[`0`] = UCNV_SO;
620	SISOLength = `1`;
621	}
622	break;
623	default:
624	/ Should never happen. /
625	break;
626	}
627
628	return SISOLength;
629	}
630
/* Miscellaneous ------------------------------------------------------------ */
632
633	/ similar to ucnv_MBCSGetNextUChar() but recursive /
634	static UBool
635	enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],
636	int32_t state, uint32_t offset,
637	uint32_t value,
638	UConverterEnumToUCallback callback, const* void *context,
639	UErrorCode *pErrorCode) {
640	UChar32 codePoints[`32`];
641	const int32_t *row;
642	const uint16_t *unicodeCodeUnits;
643	UChar32 anyCodePoints;
644	int32_t b, limit;
645
646	row=mbcsTable->stateTable[state];
647	unicodeCodeUnits=mbcsTable->unicodeCodeUnits;
648
649	value<<=`8`;
650	anyCodePoints=-`1`; / becomes non-negative if there is a mapping /
651
652	b=(stateProps[state]&`0x38`)<<`2`;
653	if(b==`0` && stateProps[state]>=`0x40`) {
654	/ skip byte sequences with leading zeros because they are not stored in the fromUnicode table /
655	codePoints[`0`]=U_SENTINEL;
656	b=`1`;
657	}
658	limit=((stateProps[state]&`7`)+`1`)<<`5`;
659	while(b<limit) {
660	int32_t entry=row[b];
661	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
662	int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry);
663	if(stateProps[nextState]>=`0`) {
664	/ recurse to a state with non-ignorable actions /
665	if(!enumToU(
666	mbcsTable, stateProps, nextState,
667	offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
668	value\|(uint32_t)b,
669	callback, context,
670	pErrorCode)) {
671	return FALSE;
672	}
673	}
674	codePoints[b&`0x1f`]=U_SENTINEL;
675	} else {
676	UChar32 c;
677	int32_t action;
678
679	/*
680	* An if-else-if chain provides more reliable performance for
681	* the most common cases compared to a switch.
682	*/
683	action=MBCS_ENTRY_FINAL_ACTION(entry);
684	if(action==MBCS_STATE_VALID_DIRECT_16) {
685	/ output BMP code point /
686	c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
687	} else if(action==MBCS_STATE_VALID_16) {
688	int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
689	c=unicodeCodeUnits[finalOffset];
690	if(c<`0xfffe`) {
691	/ output BMP code point /
692	} else {
693	c=U_SENTINEL;
694	}
695	} else if(action==MBCS_STATE_VALID_16_PAIR) {
696	int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
697	c=unicodeCodeUnits[finalOffset++];
698	if(c<`0xd800`) {
699	/ output BMP code point below 0xd800 /
700	} else if(c<=`0xdbff`) {
701	/ output roundtrip or fallback supplementary code point /
702	c=((c&`0x3ff`)<<`10`)+unicodeCodeUnits[finalOffset]+(`0x10000`-`0xdc00`);
703	} else if(c==`0xe000`) {
704	/ output roundtrip BMP code point above 0xd800 or fallback BMP code point /
705	c=unicodeCodeUnits[finalOffset];
706	} else {
707	c=U_SENTINEL;
708	}
709	} else if(action==MBCS_STATE_VALID_DIRECT_20) {
710	/ output supplementary code point /
711	c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+`0x10000`);
712	} else {
713	c=U_SENTINEL;
714	}
715
716	codePoints[b&`0x1f`]=c;
717	anyCodePoints&=c;
718	}
719	if(((++b)&`0x1f`)==`0`) {
720	if(anyCodePoints>=`0`) {
721	if(!callback(context, value\|(uint32_t)(b-`0x20`), codePoints)) {
722	return FALSE;
723	}
724	anyCodePoints=-`1`;
725	}
726	}
727	}
728	return TRUE;
729	}
730
731	/*
732	* Only called if stateProps[state]==-1.
733	* A recursive call may do stateProps[state]\|=0x40 if this state is the target of an
734	* MBCS_STATE_CHANGE_ONLY.
735	*/
736	static int8_t
737	getStateProp(const int32_t (stateTable)[`256`], int8_t stateProps[], int* state) {
738	const int32_t *row;
739	int32_t min, max, entry, nextState;
740
741	row=stateTable[state];
742	stateProps[state]=`0`;
743
744	/ find first non-ignorable state /
745	for(min=`0`;; ++min) {
746	entry=row[min];
747	nextState=MBCS_ENTRY_STATE(entry);
748	if(stateProps[nextState]==-`1`) {
749	getStateProp(stateTable, stateProps, nextState);
750	}
751	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
752	if(stateProps[nextState]>=`0`) {
753	break;
754	}
755	} else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
756	break;
757	}
758	if(min==`0xff`) {
759	stateProps[state]=-`0x40`; / (int8_t)0xc0 /
760	return stateProps[state];
761	}
762	}
763	stateProps[state]\|=(int8_t)((min>>`5`)<<`3`);
764
765	/ find last non-ignorable state /
766	for(max=`0xff`; min<max; --max) {
767	entry=row[max];
768	nextState=MBCS_ENTRY_STATE(entry);
769	if(stateProps[nextState]==-`1`) {
770	getStateProp(stateTable, stateProps, nextState);
771	}
772	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
773	if(stateProps[nextState]>=`0`) {
774	break;
775	}
776	} else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
777	break;
778	}
779	}
780	stateProps[state]\|=(int8_t)(max>>`5`);
781
782	/ recurse further and collect direct-state information /
783	while(min<=max) {
784	entry=row[min];
785	nextState=MBCS_ENTRY_STATE(entry);
786	if(stateProps[nextState]==-`1`) {
787	getStateProp(stateTable, stateProps, nextState);
788	}
789	if(MBCS_ENTRY_IS_FINAL(entry)) {
790	stateProps[nextState]\|=`0x40`;
791	if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) {
792	stateProps[state]\|=`0x40`;
793	}
794	}
795	++min;
796	}
797	return stateProps[state];
798	}
799
800	/*
801	* Internal function enumerating the toUnicode data of an MBCS converter.
802	* Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
803	* table, but could also be used for a future ucnv_getUnicodeSet() option
804	* that includes reverse fallbacks (after updating this function's implementation).
805	* Currently only handles roundtrip mappings.
806	* Does not currently handle extensions.
807	*/
808	static void
809	ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,
810	UConverterEnumToUCallback callback, const* void *context,
811	UErrorCode *pErrorCode) {
812	/*
813	* Properties for each state, to speed up the enumeration.
814	* Ignorable actions are unassigned/illegal/state-change-only:
815	* They do not lead to mappings.
816	*
817	* Bits 7..6:
818	* 1 direct/initial state (stateful converters have multiple)
819	* 0 non-initial state with transitions or with non-ignorable result actions
820	* -1 final state with only ignorable actions
821	*
822	* Bits 5..3:
823	* The lowest byte value with non-ignorable actions is
824	* value<<5 (rounded down).
825	*
826	* Bits 2..0:
827	* The highest byte value with non-ignorable actions is
828	* (value<<5)&0x1f (rounded up).
829	*/
830	int8_t stateProps[MBCS_MAX_STATE_COUNT];
831	int32_t state;
832
833	uprv_memset(stateProps, -`1`, sizeof(stateProps));
834
835	/ recurse from state 0 and set all stateProps /
836	getStateProp(mbcsTable->stateTable, stateProps, `0`);
837
838	for(state=`0`; state<mbcsTable->countStates; ++state) {
839	/if(stateProps[state]==-1) {*
840	printf("unused/unreachable <icu:state> %d\n", state);
841	}/*
842	if(stateProps[state]>=`0x40`) {
843	/ start from each direct state /
844	enumToU(
845	mbcsTable, stateProps, state, `0`, `0`,
846	callback, context,
847	pErrorCode);
848	}
849	}
850	}
851
852	U_CFUNC void
853	ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
854	const USetAdder *sa,
855	UConverterUnicodeSet which,
856	UConverterSetFilter filter,
857	UErrorCode *pErrorCode) {
858	const UConverterMBCSTable *mbcsTable;
859	const uint16_t *table;
860
861	uint32_t st3;
862	uint16_t st1, maxStage1, st2;
863
864	UChar32 c;
865
866	/ enumerate the from-Unicode trie table /
867	mbcsTable=&sharedData->mbcs;
868	table=mbcsTable->fromUnicodeTable;
869	if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
870	maxStage1=`0x440`;
871	} else {
872	maxStage1=`0x40`;
873	}
874
875	c=`0`; / keep track of the current code point while enumerating /
876
877	if(mbcsTable->outputType==MBCS_OUTPUT_1) {
878	const uint16_t stage2, stage3, *results;
879	uint16_t minValue;
880
881	results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
882
883	/*
884	* Set a threshold variable for selecting which mappings to use.
885	* See ucnv_MBCSSingleFromBMPWithOffsets() and
886	* MBCS_SINGLE_RESULT_FROM_U() for details.
887	*/
888	if(which==UCNV_ROUNDTRIP_SET) {
889	/ use only roundtrips /
890	minValue=`0xf00`;
891	} else / UCNV_ROUNDTRIP_AND_FALLBACK_SET / {
892	/ use all roundtrip and fallback results /
893	minValue=`0x800`;
894	}
895
896	for(st1=`0`; st1<maxStage1; ++st1) {
897	st2=table[st1];
898	if(st2>maxStage1) {
899	stage2=table+st2;
900	for(st2=`0`; st2<`64`; ++st2) {
901	if((st3=stage2[st2])!=`0`) {
902	/ read the stage 3 block /
903	stage3=results+st3;
904
905	do {
906	if(*stage3++>=minValue) {
907	sa->add(sa->set, c);
908	}
909	} while((++c&`0xf`)!=`0`);
910	} else {
911	c+=`16`; / empty stage 3 block /
912	}
913	}
914	} else {
915	c+=`1024`; / empty stage 2 block /
916	}
917	}
918	} else {
919	const uint32_t *stage2;
920	const uint8_t stage3, bytes;
921	uint32_t st3Multiplier;
922	uint32_t value;
923	UBool useFallback;
924
925	bytes=mbcsTable->fromUnicodeBytes;
926
927	useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
928
929	switch(mbcsTable->outputType) {
930	case MBCS_OUTPUT_3:
931	case MBCS_OUTPUT_4_EUC:
932	st3Multiplier=`3`;
933	break;
934	case MBCS_OUTPUT_4:
935	st3Multiplier=`4`;
936	break;
937	default:
938	st3Multiplier=`2`;
939	break;
940	}
941
942	for(st1=`0`; st1<maxStage1; ++st1) {
943	st2=table[st1];
944	if(st2>(maxStage1>>`1`)) {
945	stage2=(const uint32_t *)table+st2;
946	for(st2=`0`; st2<`64`; ++st2) {
947	if((st3=stage2[st2])!=`0`) {
948	/ read the stage 3 block /
949	stage3=bytes+st3Multiplier`16`(uint32_t)(uint16_t)st3;
950
951	/ get the roundtrip flags for the stage 3 block /
952	st3>>=`16`;
953
954	/*
955	* Add code points for which the roundtrip flag is set,
956	* or which map to non-zero bytes if we use fallbacks.
957	* See ucnv_MBCSFromUnicodeWithOffsets() for details.
958	*/
959	switch(filter) {
960	case UCNV_SET_FILTER_NONE:
961	do {
962	if(st3&`1`) {
963	sa->add(sa->set, c);
964	stage3+=st3Multiplier;
965	} else if(useFallback) {
966	uint8_t b=`0`;
967	switch(st3Multiplier) {
968	case `4`:
969	b\|=*stage3++;
970	U_FALLTHROUGH;
971	case `3`:
972	b\|=*stage3++;
973	U_FALLTHROUGH;
974	case `2`:
975	b\|=stage3[`0`]\|stage3[`1`];
976	stage3+=`2`;
977	U_FALLTHROUGH;
978	default:
979	break;
980	}
981	if(b!=`0`) {
982	sa->add(sa->set, c);
983	}
984	}
985	st3>>=`1`;
986	} while((++c&`0xf`)!=`0`);
987	break;
988	case UCNV_SET_FILTER_DBCS_ONLY:
989	/ Ignore single-byte results (<0x100). /
990	do {
991	if(((st3&`1`)!=`0` \|\| useFallback) && ((const* uint16_t *)stage3)>=`0x100`) {
992	sa->add(sa->set, c);
993	}
994	st3>>=`1`;
995	stage3+=`2`; / +=st3Multiplier /
996	} while((++c&`0xf`)!=`0`);
997	break;
998	case UCNV_SET_FILTER_2022_CN:
999	/ Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. /
1000	do {
1001	if(((st3&`1`)!=`0` \|\| useFallback) && ((value=*stage3)==`0x81` \|\| value==`0x82`)) {
1002	sa->add(sa->set, c);
1003	}
1004	st3>>=`1`;
1005	stage3+=`3`; / +=st3Multiplier /
1006	} while((++c&`0xf`)!=`0`);
1007	break;
1008	case UCNV_SET_FILTER_SJIS:
1009	/ Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. /
1010	do {
1011	if(((st3&`1`)!=`0` \|\| useFallback) && (value=((const* uint16_t *)stage3))>=`0x8140` && value<=`0xeffc`) {
1012	sa->add(sa->set, c);
1013	}
1014	st3>>=`1`;
1015	stage3+=`2`; / +=st3Multiplier /
1016	} while((++c&`0xf`)!=`0`);
1017	break;
1018	case UCNV_SET_FILTER_GR94DBCS:
1019	/ Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). /
1020	do {
1021	if( ((st3&`1`)!=`0` \|\| useFallback) &&
1022	(uint16_t)((value=((const* uint16_t *)stage3)) - `0xa1a1`)<=(`0xfefe` - `0xa1a1`) &&
1023	(uint8_t)(value-`0xa1`)<=(`0xfe` - `0xa1`)
1024	) {
1025	sa->add(sa->set, c);
1026	}
1027	st3>>=`1`;
1028	stage3+=`2`; / +=st3Multiplier /
1029	} while((++c&`0xf`)!=`0`);
1030	break;
1031	case UCNV_SET_FILTER_HZ:
1032	/ Only add code points that are suitable for HZ DBCS (lead byte A1..FD). /
1033	do {
1034	if( ((st3&`1`)!=`0` \|\| useFallback) &&
1035	(uint16_t)((value=((const* uint16_t *)stage3))-`0xa1a1`)<=(`0xfdfe` - `0xa1a1`) &&
1036	(uint8_t)(value-`0xa1`)<=(`0xfe` - `0xa1`)
1037	) {
1038	sa->add(sa->set, c);
1039	}
1040	st3>>=`1`;
1041	stage3+=`2`; / +=st3Multiplier /
1042	} while((++c&`0xf`)!=`0`);
1043	break;
1044	default:
1045	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1046	return;
1047	}
1048	} else {
1049	c+=`16`; / empty stage 3 block /
1050	}
1051	}
1052	} else {
1053	c+=`1024`; / empty stage 2 block /
1054	}
1055	}
1056	}
1057
1058	ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
1059	}
1060
1061	U_CFUNC void
1062	ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
1063	const USetAdder *sa,
1064	UConverterUnicodeSet which,
1065	UErrorCode *pErrorCode) {
1066	ucnv_MBCSGetFilteredUnicodeSetForUnicode(
1067	sharedData, sa, which,
1068	sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
1069	UCNV_SET_FILTER_DBCS_ONLY :
1070	UCNV_SET_FILTER_NONE,
1071	pErrorCode);
1072	}
1073
1074	static void U_CALLCONV
1075	ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
1076	const USetAdder *sa,
1077	UConverterUnicodeSet which,
1078	UErrorCode *pErrorCode) {
1079	if(cnv->options&_MBCS_OPTION_GB18030) {
1080	sa->addRange(sa->set, `0`, `0xd7ff`);
1081	sa->addRange(sa->set, `0xe000`, `0x10ffff`);
1082	} else {
1083	ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode);
1084	}
1085	}
1086
/* conversion extensions for input not in the main table -------------------- */
1088
1089	/*
1090	* Hardcoded extension handling for GB 18030.
1091	* Definition of LINEAR macros and gb18030Ranges see near the beginning of the file.
1092	*
1093	* In the future, conversion extensions may handle m:n mappings and delta tables,
1094	* see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
1095	*
1096	* If an input character cannot be mapped, then these functions set an error
1097	* code. The framework will then call the callback function.
1098	*/
1099
1100	/*
1101	* @return if(U_FAILURE) return the code point for cnv->fromUChar32
1102	* else return 0 after output has been written to the target
1103	*/
1104	static UChar32
1105	_extFromU(UConverter cnv, const* UConverterSharedData *sharedData,
1106	UChar32 cp,
1107	const UChar *source, const* UChar *sourceLimit,
1108	uint8_t *target, const* uint8_t *targetLimit,
1109	int32_t **offsets, int32_t sourceIndex,
1110	UBool flush,
1111	UErrorCode *pErrorCode) {
1112	const int32_t *cx;
1113
1114	cnv->useSubChar1=FALSE;
1115
1116	if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
1117	ucnv_extInitialMatchFromU(
1118	cnv, cx,
1119	cp, source, sourceLimit,
1120	(char *)target, (char* *)targetLimit,
1121	offsets, sourceIndex,
1122	flush,
1123	pErrorCode)
1124	) {
1125	return `0`; / an extension mapping handled the input /
1126	}
1127
1128	/ GB 18030 /
1129	if((cnv->options&_MBCS_OPTION_GB18030)!=`0`) {
1130	const uint32_t *range;
1131	int32_t i;
1132
1133	range=gb18030Ranges[`0`];
1134	for(i=`0`; i<UPRV_LENGTHOF(gb18030Ranges); range+=`4`, ++i) {
1135	if(range[`0`]<=(uint32_t)cp && (uint32_t)cp<=range[`1`]) {
1136	/ found the Unicode code point, output the four-byte sequence for it /
1137	uint32_t linear;
1138	char bytes[`4`];
1139
1140	/ get the linear value of the first GB 18030 code in this range /
1141	linear=range[`2`]-LINEAR_18030_BASE;
1142
1143	/ add the offset from the beginning of the range /
1144	linear+=((uint32_t)cp-range[`0`]);
1145
1146	/ turn this into a four-byte sequence /
1147	bytes[`3`]=(char)(`0x30`+linear%`10`); linear/=`10`;
1148	bytes[`2`]=(char)(`0x81`+linear%`126`); linear/=`126`;
1149	bytes[`1`]=(char)(`0x30`+linear%`10`); linear/=`10`;
1150	bytes[`0`]=(char)(`0x81`+linear);
1151
1152	/ output this sequence /
1153	ucnv_fromUWriteBytes(cnv,
1154	bytes, `4`, (char *)target, (char* *)targetLimit,
1155	offsets, sourceIndex, pErrorCode);
1156	return `0`;
1157	}
1158	}
1159	}
1160
1161	/ no mapping /
1162	*pErrorCode=U_INVALID_CHAR_FOUND;
1163	return cp;
1164	}
1165
1166	/*
1167	* Input sequence: cnv->toUBytes[0..length[
1168	* @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
1169	* else return 0 after output has been written to the target
1170	*/
1171	static int8_t
1172	_extToU(UConverter cnv, const* UConverterSharedData *sharedData,
1173	int8_t length,
1174	const uint8_t *source, const* uint8_t *sourceLimit,
1175	UChar *target, const* UChar *targetLimit,
1176	int32_t **offsets, int32_t sourceIndex,
1177	UBool flush,
1178	UErrorCode *pErrorCode) {
1179	const int32_t *cx;
1180
1181	if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
1182	ucnv_extInitialMatchToU(
1183	cnv, cx,
1184	length, (const char *)source, (const* char *)sourceLimit,
1185	target, targetLimit,
1186	offsets, sourceIndex,
1187	flush,
1188	pErrorCode)
1189	) {
1190	return `0`; / an extension mapping handled the input /
1191	}
1192
1193	/ GB 18030 /
1194	if(length==`4` && (cnv->options&_MBCS_OPTION_GB18030)!=`0`) {
1195	const uint32_t *range;
1196	uint32_t linear;
1197	int32_t i;
1198
1199	linear=LINEAR_18030(cnv->toUBytes[`0`], cnv->toUBytes[`1`], cnv->toUBytes[`2`], cnv->toUBytes[`3`]);
1200	range=gb18030Ranges[`0`];
1201	for(i=`0`; i<UPRV_LENGTHOF(gb18030Ranges); range+=`4`, ++i) {
1202	if(range[`2`]<=linear && linear<=range[`3`]) {
1203	/ found the sequence, output the Unicode code point for it /
1204	*pErrorCode=U_ZERO_ERROR;
1205
1206	/ add the linear difference between the input and start sequences to the start code point /
1207	linear=range[`0`]+(linear-range[`2`]);
1208
1209	/ output this code point /
1210	ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode);
1211
1212	return `0`;
1213	}
1214	}
1215	}
1216
1217	/ no mapping /
1218	*pErrorCode=U_INVALID_CHAR_FOUND;
1219	return length;
1220	}
1221
/* EBCDIC swap LF<->NL ------------------------------------------------------ */
1223
1224	/*
1225	* This code modifies a standard EBCDIC<->Unicode mapping table for
1226	* OS/390 (z/OS) Unix System Services (Open Edition).
1227	* The difference is in the mapping of Line Feed and New Line control codes:
 * Standard EBCDIC maps
 *
 *   <U000A> \x25 |0
 *   <U0085> \x15 |0
 *
 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
 * mapping
 *
 *   <U000A> \x15 |0
 *   <U0085> \x25 |0
1238	*
1239	* This code modifies a loaded standard EBCDIC<->Unicode mapping table
1240	* by copying it into allocated memory and swapping the LF and NL values.
1241	* It allows to support the same EBCDIC charset in both versions without
1242	* duplicating the entire installed table.
1243	*/
1244
/* standard EBCDIC codes */
#define EBCDIC_LF 0x25
#define EBCDIC_NL 0x15

/*
 * standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables
 * (the byte value combined with 0xf00)
 */
#define EBCDIC_RT_LF 0xf25
#define EBCDIC_RT_NL 0xf15

/* Unicode code points */
#define U_LF 0x0a
#define U_NL 0x85
1256
1257	static UBool
1258	_EBCDICSwapLFNL(UConverterSharedData sharedData, UErrorCode pErrorCode) {
1259	UConverterMBCSTable *mbcsTable;
1260
1261	const uint16_t table, results;
1262	const uint8_t *bytes;
1263
1264	int32_t (*newStateTable)[`256`];
1265	uint16_t *newResults;
1266	uint8_t *p;
1267	char *name;
1268
1269	uint32_t stage2Entry;
1270	uint32_t size, sizeofFromUBytes;
1271
1272	mbcsTable=&sharedData->mbcs;
1273
1274	table=mbcsTable->fromUnicodeTable;
1275	bytes=mbcsTable->fromUnicodeBytes;
1276	results=(const uint16_t *)bytes;
1277
1278	/*
1279	* Check that this is an EBCDIC table with SBCS portion -
1280	* SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
1281	*
1282	* If not, ignore the option. Options are always ignored if they do not apply.
1283	*/
1284	if(!(
1285	(mbcsTable->outputType==MBCS_OUTPUT_1 \|\| mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
1286	mbcsTable->stateTable[`0`][EBCDIC_LF]==MBCS_ENTRY_FINAL(`0`, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
1287	mbcsTable->stateTable[`0`][EBCDIC_NL]==MBCS_ENTRY_FINAL(`0`, MBCS_STATE_VALID_DIRECT_16, U_NL)
1288	)) {
1289	return FALSE;
1290	}
1291
1292	if(mbcsTable->outputType==MBCS_OUTPUT_1) {
1293	if(!(
1294	EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
1295	EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
1296	)) {
1297	return FALSE;
1298	}
1299	} else / MBCS_OUTPUT_2_SISO / {
1300	stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
1301	if(!(
1302	MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=`0` &&
1303	EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
1304	)) {
1305	return FALSE;
1306	}
1307
1308	stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
1309	if(!(
1310	MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=`0` &&
1311	EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
1312	)) {
1313	return FALSE;
1314	}
1315	}
1316
1317	if(mbcsTable->fromUBytesLength>`0`) {
1318	/*
1319	* We _know_ the number of bytes in the fromUnicodeBytes array
1320	* starting with header.version 4.1.
1321	*/
1322	sizeofFromUBytes=mbcsTable->fromUBytesLength;
1323	} else {
1324	/*
1325	* Otherwise:
1326	* There used to be code to enumerate the fromUnicode
1327	* trie and find the highest entry, but it was removed in ICU 3.2
1328	* because it was not tested and caused a low code coverage number.
1329	* See Jitterbug 3674.
1330	* This affects only some .cnv file formats with a header.version
1331	* below 4.1, and only when swaplfnl is requested.
1332	*
1333	* ucnvmbcs.c revision 1.99 is the last one with the
1334	* ucnv_MBCSSizeofFromUBytes() function.
1335	*/
1336	*pErrorCode=U_INVALID_FORMAT_ERROR;
1337	return FALSE;
1338	}
1339
1340	/*
1341	* The table has an appropriate format.
1342	* Allocate and build
1343	* - a modified to-Unicode state table
1344	* - a modified from-Unicode output array
1345	* - a converter name string with the swap option appended
1346	*/
1347	size=
1348	mbcsTable->countStates*`1024`+
1349	sizeofFromUBytes+
1350	UCNV_MAX_CONVERTER_NAME_LENGTH+`20`;
1351	p=(uint8_t *)uprv_malloc(size);
1352	if(p==NULL) {
1353	*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1354	return FALSE;
1355	}
1356
1357	/ copy and modify the to-Unicode state table /
1358	newStateTable=(int32_t (*)[`256`])p;
1359	uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*`1024`);
1360
1361	newStateTable[`0`][EBCDIC_LF]=MBCS_ENTRY_FINAL(`0`, MBCS_STATE_VALID_DIRECT_16, U_NL);
1362	newStateTable[`0`][EBCDIC_NL]=MBCS_ENTRY_FINAL(`0`, MBCS_STATE_VALID_DIRECT_16, U_LF);
1363
1364	/ copy and modify the from-Unicode result table /
1365	newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
1366	uprv_memcpy(newResults, bytes, sizeofFromUBytes);
1367
1368	/ conveniently, the table access macros work on the left side of expressions /
1369	if(mbcsTable->outputType==MBCS_OUTPUT_1) {
1370	MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
1371	MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
1372	} else / MBCS_OUTPUT_2_SISO / {
1373	stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
1374	MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
1375
1376	stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
1377	MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
1378	}
1379
1380	/ set the canonical converter name /
1381	name=(char *)newResults+sizeofFromUBytes;
1382	uprv_strcpy(name, sharedData->staticData->name);
1383	uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);
1384
1385	/ set the pointers /
1386	icu::umtx_lock(NULL);
1387	if(mbcsTable->swapLFNLStateTable==NULL) {
1388	mbcsTable->swapLFNLStateTable=newStateTable;
1389	mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
1390	mbcsTable->swapLFNLName=name;
1391
1392	newStateTable=NULL;
1393	}
1394	icu::umtx_unlock(NULL);
1395
1396	/ release the allocated memory if another thread beat us to it /
1397	if(newStateTable!=NULL) {
1398	uprv_free(newStateTable);
1399	}
1400	return TRUE;
1401	}
1402
/* reconstitute omitted fromUnicode data ------------------------------------ */
1404
1405	/ for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() /
1406	static UBool U_CALLCONV
1407	writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[`32`]) {
1408	UConverterMBCSTable mbcsTable=(UConverterMBCSTable )context;
1409	const uint16_t *table;
1410	uint32_t *stage2;
1411	uint8_t bytes, p;
1412	UChar32 c;
1413	int32_t i, st3;
1414
1415	table=mbcsTable->fromUnicodeTable;
1416	bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;
1417
1418	/ for EUC outputTypes, modify the value like genmbcs.c's transformEUC() /
1419	switch(mbcsTable->outputType) {
1420	case MBCS_OUTPUT_3_EUC:
1421	if(value<=`0xffff`) {
1422	/ short sequences are stored directly /
1423	/ code set 0 or 1 /
1424	} else if(value<=`0x8effff`) {
1425	/ code set 2 /
1426	value&=`0x7fff`;
1427	} else / first byte is 0x8f / {
1428	/ code set 3 /
1429	value&=`0xff7f`;
1430	}
1431	break;
1432	case MBCS_OUTPUT_4_EUC:
1433	if(value<=`0xffffff`) {
1434	/ short sequences are stored directly /
1435	/ code set 0 or 1 /
1436	} else if(value<=`0x8effffff`) {
1437	/ code set 2 /
1438	value&=`0x7fffff`;
1439	} else / first byte is 0x8f / {
1440	/ code set 3 /
1441	value&=`0xff7fff`;
1442	}
1443	break;
1444	default:
1445	break;
1446	}
1447
1448	for(i=`0`; i<=`0x1f`; ++value, ++i) {
1449	c=codePoints[i];
1450	if(c<`0`) {
1451	continue;
1452	}
1453
1454	/ locate the stage 2 & 3 data /
1455	stage2=((uint32_t *)table)+table[c>>`10`]+((c>>`4`)&`0x3f`);
1456	p=bytes;
1457	st3=(int32_t)(uint16_t)stage2`16`+(c&`0xf`);
1458
1459	/ write the codepage bytes into stage 3 /
1460	switch(mbcsTable->outputType) {
1461	case MBCS_OUTPUT_3:
1462	case MBCS_OUTPUT_4_EUC:
1463	p+=st3*`3`;
1464	p[`0`]=(uint8_t)(value>>`16`);
1465	p[`1`]=(uint8_t)(value>>`8`);
1466	p[`2`]=(uint8_t)value;
1467	break;
1468	case MBCS_OUTPUT_4:
1469	((uint32_t *)p)[st3]=value;
1470	break;
1471	default:
1472	/ 2 bytes per character /
1473	((uint16_t *)p)[st3]=(uint16_t)value;
1474	break;
1475	}
1476
1477	/ set the roundtrip flag /
1478	*stage2\|=(`1UL`<<(`16`+(c&`0xf`)));
1479	}
1480	return TRUE;
1481	}
1482
1483	static void
1484	reconstituteData(UConverterMBCSTable *mbcsTable,
1485	uint32_t stage1Length, uint32_t stage2Length,
1486	uint32_t fullStage2Length, / lengths are numbers of units, not bytes /
1487	UErrorCode *pErrorCode) {
1488	uint16_t *stage1;
1489	uint32_t *stage2;
1490	uint32_t dataLength=stage1Length`2`+fullStage2Length`4`+mbcsTable->fromUBytesLength;
1491	mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
1492	if(mbcsTable->reconstitutedData==NULL) {
1493	*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1494	return;
1495	}
1496	uprv_memset(mbcsTable->reconstitutedData, `0`, dataLength);
1497
1498	/ copy existing data and reroute the pointers /
1499	stage1=(uint16_t *)mbcsTable->reconstitutedData;
1500	uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*`2`);
1501
1502	stage2=(uint32_t *)(stage1+stage1Length);
1503	uprv_memcpy(stage2+(fullStage2Length-stage2Length),
1504	mbcsTable->fromUnicodeTable+stage1Length,
1505	stage2Length*`4`);
1506
1507	mbcsTable->fromUnicodeTable=stage1;
1508	mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length);
1509
1510	/ indexes into stage 2 count from the bottom of the fromUnicodeTable /
1511	stage2=(uint32_t *)stage1;
1512
1513	/ reconstitute the initial part of stage 2 from the mbcsIndex /
1514	{
1515	int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+`1`)>>`6`;
1516	int32_t stageUTF8Index=`0`;
1517	int32_t st1, st2, st3, i;
1518
1519	for(st1=`0`; stageUTF8Index<stageUTF8Length; ++st1) {
1520	st2=stage1[st1];
1521	if(st2!=(int32_t)stage1Length/`2`) {
1522	/ each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex /
1523	for(i=`0`; i<`16`; ++i) {
1524	st3=mbcsTable->mbcsIndex[stageUTF8Index++];
1525	if(st3!=`0`) {
1526	/ an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry /
1527	st3>>=`4`;
1528	/*
1529	* 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
1530	* allocated together as a single 64-block for access from the mbcsIndex
1531	*/
1532	stage2[st2++]=st3++;
1533	stage2[st2++]=st3++;
1534	stage2[st2++]=st3++;
1535	stage2[st2++]=st3;
1536	} else {
1537	/ no stage 3 block, skip /
1538	st2+=`4`;
1539	}
1540	}
1541	} else {
1542	/ no stage 2 block, skip /
1543	stageUTF8Index+=`16`;
1544	}
1545	}
1546	}
1547
1548	/ reconstitute fromUnicodeBytes with roundtrips from toUnicode data /
1549	ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
1550	}
1551
/* MBCS setup functions ----------------------------------------------------- */
1553
1554	static void U_CALLCONV
1555	ucnv_MBCSLoad(UConverterSharedData *sharedData,
1556	UConverterLoadArgs *pArgs,
1557	const uint8_t *raw,
1558	UErrorCode *pErrorCode) {
1559	UDataInfo info;
1560	UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
1561	_MBCSHeader header=(_MBCSHeader )raw;
1562	uint32_t offset;
1563	uint32_t headerLength;
1564	UBool noFromU=FALSE;
1565
1566	if(header->version[`0`]==`4`) {
1567	headerLength=MBCS_HEADER_V4_LENGTH;
1568	} else if(header->version[`0`]==`5` && header->version[`1`]>=`3` &&
1569	(header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==`0`) {
1570	headerLength=header->options&MBCS_OPT_LENGTH_MASK;
1571	noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=`0`);
1572	} else {
1573	*pErrorCode=U_INVALID_TABLE_FORMAT;
1574	return;
1575	}
1576
1577	mbcsTable->outputType=(uint8_t)header->flags;
1578	if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
1579	*pErrorCode=U_INVALID_TABLE_FORMAT;
1580	return;
1581	}
1582
1583	/ extension data, header version 4.2 and higher /
1584	offset=header->flags>>`8`;
1585	if(offset!=`0`) {
1586	mbcsTable->extIndexes=(const int32_t *)(raw+offset);
1587	}
1588
1589	if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
1590	UConverterLoadArgs args=UCNV_LOAD_ARGS_INITIALIZER;
1591	UConverterSharedData *baseSharedData;
1592	const int32_t *extIndexes;
1593	const char *baseName;
1594
1595	/ extension-only file, load the base table and set values appropriately /
1596	if((extIndexes=mbcsTable->extIndexes)==NULL) {
1597	/ extension-only file without extension /
1598	*pErrorCode=U_INVALID_TABLE_FORMAT;
1599	return;
1600	}
1601
1602	if(pArgs->nestedLoads!=`1`) {
1603	/ an extension table must not be loaded as a base table /
1604	*pErrorCode=U_INVALID_TABLE_FILE;
1605	return;
1606	}
1607
1608	/ load the base table /
1609	baseName=(const char )header+headerLength`4`;
1610	if(`0`==uprv_strcmp(baseName, sharedData->staticData->name)) {
1611	/ forbid loading this same extension-only file /
1612	*pErrorCode=U_INVALID_TABLE_FORMAT;
1613	return;
1614	}
1615
1616	/ TODO parse package name out of the prefix of the base name in the extension .cnv file? /
1617	args.size=sizeof(UConverterLoadArgs);
1618	args.nestedLoads=`2`;
1619	args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable;
1620	args.reserved=pArgs->reserved;
1621	args.options=pArgs->options;
1622	args.pkg=pArgs->pkg;
1623	args.name=baseName;
1624	baseSharedData=ucnv_load(&args, pErrorCode);
1625	if(U_FAILURE(*pErrorCode)) {
1626	return;
1627	}
1628	if( baseSharedData->staticData->conversionType!=UCNV_MBCS \|\|
1629	baseSharedData->mbcs.baseSharedData!=NULL
1630	) {
1631	ucnv_unload(baseSharedData);
1632	*pErrorCode=U_INVALID_TABLE_FORMAT;
1633	return;
1634	}
1635	if(pArgs->onlyTestIsLoadable) {
1636	/*
1637	* Exit as soon as we know that we can load the converter
1638	* and the format is valid and supported.
1639	* The worst that can happen in the following code is a memory
1640	* allocation error.
1641	*/
1642	ucnv_unload(baseSharedData);
1643	return;
1644	}
1645
1646	/ copy the base table data /
1647	uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));
1648
1649	/ overwrite values with relevant ones for the extension converter /
1650	mbcsTable->baseSharedData=baseSharedData;
1651	mbcsTable->extIndexes=extIndexes;
1652
1653	/*
1654	* It would be possible to share the swapLFNL data with a base converter,
1655	* but the generated name would have to be different, and the memory
1656	* would have to be free'd only once.
1657	* It is easier to just create the data for the extension converter
1658	* separately when it is requested.
1659	*/
1660	mbcsTable->swapLFNLStateTable=NULL;
1661	mbcsTable->swapLFNLFromUnicodeBytes=NULL;
1662	mbcsTable->swapLFNLName=NULL;
1663
1664	/*
1665	* The reconstitutedData must be deleted only when the base converter
1666	* is unloaded.
1667	*/
1668	mbcsTable->reconstitutedData=NULL;
1669
1670	/*
1671	* Set a special, runtime-only outputType if the extension converter
1672	* is a DBCS version of a base converter that also maps single bytes.
1673	*/
1674	if( sharedData->staticData->conversionType==UCNV_DBCS \|\|
1675	(sharedData->staticData->conversionType==UCNV_MBCS &&
1676	sharedData->staticData->minBytesPerChar>=`2`)
1677	) {
1678	if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
1679	/ the base converter is SI/SO-stateful /
1680	int32_t entry;
1681
1682	/ get the dbcs state from the state table entry for SO=0x0e /
1683	entry=mbcsTable->stateTable[`0`][`0xe`];
1684	if( MBCS_ENTRY_IS_FINAL(entry) &&
1685	MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
1686	MBCS_ENTRY_FINAL_STATE(entry)!=`0`
1687	) {
1688	mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
1689
1690	mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1691	}
1692	} else if(
1693	baseSharedData->staticData->conversionType==UCNV_MBCS &&
1694	baseSharedData->staticData->minBytesPerChar==`1` &&
1695	baseSharedData->staticData->maxBytesPerChar==`2` &&
1696	mbcsTable->countStates<=`127`
1697	) {
1698	/ non-stateful base converter, need to modify the state table /
1699	int32_t (*newStateTable)[`256`];
1700	int32_t *state;
1701	int32_t i, count;
1702
1703	/ allocate a new state table and copy the base state table contents /
1704	count=mbcsTable->countStates;
1705	newStateTable=(int32_t ()[`256`])uprv_malloc((count+`1`)`1024`);
1706	if(newStateTable==NULL) {
1707	ucnv_unload(baseSharedData);
1708	*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1709	return;
1710	}
1711
1712	uprv_memcpy(newStateTable, mbcsTable->stateTable, count*`1024`);
1713
1714	/ change all final single-byte entries to go to a new all-illegal state /
1715	state=newStateTable[`0`];
1716	for(i=`0`; i<`256`; ++i) {
1717	if(MBCS_ENTRY_IS_FINAL(state[i])) {
1718	state[i]=MBCS_ENTRY_TRANSITION(count, `0`);
1719	}
1720	}
1721
1722	/ build the new all-illegal state /
1723	state=newStateTable[count];
1724	for(i=`0`; i<`256`; ++i) {
1725	state[i]=MBCS_ENTRY_FINAL(`0`, MBCS_STATE_ILLEGAL, `0`);
1726	}
1727	mbcsTable->stateTable=(const int32_t (*)[`256`])newStateTable;
1728	mbcsTable->countStates=(uint8_t)(count+`1`);
1729	mbcsTable->stateTableOwned=TRUE;
1730
1731	mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1732	}
1733	}
1734
1735	/*
1736	* unlike below for files with base tables, do not get the unicodeMask
1737	* from the sharedData; instead, use the base table's unicodeMask,
1738	* which we copied in the memcpy above;
1739	* this is necessary because the static data unicodeMask, especially
1740	* the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
1741	*/
1742	} else {
1743	/ conversion file with a base table; an additional extension table is optional /
1744	/ make sure that the output type is known /
1745	switch(mbcsTable->outputType) {
1746	case MBCS_OUTPUT_1:
1747	case MBCS_OUTPUT_2:
1748	case MBCS_OUTPUT_3:
1749	case MBCS_OUTPUT_4:
1750	case MBCS_OUTPUT_3_EUC:
1751	case MBCS_OUTPUT_4_EUC:
1752	case MBCS_OUTPUT_2_SISO:
1753	/ OK /
1754	break;
1755	default:
1756	*pErrorCode=U_INVALID_TABLE_FORMAT;
1757	return;
1758	}
1759	if(pArgs->onlyTestIsLoadable) {
1760	/*
1761	* Exit as soon as we know that we can load the converter
1762	* and the format is valid and supported.
1763	* The worst that can happen in the following code is a memory
1764	* allocation error.
1765	*/
1766	return;
1767	}
1768
1769	mbcsTable->countStates=(uint8_t)header->countStates;
1770	mbcsTable->countToUFallbacks=header->countToUFallbacks;
1771	mbcsTable->stateTable=(const int32_t ()[`256`])(raw+headerLength`4`);
1772	mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
1773	mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
1774
1775	mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
1776	mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
1777	mbcsTable->fromUBytesLength=header->fromUBytesLength;
1778
1779	/*
1780	* converter versions 6.1 and up contain a unicodeMask that is
1781	* used here to select the most efficient function implementations
1782	*/
1783	info.size=sizeof(UDataInfo);
1784	udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
1785	if(info.formatVersion[`0`]>`6` \|\| (info.formatVersion[`0`]==`6` && info.formatVersion[`1`]>=`1`)) {
1786	/ mask off possible future extensions to be safe /
1787	mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&`3`);
1788	} else {
1789	/ for older versions, assume worst case: contains anything possible (prevent over-optimizations) /
1790	mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY\|UCNV_HAS_SURROGATES;
1791	}
1792
1793	/*
1794	* _MBCSHeader.version 4.3 adds utf8Friendly data structures.
1795	* Check for the header version, SBCS vs. MBCS, and for whether the
1796	* data structures are optimized for code points as high as what the
1797	* runtime code is designed for.
1798	* The implementation does not handle mapping tables with entries for
1799	* unpaired surrogates.
1800	*/
1801	if( header->version[`1`]>=`3` &&
1802	(mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==`0` &&
1803	(mbcsTable->countStates==`1` ?
1804	(header->version[`2`]>=(SBCS_FAST_MAX>>`8`)) :
1805	(header->version[`2`]>=(MBCS_FAST_MAX>>`8`))
1806	)
1807	) {
1808	mbcsTable->utf8Friendly=TRUE;
1809
1810	if(mbcsTable->countStates==`1`) {
1811	/*
1812	* SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
1813	* Build a table with indexes to each block, to be used instead of
1814	* the regular stage 1/2 table.
1815	*/
1816	int32_t i;
1817	for(i=`0`; i<(SBCS_FAST_LIMIT>>`6`); ++i) {
1818	mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>`4`]+((i<<`2`)&`0x3c`)];
1819	}
1820	/ set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) /
1821	mbcsTable->maxFastUChar=SBCS_FAST_MAX;
1822	} else {
1823	/*
1824	* MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
1825	* The .cnv file is prebuilt with an additional stage table with indexes
1826	* to each block.
1827	*/
1828	mbcsTable->mbcsIndex=(const uint16_t *)
1829	(mbcsTable->fromUnicodeBytes+
1830	(noFromU ? `0` : mbcsTable->fromUBytesLength));
1831	mbcsTable->maxFastUChar=(((UChar)header->version[`2`])<<`8`)\|`0xff`;
1832	}
1833	}
1834
1835	/ calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes /
1836	{
1837	uint32_t asciiRoundtrips=`0xffffffff`;
1838	int32_t i;
1839
1840	for(i=`0`; i<`0x80`; ++i) {
1841	if(mbcsTable->stateTable[`0`][i]!=MBCS_ENTRY_FINAL(`0`, MBCS_STATE_VALID_DIRECT_16, i)) {
1842	asciiRoundtrips&=~((uint32_t)`1`<<(i>>`2`));
1843	}
1844	}
1845	mbcsTable->asciiRoundtrips=asciiRoundtrips;
1846	}
1847
1848	if(noFromU) {
1849	uint32_t stage1Length=
1850	mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
1851	`0x440` : `0x40`;
1852	uint32_t stage2Length=
1853	(header->offsetFromUBytes-header->offsetFromUTable)/`4`-
1854	stage1Length/`2`;
1855	reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
1856	}
1857	}
1858
1859	/ Set the impl pointer here so that it is set for both extension-only and base tables. /
1860	if(mbcsTable->utf8Friendly) {
1861	if(mbcsTable->countStates==`1`) {
1862	sharedData->impl=&_SBCSUTF8Impl;
1863	} else {
1864	if(mbcsTable->outputType==MBCS_OUTPUT_2) {
1865	sharedData->impl=&_DBCSUTF8Impl;
1866	}
1867	}
1868	}
1869
1870	if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY \|\| mbcsTable->outputType==MBCS_OUTPUT_2_SISO) {
1871	/*
1872	* MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
1873	* MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
1874	*/
1875	mbcsTable->asciiRoundtrips=`0`;
1876	}
1877	}
1878
1879	static void U_CALLCONV
1880	ucnv_MBCSUnload(UConverterSharedData *sharedData) {
1881	UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
1882
1883	if(mbcsTable->swapLFNLStateTable!=NULL) {
1884	uprv_free(mbcsTable->swapLFNLStateTable);
1885	}
1886	if(mbcsTable->stateTableOwned) {
1887	uprv_free((void *)mbcsTable->stateTable);
1888	}
1889	if(mbcsTable->baseSharedData!=NULL) {
1890	ucnv_unload(mbcsTable->baseSharedData);
1891	}
1892	if(mbcsTable->reconstitutedData!=NULL) {
1893	uprv_free(mbcsTable->reconstitutedData);
1894	}
1895	}
1896
1897	static void U_CALLCONV
1898	ucnv_MBCSOpen(UConverter *cnv,
1899	UConverterLoadArgs *pArgs,
1900	UErrorCode *pErrorCode) {
1901	UConverterMBCSTable *mbcsTable;
1902	const int32_t *extIndexes;
1903	uint8_t outputType;
1904	int8_t maxBytesPerUChar;
1905
1906	if(pArgs->onlyTestIsLoadable) {
1907	return;
1908	}
1909
1910	mbcsTable=&cnv->sharedData->mbcs;
1911	outputType=mbcsTable->outputType;
1912
1913	if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
1914	/ the swaplfnl option does not apply, remove it /
1915	cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
1916	}
1917
1918	if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=`0`) {
1919	/ do this because double-checked locking is broken /
1920	UBool isCached;
1921
1922	icu::umtx_lock(NULL);
1923	isCached=mbcsTable->swapLFNLStateTable!=NULL;
1924	icu::umtx_unlock(NULL);
1925
1926	if(!isCached) {
1927	if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
1928	if(U_FAILURE(*pErrorCode)) {
1929	return; / something went wrong /
1930	}
1931
1932	/ the option does not apply, remove it /
1933	cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
1934	}
1935	}
1936	}
1937
1938	if(uprv_strstr(pArgs->name, "18030")!=NULL) {
1939	if(uprv_strstr(pArgs->name, "gb18030")!=NULL \|\| uprv_strstr(pArgs->name, "GB18030")!=NULL) {
1940	/ set a flag for GB 18030 mode, which changes the callback behavior /
1941	cnv->options\|=_MBCS_OPTION_GB18030;
1942	}
1943	} else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) \|\| (uprv_strstr(pArgs->name, "keis")!=NULL)) {
1944	/ set a flag for KEIS converter, which changes the SI/SO character sequence /
1945	cnv->options\|=_MBCS_OPTION_KEIS;
1946	} else if((uprv_strstr(pArgs->name, "JEF")!=NULL) \|\| (uprv_strstr(pArgs->name, "jef")!=NULL)) {
1947	/ set a flag for JEF converter, which changes the SI/SO character sequence /
1948	cnv->options\|=_MBCS_OPTION_JEF;
1949	} else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) \|\| (uprv_strstr(pArgs->name, "jips")!=NULL)) {
1950	/ set a flag for JIPS converter, which changes the SI/SO character sequence /
1951	cnv->options\|=_MBCS_OPTION_JIPS;
1952	}
1953
1954	/ fix maxBytesPerUChar depending on outputType and options etc. /
1955	if(outputType==MBCS_OUTPUT_2_SISO) {
1956	cnv->maxBytesPerUChar=`3`; / SO+DBCS /
1957	}
1958
1959	extIndexes=mbcsTable->extIndexes;
1960	if(extIndexes!=NULL) {
1961	maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);
1962	if(outputType==MBCS_OUTPUT_2_SISO) {
1963	++maxBytesPerUChar; / SO + multiple DBCS /
1964	}
1965
1966	if(maxBytesPerUChar>cnv->maxBytesPerUChar) {
1967	cnv->maxBytesPerUChar=maxBytesPerUChar;
1968	}
1969	}
1970
1971	#if 0
1972	/*
1973	* documentation of UConverter fields used for status
1974	* all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
1975	*/
1976
1977	/ toUnicode /
1978	cnv->toUnicodeStatus=`0`; / offset /
1979	cnv->mode=`0`; / state /
1980	cnv->toULength=`0`; / byteIndex /
1981
1982	/ fromUnicode /
1983	cnv->fromUChar32=`0`;
1984	cnv->fromUnicodeStatus=`1`; / prevLength /
1985	#endif
1986	}
1987
1988	U_CDECL_BEGIN
1989
1990	static const char* U_CALLCONV
1991	ucnv_MBCSGetName(const UConverter *cnv) {
1992	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=`0` && cnv->sharedData->mbcs.swapLFNLName!=NULL) {
1993	return cnv->sharedData->mbcs.swapLFNLName;
1994	} else {
1995	return cnv->sharedData->staticData->name;
1996	}
1997	}
1998	U_CDECL_END
1999
2000
/* MBCS-to-Unicode conversion functions ------------------------------------- */
2002
2003	static UChar32 U_CALLCONV
2004	ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
2005	const _MBCSToUFallback *toUFallbacks;
2006	uint32_t i, start, limit;
2007
2008	limit=mbcsTable->countToUFallbacks;
2009	if(limit>`0`) {
2010	/ do a binary search for the fallback mapping /
2011	toUFallbacks=mbcsTable->toUFallbacks;
2012	start=`0`;
2013	while(start<limit-`1`) {
2014	i=(start+limit)/`2`;
2015	if(offset<toUFallbacks[i].offset) {
2016	limit=i;
2017	} else {
2018	start=i;
2019	}
2020	}
2021
2022	/ did we really find it? /
2023	if(offset==toUFallbacks[start].offset) {
2024	return toUFallbacks[start].codePoint;
2025	}
2026	}
2027
2028	return `0xfffe`;
2029	}
2030
2031	/ This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. /
2032	static void
2033	ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
2034	UErrorCode *pErrorCode) {
2035	UConverter *cnv;
2036	const uint8_t source, sourceLimit;
2037	UChar *target;
2038	const UChar *targetLimit;
2039	int32_t *offsets;
2040
2041	const int32_t (*stateTable)[`256`];
2042
2043	int32_t sourceIndex;
2044
2045	int32_t entry;
2046	UChar c;
2047	uint8_t action;
2048
2049	/ set up the local pointers /
2050	cnv=pArgs->converter;
2051	source=(const uint8_t *)pArgs->source;
2052	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2053	target=pArgs->target;
2054	targetLimit=pArgs->targetLimit;
2055	offsets=pArgs->offsets;
2056
2057	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=`0`) {
2058	stateTable=(const int32_t (*)[`256`])cnv->sharedData->mbcs.swapLFNLStateTable;
2059	} else {
2060	stateTable=cnv->sharedData->mbcs.stateTable;
2061	}
2062
2063	/ sourceIndex=-1 if the current character began in the previous buffer /
2064	sourceIndex=`0`;
2065
2066	/ conversion loop /
2067	while(source<sourceLimit) {
2068	/*
2069	* This following test is to see if available input would overflow the output.
2070	* It does not catch output of more than one code unit that
2071	* overflows as a result of a surrogate pair or callback output
2072	* from the last source byte.
2073	* Therefore, those situations also test for overflows and will
2074	* then break the loop, too.
2075	*/
2076	if(target>=targetLimit) {
2077	/ target is full /
2078	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2079	break;
2080	}
2081
2082	entry=stateTable[`0`][*source++];
2083	/ MBCS_ENTRY_IS_FINAL(entry) /
2084
2085	/ test the most common case first /
2086	if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2087	/ output BMP code point /
2088	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2089	if(offsets!=NULL) {
2090	*offsets++=sourceIndex;
2091	}
2092
2093	/ normal end of action codes: prepare for a new character /
2094	++sourceIndex;
2095	continue;
2096	}
2097
2098	/*
2099	* An if-else-if chain provides more reliable performance for
2100	* the most common cases compared to a switch.
2101	*/
2102	action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2103	if(action==MBCS_STATE_VALID_DIRECT_20 \|\|
2104	(action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2105	) {
2106	entry=MBCS_ENTRY_FINAL_VALUE(entry);
2107	/ output surrogate pair /
2108	*target++=(UChar)(`0xd800`\|(UChar)(entry>>`10`));
2109	if(offsets!=NULL) {
2110	*offsets++=sourceIndex;
2111	}
2112	c=(UChar)(`0xdc00`\|(UChar)(entry&`0x3ff`));
2113	if(target<targetLimit) {
2114	*target++=c;
2115	if(offsets!=NULL) {
2116	*offsets++=sourceIndex;
2117	}
2118	} else {
2119	/ target overflow /
2120	cnv->UCharErrorBuffer[`0`]=c;
2121	cnv->UCharErrorBufferLength=`1`;
2122	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2123	break;
2124	}
2125
2126	++sourceIndex;
2127	continue;
2128	} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2129	if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2130	/ output BMP code point /
2131	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2132	if(offsets!=NULL) {
2133	*offsets++=sourceIndex;
2134	}
2135
2136	++sourceIndex;
2137	continue;
2138	}
2139	} else if(action==MBCS_STATE_UNASSIGNED) {
2140	/ just fall through /
2141	} else if(action==MBCS_STATE_ILLEGAL) {
2142	/ callback(illegal) /
2143	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
2144	} else {
2145	/ reserved, must never occur /
2146	++sourceIndex;
2147	continue;
2148	}
2149
2150	if(U_FAILURE(*pErrorCode)) {
2151	/ callback(illegal) /
2152	break;
2153	} else / unassigned sequences indicated with byteIndex>0 / {
2154	/ try an extension mapping /
2155	pArgs->source=(const char *)source;
2156	cnv->toUBytes[`0`]=*(source-`1`);
2157	cnv->toULength=_extToU(cnv, cnv->sharedData,
2158	`1`, &source, sourceLimit,
2159	&target, targetLimit,
2160	&offsets, sourceIndex,
2161	pArgs->flush,
2162	pErrorCode);
2163	sourceIndex+=`1`+(int32_t)(source-(const uint8_t *)pArgs->source);
2164
2165	if(U_FAILURE(*pErrorCode)) {
2166	/ not mappable or buffer overflow /
2167	break;
2168	}
2169	}
2170	}
2171
2172	/ write back the updated pointers /
2173	pArgs->source=(const char *)source;
2174	pArgs->target=target;
2175	pArgs->offsets=offsets;
2176	}
2177
2178	/*
2179	* This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
2180	* that only map to and from the BMP.
2181	* In addition to single-byte optimizations, the offset calculations
2182	* become much easier.
2183	*/
2184	static void
2185	ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
2186	UErrorCode *pErrorCode) {
2187	UConverter *cnv;
2188	const uint8_t source, sourceLimit, *lastSource;
2189	UChar *target;
2190	int32_t targetCapacity, length;
2191	int32_t *offsets;
2192
2193	const int32_t (*stateTable)[`256`];
2194
2195	int32_t sourceIndex;
2196
2197	int32_t entry;
2198	uint8_t action;
2199
2200	/ set up the local pointers /
2201	cnv=pArgs->converter;
2202	source=(const uint8_t *)pArgs->source;
2203	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2204	target=pArgs->target;
2205	targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
2206	offsets=pArgs->offsets;
2207
2208	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=`0`) {
2209	stateTable=(const int32_t (*)[`256`])cnv->sharedData->mbcs.swapLFNLStateTable;
2210	} else {
2211	stateTable=cnv->sharedData->mbcs.stateTable;
2212	}
2213
2214	/ sourceIndex=-1 if the current character began in the previous buffer /
2215	sourceIndex=`0`;
2216	lastSource=source;
2217
2218	/*
2219	* since the conversion here is 1:1 UChar:uint8_t, we need only one counter
2220	* for the minimum of the sourceLength and targetCapacity
2221	*/
2222	length=(int32_t)(sourceLimit-source);
2223	if(length<targetCapacity) {
2224	targetCapacity=length;
2225	}
2226
2227	#if MBCS_UNROLL_SINGLE_TO_BMP
2228	/ unrolling makes it faster on Pentium III/Windows 2000 /
2229	/ unroll the loop with the most common case /
2230	unrolled:
2231	if(targetCapacity>=`16`) {
2232	int32_t count, loops, oredEntries;
2233
2234	loops=count=targetCapacity>>`4`;
2235	do {
2236	oredEntries=entry=stateTable[`0`][*source++];
2237	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2238	oredEntries\|=entry=stateTable[`0`][*source++];
2239	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2240	oredEntries\|=entry=stateTable[`0`][*source++];
2241	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2242	oredEntries\|=entry=stateTable[`0`][*source++];
2243	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2244	oredEntries\|=entry=stateTable[`0`][*source++];
2245	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2246	oredEntries\|=entry=stateTable[`0`][*source++];
2247	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2248	oredEntries\|=entry=stateTable[`0`][*source++];
2249	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2250	oredEntries\|=entry=stateTable[`0`][*source++];
2251	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2252	oredEntries\|=entry=stateTable[`0`][*source++];
2253	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2254	oredEntries\|=entry=stateTable[`0`][*source++];
2255	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2256	oredEntries\|=entry=stateTable[`0`][*source++];
2257	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2258	oredEntries\|=entry=stateTable[`0`][*source++];
2259	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2260	oredEntries\|=entry=stateTable[`0`][*source++];
2261	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2262	oredEntries\|=entry=stateTable[`0`][*source++];
2263	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2264	oredEntries\|=entry=stateTable[`0`][*source++];
2265	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2266	oredEntries\|=entry=stateTable[`0`][*source++];
2267	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2268
2269	/ were all 16 entries really valid? /
2270	if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
2271	/ no, return to the first of these 16 /
2272	source-=`16`;
2273	target-=`16`;
2274	break;
2275	}
2276	} while(--count>`0`);
2277	count=loops-count;
2278	targetCapacity-=`16`*count;
2279
2280	if(offsets!=NULL) {
2281	lastSource+=`16`*count;
2282	while(count>`0`) {
2283	*offsets++=sourceIndex++;
2284	*offsets++=sourceIndex++;
2285	*offsets++=sourceIndex++;
2286	*offsets++=sourceIndex++;
2287	*offsets++=sourceIndex++;
2288	*offsets++=sourceIndex++;
2289	*offsets++=sourceIndex++;
2290	*offsets++=sourceIndex++;
2291	*offsets++=sourceIndex++;
2292	*offsets++=sourceIndex++;
2293	*offsets++=sourceIndex++;
2294	*offsets++=sourceIndex++;
2295	*offsets++=sourceIndex++;
2296	*offsets++=sourceIndex++;
2297	*offsets++=sourceIndex++;
2298	*offsets++=sourceIndex++;
2299	--count;
2300	}
2301	}
2302	}
2303	#endif
2304
2305	/ conversion loop /
2306	while(targetCapacity > `0` && source < sourceLimit) {
2307	entry=stateTable[`0`][*source++];
2308	/ MBCS_ENTRY_IS_FINAL(entry) /
2309
2310	/ test the most common case first /
2311	if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2312	/ output BMP code point /
2313	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2314	--targetCapacity;
2315	continue;
2316	}
2317
2318	/*
2319	* An if-else-if chain provides more reliable performance for
2320	* the most common cases compared to a switch.
2321	*/
2322	action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2323	if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2324	if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2325	/ output BMP code point /
2326	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2327	--targetCapacity;
2328	continue;
2329	}
2330	} else if(action==MBCS_STATE_UNASSIGNED) {
2331	/ just fall through /
2332	} else if(action==MBCS_STATE_ILLEGAL) {
2333	/ callback(illegal) /
2334	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
2335	} else {
2336	/ reserved, must never occur /
2337	continue;
2338	}
2339
2340	/ set offsets since the start or the last extension /
2341	if(offsets!=NULL) {
2342	int32_t count=(int32_t)(source-lastSource);
2343
2344	/ predecrement: do not set the offset for the callback-causing character /
2345	while(--count>`0`) {
2346	*offsets++=sourceIndex++;
2347	}
2348	/ offset and sourceIndex are now set for the current character /
2349	}
2350
2351	if(U_FAILURE(*pErrorCode)) {
2352	/ callback(illegal) /
2353	break;
2354	} else / unassigned sequences indicated with byteIndex>0 / {
2355	/ try an extension mapping /
2356	lastSource=source;
2357	cnv->toUBytes[`0`]=*(source-`1`);
2358	cnv->toULength=_extToU(cnv, cnv->sharedData,
2359	`1`, &source, sourceLimit,
2360	&target, pArgs->targetLimit,
2361	&offsets, sourceIndex,
2362	pArgs->flush,
2363	pErrorCode);
2364	sourceIndex+=`1`+(int32_t)(source-lastSource);
2365
2366	if(U_FAILURE(*pErrorCode)) {
2367	/ not mappable or buffer overflow /
2368	break;
2369	}
2370
2371	/ recalculate the targetCapacity after an extension mapping /
2372	targetCapacity=(int32_t)(pArgs->targetLimit-target);
2373	length=(int32_t)(sourceLimit-source);
2374	if(length<targetCapacity) {
2375	targetCapacity=length;
2376	}
2377	}
2378
2379	#if MBCS_UNROLL_SINGLE_TO_BMP
2380	/ unrolling makes it faster on Pentium III/Windows 2000 /
2381	goto unrolled;
2382	#endif
2383	}
2384
2385	if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
2386	/ target is full /
2387	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2388	}
2389
2390	/ set offsets since the start or the last callback /
2391	if(offsets!=NULL) {
2392	size_t count=source-lastSource;
2393	while(count>`0`) {
2394	*offsets++=sourceIndex++;
2395	--count;
2396	}
2397	}
2398
2399	/ write back the updated pointers /
2400	pArgs->source=(const char *)source;
2401	pArgs->target=target;
2402	pArgs->offsets=offsets;
2403	}
2404
2405	static UBool
2406	hasValidTrailBytes(const int32_t (*stateTable)[`256`], uint8_t state) {
2407	const int32_t *row=stateTable[state];
2408	int32_t b, entry;
2409	/ First test for final entries in this state for some commonly valid byte values. /
2410	entry=row[`0xa1`];
2411	if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
2412	MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
2413	) {
2414	return TRUE;
2415	}
2416	entry=row[`0x41`];
2417	if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
2418	MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
2419	) {
2420	return TRUE;
2421	}
2422	/ Then test for final entries in this state. /
2423	for(b=`0`; b<=`0xff`; ++b) {
2424	entry=row[b];
2425	if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
2426	MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
2427	) {
2428	return TRUE;
2429	}
2430	}
2431	/ Then recurse for transition entries. /
2432	for(b=`0`; b<=`0xff`; ++b) {
2433	entry=row[b];
2434	if( MBCS_ENTRY_IS_TRANSITION(entry) &&
2435	hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
2436	) {
2437	return TRUE;
2438	}
2439	}
2440	return FALSE;
2441	}
2442
2443	/*
2444	* Is byte b a single/lead byte in this state?
2445	* Recurse for transition states, because here we don't want to say that
2446	* b is a lead byte if all byte sequences that start with b are illegal.
2447	*/
2448	static UBool
2449	isSingleOrLead(const int32_t (*stateTable)[`256`], uint8_t state, UBool isDBCSOnly, uint8_t b) {
2450	const int32_t *row=stateTable[state];
2451	int32_t entry=row[b];
2452	if(MBCS_ENTRY_IS_TRANSITION(entry)) { / lead byte /
2453	return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
2454	} else {
2455	uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2456	if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
2457	return FALSE; / SI/SO are illegal for DBCS-only conversion /
2458	} else {
2459	return action!=MBCS_STATE_ILLEGAL;
2460	}
2461	}
2462	}
2463
2464	U_CFUNC void
2465	ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
2466	UErrorCode *pErrorCode) {
2467	UConverter *cnv;
2468	const uint8_t source, sourceLimit;
2469	UChar *target;
2470	const UChar *targetLimit;
2471	int32_t *offsets;
2472
2473	const int32_t (*stateTable)[`256`];
2474	const uint16_t *unicodeCodeUnits;
2475
2476	uint32_t offset;
2477	uint8_t state;
2478	int8_t byteIndex;
2479	uint8_t *bytes;
2480
2481	int32_t sourceIndex, nextSourceIndex;
2482
2483	int32_t entry;
2484	UChar c;
2485	uint8_t action;
2486
2487	/ use optimized function if possible /
2488	cnv=pArgs->converter;
2489
2490	if(cnv->preToULength>`0`) {
2491	/*
2492	* pass sourceIndex=-1 because we continue from an earlier buffer
2493	* in the future, this may change with continuous offsets
2494	*/
2495	ucnv_extContinueMatchToU(cnv, pArgs, -`1`, pErrorCode);
2496
2497	if(U_FAILURE(*pErrorCode) \|\| cnv->preToULength<`0`) {
2498	return;
2499	}
2500	}
2501
2502	if(cnv->sharedData->mbcs.countStates==`1`) {
2503	if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
2504	ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
2505	} else {
2506	ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
2507	}
2508	return;
2509	}
2510
2511	/ set up the local pointers /
2512	source=(const uint8_t *)pArgs->source;
2513	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2514	target=pArgs->target;
2515	targetLimit=pArgs->targetLimit;
2516	offsets=pArgs->offsets;
2517
2518	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=`0`) {
2519	stateTable=(const int32_t (*)[`256`])cnv->sharedData->mbcs.swapLFNLStateTable;
2520	} else {
2521	stateTable=cnv->sharedData->mbcs.stateTable;
2522	}
2523	unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
2524
2525	/ get the converter state from UConverter /
2526	offset=cnv->toUnicodeStatus;
2527	byteIndex=cnv->toULength;
2528	bytes=cnv->toUBytes;
2529
2530	/*
2531	* if we are in the SBCS state for a DBCS-only converter,
2532	* then load the DBCS state from the MBCS data
2533	* (dbcsOnlyState==0 if it is not a DBCS-only converter)
2534	*/
2535	if((state=(uint8_t)(cnv->mode))==`0`) {
2536	state=cnv->sharedData->mbcs.dbcsOnlyState;
2537	}
2538
2539	/ sourceIndex=-1 if the current character began in the previous buffer /
2540	sourceIndex=byteIndex==`0` ? `0` : -`1`;
2541	nextSourceIndex=`0`;
2542
2543	/ conversion loop /
2544	while(source<sourceLimit) {
2545	/*
2546	* This following test is to see if available input would overflow the output.
2547	* It does not catch output of more than one code unit that
2548	* overflows as a result of a surrogate pair or callback output
2549	* from the last source byte.
2550	* Therefore, those situations also test for overflows and will
2551	* then break the loop, too.
2552	*/
2553	if(target>=targetLimit) {
2554	/ target is full /
2555	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2556	break;
2557	}
2558
2559	if(byteIndex==`0`) {
2560	/ optimized loop for 1/2-byte input and BMP output /
2561	if(offsets==NULL) {
2562	do {
2563	entry=stateTable[state][*source];
2564	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2565	state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2566	offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2567
2568	++source;
2569	if( source<sourceLimit &&
2570	MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2571	MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2572	(c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<`0xfffe`
2573	) {
2574	++source;
2575	*target++=c;
2576	state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); / typically 0 /
2577	offset=`0`;
2578	} else {
2579	/ set the state and leave the optimized loop /
2580	bytes[`0`]=*(source-`1`);
2581	byteIndex=`1`;
2582	break;
2583	}
2584	} else {
2585	if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2586	/ output BMP code point /
2587	++source;
2588	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2589	state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); / typically 0 /
2590	} else {
2591	/ leave the optimized loop /
2592	break;
2593	}
2594	}
2595	} while(source<sourceLimit && target<targetLimit);
2596	} else / offsets!=NULL / {
2597	do {
2598	entry=stateTable[state][*source];
2599	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2600	state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2601	offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2602
2603	++source;
2604	if( source<sourceLimit &&
2605	MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2606	MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2607	(c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<`0xfffe`
2608	) {
2609	++source;
2610	*target++=c;
2611	if(offsets!=NULL) {
2612	*offsets++=sourceIndex;
2613	sourceIndex=(nextSourceIndex+=`2`);
2614	}
2615	state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); / typically 0 /
2616	offset=`0`;
2617	} else {
2618	/ set the state and leave the optimized loop /
2619	++nextSourceIndex;
2620	bytes[`0`]=*(source-`1`);
2621	byteIndex=`1`;
2622	break;
2623	}
2624	} else {
2625	if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2626	/ output BMP code point /
2627	++source;
2628	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2629	if(offsets!=NULL) {
2630	*offsets++=sourceIndex;
2631	sourceIndex=++nextSourceIndex;
2632	}
2633	state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); / typically 0 /
2634	} else {
2635	/ leave the optimized loop /
2636	break;
2637	}
2638	}
2639	} while(source<sourceLimit && target<targetLimit);
2640	}
2641
2642	/*
2643	* these tests and break statements could be put inside the loop
2644	* if C had "break outerLoop" like Java
2645	*/
2646	if(source>=sourceLimit) {
2647	break;
2648	}
2649	if(target>=targetLimit) {
2650	/ target is full /
2651	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2652	break;
2653	}
2654
2655	++nextSourceIndex;
2656	bytes[byteIndex++]=*source++;
2657	} else / byteIndex>0 / {
2658	++nextSourceIndex;
2659	entry=stateTable[state][bytes[byteIndex++]=*source++];
2660	}
2661
2662	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2663	state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2664	offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2665	continue;
2666	}
2667
2668	/ save the previous state for proper extension mapping with SI/SO-stateful converters /
2669	cnv->mode=state;
2670
2671	/ set the next state early so that we can reuse the entry variable /
2672	state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); / typically 0 /
2673
2674	/*
2675	* An if-else-if chain provides more reliable performance for
2676	* the most common cases compared to a switch.
2677	*/
2678	action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2679	if(action==MBCS_STATE_VALID_16) {
2680	offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2681	c=unicodeCodeUnits[offset];
2682	if(c<`0xfffe`) {
2683	/ output BMP code point /
2684	*target++=c;
2685	if(offsets!=NULL) {
2686	*offsets++=sourceIndex;
2687	}
2688	byteIndex=`0`;
2689	} else if(c==`0xfffe`) {
2690	if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=`0xfffe`) {
2691	/ output fallback BMP code point /
2692	*target++=(UChar)entry;
2693	if(offsets!=NULL) {
2694	*offsets++=sourceIndex;
2695	}
2696	byteIndex=`0`;
2697	}
2698	} else {
2699	/ callback(illegal) /
2700	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
2701	}
2702	} else if(action==MBCS_STATE_VALID_DIRECT_16) {
2703	/ output BMP code point /
2704	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2705	if(offsets!=NULL) {
2706	*offsets++=sourceIndex;
2707	}
2708	byteIndex=`0`;
2709	} else if(action==MBCS_STATE_VALID_16_PAIR) {
2710	offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2711	c=unicodeCodeUnits[offset++];
2712	if(c<`0xd800`) {
2713	/ output BMP code point below 0xd800 /
2714	*target++=c;
2715	if(offsets!=NULL) {
2716	*offsets++=sourceIndex;
2717	}
2718	byteIndex=`0`;
2719	} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=`0xdfff` : c<=`0xdbff`) {
2720	/ output roundtrip or fallback surrogate pair /
2721	*target++=(UChar)(c&`0xdbff`);
2722	if(offsets!=NULL) {
2723	*offsets++=sourceIndex;
2724	}
2725	byteIndex=`0`;
2726	if(target<targetLimit) {
2727	*target++=unicodeCodeUnits[offset];
2728	if(offsets!=NULL) {
2729	*offsets++=sourceIndex;
2730	}
2731	} else {
2732	/ target overflow /
2733	cnv->UCharErrorBuffer[`0`]=unicodeCodeUnits[offset];
2734	cnv->UCharErrorBufferLength=`1`;
2735	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2736
2737	offset=`0`;
2738	break;
2739	}
2740	} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&`0xfffe`)==`0xe000` : c==`0xe000`) {
2741	/ output roundtrip BMP code point above 0xd800 or fallback BMP code point /
2742	*target++=unicodeCodeUnits[offset];
2743	if(offsets!=NULL) {
2744	*offsets++=sourceIndex;
2745	}
2746	byteIndex=`0`;
2747	} else if(c==`0xffff`) {
2748	/ callback(illegal) /
2749	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
2750	}
2751	} else if(action==MBCS_STATE_VALID_DIRECT_20 \|\|
2752	(action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2753	) {
2754	entry=MBCS_ENTRY_FINAL_VALUE(entry);
2755	/ output surrogate pair /
2756	*target++=(UChar)(`0xd800`\|(UChar)(entry>>`10`));
2757	if(offsets!=NULL) {
2758	*offsets++=sourceIndex;
2759	}
2760	byteIndex=`0`;
2761	c=(UChar)(`0xdc00`\|(UChar)(entry&`0x3ff`));
2762	if(target<targetLimit) {
2763	*target++=c;
2764	if(offsets!=NULL) {
2765	*offsets++=sourceIndex;
2766	}
2767	} else {
2768	/ target overflow /
2769	cnv->UCharErrorBuffer[`0`]=c;
2770	cnv->UCharErrorBufferLength=`1`;
2771	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2772
2773	offset=`0`;
2774	break;
2775	}
2776	} else if(action==MBCS_STATE_CHANGE_ONLY) {
2777	/*
2778	* This serves as a state change without any output.
2779	* It is useful for reading simple stateful encodings,
2780	* for example using just Shift-In/Shift-Out codes.
2781	* The 21 unused bits may later be used for more sophisticated
2782	* state transitions.
2783	*/
2784	if(cnv->sharedData->mbcs.dbcsOnlyState==`0`) {
2785	byteIndex=`0`;
2786	} else {
2787	/ SI/SO are illegal for DBCS-only conversion /
2788	state=(uint8_t)(cnv->mode); / restore the previous state /
2789
2790	/ callback(illegal) /
2791	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
2792	}
2793	} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2794	if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2795	/ output BMP code point /
2796	*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2797	if(offsets!=NULL) {
2798	*offsets++=sourceIndex;
2799	}
2800	byteIndex=`0`;
2801	}
2802	} else if(action==MBCS_STATE_UNASSIGNED) {
2803	/ just fall through /
2804	} else if(action==MBCS_STATE_ILLEGAL) {
2805	/ callback(illegal) /
2806	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
2807	} else {
2808	/ reserved, must never occur /
2809	byteIndex=`0`;
2810	}
2811
2812	/ end of action codes: prepare for a new character /
2813	offset=`0`;
2814
2815	if(byteIndex==`0`) {
2816	sourceIndex=nextSourceIndex;
2817	} else if(U_FAILURE(*pErrorCode)) {
2818	/ callback(illegal) /
2819	if(byteIndex>`1`) {
2820	/*
2821	* Ticket 5691: consistent illegal sequences:
2822	* - We include at least the first byte in the illegal sequence.
2823	* - If any of the non-initial bytes could be the start of a character,
2824	* we stop the illegal sequence before the first one of those.
2825	*/
2826	UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=`0`);
2827	int8_t i;
2828	for(i=`1`;
2829	i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
2830	++i) {}
2831	if(i<byteIndex) {
2832	/ Back out some bytes. /
2833	int8_t backOutDistance=byteIndex-i;
2834	int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
2835	byteIndex=i; / length of reported illegal byte sequence /
2836	if(backOutDistance<=bytesFromThisBuffer) {
2837	source-=backOutDistance;
2838	} else {
2839	/ Back out bytes from the previous buffer: Need to replay them. /
2840	cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
2841	/ preToULength is negative! /
2842	uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
2843	source=(const uint8_t *)pArgs->source;
2844	}
2845	}
2846	}
2847	break;
2848	} else / unassigned sequences indicated with byteIndex>0 / {
2849	/ try an extension mapping /
2850	pArgs->source=(const char *)source;
2851	byteIndex=_extToU(cnv, cnv->sharedData,
2852	byteIndex, &source, sourceLimit,
2853	&target, targetLimit,
2854	&offsets, sourceIndex,
2855	pArgs->flush,
2856	pErrorCode);
2857	sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
2858
2859	if(U_FAILURE(*pErrorCode)) {
2860	/ not mappable or buffer overflow /
2861	break;
2862	}
2863	}
2864	}
2865
2866	/ set the converter state back into UConverter /
2867	cnv->toUnicodeStatus=offset;
2868	cnv->mode=state;
2869	cnv->toULength=byteIndex;
2870
2871	/ write back the updated pointers /
2872	pArgs->source=(const char *)source;
2873	pArgs->target=target;
2874	pArgs->offsets=offsets;
2875	}
2876
2877	/*
2878	* This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
2879	* We still need a conversion loop in case we find reserved action codes, which are to be ignored.
2880	*/
2881	static UChar32
2882	ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
2883	UErrorCode *pErrorCode) {
2884	UConverter *cnv;
2885	const int32_t (*stateTable)[`256`];
2886	const uint8_t source, sourceLimit;
2887
2888	int32_t entry;
2889	uint8_t action;
2890
2891	/ set up the local pointers /
2892	cnv=pArgs->converter;
2893	source=(const uint8_t *)pArgs->source;
2894	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2895	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=`0`) {
2896	stateTable=(const int32_t (*)[`256`])cnv->sharedData->mbcs.swapLFNLStateTable;
2897	} else {
2898	stateTable=cnv->sharedData->mbcs.stateTable;
2899	}
2900
2901	/ conversion loop /
2902	while(source<sourceLimit) {
2903	entry=stateTable[`0`][*source++];
2904	/ MBCS_ENTRY_IS_FINAL(entry) /
2905
2906	/ write back the updated pointer early so that we can return directly /
2907	pArgs->source=(const char *)source;
2908
2909	if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2910	/ output BMP code point /
2911	return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2912	}
2913
2914	/*
2915	* An if-else-if chain provides more reliable performance for
2916	* the most common cases compared to a switch.
2917	*/
2918	action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2919	if( action==MBCS_STATE_VALID_DIRECT_20 \|\|
2920	(action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2921	) {
2922	/ output supplementary code point /
2923	return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+`0x10000`);
2924	} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2925	if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2926	/ output BMP code point /
2927	return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2928	}
2929	} else if(action==MBCS_STATE_UNASSIGNED) {
2930	/ just fall through /
2931	} else if(action==MBCS_STATE_ILLEGAL) {
2932	/ callback(illegal) /
2933	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
2934	} else {
2935	/ reserved, must never occur /
2936	continue;
2937	}
2938
2939	if(U_FAILURE(*pErrorCode)) {
2940	/ callback(illegal) /
2941	break;
2942	} else / unassigned sequence / {
2943	/ defer to the generic implementation /
2944	pArgs->source=(const char *)source-`1`;
2945	return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2946	}
2947	}
2948
2949	/ no output because of empty input or only state changes /
2950	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2951	return `0xffff`;
2952	}
2953
2954	/*
2955	* Version of _MBCSToUnicodeWithOffsets() optimized for single-character
2956	* conversion without offset handling.
2957	*
2958	* When a character does not have a mapping to Unicode, then we return to the
2959	* generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
2960	* handling.
2961	* We also defer to the generic code in other complicated cases and have them
2962	* ultimately handled by _MBCSToUnicodeWithOffsets() itself.
2963	*
2964	* All normal mappings and errors are handled here.
2965	*/
2966	static UChar32 U_CALLCONV
2967	ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
2968	UErrorCode *pErrorCode) {
2969	UConverter *cnv;
2970	const uint8_t source, sourceLimit, *lastSource;
2971
2972	const int32_t (*stateTable)[`256`];
2973	const uint16_t *unicodeCodeUnits;
2974
2975	uint32_t offset;
2976	uint8_t state;
2977
2978	int32_t entry;
2979	UChar32 c;
2980	uint8_t action;
2981
2982	/ use optimized function if possible /
2983	cnv=pArgs->converter;
2984
2985	if(cnv->preToULength>`0`) {
2986	/ use the generic code in ucnv_getNextUChar() to continue with a partial match /
2987	return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2988	}
2989
2990	if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
2991	/*
2992	* Using the generic ucnv_getNextUChar() code lets us deal correctly
2993	* with the rare case of a codepage that maps single surrogates
2994	* without adding the complexity to this already complicated function here.
2995	*/
2996	return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2997	} else if(cnv->sharedData->mbcs.countStates==`1`) {
2998	return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);
2999	}
3000
3001	/ set up the local pointers /
3002	source=lastSource=(const uint8_t *)pArgs->source;
3003	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
3004
3005	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=`0`) {
3006	stateTable=(const int32_t (*)[`256`])cnv->sharedData->mbcs.swapLFNLStateTable;
3007	} else {
3008	stateTable=cnv->sharedData->mbcs.stateTable;
3009	}
3010	unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
3011
3012	/ get the converter state from UConverter /
3013	offset=cnv->toUnicodeStatus;
3014
3015	/*
3016	* if we are in the SBCS state for a DBCS-only converter,
3017	* then load the DBCS state from the MBCS data
3018	* (dbcsOnlyState==0 if it is not a DBCS-only converter)
3019	*/
3020	if((state=(uint8_t)(cnv->mode))==`0`) {
3021	state=cnv->sharedData->mbcs.dbcsOnlyState;
3022	}
3023
3024	/ conversion loop /
3025	c=U_SENTINEL;
3026	while(source<sourceLimit) {
3027	entry=stateTable[state][*source++];
3028	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
3029	state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
3030	offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
3031
3032	/ optimization for 1/2-byte input and BMP output /
3033	if( source<sourceLimit &&
3034	MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
3035	MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
3036	(c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<`0xfffe`
3037	) {
3038	++source;
3039	state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); / typically 0 /
3040	/ output BMP code point /
3041	break;
3042	}
3043	} else {
3044	/ save the previous state for proper extension mapping with SI/SO-stateful converters /
3045	cnv->mode=state;
3046
3047	/ set the next state early so that we can reuse the entry variable /
3048	state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); / typically 0 /
3049
3050	/*
3051	* An if-else-if chain provides more reliable performance for
3052	* the most common cases compared to a switch.
3053	*/
3054	action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
3055	if(action==MBCS_STATE_VALID_DIRECT_16) {
3056	/ output BMP code point /
3057	c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3058	break;
3059	} else if(action==MBCS_STATE_VALID_16) {
3060	offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
3061	c=unicodeCodeUnits[offset];
3062	if(c<`0xfffe`) {
3063	/ output BMP code point /
3064	break;
3065	} else if(c==`0xfffe`) {
3066	if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=`0xfffe`) {
3067	break;
3068	}
3069	} else {
3070	/ callback(illegal) /
3071	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
3072	}
3073	} else if(action==MBCS_STATE_VALID_16_PAIR) {
3074	offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
3075	c=unicodeCodeUnits[offset++];
3076	if(c<`0xd800`) {
3077	/ output BMP code point below 0xd800 /
3078	break;
3079	} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=`0xdfff` : c<=`0xdbff`) {
3080	/ output roundtrip or fallback supplementary code point /
3081	c=((c&`0x3ff`)<<`10`)+unicodeCodeUnits[offset]+(`0x10000`-`0xdc00`);
3082	break;
3083	} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&`0xfffe`)==`0xe000` : c==`0xe000`) {
3084	/ output roundtrip BMP code point above 0xd800 or fallback BMP code point /
3085	c=unicodeCodeUnits[offset];
3086	break;
3087	} else if(c==`0xffff`) {
3088	/ callback(illegal) /
3089	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
3090	}
3091	} else if(action==MBCS_STATE_VALID_DIRECT_20 \|\|
3092	(action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
3093	) {
3094	/ output supplementary code point /
3095	c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+`0x10000`);
3096	break;
3097	} else if(action==MBCS_STATE_CHANGE_ONLY) {
3098	/*
3099	* This serves as a state change without any output.
3100	* It is useful for reading simple stateful encodings,
3101	* for example using just Shift-In/Shift-Out codes.
3102	* The 21 unused bits may later be used for more sophisticated
3103	* state transitions.
3104	*/
3105	if(cnv->sharedData->mbcs.dbcsOnlyState!=`0`) {
3106	/ SI/SO are illegal for DBCS-only conversion /
3107	state=(uint8_t)(cnv->mode); / restore the previous state /
3108
3109	/ callback(illegal) /
3110	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
3111	}
3112	} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
3113	if(UCNV_TO_U_USE_FALLBACK(cnv)) {
3114	/ output BMP code point /
3115	c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3116	break;
3117	}
3118	} else if(action==MBCS_STATE_UNASSIGNED) {
3119	/ just fall through /
3120	} else if(action==MBCS_STATE_ILLEGAL) {
3121	/ callback(illegal) /
3122	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
3123	} else {
3124	/ reserved (must never occur), or only state change /
3125	offset=`0`;
3126	lastSource=source;
3127	continue;
3128	}
3129
3130	/ end of action codes: prepare for a new character /
3131	offset=`0`;
3132
3133	if(U_FAILURE(*pErrorCode)) {
3134	/ callback(illegal) /
3135	break;
3136	} else / unassigned sequence / {
3137	/ defer to the generic implementation /
3138	cnv->toUnicodeStatus=`0`;
3139	cnv->mode=state;
3140	pArgs->source=(const char *)lastSource;
3141	return UCNV_GET_NEXT_UCHAR_USE_TO_U;
3142	}
3143	}
3144	}
3145
3146	if(c<`0`) {
3147	if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
3148	/ incomplete character byte sequence /
3149	uint8_t *bytes=cnv->toUBytes;
3150	cnv->toULength=(int8_t)(source-lastSource);
3151	do {
3152	bytes++=lastSource++;
3153	} while(lastSource<source);
3154	*pErrorCode=U_TRUNCATED_CHAR_FOUND;
3155	} else if(U_FAILURE(*pErrorCode)) {
3156	/ callback(illegal) /
3157	/*
3158	* Ticket 5691: consistent illegal sequences:
3159	* - We include at least the first byte in the illegal sequence.
3160	* - If any of the non-initial bytes could be the start of a character,
3161	* we stop the illegal sequence before the first one of those.
3162	*/
3163	UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=`0`);
3164	uint8_t *bytes=cnv->toUBytes;
3165	bytes++=lastSource++; / first byte /
3166	if(lastSource==source) {
3167	cnv->toULength=`1`;
3168	} else / lastSource<source: multi-byte character / {
3169	int8_t i;
3170	for(i=`1`;
3171	lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
3172	++i
3173	) {
3174	bytes++=lastSource++;
3175	}
3176	cnv->toULength=i;
3177	source=lastSource;
3178	}
3179	} else {
3180	/ no output because of empty input or only state changes /
3181	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
3182	}
3183	c=`0xffff`;
3184	}
3185
3186	/ set the converter state back into UConverter, ready for a new character /
3187	cnv->toUnicodeStatus=`0`;
3188	cnv->mode=state;
3189
3190	/ write back the updated pointer /
3191	pArgs->source=(const char *)source;
3192	return c;
3193	}
3194
#if 0
/*
 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
 * Removal improves code coverage.
 */
/**
 * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
 * It does not handle conversion extensions (_extToU()).
 *
 * @param sharedData  converter data whose mbcs.stateTable row 0 is consulted
 * @param b           the single input byte
 * @param useFallback passed to TO_U_USE_FALLBACK() to decide about fallback mappings
 * @return the code point, 0xfffe for unassigned (or an unused fallback),
 *         0xffff for illegal or reserved action codes
 */
U_CFUNC UChar32
ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
                              uint8_t b, UBool useFallback) {
    int32_t entry;
    uint8_t action;

    entry=sharedData->mbcs.stateTable[0][b];
    /* MBCS_ENTRY_IS_FINAL(entry) */

    if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
        /* output BMP code point */
        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
    }

    /*
     * An if-else-if chain provides more reliable performance for
     * the most common cases compared to a switch.
     */
    action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
    if(action==MBCS_STATE_VALID_DIRECT_20) {
        /* output supplementary code point */
        return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
    } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
        if(!TO_U_USE_FALLBACK(useFallback)) {
            return 0xfffe;
        }
        /* output BMP code point */
        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
    } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
        if(!TO_U_USE_FALLBACK(useFallback)) {
            return 0xfffe;
        }
        /* output supplementary code point */
        return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
    } else if(action==MBCS_STATE_UNASSIGNED) {
        return 0xfffe;
    } else if(action==MBCS_STATE_ILLEGAL) {
        return 0xffff;
    } else {
        /* reserved, must never occur */
        return 0xffff;
    }
}
#endif
3249
3250	/*
3251	* This is a simple version of _MBCSGetNextUChar() that is used
3252	* by other converter implementations.
3253	* It only returns an "assigned" result if it consumes the entire input.
3254	* It does not use state from the converter, nor error codes.
3255	* It does not handle the EBCDIC swaplfnl option (set in UConverter).
3256	* It handles conversion extensions but not GB 18030.
3257	*
3258	* Return value:
3259	* U+fffe unassigned
3260	* U+ffff illegal
3261	* otherwise the Unicode code point
3262	*/
3263	U_CFUNC UChar32
3264	ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
3265	const char *source, int32_t length,
3266	UBool useFallback) {
3267	const int32_t (*stateTable)[`256`];
3268	const uint16_t *unicodeCodeUnits;
3269
3270	uint32_t offset;
3271	uint8_t state, action;
3272
3273	UChar32 c;
3274	int32_t i, entry;
3275
3276	if(length<=`0`) {
3277	/ no input at all: "illegal" /
3278	return `0xffff`;
3279	}
3280
3281	#if 0
3282	/*
3283	* Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
3284	* TODO In future releases, verify that this function is never called for SBCS
3285	* conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
3286	* Removal improves code coverage.
3287	*/
3288	/ use optimized function if possible /
3289	if(sharedData->mbcs.countStates==`1`) {
3290	if(length==`1`) {
3291	return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
3292	} else {
3293	return `0xffff`; / illegal: more than a single byte for an SBCS converter /
3294	}
3295	}
3296	#endif
3297
3298	/ set up the local pointers /
3299	stateTable=sharedData->mbcs.stateTable;
3300	unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;
3301
3302	/ converter state /
3303	offset=`0`;
3304	state=sharedData->mbcs.dbcsOnlyState;
3305
3306	/ conversion loop /
3307	for(i=`0`;;) {
3308	entry=stateTable[state][(uint8_t)source[i++]];
3309	if(MBCS_ENTRY_IS_TRANSITION(entry)) {
3310	state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
3311	offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
3312
3313	if(i==length) {
3314	return `0xffff`; / truncated character /
3315	}
3316	} else {
3317	/*
3318	* An if-else-if chain provides more reliable performance for
3319	* the most common cases compared to a switch.
3320	*/
3321	action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
3322	if(action==MBCS_STATE_VALID_16) {
3323	offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
3324	c=unicodeCodeUnits[offset];
3325	if(c!=`0xfffe`) {
3326	/ done /
3327	} else if(UCNV_TO_U_USE_FALLBACK(cnv)) {
3328	c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);
3329	/ else done with 0xfffe /
3330	}
3331	break;
3332	} else if(action==MBCS_STATE_VALID_DIRECT_16) {
3333	/ output BMP code point /
3334	c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3335	break;
3336	} else if(action==MBCS_STATE_VALID_16_PAIR) {
3337	offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
3338	c=unicodeCodeUnits[offset++];
3339	if(c<`0xd800`) {
3340	/ output BMP code point below 0xd800 /
3341	} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=`0xdfff` : c<=`0xdbff`) {
3342	/ output roundtrip or fallback supplementary code point /
3343	c=(UChar32)(((c&`0x3ff`)<<`10`)+unicodeCodeUnits[offset]+(`0x10000`-`0xdc00`));
3344	} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&`0xfffe`)==`0xe000` : c==`0xe000`) {
3345	/ output roundtrip BMP code point above 0xd800 or fallback BMP code point /
3346	c=unicodeCodeUnits[offset];
3347	} else if(c==`0xffff`) {
3348	return `0xffff`;
3349	} else {
3350	c=`0xfffe`;
3351	}
3352	break;
3353	} else if(action==MBCS_STATE_VALID_DIRECT_20) {
3354	/ output supplementary code point /
3355	c=`0x10000`+MBCS_ENTRY_FINAL_VALUE(entry);
3356	break;
3357	} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
3358	if(!TO_U_USE_FALLBACK(useFallback)) {
3359	c=`0xfffe`;
3360	break;
3361	}
3362	/ output BMP code point /
3363	c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3364	break;
3365	} else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
3366	if(!TO_U_USE_FALLBACK(useFallback)) {
3367	c=`0xfffe`;
3368	break;
3369	}
3370	/ output supplementary code point /
3371	c=`0x10000`+MBCS_ENTRY_FINAL_VALUE(entry);
3372	break;
3373	} else if(action==MBCS_STATE_UNASSIGNED) {
3374	c=`0xfffe`;
3375	break;
3376	}
3377
3378	/*
3379	* forbid MBCS_STATE_CHANGE_ONLY for this function,
3380	* and MBCS_STATE_ILLEGAL and reserved action codes
3381	*/
3382	return `0xffff`;
3383	}
3384	}
3385
3386	if(i!=length) {
3387	/ illegal for this function: not all input consumed /
3388	return `0xffff`;
3389	}
3390
3391	if(c==`0xfffe`) {
3392	/ try an extension mapping /
3393	const int32_t *cx=sharedData->mbcs.extIndexes;
3394	if(cx!=NULL) {
3395	return ucnv_extSimpleMatchToU(cx, source, length, useFallback);
3396	}
3397	}
3398
3399	return c;
3400	}
3401
/* MBCS-from-Unicode conversion functions ----------------------------------- */
3403
3404	/ This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. /
3405	static void
3406	ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3407	UErrorCode *pErrorCode) {
3408	UConverter *cnv;
3409	const UChar source, sourceLimit;
3410	uint8_t *target;
3411	int32_t targetCapacity;
3412	int32_t *offsets;
3413
3414	const uint16_t *table;
3415	const uint16_t *mbcsIndex;
3416	const uint8_t *bytes;
3417
3418	UChar32 c;
3419
3420	int32_t sourceIndex, nextSourceIndex;
3421
3422	uint32_t stage2Entry;
3423	uint32_t asciiRoundtrips;
3424	uint32_t value;
3425	uint8_t unicodeMask;
3426
3427	/ use optimized function if possible /
3428	cnv=pArgs->converter;
3429	unicodeMask=cnv->sharedData->mbcs.unicodeMask;
3430
3431	/ set up the local pointers /
3432	source=pArgs->source;
3433	sourceLimit=pArgs->sourceLimit;
3434	target=(uint8_t *)pArgs->target;
3435	targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3436	offsets=pArgs->offsets;
3437
3438	table=cnv->sharedData->mbcs.fromUnicodeTable;
3439	mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
3440	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=`0`) {
3441	bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3442	} else {
3443	bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
3444	}
3445	asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
3446
3447	/ get the converter state from UConverter /
3448	c=cnv->fromUChar32;
3449
3450	/ sourceIndex=-1 if the current character began in the previous buffer /
3451	sourceIndex= c==`0` ? `0` : -`1`;
3452	nextSourceIndex=`0`;
3453
3454	/ conversion loop /
3455	if(c!=`0` && targetCapacity>`0`) {
3456	goto getTrail;
3457	}
3458
3459	while(source<sourceLimit) {
3460	/*
3461	* This following test is to see if available input would overflow the output.
3462	* It does not catch output of more than one byte that
3463	* overflows as a result of a multi-byte character or callback output
3464	* from the last source character.
3465	* Therefore, those situations also test for overflows and will
3466	* then break the loop, too.
3467	*/
3468	if(targetCapacity>`0`) {
3469	/*
3470	* Get a correct Unicode code point:
3471	* a single UChar for a BMP code point or
3472	* a matched surrogate pair for a "supplementary code point".
3473	*/
3474	c=*source++;
3475	++nextSourceIndex;
3476	if(c<=`0x7f` && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3477	*target++=(uint8_t)c;
3478	if(offsets!=NULL) {
3479	*offsets++=sourceIndex;
3480	sourceIndex=nextSourceIndex;
3481	}
3482	--targetCapacity;
3483	c=`0`;
3484	continue;
3485	}
3486	/*
3487	* utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
3488	* to avoid dealing with surrogates.
3489	* MBCS_FAST_MAX must be >=0xd7ff.
3490	*/
3491	if(c<=`0xd7ff`) {
3492	value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c);
3493	/ There are only roundtrips (!=0) and no-mapping (==0) entries. /
3494	if(value==`0`) {
3495	goto unassigned;
3496	}
3497	/ output the value /
3498	} else {
3499	/*
3500	* This also tests if the codepage maps single surrogates.
3501	* If it does, then surrogates are not paired but mapped separately.
3502	* Note that in this case unmatched surrogates are not detected.
3503	*/
3504	if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
3505	if(U16_IS_SURROGATE_LEAD(c)) {
3506	getTrail:
3507	if(source<sourceLimit) {
3508	/ test the following code unit /
3509	UChar trail=*source;
3510	if(U16_IS_TRAIL(trail)) {
3511	++source;
3512	++nextSourceIndex;
3513	c=U16_GET_SUPPLEMENTARY(c, trail);
3514	if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3515	/ BMP-only codepages are stored without stage 1 entries for supplementary code points /
3516	/ callback(unassigned) /
3517	goto unassigned;
3518	}
3519	/ convert this supplementary code point /
3520	/ exit this condition tree /
3521	} else {
3522	/ this is an unmatched lead code unit (1st surrogate) /
3523	/ callback(illegal) /
3524	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
3525	break;
3526	}
3527	} else {
3528	/ no more input /
3529	break;
3530	}
3531	} else {
3532	/ this is an unmatched trail code unit (2nd surrogate) /
3533	/ callback(illegal) /
3534	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
3535	break;
3536	}
3537	}
3538
3539	/ convert the Unicode code point in c into codepage bytes /
3540	stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
3541
3542	/ get the bytes and the length for the output /
3543	/ MBCS_OUTPUT_2 /
3544	value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3545
3546	/ is this code point assigned, or do we use fallbacks? /
3547	if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) \|\|
3548	(UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=`0`))
3549	) {
3550	/*
3551	* We allow a 0 byte output if the "assigned" bit is set for this entry.
3552	* There is no way with this data structure for fallback output
3553	* to be a zero byte.
3554	*/
3555
3556	unassigned:
3557	/ try an extension mapping /
3558	pArgs->source=source;
3559	c=_extFromU(cnv, cnv->sharedData,
3560	c, &source, sourceLimit,
3561	&target, target+targetCapacity,
3562	&offsets, sourceIndex,
3563	pArgs->flush,
3564	pErrorCode);
3565	nextSourceIndex+=(int32_t)(source-pArgs->source);
3566
3567	if(U_FAILURE(*pErrorCode)) {
3568	/ not mappable or buffer overflow /
3569	break;
3570	} else {
3571	/ a mapping was written to the target, continue /
3572
3573	/ recalculate the targetCapacity after an extension mapping /
3574	targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3575
3576	/ normal end of conversion: prepare for a new character /
3577	sourceIndex=nextSourceIndex;
3578	continue;
3579	}
3580	}
3581	}
3582
3583	/ write the output character bytes from value and length /
3584	/ from the first if in the loop we know that targetCapacity>0 /
3585	if(value<=`0xff`) {
3586	/ this is easy because we know that there is enough space /
3587	*target++=(uint8_t)value;
3588	if(offsets!=NULL) {
3589	*offsets++=sourceIndex;
3590	}
3591	--targetCapacity;
3592	} else / length==2 / {
3593	*target++=(uint8_t)(value>>`8`);
3594	if(`2`<=targetCapacity) {
3595	*target++=(uint8_t)value;
3596	if(offsets!=NULL) {
3597	*offsets++=sourceIndex;
3598	*offsets++=sourceIndex;
3599	}
3600	targetCapacity-=`2`;
3601	} else {
3602	if(offsets!=NULL) {
3603	*offsets++=sourceIndex;
3604	}
3605	cnv->charErrorBuffer[`0`]=(char)value;
3606	cnv->charErrorBufferLength=`1`;
3607
3608	/ target overflow /
3609	targetCapacity=`0`;
3610	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3611	c=`0`;
3612	break;
3613	}
3614	}
3615
3616	/ normal end of conversion: prepare for a new character /
3617	c=`0`;
3618	sourceIndex=nextSourceIndex;
3619	continue;
3620	} else {
3621	/ target is full /
3622	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3623	break;
3624	}
3625	}
3626
3627	/ set the converter state back into UConverter /
3628	cnv->fromUChar32=c;
3629
3630	/ write back the updated pointers /
3631	pArgs->source=source;
3632	pArgs->target=(char *)target;
3633	pArgs->offsets=offsets;
3634	}
3635
3636	/ This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. /
3637	static void
3638	ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3639	UErrorCode *pErrorCode) {
3640	UConverter *cnv;
3641	const UChar source, sourceLimit;
3642	uint8_t *target;
3643	int32_t targetCapacity;
3644	int32_t *offsets;
3645
3646	const uint16_t *table;
3647	const uint16_t *results;
3648
3649	UChar32 c;
3650
3651	int32_t sourceIndex, nextSourceIndex;
3652
3653	uint16_t value, minValue;
3654	UBool hasSupplementary;
3655
3656	/ set up the local pointers /
3657	cnv=pArgs->converter;
3658	source=pArgs->source;
3659	sourceLimit=pArgs->sourceLimit;
3660	target=(uint8_t *)pArgs->target;
3661	targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3662	offsets=pArgs->offsets;
3663
3664	table=cnv->sharedData->mbcs.fromUnicodeTable;
3665	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=`0`) {
3666	results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3667	} else {
3668	results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
3669	}
3670
3671	if(cnv->useFallback) {
3672	/ use all roundtrip and fallback results /
3673	minValue=`0x800`;
3674	} else {
3675	/ use only roundtrips and fallbacks from private-use characters /
3676	minValue=`0xc00`;
3677	}
3678	hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
3679
3680	/ get the converter state from UConverter /
3681	c=cnv->fromUChar32;
3682
3683	/ sourceIndex=-1 if the current character began in the previous buffer /
3684	sourceIndex= c==`0` ? `0` : -`1`;
3685	nextSourceIndex=`0`;
3686
3687	/ conversion loop /
3688	if(c!=`0` && targetCapacity>`0`) {
3689	goto getTrail;
3690	}
3691
3692	while(source<sourceLimit) {
3693	/*
3694	* This following test is to see if available input would overflow the output.
3695	* It does not catch output of more than one byte that
3696	* overflows as a result of a multi-byte character or callback output
3697	* from the last source character.
3698	* Therefore, those situations also test for overflows and will
3699	* then break the loop, too.
3700	*/
3701	if(targetCapacity>`0`) {
3702	/*
3703	* Get a correct Unicode code point:
3704	* a single UChar for a BMP code point or
3705	* a matched surrogate pair for a "supplementary code point".
3706	*/
3707	c=*source++;
3708	++nextSourceIndex;
3709	if(U16_IS_SURROGATE(c)) {
3710	if(U16_IS_SURROGATE_LEAD(c)) {
3711	getTrail:
3712	if(source<sourceLimit) {
3713	/ test the following code unit /
3714	UChar trail=*source;
3715	if(U16_IS_TRAIL(trail)) {
3716	++source;
3717	++nextSourceIndex;
3718	c=U16_GET_SUPPLEMENTARY(c, trail);
3719	if(!hasSupplementary) {
3720	/ BMP-only codepages are stored without stage 1 entries for supplementary code points /
3721	/ callback(unassigned) /
3722	goto unassigned;
3723	}
3724	/ convert this supplementary code point /
3725	/ exit this condition tree /
3726	} else {
3727	/ this is an unmatched lead code unit (1st surrogate) /
3728	/ callback(illegal) /
3729	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
3730	break;
3731	}
3732	} else {
3733	/ no more input /
3734	break;
3735	}
3736	} else {
3737	/ this is an unmatched trail code unit (2nd surrogate) /
3738	/ callback(illegal) /
3739	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
3740	break;
3741	}
3742	}
3743
3744	/ convert the Unicode code point in c into codepage bytes /
3745	value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3746
3747	/ is this code point assigned, or do we use fallbacks? /
3748	if(value>=minValue) {
3749	/ assigned, write the output character bytes from value and length /
3750	/ length==1 /
3751	/ this is easy because we know that there is enough space /
3752	*target++=(uint8_t)value;
3753	if(offsets!=NULL) {
3754	*offsets++=sourceIndex;
3755	}
3756	--targetCapacity;
3757
3758	/ normal end of conversion: prepare for a new character /
3759	c=`0`;
3760	sourceIndex=nextSourceIndex;
3761	} else { / unassigned /
3762	unassigned:
3763	/ try an extension mapping /
3764	pArgs->source=source;
3765	c=_extFromU(cnv, cnv->sharedData,
3766	c, &source, sourceLimit,
3767	&target, target+targetCapacity,
3768	&offsets, sourceIndex,
3769	pArgs->flush,
3770	pErrorCode);
3771	nextSourceIndex+=(int32_t)(source-pArgs->source);
3772
3773	if(U_FAILURE(*pErrorCode)) {
3774	/ not mappable or buffer overflow /
3775	break;
3776	} else {
3777	/ a mapping was written to the target, continue /
3778
3779	/ recalculate the targetCapacity after an extension mapping /
3780	targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3781
3782	/ normal end of conversion: prepare for a new character /
3783	sourceIndex=nextSourceIndex;
3784	}
3785	}
3786	} else {
3787	/ target is full /
3788	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3789	break;
3790	}
3791	}
3792
3793	/ set the converter state back into UConverter /
3794	cnv->fromUChar32=c;
3795
3796	/ write back the updated pointers /
3797	pArgs->source=source;
3798	pArgs->target=(char *)target;
3799	pArgs->offsets=offsets;
3800	}
3801
3802	/*
3803	* This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
3804	* that map only to and from the BMP.
3805	* In addition to single-byte/state optimizations, the offset calculations
3806	* become much easier.
3807	* It would be possible to use the sbcsIndex for UTF-8-friendly tables,
3808	* but measurements have shown that this diminishes performance
3809	* in more cases than it improves it.
3810	* See SVN revision 21013 (2007-feb-06) for the last version with #if switches
3811	* for various MBCS and SBCS optimizations.
3812	*/
3813	static void
3814	ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
3815	UErrorCode *pErrorCode) {
3816	UConverter *cnv;
3817	const UChar source, sourceLimit, *lastSource;
3818	uint8_t *target;
3819	int32_t targetCapacity, length;
3820	int32_t *offsets;
3821
3822	const uint16_t *table;
3823	const uint16_t *results;
3824
3825	UChar32 c;
3826
3827	int32_t sourceIndex;
3828
3829	uint32_t asciiRoundtrips;
3830	uint16_t value, minValue;
3831
3832	/ set up the local pointers /
3833	cnv=pArgs->converter;
3834	source=pArgs->source;
3835	sourceLimit=pArgs->sourceLimit;
3836	target=(uint8_t *)pArgs->target;
3837	targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3838	offsets=pArgs->offsets;
3839
3840	table=cnv->sharedData->mbcs.fromUnicodeTable;
3841	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=`0`) {
3842	results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3843	} else {
3844	results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
3845	}
3846	asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
3847
3848	if(cnv->useFallback) {
3849	/ use all roundtrip and fallback results /
3850	minValue=`0x800`;
3851	} else {
3852	/ use only roundtrips and fallbacks from private-use characters /
3853	minValue=`0xc00`;
3854	}
3855
3856	/ get the converter state from UConverter /
3857	c=cnv->fromUChar32;
3858
3859	/ sourceIndex=-1 if the current character began in the previous buffer /
3860	sourceIndex= c==`0` ? `0` : -`1`;
3861	lastSource=source;
3862
3863	/*
3864	* since the conversion here is 1:1 UChar:uint8_t, we need only one counter
3865	* for the minimum of the sourceLength and targetCapacity
3866	*/
3867	length=(int32_t)(sourceLimit-source);
3868	if(length<targetCapacity) {
3869	targetCapacity=length;
3870	}
3871
3872	/ conversion loop /
3873	if(c!=`0` && targetCapacity>`0`) {
3874	goto getTrail;
3875	}
3876
3877	#if MBCS_UNROLL_SINGLE_FROM_BMP
3878	/ unrolling makes it slower on Pentium III/Windows 2000?! /
3879	/ unroll the loop with the most common case /
3880	unrolled:
3881	if(targetCapacity>=`4`) {
3882	int32_t count, loops;
3883	uint16_t andedValues;
3884
3885	loops=count=targetCapacity>>`2`;
3886	do {
3887	c=*source++;
3888	andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3889	*target++=(uint8_t)value;
3890	c=*source++;
3891	andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3892	*target++=(uint8_t)value;
3893	c=*source++;
3894	andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3895	*target++=(uint8_t)value;
3896	c=*source++;
3897	andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3898	*target++=(uint8_t)value;
3899
3900	/ were all 4 entries really valid? /
3901	if(andedValues<minValue) {
3902	/ no, return to the first of these 4 /
3903	source-=`4`;
3904	target-=`4`;
3905	break;
3906	}
3907	} while(--count>`0`);
3908	count=loops-count;
3909	targetCapacity-=`4`*count;
3910
3911	if(offsets!=NULL) {
3912	lastSource+=`4`*count;
3913	while(count>`0`) {
3914	*offsets++=sourceIndex++;
3915	*offsets++=sourceIndex++;
3916	*offsets++=sourceIndex++;
3917	*offsets++=sourceIndex++;
3918	--count;
3919	}
3920	}
3921
3922	c=`0`;
3923	}
3924	#endif
3925
3926	while(targetCapacity>`0`) {
3927	/*
3928	* Get a correct Unicode code point:
3929	* a single UChar for a BMP code point or
3930	* a matched surrogate pair for a "supplementary code point".
3931	*/
3932	c=*source++;
3933	/*
3934	* Do not immediately check for single surrogates:
3935	* Assume that they are unassigned and check for them in that case.
3936	* This speeds up the conversion of assigned characters.
3937	*/
3938	/ convert the Unicode code point in c into codepage bytes /
3939	if(c<=`0x7f` && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3940	*target++=(uint8_t)c;
3941	--targetCapacity;
3942	c=`0`;
3943	continue;
3944	}
3945	value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3946	/ is this code point assigned, or do we use fallbacks? /
3947	if(value>=minValue) {
3948	/ assigned, write the output character bytes from value and length /
3949	/ length==1 /
3950	/ this is easy because we know that there is enough space /
3951	*target++=(uint8_t)value;
3952	--targetCapacity;
3953
3954	/ normal end of conversion: prepare for a new character /
3955	c=`0`;
3956	continue;
3957	} else if(!U16_IS_SURROGATE(c)) {
3958	/ normal, unassigned BMP character /
3959	} else if(U16_IS_SURROGATE_LEAD(c)) {
3960	getTrail:
3961	if(source<sourceLimit) {
3962	/ test the following code unit /
3963	UChar trail=*source;
3964	if(U16_IS_TRAIL(trail)) {
3965	++source;
3966	c=U16_GET_SUPPLEMENTARY(c, trail);
3967	/ this codepage does not map supplementary code points /
3968	/ callback(unassigned) /
3969	} else {
3970	/ this is an unmatched lead code unit (1st surrogate) /
3971	/ callback(illegal) /
3972	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
3973	break;
3974	}
3975	} else {
3976	/ no more input /
3977	if (pArgs->flush) {
3978	*pErrorCode=U_TRUNCATED_CHAR_FOUND;
3979	}
3980	break;
3981	}
3982	} else {
3983	/ this is an unmatched trail code unit (2nd surrogate) /
3984	/ callback(illegal) /
3985	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
3986	break;
3987	}
3988
3989	/ c does not have a mapping /
3990
3991	/ get the number of code units for c to correctly advance sourceIndex /
3992	length=U16_LENGTH(c);
3993
3994	/ set offsets since the start or the last extension /
3995	if(offsets!=NULL) {
3996	int32_t count=(int32_t)(source-lastSource);
3997
3998	/ do not set the offset for this character /
3999	count-=length;
4000
4001	while(count>`0`) {
4002	*offsets++=sourceIndex++;
4003	--count;
4004	}
4005	/ offsets and sourceIndex are now set for the current character /
4006	}
4007
4008	/ try an extension mapping /
4009	lastSource=source;
4010	c=_extFromU(cnv, cnv->sharedData,
4011	c, &source, sourceLimit,
4012	&target, (const uint8_t *)(pArgs->targetLimit),
4013	&offsets, sourceIndex,
4014	pArgs->flush,
4015	pErrorCode);
4016	sourceIndex+=length+(int32_t)(source-lastSource);
4017	lastSource=source;
4018
4019	if(U_FAILURE(*pErrorCode)) {
4020	/ not mappable or buffer overflow /
4021	break;
4022	} else {
4023	/ a mapping was written to the target, continue /
4024
4025	/ recalculate the targetCapacity after an extension mapping /
4026	targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
4027	length=(int32_t)(sourceLimit-source);
4028	if(length<targetCapacity) {
4029	targetCapacity=length;
4030	}
4031	}
4032
4033	#if MBCS_UNROLL_SINGLE_FROM_BMP
4034	/ unrolling makes it slower on Pentium III/Windows 2000?! /
4035	goto unrolled;
4036	#endif
4037	}
4038
4039	if(U_SUCCESS(pErrorCode) && source<sourceLimit && target>=(uint8_t )pArgs->targetLimit) {
4040	/ target is full /
4041	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4042	}
4043
4044	/ set offsets since the start or the last callback /
4045	if(offsets!=NULL) {
4046	size_t count=source-lastSource;
4047	if (count > `0` && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
4048	/*
4049	Caller gave us a partial supplementary character,
4050	which this function couldn't convert in any case.
4051	The callback will handle the offset.
4052	*/
4053	count--;
4054	}
4055	while(count>`0`) {
4056	*offsets++=sourceIndex++;
4057	--count;
4058	}
4059	}
4060
4061	/ set the converter state back into UConverter /
4062	cnv->fromUChar32=c;
4063
4064	/ write back the updated pointers /
4065	pArgs->source=source;
4066	pArgs->target=(char *)target;
4067	pArgs->offsets=offsets;
4068	}
4069
4070	U_CFUNC void
4071	ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
4072	UErrorCode *pErrorCode) {
4073	UConverter *cnv;
4074	const UChar source, sourceLimit;
4075	uint8_t *target;
4076	int32_t targetCapacity;
4077	int32_t *offsets;
4078
4079	const uint16_t *table;
4080	const uint16_t *mbcsIndex;
4081	const uint8_t p, bytes;
4082	uint8_t outputType;
4083
4084	UChar32 c;
4085
4086	int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
4087
4088	uint32_t stage2Entry;
4089	uint32_t asciiRoundtrips;
4090	uint32_t value;
4091	/ Shift-In and Shift-Out byte sequences differ by encoding scheme. /
4092	uint8_t siBytes[`2`] = {`0`, `0`};
4093	uint8_t soBytes[`2`] = {`0`, `0`};
4094	uint8_t siLength, soLength;
4095	int32_t length = `0`, prevLength;
4096	uint8_t unicodeMask;
4097
4098	cnv=pArgs->converter;
4099
4100	if(cnv->preFromUFirstCP>=`0`) {
4101	/*
4102	* pass sourceIndex=-1 because we continue from an earlier buffer
4103	* in the future, this may change with continuous offsets
4104	*/
4105	ucnv_extContinueMatchFromU(cnv, pArgs, -`1`, pErrorCode);
4106
4107	if(U_FAILURE(*pErrorCode) \|\| cnv->preFromULength<`0`) {
4108	return;
4109	}
4110	}
4111
4112	/ use optimized function if possible /
4113	outputType=cnv->sharedData->mbcs.outputType;
4114	unicodeMask=cnv->sharedData->mbcs.unicodeMask;
4115	if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
4116	if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4117	ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
4118	} else {
4119	ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
4120	}
4121	return;
4122	} else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
4123	ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
4124	return;
4125	}
4126
4127	/ set up the local pointers /
4128	source=pArgs->source;
4129	sourceLimit=pArgs->sourceLimit;
4130	target=(uint8_t *)pArgs->target;
4131	targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
4132	offsets=pArgs->offsets;
4133
4134	table=cnv->sharedData->mbcs.fromUnicodeTable;
4135	if(cnv->sharedData->mbcs.utf8Friendly) {
4136	mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
4137	} else {
4138	mbcsIndex=NULL;
4139	}
4140	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=`0`) {
4141	bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
4142	} else {
4143	bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
4144	}
4145	asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
4146
4147	/ get the converter state from UConverter /
4148	c=cnv->fromUChar32;
4149
4150	if(outputType==MBCS_OUTPUT_2_SISO) {
4151	prevLength=cnv->fromUnicodeStatus;
4152	if(prevLength==`0`) {
4153	/ set the real value /
4154	prevLength=`1`;
4155	}
4156	} else {
4157	/ prevent fromUnicodeStatus from being set to something non-0 /
4158	prevLength=`0`;
4159	}
4160
4161	/ sourceIndex=-1 if the current character began in the previous buffer /
4162	prevSourceIndex=-`1`;
4163	sourceIndex= c==`0` ? `0` : -`1`;
4164	nextSourceIndex=`0`;
4165
4166	/ Get the SI/SO character for the converter /
4167	siLength = static_cast<uint8_t>(getSISOBytes(SI, cnv->options, siBytes));
4168	soLength = static_cast<uint8_t>(getSISOBytes(SO, cnv->options, soBytes));
4169
4170	/ conversion loop /
4171	/*
4172	* This is another piece of ugly code:
4173	* A goto into the loop if the converter state contains a first surrogate
4174	* from the previous function call.
4175	* It saves me to check in each loop iteration a check of if(c==0)
4176	* and duplicating the trail-surrogate-handling code in the else
4177	* branch of that check.
4178	* I could not find any other way to get around this other than
4179	* using a function call for the conversion and callback, which would
4180	* be even more inefficient.
4181	*
4182	* Markus Scherer 2000-jul-19
4183	*/
4184	if(c!=`0` && targetCapacity>`0`) {
4185	goto getTrail;
4186	}
4187
4188	while(source<sourceLimit) {
4189	/*
4190	* This following test is to see if available input would overflow the output.
4191	* It does not catch output of more than one byte that
4192	* overflows as a result of a multi-byte character or callback output
4193	* from the last source character.
4194	* Therefore, those situations also test for overflows and will
4195	* then break the loop, too.
4196	*/
4197	if(targetCapacity>`0`) {
4198	/*
4199	* Get a correct Unicode code point:
4200	* a single UChar for a BMP code point or
4201	* a matched surrogate pair for a "supplementary code point".
4202	*/
4203	c=*source++;
4204	++nextSourceIndex;
4205	if(c<=`0x7f` && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
4206	*target++=(uint8_t)c;
4207	if(offsets!=NULL) {
4208	*offsets++=sourceIndex;
4209	prevSourceIndex=sourceIndex;
4210	sourceIndex=nextSourceIndex;
4211	}
4212	--targetCapacity;
4213	c=`0`;
4214	continue;
4215	}
4216	/*
4217	* utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
4218	* to avoid dealing with surrogates.
4219	* MBCS_FAST_MAX must be >=0xd7ff.
4220	*/
4221	if(c<=`0xd7ff` && mbcsIndex!=NULL) {
4222	value=mbcsIndex[c>>`6`];
4223
4224	/ get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) /
4225	/ There are only roundtrips (!=0) and no-mapping (==0) entries. /
4226	switch(outputType) {
4227	case MBCS_OUTPUT_2:
4228	value=((const uint16_t *)bytes)[value +(c&`0x3f`)];
4229	if(value<=`0xff`) {
4230	if(value==`0`) {
4231	goto unassigned;
4232	} else {
4233	length=`1`;
4234	}
4235	} else {
4236	length=`2`;
4237	}
4238	break;
4239	case MBCS_OUTPUT_2_SISO:
4240	/ 1/2-byte stateful with Shift-In/Shift-Out /
4241	/*
4242	* Save the old state in the converter object
4243	* right here, then change the local prevLength state variable if necessary.
4244	* Then, if this character turns out to be unassigned or a fallback that
4245	* is not taken, the callback code must not save the new state in the converter
4246	* because the new state is for a character that is not output.
4247	* However, the callback must still restore the state from the converter
4248	* in case the callback function changed it for its output.
4249	*/
4250	cnv->fromUnicodeStatus=prevLength; / save the old state /
4251	value=((const uint16_t *)bytes)[value +(c&`0x3f`)];
4252	if(value<=`0xff`) {
4253	if(value==`0`) {
4254	goto unassigned;
4255	} else if(prevLength<=`1`) {
4256	length=`1`;
4257	} else {
4258	/ change from double-byte mode to single-byte /
4259	if (siLength == `1`) {
4260	value\|=(uint32_t)siBytes[`0`]<<`8`;
4261	length = `2`;
4262	} else if (siLength == `2`) {
4263	value\|=(uint32_t)siBytes[`1`]<<`8`;
4264	value\|=(uint32_t)siBytes[`0`]<<`16`;
4265	length = `3`;
4266	}
4267	prevLength=`1`;
4268	}
4269	} else {
4270	if(prevLength==`2`) {
4271	length=`2`;
4272	} else {
4273	/ change from single-byte mode to double-byte /
4274	if (soLength == `1`) {
4275	value\|=(uint32_t)soBytes[`0`]<<`16`;
4276	length = `3`;
4277	} else if (soLength == `2`) {
4278	value\|=(uint32_t)soBytes[`1`]<<`16`;
4279	value\|=(uint32_t)soBytes[`0`]<<`24`;
4280	length = `4`;
4281	}
4282	prevLength=`2`;
4283	}
4284	}
4285	break;
4286	case MBCS_OUTPUT_DBCS_ONLY:
4287	/ table with single-byte results, but only DBCS mappings used /
4288	value=((const uint16_t *)bytes)[value +(c&`0x3f`)];
4289	if(value<=`0xff`) {
4290	/ no mapping or SBCS result, not taken for DBCS-only /
4291	goto unassigned;
4292	} else {
4293	length=`2`;
4294	}
4295	break;
4296	case MBCS_OUTPUT_3:
4297	p=bytes+(value+(c&`0x3f`))*`3`;
4298	value=((uint32_t)*p<<`16`)\|((uint32_t)p[`1`]<<`8`)\|p[`2`];
4299	if(value<=`0xff`) {
4300	if(value==`0`) {
4301	goto unassigned;
4302	} else {
4303	length=`1`;
4304	}
4305	} else if(value<=`0xffff`) {
4306	length=`2`;
4307	} else {
4308	length=`3`;
4309	}
4310	break;
4311	case MBCS_OUTPUT_4:
4312	value=((const uint32_t *)bytes)[value +(c&`0x3f`)];
4313	if(value<=`0xff`) {
4314	if(value==`0`) {
4315	goto unassigned;
4316	} else {
4317	length=`1`;
4318	}
4319	} else if(value<=`0xffff`) {
4320	length=`2`;
4321	} else if(value<=`0xffffff`) {
4322	length=`3`;
4323	} else {
4324	length=`4`;
4325	}
4326	break;
4327	case MBCS_OUTPUT_3_EUC:
4328	value=((const uint16_t *)bytes)[value +(c&`0x3f`)];
4329	/ EUC 16-bit fixed-length representation /
4330	if(value<=`0xff`) {
4331	if(value==`0`) {
4332	goto unassigned;
4333	} else {
4334	length=`1`;
4335	}
4336	} else if((value&`0x8000`)==`0`) {
4337	value\|=`0x8e8000`;
4338	length=`3`;
4339	} else if((value&`0x80`)==`0`) {
4340	value\|=`0x8f0080`;
4341	length=`3`;
4342	} else {
4343	length=`2`;
4344	}
4345	break;
4346	case MBCS_OUTPUT_4_EUC:
4347	p=bytes+(value+(c&`0x3f`))*`3`;
4348	value=((uint32_t)*p<<`16`)\|((uint32_t)p[`1`]<<`8`)\|p[`2`];
4349	/ EUC 16-bit fixed-length representation applied to the first two bytes /
4350	if(value<=`0xff`) {
4351	if(value==`0`) {
4352	goto unassigned;
4353	} else {
4354	length=`1`;
4355	}
4356	} else if(value<=`0xffff`) {
4357	length=`2`;
4358	} else if((value&`0x800000`)==`0`) {
4359	value\|=`0x8e800000`;
4360	length=`4`;
4361	} else if((value&`0x8000`)==`0`) {
4362	value\|=`0x8f008000`;
4363	length=`4`;
4364	} else {
4365	length=`3`;
4366	}
4367	break;
4368	default:
4369	/ must not occur /
4370	/*
4371	* To avoid compiler warnings that value & length may be
4372	* used without having been initialized, we set them here.
4373	* In reality, this is unreachable code.
4374	* Not having a default branch also causes warnings with
4375	* some compilers.
4376	*/
4377	value=`0`;
4378	length=`0`;
4379	break;
4380	}
4381	/ output the value /
4382	} else {
4383	/*
4384	* This also tests if the codepage maps single surrogates.
4385	* If it does, then surrogates are not paired but mapped separately.
4386	* Note that in this case unmatched surrogates are not detected.
4387	*/
4388	if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
4389	if(U16_IS_SURROGATE_LEAD(c)) {
4390	getTrail:
4391	if(source<sourceLimit) {
4392	/ test the following code unit /
4393	UChar trail=*source;
4394	if(U16_IS_TRAIL(trail)) {
4395	++source;
4396	++nextSourceIndex;
4397	c=U16_GET_SUPPLEMENTARY(c, trail);
4398	if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4399	/ BMP-only codepages are stored without stage 1 entries for supplementary code points /
4400	cnv->fromUnicodeStatus=prevLength; / save the old state /
4401	/ callback(unassigned) /
4402	goto unassigned;
4403	}
4404	/ convert this supplementary code point /
4405	/ exit this condition tree /
4406	} else {
4407	/ this is an unmatched lead code unit (1st surrogate) /
4408	/ callback(illegal) /
4409	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
4410	break;
4411	}
4412	} else {
4413	/ no more input /
4414	break;
4415	}
4416	} else {
4417	/ this is an unmatched trail code unit (2nd surrogate) /
4418	/ callback(illegal) /
4419	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
4420	break;
4421	}
4422	}
4423
4424	/ convert the Unicode code point in c into codepage bytes /
4425
4426	/*
4427	* The basic lookup is a triple-stage compact array (trie) lookup.
4428	* For details see the beginning of this file.
4429	*
4430	* Single-byte codepages are handled with a different data structure
4431	* by _MBCSSingle... functions.
4432	*
4433	* The result consists of a 32-bit value from stage 2 and
4434	* a pointer to as many bytes as are stored per character.
4435	* The pointer points to the character's bytes in stage 3.
4436	* Bits 15..0 of the stage 2 entry contain the stage 3 index
4437	* for that pointer, while bits 31..16 are flags for which of
4438	* the 16 characters in the block are roundtrip-assigned.
4439	*
4440	* For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
4441	* respectively as uint32_t, in the platform encoding.
4442	* For 3-byte codepages, the bytes are always stored in big-endian order.
4443	*
4444	* For EUC encodings that use only either 0x8e or 0x8f as the first
4445	* byte of their longest byte sequences, the first two bytes in
4446	* this third stage indicate with their 7th bits whether these bytes
4447	* are to be written directly or actually need to be preceeded by
4448	* one of the two Single-Shift codes. With this, the third stage
4449	* stores one byte fewer per character than the actual maximum length of
4450	* EUC byte sequences.
4451	*
4452	* Other than that, leading zero bytes are removed and the other
4453	* bytes output. A single zero byte may be output if the "assigned"
4454	* bit in stage 2 was on.
4455	* The data structure does not support zero byte output as a fallback,
4456	* and also does not allow output of leading zeros.
4457	*/
4458	stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
4459
4460	/ get the bytes and the length for the output /
4461	switch(outputType) {
4462	case MBCS_OUTPUT_2:
4463	value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4464	if(value<=`0xff`) {
4465	length=`1`;
4466	} else {
4467	length=`2`;
4468	}
4469	break;
4470	case MBCS_OUTPUT_2_SISO:
4471	/ 1/2-byte stateful with Shift-In/Shift-Out /
4472	/*
4473	* Save the old state in the converter object
4474	* right here, then change the local prevLength state variable if necessary.
4475	* Then, if this character turns out to be unassigned or a fallback that
4476	* is not taken, the callback code must not save the new state in the converter
4477	* because the new state is for a character that is not output.
4478	* However, the callback must still restore the state from the converter
4479	* in case the callback function changed it for its output.
4480	*/
4481	cnv->fromUnicodeStatus=prevLength; / save the old state /
4482	value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4483	if(value<=`0xff`) {
4484	if(value==`0` && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==`0`) {
4485	/ no mapping, leave value==0 /
4486	length=`0`;
4487	} else if(prevLength<=`1`) {
4488	length=`1`;
4489	} else {
4490	/ change from double-byte mode to single-byte /
4491	if (siLength == `1`) {
4492	value\|=(uint32_t)siBytes[`0`]<<`8`;
4493	length = `2`;
4494	} else if (siLength == `2`) {
4495	value\|=(uint32_t)siBytes[`1`]<<`8`;
4496	value\|=(uint32_t)siBytes[`0`]<<`16`;
4497	length = `3`;
4498	}
4499	prevLength=`1`;
4500	}
4501	} else {
4502	if(prevLength==`2`) {
4503	length=`2`;
4504	} else {
4505	/ change from single-byte mode to double-byte /
4506	if (soLength == `1`) {
4507	value\|=(uint32_t)soBytes[`0`]<<`16`;
4508	length = `3`;
4509	} else if (soLength == `2`) {
4510	value\|=(uint32_t)soBytes[`1`]<<`16`;
4511	value\|=(uint32_t)soBytes[`0`]<<`24`;
4512	length = `4`;
4513	}
4514	prevLength=`2`;
4515	}
4516	}
4517	break;
4518	case MBCS_OUTPUT_DBCS_ONLY:
4519	/ table with single-byte results, but only DBCS mappings used /
4520	value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4521	if(value<=`0xff`) {
4522	/ no mapping or SBCS result, not taken for DBCS-only /
4523	value=stage2Entry=`0`; / stage2Entry=0 to reset roundtrip flags /
4524	length=`0`;
4525	} else {
4526	length=`2`;
4527	}
4528	break;
4529	case MBCS_OUTPUT_3:
4530	p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
4531	value=((uint32_t)*p<<`16`)\|((uint32_t)p[`1`]<<`8`)\|p[`2`];
4532	if(value<=`0xff`) {
4533	length=`1`;
4534	} else if(value<=`0xffff`) {
4535	length=`2`;
4536	} else {
4537	length=`3`;
4538	}
4539	break;
4540	case MBCS_OUTPUT_4:
4541	value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
4542	if(value<=`0xff`) {
4543	length=`1`;
4544	} else if(value<=`0xffff`) {
4545	length=`2`;
4546	} else if(value<=`0xffffff`) {
4547	length=`3`;
4548	} else {
4549	length=`4`;
4550	}
4551	break;
4552	case MBCS_OUTPUT_3_EUC:
4553	value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4554	/ EUC 16-bit fixed-length representation /
4555	if(value<=`0xff`) {
4556	length=`1`;
4557	} else if((value&`0x8000`)==`0`) {
4558	value\|=`0x8e8000`;
4559	length=`3`;
4560	} else if((value&`0x80`)==`0`) {
4561	value\|=`0x8f0080`;
4562	length=`3`;
4563	} else {
4564	length=`2`;
4565	}
4566	break;
4567	case MBCS_OUTPUT_4_EUC:
4568	p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
4569	value=((uint32_t)*p<<`16`)\|((uint32_t)p[`1`]<<`8`)\|p[`2`];
4570	/ EUC 16-bit fixed-length representation applied to the first two bytes /
4571	if(value<=`0xff`) {
4572	length=`1`;
4573	} else if(value<=`0xffff`) {
4574	length=`2`;
4575	} else if((value&`0x800000`)==`0`) {
4576	value\|=`0x8e800000`;
4577	length=`4`;
4578	} else if((value&`0x8000`)==`0`) {
4579	value\|=`0x8f008000`;
4580	length=`4`;
4581	} else {
4582	length=`3`;
4583	}
4584	break;
4585	default:
4586	/ must not occur /
4587	/*
4588	* To avoid compiler warnings that value & length may be
4589	* used without having been initialized, we set them here.
4590	* In reality, this is unreachable code.
4591	* Not having a default branch also causes warnings with
4592	* some compilers.
4593	*/
4594	value=stage2Entry=`0`; / stage2Entry=0 to reset roundtrip flags /
4595	length=`0`;
4596	break;
4597	}
4598
4599	/ is this code point assigned, or do we use fallbacks? /
4600	if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=`0` \|\|
4601	(UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=`0`))
4602	) {
4603	/*
4604	* We allow a 0 byte output if the "assigned" bit is set for this entry.
4605	* There is no way with this data structure for fallback output
4606	* to be a zero byte.
4607	*/
4608
4609	unassigned:
4610	/ try an extension mapping /
4611	pArgs->source=source;
4612	c=_extFromU(cnv, cnv->sharedData,
4613	c, &source, sourceLimit,
4614	&target, target+targetCapacity,
4615	&offsets, sourceIndex,
4616	pArgs->flush,
4617	pErrorCode);
4618	nextSourceIndex+=(int32_t)(source-pArgs->source);
4619	prevLength=cnv->fromUnicodeStatus; / restore SISO state /
4620
4621	if(U_FAILURE(*pErrorCode)) {
4622	/ not mappable or buffer overflow /
4623	break;
4624	} else {
4625	/ a mapping was written to the target, continue /
4626
4627	/ recalculate the targetCapacity after an extension mapping /
4628	targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
4629
4630	/ normal end of conversion: prepare for a new character /
4631	if(offsets!=NULL) {
4632	prevSourceIndex=sourceIndex;
4633	sourceIndex=nextSourceIndex;
4634	}
4635	continue;
4636	}
4637	}
4638	}
4639
4640	/ write the output character bytes from value and length /
4641	/ from the first if in the loop we know that targetCapacity>0 /
4642	if(length<=targetCapacity) {
4643	if(offsets==NULL) {
4644	switch(length) {
4645	/ each branch falls through to the next one /
4646	case `4`:
4647	*target++=(uint8_t)(value>>`24`);
4648	U_FALLTHROUGH;
4649	case `3`:
4650	*target++=(uint8_t)(value>>`16`);
4651	U_FALLTHROUGH;
4652	case `2`:
4653	*target++=(uint8_t)(value>>`8`);
4654	U_FALLTHROUGH;
4655	case `1`:
4656	*target++=(uint8_t)value;
4657	U_FALLTHROUGH;
4658	default:
4659	/ will never occur /
4660	break;
4661	}
4662	} else {
4663	switch(length) {
4664	/ each branch falls through to the next one /
4665	case `4`:
4666	*target++=(uint8_t)(value>>`24`);
4667	*offsets++=sourceIndex;
4668	U_FALLTHROUGH;
4669	case `3`:
4670	*target++=(uint8_t)(value>>`16`);
4671	*offsets++=sourceIndex;
4672	U_FALLTHROUGH;
4673	case `2`:
4674	*target++=(uint8_t)(value>>`8`);
4675	*offsets++=sourceIndex;
4676	U_FALLTHROUGH;
4677	case `1`:
4678	*target++=(uint8_t)value;
4679	*offsets++=sourceIndex;
4680	U_FALLTHROUGH;
4681	default:
4682	/ will never occur /
4683	break;
4684	}
4685	}
4686	targetCapacity-=length;
4687	} else {
4688	uint8_t *charErrorBuffer;
4689
4690	/*
4691	* We actually do this backwards here:
4692	* In order to save an intermediate variable, we output
4693	* first to the overflow buffer what does not fit into the
4694	* regular target.
4695	*/
4696	/ we know that 1<=targetCapacity<length<=4 /
4697	length-=targetCapacity;
4698	charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
4699	switch(length) {
4700	/ each branch falls through to the next one /
4701	case `3`:
4702	*charErrorBuffer++=(uint8_t)(value>>`16`);
4703	U_FALLTHROUGH;
4704	case `2`:
4705	*charErrorBuffer++=(uint8_t)(value>>`8`);
4706	U_FALLTHROUGH;
4707	case `1`:
4708	*charErrorBuffer=(uint8_t)value;
4709	U_FALLTHROUGH;
4710	default:
4711	/ will never occur /
4712	break;
4713	}
4714	cnv->charErrorBufferLength=(int8_t)length;
4715
4716	/ now output what fits into the regular target /
4717	value>>=`8`length; /* length was reduced by targetCapacity /
4718	switch(targetCapacity) {
4719	/ each branch falls through to the next one /
4720	case `3`:
4721	*target++=(uint8_t)(value>>`16`);
4722	if(offsets!=NULL) {
4723	*offsets++=sourceIndex;
4724	}
4725	U_FALLTHROUGH;
4726	case `2`:
4727	*target++=(uint8_t)(value>>`8`);
4728	if(offsets!=NULL) {
4729	*offsets++=sourceIndex;
4730	}
4731	U_FALLTHROUGH;
4732	case `1`:
4733	*target++=(uint8_t)value;
4734	if(offsets!=NULL) {
4735	*offsets++=sourceIndex;
4736	}
4737	U_FALLTHROUGH;
4738	default:
4739	/ will never occur /
4740	break;
4741	}
4742
4743	/ target overflow /
4744	targetCapacity=`0`;
4745	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4746	c=`0`;
4747	break;
4748	}
4749
4750	/ normal end of conversion: prepare for a new character /
4751	c=`0`;
4752	if(offsets!=NULL) {
4753	prevSourceIndex=sourceIndex;
4754	sourceIndex=nextSourceIndex;
4755	}
4756	continue;
4757	} else {
4758	/ target is full /
4759	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4760	break;
4761	}
4762	}
4763
4764	/*
4765	* the end of the input stream and detection of truncated input
4766	* are handled by the framework, but for EBCDIC_STATEFUL conversion
4767	* we need to emit an SI at the very end
4768	*
4769	* conditions:
4770	* successful
4771	* EBCDIC_STATEFUL in DBCS mode
4772	* end of input and no truncated input
4773	*/
4774	if( U_SUCCESS(*pErrorCode) &&
4775	outputType==MBCS_OUTPUT_2_SISO && prevLength==`2` &&
4776	pArgs->flush && source>=sourceLimit && c==`0`
4777	) {
4778	/ EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS /
4779	if(targetCapacity>`0`) {
4780	*target++=(uint8_t)siBytes[`0`];
4781	if (siLength == `2`) {
4782	if (targetCapacity<`2`) {
4783	cnv->charErrorBuffer[`0`]=(uint8_t)siBytes[`1`];
4784	cnv->charErrorBufferLength=`1`;
4785	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4786	} else {
4787	*target++=(uint8_t)siBytes[`1`];
4788	}
4789	}
4790	if(offsets!=NULL) {
4791	/ set the last source character's index (sourceIndex points at sourceLimit now) /
4792	*offsets++=prevSourceIndex;
4793	}
4794	} else {
4795	/ target is full /
4796	cnv->charErrorBuffer[`0`]=(uint8_t)siBytes[`0`];
4797	if (siLength == `2`) {
4798	cnv->charErrorBuffer[`1`]=(uint8_t)siBytes[`1`];
4799	}
4800	cnv->charErrorBufferLength=siLength;
4801	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4802	}
4803	prevLength=`1`; / we switched into SBCS /
4804	}
4805
4806	/ set the converter state back into UConverter /
4807	cnv->fromUChar32=c;
4808	cnv->fromUnicodeStatus=prevLength;
4809
4810	/ write back the updated pointers /
4811	pArgs->source=source;
4812	pArgs->target=(char *)target;
4813	pArgs->offsets=offsets;
4814	}
4815
4816	/*
4817	* This is another simple conversion function for internal use by other
4818	* conversion implementations.
4819	* It does not use the converter state nor call callbacks.
4820	* It does not handle the EBCDIC swaplfnl option (set in UConverter).
4821	* It handles conversion extensions but not GB 18030.
4822	*
4823	* It converts one single Unicode code point into codepage bytes, encoded
4824	* as one 32-bit value. The function returns the number of bytes in *pValue:
4825	* 1..4 the number of bytes in *pValue
4826	* 0 unassigned (*pValue undefined)
4827	* -1 illegal (currently not used, *pValue undefined)
4828	*
4829	* *pValue will contain the resulting bytes with the last byte in bits 7..0,
4830	* the second to last byte in bits 15..8, etc.
4831	* Currently, the function assumes but does not check that 0<=c<=0x10ffff.
4832	*/
4833	U_CFUNC int32_t
4834	ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,
4835	UChar32 c, uint32_t *pValue,
4836	UBool useFallback) {
4837	const int32_t *cx;
4838	const uint16_t *table;
4839	#if 0
4840	/ #if 0 because this is not currently used in ICU - reduce code, increase code coverage /
4841	const uint8_t *p;
4842	#endif
4843	uint32_t stage2Entry;
4844	uint32_t value;
4845	int32_t length;
4846
4847	/ BMP-only codepages are stored without stage 1 entries for supplementary code points /
4848	if(c<=`0xffff` \|\| (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4849	table=sharedData->mbcs.fromUnicodeTable;
4850
4851	/ convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) /
4852	if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {
4853	value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
4854	/ is this code point assigned, or do we use fallbacks? /
4855	if(useFallback ? value>=`0x800` : value>=`0xc00`) {
4856	*pValue=value&`0xff`;
4857	return `1`;
4858	}
4859	} else / outputType!=MBCS_OUTPUT_1 / {
4860	stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
4861
4862	/ get the bytes and the length for the output /
4863	switch(sharedData->mbcs.outputType) {
4864	case MBCS_OUTPUT_2:
4865	value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4866	if(value<=`0xff`) {
4867	length=`1`;
4868	} else {
4869	length=`2`;
4870	}
4871	break;
4872	#if 0
4873	/ #if 0 because this is not currently used in ICU - reduce code, increase code coverage /
4874	case MBCS_OUTPUT_DBCS_ONLY:
4875	/ table with single-byte results, but only DBCS mappings used /
4876	value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4877	if(value<=`0xff`) {
4878	/ no mapping or SBCS result, not taken for DBCS-only /
4879	value=stage2Entry=`0`; / stage2Entry=0 to reset roundtrip flags /
4880	length=`0`;
4881	} else {
4882	length=`2`;
4883	}
4884	break;
4885	case MBCS_OUTPUT_3:
4886	p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4887	value=((uint32_t)*p<<`16`)\|((uint32_t)p[`1`]<<`8`)\|p[`2`];
4888	if(value<=`0xff`) {
4889	length=`1`;
4890	} else if(value<=`0xffff`) {
4891	length=`2`;
4892	} else {
4893	length=`3`;
4894	}
4895	break;
4896	case MBCS_OUTPUT_4:
4897	value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4898	if(value<=`0xff`) {
4899	length=`1`;
4900	} else if(value<=`0xffff`) {
4901	length=`2`;
4902	} else if(value<=`0xffffff`) {
4903	length=`3`;
4904	} else {
4905	length=`4`;
4906	}
4907	break;
4908	case MBCS_OUTPUT_3_EUC:
4909	value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4910	/ EUC 16-bit fixed-length representation /
4911	if(value<=`0xff`) {
4912	length=`1`;
4913	} else if((value&`0x8000`)==`0`) {
4914	value\|=`0x8e8000`;
4915	length=`3`;
4916	} else if((value&`0x80`)==`0`) {
4917	value\|=`0x8f0080`;
4918	length=`3`;
4919	} else {
4920	length=`2`;
4921	}
4922	break;
4923	case MBCS_OUTPUT_4_EUC:
4924	p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4925	value=((uint32_t)*p<<`16`)\|((uint32_t)p[`1`]<<`8`)\|p[`2`];
4926	/ EUC 16-bit fixed-length representation applied to the first two bytes /
4927	if(value<=`0xff`) {
4928	length=`1`;
4929	} else if(value<=`0xffff`) {
4930	length=`2`;
4931	} else if((value&`0x800000`)==`0`) {
4932	value\|=`0x8e800000`;
4933	length=`4`;
4934	} else if((value&`0x8000`)==`0`) {
4935	value\|=`0x8f008000`;
4936	length=`4`;
4937	} else {
4938	length=`3`;
4939	}
4940	break;
4941	#endif
4942	default:
4943	/ must not occur /
4944	return -`1`;
4945	}
4946
4947	/ is this code point assigned, or do we use fallbacks? /
4948	if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) \|\|
4949	(FROM_U_USE_FALLBACK(useFallback, c) && value!=`0`)
4950	) {
4951	/*
4952	* We allow a 0 byte output if the "assigned" bit is set for this entry.
4953	* There is no way with this data structure for fallback output
4954	* to be a zero byte.
4955	*/
4956	/ assigned /
4957	*pValue=value;
4958	return length;
4959	}
4960	}
4961	}
4962
4963	cx=sharedData->mbcs.extIndexes;
4964	if(cx!=NULL) {
4965	length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
4966	return length>=`0` ? length : -length; / return abs(length); /
4967	}
4968
4969	/ unassigned /
4970	return `0`;
4971	}
4972
4973
#if 0
/*
 * This function has been moved to ucnv2022.c for inlining.
 * This implementation is here only for documentation purposes
 * (it is compiled out by the surrounding #if 0).
 */

/**
 * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages.
 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
 * It does not handle conversion extensions (_extFromU()).
 *
 * @param sharedData  shared converter data holding the from-Unicode tables
 * @param c           Unicode code point to convert
 * @param useFallback if true, fallback results (>=0x800) are accepted;
 *                    otherwise only results >=0xc00 are accepted
 * @return the codepage byte for the code point, or -1 if it is unassigned
 */
U_CFUNC int32_t
ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
                       UChar32 c,
                       UBool useFallback) {
    const uint16_t *table;
    int32_t value;

    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
        return -1;
    }

    /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
    table=sharedData->mbcs.fromUnicodeTable;

    /* get the byte for the output */
    value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
    /* is this code point assigned, or do we use fallbacks? */
    if(useFallback ? value>=0x800 : value>=0xc00) {
        /* the low 8 bits of the result word are the output byte */
        return value&0xff;
    } else {
        return -1;
    }
}
#endif
5012
/* MBCS-from-UTF-8 conversion functions ------------------------------------- */
5014
5015	/ offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... /
5016	static const UChar32
5017	utf8_offsets[`5`]={ `0`, `0`, `0x3080`, `0xE2080`, `0x3C82080` };
5018
5019	static void U_CALLCONV
5020	ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
5021	UConverterToUnicodeArgs *pToUArgs,
5022	UErrorCode *pErrorCode) {
5023	UConverter utf8, cnv;
5024	const uint8_t source, sourceLimit;
5025	uint8_t *target;
5026	int32_t targetCapacity;
5027
5028	const uint16_t table, sbcsIndex;
5029	const uint16_t *results;
5030
5031	int8_t oldToULength, toULength, toULimit;
5032
5033	UChar32 c;
5034	uint8_t b, t1, t2;
5035
5036	uint32_t asciiRoundtrips;
5037	uint16_t value, minValue = `0`;
5038	UBool hasSupplementary;
5039
5040	/ set up the local pointers /
5041	utf8=pToUArgs->converter;
5042	cnv=pFromUArgs->converter;
5043	source=(uint8_t *)pToUArgs->source;
5044	sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
5045	target=(uint8_t *)pFromUArgs->target;
5046	targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
5047
5048	table=cnv->sharedData->mbcs.fromUnicodeTable;
5049	sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
5050	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=`0`) {
5051	results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
5052	} else {
5053	results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
5054	}
5055	asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
5056
5057	if(cnv->useFallback) {
5058	/ use all roundtrip and fallback results /
5059	minValue=`0x800`;
5060	} else {
5061	/ use only roundtrips and fallbacks from private-use characters /
5062	minValue=`0xc00`;
5063	}
5064	hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
5065
5066	/ get the converter state from the UTF-8 UConverter /
5067	if(utf8->toULength > `0`) {
5068	toULength=oldToULength=utf8->toULength;
5069	toULimit=(int8_t)utf8->mode;
5070	c=(UChar32)utf8->toUnicodeStatus;
5071	} else {
5072	toULength=oldToULength=toULimit=`0`;
5073	c = `0`;
5074	}
5075
5076	// The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
5077	// If the buffer ends with a truncated 2- or 3-byte sequence,
5078	// then we reduce the sourceLimit to before that,
5079	// and collect the remaining bytes after the conversion loop.
5080	{
5081	// Do not go back into the bytes that will be read for finishing a partial
5082	// sequence from the previous buffer.
5083	int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
5084	if(length>`0`) {
5085	uint8_t b1=*(sourceLimit-`1`);
5086	if(U8_IS_SINGLE(b1)) {
5087	// common ASCII character
5088	} else if(U8_IS_TRAIL(b1) && length>=`2`) {
5089	uint8_t b2=*(sourceLimit-`2`);
5090	if(`0xe0`<=b2 && b2<`0xf0` && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
5091	// truncated 3-byte sequence
5092	sourceLimit-=`2`;
5093	}
5094	} else if(`0xc2`<=b1 && b1<`0xf0`) {
5095	// truncated 2- or 3-byte sequence
5096	--sourceLimit;
5097	}
5098	}
5099	}
5100
5101	if(c!=`0` && targetCapacity>`0`) {
5102	utf8->toUnicodeStatus=`0`;
5103	utf8->toULength=`0`;
5104	goto moreBytes;
5105	/*
5106	* Note: We could avoid the goto by duplicating some of the moreBytes
5107	* code, but only up to the point of collecting a complete UTF-8
5108	* sequence; then recurse for the toUBytes[toULength]
5109	* and then continue with normal conversion.
5110	*
5111	* If so, move this code to just after initializing the minimum
5112	* set of local variables for reading the UTF-8 input
5113	* (utf8, source, target, limits but not cnv, table, minValue, etc.).
5114	*
5115	* Potential advantages:
5116	* - avoid the goto
5117	* - oldToULength could become a local variable in just those code blocks
5118	* that deal with buffer boundaries
5119	* - possibly faster if the goto prevents some compiler optimizations
5120	* (this would need measuring to confirm)
5121	* Disadvantage:
5122	* - code duplication
5123	*/
5124	}
5125
5126	/ conversion loop /
5127	while(source<sourceLimit) {
5128	if(targetCapacity>`0`) {
5129	b=*source++;
5130	if(U8_IS_SINGLE(b)) {
5131	/ convert ASCII /
5132	if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
5133	*target++=(uint8_t)b;
5134	--targetCapacity;
5135	continue;
5136	} else {
5137	c=b;
5138	value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, `0`, c);
5139	}
5140	} else {
5141	if(b<`0xe0`) {
5142	if( / handle U+0080..U+07FF inline /
5143	b>=`0xc2` &&
5144	(t1=(uint8_t)(*source-`0x80`)) <= `0x3f`
5145	) {
5146	c=b&`0x1f`;
5147	++source;
5148	value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
5149	if(value>=minValue) {
5150	*target++=(uint8_t)value;
5151	--targetCapacity;
5152	continue;
5153	} else {
5154	c=(c<<`6`)\|t1;
5155	}
5156	} else {
5157	c=-`1`;
5158	}
5159	} else if(b==`0xe0`) {
5160	if( / handle U+0800..U+0FFF inline /
5161	(t1=(uint8_t)(source[`0`]-`0x80`)) <= `0x3f` && t1 >= `0x20` &&
5162	(t2=(uint8_t)(source[`1`]-`0x80`)) <= `0x3f`
5163	) {
5164	c=t1;
5165	source+=`2`;
5166	value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
5167	if(value>=minValue) {
5168	*target++=(uint8_t)value;
5169	--targetCapacity;
5170	continue;
5171	} else {
5172	c=(c<<`6`)\|t2;
5173	}
5174	} else {
5175	c=-`1`;
5176	}
5177	} else {
5178	c=-`1`;
5179	}
5180
5181	if(c<`0`) {
5182	/ handle "complicated" and error cases, and continuing partial characters /
5183	oldToULength=`0`;
5184	toULength=`1`;
5185	toULimit=U8_COUNT_BYTES_NON_ASCII(b);
5186	c=b;
5187	moreBytes:
5188	while(toULength<toULimit) {
5189	/*
5190	* The sourceLimit may have been adjusted before the conversion loop
5191	* to stop before a truncated sequence.
5192	* Here we need to use the real limit in case we have two truncated
5193	* sequences at the end.
5194	* See ticket #7492.
5195	*/
5196	if(source<(uint8_t *)pToUArgs->sourceLimit) {
5197	b=*source;
5198	if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
5199	++source;
5200	++toULength;
5201	c=(c<<`6`)+b;
5202	} else {
5203	break; / sequence too short, stop with toULength<toULimit /
5204	}
5205	} else {
5206	/ store the partial UTF-8 character, compatible with the regular UTF-8 converter /
5207	source-=(toULength-oldToULength);
5208	while(oldToULength<toULength) {
5209	utf8->toUBytes[oldToULength++]=*source++;
5210	}
5211	utf8->toUnicodeStatus=c;
5212	utf8->toULength=toULength;
5213	utf8->mode=toULimit;
5214	pToUArgs->source=(char *)source;
5215	pFromUArgs->target=(char *)target;
5216	return;
5217	}
5218	}
5219
5220	if(toULength==toULimit) {
5221	c-=utf8_offsets[toULength];
5222	if(toULength<=`3`) { / BMP /
5223	value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
5224	} else {
5225	/ supplementary code point /
5226	if(!hasSupplementary) {
5227	/ BMP-only codepages are stored without stage 1 entries for supplementary code points /
5228	value=`0`;
5229	} else {
5230	value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
5231	}
5232	}
5233	} else {
5234	/ error handling: illegal UTF-8 byte sequence /
5235	source-=(toULength-oldToULength);
5236	while(oldToULength<toULength) {
5237	utf8->toUBytes[oldToULength++]=*source++;
5238	}
5239	utf8->toULength=toULength;
5240	pToUArgs->source=(char *)source;
5241	pFromUArgs->target=(char *)target;
5242	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
5243	return;
5244	}
5245	}
5246	}
5247
5248	if(value>=minValue) {
5249	/ output the mapping for c /
5250	*target++=(uint8_t)value;
5251	--targetCapacity;
5252	} else {
5253	/ value<minValue means c is unassigned (unmappable) /
5254	/*
5255	* Try an extension mapping.
5256	* Pass in no source because we don't have UTF-16 input.
5257	* If we have a partial match on c, we will return and revert
5258	* to UTF-8->UTF-16->charset conversion.
5259	*/
5260	static const UChar nul=`0`;
5261	const UChar *noSource=&nul;
5262	c=_extFromU(cnv, cnv->sharedData,
5263	c, &noSource, noSource,
5264	&target, target+targetCapacity,
5265	NULL, -`1`,
5266	pFromUArgs->flush,
5267	pErrorCode);
5268
5269	if(U_FAILURE(*pErrorCode)) {
5270	/ not mappable or buffer overflow /
5271	cnv->fromUChar32=c;
5272	break;
5273	} else if(cnv->preFromUFirstCP>=`0`) {
5274	/*
5275	* Partial match, return and revert to pivoting.
5276	* In normal from-UTF-16 conversion, we would just continue
5277	* but then exit the loop because the extension match would
5278	* have consumed the source.
5279	*/
5280	*pErrorCode=U_USING_DEFAULT_WARNING;
5281	break;
5282	} else {
5283	/ a mapping was written to the target, continue /
5284
5285	/ recalculate the targetCapacity after an extension mapping /
5286	targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
5287	}
5288	}
5289	} else {
5290	/ target is full /
5291	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5292	break;
5293	}
5294	}
5295
5296	/*
5297	* The sourceLimit may have been adjusted before the conversion loop
5298	* to stop before a truncated sequence.
5299	* If so, then collect the truncated sequence now.
5300	*/
5301	if(U_SUCCESS(*pErrorCode) &&
5302	cnv->preFromUFirstCP<`0` &&
5303	source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
5304	c=utf8->toUBytes[`0`]=b=*source++;
5305	toULength=`1`;
5306	toULimit=U8_COUNT_BYTES(b);
5307	while(source<sourceLimit) {
5308	utf8->toUBytes[toULength++]=b=*source++;
5309	c=(c<<`6`)+b;
5310	}
5311	utf8->toUnicodeStatus=c;
5312	utf8->toULength=toULength;
5313	utf8->mode=toULimit;
5314	}
5315
5316	/ write back the updated pointers /
5317	pToUArgs->source=(char *)source;
5318	pFromUArgs->target=(char *)target;
5319	}
5320
5321	static void U_CALLCONV
5322	ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
5323	UConverterToUnicodeArgs *pToUArgs,
5324	UErrorCode *pErrorCode) {
5325	UConverter utf8, cnv;
5326	const uint8_t source, sourceLimit;
5327	uint8_t *target;
5328	int32_t targetCapacity;
5329
5330	const uint16_t table, mbcsIndex;
5331	const uint16_t *results;
5332
5333	int8_t oldToULength, toULength, toULimit;
5334
5335	UChar32 c;
5336	uint8_t b, t1, t2;
5337
5338	uint32_t stage2Entry;
5339	uint32_t asciiRoundtrips;
5340	uint16_t value = `0`;
5341	UBool hasSupplementary;
5342
5343	/ set up the local pointers /
5344	utf8=pToUArgs->converter;
5345	cnv=pFromUArgs->converter;
5346	source=(uint8_t *)pToUArgs->source;
5347	sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
5348	target=(uint8_t *)pFromUArgs->target;
5349	targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
5350
5351	table=cnv->sharedData->mbcs.fromUnicodeTable;
5352	mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
5353	if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=`0`) {
5354	results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
5355	} else {
5356	results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
5357	}
5358	asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
5359
5360	hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
5361
5362	/ get the converter state from the UTF-8 UConverter /
5363	if(utf8->toULength > `0`) {
5364	toULength=oldToULength=utf8->toULength;
5365	toULimit=(int8_t)utf8->mode;
5366	c=(UChar32)utf8->toUnicodeStatus;
5367	} else {
5368	toULength=oldToULength=toULimit=`0`;
5369	c = `0`;
5370	}
5371
5372	// The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
5373	// If the buffer ends with a truncated 2- or 3-byte sequence,
5374	// then we reduce the sourceLimit to before that,
5375	// and collect the remaining bytes after the conversion loop.
5376	{
5377	// Do not go back into the bytes that will be read for finishing a partial
5378	// sequence from the previous buffer.
5379	int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
5380	if(length>`0`) {
5381	uint8_t b1=*(sourceLimit-`1`);
5382	if(U8_IS_SINGLE(b1)) {
5383	// common ASCII character
5384	} else if(U8_IS_TRAIL(b1) && length>=`2`) {
5385	uint8_t b2=*(sourceLimit-`2`);
5386	if(`0xe0`<=b2 && b2<`0xf0` && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
5387	// truncated 3-byte sequence
5388	sourceLimit-=`2`;
5389	}
5390	} else if(`0xc2`<=b1 && b1<`0xf0`) {
5391	// truncated 2- or 3-byte sequence
5392	--sourceLimit;
5393	}
5394	}
5395	}
5396
5397	if(c!=`0` && targetCapacity>`0`) {
5398	utf8->toUnicodeStatus=`0`;
5399	utf8->toULength=`0`;
5400	goto moreBytes;
5401	/ See note in ucnv_SBCSFromUTF8() about this goto. /
5402	}
5403
5404	/ conversion loop /
5405	while(source<sourceLimit) {
5406	if(targetCapacity>`0`) {
5407	b=*source++;
5408	if(U8_IS_SINGLE(b)) {
5409	/ convert ASCII /
5410	if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
5411	*target++=b;
5412	--targetCapacity;
5413	continue;
5414	} else {
5415	value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, `0`, b);
5416	if(value==`0`) {
5417	c=b;
5418	goto unassigned;
5419	}
5420	}
5421	} else {
5422	if(b>=`0xe0`) {
5423	if( / handle U+0800..U+D7FF inline /
5424	b<=`0xed` && // do not assume maxFastUChar>0xd7ff
5425	U8_IS_VALID_LEAD3_AND_T1(b, t1=source[`0`]) &&
5426	(t2=(uint8_t)(source[`1`]-`0x80`)) <= `0x3f`
5427	) {
5428	c=((b&`0xf`)<<`6`)\|(t1&`0x3f`);
5429	source+=`2`;
5430	value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
5431	if(value==`0`) {
5432	c=(c<<`6`)\|t2;
5433	goto unassigned;
5434	}
5435	} else {
5436	c=-`1`;
5437	}
5438	} else {
5439	if( / handle U+0080..U+07FF inline /
5440	b>=`0xc2` &&
5441	(t1=(uint8_t)(*source-`0x80`)) <= `0x3f`
5442	) {
5443	c=b&`0x1f`;
5444	++source;
5445	value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
5446	if(value==`0`) {
5447	c=(c<<`6`)\|t1;
5448	goto unassigned;
5449	}
5450	} else {
5451	c=-`1`;
5452	}
5453	}
5454
5455	if(c<`0`) {
5456	/ handle "complicated" and error cases, and continuing partial characters /
5457	oldToULength=`0`;
5458	toULength=`1`;
5459	toULimit=U8_COUNT_BYTES_NON_ASCII(b);
5460	c=b;
5461	moreBytes:
5462	while(toULength<toULimit) {
5463	/*
5464	* The sourceLimit may have been adjusted before the conversion loop
5465	* to stop before a truncated sequence.
5466	* Here we need to use the real limit in case we have two truncated
5467	* sequences at the end.
5468	* See ticket #7492.
5469	*/
5470	if(source<(uint8_t *)pToUArgs->sourceLimit) {
5471	b=*source;
5472	if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
5473	++source;
5474	++toULength;
5475	c=(c<<`6`)+b;
5476	} else {
5477	break; / sequence too short, stop with toULength<toULimit /
5478	}
5479	} else {
5480	/ store the partial UTF-8 character, compatible with the regular UTF-8 converter /
5481	source-=(toULength-oldToULength);
5482	while(oldToULength<toULength) {
5483	utf8->toUBytes[oldToULength++]=*source++;
5484	}
5485	utf8->toUnicodeStatus=c;
5486	utf8->toULength=toULength;
5487	utf8->mode=toULimit;
5488	pToUArgs->source=(char *)source;
5489	pFromUArgs->target=(char *)target;
5490	return;
5491	}
5492	}
5493
5494	if(toULength==toULimit) {
5495	c-=utf8_offsets[toULength];
5496	if(toULength<=`3`) { / BMP /
5497	stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
5498	} else {
5499	/ supplementary code point /
5500	if(!hasSupplementary) {
5501	/ BMP-only codepages are stored without stage 1 entries for supplementary code points /
5502	stage2Entry=`0`;
5503	} else {
5504	stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
5505	}
5506	}
5507	} else {
5508	/ error handling: illegal UTF-8 byte sequence /
5509	source-=(toULength-oldToULength);
5510	while(oldToULength<toULength) {
5511	utf8->toUBytes[oldToULength++]=*source++;
5512	}
5513	utf8->toULength=toULength;
5514	pToUArgs->source=(char *)source;
5515	pFromUArgs->target=(char *)target;
5516	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
5517	return;
5518	}
5519
5520	/ get the bytes and the length for the output /
5521	/ MBCS_OUTPUT_2 /
5522	value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);
5523
5524	/ is this code point assigned, or do we use fallbacks? /
5525	if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) \|\|
5526	(UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=`0`))
5527	) {
5528	goto unassigned;
5529	}
5530	}
5531	}
5532
5533	/ write the output character bytes from value and length /
5534	/ from the first if in the loop we know that targetCapacity>0 /
5535	if(value<=`0xff`) {
5536	/ this is easy because we know that there is enough space /
5537	*target++=(uint8_t)value;
5538	--targetCapacity;
5539	} else / length==2 / {
5540	*target++=(uint8_t)(value>>`8`);
5541	if(`2`<=targetCapacity) {
5542	*target++=(uint8_t)value;
5543	targetCapacity-=`2`;
5544	} else {
5545	cnv->charErrorBuffer[`0`]=(char)value;
5546	cnv->charErrorBufferLength=`1`;
5547
5548	/ target overflow /
5549	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5550	break;
5551	}
5552	}
5553	continue;
5554
5555	unassigned:
5556	{
5557	/*
5558	* Try an extension mapping.
5559	* Pass in no source because we don't have UTF-16 input.
5560	* If we have a partial match on c, we will return and revert
5561	* to UTF-8->UTF-16->charset conversion.
5562	*/
5563	static const UChar nul=`0`;
5564	const UChar *noSource=&nul;
5565	c=_extFromU(cnv, cnv->sharedData,
5566	c, &noSource, noSource,
5567	&target, target+targetCapacity,
5568	NULL, -`1`,
5569	pFromUArgs->flush,
5570	pErrorCode);
5571
5572	if(U_FAILURE(*pErrorCode)) {
5573	/ not mappable or buffer overflow /
5574	cnv->fromUChar32=c;
5575	break;
5576	} else if(cnv->preFromUFirstCP>=`0`) {
5577	/*
5578	* Partial match, return and revert to pivoting.
5579	* In normal from-UTF-16 conversion, we would just continue
5580	* but then exit the loop because the extension match would
5581	* have consumed the source.
5582	*/
5583	*pErrorCode=U_USING_DEFAULT_WARNING;
5584	break;
5585	} else {
5586	/ a mapping was written to the target, continue /
5587
5588	/ recalculate the targetCapacity after an extension mapping /
5589	targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
5590	continue;
5591	}
5592	}
5593	} else {
5594	/ target is full /
5595	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5596	break;
5597	}
5598	}
5599
5600	/*
5601	* The sourceLimit may have been adjusted before the conversion loop
5602	* to stop before a truncated sequence.
5603	* If so, then collect the truncated sequence now.
5604	*/
5605	if(U_SUCCESS(*pErrorCode) &&
5606	cnv->preFromUFirstCP<`0` &&
5607	source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
5608	c=utf8->toUBytes[`0`]=b=*source++;
5609	toULength=`1`;
5610	toULimit=U8_COUNT_BYTES(b);
5611	while(source<sourceLimit) {
5612	utf8->toUBytes[toULength++]=b=*source++;
5613	c=(c<<`6`)+b;
5614	}
5615	utf8->toUnicodeStatus=c;
5616	utf8->toULength=toULength;
5617	utf8->mode=toULimit;
5618	}
5619
5620	/ write back the updated pointers /
5621	pToUArgs->source=(char *)source;
5622	pFromUArgs->target=(char *)target;
5623	}
5624
/* miscellaneous ------------------------------------------------------------ */
5626
5627	static void U_CALLCONV
5628	ucnv_MBCSGetStarters(const UConverter* cnv,
5629	UBool starters[`256`],
5630	UErrorCode *) {
5631	const int32_t *state0;
5632	int i;
5633
5634	state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
5635	for(i=`0`; i<`256`; ++i) {
5636	/ all bytes that cause a state transition from state 0 are lead bytes /
5637	starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
5638	}
5639	}
5640
5641	/*
5642	* This is an internal function that allows other converter implementations
5643	* to check whether a byte is a lead byte.
5644	*/
5645	U_CFUNC UBool
5646	ucnv_MBCSIsLeadByte(UConverterSharedData sharedData, char* byte) {
5647	return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[`0`][(uint8_t)byte]);
5648	}
5649
5650	static void U_CALLCONV
5651	ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
5652	int32_t offsetIndex,
5653	UErrorCode *pErrorCode) {
5654	UConverter *cnv=pArgs->converter;
5655	char p, subchar;
5656	char buffer[`4`];
5657	int32_t length;
5658
5659	/ first, select between subChar and subChar1 /
5660	if( cnv->subChar1!=`0` &&
5661	(cnv->sharedData->mbcs.extIndexes!=NULL ?
5662	cnv->useSubChar1 :
5663	(cnv->invalidUCharBuffer[`0`]<=`0xff`))
5664	) {
5665	/ select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) /
5666	subchar=(char *)&cnv->subChar1;
5667	length=`1`;
5668	} else {
5669	/ select subChar in all other cases /
5670	subchar=(char *)cnv->subChars;
5671	length=cnv->subCharLen;
5672	}
5673
5674	/ reset the selector for the next code point /
5675	cnv->useSubChar1=FALSE;
5676
5677	if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) {
5678	p=buffer;
5679
5680	/ fromUnicodeStatus contains prevLength /
5681	switch(length) {
5682	case `1`:
5683	if(cnv->fromUnicodeStatus==`2`) {
5684	/ DBCS mode and SBCS sub char: change to SBCS /
5685	cnv->fromUnicodeStatus=`1`;
5686	*p++=UCNV_SI;
5687	}
5688	*p++=subchar[`0`];
5689	break;
5690	case `2`:
5691	if(cnv->fromUnicodeStatus<=`1`) {
5692	/ SBCS mode and DBCS sub char: change to DBCS /
5693	cnv->fromUnicodeStatus=`2`;
5694	*p++=UCNV_SO;
5695	}
5696	*p++=subchar[`0`];
5697	*p++=subchar[`1`];
5698	break;
5699	default:
5700	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
5701	return;
5702	}
5703	subchar=buffer;
5704	length=(int32_t)(p-buffer);
5705	}
5706
5707	ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode);
5708	}
5709
5710	U_CFUNC UConverterType
5711	ucnv_MBCSGetType(const UConverter* converter) {
5712	/ SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little /
5713	if(converter->sharedData->mbcs.countStates==`1`) {
5714	return (UConverterType)UCNV_SBCS;
5715	} else if((converter->sharedData->mbcs.outputType&`0xff`)==MBCS_OUTPUT_2_SISO) {
5716	return (UConverterType)UCNV_EBCDIC_STATEFUL;
5717	} else if(converter->sharedData->staticData->minBytesPerChar==`2` && converter->sharedData->staticData->maxBytesPerChar==`2`) {
5718	return (UConverterType)UCNV_DBCS;
5719	}
5720	return (UConverterType)UCNV_MBCS;
5721	}
5722
5723	#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
5724

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/ucnvmbcs.cpp