uloc.cpp source code [engine/third_party/icu/source/common/uloc.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 1997-2016, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	*
9	* File ULOC.CPP
10	*
11	* Modification History:
12	*
13	* Date Name Description
14	* 04/01/97 aliu Creation.
15	* 08/21/98 stephen JDK 1.2 sync
16	* 12/08/98 rtg New Locale implementation and C API
17	* 03/15/99 damiba overhaul.
18	* 04/06/99 stephen changed setDefault() to realloc and copy
19	* 06/14/99 stephen Changed calls to ures_open for new params
20	* 07/21/99 stephen Modified setDefault() to propagate to C++
21	* 05/14/04 alan 7 years later: refactored, cleaned up, fixed bugs,
22	* brought canonicalization code into line with spec
23	*****************************************************************************/
24
25	/*
26	POSIX's locale format, from putil.c: [no spaces]
27
28	ll [ _CC ] [ . MM ] [ @ VV]
29
30	l = lang, C = ctry, M = charmap, V = variant
31	*/
32
33	#include "unicode/bytestream.h"
34	#include "unicode/errorcode.h"
35	#include "unicode/stringpiece.h"
36	#include "unicode/utypes.h"
37	#include "unicode/ustring.h"
38	#include "unicode/uloc.h"
39
40	#include "bytesinkutil.h"
41	#include "putilimp.h"
42	#include "ustr_imp.h"
43	#include "ulocimp.h"
44	#include "umutex.h"
45	#include "cstring.h"
46	#include "cmemory.h"
47	#include "locmap.h"
48	#include "uarrsort.h"
49	#include "uenumimp.h"
50	#include "uassert.h"
51	#include "charstr.h"
52
53	#include <algorithm>
54	#include <stdio.h> /* for sprintf */
55
56	U_NAMESPACE_USE
57
/* ### Declarations **************************************************/
59
60	/ Locale stuff from locid.cpp /
61	U_CFUNC void locale_set_default(const char *id);
62	U_CFUNC const char locale_get_default(void*);
63	U_CFUNC int32_t
64	locale_getKeywords(const char *localeID,
65	char prev,
66	char *keywords, int32_t keywordCapacity,
67	UBool valuesToo,
68	UErrorCode *status);
69
/* ### Data tables **************************************************/
71
72	/**
73	* Table of language codes, both 2- and 3-letter, with preference
74	* given to 2-letter codes where possible. Includes 3-letter codes
75	* that lack a 2-letter equivalent.
76	*
77	* This list must be in sorted order. This list is returned directly
78	* to the user by some API.
79	*
80	* This list must be kept in sync with LANGUAGES_3, with corresponding
81	* entries matched.
82	*
83	* This table should be terminated with a NULL entry, followed by a
84	* second list, and another NULL entry. The first list is visible to
85	* user code when this array is returned by API. The second list
86	* contains codes we support, but do not expose through user API.
87	*
88	* Notes
89	*
90	* Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
91	* include the revisions up to 2001/7/27 CWB
92	*
93	* The 3 character codes are the terminology codes like RFC 3066. This
94	* is compatible with prior ICU codes
95	*
96	* "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
97	* table but now at the end of the table because 3 character codes are
98	* duplicates. This avoids bad searches going from 3 to 2 character
99	* codes.
100	*
101	* The range qaa-qtz is reserved for local use
102	*/
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
/* ISO639 table version is 20150505 */
/* Subsequent hand addition of selected languages */
/*
 * Layout: sorted public list, NULL, obsolete codes, NULL.
 * Index i here corresponds to index i in LANGUAGES_3.
 */
static const char * const LANGUAGES[] = {
    "aa", "ab", "ace", "ach", "ada", "ady", "ae", "aeb",
    "af", "afh", "agq", "ain", "ak", "akk", "akz", "ale",
    "aln", "alt", "am", "an", "ang", "anp", "ar", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av", "avk", "awa", "ay", "az",
    "ba", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be", "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgn", "bho", "bi", "bik", "bin", "bjn", "bkm", "bla",
    "bm", "bn", "bo", "bpy", "bqi", "br", "bra", "brh",
    "brx", "bs", "bss", "bua", "bug", "bum", "byn", "byv",
    "ca", "cad", "car", "cay", "cch", "ccp", "ce", "ceb", "cgg",
    "ch", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co", "cop", "cps", "cr", "crh",
    "cs", "csb", "cu", "cv", "cy",
    "da", "dak", "dar", "dav", "de", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz", "dzg",
    "ebu", "ee", "efi", "egl", "egy", "eka", "el", "elx",
    "en", "enm", "eo", "es", "esu", "et", "eu", "ewo",
    "ext",
    "fa", "fan", "fat", "ff", "fi", "fil", "fit", "fj",
    "fo", "fon", "fr", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    "ga", "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl", "glk", "gmh", "gn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu", "guc",
    "gur", "guz", "gv", "gwi",
    "ha", "hai", "hak", "haw", "he", "hi", "hif", "hil",
    "hit", "hmn", "ho", "hr", "hsb", "hsn", "ht", "hu",
    "hup", "hy", "hz",
    "ia", "iba", "ibb", "id", "ie", "ig", "ii", "ik",
    "ilo", "inh", "io", "is", "it", "iu", "izh",
    "ja", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    "ka", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg", "kgp",
    "kha", "kho", "khq", "khw", "ki", "kiu", "kj", "kk",
    "kkj", "kl", "kln", "km", "kmb", "kn", "ko", "koi",
    "kok", "kos", "kpe", "kr", "krc", "kri", "krj", "krl",
    "kru", "ks", "ksb", "ksf", "ksh", "ku", "kum", "kut",
    "kv", "kw", "ky",
    "la", "lad", "lag", "lah", "lam", "lb", "lez", "lfn",
    "lg", "li", "lij", "liv", "lkt", "lmo", "ln", "lo",
    "lol", "loz", "lrc", "lt", "ltg", "lu", "lua", "lui",
    "lun", "luo", "lus", "luy", "lv", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg", "mga",
    "mgh", "mgo", "mh", "mi", "mic", "min", "mis", "mk",
    "ml", "mn", "mnc", "mni", "mo",
    "moh", "mos", "mr", "mrj",
    "ms", "mt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my", "mye", "myv", "mzn",
    "na", "nan", "nap", "naq", "nb", "nd", "nds", "ne",
    "new", "ng", "nia", "niu", "njo", "nl", "nmg", "nn",
    "nnh", "no", "nog", "non", "nov", "nqo", "nr", "nso",
    "nus", "nv", "nwc", "ny", "nym", "nyn", "nyo", "nzi",
    "oc", "oj", "om", "or", "os", "osa", "ota",
    "pa", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pi", "pl", "pms", "pnt",
    "pon", "prg", "pro", "ps", "pt",
    "qu", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "rm", "rn", "ro",
    "rof", "rom", "rtm", "ru", "rue", "rug", "rup",
    "rw", "rwk",
    "sa", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc", "scn", "sco", "sd", "sdc", "sdh",
    "se", "see", "seh", "sei", "sel", "ses", "sg", "sga",
    "sgs", "shi", "shn", "shu", "si", "sid", "sk",
    "sl", "sli", "sly", "sm", "sma", "smj", "smn", "sms",
    "sn", "snk", "so", "sog", "sq", "sr", "srn", "srr",
    "ss", "ssy", "st", "stq", "su", "suk", "sus", "sux",
    "sv", "sw", "swb", "swc", "syc", "syr", "szl",
    "ta", "tcy", "te", "tem", "teo", "ter", "tet", "tg",
    "th", "ti", "tig", "tiv", "tk", "tkl", "tkr", "tl",
    "tlh", "tli", "tly", "tmh", "tn", "to", "tog", "tpi",
    "tr", "tru", "trv", "ts", "tsd", "tsi", "tt", "ttt",
    "tum", "tvl", "tw", "twq", "ty", "tyv", "tzm",
    "udm", "ug", "uga", "uk", "umb", "und", "ur", "uz",
    "vai", "ve", "vec", "vep", "vi", "vls", "vmf", "vo",
    "vot", "vro", "vun",
    "wa", "wae", "wal", "war", "was", "wbp", "wo", "wuu",
    "xal", "xh", "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yi", "yo", "yrl", "yue",
    "za", "zap", "zbl", "zea", "zen", "zgh", "zh", "zu",
    "zun", "zxx", "zza",
NULL,
    "in", "iw", "ji", "jw", "sh", /* obsolete language codes */
NULL
};
196
/* Deprecated 2-letter language codes; the entries mirror REPLACEMENT_LANGUAGES by index. */
static const char* const DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", NULL, NULL
};
/* Current codes for the deprecated language codes; the entries mirror DEPRECATED_LANGUAGES by index. */
static const char* const REPLACEMENT_LANGUAGES[]={
    "id", "he", "yi", "jv", NULL, NULL
};
203
204	/**
205	* Table of 3-letter language codes.
206	*
207	* This is a lookup table used to convert 3-letter language codes to
208	* their 2-letter equivalent, where possible. It must be kept in sync
209	* with LANGUAGES. For all valid i, LANGUAGES[i] must refer to the
210	* same language as LANGUAGES_3[i]. The commented-out lines are
211	* copied from LANGUAGES to make eyeballing this baby easier.
212	*
213	* Where a 3-letter language code has no 2-letter equivalent, the
214	* 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
215	*
216	* This table should be terminated with a NULL entry, followed by a
217	* second list, and another NULL entry. The two lists correspond to
218	* the two lists in LANGUAGES.
219	*/
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
/* ISO639 table version is 20150505 */
/* Subsequent hand addition of selected languages */
/*
 * Layout: main list, NULL, codes for the obsolete 2-letter codes, NULL.
 * Index i here corresponds to index i in LANGUAGES.
 */
static const char * const LANGUAGES_3[] = {
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "chu", "chv", "cym",
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kir",
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni", "mol",
    "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    "que", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "swc", "syc", "syr", "szl",
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
    "vot", "vro", "vun",
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    "xal", "xho", "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
NULL,
/*  "in",  "iw",  "ji",  "jw",  "sh",                          */
    "ind", "heb", "yid", "jaw", "srp",
NULL
};
314
315	/**
316	* Table of 2-letter country codes.
317	*
318	* This list must be in sorted order. This list is returned directly
319	* to the user by some API.
320	*
321	* This list must be kept in sync with COUNTRIES_3, with corresponding
322	* entries matched.
323	*
324	* This table should be terminated with a NULL entry, followed by a
325	* second list, and another NULL entry. The first list is visible to
326	* user code when this array is returned by API. The second list
327	* contains codes we support, but do not expose through user API.
328	*
329	* Notes:
330	*
331	* ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
332	* http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
333	* new codes keeping the old ones for compatibility updated to include
334	* 1999/12/03 revisions CWB
335	*
336	* RO(ROM) is now RO(ROU) according to
337	* http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
338	*/
/*
 * Layout: sorted public list, NULL, obsolete codes, NULL.
 * Index i here corresponds to index i in COUNTRIES_3.
 */
static const char * const COUNTRIES[] = {
    "AD", "AE", "AF", "AG", "AI", "AL", "AM",
    "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ",
    "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI",
    "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV",
    "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG",
    "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CR",
    "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DJ", "DK",
    "DM", "DO", "DZ", "EC", "EE", "EG", "EH", "ER",
    "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR",
    "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL",
    "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU",
    "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU",
    "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS",
    "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI",
    "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA",
    "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU",
    "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK",
    "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS",
    "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA",
    "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP",
    "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG",
    "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT",
    "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA",
    "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ",
    "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV",
    "SX", "SY", "SZ", "TC", "TD", "TF", "TG", "TH", "TJ",
    "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV",
    "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ",
    "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF",
    "WS", "YE", "YT", "ZA", "ZM", "ZW",
NULL,
    "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR", /* obsolete country codes */
NULL
};
374
/* Deprecated country codes; the entries mirror REPLACEMENT_COUNTRIES by index. */
static const char* const DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
};
/* Current codes for the deprecated country codes; the entries mirror DEPRECATED_COUNTRIES by index. */
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL /* replacement country codes */
};
382
383	/**
384	* Table of 3-letter country codes.
385	*
386	* This is a lookup table used to convert 3-letter country codes to
387	* their 2-letter equivalent. It must be kept in sync with COUNTRIES.
388	* For all valid i, COUNTRIES[i] must refer to the same country as
389	* COUNTRIES_3[i]. The commented-out lines are copied from COUNTRIES
390	* to make eyeballing this baby easier.
391	*
392	* This table should be terminated with a NULL entry, followed by a
393	* second list, and another NULL entry. The two lists correspond to
394	* the two lists in COUNTRIES.
395	*/
/*
 * Layout: main list, NULL, codes for the obsolete 2-letter codes, NULL.
 * Each commented line repeats the COUNTRIES entries for the row below it.
 */
static const char * const COUNTRIES_3[] = {
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
NULL,
/*  "AN",  "BU",  "CS",  "FX",  "RO",  "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
NULL
};
462
/* One entry of the locale-ID canonicalization table: maps an input ID to its canonical form. */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
} CanonicalizationMap;
467
468	/**
469	* A map to canonicalize locale IDs. This handles a variety of
470	* different semantic kinds of transformations.
471	*/
472	static const CanonicalizationMap CANONICALIZE_MAP[] = {
473	{ "art__LOJBAN", "jbo" }, / registered name /
474	{ "hy__AREVELA", "hy" }, / Registered IANA variant /
475	{ "hy__AREVMDA", "hyw" }, / Registered IANA variant /
476	{ "zh__GUOYU", "zh" }, / registered name /
477	{ "zh__HAKKA", "hak" }, / registered name /
478	{ "zh__XIANG", "hsn" }, / registered name /
479	// subtags with 3 chars won't be treated as variants.
480	{ "zh_GAN", "gan" }, / registered name /
481	{ "zh_MIN_NAN", "nan" }, / registered name /
482	{ "zh_WUU", "wuu" }, / registered name /
483	{ "zh_YUE", "yue" }, / registered name /
484	};
485
/* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id has BCP47 u extension and does not have '@'.
 * The check is: id is non-NULL, contains no '@', and its shortest subtag
 * (as computed by getShortestSubtagLength) is exactly one character long.
 * The macro now inspects its own argument; it previously read a caller
 * variable named localeID regardless of what was passed in.
 * Note: id is evaluated more than once, so pass a side-effect-free expression.
 */
#define _hasBCP47Extension(id) ((id) && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails.
 *   finalID - receives either buffer (success) or id (failure)
 *   err     - UErrorCode pointer; a not-terminated warning is promoted to
 *             U_BUFFER_OVERFLOW_ERROR because the result would be unusable
 */
#define _ConvertBCP47(finalID, id, buffer, length,err) UPRV_BLOCK_MACRO_BEGIN { \
    if (uloc_forLanguageTag(id, buffer, length, NULL, err) <= 0 || \
            U_FAILURE(*(err)) || *(err) == U_STRING_NOT_TERMINATED_WARNING) { \
        finalID=id; \
        if (*(err) == U_STRING_NOT_TERMINATED_WARNING) { *(err) = U_BUFFER_OVERFLOW_ERROR; } \
    } else { \
        finalID=buffer; \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Gets the size of the shortest subtag in the given localeID.
 *
 * Subtags are delimited by '_' or '-'. Only non-empty subtags that are
 * followed by a separator take part in the comparison; the text after the
 * last separator is not compared. If no such subtag exists, the length of
 * the whole string is returned.
 *
 * @param localeID NUL-terminated locale ID; must not be NULL
 * @return length of the shortest separator-terminated subtag, or the full
 *         string length when there is none
 */
static int32_t getShortestSubtagLength(const char *localeID) {
    int32_t totalLength = 0;
    int32_t shortest = -1;   /* -1 means no separator-terminated subtag seen yet */
    int32_t current = 0;     /* length of the subtag being scanned */

    for (; localeID[totalLength] != 0; totalLength++) {
        const char c = localeID[totalLength];
        if (c != '_' && c != '-') {
            current++;
        } else {
            /* Empty subtags (leading or doubled separators) are ignored. */
            if (current != 0 && (shortest < 0 || current < shortest)) {
                shortest = current;
            }
            current = 0;
        }
    }

    return shortest < 0 ? totalLength : shortest;
}
524
/* ### Keywords **************************************************/
/* ASCII digit test. */
#define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
/* ASCII letter-or-digit test; the only characters accepted in keyword names. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
/* Punctuation/symbols allowed in legacy key values */
#define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')

/* Size of the keyword-name buffers, including the terminating NUL. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Maximum number of keywords collected from one locale ID. */
#define ULOC_MAX_NO_KEYWORDS 25
533
534	U_CAPI const char * U_EXPORT2
535	locale_getKeywordsStart(const char *localeID) {
536	const char *result = NULL;
537	if((result = uprv_strchr(localeID, `'@'`)) != NULL) {
538	return result;
539	}
540	#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
541	else {
542	/ We do this because the @ sign is variant, and the @ sign used on one*
543	EBCDIC machine won't be compiled the same way on other EBCDIC based
544	machines. /*
545	static const uint8_t ebcdicSigns[] = { `0x7C`, `0x44`, `0x66`, `0x80`, `0xAC`, `0xAE`, `0xAF`, `0xB5`, `0xEC`, `0xEF`, `0x00` };
546	const uint8_t *charToFind = ebcdicSigns;
547	while(*charToFind) {
548	if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
549	return result;
550	}
551	charToFind++;
552	}
553	}
554	#endif
555	return NULL;
556	}
557
558	/**
559	* @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
560	* @param keywordName incoming name to be canonicalized
561	* @param status return status (keyword too long)
562	* @return length of the keyword name
563	*/
564	static int32_t locale_canonKeywordName(char buf, const* char keywordName, UErrorCode status)
565	{
566	int32_t keywordNameLen = `0`;
567
568	for (; *keywordName != `0`; keywordName++) {
569	if (!UPRV_ISALPHANUM(*keywordName)) {
570	status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name /
571	return `0`;
572	}
573	if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - `1`) {
574	buf[keywordNameLen++] = uprv_tolower(*keywordName);
575	} else {
576	/ keyword name too long for internal buffer /
577	*status = U_INTERNAL_PROGRAM_ERROR;
578	return `0`;
579	}
580	}
581	if (keywordNameLen == `0`) {
582	status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name /
583	return `0`;
584	}
585	buf[keywordNameLen] = `0`; / terminate /
586
587	return keywordNameLen;
588	}
589
590	typedef struct {
591	char keyword[ULOC_KEYWORD_BUFFER_LEN];
592	int32_t keywordLen;
593	const char *valueStart;
594	int32_t valueLen;
595	} KeywordStruct;
596
597	static int32_t U_CALLCONV
598	compareKeywordStructs(const void * /context/, const void left, const* void *right) {
599	const char* leftString = ((const KeywordStruct *)left)->keyword;
600	const char* rightString = ((const KeywordStruct *)right)->keyword;
601	return uprv_strcmp(leftString, rightString);
602	}
603
604	static void
605	_getKeywords(const char *localeID,
606	char prev,
607	ByteSink& sink,
608	UBool valuesToo,
609	UErrorCode *status)
610	{
611	KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
612
613	int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
614	int32_t numKeywords = `0`;
615	const char* pos = localeID;
616	const char* equalSign = NULL;
617	const char* semicolon = NULL;
618	int32_t i = `0`, j, n;
619
620	if(prev == `'@'`) { / start of keyword definition /
621	/ we will grab pairs, trim spaces, lowercase keywords, sort and return /
622	do {
623	UBool duplicate = FALSE;
624	/ skip leading spaces /
625	while(*pos == `' '`) {
626	pos++;
627	}
628	if (!pos) { /* handle trailing "; " /
629	break;
630	}
631	if(numKeywords == maxKeywords) {
632	*status = U_INTERNAL_PROGRAM_ERROR;
633	return;
634	}
635	equalSign = uprv_strchr(pos, `'='`);
636	semicolon = uprv_strchr(pos, `';'`);
637	/ lack of '=' [foo@currency] is illegal /
638	/ ';' before '=' [foo@currency;collation=pinyin] is illegal /
639	if(!equalSign \|\| (semicolon && semicolon<equalSign)) {
640	*status = U_INVALID_FORMAT_ERROR;
641	return;
642	}
643	/ need to normalize both keyword and keyword name /
644	if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
645	/ keyword name too long for internal buffer /
646	*status = U_INTERNAL_PROGRAM_ERROR;
647	return;
648	}
649	for(i = `0`, n = `0`; i < equalSign - pos; ++i) {
650	if (pos[i] != `' '`) {
651	keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
652	}
653	}
654
655	/ zero-length keyword is an error. /
656	if (n == `0`) {
657	*status = U_INVALID_FORMAT_ERROR;
658	return;
659	}
660
661	keywordList[numKeywords].keyword[n] = `0`;
662	keywordList[numKeywords].keywordLen = n;
663	/ now grab the value part. First we skip the '=' /
664	equalSign++;
665	/ then we leading spaces /
666	while(*equalSign == `' '`) {
667	equalSign++;
668	}
669
670	/ Premature end or zero-length value /
671	if (!*equalSign \|\| equalSign == semicolon) {
672	*status = U_INVALID_FORMAT_ERROR;
673	return;
674	}
675
676	keywordList[numKeywords].valueStart = equalSign;
677
678	pos = semicolon;
679	i = `0`;
680	if(pos) {
681	while(*(pos - i - `1`) == `' '`) {
682	i++;
683	}
684	keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
685	pos++;
686	} else {
687	i = (int32_t)uprv_strlen(equalSign);
688	while(i && equalSign[i-`1`] == `' '`) {
689	i--;
690	}
691	keywordList[numKeywords].valueLen = i;
692	}
693	/ If this is a duplicate keyword, then ignore it /
694	for (j=`0`; j<numKeywords; ++j) {
695	if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == `0`) {
696	duplicate = TRUE;
697	break;
698	}
699	}
700	if (!duplicate) {
701	++numKeywords;
702	}
703	} while(pos);
704
705	/ now we have a list of keywords /
706	/ we need to sort it /
707	uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
708
709	/ Now construct the keyword part /
710	for(i = `0`; i < numKeywords; i++) {
711	sink.Append(keywordList[i].keyword, keywordList[i].keywordLen);
712	if(valuesToo) {
713	sink.Append("=", `1`);
714	sink.Append(keywordList[i].valueStart, keywordList[i].valueLen);
715	if(i < numKeywords - `1`) {
716	sink.Append(";", `1`);
717	}
718	} else {
719	sink.Append("\0", `1`);
720	}
721	}
722	}
723	}
724
725	U_CFUNC int32_t
726	locale_getKeywords(const char *localeID,
727	char prev,
728	char *keywords, int32_t keywordCapacity,
729	UBool valuesToo,
730	UErrorCode *status) {
731	if (U_FAILURE(*status)) {
732	return `0`;
733	}
734
735	CheckedArrayByteSink sink(keywords, keywordCapacity);
736	_getKeywords(localeID, prev, sink, valuesToo, status);
737
738	int32_t reslen = sink.NumberOfBytesAppended();
739
740	if (U_FAILURE(*status)) {
741	return reslen;
742	}
743
744	if (sink.Overflowed()) {
745	*status = U_BUFFER_OVERFLOW_ERROR;
746	} else {
747	u_terminateChars(keywords, keywordCapacity, reslen, status);
748	}
749
750	return reslen;
751	}
752
753	U_CAPI int32_t U_EXPORT2
754	uloc_getKeywordValue(const char* localeID,
755	const char* keywordName,
756	char* buffer, int32_t bufferCapacity,
757	UErrorCode* status)
758	{
759	if (buffer != nullptr) {
760	buffer[`0`] = `'\0'`;
761	}
762	const char* startSearchHere = NULL;
763	const char* nextSeparator = NULL;
764	char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
765	char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
766	int32_t result = `0`;
767
768	if(status && U_SUCCESS(*status) && localeID) {
769	char tempBuffer[ULOC_FULLNAME_CAPACITY];
770	const char* tmpLocaleID;
771
772	if (keywordName == NULL \|\| keywordName[`0`] == `0`) {
773	*status = U_ILLEGAL_ARGUMENT_ERROR;
774	return `0`;
775	}
776
777	locale_canonKeywordName(keywordNameBuffer, keywordName, status);
778	if(U_FAILURE(*status)) {
779	return `0`;
780	}
781
782	if (_hasBCP47Extension(localeID)) {
783	_ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
784	} else {
785	tmpLocaleID=localeID;
786	}
787
788	startSearchHere = locale_getKeywordsStart(tmpLocaleID);
789	if(startSearchHere == NULL) {
790	/ no keywords, return at once /
791	return `0`;
792	}
793
794	/ find the first keyword /
795	while(startSearchHere) {
796	const char* keyValueTail;
797	int32_t keyValueLen;
798
799	startSearchHere++; / skip @ or ; /
800	nextSeparator = uprv_strchr(startSearchHere, `'='`);
801	if(!nextSeparator) {
802	status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value /
803	return `0`;
804	}
805	/ strip leading & trailing spaces (TC decided to tolerate these) /
806	while(*startSearchHere == `' '`) {
807	startSearchHere++;
808	}
809	keyValueTail = nextSeparator;
810	while (keyValueTail > startSearchHere && *(keyValueTail-`1`) == `' '`) {
811	keyValueTail--;
812	}
813	/ now keyValueTail points to first char after the keyName /
814	/ copy & normalize keyName from locale /
815	if (startSearchHere == keyValueTail) {
816	status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale /
817	return `0`;
818	}
819	keyValueLen = `0`;
820	while (startSearchHere < keyValueTail) {
821	if (!UPRV_ISALPHANUM(*startSearchHere)) {
822	status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name /
823	return `0`;
824	}
825	if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - `1`) {
826	localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
827	} else {
828	/ keyword name too long for internal buffer /
829	*status = U_INTERNAL_PROGRAM_ERROR;
830	return `0`;
831	}
832	}
833	localeKeywordNameBuffer[keyValueLen] = `0`; / terminate /
834
835	startSearchHere = uprv_strchr(nextSeparator, `';'`);
836
837	if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == `0`) {
838	/ current entry matches the keyword. /
839	nextSeparator++; / skip '=' /
840	/ First strip leading & trailing spaces (TC decided to tolerate these) /
841	while(*nextSeparator == `' '`) {
842	nextSeparator++;
843	}
844	keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
845	while(keyValueTail > nextSeparator && *(keyValueTail-`1`) == `' '`) {
846	keyValueTail--;
847	}
848	/ Now copy the value, but check well-formedness /
849	if (nextSeparator == keyValueTail) {
850	status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale /
851	return `0`;
852	}
853	keyValueLen = `0`;
854	while (nextSeparator < keyValueTail) {
855	if (!UPRV_ISALPHANUM(nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(nextSeparator)) {
856	status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value /
857	return `0`;
858	}
859	if (keyValueLen < bufferCapacity) {
860	/ Should we lowercase value to return here? Tests expect as-is. /
861	buffer[keyValueLen++] = *nextSeparator++;
862	} else { / keep advancing so we return correct length in case of overflow /
863	keyValueLen++;
864	nextSeparator++;
865	}
866	}
867	result = u_terminateChars(buffer, bufferCapacity, keyValueLen, status);
868	return result;
869	}
870	}
871	}
872	return `0`;
873	}
874
875	U_CAPI int32_t U_EXPORT2
876	uloc_setKeywordValue(const char* keywordName,
877	const char* keywordValue,
878	char* buffer, int32_t bufferCapacity,
879	UErrorCode* status)
880	{
881	/ TODO: sorting. removal. /
882	int32_t keywordNameLen;
883	int32_t keywordValueLen;
884	int32_t bufLen;
885	int32_t needLen = `0`;
886	char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
887	char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+`1`];
888	char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
889	int32_t rc;
890	char* nextSeparator = NULL;
891	char* nextEqualsign = NULL;
892	char* startSearchHere = NULL;
893	char* keywordStart = NULL;
894	CharString updatedKeysAndValues;
895	int32_t updatedKeysAndValuesLen;
896	UBool handledInputKeyAndValue = FALSE;
897	char keyValuePrefix = `'@'`;
898
899	if(U_FAILURE(*status)) {
900	return -`1`;
901	}
902	if (keywordName == NULL \|\| keywordName[`0`] == `0` \|\| bufferCapacity <= `1`) {
903	*status = U_ILLEGAL_ARGUMENT_ERROR;
904	return `0`;
905	}
906	bufLen = (int32_t)uprv_strlen(buffer);
907	if(bufferCapacity<bufLen) {
908	/ The capacity is less than the length?! Is this NULL terminated? /
909	*status = U_ILLEGAL_ARGUMENT_ERROR;
910	return `0`;
911	}
912	keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
913	if(U_FAILURE(*status)) {
914	return `0`;
915	}
916
917	keywordValueLen = `0`;
918	if(keywordValue) {
919	while (*keywordValue != `0`) {
920	if (!UPRV_ISALPHANUM(keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(keywordValue)) {
921	status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value /
922	return `0`;
923	}
924	if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
925	/ Should we force lowercase in value to set? /
926	keywordValueBuffer[keywordValueLen++] = *keywordValue++;
927	} else {
928	/ keywordValue too long for internal buffer /
929	*status = U_INTERNAL_PROGRAM_ERROR;
930	return `0`;
931	}
932	}
933	}
934	keywordValueBuffer[keywordValueLen] = `0`; / terminate /
935
936	startSearchHere = (char*)locale_getKeywordsStart(buffer);
937	if(startSearchHere == NULL \|\| (startSearchHere[`1`]==`0`)) {
938	if(keywordValueLen == `0`) { / no keywords = nothing to remove /
939	return bufLen;
940	}
941
942	needLen = bufLen+`1`+keywordNameLen+`1`+keywordValueLen;
943	if(startSearchHere) { / had a single @ /
944	needLen--; / already had the @ /
945	/ startSearchHere points at the @ /
946	} else {
947	startSearchHere=buffer+bufLen;
948	}
949	if(needLen >= bufferCapacity) {
950	*status = U_BUFFER_OVERFLOW_ERROR;
951	return needLen; / no change /
952	}
953	*startSearchHere++ = `'@'`;
954	uprv_strcpy(startSearchHere, keywordNameBuffer);
955	startSearchHere += keywordNameLen;
956	*startSearchHere++ = `'='`;
957	uprv_strcpy(startSearchHere, keywordValueBuffer);
958	return needLen;
959	} / end shortcut - no @ /
960
961	keywordStart = startSearchHere;
962	/ search for keyword /
963	while(keywordStart) {
964	const char* keyValueTail;
965	int32_t keyValueLen;
966
967	keywordStart++; / skip @ or ; /
968	nextEqualsign = uprv_strchr(keywordStart, `'='`);
969	if (!nextEqualsign) {
970	status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value /
971	return `0`;
972	}
973	/ strip leading & trailing spaces (TC decided to tolerate these) /
974	while(*keywordStart == `' '`) {
975	keywordStart++;
976	}
977	keyValueTail = nextEqualsign;
978	while (keyValueTail > keywordStart && *(keyValueTail-`1`) == `' '`) {
979	keyValueTail--;
980	}
981	/ now keyValueTail points to first char after the keyName /
982	/ copy & normalize keyName from locale /
983	if (keywordStart == keyValueTail) {
984	status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale /
985	return `0`;
986	}
987	keyValueLen = `0`;
988	while (keywordStart < keyValueTail) {
989	if (!UPRV_ISALPHANUM(*keywordStart)) {
990	status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name /
991	return `0`;
992	}
993	if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - `1`) {
994	localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
995	} else {
996	/ keyword name too long for internal buffer /
997	*status = U_INTERNAL_PROGRAM_ERROR;
998	return `0`;
999	}
1000	}
1001	localeKeywordNameBuffer[keyValueLen] = `0`; / terminate /
1002
1003	nextSeparator = uprv_strchr(nextEqualsign, `';'`);
1004
1005	/ start processing the value part /
1006	nextEqualsign++; / skip '=' /
1007	/ First strip leading & trailing spaces (TC decided to tolerate these) /
1008	while(*nextEqualsign == `' '`) {
1009	nextEqualsign++;
1010	}
1011	keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
1012	while(keyValueTail > nextEqualsign && *(keyValueTail-`1`) == `' '`) {
1013	keyValueTail--;
1014	}
1015	if (nextEqualsign == keyValueTail) {
1016	status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale /
1017	return `0`;
1018	}
1019
1020	rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1021	if(rc == `0`) {
1022	/ Current entry matches the input keyword. Update the entry /
1023	if(keywordValueLen > `0`) { / updating a value /
1024	updatedKeysAndValues.append(keyValuePrefix, *status);
1025	keyValuePrefix = `';'`; / for any subsequent key-value pair /
1026	updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1027	updatedKeysAndValues.append(`'='`, *status);
1028	updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1029	} / else removing this entry, don't emit anything /
1030	handledInputKeyAndValue = TRUE;
1031	} else {
1032	/ input keyword sorts earlier than current entry, add before current entry /
1033	if (rc < `0` && keywordValueLen > `0` && !handledInputKeyAndValue) {
1034	/ insert new entry at this location /
1035	updatedKeysAndValues.append(keyValuePrefix, *status);
1036	keyValuePrefix = `';'`; / for any subsequent key-value pair /
1037	updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1038	updatedKeysAndValues.append(`'='`, *status);
1039	updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1040	handledInputKeyAndValue = TRUE;
1041	}
1042	/ copy the current entry /
1043	updatedKeysAndValues.append(keyValuePrefix, *status);
1044	keyValuePrefix = `';'`; / for any subsequent key-value pair /
1045	updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
1046	updatedKeysAndValues.append(`'='`, *status);
1047	updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), *status);
1048	}
1049	if (!nextSeparator && keywordValueLen > `0` && !handledInputKeyAndValue) {
1050	/ append new entry at the end, it sorts later than existing entries /
1051	updatedKeysAndValues.append(keyValuePrefix, *status);
1052	/ skip keyValuePrefix update, no subsequent key-value pair /
1053	updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1054	updatedKeysAndValues.append(`'='`, *status);
1055	updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1056	handledInputKeyAndValue = TRUE;
1057	}
1058	keywordStart = nextSeparator;
1059	} / end loop searching /
1060
1061	/ Any error from updatedKeysAndValues.append above would be internal and not due to*
1062	* problems with the passed-in locale. So if we did encounter problems with the
1063	* passed-in locale above, those errors took precedence and overrode any error
1064	* status from updatedKeysAndValues.append, and also caused a return of 0. If there
1065	* are errors here they are from updatedKeysAndValues.append; they do cause an
1066	* error return but the passed-in locale is unmodified and the original bufLen is
1067	* returned.
1068	*/
1069	if (!handledInputKeyAndValue \|\| U_FAILURE(*status)) {
1070	/ if input key/value specified removal of a keyword not present in locale, or*
1071	* there was an error in CharString.append, leave original locale alone. */
1072	return bufLen;
1073	}
1074
1075	updatedKeysAndValuesLen = updatedKeysAndValues.length();
1076	/ needLen = length of the part before '@' + length of updated key-value part including '@' /
1077	needLen = (int32_t)(startSearchHere - buffer) + updatedKeysAndValuesLen;
1078	if(needLen >= bufferCapacity) {
1079	*status = U_BUFFER_OVERFLOW_ERROR;
1080	return needLen; / no change /
1081	}
1082	if (updatedKeysAndValuesLen > `0`) {
1083	uprv_strncpy(startSearchHere, updatedKeysAndValues.data(), updatedKeysAndValuesLen);
1084	}
1085	buffer[needLen]=`0`;
1086	return needLen;
1087	}
1088
/* ### ID parsing implementation **************************************************/
1090
/* TRUE if 'a' is one of the letters that can start a special prefix.
 * Arguments are parenthesized so that expression arguments expand correctly;
 * note that 'a' is still evaluated more than once. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/* returns TRUE if one of the special prefixes is here (s=string)
   'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1101
1102	/**
1103	* Lookup 'key' in the array 'list'. The array 'list' should contain
1104	* a NULL entry, followed by more entries, and a second NULL entry.
1105	*
1106	* The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1107	* COUNTRIES_3.
1108	*/
1109	static int16_t _findIndex(const char* const* list, const char* key)
1110	{
1111	const char* const* anchor = list;
1112	int32_t pass = `0`;
1113
1114	/ Make two passes through two NULL-terminated arrays at 'list' /
1115	while (pass++ < `2`) {
1116	while (*list) {
1117	if (uprv_strcmp(key, *list) == `0`) {
1118	return (int16_t)(list - anchor);
1119	}
1120	list++;
1121	}
1122	++list; / skip final NULL CWB/*
1123	}
1124	return -`1`;
1125	}
1126
1127	U_CFUNC const char*
1128	uloc_getCurrentCountryID(const char* oldID){
1129	int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1130	if (offset >= `0`) {
1131	return REPLACEMENT_COUNTRIES[offset];
1132	}
1133	return oldID;
1134	}
1135	U_CFUNC const char*
1136	uloc_getCurrentLanguageID(const char* oldID){
1137	int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1138	if (offset >= `0`) {
1139	return REPLACEMENT_LANGUAGES[offset];
1140	}
1141	return oldID;
1142	}
1143	/*
1144	* the internal functions _getLanguage(), _getCountry(), _getVariant()
1145	* avoid duplicating code to handle the earlier locale ID pieces
1146	* in the functions for the later ones by
1147	* setting the *pEnd pointer to where they stopped parsing
1148	*
1149	* TODO try to use this in Locale
1150	*/
1151	static CharString
1152	ulocimp_getLanguage(const char *localeID,
1153	const char **pEnd,
1154	UErrorCode &status) {
1155	CharString result;
1156
1157	if (uprv_stricmp(localeID, "root") == `0`) {
1158	localeID += `4`;
1159	} else if (uprv_strnicmp(localeID, "und", `3`) == `0` &&
1160	(localeID[`3`] == `'\0'` \|\|
1161	localeID[`3`] == `'-'` \|\|
1162	localeID[`3`] == `'_'` \|\|
1163	localeID[`3`] == `'@'`)) {
1164	localeID += `3`;
1165	}
1166
1167	/ if it starts with i- or x- then copy that prefix /
1168	if(_isIDPrefix(localeID)) {
1169	result.append((char)uprv_tolower(*localeID), status);
1170	result.append(`'-'`, status);
1171	localeID+=`2`;
1172	}
1173
1174	/ copy the language as far as possible and count its length /
1175	while(!_isTerminator(localeID) && !_isIDSeparator(localeID)) {
1176	result.append((char)uprv_tolower(*localeID), status);
1177	localeID++;
1178	}
1179
1180	if(result.length()==`3`) {
1181	/ convert 3 character code to 2 character code if possible CWB/*
1182	int32_t offset = _findIndex(LANGUAGES_3, result.data());
1183	if(offset>=`0`) {
1184	result.clear();
1185	result.append(LANGUAGES[offset], status);
1186	}
1187	}
1188
1189	if(pEnd!=NULL) {
1190	*pEnd=localeID;
1191	}
1192
1193	return result;
1194	}
1195
1196	U_CFUNC int32_t
1197	ulocimp_getLanguage(const char *localeID,
1198	char *language, int32_t languageCapacity,
1199	const char **pEnd) {
1200	ErrorCode status;
1201	CharString result = ulocimp_getLanguage(localeID, pEnd, status);
1202	if (status.isFailure()) {
1203	return `0`;
1204	}
1205	int32_t reslen = result.length();
1206	uprv_memcpy(language, result.data(), std::min(reslen, languageCapacity));
1207	return reslen;
1208	}
1209
1210	static CharString
1211	ulocimp_getScript(const char *localeID,
1212	const char **pEnd,
1213	UErrorCode &status) {
1214	CharString result;
1215	int32_t idLen = `0`;
1216
1217	if (pEnd != NULL) {
1218	*pEnd = localeID;
1219	}
1220
1221	/ copy the second item as far as possible and count its length /
1222	while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1223	&& uprv_isASCIILetter(localeID[idLen])) {
1224	idLen++;
1225	}
1226
1227	/ If it's exactly 4 characters long, then it's a script and not a country. /
1228	if (idLen == `4`) {
1229	int32_t i;
1230	if (pEnd != NULL) {
1231	*pEnd = localeID+idLen;
1232	}
1233	if (idLen >= `1`) {
1234	result.append((char)uprv_toupper(*(localeID++)), status);
1235	}
1236	for (i = `1`; i < idLen; i++) {
1237	result.append((char)uprv_tolower(*(localeID++)), status);
1238	}
1239	}
1240
1241	return result;
1242	}
1243
1244	U_CFUNC int32_t
1245	ulocimp_getScript(const char *localeID,
1246	char *script, int32_t scriptCapacity,
1247	const char **pEnd) {
1248	ErrorCode status;
1249	CharString result = ulocimp_getScript(localeID, pEnd, status);
1250	if (status.isFailure()) {
1251	return `0`;
1252	}
1253	int32_t reslen = result.length();
1254	uprv_memcpy(script, result.data(), std::min(reslen, scriptCapacity));
1255	return reslen;
1256	}
1257
1258	static CharString
1259	ulocimp_getCountry(const char *localeID,
1260	const char **pEnd,
1261	UErrorCode &status) {
1262	CharString result;
1263	int32_t idLen=`0`;
1264
1265	/ copy the country as far as possible and count its length /
1266	while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1267	result.append((char)uprv_toupper(localeID[idLen]), status);
1268	idLen++;
1269	}
1270
1271	/ the country should be either length 2 or 3 /
1272	if (idLen == `2` \|\| idLen == `3`) {
1273	/ convert 3 character code to 2 character code if possible CWB/*
1274	if(idLen==`3`) {
1275	int32_t offset = _findIndex(COUNTRIES_3, result.data());
1276	if(offset>=`0`) {
1277	result.clear();
1278	result.append(COUNTRIES[offset], status);
1279	}
1280	}
1281	localeID+=idLen;
1282	} else {
1283	result.clear();
1284	}
1285
1286	if(pEnd!=NULL) {
1287	*pEnd=localeID;
1288	}
1289
1290	return result;
1291	}
1292
1293	U_CFUNC int32_t
1294	ulocimp_getCountry(const char *localeID,
1295	char *country, int32_t countryCapacity,
1296	const char **pEnd) {
1297	ErrorCode status;
1298	CharString result = ulocimp_getCountry(localeID, pEnd, status);
1299	if (status.isFailure()) {
1300	return `0`;
1301	}
1302	int32_t reslen = result.length();
1303	uprv_memcpy(country, result.data(), std::min(reslen, countryCapacity));
1304	return reslen;
1305	}
1306
1307	/**
1308	* @param needSeparator if true, then add leading '_' if any variants
1309	* are added to 'variant'
1310	*/
1311	static void
1312	_getVariantEx(const char *localeID,
1313	char prev,
1314	ByteSink& sink,
1315	UBool needSeparator) {
1316	UBool hasVariant = FALSE;
1317
1318	/ get one or more variant tags and separate them with '_' /
1319	if(_isIDSeparator(prev)) {
1320	/ get a variant string after a '-' or '_' /
1321	while(!_isTerminator(*localeID)) {
1322	if (needSeparator) {
1323	sink.Append("_", `1`);
1324	needSeparator = FALSE;
1325	}
1326	char c = (char)uprv_toupper(*localeID);
1327	if (c == `'-'`) c = `'_'`;
1328	sink.Append(&c, `1`);
1329	hasVariant = TRUE;
1330	localeID++;
1331	}
1332	}
1333
1334	/ if there is no variant tag after a '-' or '_' then look for '@' /
1335	if(!hasVariant) {
1336	if(prev==`'@'`) {
1337	/ keep localeID /
1338	} else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1339	++localeID; / point after the '@' /
1340	} else {
1341	return;
1342	}
1343	while(!_isTerminator(*localeID)) {
1344	if (needSeparator) {
1345	sink.Append("_", `1`);
1346	needSeparator = FALSE;
1347	}
1348	char c = (char)uprv_toupper(*localeID);
1349	if (c == `'-'` \|\| c == `','`) c = `'_'`;
1350	sink.Append(&c, `1`);
1351	localeID++;
1352	}
1353	}
1354	}
1355
1356	static int32_t
1357	_getVariantEx(const char *localeID,
1358	char prev,
1359	char *variant, int32_t variantCapacity,
1360	UBool needSeparator) {
1361	CheckedArrayByteSink sink(variant, variantCapacity);
1362	_getVariantEx(localeID, prev, sink, needSeparator);
1363	return sink.NumberOfBytesAppended();
1364	}
1365
1366	static int32_t
1367	_getVariant(const char *localeID,
1368	char prev,
1369	char *variant, int32_t variantCapacity) {
1370	return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1371	}
1372
/* Keyword enumeration */

/* Per-enumeration state stored behind UEnumeration::context. */
typedef struct UKeywordsContext {
    /* Start of the keyword list: NUL-terminated strings stored back to back,
     * ended by an empty string. Freed when the enumeration is closed. */
    char* keywords;
    /* Next keyword to hand out; points into the 'keywords' allocation. */
    char* current;
} UKeywordsContext;
1379
1380	U_CDECL_BEGIN
1381
1382	static void U_CALLCONV
1383	uloc_kw_closeKeywords(UEnumeration *enumerator) {
1384	uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1385	uprv_free(enumerator->context);
1386	uprv_free(enumerator);
1387	}
1388
1389	static int32_t U_CALLCONV
1390	uloc_kw_countKeywords(UEnumeration en, UErrorCode /status/) {
1391	char kw = ((UKeywordsContext )en->context)->keywords;
1392	int32_t result = `0`;
1393	while(*kw) {
1394	result++;
1395	kw += uprv_strlen(kw)+`1`;
1396	}
1397	return result;
1398	}
1399
1400	static const char * U_CALLCONV
1401	uloc_kw_nextKeyword(UEnumeration* en,
1402	int32_t* resultLength,
1403	UErrorCode* /status/) {
1404	const char* result = ((UKeywordsContext *)en->context)->current;
1405	int32_t len = `0`;
1406	if(*result) {
1407	len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1408	((UKeywordsContext *)en->context)->current += len+`1`;
1409	} else {
1410	result = NULL;
1411	}
1412	if (resultLength) {
1413	*resultLength = len;
1414	}
1415	return result;
1416	}
1417
1418	static void U_CALLCONV
1419	uloc_kw_resetKeywords(UEnumeration* en,
1420	UErrorCode* /status/) {
1421	((UKeywordsContext )en->context)->current = ((UKeywordsContext )en->context)->keywords;
1422	}
1423
1424	U_CDECL_END
1425
1426
1427	static const UEnumeration gKeywordsEnum = {
1428	NULL,
1429	NULL,
1430	uloc_kw_closeKeywords,
1431	uloc_kw_countKeywords,
1432	uenum_unextDefault,
1433	uloc_kw_nextKeyword,
1434	uloc_kw_resetKeywords
1435	};
1436
1437	U_CAPI UEnumeration* U_EXPORT2
1438	uloc_openKeywordList(const char keywordList, int32_t keywordListSize, UErrorCode status)
1439	{
1440	LocalMemory<UKeywordsContext> myContext;
1441	LocalMemory<UEnumeration> result;
1442
1443	if (U_FAILURE(*status)) {
1444	return nullptr;
1445	}
1446	myContext.adoptInstead(static_cast<UKeywordsContext >(uprv_malloc(sizeof*(UKeywordsContext))));
1447	result.adoptInstead(static_cast<UEnumeration >(uprv_malloc(sizeof*(UEnumeration))));
1448	if (myContext.isNull() \|\| result.isNull()) {
1449	*status = U_MEMORY_ALLOCATION_ERROR;
1450	return nullptr;
1451	}
1452	uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));
1453	myContext ->keywords = static_cast<char *>(uprv_malloc(keywordListSize+`1`));
1454	if (myContext ->keywords == nullptr) {
1455	*status = U_MEMORY_ALLOCATION_ERROR;
1456	return nullptr;
1457	}
1458	uprv_memcpy(myContext ->keywords, keywordList, keywordListSize);
1459	myContext ->keywords[keywordListSize] = `0`;
1460	myContext ->current = myContext ->keywords;
1461	result ->context = myContext.orphan();
1462	return result.orphan();
1463	}
1464
1465	U_CAPI UEnumeration* U_EXPORT2
1466	uloc_openKeywords(const char* localeID,
1467	UErrorCode* status)
1468	{
1469	int32_t i=`0`;
1470	char keywords[`256`];
1471	int32_t keywordsCapacity = `256`;
1472	char tempBuffer[ULOC_FULLNAME_CAPACITY];
1473	const char* tmpLocaleID;
1474
1475	if(status==NULL \|\| U_FAILURE(*status)) {
1476	return `0`;
1477	}
1478
1479	if (_hasBCP47Extension(localeID)) {
1480	_ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1481	} else {
1482	if (localeID==NULL) {
1483	localeID=uloc_getDefault();
1484	}
1485	tmpLocaleID=localeID;
1486	}
1487
1488	/ Skip the language /
1489	ulocimp_getLanguage(tmpLocaleID, NULL, `0`, &tmpLocaleID);
1490	if(_isIDSeparator(*tmpLocaleID)) {
1491	const char *scriptID;
1492	/ Skip the script if available /
1493	ulocimp_getScript(tmpLocaleID+`1`, NULL, `0`, &scriptID);
1494	if(scriptID != tmpLocaleID+`1`) {
1495	/ Found optional script /
1496	tmpLocaleID = scriptID;
1497	}
1498	/ Skip the Country /
1499	if (_isIDSeparator(*tmpLocaleID)) {
1500	ulocimp_getCountry(tmpLocaleID+`1`, NULL, `0`, &tmpLocaleID);
1501	if(_isIDSeparator(*tmpLocaleID)) {
1502	_getVariant(tmpLocaleID+`1`, *tmpLocaleID, NULL, `0`);
1503	}
1504	}
1505	}
1506
1507	/ keywords are located after '@' /
1508	if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1509	i=locale_getKeywords(tmpLocaleID+`1`, `'@'`, keywords, keywordsCapacity, FALSE, status);
1510	}
1511
1512	if(i) {
1513	return uloc_openKeywordList(keywords, i, status);
1514	} else {
1515	return NULL;
1516	}
1517	}
1518
1519
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* True if any bit of 'mask' is set in 'options'. Both arguments are
 * parenthesized so that expression arguments such as A|B expand correctly. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The 9 characters of "i-default", deliberately without a terminating NUL;
 * compared by length with I_DEFAULT_LENGTH. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1528
1529	/**
1530	* Canonicalize the given localeID, to level 1 or to level 2,
1531	* depending on the options. To specify level 1, pass in options=0.
1532	* To specify level 2, pass in options=_ULOC_CANONICALIZE.
1533	*
1534	* This is the code underlying uloc_getName and uloc_canonicalize.
1535	*/
1536	static void
1537	_canonicalize(const char* localeID,
1538	ByteSink& sink,
1539	uint32_t options,
1540	UErrorCode* err) {
1541	int32_t j, fieldCount=`0`, scriptSize=`0`, variantSize=`0`;
1542	char tempBuffer[ULOC_FULLNAME_CAPACITY];
1543	const char* origLocaleID;
1544	const char* tmpLocaleID;
1545	const char* keywordAssign = NULL;
1546	const char* separatorIndicator = NULL;
1547
1548	if (U_FAILURE(*err)) {
1549	return;
1550	}
1551
1552	if (_hasBCP47Extension(localeID)) {
1553	_ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1554	} else {
1555	if (localeID==NULL) {
1556	localeID=uloc_getDefault();
1557	}
1558	tmpLocaleID=localeID;
1559	}
1560
1561	origLocaleID=tmpLocaleID;
1562
1563	/ get all pieces, one after another, and separate with '_' /
1564	CharString tag = ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);
1565
1566	if (tag.length() == I_DEFAULT_LENGTH &&
1567	uprv_strncmp(origLocaleID, i_default, I_DEFAULT_LENGTH) == `0`) {
1568	tag.clear();
1569	tag.append(uloc_getDefault(), *err);
1570	} else if(_isIDSeparator(*tmpLocaleID)) {
1571	const char *scriptID;
1572
1573	++fieldCount;
1574	tag.append(`'_'`, *err);
1575
1576	CharString script = ulocimp_getScript(tmpLocaleID+`1`, &scriptID, *err);
1577	tag.append(script, *err);
1578	scriptSize = script.length();
1579	if(scriptSize > `0`) {
1580	/ Found optional script /
1581	tmpLocaleID = scriptID;
1582	++fieldCount;
1583	if (_isIDSeparator(*tmpLocaleID)) {
1584	/ If there is something else, then we add the _ /
1585	tag.append(`'_'`, *err);
1586	}
1587	}
1588
1589	if (_isIDSeparator(*tmpLocaleID)) {
1590	const char *cntryID;
1591
1592	CharString country = ulocimp_getCountry(tmpLocaleID+`1`, &cntryID, *err);
1593	tag.append(country, *err);
1594	if (!country.isEmpty()) {
1595	/ Found optional country /
1596	tmpLocaleID = cntryID;
1597	}
1598	if(_isIDSeparator(*tmpLocaleID)) {
1599	/ If there is something else, then we add the _ if we found country before. /
1600	if (!_isIDSeparator(*(tmpLocaleID+`1`))) {
1601	++fieldCount;
1602	tag.append(`'_'`, *err);
1603	}
1604
1605	variantSize = -tag.length();
1606	{
1607	CharStringByteSink s(&tag);
1608	_getVariantEx(tmpLocaleID+`1`, *tmpLocaleID, s, FALSE);
1609	}
1610	variantSize += tag.length();
1611	if (variantSize > `0`) {
1612	tmpLocaleID += variantSize + `1`; / skip '_' and variant /
1613	}
1614	}
1615	}
1616	}
1617
1618	/ Copy POSIX-style charset specifier, if any [mr.utf8] /
1619	if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == `'.'`) {
1620	UBool done = FALSE;
1621	do {
1622	char c = *tmpLocaleID;
1623	switch (c) {
1624	case `0`:
1625	case `'@'`:
1626	done = TRUE;
1627	break;
1628	default:
1629	tag.append(c, *err);
1630	++tmpLocaleID;
1631	break;
1632	}
1633	} while (!done);
1634	}
1635
1636	/ Scan ahead to next '@' and determine if it is followed by '=' and/or ';'*
1637	After this, tmpLocaleID either points to '@' or is NULL /*
1638	if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1639	keywordAssign = uprv_strchr(tmpLocaleID, `'='`);
1640	separatorIndicator = uprv_strchr(tmpLocaleID, `';'`);
1641	}
1642
1643	/ Copy POSIX-style variant, if any [mr@FOO] /
1644	if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1645	tmpLocaleID != NULL && keywordAssign == NULL) {
1646	for (;;) {
1647	char c = *tmpLocaleID;
1648	if (c == `0`) {
1649	break;
1650	}
1651	tag.append(c, *err);
1652	++tmpLocaleID;
1653	}
1654	}
1655
1656	if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1657	/ Handle @FOO variant if @ is present and not followed by = /
1658	if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1659	/ Add missing '_' if needed /
1660	if (fieldCount < `2` \|\| (fieldCount < `3` && scriptSize > `0`)) {
1661	do {
1662	tag.append(`'_'`, *err);
1663	++fieldCount;
1664	} while(fieldCount<`2`);
1665	}
1666
1667	int32_t posixVariantSize = -tag.length();
1668	{
1669	CharStringByteSink s(&tag);
1670	_getVariantEx(tmpLocaleID+`1`, `'@'`, s, (UBool)(variantSize > `0`));
1671	}
1672	posixVariantSize += tag.length();
1673	if (posixVariantSize > `0`) {
1674	variantSize += posixVariantSize;
1675	}
1676	}
1677
1678	/ Look up the ID in the canonicalization map /
1679	for (j=`0`; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1680	StringPiece id(CANONICALIZE_MAP[j].id);
1681	if (tag == id) {
1682	if (id.empty() && tmpLocaleID != NULL) {
1683	break; / Don't remap "" if keywords present /
1684	}
1685	tag.clear();
1686	tag.append(CANONICALIZE_MAP[j].canonicalID, *err);
1687	break;
1688	}
1689	}
1690	}
1691
1692	sink.Append(tag.data(), tag.length());
1693
1694	if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1695	if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1696	(!separatorIndicator \|\| separatorIndicator > keywordAssign)) {
1697	sink.Append("@", `1`);
1698	++fieldCount;
1699	_getKeywords(tmpLocaleID+`1`, `'@'`, sink, TRUE, err);
1700	}
1701	}
1702	}
1703
/* ### ID parsing API **************************************************/
1705
1706	U_CAPI int32_t U_EXPORT2
1707	uloc_getParent(const char* localeID,
1708	char* parent,
1709	int32_t parentCapacity,
1710	UErrorCode* err)
1711	{
1712	const char *lastUnderscore;
1713	int32_t i;
1714
1715	if (U_FAILURE(*err))
1716	return `0`;
1717
1718	if (localeID == NULL)
1719	localeID = uloc_getDefault();
1720
1721	lastUnderscore=uprv_strrchr(localeID, `'_'`);
1722	if(lastUnderscore!=NULL) {
1723	i=(int32_t)(lastUnderscore-localeID);
1724	} else {
1725	i=`0`;
1726	}
1727
1728	if (i > `0`) {
1729	if (uprv_strnicmp(localeID, "und_", `4`) == `0`) {
1730	localeID += `3`;
1731	i -= `3`;
1732	uprv_memmove(parent, localeID, uprv_min(i, parentCapacity));
1733	} else if (parent != localeID) {
1734	uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1735	}
1736	}
1737
1738	return u_terminateChars(parent, parentCapacity, i, err);
1739	}
1740
1741	U_CAPI int32_t U_EXPORT2
1742	uloc_getLanguage(const char* localeID,
1743	char* language,
1744	int32_t languageCapacity,
1745	UErrorCode* err)
1746	{
1747	/ uloc_getLanguage will return a 2 character iso-639 code if one exists. CWB/*
1748	int32_t i=`0`;
1749
1750	if (err==NULL \|\| U_FAILURE(*err)) {
1751	return `0`;
1752	}
1753
1754	if(localeID==NULL) {
1755	localeID=uloc_getDefault();
1756	}
1757
1758	i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1759	return u_terminateChars(language, languageCapacity, i, err);
1760	}
1761
1762	U_CAPI int32_t U_EXPORT2
1763	uloc_getScript(const char* localeID,
1764	char* script,
1765	int32_t scriptCapacity,
1766	UErrorCode* err)
1767	{
1768	int32_t i=`0`;
1769
1770	if(err==NULL \|\| U_FAILURE(*err)) {
1771	return `0`;
1772	}
1773
1774	if(localeID==NULL) {
1775	localeID=uloc_getDefault();
1776	}
1777
1778	/ skip the language /
1779	ulocimp_getLanguage(localeID, NULL, `0`, &localeID);
1780	if(_isIDSeparator(*localeID)) {
1781	i=ulocimp_getScript(localeID+`1`, script, scriptCapacity, NULL);
1782	}
1783	return u_terminateChars(script, scriptCapacity, i, err);
1784	}
1785
1786	U_CAPI int32_t U_EXPORT2
1787	uloc_getCountry(const char* localeID,
1788	char* country,
1789	int32_t countryCapacity,
1790	UErrorCode* err)
1791	{
1792	int32_t i=`0`;
1793
1794	if(err==NULL \|\| U_FAILURE(*err)) {
1795	return `0`;
1796	}
1797
1798	if(localeID==NULL) {
1799	localeID=uloc_getDefault();
1800	}
1801
1802	/ Skip the language /
1803	ulocimp_getLanguage(localeID, NULL, `0`, &localeID);
1804	if(_isIDSeparator(*localeID)) {
1805	const char *scriptID;
1806	/ Skip the script if available /
1807	ulocimp_getScript(localeID+`1`, NULL, `0`, &scriptID);
1808	if(scriptID != localeID+`1`) {
1809	/ Found optional script /
1810	localeID = scriptID;
1811	}
1812	if(_isIDSeparator(*localeID)) {
1813	i=ulocimp_getCountry(localeID+`1`, country, countryCapacity, NULL);
1814	}
1815	}
1816	return u_terminateChars(country, countryCapacity, i, err);
1817	}
1818
1819	U_CAPI int32_t U_EXPORT2
1820	uloc_getVariant(const char* localeID,
1821	char* variant,
1822	int32_t variantCapacity,
1823	UErrorCode* err)
1824	{
1825	char tempBuffer[ULOC_FULLNAME_CAPACITY];
1826	const char* tmpLocaleID;
1827	int32_t i=`0`;
1828
1829	if(err==NULL \|\| U_FAILURE(*err)) {
1830	return `0`;
1831	}
1832
1833	if (_hasBCP47Extension(localeID)) {
1834	_ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1835	} else {
1836	if (localeID==NULL) {
1837	localeID=uloc_getDefault();
1838	}
1839	tmpLocaleID=localeID;
1840	}
1841
1842	/ Skip the language /
1843	ulocimp_getLanguage(tmpLocaleID, NULL, `0`, &tmpLocaleID);
1844	if(_isIDSeparator(*tmpLocaleID)) {
1845	const char *scriptID;
1846	/ Skip the script if available /
1847	ulocimp_getScript(tmpLocaleID+`1`, NULL, `0`, &scriptID);
1848	if(scriptID != tmpLocaleID+`1`) {
1849	/ Found optional script /
1850	tmpLocaleID = scriptID;
1851	}
1852	/ Skip the Country /
1853	if (_isIDSeparator(*tmpLocaleID)) {
1854	const char *cntryID;
1855	ulocimp_getCountry(tmpLocaleID+`1`, NULL, `0`, &cntryID);
1856	if (cntryID != tmpLocaleID+`1`) {
1857	/ Found optional country /
1858	tmpLocaleID = cntryID;
1859	}
1860	if(_isIDSeparator(*tmpLocaleID)) {
1861	/ If there was no country ID, skip a possible extra IDSeparator /
1862	if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[`1`])) {
1863	tmpLocaleID++;
1864	}
1865	i=_getVariant(tmpLocaleID+`1`, *tmpLocaleID, variant, variantCapacity);
1866	}
1867	}
1868	}
1869
1870	/ removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function /
1871	/ if we do not have a variant tag yet then try a POSIX variant after '@' /
1872	/*
1873	if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
1874	i=_getVariant(localeID+1, '@', variant, variantCapacity);
1875	}
1876	*/
1877	return u_terminateChars(variant, variantCapacity, i, err);
1878	}
1879
1880	U_CAPI int32_t U_EXPORT2
1881	uloc_getName(const char* localeID,
1882	char* name,
1883	int32_t nameCapacity,
1884	UErrorCode* err)
1885	{
1886	if (U_FAILURE(*err)) {
1887	return `0`;
1888	}
1889
1890	CheckedArrayByteSink sink(name, nameCapacity);
1891	ulocimp_getName(localeID, sink, err);
1892
1893	int32_t reslen = sink.NumberOfBytesAppended();
1894
1895	if (U_FAILURE(*err)) {
1896	return reslen;
1897	}
1898
1899	if (sink.Overflowed()) {
1900	*err = U_BUFFER_OVERFLOW_ERROR;
1901	} else {
1902	u_terminateChars(name, nameCapacity, reslen, err);
1903	}
1904
1905	return reslen;
1906	}
1907
1908	U_STABLE void U_EXPORT2
1909	ulocimp_getName(const char* localeID,
1910	ByteSink& sink,
1911	UErrorCode* err)
1912	{
1913	_canonicalize(localeID, sink, `0`, err);
1914	}
1915
1916	U_CAPI int32_t U_EXPORT2
1917	uloc_getBaseName(const char* localeID,
1918	char* name,
1919	int32_t nameCapacity,
1920	UErrorCode* err)
1921	{
1922	if (U_FAILURE(*err)) {
1923	return `0`;
1924	}
1925
1926	CheckedArrayByteSink sink(name, nameCapacity);
1927	ulocimp_getBaseName(localeID, sink, err);
1928
1929	int32_t reslen = sink.NumberOfBytesAppended();
1930
1931	if (U_FAILURE(*err)) {
1932	return reslen;
1933	}
1934
1935	if (sink.Overflowed()) {
1936	*err = U_BUFFER_OVERFLOW_ERROR;
1937	} else {
1938	u_terminateChars(name, nameCapacity, reslen, err);
1939	}
1940
1941	return reslen;
1942	}
1943
1944	U_STABLE void U_EXPORT2
1945	ulocimp_getBaseName(const char* localeID,
1946	ByteSink& sink,
1947	UErrorCode* err)
1948	{
1949	_canonicalize(localeID, sink, _ULOC_STRIP_KEYWORDS, err);
1950	}
1951
1952	U_CAPI int32_t U_EXPORT2
1953	uloc_canonicalize(const char* localeID,
1954	char* name,
1955	int32_t nameCapacity,
1956	UErrorCode* err)
1957	{
1958	if (U_FAILURE(*err)) {
1959	return `0`;
1960	}
1961
1962	CheckedArrayByteSink sink(name, nameCapacity);
1963	ulocimp_canonicalize(localeID, sink, err);
1964
1965	int32_t reslen = sink.NumberOfBytesAppended();
1966
1967	if (U_FAILURE(*err)) {
1968	return reslen;
1969	}
1970
1971	if (sink.Overflowed()) {
1972	*err = U_BUFFER_OVERFLOW_ERROR;
1973	} else {
1974	u_terminateChars(name, nameCapacity, reslen, err);
1975	}
1976
1977	return reslen;
1978	}
1979
1980	U_STABLE void U_EXPORT2
1981	ulocimp_canonicalize(const char* localeID,
1982	ByteSink& sink,
1983	UErrorCode* err)
1984	{
1985	_canonicalize(localeID, sink, _ULOC_CANONICALIZE, err);
1986	}
1987
1988	U_CAPI const char* U_EXPORT2
1989	uloc_getISO3Language(const char* localeID)
1990	{
1991	int16_t offset;
1992	char lang[ULOC_LANG_CAPACITY];
1993	UErrorCode err = U_ZERO_ERROR;
1994
1995	if (localeID == NULL)
1996	{
1997	localeID = uloc_getDefault();
1998	}
1999	uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2000	if (U_FAILURE(err))
2001	return "";
2002	offset = _findIndex(LANGUAGES, lang);
2003	if (offset < `0`)
2004	return "";
2005	return LANGUAGES_3[offset];
2006	}
2007
2008	U_CAPI const char* U_EXPORT2
2009	uloc_getISO3Country(const char* localeID)
2010	{
2011	int16_t offset;
2012	char cntry[ULOC_LANG_CAPACITY];
2013	UErrorCode err = U_ZERO_ERROR;
2014
2015	if (localeID == NULL)
2016	{
2017	localeID = uloc_getDefault();
2018	}
2019	uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2020	if (U_FAILURE(err))
2021	return "";
2022	offset = _findIndex(COUNTRIES, cntry);
2023	if (offset < `0`)
2024	return "";
2025
2026	return COUNTRIES_3[offset];
2027	}
2028
2029	U_CAPI uint32_t U_EXPORT2
2030	uloc_getLCID(const char* localeID)
2031	{
2032	UErrorCode status = U_ZERO_ERROR;
2033	char langID[ULOC_FULLNAME_CAPACITY];
2034	uint32_t lcid = `0`;
2035
2036	/ Check for incomplete id. /
2037	if (!localeID \|\| uprv_strlen(localeID) < `2`) {
2038	return `0`;
2039	}
2040
2041	// First, attempt Windows platform lookup if available, but fall
2042	// through to catch any special cases (ICU vs Windows name differences).
2043	lcid = uprv_convertToLCIDPlatform(localeID, &status);
2044	if (U_FAILURE(status)) {
2045	return `0`;
2046	}
2047	if (lcid > `0`) {
2048	// Windows found an LCID, return that
2049	return lcid;
2050	}
2051
2052	uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2053	if (U_FAILURE(status) \|\| status == U_STRING_NOT_TERMINATED_WARNING) {
2054	return `0`;
2055	}
2056
2057	if (uprv_strchr(localeID, `'@'`)) {
2058	// uprv_convertToLCID does not support keywords other than collation.
2059	// Remove all keywords except collation.
2060	int32_t len;
2061	char collVal[ULOC_KEYWORDS_CAPACITY];
2062	char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2063
2064	len = uloc_getKeywordValue(localeID, "collation", collVal,
2065	UPRV_LENGTHOF(collVal) - `1`, &status);
2066
2067	if (U_SUCCESS(status) && len > `0`) {
2068	collVal[len] = `0`;
2069
2070	len = uloc_getBaseName(localeID, tmpLocaleID,
2071	UPRV_LENGTHOF(tmpLocaleID) - `1`, &status);
2072
2073	if (U_SUCCESS(status) && len > `0`) {
2074	tmpLocaleID[len] = `0`;
2075
2076	len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
2077	UPRV_LENGTHOF(tmpLocaleID) - len - `1`, &status);
2078
2079	if (U_SUCCESS(status) && len > `0`) {
2080	tmpLocaleID[len] = `0`;
2081	return uprv_convertToLCID(langID, tmpLocaleID, &status);
2082	}
2083	}
2084	}
2085
2086	// fall through - all keywords are simply ignored
2087	status = U_ZERO_ERROR;
2088	}
2089
2090	return uprv_convertToLCID(langID, localeID, &status);
2091	}
2092
2093	U_CAPI int32_t U_EXPORT2
2094	uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2095	UErrorCode *status)
2096	{
2097	return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2098	}
2099
2100	/ ### Default locale *************************************************/
2101
2102	U_CAPI const char* U_EXPORT2
2103	uloc_getDefault()
2104	{
2105	return locale_get_default();
2106	}
2107
2108	U_CAPI void U_EXPORT2
2109	uloc_setDefault(const char* newDefaultLocale,
2110	UErrorCode* err)
2111	{
2112	if (U_FAILURE(*err))
2113	return;
2114	/ the error code isn't currently used for anything by this function/
2115
2116	/ propagate change to C++ /
2117	locale_set_default(newDefaultLocale);
2118	}
2119
2120	/**
2121	* Returns a list of all 2-letter language codes defined in ISO 639. This is a pointer
2122	* to an array of pointers to arrays of char. All of these pointers are owned
2123	* by ICU-- do not delete them, and do not write through them. The array is
2124	* terminated with a null pointer.
2125	*/
2126	U_CAPI const char* const* U_EXPORT2
2127	uloc_getISOLanguages()
2128	{
2129	return LANGUAGES;
2130	}
2131
2132	/**
2133	* Returns a list of all 2-letter country codes defined in ISO 639. This is a
2134	* pointer to an array of pointers to arrays of char. All of these pointers are
2135	* owned by ICU-- do not delete them, and do not write through them. The array is
2136	* terminated with a null pointer.
2137	*/
2138	U_CAPI const char* const* U_EXPORT2
2139	uloc_getISOCountries()
2140	{
2141	return COUNTRIES;
2142	}
2143
2144	U_CAPI const char* U_EXPORT2
2145	uloc_toUnicodeLocaleKey(const char* keyword)
2146	{
2147	const char* bcpKey = ulocimp_toBcpKey(keyword);
2148	if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -`1`)) {
2149	// unknown keyword, but syntax is fine..
2150	return keyword;
2151	}
2152	return bcpKey;
2153	}
2154
2155	U_CAPI const char* U_EXPORT2
2156	uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2157	{
2158	const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2159	if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -`1`)) {
2160	// unknown keyword, but syntax is fine..
2161	return value;
2162	}
2163	return bcpType;
2164	}
2165
2166	static UBool
2167	isWellFormedLegacyKey(const char* legacyKey)
2168	{
2169	const char* p = legacyKey;
2170	while (*p) {
2171	if (!UPRV_ISALPHANUM(*p)) {
2172	return FALSE;
2173	}
2174	p++;
2175	}
2176	return TRUE;
2177	}
2178
2179	static UBool
2180	isWellFormedLegacyType(const char* legacyType)
2181	{
2182	const char* p = legacyType;
2183	int32_t alphaNumLen = `0`;
2184	while (*p) {
2185	if (p == `'_'` \|\| p == `'/'` \|\| *p == `'-'`) {
2186	if (alphaNumLen == `0`) {
2187	return FALSE;
2188	}
2189	alphaNumLen = `0`;
2190	} else if (UPRV_ISALPHANUM(*p)) {
2191	alphaNumLen++;
2192	} else {
2193	return FALSE;
2194	}
2195	p++;
2196	}
2197	return (alphaNumLen != `0`);
2198	}
2199
2200	U_CAPI const char* U_EXPORT2
2201	uloc_toLegacyKey(const char* keyword)
2202	{
2203	const char* legacyKey = ulocimp_toLegacyKey(keyword);
2204	if (legacyKey == NULL) {
2205	// Checks if the specified locale key is well-formed with the legacy locale syntax.
2206	//
2207	// Note:
2208	// LDML/CLDR provides some definition of keyword syntax in
2209	// http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and*
2210	// http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax*
2211	// Keys can only consist of [0-9a-zA-Z].
2212	if (isWellFormedLegacyKey(keyword)) {
2213	return keyword;
2214	}
2215	}
2216	return legacyKey;
2217	}
2218
2219	U_CAPI const char* U_EXPORT2
2220	uloc_toLegacyType(const char* keyword, const char* value)
2221	{
2222	const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2223	if (legacyType == NULL) {
2224	// Checks if the specified locale type is well-formed with the legacy locale syntax.
2225	//
2226	// Note:
2227	// LDML/CLDR provides some definition of keyword syntax in
2228	// http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and*
2229	// http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax*
2230	// Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2231	// we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2232	if (isWellFormedLegacyType(value)) {
2233	return value;
2234	}
2235	}
2236	return legacyType;
2237	}
2238
2239	/eof/
2240

Browse the source code of engine/third_party/icu/source/common/uloc.cpp