uloc.cpp source code [Godot/thirdparty/icu4c/common/uloc.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 1997-2016, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	*
9	* File ULOC.CPP
10	*
11	* Modification History:
12	*
13	* Date Name Description
14	* 04/01/97 aliu Creation.
15	* 08/21/98 stephen JDK 1.2 sync
16	* 12/08/98 rtg New Locale implementation and C API
17	* 03/15/99 damiba overhaul.
18	* 04/06/99 stephen changed setDefault() to realloc and copy
19	* 06/14/99 stephen Changed calls to ures_open for new params
20	* 07/21/99 stephen Modified setDefault() to propagate to C++
21	* 05/14/04 alan 7 years later: refactored, cleaned up, fixed bugs,
22	* brought canonicalization code into line with spec
23	*****************************************************************************/
24
25	/*
26	POSIX's locale format, from putil.c: [no spaces]
27
28	ll [ _CC ] [ . MM ] [ @ VV]
29
30	l = lang, C = ctry, M = charmap, V = variant
31	*/
32
33	#include "unicode/bytestream.h"
34	#include "unicode/errorcode.h"
35	#include "unicode/stringpiece.h"
36	#include "unicode/utypes.h"
37	#include "unicode/ustring.h"
38	#include "unicode/uloc.h"
39
40	#include "bytesinkutil.h"
41	#include "putilimp.h"
42	#include "ustr_imp.h"
43	#include "ulocimp.h"
44	#include "umutex.h"
45	#include "cstring.h"
46	#include "cmemory.h"
47	#include "locmap.h"
48	#include "uarrsort.h"
49	#include "uenumimp.h"
50	#include "uassert.h"
51	#include "charstr.h"
52
53	U_NAMESPACE_USE
54
55	/ ### Declarations *************************************************/
56
57	/ Locale stuff from locid.cpp /
58	U_CFUNC void locale_set_default(const char *id);
59	U_CFUNC const char *locale_get_default();
60
/* ### Data tables **************************************************/
62
63	/**
64	* Table of language codes, both 2- and 3-letter, with preference
65	* given to 2-letter codes where possible. Includes 3-letter codes
66	* that lack a 2-letter equivalent.
67	*
68	* This list must be in sorted order. This list is returned directly
69	* to the user by some API.
70	*
71	* This list must be kept in sync with LANGUAGES_3, with corresponding
72	* entries matched.
73	*
74	* This table should be terminated with a nullptr entry, followed by a
75	* second list, and another nullptr entry. The first list is visible to
76	* user code when this array is returned by API. The second list
77	* contains codes we support, but do not expose through user API.
78	*
79	* Notes
80	*
81	* Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
82	* include the revisions up to 2001/7/27 CWB
83	*
84	* The 3 character codes are the terminology codes like RFC 3066. This
85	* is compatible with prior ICU codes
86	*
87	* "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
88	* table but now at the end of the table because 3 character codes are
89	* duplicates. This avoids bad searches going from 3 to 2 character
90	* codes.
91	*
92	* The range qaa-qtz is reserved for local use
93	*/
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
/* ISO639 table version is 20150505 */
/* Subsequent hand addition of selected languages */
/*
 * Layout: a sorted primary list, a nullptr terminator, a short list of
 * obsolete codes, and a final nullptr. Entry i corresponds to LANGUAGES_3[i].
 */
static const char * const LANGUAGES[] = {
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
    "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
    "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
    "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgc", "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
    "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
    "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
    "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
    "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
    "cs",  "csb", "cu",  "cv",  "cy",
    "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz",  "dzg",
    "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
    "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
    "ext",
    "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
    "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
    "gur", "guz", "gv",  "gwi",
    "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
    "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
    "hup", "hy",  "hz",
    "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
    "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
    "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
    "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
    "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
    "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
    "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
    "kv",  "kw",  "ky",
    "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
    "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
    "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
    "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
    "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
    "ml",  "mn",  "mnc", "mni",
    "moh", "mos", "mr",  "mrj",
    "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my",  "mye", "myv", "mzn",
    "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
    "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
    "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
    "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
    "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
    "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
    "pon", "prg", "pro", "ps",  "pt",
    "qu",  "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
    "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
    "rw",  "rwk",
    "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
    "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
    "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
    "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
    "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
    "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
    "sv",  "sw",  "swb", "syc", "syr", "szl",
    "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr",
    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tpi",
    "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
    "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
    "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vo",
    "vot", "vro", "vun",
    "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
    "xal", "xh",  "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
    "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
    "zun", "zxx", "zza",
nullptr,
    "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  /* obsolete language codes */
nullptr
};
187
/*
 * Deprecated language codes. Entry i is replaced by REPLACEMENT_LANGUAGES[i].
 * The list ends with two nullptr entries, like the other tables in this file.
 */
static const char* const DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", "mo", nullptr, nullptr
};
/*
 * Current codes for the deprecated ones: entry i is the replacement for
 * DEPRECATED_LANGUAGES[i]. Terminated by two nullptr entries.
 */
static const char* const REPLACEMENT_LANGUAGES[]={
    "id", "he", "yi", "jv", "ro", nullptr, nullptr
};
194
195	/**
196	* Table of 3-letter language codes.
197	*
198	* This is a lookup table used to convert 3-letter language codes to
199	* their 2-letter equivalent, where possible. It must be kept in sync
200	* with LANGUAGES. For all valid i, LANGUAGES[i] must refer to the
201	* same language as LANGUAGES_3[i]. The commented-out lines are
202	* copied from LANGUAGES to make eyeballing this baby easier.
203	*
204	* Where a 3-letter language code has no 2-letter equivalent, the
205	* 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
206	*
207	* This table should be terminated with a nullptr entry, followed by a
208	* second list, and another nullptr entry. The two lists correspond to
209	* the two lists in LANGUAGES.
210	*/
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
/* ISO639 table version is 20150505 */
/* Subsequent hand addition of selected languages */
/*
 * Layout mirrors LANGUAGES: primary list, nullptr, obsolete list, nullptr.
 * Entry i is the 3-letter code for LANGUAGES[i].
 */
static const char * const LANGUAGES_3[] = {
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgc", "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "chu", "chv", "cym",
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kir",
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni",
    "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    "que", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "syc", "syr", "szl",
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
    "vot", "vro", "vun",
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    "xal", "xho", "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
nullptr,
/*  "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  */
    "ind", "heb", "yid", "jaw", "mol", "srp", "swc", "tgl",
nullptr
};
305
306	/**
307	* Table of 2-letter country codes.
308	*
309	* This list must be in sorted order. This list is returned directly
310	* to the user by some API.
311	*
312	* This list must be kept in sync with COUNTRIES_3, with corresponding
313	* entries matched.
314	*
315	* This table should be terminated with a nullptr entry, followed by a
316	* second list, and another nullptr entry. The first list is visible to
317	* user code when this array is returned by API. The second list
318	* contains codes we support, but do not expose through user API.
319	*
320	* Notes:
321	*
322	* ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
323	* http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
324	* new codes keeping the old ones for compatibility updated to include
325	* 1999/12/03 revisions CWB
326	*
327	* RO(ROM) is now RO(ROU) according to
328	* http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
329	*/
/*
 * Layout: a sorted primary list, a nullptr terminator, a list of obsolete
 * codes, and a final nullptr. Entry i corresponds to COUNTRIES_3[i].
 */
static const char * const COUNTRIES[] = {
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",
    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",
    "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
    "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
nullptr,
    "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
nullptr
};
365
/*
 * Deprecated country codes. Entry i is replaced by REPLACEMENT_COUNTRIES[i].
 * Terminated by two nullptr entries.
 */
static const char* const DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", nullptr, nullptr /* deprecated country list */
};
/*
 * Current codes for the deprecated ones: entry i is the replacement for
 * DEPRECATED_COUNTRIES[i] (copied in the comment row for easy comparison).
 * Terminated by two nullptr entries.
 */
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", nullptr, nullptr  /* replacement country codes */
};
373
374	/**
375	* Table of 3-letter country codes.
376	*
377	* This is a lookup table used to convert 3-letter country codes to
378	* their 2-letter equivalent. It must be kept in sync with COUNTRIES.
379	* For all valid i, COUNTRIES[i] must refer to the same country as
380	* COUNTRIES_3[i]. The commented-out lines are copied from COUNTRIES
381	* to make eyeballing this baby easier.
382	*
383	* This table should be terminated with a nullptr entry, followed by a
384	* second list, and another nullptr entry. The two lists correspond to
385	* the two lists in COUNTRIES.
386	*/
/*
 * Layout mirrors COUNTRIES: primary list, nullptr, obsolete list, nullptr.
 * Each commented row repeats the matching COUNTRIES row for visual checking.
 */
static const char * const COUNTRIES_3[] = {
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRQ", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",     */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "XEA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "XIC", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
nullptr,
/*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
nullptr
};
453
/*
 * One row of the locale-ID canonicalization table: maps an input ID to its
 * canonical form. Both members are C strings (the table initializes them
 * with string literals), so they must be pointers, not single chars.
 */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
} CanonicalizationMap;
458
459	/**
460	* A map to canonicalize locale IDs. This handles a variety of
461	* different semantic kinds of transformations.
462	*/
463	static const CanonicalizationMap CANONICALIZE_MAP[] = {
464	{ "art__LOJBAN", "jbo" }, / registered name /
465	{ "hy__AREVELA", "hy" }, / Registered IANA variant /
466	{ "hy__AREVMDA", "hyw" }, / Registered IANA variant /
467	{ "zh__GUOYU", "zh" }, / registered name /
468	{ "zh__HAKKA", "hak" }, / registered name /
469	{ "zh__XIANG", "hsn" }, / registered name /
470	// subtags with 3 chars won't be treated as variants.
471	{ "zh_GAN", "gan" }, / registered name /
472	{ "zh_MIN_NAN", "nan" }, / registered name /
473	{ "zh_WUU", "wuu" }, / registered name /
474	{ "zh_YUE", "yue" }, / registered name /
475	};
476
/* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id has BCP47 u extension and does not have '@'.
 * True when `id` is non-null, contains no '@', and its shortest subtag (as
 * reported by getShortestSubtagLength) is exactly one character long.
 *
 * The macro now uses its own parameter throughout; previously the last
 * clause referenced a variable named `localeID` from the caller's scope, so
 * it only worked when the argument happened to be called `localeID`.
 * Note: `id` is evaluated more than once, so pass a side-effect-free
 * expression.
 */
#define _hasBCP47Extension(id) ((id) && uprv_strstr((id), "@") == nullptr && getShortestSubtagLength(id) == 1)
480	/ Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails /
481	static const char* _ConvertBCP47(
482	const char* id, char* buffer, int32_t length,
483	UErrorCode* err, int32_t* pLocaleIdSize) {
484	const char* finalID;
485	int32_t localeIDSize = uloc_forLanguageTag(id, buffer, length, nullptr, err);
486	if (localeIDSize <= `0` \|\| U_FAILURE(err) \|\| err == U_STRING_NOT_TERMINATED_WARNING) {
487	finalID=id;
488	if (*err == U_STRING_NOT_TERMINATED_WARNING) {
489	*err = U_BUFFER_OVERFLOW_ERROR;
490	}
491	} else {
492	finalID=buffer;
493	}
494	if (pLocaleIdSize != nullptr) {
495	*pLocaleIdSize = localeIDSize;
496	}
497	return finalID;
498	}
499	/ Gets the size of the shortest subtag in the given localeID. /
500	static int32_t getShortestSubtagLength(const char *localeID) {
501	int32_t localeIDLength = static_cast<int32_t>(uprv_strlen(localeID));
502	int32_t length = localeIDLength;
503	int32_t tmpLength = `0`;
504	int32_t i;
505	UBool reset = true;
506
507	for (i = `0`; i < localeIDLength; i++) {
508	if (localeID[i] != `'_'` && localeID[i] != `'-'`) {
509	if (reset) {
510	tmpLength = `0`;
511	reset = false;
512	}
513	tmpLength++;
514	} else {
515	if (tmpLength != `0` && tmpLength < length) {
516	length = tmpLength;
517	}
518	reset = true;
519	}
520	}
521
522	return length;
523	}
524
/* ### Keywords **************************************************/
/* True for the ASCII digits '0'..'9'. */
#define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
/* True for ASCII letters (per uprv_isASCIILetter) and ASCII digits. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
/* Punctuation/symbols allowed in legacy key values */
#define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')

/* Size of the fixed buffers that hold a keyword name, including the NUL. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Maximum number of distinct keywords collected from one locale ID. */
#define ULOC_MAX_NO_KEYWORDS 25
533
534	U_CAPI const char * U_EXPORT2
535	locale_getKeywordsStart(const char *localeID) {
536	const char result = nullptr*;
537	if((result = uprv_strchr(localeID, `'@'`)) != nullptr) {
538	return result;
539	}
540	#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
541	else {
542	/ We do this because the @ sign is variant, and the @ sign used on one*
543	EBCDIC machine won't be compiled the same way on other EBCDIC based
544	machines. /*
545	static const uint8_t ebcdicSigns[] = { `0x7C`, `0x44`, `0x66`, `0x80`, `0xAC`, `0xAE`, `0xAF`, `0xB5`, `0xEC`, `0xEF`, `0x00` };
546	const uint8_t *charToFind = ebcdicSigns;
547	while(*charToFind) {
548	if((result = uprv_strchr(localeID, charToFind)) != nullptr*) {
549	return result;
550	}
551	charToFind++;
552	}
553	}
554	#endif
555	return nullptr;
556	}
557
558	/**
559	* @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
560	* @param keywordName incoming name to be canonicalized
561	* @param status return status (keyword too long)
562	* @return length of the keyword name
563	*/
564	static int32_t locale_canonKeywordName(char buf, const* char keywordName, UErrorCode status)
565	{
566	int32_t keywordNameLen = `0`;
567
568	for (; *keywordName != `0`; keywordName++) {
569	if (!UPRV_ISALPHANUM(*keywordName)) {
570	status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name /
571	return `0`;
572	}
573	if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - `1`) {
574	buf[keywordNameLen++] = uprv_tolower(*keywordName);
575	} else {
576	/ keyword name too long for internal buffer /
577	*status = U_INTERNAL_PROGRAM_ERROR;
578	return `0`;
579	}
580	}
581	if (keywordNameLen == `0`) {
582	status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name /
583	return `0`;
584	}
585	buf[keywordNameLen] = `0`; / terminate /
586
587	return keywordNameLen;
588	}
589
590	typedef struct {
591	char keyword[ULOC_KEYWORD_BUFFER_LEN];
592	int32_t keywordLen;
593	const char *valueStart;
594	int32_t valueLen;
595	} KeywordStruct;
596
597	static int32_t U_CALLCONV
598	compareKeywordStructs(const void * /context/, const void left, const* void *right) {
599	const char* leftString = ((const KeywordStruct *)left)->keyword;
600	const char* rightString = ((const KeywordStruct *)right)->keyword;
601	return uprv_strcmp(leftString, rightString);
602	}
603
604	U_CFUNC void
605	ulocimp_getKeywords(const char *localeID,
606	char prev,
607	ByteSink& sink,
608	UBool valuesToo,
609	UErrorCode *status)
610	{
611	KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
612
613	int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
614	int32_t numKeywords = `0`;
615	const char* pos = localeID;
616	const char* equalSign = nullptr;
617	const char* semicolon = nullptr;
618	int32_t i = `0`, j, n;
619
620	if(prev == `'@'`) { / start of keyword definition /
621	/ we will grab pairs, trim spaces, lowercase keywords, sort and return /
622	do {
623	UBool duplicate = false;
624	/ skip leading spaces /
625	while(*pos == `' '`) {
626	pos++;
627	}
628	if (!pos) { /* handle trailing "; " /
629	break;
630	}
631	if(numKeywords == maxKeywords) {
632	*status = U_INTERNAL_PROGRAM_ERROR;
633	return;
634	}
635	equalSign = uprv_strchr(pos, `'='`);
636	semicolon = uprv_strchr(pos, `';'`);
637	/ lack of '=' [foo@currency] is illegal /
638	/ ';' before '=' [foo@currency;collation=pinyin] is illegal /
639	if(!equalSign \|\| (semicolon && semicolon<equalSign)) {
640	*status = U_INVALID_FORMAT_ERROR;
641	return;
642	}
643	/ need to normalize both keyword and keyword name /
644	if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
645	/ keyword name too long for internal buffer /
646	*status = U_INTERNAL_PROGRAM_ERROR;
647	return;
648	}
649	for(i = `0`, n = `0`; i < equalSign - pos; ++i) {
650	if (pos[i] != `' '`) {
651	keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
652	}
653	}
654
655	/ zero-length keyword is an error. /
656	if (n == `0`) {
657	*status = U_INVALID_FORMAT_ERROR;
658	return;
659	}
660
661	keywordList[numKeywords].keyword[n] = `0`;
662	keywordList[numKeywords].keywordLen = n;
663	/ now grab the value part. First we skip the '=' /
664	equalSign++;
665	/ then we leading spaces /
666	while(*equalSign == `' '`) {
667	equalSign++;
668	}
669
670	/ Premature end or zero-length value /
671	if (!*equalSign \|\| equalSign == semicolon) {
672	*status = U_INVALID_FORMAT_ERROR;
673	return;
674	}
675
676	keywordList[numKeywords].valueStart = equalSign;
677
678	pos = semicolon;
679	i = `0`;
680	if(pos) {
681	while(*(pos - i - `1`) == `' '`) {
682	i++;
683	}
684	keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
685	pos++;
686	} else {
687	i = (int32_t)uprv_strlen(equalSign);
688	while(i && equalSign[i-`1`] == `' '`) {
689	i--;
690	}
691	keywordList[numKeywords].valueLen = i;
692	}
693	/ If this is a duplicate keyword, then ignore it /
694	for (j=`0`; j<numKeywords; ++j) {
695	if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == `0`) {
696	duplicate = true;
697	break;
698	}
699	}
700	if (!duplicate) {
701	++numKeywords;
702	}
703	} while(pos);
704
705	/ now we have a list of keywords /
706	/ we need to sort it /
707	uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, nullptr, false, status);
708
709	/ Now construct the keyword part /
710	for(i = `0`; i < numKeywords; i++) {
711	sink.Append(keywordList[i].keyword, keywordList[i].keywordLen);
712	if(valuesToo) {
713	sink.Append("=", `1`);
714	sink.Append(keywordList[i].valueStart, keywordList[i].valueLen);
715	if(i < numKeywords - `1`) {
716	sink.Append(";", `1`);
717	}
718	} else {
719	sink.Append("\0", `1`);
720	}
721	}
722	}
723	}
724
725	U_CAPI int32_t U_EXPORT2
726	uloc_getKeywordValue(const char* localeID,
727	const char* keywordName,
728	char* buffer, int32_t bufferCapacity,
729	UErrorCode* status)
730	{
731	if (U_FAILURE(*status)) {
732	return `0`;
733	}
734
735	CheckedArrayByteSink sink(buffer, bufferCapacity);
736	ulocimp_getKeywordValue(localeID, keywordName, sink, status);
737
738	int32_t reslen = sink.NumberOfBytesAppended();
739
740	if (U_FAILURE(*status)) {
741	return reslen;
742	}
743
744	if (sink.Overflowed()) {
745	*status = U_BUFFER_OVERFLOW_ERROR;
746	} else {
747	u_terminateChars(buffer, bufferCapacity, reslen, status);
748	}
749
750	return reslen;
751	}
752
753	U_CAPI void U_EXPORT2
754	ulocimp_getKeywordValue(const char* localeID,
755	const char* keywordName,
756	icu::ByteSink& sink,
757	UErrorCode* status)
758	{
759	const char* startSearchHere = nullptr;
760	const char* nextSeparator = nullptr;
761	char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
762	char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
763
764	if(status && U_SUCCESS(*status) && localeID) {
765	char tempBuffer[ULOC_FULLNAME_CAPACITY];
766	const char* tmpLocaleID;
767
768	if (keywordName == nullptr \|\| keywordName[`0`] == `0`) {
769	*status = U_ILLEGAL_ARGUMENT_ERROR;
770	return;
771	}
772
773	locale_canonKeywordName(keywordNameBuffer, keywordName, status);
774	if(U_FAILURE(*status)) {
775	return;
776	}
777
778	if (_hasBCP47Extension(localeID)) {
779	tmpLocaleID = _ConvertBCP47(localeID, tempBuffer,
780	sizeof(tempBuffer), status, nullptr);
781	} else {
782	tmpLocaleID=localeID;
783	}
784
785	startSearchHere = locale_getKeywordsStart(tmpLocaleID);
786	if(startSearchHere == nullptr) {
787	/ no keywords, return at once /
788	return;
789	}
790
791	/ find the first keyword /
792	while(startSearchHere) {
793	const char* keyValueTail;
794	int32_t keyValueLen;
795
796	startSearchHere++; / skip @ or ; /
797	nextSeparator = uprv_strchr(startSearchHere, `'='`);
798	if(!nextSeparator) {
799	status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value /
800	return;
801	}
802	/ strip leading & trailing spaces (TC decided to tolerate these) /
803	while(*startSearchHere == `' '`) {
804	startSearchHere++;
805	}
806	keyValueTail = nextSeparator;
807	while (keyValueTail > startSearchHere && *(keyValueTail-`1`) == `' '`) {
808	keyValueTail--;
809	}
810	/ now keyValueTail points to first char after the keyName /
811	/ copy & normalize keyName from locale /
812	if (startSearchHere == keyValueTail) {
813	status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale /
814	return;
815	}
816	keyValueLen = `0`;
817	while (startSearchHere < keyValueTail) {
818	if (!UPRV_ISALPHANUM(*startSearchHere)) {
819	status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name /
820	return;
821	}
822	if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - `1`) {
823	localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
824	} else {
825	/ keyword name too long for internal buffer /
826	*status = U_INTERNAL_PROGRAM_ERROR;
827	return;
828	}
829	}
830	localeKeywordNameBuffer[keyValueLen] = `0`; / terminate /
831
832	startSearchHere = uprv_strchr(nextSeparator, `';'`);
833
834	if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == `0`) {
835	/ current entry matches the keyword. /
836	nextSeparator++; / skip '=' /
837	/ First strip leading & trailing spaces (TC decided to tolerate these) /
838	while(*nextSeparator == `' '`) {
839	nextSeparator++;
840	}
841	keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
842	while(keyValueTail > nextSeparator && *(keyValueTail-`1`) == `' '`) {
843	keyValueTail--;
844	}
845	/ Now copy the value, but check well-formedness /
846	if (nextSeparator == keyValueTail) {
847	status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale /
848	return;
849	}
850	while (nextSeparator < keyValueTail) {
851	if (!UPRV_ISALPHANUM(nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(nextSeparator)) {
852	status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value /
853	return;
854	}
855	/ Should we lowercase value to return here? Tests expect as-is. /
856	sink.Append(nextSeparator++, `1`);
857	}
858	return;
859	}
860	}
861	}
862	}
863
864	U_CAPI int32_t U_EXPORT2
865	uloc_setKeywordValue(const char* keywordName,
866	const char* keywordValue,
867	char* buffer, int32_t bufferCapacity,
868	UErrorCode* status)
869	{
870	/ TODO: sorting. removal. /
871	int32_t keywordNameLen;
872	int32_t keywordValueLen;
873	int32_t bufLen;
874	int32_t needLen = `0`;
875	char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
876	char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+`1`];
877	char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
878	int32_t rc;
879	char* nextSeparator = nullptr;
880	char* nextEqualsign = nullptr;
881	char* startSearchHere = nullptr;
882	char* keywordStart = nullptr;
883	CharString updatedKeysAndValues;
884	UBool handledInputKeyAndValue = false;
885	char keyValuePrefix = `'@'`;
886
887	if(U_FAILURE(*status)) {
888	return -`1`;
889	}
890	if (*status == U_STRING_NOT_TERMINATED_WARNING) {
891	*status = U_ZERO_ERROR;
892	}
893	if (keywordName == nullptr \|\| keywordName[`0`] == `0` \|\| bufferCapacity <= `1`) {
894	*status = U_ILLEGAL_ARGUMENT_ERROR;
895	return `0`;
896	}
897	bufLen = (int32_t)uprv_strlen(buffer);
898	if(bufferCapacity<bufLen) {
899	/ The capacity is less than the length?! Is this NUL terminated? /
900	*status = U_ILLEGAL_ARGUMENT_ERROR;
901	return `0`;
902	}
903	keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
904	if(U_FAILURE(*status)) {
905	return `0`;
906	}
907
908	keywordValueLen = `0`;
909	if(keywordValue) {
910	while (*keywordValue != `0`) {
911	if (!UPRV_ISALPHANUM(keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(keywordValue)) {
912	status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value /
913	return `0`;
914	}
915	if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
916	/ Should we force lowercase in value to set? /
917	keywordValueBuffer[keywordValueLen++] = *keywordValue++;
918	} else {
919	/ keywordValue too long for internal buffer /
920	*status = U_INTERNAL_PROGRAM_ERROR;
921	return `0`;
922	}
923	}
924	}
925	keywordValueBuffer[keywordValueLen] = `0`; / terminate /
926
927	startSearchHere = (char*)locale_getKeywordsStart(buffer);
928	if(startSearchHere == nullptr \|\| (startSearchHere[`1`]==`0`)) {
929	if(keywordValueLen == `0`) { / no keywords = nothing to remove /
930	U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
931	return bufLen;
932	}
933
934	needLen = bufLen+`1`+keywordNameLen+`1`+keywordValueLen;
935	if(startSearchHere) { / had a single @ /
936	needLen--; / already had the @ /
937	/ startSearchHere points at the @ /
938	} else {
939	startSearchHere=buffer+bufLen;
940	}
941	if(needLen >= bufferCapacity) {
942	*status = U_BUFFER_OVERFLOW_ERROR;
943	return needLen; / no change /
944	}
945	*startSearchHere++ = `'@'`;
946	uprv_strcpy(startSearchHere, keywordNameBuffer);
947	startSearchHere += keywordNameLen;
948	*startSearchHere++ = `'='`;
949	uprv_strcpy(startSearchHere, keywordValueBuffer);
950	U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
951	return needLen;
952	} / end shortcut - no @ /
953
954	keywordStart = startSearchHere;
955	/ search for keyword /
956	while(keywordStart) {
957	const char* keyValueTail;
958	int32_t keyValueLen;
959
960	keywordStart++; / skip @ or ; /
961	nextEqualsign = uprv_strchr(keywordStart, `'='`);
962	if (!nextEqualsign) {
963	status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value /
964	return `0`;
965	}
966	/ strip leading & trailing spaces (TC decided to tolerate these) /
967	while(*keywordStart == `' '`) {
968	keywordStart++;
969	}
970	keyValueTail = nextEqualsign;
971	while (keyValueTail > keywordStart && *(keyValueTail-`1`) == `' '`) {
972	keyValueTail--;
973	}
974	/ now keyValueTail points to first char after the keyName /
975	/ copy & normalize keyName from locale /
976	if (keywordStart == keyValueTail) {
977	status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale /
978	return `0`;
979	}
980	keyValueLen = `0`;
981	while (keywordStart < keyValueTail) {
982	if (!UPRV_ISALPHANUM(*keywordStart)) {
983	status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name /
984	return `0`;
985	}
986	if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - `1`) {
987	localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
988	} else {
989	/ keyword name too long for internal buffer /
990	*status = U_INTERNAL_PROGRAM_ERROR;
991	return `0`;
992	}
993	}
994	localeKeywordNameBuffer[keyValueLen] = `0`; / terminate /
995
996	nextSeparator = uprv_strchr(nextEqualsign, `';'`);
997
998	/ start processing the value part /
999	nextEqualsign++; / skip '=' /
1000	/ First strip leading & trailing spaces (TC decided to tolerate these) /
1001	while(*nextEqualsign == `' '`) {
1002	nextEqualsign++;
1003	}
1004	keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
1005	while(keyValueTail > nextEqualsign && *(keyValueTail-`1`) == `' '`) {
1006	keyValueTail--;
1007	}
1008	if (nextEqualsign == keyValueTail) {
1009	status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale /
1010	return `0`;
1011	}
1012
1013	rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1014	if(rc == `0`) {
1015	/ Current entry matches the input keyword. Update the entry /
1016	if(keywordValueLen > `0`) { / updating a value /
1017	updatedKeysAndValues.append(keyValuePrefix, *status);
1018	keyValuePrefix = `';'`; / for any subsequent key-value pair /
1019	updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1020	updatedKeysAndValues.append(`'='`, *status);
1021	updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1022	} / else removing this entry, don't emit anything /
1023	handledInputKeyAndValue = true;
1024	} else {
1025	/ input keyword sorts earlier than current entry, add before current entry /
1026	if (rc < `0` && keywordValueLen > `0` && !handledInputKeyAndValue) {
1027	/ insert new entry at this location /
1028	updatedKeysAndValues.append(keyValuePrefix, *status);
1029	keyValuePrefix = `';'`; / for any subsequent key-value pair /
1030	updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1031	updatedKeysAndValues.append(`'='`, *status);
1032	updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1033	handledInputKeyAndValue = true;
1034	}
1035	/ copy the current entry /
1036	updatedKeysAndValues.append(keyValuePrefix, *status);
1037	keyValuePrefix = `';'`; / for any subsequent key-value pair /
1038	updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
1039	updatedKeysAndValues.append(`'='`, *status);
1040	updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), *status);
1041	}
1042	if (!nextSeparator && keywordValueLen > `0` && !handledInputKeyAndValue) {
1043	/ append new entry at the end, it sorts later than existing entries /
1044	updatedKeysAndValues.append(keyValuePrefix, *status);
1045	/ skip keyValuePrefix update, no subsequent key-value pair /
1046	updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1047	updatedKeysAndValues.append(`'='`, *status);
1048	updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1049	handledInputKeyAndValue = true;
1050	}
1051	keywordStart = nextSeparator;
1052	} / end loop searching /
1053
1054	/ Any error from updatedKeysAndValues.append above would be internal and not due to*
1055	* problems with the passed-in locale. So if we did encounter problems with the
1056	* passed-in locale above, those errors took precedence and overrode any error
1057	* status from updatedKeysAndValues.append, and also caused a return of 0. If there
1058	* are errors here they are from updatedKeysAndValues.append; they do cause an
1059	* error return but the passed-in locale is unmodified and the original bufLen is
1060	* returned.
1061	*/
1062	if (!handledInputKeyAndValue \|\| U_FAILURE(*status)) {
1063	/ if input key/value specified removal of a keyword not present in locale, or*
1064	* there was an error in CharString.append, leave original locale alone. */
1065	U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
1066	return bufLen;
1067	}
1068
1069	// needLen = length of the part before '@'
1070	needLen = (int32_t)(startSearchHere - buffer);
1071	// Check to see can we fit the startSearchHere, if not, return
1072	// U_BUFFER_OVERFLOW_ERROR without copy updatedKeysAndValues into it.
1073	// We do this because this API function does not behave like most others:
1074	// It promises never to set a U_STRING_NOT_TERMINATED_WARNING.
1075	// When the contents fits but without the terminating NUL, in this case we need to not change
1076	// the buffer contents and return with a buffer overflow error.
1077	int32_t appendLength = updatedKeysAndValues.length();
1078	if (appendLength >= bufferCapacity - needLen) {
1079	*status = U_BUFFER_OVERFLOW_ERROR;
1080	return needLen + appendLength;
1081	}
1082	needLen += updatedKeysAndValues.extract(
1083	startSearchHere, bufferCapacity - needLen, *status);
1084	U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
1085	return needLen;
1086	}
1087
/* ### ID parsing implementation **************************************************/

/* True when `a` is one of the letters that can start a special prefix
 * ('x', 'X', 'i', 'I'). Arguments are parenthesized so that expression
 * arguments expand safely. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/* returns true if one of the special prefixes is here (s=string)
   'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a) (((a)==0)||((a)=='.')||((a)=='@'))
1100
1101	/**
1102	* Lookup 'key' in the array 'list'. The array 'list' should contain
1103	* a nullptr entry, followed by more entries, and a second nullptr entry.
1104	*
1105	* The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1106	* COUNTRIES_3.
1107	*/
1108	static int16_t _findIndex(const char* const* list, const char* key)
1109	{
1110	const char* const* anchor = list;
1111	int32_t pass = `0`;
1112
1113	/ Make two passes through two nullptr-terminated arrays at 'list' /
1114	while (pass++ < `2`) {
1115	while (*list) {
1116	if (uprv_strcmp(key, *list) == `0`) {
1117	return (int16_t)(list - anchor);
1118	}
1119	list++;
1120	}
1121	++list; / skip final nullptr CWB/*
1122	}
1123	return -`1`;
1124	}
1125
1126	U_CFUNC const char*
1127	uloc_getCurrentCountryID(const char* oldID){
1128	int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1129	if (offset >= `0`) {
1130	return REPLACEMENT_COUNTRIES[offset];
1131	}
1132	return oldID;
1133	}
1134	U_CFUNC const char*
1135	uloc_getCurrentLanguageID(const char* oldID){
1136	int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1137	if (offset >= `0`) {
1138	return REPLACEMENT_LANGUAGES[offset];
1139	}
1140	return oldID;
1141	}
1142	/*
1143	* the internal functions _getLanguage(), _getCountry(), _getVariant()
1144	* avoid duplicating code to handle the earlier locale ID pieces
1145	* in the functions for the later ones by
1146	* setting the *pEnd pointer to where they stopped parsing
1147	*
1148	* TODO try to use this in Locale
1149	*/
1150	CharString U_EXPORT2
1151	ulocimp_getLanguage(const char *localeID,
1152	const char **pEnd,
1153	UErrorCode &status) {
1154	CharString result;
1155
1156	if (uprv_stricmp(localeID, "root") == `0`) {
1157	localeID += `4`;
1158	} else if (uprv_strnicmp(localeID, "und", `3`) == `0` &&
1159	(localeID[`3`] == `'\0'` \|\|
1160	localeID[`3`] == `'-'` \|\|
1161	localeID[`3`] == `'_'` \|\|
1162	localeID[`3`] == `'@'`)) {
1163	localeID += `3`;
1164	}
1165
1166	/ if it starts with i- or x- then copy that prefix /
1167	if(_isIDPrefix(localeID)) {
1168	result.append((char)uprv_tolower(*localeID), status);
1169	result.append(`'-'`, status);
1170	localeID+=`2`;
1171	}
1172
1173	/ copy the language as far as possible and count its length /
1174	while(!_isTerminator(localeID) && !_isIDSeparator(localeID)) {
1175	result.append((char)uprv_tolower(*localeID), status);
1176	localeID++;
1177	}
1178
1179	if(result.length()==`3`) {
1180	/ convert 3 character code to 2 character code if possible CWB/*
1181	int32_t offset = _findIndex(LANGUAGES_3, result.data());
1182	if(offset>=`0`) {
1183	result.clear();
1184	result.append(LANGUAGES[offset], status);
1185	}
1186	}
1187
1188	if(pEnd!=nullptr) {
1189	*pEnd=localeID;
1190	}
1191
1192	return result;
1193	}
1194
1195	CharString U_EXPORT2
1196	ulocimp_getScript(const char *localeID,
1197	const char **pEnd,
1198	UErrorCode &status) {
1199	CharString result;
1200	int32_t idLen = `0`;
1201
1202	if (pEnd != nullptr) {
1203	*pEnd = localeID;
1204	}
1205
1206	/ copy the second item as far as possible and count its length /
1207	while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1208	&& uprv_isASCIILetter(localeID[idLen])) {
1209	idLen++;
1210	}
1211
1212	/ If it's exactly 4 characters long, then it's a script and not a country. /
1213	if (idLen == `4`) {
1214	int32_t i;
1215	if (pEnd != nullptr) {
1216	*pEnd = localeID+idLen;
1217	}
1218	if (idLen >= `1`) {
1219	result.append((char)uprv_toupper(*(localeID++)), status);
1220	}
1221	for (i = `1`; i < idLen; i++) {
1222	result.append((char)uprv_tolower(*(localeID++)), status);
1223	}
1224	}
1225
1226	return result;
1227	}
1228
1229	CharString U_EXPORT2
1230	ulocimp_getCountry(const char *localeID,
1231	const char **pEnd,
1232	UErrorCode &status) {
1233	CharString result;
1234	int32_t idLen=`0`;
1235
1236	/ copy the country as far as possible and count its length /
1237	while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1238	result.append((char)uprv_toupper(localeID[idLen]), status);
1239	idLen++;
1240	}
1241
1242	/ the country should be either length 2 or 3 /
1243	if (idLen == `2` \|\| idLen == `3`) {
1244	/ convert 3 character code to 2 character code if possible CWB/*
1245	if(idLen==`3`) {
1246	int32_t offset = _findIndex(COUNTRIES_3, result.data());
1247	if(offset>=`0`) {
1248	result.clear();
1249	result.append(COUNTRIES[offset], status);
1250	}
1251	}
1252	localeID+=idLen;
1253	} else {
1254	result.clear();
1255	}
1256
1257	if(pEnd!=nullptr) {
1258	*pEnd=localeID;
1259	}
1260
1261	return result;
1262	}
1263
1264	/**
1265	* @param needSeparator if true, then add leading '_' if any variants
1266	* are added to 'variant'
1267	*/
1268	static void
1269	_getVariant(const char *localeID,
1270	char prev,
1271	ByteSink& sink,
1272	UBool needSeparator) {
1273	UBool hasVariant = false;
1274
1275	/ get one or more variant tags and separate them with '_' /
1276	if(_isIDSeparator(prev)) {
1277	/ get a variant string after a '-' or '_' /
1278	while(!_isTerminator(*localeID)) {
1279	if (needSeparator) {
1280	sink.Append("_", `1`);
1281	needSeparator = false;
1282	}
1283	char c = (char)uprv_toupper(*localeID);
1284	if (c == `'-'`) c = `'_'`;
1285	sink.Append(&c, `1`);
1286	hasVariant = true;
1287	localeID++;
1288	}
1289	}
1290
1291	/ if there is no variant tag after a '-' or '_' then look for '@' /
1292	if(!hasVariant) {
1293	if(prev==`'@'`) {
1294	/ keep localeID /
1295	} else if((localeID=locale_getKeywordsStart(localeID))!=nullptr) {
1296	++localeID; / point after the '@' /
1297	} else {
1298	return;
1299	}
1300	while(!_isTerminator(*localeID)) {
1301	if (needSeparator) {
1302	sink.Append("_", `1`);
1303	needSeparator = false;
1304	}
1305	char c = (char)uprv_toupper(*localeID);
1306	if (c == `'-'` \|\| c == `','`) c = `'_'`;
1307	sink.Append(&c, `1`);
1308	localeID++;
1309	}
1310	}
1311	}
1312
/* Keyword enumeration */

/** State behind a keyword UEnumeration (stored in UEnumeration::context). */
typedef struct UKeywordsContext {
    /* Heap copy of the keyword list: consecutive NUL-terminated names,
     * ended by an empty string. Released with uprv_free() on close. */
    char* keywords;
    /* Iteration cursor into `keywords`; reset points it back to the start. */
    char* current;
} UKeywordsContext;
1319
1320	U_CDECL_BEGIN
1321
1322	static void U_CALLCONV
1323	uloc_kw_closeKeywords(UEnumeration *enumerator) {
1324	uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1325	uprv_free(enumerator->context);
1326	uprv_free(enumerator);
1327	}
1328
1329	static int32_t U_CALLCONV
1330	uloc_kw_countKeywords(UEnumeration en, UErrorCode /status/) {
1331	char kw = ((UKeywordsContext )en->context)->keywords;
1332	int32_t result = `0`;
1333	while(*kw) {
1334	result++;
1335	kw += uprv_strlen(kw)+`1`;
1336	}
1337	return result;
1338	}
1339
1340	static const char * U_CALLCONV
1341	uloc_kw_nextKeyword(UEnumeration* en,
1342	int32_t* resultLength,
1343	UErrorCode* /status/) {
1344	const char* result = ((UKeywordsContext *)en->context)->current;
1345	int32_t len = `0`;
1346	if(*result) {
1347	len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1348	((UKeywordsContext *)en->context)->current += len+`1`;
1349	} else {
1350	result = nullptr;
1351	}
1352	if (resultLength) {
1353	*resultLength = len;
1354	}
1355	return result;
1356	}
1357
1358	static void U_CALLCONV
1359	uloc_kw_resetKeywords(UEnumeration* en,
1360	UErrorCode* /status/) {
1361	((UKeywordsContext )en->context)->current = ((UKeywordsContext )en->context)->keywords;
1362	}
1363
1364	U_CDECL_END
1365
1366
1367	static const UEnumeration gKeywordsEnum = {
1368	nullptr,
1369	nullptr,
1370	uloc_kw_closeKeywords,
1371	uloc_kw_countKeywords,
1372	uenum_unextDefault,
1373	uloc_kw_nextKeyword,
1374	uloc_kw_resetKeywords
1375	};
1376
1377	U_CAPI UEnumeration* U_EXPORT2
1378	uloc_openKeywordList(const char keywordList, int32_t keywordListSize, UErrorCode status)
1379	{
1380	LocalMemory<UKeywordsContext> myContext;
1381	LocalMemory<UEnumeration> result;
1382
1383	if (U_FAILURE(*status)) {
1384	return nullptr;
1385	}
1386	myContext.adoptInstead(static_cast<UKeywordsContext >(uprv_malloc(sizeof*(UKeywordsContext))));
1387	result.adoptInstead(static_cast<UEnumeration >(uprv_malloc(sizeof*(UEnumeration))));
1388	if (myContext.isNull() \|\| result.isNull()) {
1389	*status = U_MEMORY_ALLOCATION_ERROR;
1390	return nullptr;
1391	}
1392	uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));
1393	myContext ->keywords = static_cast<char *>(uprv_malloc(keywordListSize+`1`));
1394	if (myContext ->keywords == nullptr) {
1395	*status = U_MEMORY_ALLOCATION_ERROR;
1396	return nullptr;
1397	}
1398	uprv_memcpy(myContext ->keywords, keywordList, keywordListSize);
1399	myContext ->keywords[keywordListSize] = `0`;
1400	myContext ->current = myContext ->keywords;
1401	result ->context = myContext.orphan();
1402	return result.orphan();
1403	}
1404
1405	U_CAPI UEnumeration* U_EXPORT2
1406	uloc_openKeywords(const char* localeID,
1407	UErrorCode* status)
1408	{
1409	char tempBuffer[ULOC_FULLNAME_CAPACITY];
1410	const char* tmpLocaleID;
1411
1412	if(status==nullptr \|\| U_FAILURE(*status)) {
1413	return `0`;
1414	}
1415
1416	if (_hasBCP47Extension(localeID)) {
1417	tmpLocaleID = _ConvertBCP47(localeID, tempBuffer,
1418	sizeof(tempBuffer), status, nullptr);
1419	} else {
1420	if (localeID==nullptr) {
1421	localeID=uloc_getDefault();
1422	}
1423	tmpLocaleID=localeID;
1424	}
1425
1426	/ Skip the language /
1427	ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *status);
1428	if (U_FAILURE(*status)) {
1429	return `0`;
1430	}
1431
1432	if(_isIDSeparator(*tmpLocaleID)) {
1433	const char *scriptID;
1434	/ Skip the script if available /
1435	ulocimp_getScript(tmpLocaleID+`1`, &scriptID, *status);
1436	if (U_FAILURE(*status)) {
1437	return `0`;
1438	}
1439	if(scriptID != tmpLocaleID+`1`) {
1440	/ Found optional script /
1441	tmpLocaleID = scriptID;
1442	}
1443	/ Skip the Country /
1444	if (_isIDSeparator(*tmpLocaleID)) {
1445	ulocimp_getCountry(tmpLocaleID+`1`, &tmpLocaleID, *status);
1446	if (U_FAILURE(*status)) {
1447	return `0`;
1448	}
1449	}
1450	}
1451
1452	/ keywords are located after '@' /
1453	if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != nullptr) {
1454	CharString keywords;
1455	CharStringByteSink sink(&keywords);
1456	ulocimp_getKeywords(tmpLocaleID+`1`, `'@'`, sink, false, status);
1457	if (U_FAILURE(*status)) {
1458	return nullptr;
1459	}
1460	return uloc_openKeywordList(keywords.data(), keywords.length(), status);
1461	}
1462	return nullptr;
1463	}
1464
1465
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* True when any bit of `mask` is set in `options`. Both arguments are
 * parenthesized so that compound expressions (e.g. A | B) expand correctly. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The tag "i-default"; deliberately not NUL-terminated, so its array length
 * equals the tag length. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1474
1475	/**
1476	* Canonicalize the given localeID, to level 1 or to level 2,
1477	* depending on the options. To specify level 1, pass in options=0.
1478	* To specify level 2, pass in options=_ULOC_CANONICALIZE.
1479	*
1480	* This is the code underlying uloc_getName and uloc_canonicalize.
1481	*/
1482	static void
1483	_canonicalize(const char* localeID,
1484	ByteSink& sink,
1485	uint32_t options,
1486	UErrorCode* err) {
1487	if (U_FAILURE(*err)) {
1488	return;
1489	}
1490
1491	int32_t j, fieldCount=`0`, scriptSize=`0`, variantSize=`0`;
1492	PreflightingLocaleIDBuffer tempBuffer; // if localeID has a BCP47 extension, tmpLocaleID points to this
1493	CharString localeIDWithHyphens; // if localeID has a BPC47 extension and have _, tmpLocaleID points to this
1494	const char* origLocaleID;
1495	const char* tmpLocaleID;
1496	const char* keywordAssign = nullptr;
1497	const char* separatorIndicator = nullptr;
1498
1499	if (_hasBCP47Extension(localeID)) {
1500	const char* localeIDPtr = localeID;
1501
1502	// convert all underbars to hyphens, unless the "BCP47 extension" comes at the beginning of the string
1503	if (uprv_strchr(localeID, `'_'`) != nullptr && localeID[`1`] != `'-'` && localeID[`1`] != `'_'`) {
1504	localeIDWithHyphens.append(localeID, -`1`, *err);
1505	if (U_SUCCESS(*err)) {
1506	for (char* p = localeIDWithHyphens.data(); *p != `'\0'`; ++p) {
1507	if (*p == `'_'`) {
1508	*p = `'-'`;
1509	}
1510	}
1511	localeIDPtr = localeIDWithHyphens.data();
1512	}
1513	}
1514
1515	do {
1516	// After this call tmpLocaleID may point to localeIDPtr which may
1517	// point to either localeID or localeIDWithHyphens.data().
1518	tmpLocaleID = _ConvertBCP47(localeIDPtr, tempBuffer.getBuffer(),
1519	tempBuffer.getCapacity(), err,
1520	&(tempBuffer.requestedCapacity));
1521	} while (tempBuffer.needToTryAgain(err));
1522	} else {
1523	if (localeID==nullptr) {
1524	localeID=uloc_getDefault();
1525	}
1526	tmpLocaleID=localeID;
1527	}
1528
1529	origLocaleID=tmpLocaleID;
1530
1531	/ get all pieces, one after another, and separate with '_' /
1532	CharString tag = ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);
1533
1534	if (tag.length() == I_DEFAULT_LENGTH &&
1535	uprv_strncmp(origLocaleID, i_default, I_DEFAULT_LENGTH) == `0`) {
1536	tag.clear();
1537	tag.append(uloc_getDefault(), *err);
1538	} else if(_isIDSeparator(*tmpLocaleID)) {
1539	const char *scriptID;
1540
1541	++fieldCount;
1542	tag.append(`'_'`, *err);
1543
1544	CharString script = ulocimp_getScript(tmpLocaleID+`1`, &scriptID, *err);
1545	tag.append(script, *err);
1546	scriptSize = script.length();
1547	if(scriptSize > `0`) {
1548	/ Found optional script /
1549	tmpLocaleID = scriptID;
1550	++fieldCount;
1551	if (_isIDSeparator(*tmpLocaleID)) {
1552	/ If there is something else, then we add the _ /
1553	tag.append(`'_'`, *err);
1554	}
1555	}
1556
1557	if (_isIDSeparator(*tmpLocaleID)) {
1558	const char *cntryID;
1559
1560	CharString country = ulocimp_getCountry(tmpLocaleID+`1`, &cntryID, *err);
1561	tag.append(country, *err);
1562	if (!country.isEmpty()) {
1563	/ Found optional country /
1564	tmpLocaleID = cntryID;
1565	}
1566	if(_isIDSeparator(*tmpLocaleID)) {
1567	/ If there is something else, then we add the _ if we found country before. /
1568	if (!_isIDSeparator(*(tmpLocaleID+`1`))) {
1569	++fieldCount;
1570	tag.append(`'_'`, *err);
1571	}
1572
1573	variantSize = -tag.length();
1574	{
1575	CharStringByteSink s(&tag);
1576	_getVariant(tmpLocaleID+`1`, tmpLocaleID, s, false*);
1577	}
1578	variantSize += tag.length();
1579	if (variantSize > `0`) {
1580	tmpLocaleID += variantSize + `1`; / skip '_' and variant /
1581	}
1582	}
1583	}
1584	}
1585
1586	/ Copy POSIX-style charset specifier, if any [mr.utf8] /
1587	if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == `'.'`) {
1588	UBool done = false;
1589	do {
1590	char c = *tmpLocaleID;
1591	switch (c) {
1592	case `0`:
1593	case `'@'`:
1594	done = true;
1595	break;
1596	default:
1597	tag.append(c, *err);
1598	++tmpLocaleID;
1599	break;
1600	}
1601	} while (!done);
1602	}
1603
1604	/ Scan ahead to next '@' and determine if it is followed by '=' and/or ';'*
1605	After this, tmpLocaleID either points to '@' or is nullptr /*
1606	if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=nullptr) {
1607	keywordAssign = uprv_strchr(tmpLocaleID, `'='`);
1608	separatorIndicator = uprv_strchr(tmpLocaleID, `';'`);
1609	}
1610
1611	/ Copy POSIX-style variant, if any [mr@FOO] /
1612	if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1613	tmpLocaleID != nullptr && keywordAssign == nullptr) {
1614	for (;;) {
1615	char c = *tmpLocaleID;
1616	if (c == `0`) {
1617	break;
1618	}
1619	tag.append(c, *err);
1620	++tmpLocaleID;
1621	}
1622	}
1623
1624	if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1625	/ Handle @FOO variant if @ is present and not followed by = /
1626	if (tmpLocaleID!=nullptr && keywordAssign==nullptr) {
1627	/ Add missing '_' if needed /
1628	if (fieldCount < `2` \|\| (fieldCount < `3` && scriptSize > `0`)) {
1629	do {
1630	tag.append(`'_'`, *err);
1631	++fieldCount;
1632	} while(fieldCount<`2`);
1633	}
1634
1635	int32_t posixVariantSize = -tag.length();
1636	{
1637	CharStringByteSink s(&tag);
1638	_getVariant(tmpLocaleID+`1`, `'@'`, s, (UBool)(variantSize > `0`));
1639	}
1640	posixVariantSize += tag.length();
1641	if (posixVariantSize > `0`) {
1642	variantSize += posixVariantSize;
1643	}
1644	}
1645
1646	/ Look up the ID in the canonicalization map /
1647	for (j=`0`; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1648	StringPiece id(CANONICALIZE_MAP[j].id);
1649	if (tag == id) {
1650	if (id.empty() && tmpLocaleID != nullptr) {
1651	break; / Don't remap "" if keywords present /
1652	}
1653	tag.clear();
1654	tag.append(CANONICALIZE_MAP[j].canonicalID, *err);
1655	break;
1656	}
1657	}
1658	}
1659
1660	sink.Append(tag.data(), tag.length());
1661
1662	if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1663	if (tmpLocaleID!=nullptr && keywordAssign!=nullptr &&
1664	(!separatorIndicator \|\| separatorIndicator > keywordAssign)) {
1665	sink.Append("@", `1`);
1666	++fieldCount;
1667	ulocimp_getKeywords(tmpLocaleID+`1`, `'@'`, sink, true, err);
1668	}
1669	}
1670	}
1671
/* ### ID parsing API **************************************************/
1673
1674	U_CAPI int32_t U_EXPORT2
1675	uloc_getParent(const char* localeID,
1676	char* parent,
1677	int32_t parentCapacity,
1678	UErrorCode* err)
1679	{
1680	const char *lastUnderscore;
1681	int32_t i;
1682
1683	if (U_FAILURE(*err))
1684	return `0`;
1685
1686	if (localeID == nullptr)
1687	localeID = uloc_getDefault();
1688
1689	lastUnderscore=uprv_strrchr(localeID, `'_'`);
1690	if(lastUnderscore!=nullptr) {
1691	i=(int32_t)(lastUnderscore-localeID);
1692	} else {
1693	i=`0`;
1694	}
1695
1696	if (i > `0`) {
1697	if (uprv_strnicmp(localeID, "und_", `4`) == `0`) {
1698	localeID += `3`;
1699	i -= `3`;
1700	uprv_memmove(parent, localeID, uprv_min(i, parentCapacity));
1701	} else if (parent != localeID) {
1702	uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1703	}
1704	}
1705
1706	return u_terminateChars(parent, parentCapacity, i, err);
1707	}
1708
1709	U_CAPI int32_t U_EXPORT2
1710	uloc_getLanguage(const char* localeID,
1711	char* language,
1712	int32_t languageCapacity,
1713	UErrorCode* err)
1714	{
1715	/ uloc_getLanguage will return a 2 character iso-639 code if one exists. CWB/*
1716
1717	if (err==nullptr \|\| U_FAILURE(*err)) {
1718	return `0`;
1719	}
1720
1721	if(localeID==nullptr) {
1722	localeID=uloc_getDefault();
1723	}
1724
1725	return ulocimp_getLanguage(localeID, nullptr, err).extract(language, languageCapacity, err);
1726	}
1727
1728	U_CAPI int32_t U_EXPORT2
1729	uloc_getScript(const char* localeID,
1730	char* script,
1731	int32_t scriptCapacity,
1732	UErrorCode* err)
1733	{
1734	if(err==nullptr \|\| U_FAILURE(*err)) {
1735	return `0`;
1736	}
1737
1738	if(localeID==nullptr) {
1739	localeID=uloc_getDefault();
1740	}
1741
1742	/ skip the language /
1743	ulocimp_getLanguage(localeID, &localeID, *err);
1744	if (U_FAILURE(*err)) {
1745	return `0`;
1746	}
1747
1748	if(_isIDSeparator(*localeID)) {
1749	return ulocimp_getScript(localeID+`1`, nullptr, err).extract(script, scriptCapacity, err);
1750	}
1751	return u_terminateChars(script, scriptCapacity, `0`, err);
1752	}
1753
1754	U_CAPI int32_t U_EXPORT2
1755	uloc_getCountry(const char* localeID,
1756	char* country,
1757	int32_t countryCapacity,
1758	UErrorCode* err)
1759	{
1760	if(err==nullptr \|\| U_FAILURE(*err)) {
1761	return `0`;
1762	}
1763
1764	if(localeID==nullptr) {
1765	localeID=uloc_getDefault();
1766	}
1767
1768	/ Skip the language /
1769	ulocimp_getLanguage(localeID, &localeID, *err);
1770	if (U_FAILURE(*err)) {
1771	return `0`;
1772	}
1773
1774	if(_isIDSeparator(*localeID)) {
1775	const char *scriptID;
1776	/ Skip the script if available /
1777	ulocimp_getScript(localeID+`1`, &scriptID, *err);
1778	if (U_FAILURE(*err)) {
1779	return `0`;
1780	}
1781	if(scriptID != localeID+`1`) {
1782	/ Found optional script /
1783	localeID = scriptID;
1784	}
1785	if(_isIDSeparator(*localeID)) {
1786	return ulocimp_getCountry(localeID+`1`, nullptr, err).extract(country, countryCapacity, err);
1787	}
1788	}
1789	return u_terminateChars(country, countryCapacity, `0`, err);
1790	}
1791
1792	U_CAPI int32_t U_EXPORT2
1793	uloc_getVariant(const char* localeID,
1794	char* variant,
1795	int32_t variantCapacity,
1796	UErrorCode* err)
1797	{
1798	char tempBuffer[ULOC_FULLNAME_CAPACITY];
1799	const char* tmpLocaleID;
1800	int32_t i=`0`;
1801
1802	if(err==nullptr \|\| U_FAILURE(*err)) {
1803	return `0`;
1804	}
1805
1806	if (_hasBCP47Extension(localeID)) {
1807	tmpLocaleID =_ConvertBCP47(localeID, tempBuffer, sizeof(tempBuffer), err, nullptr);
1808	} else {
1809	if (localeID==nullptr) {
1810	localeID=uloc_getDefault();
1811	}
1812	tmpLocaleID=localeID;
1813	}
1814
1815	/ Skip the language /
1816	ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);
1817	if (U_FAILURE(*err)) {
1818	return `0`;
1819	}
1820
1821	if(_isIDSeparator(*tmpLocaleID)) {
1822	const char *scriptID;
1823	/ Skip the script if available /
1824	ulocimp_getScript(tmpLocaleID+`1`, &scriptID, *err);
1825	if (U_FAILURE(*err)) {
1826	return `0`;
1827	}
1828	if(scriptID != tmpLocaleID+`1`) {
1829	/ Found optional script /
1830	tmpLocaleID = scriptID;
1831	}
1832	/ Skip the Country /
1833	if (_isIDSeparator(*tmpLocaleID)) {
1834	const char *cntryID;
1835	ulocimp_getCountry(tmpLocaleID+`1`, &cntryID, *err);
1836	if (U_FAILURE(*err)) {
1837	return `0`;
1838	}
1839	if (cntryID != tmpLocaleID+`1`) {
1840	/ Found optional country /
1841	tmpLocaleID = cntryID;
1842	}
1843	if(_isIDSeparator(*tmpLocaleID)) {
1844	/ If there was no country ID, skip a possible extra IDSeparator /
1845	if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[`1`])) {
1846	tmpLocaleID++;
1847	}
1848
1849	CheckedArrayByteSink sink(variant, variantCapacity);
1850	_getVariant(tmpLocaleID+`1`, tmpLocaleID, sink, false*);
1851
1852	i = sink.NumberOfBytesAppended();
1853
1854	if (U_FAILURE(*err)) {
1855	return i;
1856	}
1857
1858	if (sink.Overflowed()) {
1859	*err = U_BUFFER_OVERFLOW_ERROR;
1860	return i;
1861	}
1862	}
1863	}
1864	}
1865
1866	return u_terminateChars(variant, variantCapacity, i, err);
1867	}
1868
1869	U_CAPI int32_t U_EXPORT2
1870	uloc_getName(const char* localeID,
1871	char* name,
1872	int32_t nameCapacity,
1873	UErrorCode* err)
1874	{
1875	if (U_FAILURE(*err)) {
1876	return `0`;
1877	}
1878
1879	CheckedArrayByteSink sink(name, nameCapacity);
1880	ulocimp_getName(localeID, sink, err);
1881
1882	int32_t reslen = sink.NumberOfBytesAppended();
1883
1884	if (U_FAILURE(*err)) {
1885	return reslen;
1886	}
1887
1888	if (sink.Overflowed()) {
1889	*err = U_BUFFER_OVERFLOW_ERROR;
1890	} else {
1891	u_terminateChars(name, nameCapacity, reslen, err);
1892	}
1893
1894	return reslen;
1895	}
1896
1897	U_CAPI void U_EXPORT2
1898	ulocimp_getName(const char* localeID,
1899	ByteSink& sink,
1900	UErrorCode* err)
1901	{
1902	_canonicalize(localeID, sink, `0`, err);
1903	}
1904
1905	U_CAPI int32_t U_EXPORT2
1906	uloc_getBaseName(const char* localeID,
1907	char* name,
1908	int32_t nameCapacity,
1909	UErrorCode* err)
1910	{
1911	if (U_FAILURE(*err)) {
1912	return `0`;
1913	}
1914
1915	CheckedArrayByteSink sink(name, nameCapacity);
1916	ulocimp_getBaseName(localeID, sink, err);
1917
1918	int32_t reslen = sink.NumberOfBytesAppended();
1919
1920	if (U_FAILURE(*err)) {
1921	return reslen;
1922	}
1923
1924	if (sink.Overflowed()) {
1925	*err = U_BUFFER_OVERFLOW_ERROR;
1926	} else {
1927	u_terminateChars(name, nameCapacity, reslen, err);
1928	}
1929
1930	return reslen;
1931	}
1932
1933	U_CAPI void U_EXPORT2
1934	ulocimp_getBaseName(const char* localeID,
1935	ByteSink& sink,
1936	UErrorCode* err)
1937	{
1938	_canonicalize(localeID, sink, _ULOC_STRIP_KEYWORDS, err);
1939	}
1940
1941	U_CAPI int32_t U_EXPORT2
1942	uloc_canonicalize(const char* localeID,
1943	char* name,
1944	int32_t nameCapacity,
1945	UErrorCode* err)
1946	{
1947	if (U_FAILURE(*err)) {
1948	return `0`;
1949	}
1950
1951	CheckedArrayByteSink sink(name, nameCapacity);
1952	ulocimp_canonicalize(localeID, sink, err);
1953
1954	int32_t reslen = sink.NumberOfBytesAppended();
1955
1956	if (U_FAILURE(*err)) {
1957	return reslen;
1958	}
1959
1960	if (sink.Overflowed()) {
1961	*err = U_BUFFER_OVERFLOW_ERROR;
1962	} else {
1963	u_terminateChars(name, nameCapacity, reslen, err);
1964	}
1965
1966	return reslen;
1967	}
1968
1969	U_CAPI void U_EXPORT2
1970	ulocimp_canonicalize(const char* localeID,
1971	ByteSink& sink,
1972	UErrorCode* err)
1973	{
1974	_canonicalize(localeID, sink, _ULOC_CANONICALIZE, err);
1975	}
1976
1977	U_CAPI const char* U_EXPORT2
1978	uloc_getISO3Language(const char* localeID)
1979	{
1980	int16_t offset;
1981	char lang[ULOC_LANG_CAPACITY];
1982	UErrorCode err = U_ZERO_ERROR;
1983
1984	if (localeID == nullptr)
1985	{
1986	localeID = uloc_getDefault();
1987	}
1988	uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
1989	if (U_FAILURE(err))
1990	return "";
1991	offset = _findIndex(LANGUAGES, lang);
1992	if (offset < `0`)
1993	return "";
1994	return LANGUAGES_3[offset];
1995	}
1996
1997	U_CAPI const char* U_EXPORT2
1998	uloc_getISO3Country(const char* localeID)
1999	{
2000	int16_t offset;
2001	char cntry[ULOC_LANG_CAPACITY];
2002	UErrorCode err = U_ZERO_ERROR;
2003
2004	if (localeID == nullptr)
2005	{
2006	localeID = uloc_getDefault();
2007	}
2008	uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2009	if (U_FAILURE(err))
2010	return "";
2011	offset = _findIndex(COUNTRIES, cntry);
2012	if (offset < `0`)
2013	return "";
2014
2015	return COUNTRIES_3[offset];
2016	}
2017
2018	U_CAPI uint32_t U_EXPORT2
2019	uloc_getLCID(const char* localeID)
2020	{
2021	UErrorCode status = U_ZERO_ERROR;
2022	char langID[ULOC_FULLNAME_CAPACITY];
2023	uint32_t lcid = `0`;
2024
2025	/ Check for incomplete id. /
2026	if (!localeID \|\| uprv_strlen(localeID) < `2`) {
2027	return `0`;
2028	}
2029
2030	// First, attempt Windows platform lookup if available, but fall
2031	// through to catch any special cases (ICU vs Windows name differences).
2032	lcid = uprv_convertToLCIDPlatform(localeID, &status);
2033	if (U_FAILURE(status)) {
2034	return `0`;
2035	}
2036	if (lcid > `0`) {
2037	// Windows found an LCID, return that
2038	return lcid;
2039	}
2040
2041	uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2042	if (U_FAILURE(status) \|\| status == U_STRING_NOT_TERMINATED_WARNING) {
2043	return `0`;
2044	}
2045
2046	if (uprv_strchr(localeID, `'@'`)) {
2047	// uprv_convertToLCID does not support keywords other than collation.
2048	// Remove all keywords except collation.
2049	int32_t len;
2050	char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2051
2052	CharString collVal;
2053	{
2054	CharStringByteSink sink(&collVal);
2055	ulocimp_getKeywordValue(localeID, "collation", sink, &status);
2056	}
2057
2058	if (U_SUCCESS(status) && !collVal.isEmpty()) {
2059	len = uloc_getBaseName(localeID, tmpLocaleID,
2060	UPRV_LENGTHOF(tmpLocaleID) - `1`, &status);
2061
2062	if (U_SUCCESS(status) && len > `0`) {
2063	tmpLocaleID[len] = `0`;
2064
2065	len = uloc_setKeywordValue("collation", collVal.data(), tmpLocaleID,
2066	UPRV_LENGTHOF(tmpLocaleID) - len - `1`, &status);
2067
2068	if (U_SUCCESS(status) && len > `0`) {
2069	tmpLocaleID[len] = `0`;
2070	return uprv_convertToLCID(langID, tmpLocaleID, &status);
2071	}
2072	}
2073	}
2074
2075	// fall through - all keywords are simply ignored
2076	status = U_ZERO_ERROR;
2077	}
2078
2079	return uprv_convertToLCID(langID, localeID, &status);
2080	}
2081
2082	U_CAPI int32_t U_EXPORT2
2083	uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2084	UErrorCode *status)
2085	{
2086	return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2087	}
2088
/* ### Default locale **************************************************/
2090
2091	U_CAPI const char* U_EXPORT2
2092	uloc_getDefault()
2093	{
2094	return locale_get_default();
2095	}
2096
2097	U_CAPI void U_EXPORT2
2098	uloc_setDefault(const char* newDefaultLocale,
2099	UErrorCode* err)
2100	{
2101	if (U_FAILURE(*err))
2102	return;
2103	/ the error code isn't currently used for anything by this function/
2104
2105	/ propagate change to C++ /
2106	locale_set_default(newDefaultLocale);
2107	}
2108
2109	/**
2110	* Returns a list of all 2-letter language codes defined in ISO 639. This is a pointer
2111	* to an array of pointers to arrays of char. All of these pointers are owned
2112	* by ICU-- do not delete them, and do not write through them. The array is
2113	* terminated with a null pointer.
2114	*/
2115	U_CAPI const char* const* U_EXPORT2
2116	uloc_getISOLanguages()
2117	{
2118	return LANGUAGES;
2119	}
2120
2121	/**
2122	* Returns a list of all 2-letter country codes defined in ISO 639. This is a
2123	* pointer to an array of pointers to arrays of char. All of these pointers are
2124	* owned by ICU-- do not delete them, and do not write through them. The array is
2125	* terminated with a null pointer.
2126	*/
2127	U_CAPI const char* const* U_EXPORT2
2128	uloc_getISOCountries()
2129	{
2130	return COUNTRIES;
2131	}
2132
2133	U_CAPI const char* U_EXPORT2
2134	uloc_toUnicodeLocaleKey(const char* keyword)
2135	{
2136	const char* bcpKey = ulocimp_toBcpKey(keyword);
2137	if (bcpKey == nullptr && ultag_isUnicodeLocaleKey(keyword, -`1`)) {
2138	// unknown keyword, but syntax is fine..
2139	return keyword;
2140	}
2141	return bcpKey;
2142	}
2143
2144	U_CAPI const char* U_EXPORT2
2145	uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2146	{
2147	const char* bcpType = ulocimp_toBcpType(keyword, value, nullptr, nullptr);
2148	if (bcpType == nullptr && ultag_isUnicodeLocaleType(value, -`1`)) {
2149	// unknown keyword, but syntax is fine..
2150	return value;
2151	}
2152	return bcpType;
2153	}
2154
2155	static UBool
2156	isWellFormedLegacyKey(const char* legacyKey)
2157	{
2158	const char* p = legacyKey;
2159	while (*p) {
2160	if (!UPRV_ISALPHANUM(*p)) {
2161	return false;
2162	}
2163	p++;
2164	}
2165	return true;
2166	}
2167
2168	static UBool
2169	isWellFormedLegacyType(const char* legacyType)
2170	{
2171	const char* p = legacyType;
2172	int32_t alphaNumLen = `0`;
2173	while (*p) {
2174	if (p == `'_'` \|\| p == `'/'` \|\| *p == `'-'`) {
2175	if (alphaNumLen == `0`) {
2176	return false;
2177	}
2178	alphaNumLen = `0`;
2179	} else if (UPRV_ISALPHANUM(*p)) {
2180	alphaNumLen++;
2181	} else {
2182	return false;
2183	}
2184	p++;
2185	}
2186	return (alphaNumLen != `0`);
2187	}
2188
2189	U_CAPI const char* U_EXPORT2
2190	uloc_toLegacyKey(const char* keyword)
2191	{
2192	const char* legacyKey = ulocimp_toLegacyKey(keyword);
2193	if (legacyKey == nullptr) {
2194	// Checks if the specified locale key is well-formed with the legacy locale syntax.
2195	//
2196	// Note:
2197	// LDML/CLDR provides some definition of keyword syntax in
2198	// http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and*
2199	// http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax*
2200	// Keys can only consist of [0-9a-zA-Z].
2201	if (isWellFormedLegacyKey(keyword)) {
2202	return keyword;
2203	}
2204	}
2205	return legacyKey;
2206	}
2207
2208	U_CAPI const char* U_EXPORT2
2209	uloc_toLegacyType(const char* keyword, const char* value)
2210	{
2211	const char* legacyType = ulocimp_toLegacyType(keyword, value, nullptr, nullptr);
2212	if (legacyType == nullptr) {
2213	// Checks if the specified locale type is well-formed with the legacy locale syntax.
2214	//
2215	// Note:
2216	// LDML/CLDR provides some definition of keyword syntax in
2217	// http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and*
2218	// http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax*
2219	// Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2220	// we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2221	if (isWellFormedLegacyType(value)) {
2222	return value;
2223	}
2224	}
2225	return legacyType;
2226	}
2227
/*eof*/
2229

Browse the source code of Godot/thirdparty/icu4c/common/uloc.cpp