1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (C) 1997-2016, International Business Machines
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8*
9* File ULOC.CPP
10*
11* Modification History:
12*
13* Date Name Description
14* 04/01/97 aliu Creation.
15* 08/21/98 stephen JDK 1.2 sync
16* 12/08/98 rtg New Locale implementation and C API
17* 03/15/99 damiba overhaul.
18* 04/06/99 stephen changed setDefault() to realloc and copy
19* 06/14/99 stephen Changed calls to ures_open for new params
20* 07/21/99 stephen Modified setDefault() to propagate to C++
21* 05/14/04 alan 7 years later: refactored, cleaned up, fixed bugs,
22* brought canonicalization code into line with spec
23*****************************************************************************/
24
25/*
26 POSIX's locale format, from putil.c: [no spaces]
27
28 ll [ _CC ] [ . MM ] [ @ VV]
29
30 l = lang, C = ctry, M = charmap, V = variant
31*/
32
33#include "unicode/bytestream.h"
34#include "unicode/errorcode.h"
35#include "unicode/stringpiece.h"
36#include "unicode/utypes.h"
37#include "unicode/ustring.h"
38#include "unicode/uloc.h"
39
40#include "bytesinkutil.h"
41#include "putilimp.h"
42#include "ustr_imp.h"
43#include "ulocimp.h"
44#include "umutex.h"
45#include "cstring.h"
46#include "cmemory.h"
47#include "locmap.h"
48#include "uarrsort.h"
49#include "uenumimp.h"
50#include "uassert.h"
51#include "charstr.h"
52
53U_NAMESPACE_USE
54
55/* ### Declarations **************************************************/
56
57/* Locale stuff from locid.cpp */
58U_CFUNC void locale_set_default(const char *id);
59U_CFUNC const char *locale_get_default();
60
61/* ### Data tables **************************************************/
62
63/**
64 * Table of language codes, both 2- and 3-letter, with preference
65 * given to 2-letter codes where possible. Includes 3-letter codes
66 * that lack a 2-letter equivalent.
67 *
68 * This list must be in sorted order. This list is returned directly
69 * to the user by some API.
70 *
71 * This list must be kept in sync with LANGUAGES_3, with corresponding
72 * entries matched.
73 *
74 * This table should be terminated with a nullptr entry, followed by a
75 * second list, and another nullptr entry. The first list is visible to
76 * user code when this array is returned by API. The second list
77 * contains codes we support, but do not expose through user API.
78 *
79 * Notes
80 *
81 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
82 * include the revisions up to 2001/7/27 *CWB*
83 *
84 * The 3 character codes are the terminology codes like RFC 3066. This
85 * is compatible with prior ICU codes
86 *
87 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
88 * table but now at the end of the table because 3 character codes are
89 * duplicates. This avoids bad searches going from 3 to 2 character
90 * codes.
91 *
92 * The range qaa-qtz is reserved for local use
93 */
94/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
95/* ISO639 table version is 20150505 */
96/* Subsequent hand addition of selected languages */
static const char * const LANGUAGES[] = {
    /* First list: sorted; entry i corresponds to LANGUAGES_3[i]. */
    "aa", "ab", "ace", "ach", "ada", "ady", "ae", "aeb",
    "af", "afh", "agq", "ain", "ak", "akk", "akz", "ale",
    "aln", "alt", "am", "an", "ang", "anp", "ar", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av", "avk", "awa", "ay", "az",
    "ba", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be", "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgc", "bgn", "bho", "bi", "bik", "bin", "bjn", "bkm", "bla",
    "bm", "bn", "bo", "bpy", "bqi", "br", "bra", "brh",
    "brx", "bs", "bss", "bua", "bug", "bum", "byn", "byv",
    "ca", "cad", "car", "cay", "cch", "ccp", "ce", "ceb", "cgg",
    "ch", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co", "cop", "cps", "cr", "crh",
    "cs", "csb", "cu", "cv", "cy",
    "da", "dak", "dar", "dav", "de", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz", "dzg",
    "ebu", "ee", "efi", "egl", "egy", "eka", "el", "elx",
    "en", "enm", "eo", "es", "esu", "et", "eu", "ewo",
    "ext",
    "fa", "fan", "fat", "ff", "fi", "fil", "fit", "fj",
    "fo", "fon", "fr", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    "ga", "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl", "glk", "gmh", "gn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu", "guc",
    "gur", "guz", "gv", "gwi",
    "ha", "hai", "hak", "haw", "he", "hi", "hif", "hil",
    "hit", "hmn", "ho", "hr", "hsb", "hsn", "ht", "hu",
    "hup", "hy", "hz",
    "ia", "iba", "ibb", "id", "ie", "ig", "ii", "ik",
    "ilo", "inh", "io", "is", "it", "iu", "izh",
    "ja", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    "ka", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg", "kgp",
    "kha", "kho", "khq", "khw", "ki", "kiu", "kj", "kk",
    "kkj", "kl", "kln", "km", "kmb", "kn", "ko", "koi",
    "kok", "kos", "kpe", "kr", "krc", "kri", "krj", "krl",
    "kru", "ks", "ksb", "ksf", "ksh", "ku", "kum", "kut",
    "kv", "kw", "ky",
    "la", "lad", "lag", "lah", "lam", "lb", "lez", "lfn",
    "lg", "li", "lij", "liv", "lkt", "lmo", "ln", "lo",
    "lol", "loz", "lrc", "lt", "ltg", "lu", "lua", "lui",
    "lun", "luo", "lus", "luy", "lv", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg", "mga",
    "mgh", "mgo", "mh", "mi", "mic", "min", "mis", "mk",
    "ml", "mn", "mnc", "mni",
    "moh", "mos", "mr", "mrj",
    "ms", "mt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my", "mye", "myv", "mzn",
    "na", "nan", "nap", "naq", "nb", "nd", "nds", "ne",
    "new", "ng", "nia", "niu", "njo", "nl", "nmg", "nn",
    "nnh", "no", "nog", "non", "nov", "nqo", "nr", "nso",
    "nus", "nv", "nwc", "ny", "nym", "nyn", "nyo", "nzi",
    "oc", "oj", "om", "or", "os", "osa", "ota",
    "pa", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pi", "pl", "pms", "pnt",
    "pon", "prg", "pro", "ps", "pt",
    "qu", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "rm", "rn", "ro",
    "rof", "rom", "rtm", "ru", "rue", "rug", "rup",
    "rw", "rwk",
    "sa", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc", "scn", "sco", "sd", "sdc", "sdh",
    "se", "see", "seh", "sei", "sel", "ses", "sg", "sga",
    "sgs", "shi", "shn", "shu", "si", "sid", "sk",
    "sl", "sli", "sly", "sm", "sma", "smj", "smn", "sms",
    "sn", "snk", "so", "sog", "sq", "sr", "srn", "srr",
    "ss", "ssy", "st", "stq", "su", "suk", "sus", "sux",
    "sv", "sw", "swb", "syc", "syr", "szl",
    "ta", "tcy", "te", "tem", "teo", "ter", "tet", "tg",
    "th", "ti", "tig", "tiv", "tk", "tkl", "tkr",
    "tlh", "tli", "tly", "tmh", "tn", "to", "tog", "tpi",
    "tr", "tru", "trv", "ts", "tsd", "tsi", "tt", "ttt",
    "tum", "tvl", "tw", "twq", "ty", "tyv", "tzm",
    "udm", "ug", "uga", "uk", "umb", "und", "ur", "uz",
    "vai", "ve", "vec", "vep", "vi", "vls", "vmf", "vo",
    "vot", "vro", "vun",
    "wa", "wae", "wal", "war", "was", "wbp", "wo", "wuu",
    "xal", "xh", "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yi", "yo", "yrl", "yue",
    "za", "zap", "zbl", "zea", "zen", "zgh", "zh", "zu",
    "zun", "zxx", "zza",
    nullptr, /* end of the first list */
    /* Second list: codes kept after the first terminator. */
    "in", "iw", "ji", "jw", "mo", "sh", "swc", "tl", /* obsolete language codes */
    nullptr  /* end of the second list */
};
187
/*
 * Withdrawn language codes and their replacements. The two arrays have
 * the same shape (five codes followed by two nullptr entries);
 * presumably entry i of REPLACEMENT_LANGUAGES is the current code for
 * entry i of DEPRECATED_LANGUAGES -- confirm against the lookup code.
 */
static const char* const DEPRECATED_LANGUAGES[] = {
    "in", "iw", "ji", "jw", "mo",
    nullptr, nullptr
};
static const char* const REPLACEMENT_LANGUAGES[] = {
    "id", "he", "yi", "jv", "ro",
    nullptr, nullptr
};
194
195/**
196 * Table of 3-letter language codes.
197 *
198 * This is a lookup table used to convert 3-letter language codes to
199 * their 2-letter equivalent, where possible. It must be kept in sync
200 * with LANGUAGES. For all valid i, LANGUAGES[i] must refer to the
201 * same language as LANGUAGES_3[i]. The commented-out lines are
202 * copied from LANGUAGES to make eyeballing this baby easier.
203 *
204 * Where a 3-letter language code has no 2-letter equivalent, the
205 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
206 *
207 * This table should be terminated with a nullptr entry, followed by a
208 * second list, and another nullptr entry. The two lists correspond to
209 * the two lists in LANGUAGES.
210 */
211/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
212/* ISO639 table version is 20150505 */
213/* Subsequent hand addition of selected languages */
static const char * const LANGUAGES_3[] = {
    /* First list: rows are laid out exactly like the rows of LANGUAGES. */
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgc", "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "chu", "chv", "cym",
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kir",
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni",
    "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    "que", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "syc", "syr", "szl",
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
    "vot", "vro", "vun",
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    "xal", "xho", "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
    nullptr, /* end of the first list */
    /* Second list; the 2-letter counterparts are shown for reference: */
    /* "in", "iw", "ji", "jw", "mo", "sh", "swc", "tl", */
    "ind", "heb", "yid", "jaw", "mol", "srp", "swc", "tgl",
    nullptr  /* end of the second list */
};
305
306/**
307 * Table of 2-letter country codes.
308 *
309 * This list must be in sorted order. This list is returned directly
310 * to the user by some API.
311 *
312 * This list must be kept in sync with COUNTRIES_3, with corresponding
313 * entries matched.
314 *
315 * This table should be terminated with a nullptr entry, followed by a
316 * second list, and another nullptr entry. The first list is visible to
317 * user code when this array is returned by API. The second list
318 * contains codes we support, but do not expose through user API.
319 *
320 * Notes:
321 *
322 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
323 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
324 * new codes keeping the old ones for compatibility updated to include
325 * 1999/12/03 revisions *CWB*
326 *
327 * RO(ROM) is now RO(ROU) according to
328 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
329 */
static const char * const COUNTRIES[] = {
    /* First list: sorted; entry i corresponds to COUNTRIES_3[i]. */
    "AD", "AE", "AF", "AG", "AI", "AL", "AM",
    "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ",
    "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI",
    "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV",
    "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG",
    "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CQ", "CR",
    "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DG", "DJ", "DK",
    "DM", "DO", "DZ", "EA", "EC", "EE", "EG", "EH", "ER",
    "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR",
    "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL",
    "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU",
    "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU",
    "IC", "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS",
    "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI",
    "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA",
    "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU",
    "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK",
    "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS",
    "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA",
    "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP",
    "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG",
    "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT",
    "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA",
    "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ",
    "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV",
    "SX", "SY", "SZ", "TC", "TD", "TF", "TG", "TH", "TJ",
    "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV",
    "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ",
    "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF",
    "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW",
    nullptr, /* end of the first list */
    /* Second list: codes kept after the first terminator. */
    "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR", /* obsolete country codes */
    nullptr  /* end of the second list */
};
365
/*
 * Deprecated country codes and their replacements. Both arrays hold
 * sixteen codes followed by two nullptr entries; the guide comment in
 * REPLACEMENT_COUNTRIES repeats the deprecated codes so the columns can
 * be compared by eye.
 */
static const char* const DEPRECATED_COUNTRIES[] = {
    /* deprecated country list */
    "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH",
    "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR",
    nullptr, nullptr
};
static const char* const REPLACEMENT_COUNTRIES[] = {
    /* replacement country codes */
/*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", */
    "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU",
/*  "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD",
    nullptr, nullptr
};
373
374/**
375 * Table of 3-letter country codes.
376 *
377 * This is a lookup table used to convert 3-letter country codes to
378 * their 2-letter equivalent. It must be kept in sync with COUNTRIES.
379 * For all valid i, COUNTRIES[i] must refer to the same country as
380 * COUNTRIES_3[i]. The commented-out lines are copied from COUNTRIES
381 * to make eyeballing this baby easier.
382 *
383 * This table should be terminated with a nullptr entry, followed by a
384 * second list, and another nullptr entry. The two lists correspond to
385 * the two lists in COUNTRIES.
386 */
static const char * const COUNTRIES_3[] = {
    /* Each commented row repeats the 2-letter codes for the row below it. */
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM", */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ", */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI", */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV", */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG", */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR", */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRQ", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK", */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER", */
    "DMA", "DOM", "DZA", "XEA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR", */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL", */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU", */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU", */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS", */
    "XIC", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI", */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA", */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU", */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK", */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS", */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA", */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP", */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG", */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT", */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA", */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ", */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV", */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ", */
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV", */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ", */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF", */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW", */
    "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
    nullptr, /* end of the first list */
/*  "AN",  "BU",  "CS",  "FX",  "RO",  "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
    nullptr  /* end of the second list */
};
453
/* One row of a locale-ID canonicalization table: a source ID and its canonical form. */
typedef struct CanonicalizationMap {
    /* input ID */
    const char *id;
    /* canonicalized output ID */
    const char *canonicalID;
} CanonicalizationMap;
458
459/**
460 * A map to canonicalize locale IDs. This handles a variety of
461 * different semantic kinds of transformations.
462 */
463static const CanonicalizationMap CANONICALIZE_MAP[] = {
464 { "art__LOJBAN", "jbo" }, /* registered name */
465 { "hy__AREVELA", "hy" }, /* Registered IANA variant */
466 { "hy__AREVMDA", "hyw" }, /* Registered IANA variant */
467 { "zh__GUOYU", "zh" }, /* registered name */
468 { "zh__HAKKA", "hak" }, /* registered name */
469 { "zh__XIANG", "hsn" }, /* registered name */
470 // subtags with 3 chars won't be treated as variants.
471 { "zh_GAN", "gan" }, /* registered name */
472 { "zh_MIN_NAN", "nan" }, /* registered name */
473 { "zh_WUU", "wuu" }, /* registered name */
474 { "zh_YUE", "yue" }, /* registered name */
475};
476
477/* ### BCP47 Conversion *******************************************/
/*
 * Test whether a locale id should be treated as a BCP47 tag with an
 * extension: the id must be non-null, must not contain '@', and its
 * shortest measured subtag (see getShortestSubtagLength) must be exactly
 * one character long -- i.e. it contains a singleton such as "u".
 *
 * The expansion now refers only to its own argument. Previously it passed
 * a variable literally named `localeID` to getShortestSubtagLength, so it
 * compiled and behaved correctly only where the caller's argument had that
 * exact name.
 *
 * Note: `id` is evaluated up to three times; pass an expression without
 * side effects.
 */
#define _hasBCP47Extension(id) ((id) && uprv_strstr((id), "@") == nullptr && getShortestSubtagLength(id) == 1)
480/* Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails */
481static const char* _ConvertBCP47(
482 const char* id, char* buffer, int32_t length,
483 UErrorCode* err, int32_t* pLocaleIdSize) {
484 const char* finalID;
485 int32_t localeIDSize = uloc_forLanguageTag(id, buffer, length, nullptr, err);
486 if (localeIDSize <= 0 || U_FAILURE(*err) || *err == U_STRING_NOT_TERMINATED_WARNING) {
487 finalID=id;
488 if (*err == U_STRING_NOT_TERMINATED_WARNING) {
489 *err = U_BUFFER_OVERFLOW_ERROR;
490 }
491 } else {
492 finalID=buffer;
493 }
494 if (pLocaleIdSize != nullptr) {
495 *pLocaleIdSize = localeIDSize;
496 }
497 return finalID;
498}
499/* Gets the size of the shortest subtag in the given localeID. */
500static int32_t getShortestSubtagLength(const char *localeID) {
501 int32_t localeIDLength = static_cast<int32_t>(uprv_strlen(localeID));
502 int32_t length = localeIDLength;
503 int32_t tmpLength = 0;
504 int32_t i;
505 UBool reset = true;
506
507 for (i = 0; i < localeIDLength; i++) {
508 if (localeID[i] != '_' && localeID[i] != '-') {
509 if (reset) {
510 tmpLength = 0;
511 reset = false;
512 }
513 tmpLength++;
514 } else {
515 if (tmpLength != 0 && tmpLength < length) {
516 length = tmpLength;
517 }
518 reset = true;
519 }
520 }
521
522 return length;
523}
524
525/* ### Keywords **************************************************/
/* Nonzero when c lies in '0'..'9'. The argument is evaluated twice. */
#define UPRV_ISDIGIT(c) ((c) >= '0' && (c) <= '9')
/* Nonzero when uprv_isASCIILetter(c) holds or c is a digit. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c))
/* Punctuation/symbols allowed in legacy key values */
#define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')

/* Size, including the terminating NUL, of the buffers that hold a keyword name. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Capacity of the keyword list built while parsing a locale's keywords. */
#define ULOC_MAX_NO_KEYWORDS 25
533
534U_CAPI const char * U_EXPORT2
535locale_getKeywordsStart(const char *localeID) {
536 const char *result = nullptr;
537 if((result = uprv_strchr(localeID, '@')) != nullptr) {
538 return result;
539 }
540#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
541 else {
542 /* We do this because the @ sign is variant, and the @ sign used on one
543 EBCDIC machine won't be compiled the same way on other EBCDIC based
544 machines. */
545 static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
546 const uint8_t *charToFind = ebcdicSigns;
547 while(*charToFind) {
548 if((result = uprv_strchr(localeID, *charToFind)) != nullptr) {
549 return result;
550 }
551 charToFind++;
552 }
553 }
554#endif
555 return nullptr;
556}
557
558/**
559 * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
560 * @param keywordName incoming name to be canonicalized
561 * @param status return status (keyword too long)
562 * @return length of the keyword name
563 */
564static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
565{
566 int32_t keywordNameLen = 0;
567
568 for (; *keywordName != 0; keywordName++) {
569 if (!UPRV_ISALPHANUM(*keywordName)) {
570 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
571 return 0;
572 }
573 if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
574 buf[keywordNameLen++] = uprv_tolower(*keywordName);
575 } else {
576 /* keyword name too long for internal buffer */
577 *status = U_INTERNAL_PROGRAM_ERROR;
578 return 0;
579 }
580 }
581 if (keywordNameLen == 0) {
582 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
583 return 0;
584 }
585 buf[keywordNameLen] = 0; /* terminate */
586
587 return keywordNameLen;
588}
589
590typedef struct {
591 char keyword[ULOC_KEYWORD_BUFFER_LEN];
592 int32_t keywordLen;
593 const char *valueStart;
594 int32_t valueLen;
595} KeywordStruct;
596
597static int32_t U_CALLCONV
598compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
599 const char* leftString = ((const KeywordStruct *)left)->keyword;
600 const char* rightString = ((const KeywordStruct *)right)->keyword;
601 return uprv_strcmp(leftString, rightString);
602}
603
604U_CFUNC void
605ulocimp_getKeywords(const char *localeID,
606 char prev,
607 ByteSink& sink,
608 UBool valuesToo,
609 UErrorCode *status)
610{
611 KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
612
613 int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
614 int32_t numKeywords = 0;
615 const char* pos = localeID;
616 const char* equalSign = nullptr;
617 const char* semicolon = nullptr;
618 int32_t i = 0, j, n;
619
620 if(prev == '@') { /* start of keyword definition */
621 /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
622 do {
623 UBool duplicate = false;
624 /* skip leading spaces */
625 while(*pos == ' ') {
626 pos++;
627 }
628 if (!*pos) { /* handle trailing "; " */
629 break;
630 }
631 if(numKeywords == maxKeywords) {
632 *status = U_INTERNAL_PROGRAM_ERROR;
633 return;
634 }
635 equalSign = uprv_strchr(pos, '=');
636 semicolon = uprv_strchr(pos, ';');
637 /* lack of '=' [foo@currency] is illegal */
638 /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
639 if(!equalSign || (semicolon && semicolon<equalSign)) {
640 *status = U_INVALID_FORMAT_ERROR;
641 return;
642 }
643 /* need to normalize both keyword and keyword name */
644 if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
645 /* keyword name too long for internal buffer */
646 *status = U_INTERNAL_PROGRAM_ERROR;
647 return;
648 }
649 for(i = 0, n = 0; i < equalSign - pos; ++i) {
650 if (pos[i] != ' ') {
651 keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
652 }
653 }
654
655 /* zero-length keyword is an error. */
656 if (n == 0) {
657 *status = U_INVALID_FORMAT_ERROR;
658 return;
659 }
660
661 keywordList[numKeywords].keyword[n] = 0;
662 keywordList[numKeywords].keywordLen = n;
663 /* now grab the value part. First we skip the '=' */
664 equalSign++;
665 /* then we leading spaces */
666 while(*equalSign == ' ') {
667 equalSign++;
668 }
669
670 /* Premature end or zero-length value */
671 if (!*equalSign || equalSign == semicolon) {
672 *status = U_INVALID_FORMAT_ERROR;
673 return;
674 }
675
676 keywordList[numKeywords].valueStart = equalSign;
677
678 pos = semicolon;
679 i = 0;
680 if(pos) {
681 while(*(pos - i - 1) == ' ') {
682 i++;
683 }
684 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
685 pos++;
686 } else {
687 i = (int32_t)uprv_strlen(equalSign);
688 while(i && equalSign[i-1] == ' ') {
689 i--;
690 }
691 keywordList[numKeywords].valueLen = i;
692 }
693 /* If this is a duplicate keyword, then ignore it */
694 for (j=0; j<numKeywords; ++j) {
695 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
696 duplicate = true;
697 break;
698 }
699 }
700 if (!duplicate) {
701 ++numKeywords;
702 }
703 } while(pos);
704
705 /* now we have a list of keywords */
706 /* we need to sort it */
707 uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, nullptr, false, status);
708
709 /* Now construct the keyword part */
710 for(i = 0; i < numKeywords; i++) {
711 sink.Append(keywordList[i].keyword, keywordList[i].keywordLen);
712 if(valuesToo) {
713 sink.Append("=", 1);
714 sink.Append(keywordList[i].valueStart, keywordList[i].valueLen);
715 if(i < numKeywords - 1) {
716 sink.Append(";", 1);
717 }
718 } else {
719 sink.Append("\0", 1);
720 }
721 }
722 }
723}
724
725U_CAPI int32_t U_EXPORT2
726uloc_getKeywordValue(const char* localeID,
727 const char* keywordName,
728 char* buffer, int32_t bufferCapacity,
729 UErrorCode* status)
730{
731 if (U_FAILURE(*status)) {
732 return 0;
733 }
734
735 CheckedArrayByteSink sink(buffer, bufferCapacity);
736 ulocimp_getKeywordValue(localeID, keywordName, sink, status);
737
738 int32_t reslen = sink.NumberOfBytesAppended();
739
740 if (U_FAILURE(*status)) {
741 return reslen;
742 }
743
744 if (sink.Overflowed()) {
745 *status = U_BUFFER_OVERFLOW_ERROR;
746 } else {
747 u_terminateChars(buffer, bufferCapacity, reslen, status);
748 }
749
750 return reslen;
751}
752
753U_CAPI void U_EXPORT2
754ulocimp_getKeywordValue(const char* localeID,
755 const char* keywordName,
756 icu::ByteSink& sink,
757 UErrorCode* status)
758{
759 const char* startSearchHere = nullptr;
760 const char* nextSeparator = nullptr;
761 char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
762 char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
763
764 if(status && U_SUCCESS(*status) && localeID) {
765 char tempBuffer[ULOC_FULLNAME_CAPACITY];
766 const char* tmpLocaleID;
767
768 if (keywordName == nullptr || keywordName[0] == 0) {
769 *status = U_ILLEGAL_ARGUMENT_ERROR;
770 return;
771 }
772
773 locale_canonKeywordName(keywordNameBuffer, keywordName, status);
774 if(U_FAILURE(*status)) {
775 return;
776 }
777
778 if (_hasBCP47Extension(localeID)) {
779 tmpLocaleID = _ConvertBCP47(localeID, tempBuffer,
780 sizeof(tempBuffer), status, nullptr);
781 } else {
782 tmpLocaleID=localeID;
783 }
784
785 startSearchHere = locale_getKeywordsStart(tmpLocaleID);
786 if(startSearchHere == nullptr) {
787 /* no keywords, return at once */
788 return;
789 }
790
791 /* find the first keyword */
792 while(startSearchHere) {
793 const char* keyValueTail;
794 int32_t keyValueLen;
795
796 startSearchHere++; /* skip @ or ; */
797 nextSeparator = uprv_strchr(startSearchHere, '=');
798 if(!nextSeparator) {
799 *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
800 return;
801 }
802 /* strip leading & trailing spaces (TC decided to tolerate these) */
803 while(*startSearchHere == ' ') {
804 startSearchHere++;
805 }
806 keyValueTail = nextSeparator;
807 while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
808 keyValueTail--;
809 }
810 /* now keyValueTail points to first char after the keyName */
811 /* copy & normalize keyName from locale */
812 if (startSearchHere == keyValueTail) {
813 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
814 return;
815 }
816 keyValueLen = 0;
817 while (startSearchHere < keyValueTail) {
818 if (!UPRV_ISALPHANUM(*startSearchHere)) {
819 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
820 return;
821 }
822 if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
823 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
824 } else {
825 /* keyword name too long for internal buffer */
826 *status = U_INTERNAL_PROGRAM_ERROR;
827 return;
828 }
829 }
830 localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
831
832 startSearchHere = uprv_strchr(nextSeparator, ';');
833
834 if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
835 /* current entry matches the keyword. */
836 nextSeparator++; /* skip '=' */
837 /* First strip leading & trailing spaces (TC decided to tolerate these) */
838 while(*nextSeparator == ' ') {
839 nextSeparator++;
840 }
841 keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
842 while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
843 keyValueTail--;
844 }
845 /* Now copy the value, but check well-formedness */
846 if (nextSeparator == keyValueTail) {
847 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
848 return;
849 }
850 while (nextSeparator < keyValueTail) {
851 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
852 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
853 return;
854 }
855 /* Should we lowercase value to return here? Tests expect as-is. */
856 sink.Append(nextSeparator++, 1);
857 }
858 return;
859 }
860 }
861 }
862}
863
864U_CAPI int32_t U_EXPORT2
865uloc_setKeywordValue(const char* keywordName,
866 const char* keywordValue,
867 char* buffer, int32_t bufferCapacity,
868 UErrorCode* status)
869{
870 /* TODO: sorting. removal. */
871 int32_t keywordNameLen;
872 int32_t keywordValueLen;
873 int32_t bufLen;
874 int32_t needLen = 0;
875 char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
876 char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];
877 char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
878 int32_t rc;
879 char* nextSeparator = nullptr;
880 char* nextEqualsign = nullptr;
881 char* startSearchHere = nullptr;
882 char* keywordStart = nullptr;
883 CharString updatedKeysAndValues;
884 UBool handledInputKeyAndValue = false;
885 char keyValuePrefix = '@';
886
887 if(U_FAILURE(*status)) {
888 return -1;
889 }
890 if (*status == U_STRING_NOT_TERMINATED_WARNING) {
891 *status = U_ZERO_ERROR;
892 }
893 if (keywordName == nullptr || keywordName[0] == 0 || bufferCapacity <= 1) {
894 *status = U_ILLEGAL_ARGUMENT_ERROR;
895 return 0;
896 }
897 bufLen = (int32_t)uprv_strlen(buffer);
898 if(bufferCapacity<bufLen) {
899 /* The capacity is less than the length?! Is this NUL terminated? */
900 *status = U_ILLEGAL_ARGUMENT_ERROR;
901 return 0;
902 }
903 keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
904 if(U_FAILURE(*status)) {
905 return 0;
906 }
907
908 keywordValueLen = 0;
909 if(keywordValue) {
910 while (*keywordValue != 0) {
911 if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
912 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
913 return 0;
914 }
915 if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
916 /* Should we force lowercase in value to set? */
917 keywordValueBuffer[keywordValueLen++] = *keywordValue++;
918 } else {
919 /* keywordValue too long for internal buffer */
920 *status = U_INTERNAL_PROGRAM_ERROR;
921 return 0;
922 }
923 }
924 }
925 keywordValueBuffer[keywordValueLen] = 0; /* terminate */
926
927 startSearchHere = (char*)locale_getKeywordsStart(buffer);
928 if(startSearchHere == nullptr || (startSearchHere[1]==0)) {
929 if(keywordValueLen == 0) { /* no keywords = nothing to remove */
930 U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
931 return bufLen;
932 }
933
934 needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
935 if(startSearchHere) { /* had a single @ */
936 needLen--; /* already had the @ */
937 /* startSearchHere points at the @ */
938 } else {
939 startSearchHere=buffer+bufLen;
940 }
941 if(needLen >= bufferCapacity) {
942 *status = U_BUFFER_OVERFLOW_ERROR;
943 return needLen; /* no change */
944 }
945 *startSearchHere++ = '@';
946 uprv_strcpy(startSearchHere, keywordNameBuffer);
947 startSearchHere += keywordNameLen;
948 *startSearchHere++ = '=';
949 uprv_strcpy(startSearchHere, keywordValueBuffer);
950 U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
951 return needLen;
952 } /* end shortcut - no @ */
953
954 keywordStart = startSearchHere;
955 /* search for keyword */
956 while(keywordStart) {
957 const char* keyValueTail;
958 int32_t keyValueLen;
959
960 keywordStart++; /* skip @ or ; */
961 nextEqualsign = uprv_strchr(keywordStart, '=');
962 if (!nextEqualsign) {
963 *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
964 return 0;
965 }
966 /* strip leading & trailing spaces (TC decided to tolerate these) */
967 while(*keywordStart == ' ') {
968 keywordStart++;
969 }
970 keyValueTail = nextEqualsign;
971 while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
972 keyValueTail--;
973 }
974 /* now keyValueTail points to first char after the keyName */
975 /* copy & normalize keyName from locale */
976 if (keywordStart == keyValueTail) {
977 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
978 return 0;
979 }
980 keyValueLen = 0;
981 while (keywordStart < keyValueTail) {
982 if (!UPRV_ISALPHANUM(*keywordStart)) {
983 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
984 return 0;
985 }
986 if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
987 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
988 } else {
989 /* keyword name too long for internal buffer */
990 *status = U_INTERNAL_PROGRAM_ERROR;
991 return 0;
992 }
993 }
994 localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
995
996 nextSeparator = uprv_strchr(nextEqualsign, ';');
997
998 /* start processing the value part */
999 nextEqualsign++; /* skip '=' */
1000 /* First strip leading & trailing spaces (TC decided to tolerate these) */
1001 while(*nextEqualsign == ' ') {
1002 nextEqualsign++;
1003 }
1004 keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
1005 while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
1006 keyValueTail--;
1007 }
1008 if (nextEqualsign == keyValueTail) {
1009 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
1010 return 0;
1011 }
1012
1013 rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1014 if(rc == 0) {
1015 /* Current entry matches the input keyword. Update the entry */
1016 if(keywordValueLen > 0) { /* updating a value */
1017 updatedKeysAndValues.append(keyValuePrefix, *status);
1018 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1019 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1020 updatedKeysAndValues.append('=', *status);
1021 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1022 } /* else removing this entry, don't emit anything */
1023 handledInputKeyAndValue = true;
1024 } else {
1025 /* input keyword sorts earlier than current entry, add before current entry */
1026 if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
1027 /* insert new entry at this location */
1028 updatedKeysAndValues.append(keyValuePrefix, *status);
1029 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1030 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1031 updatedKeysAndValues.append('=', *status);
1032 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1033 handledInputKeyAndValue = true;
1034 }
1035 /* copy the current entry */
1036 updatedKeysAndValues.append(keyValuePrefix, *status);
1037 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1038 updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
1039 updatedKeysAndValues.append('=', *status);
1040 updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), *status);
1041 }
1042 if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
1043 /* append new entry at the end, it sorts later than existing entries */
1044 updatedKeysAndValues.append(keyValuePrefix, *status);
1045 /* skip keyValuePrefix update, no subsequent key-value pair */
1046 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1047 updatedKeysAndValues.append('=', *status);
1048 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1049 handledInputKeyAndValue = true;
1050 }
1051 keywordStart = nextSeparator;
1052 } /* end loop searching */
1053
1054 /* Any error from updatedKeysAndValues.append above would be internal and not due to
1055 * problems with the passed-in locale. So if we did encounter problems with the
1056 * passed-in locale above, those errors took precedence and overrode any error
1057 * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1058 * are errors here they are from updatedKeysAndValues.append; they do cause an
1059 * error return but the passed-in locale is unmodified and the original bufLen is
1060 * returned.
1061 */
1062 if (!handledInputKeyAndValue || U_FAILURE(*status)) {
1063 /* if input key/value specified removal of a keyword not present in locale, or
1064 * there was an error in CharString.append, leave original locale alone. */
1065 U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
1066 return bufLen;
1067 }
1068
1069 // needLen = length of the part before '@'
1070 needLen = (int32_t)(startSearchHere - buffer);
1071 // Check to see can we fit the startSearchHere, if not, return
1072 // U_BUFFER_OVERFLOW_ERROR without copy updatedKeysAndValues into it.
1073 // We do this because this API function does not behave like most others:
1074 // It promises never to set a U_STRING_NOT_TERMINATED_WARNING.
1075 // When the contents fits but without the terminating NUL, in this case we need to not change
1076 // the buffer contents and return with a buffer overflow error.
1077 int32_t appendLength = updatedKeysAndValues.length();
1078 if (appendLength >= bufferCapacity - needLen) {
1079 *status = U_BUFFER_OVERFLOW_ERROR;
1080 return needLen + appendLength;
1081 }
1082 needLen += updatedKeysAndValues.extract(
1083 startSearchHere, bufferCapacity - needLen, *status);
1084 U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
1085 return needLen;
1086}
1087
1088/* ### ID parsing implementation **************************************************/
1089
/*
 * Character-classification helpers for locale ID parsing.  Every macro
 * argument is parenthesized so that an expression argument (for example
 * "c & 0x7f") is compared as a whole.  Arguments are evaluated more than
 * once, so do not pass expressions with side effects.
 */

/* true if 'a' is one of the letters that can start a special prefix */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/* returns true if one of the special prefixes is here (s=string)
   'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a) (((a)==0)||((a)=='.')||((a)=='@'))
1100
1101/**
1102 * Lookup 'key' in the array 'list'. The array 'list' should contain
1103 * a nullptr entry, followed by more entries, and a second nullptr entry.
1104 *
1105 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1106 * COUNTRIES_3.
1107 */
1108static int16_t _findIndex(const char* const* list, const char* key)
1109{
1110 const char* const* anchor = list;
1111 int32_t pass = 0;
1112
1113 /* Make two passes through two nullptr-terminated arrays at 'list' */
1114 while (pass++ < 2) {
1115 while (*list) {
1116 if (uprv_strcmp(key, *list) == 0) {
1117 return (int16_t)(list - anchor);
1118 }
1119 list++;
1120 }
1121 ++list; /* skip final nullptr *CWB*/
1122 }
1123 return -1;
1124}
1125
1126U_CFUNC const char*
1127uloc_getCurrentCountryID(const char* oldID){
1128 int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1129 if (offset >= 0) {
1130 return REPLACEMENT_COUNTRIES[offset];
1131 }
1132 return oldID;
1133}
1134U_CFUNC const char*
1135uloc_getCurrentLanguageID(const char* oldID){
1136 int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1137 if (offset >= 0) {
1138 return REPLACEMENT_LANGUAGES[offset];
1139 }
1140 return oldID;
1141}
/*
 * The internal functions ulocimp_getLanguage(), ulocimp_getScript() and
 * ulocimp_getCountry() let the functions for the later locale ID pieces
 * avoid duplicating the code that handles the earlier pieces: each one
 * sets the *pEnd pointer to where it stopped parsing, so the caller can
 * continue from there (for example with _getVariant()).
 *
 * TODO try to use this in Locale
 */
1150CharString U_EXPORT2
1151ulocimp_getLanguage(const char *localeID,
1152 const char **pEnd,
1153 UErrorCode &status) {
1154 CharString result;
1155
1156 if (uprv_stricmp(localeID, "root") == 0) {
1157 localeID += 4;
1158 } else if (uprv_strnicmp(localeID, "und", 3) == 0 &&
1159 (localeID[3] == '\0' ||
1160 localeID[3] == '-' ||
1161 localeID[3] == '_' ||
1162 localeID[3] == '@')) {
1163 localeID += 3;
1164 }
1165
1166 /* if it starts with i- or x- then copy that prefix */
1167 if(_isIDPrefix(localeID)) {
1168 result.append((char)uprv_tolower(*localeID), status);
1169 result.append('-', status);
1170 localeID+=2;
1171 }
1172
1173 /* copy the language as far as possible and count its length */
1174 while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1175 result.append((char)uprv_tolower(*localeID), status);
1176 localeID++;
1177 }
1178
1179 if(result.length()==3) {
1180 /* convert 3 character code to 2 character code if possible *CWB*/
1181 int32_t offset = _findIndex(LANGUAGES_3, result.data());
1182 if(offset>=0) {
1183 result.clear();
1184 result.append(LANGUAGES[offset], status);
1185 }
1186 }
1187
1188 if(pEnd!=nullptr) {
1189 *pEnd=localeID;
1190 }
1191
1192 return result;
1193}
1194
1195CharString U_EXPORT2
1196ulocimp_getScript(const char *localeID,
1197 const char **pEnd,
1198 UErrorCode &status) {
1199 CharString result;
1200 int32_t idLen = 0;
1201
1202 if (pEnd != nullptr) {
1203 *pEnd = localeID;
1204 }
1205
1206 /* copy the second item as far as possible and count its length */
1207 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1208 && uprv_isASCIILetter(localeID[idLen])) {
1209 idLen++;
1210 }
1211
1212 /* If it's exactly 4 characters long, then it's a script and not a country. */
1213 if (idLen == 4) {
1214 int32_t i;
1215 if (pEnd != nullptr) {
1216 *pEnd = localeID+idLen;
1217 }
1218 if (idLen >= 1) {
1219 result.append((char)uprv_toupper(*(localeID++)), status);
1220 }
1221 for (i = 1; i < idLen; i++) {
1222 result.append((char)uprv_tolower(*(localeID++)), status);
1223 }
1224 }
1225
1226 return result;
1227}
1228
1229CharString U_EXPORT2
1230ulocimp_getCountry(const char *localeID,
1231 const char **pEnd,
1232 UErrorCode &status) {
1233 CharString result;
1234 int32_t idLen=0;
1235
1236 /* copy the country as far as possible and count its length */
1237 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1238 result.append((char)uprv_toupper(localeID[idLen]), status);
1239 idLen++;
1240 }
1241
1242 /* the country should be either length 2 or 3 */
1243 if (idLen == 2 || idLen == 3) {
1244 /* convert 3 character code to 2 character code if possible *CWB*/
1245 if(idLen==3) {
1246 int32_t offset = _findIndex(COUNTRIES_3, result.data());
1247 if(offset>=0) {
1248 result.clear();
1249 result.append(COUNTRIES[offset], status);
1250 }
1251 }
1252 localeID+=idLen;
1253 } else {
1254 result.clear();
1255 }
1256
1257 if(pEnd!=nullptr) {
1258 *pEnd=localeID;
1259 }
1260
1261 return result;
1262}
1263
1264/**
1265 * @param needSeparator if true, then add leading '_' if any variants
1266 * are added to 'variant'
1267 */
1268static void
1269_getVariant(const char *localeID,
1270 char prev,
1271 ByteSink& sink,
1272 UBool needSeparator) {
1273 UBool hasVariant = false;
1274
1275 /* get one or more variant tags and separate them with '_' */
1276 if(_isIDSeparator(prev)) {
1277 /* get a variant string after a '-' or '_' */
1278 while(!_isTerminator(*localeID)) {
1279 if (needSeparator) {
1280 sink.Append("_", 1);
1281 needSeparator = false;
1282 }
1283 char c = (char)uprv_toupper(*localeID);
1284 if (c == '-') c = '_';
1285 sink.Append(&c, 1);
1286 hasVariant = true;
1287 localeID++;
1288 }
1289 }
1290
1291 /* if there is no variant tag after a '-' or '_' then look for '@' */
1292 if(!hasVariant) {
1293 if(prev=='@') {
1294 /* keep localeID */
1295 } else if((localeID=locale_getKeywordsStart(localeID))!=nullptr) {
1296 ++localeID; /* point after the '@' */
1297 } else {
1298 return;
1299 }
1300 while(!_isTerminator(*localeID)) {
1301 if (needSeparator) {
1302 sink.Append("_", 1);
1303 needSeparator = false;
1304 }
1305 char c = (char)uprv_toupper(*localeID);
1306 if (c == '-' || c == ',') c = '_';
1307 sink.Append(&c, 1);
1308 localeID++;
1309 }
1310 }
1311}
1312
1313/* Keyword enumeration */
1314
typedef struct UKeywordsContext UKeywordsContext;

/* State behind a keyword UEnumeration (stored in its 'context' pointer). */
struct UKeywordsContext {
    /* Start of the keyword list: consecutive NUL-terminated strings ended by
       an empty string.  Released with uprv_free() by uloc_kw_closeKeywords(). */
    char* keywords;
    /* Next string to be returned by uloc_kw_nextKeyword(). */
    char* current;
};
1319
1320U_CDECL_BEGIN
1321
1322static void U_CALLCONV
1323uloc_kw_closeKeywords(UEnumeration *enumerator) {
1324 uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1325 uprv_free(enumerator->context);
1326 uprv_free(enumerator);
1327}
1328
1329static int32_t U_CALLCONV
1330uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1331 char *kw = ((UKeywordsContext *)en->context)->keywords;
1332 int32_t result = 0;
1333 while(*kw) {
1334 result++;
1335 kw += uprv_strlen(kw)+1;
1336 }
1337 return result;
1338}
1339
1340static const char * U_CALLCONV
1341uloc_kw_nextKeyword(UEnumeration* en,
1342 int32_t* resultLength,
1343 UErrorCode* /*status*/) {
1344 const char* result = ((UKeywordsContext *)en->context)->current;
1345 int32_t len = 0;
1346 if(*result) {
1347 len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1348 ((UKeywordsContext *)en->context)->current += len+1;
1349 } else {
1350 result = nullptr;
1351 }
1352 if (resultLength) {
1353 *resultLength = len;
1354 }
1355 return result;
1356}
1357
1358static void U_CALLCONV
1359uloc_kw_resetKeywords(UEnumeration* en,
1360 UErrorCode* /*status*/) {
1361 ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1362}
1363
1364U_CDECL_END
1365
1366
/*
 * Template UEnumeration for keyword lists.  uloc_openKeywordList() copies it
 * into every enumeration it creates and then stores a UKeywordsContext in
 * the copy's 'context' member.
 * NOTE(review): the initializers are positional; which UEnumeration member
 * each one fills must be confirmed against the declaration in uenumimp.h.
 */
static const UEnumeration gKeywordsEnum = {
    nullptr,
    nullptr,
    uloc_kw_closeKeywords,
    uloc_kw_countKeywords,
    uenum_unextDefault,
    uloc_kw_nextKeyword,
    uloc_kw_resetKeywords
};
1376
1377U_CAPI UEnumeration* U_EXPORT2
1378uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1379{
1380 LocalMemory<UKeywordsContext> myContext;
1381 LocalMemory<UEnumeration> result;
1382
1383 if (U_FAILURE(*status)) {
1384 return nullptr;
1385 }
1386 myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
1387 result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
1388 if (myContext.isNull() || result.isNull()) {
1389 *status = U_MEMORY_ALLOCATION_ERROR;
1390 return nullptr;
1391 }
1392 uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));
1393 myContext->keywords = static_cast<char *>(uprv_malloc(keywordListSize+1));
1394 if (myContext->keywords == nullptr) {
1395 *status = U_MEMORY_ALLOCATION_ERROR;
1396 return nullptr;
1397 }
1398 uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1399 myContext->keywords[keywordListSize] = 0;
1400 myContext->current = myContext->keywords;
1401 result->context = myContext.orphan();
1402 return result.orphan();
1403}
1404
1405U_CAPI UEnumeration* U_EXPORT2
1406uloc_openKeywords(const char* localeID,
1407 UErrorCode* status)
1408{
1409 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1410 const char* tmpLocaleID;
1411
1412 if(status==nullptr || U_FAILURE(*status)) {
1413 return 0;
1414 }
1415
1416 if (_hasBCP47Extension(localeID)) {
1417 tmpLocaleID = _ConvertBCP47(localeID, tempBuffer,
1418 sizeof(tempBuffer), status, nullptr);
1419 } else {
1420 if (localeID==nullptr) {
1421 localeID=uloc_getDefault();
1422 }
1423 tmpLocaleID=localeID;
1424 }
1425
1426 /* Skip the language */
1427 ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *status);
1428 if (U_FAILURE(*status)) {
1429 return 0;
1430 }
1431
1432 if(_isIDSeparator(*tmpLocaleID)) {
1433 const char *scriptID;
1434 /* Skip the script if available */
1435 ulocimp_getScript(tmpLocaleID+1, &scriptID, *status);
1436 if (U_FAILURE(*status)) {
1437 return 0;
1438 }
1439 if(scriptID != tmpLocaleID+1) {
1440 /* Found optional script */
1441 tmpLocaleID = scriptID;
1442 }
1443 /* Skip the Country */
1444 if (_isIDSeparator(*tmpLocaleID)) {
1445 ulocimp_getCountry(tmpLocaleID+1, &tmpLocaleID, *status);
1446 if (U_FAILURE(*status)) {
1447 return 0;
1448 }
1449 }
1450 }
1451
1452 /* keywords are located after '@' */
1453 if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != nullptr) {
1454 CharString keywords;
1455 CharStringByteSink sink(&keywords);
1456 ulocimp_getKeywords(tmpLocaleID+1, '@', sink, false, status);
1457 if (U_FAILURE(*status)) {
1458 return nullptr;
1459 }
1460 return uloc_openKeywordList(keywords.data(), keywords.length(), status);
1461 }
1462 return nullptr;
1463}
1464
1465
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* True if 'options' has at least one of the bits in 'mask' set.  Both
   arguments are parenthesized so that a combined mask such as (A | B) is
   tested as a whole. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The ID "i-default", deliberately stored without a terminating NUL;
   I_DEFAULT_LENGTH is its length in bytes. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1474
1475/**
1476 * Canonicalize the given localeID, to level 1 or to level 2,
1477 * depending on the options. To specify level 1, pass in options=0.
1478 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1479 *
1480 * This is the code underlying uloc_getName and uloc_canonicalize.
1481 */
1482static void
1483_canonicalize(const char* localeID,
1484 ByteSink& sink,
1485 uint32_t options,
1486 UErrorCode* err) {
1487 if (U_FAILURE(*err)) {
1488 return;
1489 }
1490
1491 int32_t j, fieldCount=0, scriptSize=0, variantSize=0;
1492 PreflightingLocaleIDBuffer tempBuffer; // if localeID has a BCP47 extension, tmpLocaleID points to this
1493 CharString localeIDWithHyphens; // if localeID has a BPC47 extension and have _, tmpLocaleID points to this
1494 const char* origLocaleID;
1495 const char* tmpLocaleID;
1496 const char* keywordAssign = nullptr;
1497 const char* separatorIndicator = nullptr;
1498
1499 if (_hasBCP47Extension(localeID)) {
1500 const char* localeIDPtr = localeID;
1501
1502 // convert all underbars to hyphens, unless the "BCP47 extension" comes at the beginning of the string
1503 if (uprv_strchr(localeID, '_') != nullptr && localeID[1] != '-' && localeID[1] != '_') {
1504 localeIDWithHyphens.append(localeID, -1, *err);
1505 if (U_SUCCESS(*err)) {
1506 for (char* p = localeIDWithHyphens.data(); *p != '\0'; ++p) {
1507 if (*p == '_') {
1508 *p = '-';
1509 }
1510 }
1511 localeIDPtr = localeIDWithHyphens.data();
1512 }
1513 }
1514
1515 do {
1516 // After this call tmpLocaleID may point to localeIDPtr which may
1517 // point to either localeID or localeIDWithHyphens.data().
1518 tmpLocaleID = _ConvertBCP47(localeIDPtr, tempBuffer.getBuffer(),
1519 tempBuffer.getCapacity(), err,
1520 &(tempBuffer.requestedCapacity));
1521 } while (tempBuffer.needToTryAgain(err));
1522 } else {
1523 if (localeID==nullptr) {
1524 localeID=uloc_getDefault();
1525 }
1526 tmpLocaleID=localeID;
1527 }
1528
1529 origLocaleID=tmpLocaleID;
1530
1531 /* get all pieces, one after another, and separate with '_' */
1532 CharString tag = ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);
1533
1534 if (tag.length() == I_DEFAULT_LENGTH &&
1535 uprv_strncmp(origLocaleID, i_default, I_DEFAULT_LENGTH) == 0) {
1536 tag.clear();
1537 tag.append(uloc_getDefault(), *err);
1538 } else if(_isIDSeparator(*tmpLocaleID)) {
1539 const char *scriptID;
1540
1541 ++fieldCount;
1542 tag.append('_', *err);
1543
1544 CharString script = ulocimp_getScript(tmpLocaleID+1, &scriptID, *err);
1545 tag.append(script, *err);
1546 scriptSize = script.length();
1547 if(scriptSize > 0) {
1548 /* Found optional script */
1549 tmpLocaleID = scriptID;
1550 ++fieldCount;
1551 if (_isIDSeparator(*tmpLocaleID)) {
1552 /* If there is something else, then we add the _ */
1553 tag.append('_', *err);
1554 }
1555 }
1556
1557 if (_isIDSeparator(*tmpLocaleID)) {
1558 const char *cntryID;
1559
1560 CharString country = ulocimp_getCountry(tmpLocaleID+1, &cntryID, *err);
1561 tag.append(country, *err);
1562 if (!country.isEmpty()) {
1563 /* Found optional country */
1564 tmpLocaleID = cntryID;
1565 }
1566 if(_isIDSeparator(*tmpLocaleID)) {
1567 /* If there is something else, then we add the _ if we found country before. */
1568 if (!_isIDSeparator(*(tmpLocaleID+1))) {
1569 ++fieldCount;
1570 tag.append('_', *err);
1571 }
1572
1573 variantSize = -tag.length();
1574 {
1575 CharStringByteSink s(&tag);
1576 _getVariant(tmpLocaleID+1, *tmpLocaleID, s, false);
1577 }
1578 variantSize += tag.length();
1579 if (variantSize > 0) {
1580 tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1581 }
1582 }
1583 }
1584 }
1585
1586 /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1587 if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1588 UBool done = false;
1589 do {
1590 char c = *tmpLocaleID;
1591 switch (c) {
1592 case 0:
1593 case '@':
1594 done = true;
1595 break;
1596 default:
1597 tag.append(c, *err);
1598 ++tmpLocaleID;
1599 break;
1600 }
1601 } while (!done);
1602 }
1603
1604 /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1605 After this, tmpLocaleID either points to '@' or is nullptr */
1606 if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=nullptr) {
1607 keywordAssign = uprv_strchr(tmpLocaleID, '=');
1608 separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1609 }
1610
1611 /* Copy POSIX-style variant, if any [mr@FOO] */
1612 if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1613 tmpLocaleID != nullptr && keywordAssign == nullptr) {
1614 for (;;) {
1615 char c = *tmpLocaleID;
1616 if (c == 0) {
1617 break;
1618 }
1619 tag.append(c, *err);
1620 ++tmpLocaleID;
1621 }
1622 }
1623
1624 if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1625 /* Handle @FOO variant if @ is present and not followed by = */
1626 if (tmpLocaleID!=nullptr && keywordAssign==nullptr) {
1627 /* Add missing '_' if needed */
1628 if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1629 do {
1630 tag.append('_', *err);
1631 ++fieldCount;
1632 } while(fieldCount<2);
1633 }
1634
1635 int32_t posixVariantSize = -tag.length();
1636 {
1637 CharStringByteSink s(&tag);
1638 _getVariant(tmpLocaleID+1, '@', s, (UBool)(variantSize > 0));
1639 }
1640 posixVariantSize += tag.length();
1641 if (posixVariantSize > 0) {
1642 variantSize += posixVariantSize;
1643 }
1644 }
1645
1646 /* Look up the ID in the canonicalization map */
1647 for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1648 StringPiece id(CANONICALIZE_MAP[j].id);
1649 if (tag == id) {
1650 if (id.empty() && tmpLocaleID != nullptr) {
1651 break; /* Don't remap "" if keywords present */
1652 }
1653 tag.clear();
1654 tag.append(CANONICALIZE_MAP[j].canonicalID, *err);
1655 break;
1656 }
1657 }
1658 }
1659
1660 sink.Append(tag.data(), tag.length());
1661
1662 if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1663 if (tmpLocaleID!=nullptr && keywordAssign!=nullptr &&
1664 (!separatorIndicator || separatorIndicator > keywordAssign)) {
1665 sink.Append("@", 1);
1666 ++fieldCount;
1667 ulocimp_getKeywords(tmpLocaleID+1, '@', sink, true, err);
1668 }
1669 }
1670}
1671
1672/* ### ID parsing API **************************************************/
1673
1674U_CAPI int32_t U_EXPORT2
1675uloc_getParent(const char* localeID,
1676 char* parent,
1677 int32_t parentCapacity,
1678 UErrorCode* err)
1679{
1680 const char *lastUnderscore;
1681 int32_t i;
1682
1683 if (U_FAILURE(*err))
1684 return 0;
1685
1686 if (localeID == nullptr)
1687 localeID = uloc_getDefault();
1688
1689 lastUnderscore=uprv_strrchr(localeID, '_');
1690 if(lastUnderscore!=nullptr) {
1691 i=(int32_t)(lastUnderscore-localeID);
1692 } else {
1693 i=0;
1694 }
1695
1696 if (i > 0) {
1697 if (uprv_strnicmp(localeID, "und_", 4) == 0) {
1698 localeID += 3;
1699 i -= 3;
1700 uprv_memmove(parent, localeID, uprv_min(i, parentCapacity));
1701 } else if (parent != localeID) {
1702 uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1703 }
1704 }
1705
1706 return u_terminateChars(parent, parentCapacity, i, err);
1707}
1708
1709U_CAPI int32_t U_EXPORT2
1710uloc_getLanguage(const char* localeID,
1711 char* language,
1712 int32_t languageCapacity,
1713 UErrorCode* err)
1714{
1715 /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1716
1717 if (err==nullptr || U_FAILURE(*err)) {
1718 return 0;
1719 }
1720
1721 if(localeID==nullptr) {
1722 localeID=uloc_getDefault();
1723 }
1724
1725 return ulocimp_getLanguage(localeID, nullptr, *err).extract(language, languageCapacity, *err);
1726}
1727
1728U_CAPI int32_t U_EXPORT2
1729uloc_getScript(const char* localeID,
1730 char* script,
1731 int32_t scriptCapacity,
1732 UErrorCode* err)
1733{
1734 if(err==nullptr || U_FAILURE(*err)) {
1735 return 0;
1736 }
1737
1738 if(localeID==nullptr) {
1739 localeID=uloc_getDefault();
1740 }
1741
1742 /* skip the language */
1743 ulocimp_getLanguage(localeID, &localeID, *err);
1744 if (U_FAILURE(*err)) {
1745 return 0;
1746 }
1747
1748 if(_isIDSeparator(*localeID)) {
1749 return ulocimp_getScript(localeID+1, nullptr, *err).extract(script, scriptCapacity, *err);
1750 }
1751 return u_terminateChars(script, scriptCapacity, 0, err);
1752}
1753
1754U_CAPI int32_t U_EXPORT2
1755uloc_getCountry(const char* localeID,
1756 char* country,
1757 int32_t countryCapacity,
1758 UErrorCode* err)
1759{
1760 if(err==nullptr || U_FAILURE(*err)) {
1761 return 0;
1762 }
1763
1764 if(localeID==nullptr) {
1765 localeID=uloc_getDefault();
1766 }
1767
1768 /* Skip the language */
1769 ulocimp_getLanguage(localeID, &localeID, *err);
1770 if (U_FAILURE(*err)) {
1771 return 0;
1772 }
1773
1774 if(_isIDSeparator(*localeID)) {
1775 const char *scriptID;
1776 /* Skip the script if available */
1777 ulocimp_getScript(localeID+1, &scriptID, *err);
1778 if (U_FAILURE(*err)) {
1779 return 0;
1780 }
1781 if(scriptID != localeID+1) {
1782 /* Found optional script */
1783 localeID = scriptID;
1784 }
1785 if(_isIDSeparator(*localeID)) {
1786 return ulocimp_getCountry(localeID+1, nullptr, *err).extract(country, countryCapacity, *err);
1787 }
1788 }
1789 return u_terminateChars(country, countryCapacity, 0, err);
1790}
1791
1792U_CAPI int32_t U_EXPORT2
1793uloc_getVariant(const char* localeID,
1794 char* variant,
1795 int32_t variantCapacity,
1796 UErrorCode* err)
1797{
1798 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1799 const char* tmpLocaleID;
1800 int32_t i=0;
1801
1802 if(err==nullptr || U_FAILURE(*err)) {
1803 return 0;
1804 }
1805
1806 if (_hasBCP47Extension(localeID)) {
1807 tmpLocaleID =_ConvertBCP47(localeID, tempBuffer, sizeof(tempBuffer), err, nullptr);
1808 } else {
1809 if (localeID==nullptr) {
1810 localeID=uloc_getDefault();
1811 }
1812 tmpLocaleID=localeID;
1813 }
1814
1815 /* Skip the language */
1816 ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);
1817 if (U_FAILURE(*err)) {
1818 return 0;
1819 }
1820
1821 if(_isIDSeparator(*tmpLocaleID)) {
1822 const char *scriptID;
1823 /* Skip the script if available */
1824 ulocimp_getScript(tmpLocaleID+1, &scriptID, *err);
1825 if (U_FAILURE(*err)) {
1826 return 0;
1827 }
1828 if(scriptID != tmpLocaleID+1) {
1829 /* Found optional script */
1830 tmpLocaleID = scriptID;
1831 }
1832 /* Skip the Country */
1833 if (_isIDSeparator(*tmpLocaleID)) {
1834 const char *cntryID;
1835 ulocimp_getCountry(tmpLocaleID+1, &cntryID, *err);
1836 if (U_FAILURE(*err)) {
1837 return 0;
1838 }
1839 if (cntryID != tmpLocaleID+1) {
1840 /* Found optional country */
1841 tmpLocaleID = cntryID;
1842 }
1843 if(_isIDSeparator(*tmpLocaleID)) {
1844 /* If there was no country ID, skip a possible extra IDSeparator */
1845 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
1846 tmpLocaleID++;
1847 }
1848
1849 CheckedArrayByteSink sink(variant, variantCapacity);
1850 _getVariant(tmpLocaleID+1, *tmpLocaleID, sink, false);
1851
1852 i = sink.NumberOfBytesAppended();
1853
1854 if (U_FAILURE(*err)) {
1855 return i;
1856 }
1857
1858 if (sink.Overflowed()) {
1859 *err = U_BUFFER_OVERFLOW_ERROR;
1860 return i;
1861 }
1862 }
1863 }
1864 }
1865
1866 return u_terminateChars(variant, variantCapacity, i, err);
1867}
1868
1869U_CAPI int32_t U_EXPORT2
1870uloc_getName(const char* localeID,
1871 char* name,
1872 int32_t nameCapacity,
1873 UErrorCode* err)
1874{
1875 if (U_FAILURE(*err)) {
1876 return 0;
1877 }
1878
1879 CheckedArrayByteSink sink(name, nameCapacity);
1880 ulocimp_getName(localeID, sink, err);
1881
1882 int32_t reslen = sink.NumberOfBytesAppended();
1883
1884 if (U_FAILURE(*err)) {
1885 return reslen;
1886 }
1887
1888 if (sink.Overflowed()) {
1889 *err = U_BUFFER_OVERFLOW_ERROR;
1890 } else {
1891 u_terminateChars(name, nameCapacity, reslen, err);
1892 }
1893
1894 return reslen;
1895}
1896
/**
 * ByteSink variant of uloc_getName(): delegates to _canonicalize() with no
 * option flags (0).
 *
 * @param localeID locale ID, forwarded unchanged
 * @param sink     sink handed through to _canonicalize()
 * @param err      ICU error code, forwarded unchanged
 */
U_CAPI void U_EXPORT2
ulocimp_getName(const char* localeID,
                ByteSink& sink,
                UErrorCode* err)
{
    _canonicalize(localeID, sink, 0, err);
}
1904
1905U_CAPI int32_t U_EXPORT2
1906uloc_getBaseName(const char* localeID,
1907 char* name,
1908 int32_t nameCapacity,
1909 UErrorCode* err)
1910{
1911 if (U_FAILURE(*err)) {
1912 return 0;
1913 }
1914
1915 CheckedArrayByteSink sink(name, nameCapacity);
1916 ulocimp_getBaseName(localeID, sink, err);
1917
1918 int32_t reslen = sink.NumberOfBytesAppended();
1919
1920 if (U_FAILURE(*err)) {
1921 return reslen;
1922 }
1923
1924 if (sink.Overflowed()) {
1925 *err = U_BUFFER_OVERFLOW_ERROR;
1926 } else {
1927 u_terminateChars(name, nameCapacity, reslen, err);
1928 }
1929
1930 return reslen;
1931}
1932
/**
 * ByteSink variant of uloc_getBaseName(): delegates to _canonicalize() with
 * the _ULOC_STRIP_KEYWORDS option.
 *
 * @param localeID locale ID, forwarded unchanged
 * @param sink     sink handed through to _canonicalize()
 * @param err      ICU error code, forwarded unchanged
 */
U_CAPI void U_EXPORT2
ulocimp_getBaseName(const char* localeID,
                    ByteSink& sink,
                    UErrorCode* err)
{
    _canonicalize(localeID, sink, _ULOC_STRIP_KEYWORDS, err);
}
1940
1941U_CAPI int32_t U_EXPORT2
1942uloc_canonicalize(const char* localeID,
1943 char* name,
1944 int32_t nameCapacity,
1945 UErrorCode* err)
1946{
1947 if (U_FAILURE(*err)) {
1948 return 0;
1949 }
1950
1951 CheckedArrayByteSink sink(name, nameCapacity);
1952 ulocimp_canonicalize(localeID, sink, err);
1953
1954 int32_t reslen = sink.NumberOfBytesAppended();
1955
1956 if (U_FAILURE(*err)) {
1957 return reslen;
1958 }
1959
1960 if (sink.Overflowed()) {
1961 *err = U_BUFFER_OVERFLOW_ERROR;
1962 } else {
1963 u_terminateChars(name, nameCapacity, reslen, err);
1964 }
1965
1966 return reslen;
1967}
1968
/**
 * ByteSink variant of uloc_canonicalize(): delegates to _canonicalize() with
 * the _ULOC_CANONICALIZE option.
 *
 * @param localeID locale ID, forwarded unchanged
 * @param sink     sink handed through to _canonicalize()
 * @param err      ICU error code, forwarded unchanged
 */
U_CAPI void U_EXPORT2
ulocimp_canonicalize(const char* localeID,
                     ByteSink& sink,
                     UErrorCode* err)
{
    _canonicalize(localeID, sink, _ULOC_CANONICALIZE, err);
}
1976
1977U_CAPI const char* U_EXPORT2
1978uloc_getISO3Language(const char* localeID)
1979{
1980 int16_t offset;
1981 char lang[ULOC_LANG_CAPACITY];
1982 UErrorCode err = U_ZERO_ERROR;
1983
1984 if (localeID == nullptr)
1985 {
1986 localeID = uloc_getDefault();
1987 }
1988 uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
1989 if (U_FAILURE(err))
1990 return "";
1991 offset = _findIndex(LANGUAGES, lang);
1992 if (offset < 0)
1993 return "";
1994 return LANGUAGES_3[offset];
1995}
1996
1997U_CAPI const char* U_EXPORT2
1998uloc_getISO3Country(const char* localeID)
1999{
2000 int16_t offset;
2001 char cntry[ULOC_LANG_CAPACITY];
2002 UErrorCode err = U_ZERO_ERROR;
2003
2004 if (localeID == nullptr)
2005 {
2006 localeID = uloc_getDefault();
2007 }
2008 uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2009 if (U_FAILURE(err))
2010 return "";
2011 offset = _findIndex(COUNTRIES, cntry);
2012 if (offset < 0)
2013 return "";
2014
2015 return COUNTRIES_3[offset];
2016}
2017
2018U_CAPI uint32_t U_EXPORT2
2019uloc_getLCID(const char* localeID)
2020{
2021 UErrorCode status = U_ZERO_ERROR;
2022 char langID[ULOC_FULLNAME_CAPACITY];
2023 uint32_t lcid = 0;
2024
2025 /* Check for incomplete id. */
2026 if (!localeID || uprv_strlen(localeID) < 2) {
2027 return 0;
2028 }
2029
2030 // First, attempt Windows platform lookup if available, but fall
2031 // through to catch any special cases (ICU vs Windows name differences).
2032 lcid = uprv_convertToLCIDPlatform(localeID, &status);
2033 if (U_FAILURE(status)) {
2034 return 0;
2035 }
2036 if (lcid > 0) {
2037 // Windows found an LCID, return that
2038 return lcid;
2039 }
2040
2041 uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2042 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) {
2043 return 0;
2044 }
2045
2046 if (uprv_strchr(localeID, '@')) {
2047 // uprv_convertToLCID does not support keywords other than collation.
2048 // Remove all keywords except collation.
2049 int32_t len;
2050 char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2051
2052 CharString collVal;
2053 {
2054 CharStringByteSink sink(&collVal);
2055 ulocimp_getKeywordValue(localeID, "collation", sink, &status);
2056 }
2057
2058 if (U_SUCCESS(status) && !collVal.isEmpty()) {
2059 len = uloc_getBaseName(localeID, tmpLocaleID,
2060 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
2061
2062 if (U_SUCCESS(status) && len > 0) {
2063 tmpLocaleID[len] = 0;
2064
2065 len = uloc_setKeywordValue("collation", collVal.data(), tmpLocaleID,
2066 UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
2067
2068 if (U_SUCCESS(status) && len > 0) {
2069 tmpLocaleID[len] = 0;
2070 return uprv_convertToLCID(langID, tmpLocaleID, &status);
2071 }
2072 }
2073 }
2074
2075 // fall through - all keywords are simply ignored
2076 status = U_ZERO_ERROR;
2077 }
2078
2079 return uprv_convertToLCID(langID, localeID, &status);
2080}
2081
/**
 * Converts a host LCID to a locale ID by forwarding every argument to
 * uprv_convertToPosix() and returning its result unchanged.
 *
 * @param hostid         the LCID to convert
 * @param locale         output buffer, passed through
 * @param localeCapacity capacity of locale, passed through
 * @param status         ICU error code, passed through
 * @return whatever uprv_convertToPosix() returns
 */
U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
                      UErrorCode *status)
{
    return uprv_convertToPosix(hostid, locale, localeCapacity, status);
}
2088
2089/* ### Default locale **************************************************/
2090
/**
 * Returns the default locale ID as reported by locale_get_default(), the
 * hook declared at the top of this file as coming from locid.cpp.
 */
U_CAPI const char* U_EXPORT2
uloc_getDefault()
{
    return locale_get_default();
}
2096
2097U_CAPI void U_EXPORT2
2098uloc_setDefault(const char* newDefaultLocale,
2099 UErrorCode* err)
2100{
2101 if (U_FAILURE(*err))
2102 return;
2103 /* the error code isn't currently used for anything by this function*/
2104
2105 /* propagate change to C++ */
2106 locale_set_default(newDefaultLocale);
2107}
2108
/**
 * Returns a list of all 2-letter language codes defined in ISO 639. This is a pointer
 * to an array of pointers to arrays of char. All of these pointers are owned
 * by ICU -- do not delete them, and do not write through them. The array is
 * terminated with a null pointer.
 *
 * The returned pointer is the file-level LANGUAGES table itself.
 */
U_CAPI const char* const* U_EXPORT2
uloc_getISOLanguages()
{
    return LANGUAGES;
}
2120
/**
 * Returns a list of all 2-letter country codes defined in ISO 3166. This is a
 * pointer to an array of pointers to arrays of char. All of these pointers are
 * owned by ICU -- do not delete them, and do not write through them. The array is
 * terminated with a null pointer.
 *
 * The returned pointer is the file-level COUNTRIES table itself.
 */
U_CAPI const char* const* U_EXPORT2
uloc_getISOCountries()
{
    return COUNTRIES;
}
2132
2133U_CAPI const char* U_EXPORT2
2134uloc_toUnicodeLocaleKey(const char* keyword)
2135{
2136 const char* bcpKey = ulocimp_toBcpKey(keyword);
2137 if (bcpKey == nullptr && ultag_isUnicodeLocaleKey(keyword, -1)) {
2138 // unknown keyword, but syntax is fine..
2139 return keyword;
2140 }
2141 return bcpKey;
2142}
2143
2144U_CAPI const char* U_EXPORT2
2145uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2146{
2147 const char* bcpType = ulocimp_toBcpType(keyword, value, nullptr, nullptr);
2148 if (bcpType == nullptr && ultag_isUnicodeLocaleType(value, -1)) {
2149 // unknown keyword, but syntax is fine..
2150 return value;
2151 }
2152 return bcpType;
2153}
2154
2155static UBool
2156isWellFormedLegacyKey(const char* legacyKey)
2157{
2158 const char* p = legacyKey;
2159 while (*p) {
2160 if (!UPRV_ISALPHANUM(*p)) {
2161 return false;
2162 }
2163 p++;
2164 }
2165 return true;
2166}
2167
2168static UBool
2169isWellFormedLegacyType(const char* legacyType)
2170{
2171 const char* p = legacyType;
2172 int32_t alphaNumLen = 0;
2173 while (*p) {
2174 if (*p == '_' || *p == '/' || *p == '-') {
2175 if (alphaNumLen == 0) {
2176 return false;
2177 }
2178 alphaNumLen = 0;
2179 } else if (UPRV_ISALPHANUM(*p)) {
2180 alphaNumLen++;
2181 } else {
2182 return false;
2183 }
2184 p++;
2185 }
2186 return (alphaNumLen != 0);
2187}
2188
2189U_CAPI const char* U_EXPORT2
2190uloc_toLegacyKey(const char* keyword)
2191{
2192 const char* legacyKey = ulocimp_toLegacyKey(keyword);
2193 if (legacyKey == nullptr) {
2194 // Checks if the specified locale key is well-formed with the legacy locale syntax.
2195 //
2196 // Note:
2197 // LDML/CLDR provides some definition of keyword syntax in
2198 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2199 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2200 // Keys can only consist of [0-9a-zA-Z].
2201 if (isWellFormedLegacyKey(keyword)) {
2202 return keyword;
2203 }
2204 }
2205 return legacyKey;
2206}
2207
2208U_CAPI const char* U_EXPORT2
2209uloc_toLegacyType(const char* keyword, const char* value)
2210{
2211 const char* legacyType = ulocimp_toLegacyType(keyword, value, nullptr, nullptr);
2212 if (legacyType == nullptr) {
2213 // Checks if the specified locale type is well-formed with the legacy locale syntax.
2214 //
2215 // Note:
2216 // LDML/CLDR provides some definition of keyword syntax in
2217 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2218 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2219 // Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2220 // we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2221 if (isWellFormedLegacyType(value)) {
2222 return value;
2223 }
2224 }
2225 return legacyType;
2226}
2227
2228/*eof*/
2229