1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (C) 1997-2016, International Business Machines
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8*
9* File ULOC.CPP
10*
11* Modification History:
12*
13* Date Name Description
14* 04/01/97 aliu Creation.
15* 08/21/98 stephen JDK 1.2 sync
16* 12/08/98 rtg New Locale implementation and C API
17* 03/15/99 damiba overhaul.
18* 04/06/99 stephen changed setDefault() to realloc and copy
19* 06/14/99 stephen Changed calls to ures_open for new params
20* 07/21/99 stephen Modified setDefault() to propagate to C++
21* 05/14/04 alan 7 years later: refactored, cleaned up, fixed bugs,
22* brought canonicalization code into line with spec
23*****************************************************************************/
24
25/*
26 POSIX's locale format, from putil.c: [no spaces]
27
28 ll [ _CC ] [ . MM ] [ @ VV]
29
30 l = lang, C = ctry, M = charmap, V = variant
31*/
32
33#include "unicode/bytestream.h"
34#include "unicode/errorcode.h"
35#include "unicode/stringpiece.h"
36#include "unicode/utypes.h"
37#include "unicode/ustring.h"
38#include "unicode/uloc.h"
39
40#include "bytesinkutil.h"
41#include "putilimp.h"
42#include "ustr_imp.h"
43#include "ulocimp.h"
44#include "umutex.h"
45#include "cstring.h"
46#include "cmemory.h"
47#include "locmap.h"
48#include "uarrsort.h"
49#include "uenumimp.h"
50#include "uassert.h"
51#include "charstr.h"
52
53#include <algorithm>
54#include <stdio.h> /* for sprintf */
55
56U_NAMESPACE_USE
57
58/* ### Declarations **************************************************/
59
60/* Locale stuff from locid.cpp */
61U_CFUNC void locale_set_default(const char *id);
62U_CFUNC const char *locale_get_default(void);
63U_CFUNC int32_t
64locale_getKeywords(const char *localeID,
65 char prev,
66 char *keywords, int32_t keywordCapacity,
67 UBool valuesToo,
68 UErrorCode *status);
69
70/* ### Data tables **************************************************/
71
72/**
73 * Table of language codes, both 2- and 3-letter, with preference
74 * given to 2-letter codes where possible. Includes 3-letter codes
75 * that lack a 2-letter equivalent.
76 *
77 * This list must be in sorted order. This list is returned directly
78 * to the user by some API.
79 *
80 * This list must be kept in sync with LANGUAGES_3, with corresponding
81 * entries matched.
82 *
83 * This table should be terminated with a NULL entry, followed by a
84 * second list, and another NULL entry. The first list is visible to
85 * user code when this array is returned by API. The second list
86 * contains codes we support, but do not expose through user API.
87 *
88 * Notes
89 *
90 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
91 * include the revisions up to 2001/7/27 *CWB*
92 *
93 * The 3 character codes are the terminology codes like RFC 3066. This
94 * is compatible with prior ICU codes
95 *
 * The withdrawn codes "in", "iw", "ji", "jw" and "sh" are still in the
 * table, but they now sit at the end of it (in the second list) because
 * their 3-character codes are duplicates. This avoids bad searches going
 * from 3-character to 2-character codes.
100 *
101 * The range qaa-qtz is reserved for local use
102 */
103/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
104/* ISO639 table version is 20150505 */
105/* Subsequent hand addition of selected languages */
106static const char * const LANGUAGES[] = {
107 "aa", "ab", "ace", "ach", "ada", "ady", "ae", "aeb",
108 "af", "afh", "agq", "ain", "ak", "akk", "akz", "ale",
109 "aln", "alt", "am", "an", "ang", "anp", "ar", "arc",
110 "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
111 "asa", "ase", "ast", "av", "avk", "awa", "ay", "az",
112 "ba", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
113 "be", "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
114 "bgn", "bho", "bi", "bik", "bin", "bjn", "bkm", "bla",
115 "bm", "bn", "bo", "bpy", "bqi", "br", "bra", "brh",
116 "brx", "bs", "bss", "bua", "bug", "bum", "byn", "byv",
117 "ca", "cad", "car", "cay", "cch", "ccp", "ce", "ceb", "cgg",
118 "ch", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
119 "chr", "chy", "ckb", "co", "cop", "cps", "cr", "crh",
120 "cs", "csb", "cu", "cv", "cy",
121 "da", "dak", "dar", "dav", "de", "del", "den", "dgr",
122 "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
123 "dyo", "dyu", "dz", "dzg",
124 "ebu", "ee", "efi", "egl", "egy", "eka", "el", "elx",
125 "en", "enm", "eo", "es", "esu", "et", "eu", "ewo",
126 "ext",
127 "fa", "fan", "fat", "ff", "fi", "fil", "fit", "fj",
128 "fo", "fon", "fr", "frc", "frm", "fro", "frp", "frr",
129 "frs", "fur", "fy",
130 "ga", "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
131 "gez", "gil", "gl", "glk", "gmh", "gn", "goh", "gom",
132 "gon", "gor", "got", "grb", "grc", "gsw", "gu", "guc",
133 "gur", "guz", "gv", "gwi",
134 "ha", "hai", "hak", "haw", "he", "hi", "hif", "hil",
135 "hit", "hmn", "ho", "hr", "hsb", "hsn", "ht", "hu",
136 "hup", "hy", "hz",
137 "ia", "iba", "ibb", "id", "ie", "ig", "ii", "ik",
138 "ilo", "inh", "io", "is", "it", "iu", "izh",
139 "ja", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
140 "jv",
141 "ka", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
142 "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg", "kgp",
143 "kha", "kho", "khq", "khw", "ki", "kiu", "kj", "kk",
144 "kkj", "kl", "kln", "km", "kmb", "kn", "ko", "koi",
145 "kok", "kos", "kpe", "kr", "krc", "kri", "krj", "krl",
146 "kru", "ks", "ksb", "ksf", "ksh", "ku", "kum", "kut",
147 "kv", "kw", "ky",
148 "la", "lad", "lag", "lah", "lam", "lb", "lez", "lfn",
149 "lg", "li", "lij", "liv", "lkt", "lmo", "ln", "lo",
150 "lol", "loz", "lrc", "lt", "ltg", "lu", "lua", "lui",
151 "lun", "luo", "lus", "luy", "lv", "lzh", "lzz",
152 "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
153 "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg", "mga",
154 "mgh", "mgo", "mh", "mi", "mic", "min", "mis", "mk",
155 "ml", "mn", "mnc", "mni", "mo",
156 "moh", "mos", "mr", "mrj",
157 "ms", "mt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
158 "my", "mye", "myv", "mzn",
159 "na", "nan", "nap", "naq", "nb", "nd", "nds", "ne",
160 "new", "ng", "nia", "niu", "njo", "nl", "nmg", "nn",
161 "nnh", "no", "nog", "non", "nov", "nqo", "nr", "nso",
162 "nus", "nv", "nwc", "ny", "nym", "nyn", "nyo", "nzi",
163 "oc", "oj", "om", "or", "os", "osa", "ota",
164 "pa", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
165 "pdt", "peo", "pfl", "phn", "pi", "pl", "pms", "pnt",
166 "pon", "prg", "pro", "ps", "pt",
167 "qu", "quc", "qug",
168 "raj", "rap", "rar", "rgn", "rif", "rm", "rn", "ro",
169 "rof", "rom", "rtm", "ru", "rue", "rug", "rup",
170 "rw", "rwk",
171 "sa", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
172 "sba", "sbp", "sc", "scn", "sco", "sd", "sdc", "sdh",
173 "se", "see", "seh", "sei", "sel", "ses", "sg", "sga",
174 "sgs", "shi", "shn", "shu", "si", "sid", "sk",
175 "sl", "sli", "sly", "sm", "sma", "smj", "smn", "sms",
176 "sn", "snk", "so", "sog", "sq", "sr", "srn", "srr",
177 "ss", "ssy", "st", "stq", "su", "suk", "sus", "sux",
178 "sv", "sw", "swb", "swc", "syc", "syr", "szl",
179 "ta", "tcy", "te", "tem", "teo", "ter", "tet", "tg",
180 "th", "ti", "tig", "tiv", "tk", "tkl", "tkr", "tl",
181 "tlh", "tli", "tly", "tmh", "tn", "to", "tog", "tpi",
182 "tr", "tru", "trv", "ts", "tsd", "tsi", "tt", "ttt",
183 "tum", "tvl", "tw", "twq", "ty", "tyv", "tzm",
184 "udm", "ug", "uga", "uk", "umb", "und", "ur", "uz",
185 "vai", "ve", "vec", "vep", "vi", "vls", "vmf", "vo",
186 "vot", "vro", "vun",
187 "wa", "wae", "wal", "war", "was", "wbp", "wo", "wuu",
188 "xal", "xh", "xmf", "xog",
189 "yao", "yap", "yav", "ybb", "yi", "yo", "yrl", "yue",
190 "za", "zap", "zbl", "zea", "zen", "zgh", "zh", "zu",
191 "zun", "zxx", "zza",
192NULL,
193 "in", "iw", "ji", "jw", "sh", /* obsolete language codes */
194NULL
195};
196
/**
 * Parallel tables of obsolete language codes and their replacements:
 * REPLACEMENT_LANGUAGES[i] is the code listed for DEPRECATED_LANGUAGES[i].
 * The two tables must be kept the same length and in the same order.
 * Each table ends with two NULL entries.
 * NOTE(review): the lookup code is outside this section; presumably the
 * doubled NULL matches the two-list layout used by LANGUAGES -- confirm.
 */
static const char* const DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", NULL, NULL
};
static const char* const REPLACEMENT_LANGUAGES[]={
    "id", "he", "yi", "jv", NULL, NULL
};
203
204/**
205 * Table of 3-letter language codes.
206 *
207 * This is a lookup table used to convert 3-letter language codes to
208 * their 2-letter equivalent, where possible. It must be kept in sync
209 * with LANGUAGES. For all valid i, LANGUAGES[i] must refer to the
210 * same language as LANGUAGES_3[i]. The commented-out lines are
211 * copied from LANGUAGES to make eyeballing this baby easier.
212 *
213 * Where a 3-letter language code has no 2-letter equivalent, the
214 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
215 *
216 * This table should be terminated with a NULL entry, followed by a
217 * second list, and another NULL entry. The two lists correspond to
218 * the two lists in LANGUAGES.
219 */
220/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
221/* ISO639 table version is 20150505 */
222/* Subsequent hand addition of selected languages */
223static const char * const LANGUAGES_3[] = {
224 "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
225 "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
226 "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
227 "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
228 "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
229 "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
230 "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
231 "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
232 "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
233 "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
234 "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
235 "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
236 "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
237 "ces", "csb", "chu", "chv", "cym",
238 "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
239 "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
240 "dyo", "dyu", "dzo", "dzg",
241 "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
242 "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
243 "ext",
244 "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
245 "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
246 "frs", "fur", "fry",
247 "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
248 "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
249 "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
250 "gur", "guz", "glv", "gwi",
251 "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
252 "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
253 "hup", "hye", "her",
254 "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
255 "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
256 "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
257 "jav",
258 "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
259 "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
260 "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
261 "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
262 "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
263 "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
264 "kom", "cor", "kir",
265 "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
266 "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
267 "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
268 "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
269 "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
270 "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
271 "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
272 "mal", "mon", "mnc", "mni", "mol",
273 "moh", "mos", "mar", "mrj",
274 "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
275 "mya", "mye", "myv", "mzn",
276 "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
277 "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
278 "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
279 "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
280 "oci", "oji", "orm", "ori", "oss", "osa", "ota",
281 "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
282 "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
283 "pon", "prg", "pro", "pus", "por",
284 "que", "quc", "qug",
285 "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
286 "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
287 "kin", "rwk",
288 "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
289 "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
290 "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
291 "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
292 "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
293 "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
294 "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
295 "swe", "swa", "swb", "swc", "syc", "syr", "szl",
296 "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
297 "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
298 "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
299 "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
300 "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
301 "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
302 "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
303 "vot", "vro", "vun",
304 "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
305 "xal", "xho", "xmf", "xog",
306 "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
307 "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
308 "zun", "zxx", "zza",
309NULL,
310/* "in", "iw", "ji", "jw", "sh", */
311 "ind", "heb", "yid", "jaw", "srp",
312NULL
313};
314
315/**
316 * Table of 2-letter country codes.
317 *
318 * This list must be in sorted order. This list is returned directly
319 * to the user by some API.
320 *
321 * This list must be kept in sync with COUNTRIES_3, with corresponding
322 * entries matched.
323 *
324 * This table should be terminated with a NULL entry, followed by a
325 * second list, and another NULL entry. The first list is visible to
326 * user code when this array is returned by API. The second list
327 * contains codes we support, but do not expose through user API.
328 *
329 * Notes:
330 *
 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE), as per
 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html
 * New codes were added, keeping the old ones for compatibility.
 * Updated to include the 1999/12/03 revisions. *CWB*
335 *
336 * RO(ROM) is now RO(ROU) according to
337 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
338 */
339static const char * const COUNTRIES[] = {
340 "AD", "AE", "AF", "AG", "AI", "AL", "AM",
341 "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ",
342 "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI",
343 "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV",
344 "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG",
345 "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CR",
346 "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DJ", "DK",
347 "DM", "DO", "DZ", "EC", "EE", "EG", "EH", "ER",
348 "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR",
349 "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL",
350 "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU",
351 "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU",
352 "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS",
353 "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI",
354 "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA",
355 "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU",
356 "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK",
357 "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS",
358 "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA",
359 "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP",
360 "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG",
361 "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT",
362 "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA",
363 "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ",
364 "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV",
365 "SX", "SY", "SZ", "TC", "TD", "TF", "TG", "TH", "TJ",
366 "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV",
367 "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ",
368 "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF",
369 "WS", "YE", "YT", "ZA", "ZM", "ZW",
370NULL,
371 "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR", /* obsolete country codes */
372NULL
373};
374
/**
 * Parallel tables of deprecated country codes and their replacements:
 * REPLACEMENT_COUNTRIES[i] is the code listed for DEPRECATED_COUNTRIES[i].
 * The two tables must be kept the same length and in the same order.
 * Each table ends with two NULL entries.
 * NOTE(review): the lookup code is outside this section; presumably the
 * doubled NULL matches the two-list layout used by COUNTRIES -- confirm.
 */
static const char* const DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
};
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL  /* replacement country codes */
};
382
383/**
384 * Table of 3-letter country codes.
385 *
386 * This is a lookup table used to convert 3-letter country codes to
387 * their 2-letter equivalent. It must be kept in sync with COUNTRIES.
388 * For all valid i, COUNTRIES[i] must refer to the same country as
389 * COUNTRIES_3[i]. The commented-out lines are copied from COUNTRIES
390 * to make eyeballing this baby easier.
391 *
392 * This table should be terminated with a NULL entry, followed by a
393 * second list, and another NULL entry. The two lists correspond to
394 * the two lists in COUNTRIES.
395 */
396static const char * const COUNTRIES_3[] = {
397/* "AD", "AE", "AF", "AG", "AI", "AL", "AM", */
398 "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
399/* "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ", */
400 "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
401/* "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", */
402 "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
403/* "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV", */
404 "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
405/* "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG", */
406 "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
407/* "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CR", */
408 "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
409/* "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DJ", "DK", */
410 "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
411/* "DM", "DO", "DZ", "EC", "EE", "EG", "EH", "ER", */
412 "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
413/* "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR", */
414 "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
415/* "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL", */
416 "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
417/* "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU", */
418 "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
419/* "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU", */
420 "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
421/* "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS" */
422 "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
423/* "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI", */
424 "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
425/* "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA", */
426 "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
427/* "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", */
428 "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
429/* "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK", */
430 "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
431/* "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS", */
432 "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
433/* "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA", */
434 "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
435/* "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP", */
436 "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
437/* "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG", */
438 "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
439/* "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT", */
440 "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
441/* "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA", */
442 "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
443/* "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ", */
444 "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
445/* "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV", */
446 "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
447/* "SX", "SY", "SZ", "TC", "TD", "TF", "TG", "TH", "TJ", */
448 "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
449/* "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV", */
450 "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
451/* "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ", */
452 "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
453/* "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF", */
454 "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
455/* "WS", "YE", "YT", "ZA", "ZM", "ZW", */
456 "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
457NULL,
458/* "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR" */
459 "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
460NULL
461};
462
/**
 * One entry of a locale-ID canonicalization table: pairs an input locale ID
 * with the ID it should be rewritten to.
 */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
} CanonicalizationMap;
467
/**
 * A map to canonicalize locale IDs.  This handles a variety of
 * different semantic kinds of transformations.
 *
 * Each entry maps the ID on the left to the canonical ID on the right.
 * Most inputs use a double underscore before the final subtag; the
 * "zh_GAN" .. "zh_YUE" group uses a single underscore for the reason given
 * in the comment inside the table.
 * NOTE(review): the code that searches this table is outside this section;
 * confirm whether matching is exact and whether order matters before
 * reordering entries.
 */
static const CanonicalizationMap CANONICALIZE_MAP[] = {
    { "art__LOJBAN",    "jbo" }, /* registered name */
    { "hy__AREVELA",    "hy" }, /* Registered IANA variant */
    { "hy__AREVMDA",    "hyw" }, /* Registered IANA variant */
    { "zh__GUOYU",      "zh" }, /* registered name */
    { "zh__HAKKA",      "hak" }, /* registered name */
    { "zh__XIANG",      "hsn" }, /* registered name */
    // subtags with 3 chars won't be treated as variants.
    { "zh_GAN",         "gan" }, /* registered name */
    { "zh_MIN_NAN",     "nan" }, /* registered name */
    { "zh_WUU",         "wuu" }, /* registered name */
    { "zh_YUE",         "yue" }, /* registered name */
};
485
486/* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id has BCP47 u extension and does not have '@'.
 *
 * Evaluates to true when id is non-NULL, contains no '@', and its shortest
 * subtag (as measured by getShortestSubtagLength) is exactly one character.
 *
 * The macro now inspects its own argument; it previously passed the caller's
 * variable named `localeID` to getShortestSubtagLength regardless of what was
 * passed in as id. The argument is evaluated more than once, so it must be
 * free of side effects.
 */
#define _hasBCP47Extension(id) ((id) && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails.
 *
 * Calls uloc_forLanguageTag(id, buffer, length, NULL, err) and then sets
 * finalID:
 *   - to buffer when the call returns a positive length, *err is not a
 *     failure and *err is not U_STRING_NOT_TERMINATED_WARNING;
 *   - to id otherwise. In the U_STRING_NOT_TERMINATED_WARNING case *err is
 *     additionally promoted to U_BUFFER_OVERFLOW_ERROR.
 *
 * finalID must be an assignable lvalue and err a UErrorCode pointer; err is
 * evaluated several times, so pass an expression without side effects.
 */
#define _ConvertBCP47(finalID, id, buffer, length,err) UPRV_BLOCK_MACRO_BEGIN { \
    if (uloc_forLanguageTag(id, buffer, length, NULL, err) <= 0 ||  \
        U_FAILURE(*err) || *err == U_STRING_NOT_TERMINATED_WARNING) { \
        finalID=id; \
        if (*err == U_STRING_NOT_TERMINATED_WARNING) { *err = U_BUFFER_OVERFLOW_ERROR; } \
    } else { \
        finalID=buffer; \
    } \
} UPRV_BLOCK_MACRO_END
499/* Gets the size of the shortest subtag in the given localeID. */
500static int32_t getShortestSubtagLength(const char *localeID) {
501 int32_t localeIDLength = static_cast<int32_t>(uprv_strlen(localeID));
502 int32_t length = localeIDLength;
503 int32_t tmpLength = 0;
504 int32_t i;
505 UBool reset = TRUE;
506
507 for (i = 0; i < localeIDLength; i++) {
508 if (localeID[i] != '_' && localeID[i] != '-') {
509 if (reset) {
510 tmpLength = 0;
511 reset = FALSE;
512 }
513 tmpLength++;
514 } else {
515 if (tmpLength != 0 && tmpLength < length) {
516 length = tmpLength;
517 }
518 reset = TRUE;
519 }
520 }
521
522 return length;
523}
524
525/* ### Keywords **************************************************/
/* Non-zero if c is one of the characters '0'..'9'. The argument is evaluated twice. */
#define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
/* Non-zero if c is accepted by uprv_isASCIILetter() or is a digit. The argument may be evaluated more than once. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
/* Punctuation/symbols allowed in legacy key values */
#define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')

/* Size of the buffers that hold a canonicalized keyword name, including the terminating NUL. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Number of entries in the keyword array that _getKeywords() fills. */
#define ULOC_MAX_NO_KEYWORDS 25
533
534U_CAPI const char * U_EXPORT2
535locale_getKeywordsStart(const char *localeID) {
536 const char *result = NULL;
537 if((result = uprv_strchr(localeID, '@')) != NULL) {
538 return result;
539 }
540#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
541 else {
542 /* We do this because the @ sign is variant, and the @ sign used on one
543 EBCDIC machine won't be compiled the same way on other EBCDIC based
544 machines. */
545 static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
546 const uint8_t *charToFind = ebcdicSigns;
547 while(*charToFind) {
548 if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
549 return result;
550 }
551 charToFind++;
552 }
553 }
554#endif
555 return NULL;
556}
557
558/**
559 * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
560 * @param keywordName incoming name to be canonicalized
561 * @param status return status (keyword too long)
562 * @return length of the keyword name
563 */
564static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
565{
566 int32_t keywordNameLen = 0;
567
568 for (; *keywordName != 0; keywordName++) {
569 if (!UPRV_ISALPHANUM(*keywordName)) {
570 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
571 return 0;
572 }
573 if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
574 buf[keywordNameLen++] = uprv_tolower(*keywordName);
575 } else {
576 /* keyword name too long for internal buffer */
577 *status = U_INTERNAL_PROGRAM_ERROR;
578 return 0;
579 }
580 }
581 if (keywordNameLen == 0) {
582 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
583 return 0;
584 }
585 buf[keywordNameLen] = 0; /* terminate */
586
587 return keywordNameLen;
588}
589
/**
 * One parsed "keyword=value" pair, as collected by _getKeywords().
 */
typedef struct {
    char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* lowercased keyword name with spaces removed, NUL-terminated */
    int32_t keywordLen;                    /* length of keyword, excluding the NUL */
    const char *valueStart;                /* first non-space character of the value, inside the parsed input string */
    int32_t valueLen;                      /* length of the value, excluding trailing spaces */
} KeywordStruct;
596
597static int32_t U_CALLCONV
598compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
599 const char* leftString = ((const KeywordStruct *)left)->keyword;
600 const char* rightString = ((const KeywordStruct *)right)->keyword;
601 return uprv_strcmp(leftString, rightString);
602}
603
604static void
605_getKeywords(const char *localeID,
606 char prev,
607 ByteSink& sink,
608 UBool valuesToo,
609 UErrorCode *status)
610{
611 KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
612
613 int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
614 int32_t numKeywords = 0;
615 const char* pos = localeID;
616 const char* equalSign = NULL;
617 const char* semicolon = NULL;
618 int32_t i = 0, j, n;
619
620 if(prev == '@') { /* start of keyword definition */
621 /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
622 do {
623 UBool duplicate = FALSE;
624 /* skip leading spaces */
625 while(*pos == ' ') {
626 pos++;
627 }
628 if (!*pos) { /* handle trailing "; " */
629 break;
630 }
631 if(numKeywords == maxKeywords) {
632 *status = U_INTERNAL_PROGRAM_ERROR;
633 return;
634 }
635 equalSign = uprv_strchr(pos, '=');
636 semicolon = uprv_strchr(pos, ';');
637 /* lack of '=' [foo@currency] is illegal */
638 /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
639 if(!equalSign || (semicolon && semicolon<equalSign)) {
640 *status = U_INVALID_FORMAT_ERROR;
641 return;
642 }
643 /* need to normalize both keyword and keyword name */
644 if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
645 /* keyword name too long for internal buffer */
646 *status = U_INTERNAL_PROGRAM_ERROR;
647 return;
648 }
649 for(i = 0, n = 0; i < equalSign - pos; ++i) {
650 if (pos[i] != ' ') {
651 keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
652 }
653 }
654
655 /* zero-length keyword is an error. */
656 if (n == 0) {
657 *status = U_INVALID_FORMAT_ERROR;
658 return;
659 }
660
661 keywordList[numKeywords].keyword[n] = 0;
662 keywordList[numKeywords].keywordLen = n;
663 /* now grab the value part. First we skip the '=' */
664 equalSign++;
665 /* then we leading spaces */
666 while(*equalSign == ' ') {
667 equalSign++;
668 }
669
670 /* Premature end or zero-length value */
671 if (!*equalSign || equalSign == semicolon) {
672 *status = U_INVALID_FORMAT_ERROR;
673 return;
674 }
675
676 keywordList[numKeywords].valueStart = equalSign;
677
678 pos = semicolon;
679 i = 0;
680 if(pos) {
681 while(*(pos - i - 1) == ' ') {
682 i++;
683 }
684 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
685 pos++;
686 } else {
687 i = (int32_t)uprv_strlen(equalSign);
688 while(i && equalSign[i-1] == ' ') {
689 i--;
690 }
691 keywordList[numKeywords].valueLen = i;
692 }
693 /* If this is a duplicate keyword, then ignore it */
694 for (j=0; j<numKeywords; ++j) {
695 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
696 duplicate = TRUE;
697 break;
698 }
699 }
700 if (!duplicate) {
701 ++numKeywords;
702 }
703 } while(pos);
704
705 /* now we have a list of keywords */
706 /* we need to sort it */
707 uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
708
709 /* Now construct the keyword part */
710 for(i = 0; i < numKeywords; i++) {
711 sink.Append(keywordList[i].keyword, keywordList[i].keywordLen);
712 if(valuesToo) {
713 sink.Append("=", 1);
714 sink.Append(keywordList[i].valueStart, keywordList[i].valueLen);
715 if(i < numKeywords - 1) {
716 sink.Append(";", 1);
717 }
718 } else {
719 sink.Append("\0", 1);
720 }
721 }
722 }
723}
724
725U_CFUNC int32_t
726locale_getKeywords(const char *localeID,
727 char prev,
728 char *keywords, int32_t keywordCapacity,
729 UBool valuesToo,
730 UErrorCode *status) {
731 if (U_FAILURE(*status)) {
732 return 0;
733 }
734
735 CheckedArrayByteSink sink(keywords, keywordCapacity);
736 _getKeywords(localeID, prev, sink, valuesToo, status);
737
738 int32_t reslen = sink.NumberOfBytesAppended();
739
740 if (U_FAILURE(*status)) {
741 return reslen;
742 }
743
744 if (sink.Overflowed()) {
745 *status = U_BUFFER_OVERFLOW_ERROR;
746 } else {
747 u_terminateChars(keywords, keywordCapacity, reslen, status);
748 }
749
750 return reslen;
751}
752
753U_CAPI int32_t U_EXPORT2
754uloc_getKeywordValue(const char* localeID,
755 const char* keywordName,
756 char* buffer, int32_t bufferCapacity,
757 UErrorCode* status)
758{
759 if (buffer != nullptr) {
760 buffer[0] = '\0';
761 }
762 const char* startSearchHere = NULL;
763 const char* nextSeparator = NULL;
764 char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
765 char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
766 int32_t result = 0;
767
768 if(status && U_SUCCESS(*status) && localeID) {
769 char tempBuffer[ULOC_FULLNAME_CAPACITY];
770 const char* tmpLocaleID;
771
772 if (keywordName == NULL || keywordName[0] == 0) {
773 *status = U_ILLEGAL_ARGUMENT_ERROR;
774 return 0;
775 }
776
777 locale_canonKeywordName(keywordNameBuffer, keywordName, status);
778 if(U_FAILURE(*status)) {
779 return 0;
780 }
781
782 if (_hasBCP47Extension(localeID)) {
783 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
784 } else {
785 tmpLocaleID=localeID;
786 }
787
788 startSearchHere = locale_getKeywordsStart(tmpLocaleID);
789 if(startSearchHere == NULL) {
790 /* no keywords, return at once */
791 return 0;
792 }
793
794 /* find the first keyword */
795 while(startSearchHere) {
796 const char* keyValueTail;
797 int32_t keyValueLen;
798
799 startSearchHere++; /* skip @ or ; */
800 nextSeparator = uprv_strchr(startSearchHere, '=');
801 if(!nextSeparator) {
802 *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
803 return 0;
804 }
805 /* strip leading & trailing spaces (TC decided to tolerate these) */
806 while(*startSearchHere == ' ') {
807 startSearchHere++;
808 }
809 keyValueTail = nextSeparator;
810 while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
811 keyValueTail--;
812 }
813 /* now keyValueTail points to first char after the keyName */
814 /* copy & normalize keyName from locale */
815 if (startSearchHere == keyValueTail) {
816 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
817 return 0;
818 }
819 keyValueLen = 0;
820 while (startSearchHere < keyValueTail) {
821 if (!UPRV_ISALPHANUM(*startSearchHere)) {
822 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
823 return 0;
824 }
825 if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
826 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
827 } else {
828 /* keyword name too long for internal buffer */
829 *status = U_INTERNAL_PROGRAM_ERROR;
830 return 0;
831 }
832 }
833 localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
834
835 startSearchHere = uprv_strchr(nextSeparator, ';');
836
837 if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
838 /* current entry matches the keyword. */
839 nextSeparator++; /* skip '=' */
840 /* First strip leading & trailing spaces (TC decided to tolerate these) */
841 while(*nextSeparator == ' ') {
842 nextSeparator++;
843 }
844 keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
845 while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
846 keyValueTail--;
847 }
848 /* Now copy the value, but check well-formedness */
849 if (nextSeparator == keyValueTail) {
850 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
851 return 0;
852 }
853 keyValueLen = 0;
854 while (nextSeparator < keyValueTail) {
855 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
856 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
857 return 0;
858 }
859 if (keyValueLen < bufferCapacity) {
860 /* Should we lowercase value to return here? Tests expect as-is. */
861 buffer[keyValueLen++] = *nextSeparator++;
862 } else { /* keep advancing so we return correct length in case of overflow */
863 keyValueLen++;
864 nextSeparator++;
865 }
866 }
867 result = u_terminateChars(buffer, bufferCapacity, keyValueLen, status);
868 return result;
869 }
870 }
871 }
872 return 0;
873}
874
875U_CAPI int32_t U_EXPORT2
876uloc_setKeywordValue(const char* keywordName,
877 const char* keywordValue,
878 char* buffer, int32_t bufferCapacity,
879 UErrorCode* status)
880{
881 /* TODO: sorting. removal. */
882 int32_t keywordNameLen;
883 int32_t keywordValueLen;
884 int32_t bufLen;
885 int32_t needLen = 0;
886 char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
887 char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];
888 char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
889 int32_t rc;
890 char* nextSeparator = NULL;
891 char* nextEqualsign = NULL;
892 char* startSearchHere = NULL;
893 char* keywordStart = NULL;
894 CharString updatedKeysAndValues;
895 int32_t updatedKeysAndValuesLen;
896 UBool handledInputKeyAndValue = FALSE;
897 char keyValuePrefix = '@';
898
899 if(U_FAILURE(*status)) {
900 return -1;
901 }
902 if (keywordName == NULL || keywordName[0] == 0 || bufferCapacity <= 1) {
903 *status = U_ILLEGAL_ARGUMENT_ERROR;
904 return 0;
905 }
906 bufLen = (int32_t)uprv_strlen(buffer);
907 if(bufferCapacity<bufLen) {
908 /* The capacity is less than the length?! Is this NULL terminated? */
909 *status = U_ILLEGAL_ARGUMENT_ERROR;
910 return 0;
911 }
912 keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
913 if(U_FAILURE(*status)) {
914 return 0;
915 }
916
917 keywordValueLen = 0;
918 if(keywordValue) {
919 while (*keywordValue != 0) {
920 if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
921 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
922 return 0;
923 }
924 if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
925 /* Should we force lowercase in value to set? */
926 keywordValueBuffer[keywordValueLen++] = *keywordValue++;
927 } else {
928 /* keywordValue too long for internal buffer */
929 *status = U_INTERNAL_PROGRAM_ERROR;
930 return 0;
931 }
932 }
933 }
934 keywordValueBuffer[keywordValueLen] = 0; /* terminate */
935
936 startSearchHere = (char*)locale_getKeywordsStart(buffer);
937 if(startSearchHere == NULL || (startSearchHere[1]==0)) {
938 if(keywordValueLen == 0) { /* no keywords = nothing to remove */
939 return bufLen;
940 }
941
942 needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
943 if(startSearchHere) { /* had a single @ */
944 needLen--; /* already had the @ */
945 /* startSearchHere points at the @ */
946 } else {
947 startSearchHere=buffer+bufLen;
948 }
949 if(needLen >= bufferCapacity) {
950 *status = U_BUFFER_OVERFLOW_ERROR;
951 return needLen; /* no change */
952 }
953 *startSearchHere++ = '@';
954 uprv_strcpy(startSearchHere, keywordNameBuffer);
955 startSearchHere += keywordNameLen;
956 *startSearchHere++ = '=';
957 uprv_strcpy(startSearchHere, keywordValueBuffer);
958 return needLen;
959 } /* end shortcut - no @ */
960
961 keywordStart = startSearchHere;
962 /* search for keyword */
963 while(keywordStart) {
964 const char* keyValueTail;
965 int32_t keyValueLen;
966
967 keywordStart++; /* skip @ or ; */
968 nextEqualsign = uprv_strchr(keywordStart, '=');
969 if (!nextEqualsign) {
970 *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
971 return 0;
972 }
973 /* strip leading & trailing spaces (TC decided to tolerate these) */
974 while(*keywordStart == ' ') {
975 keywordStart++;
976 }
977 keyValueTail = nextEqualsign;
978 while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
979 keyValueTail--;
980 }
981 /* now keyValueTail points to first char after the keyName */
982 /* copy & normalize keyName from locale */
983 if (keywordStart == keyValueTail) {
984 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
985 return 0;
986 }
987 keyValueLen = 0;
988 while (keywordStart < keyValueTail) {
989 if (!UPRV_ISALPHANUM(*keywordStart)) {
990 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
991 return 0;
992 }
993 if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
994 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
995 } else {
996 /* keyword name too long for internal buffer */
997 *status = U_INTERNAL_PROGRAM_ERROR;
998 return 0;
999 }
1000 }
1001 localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
1002
1003 nextSeparator = uprv_strchr(nextEqualsign, ';');
1004
1005 /* start processing the value part */
1006 nextEqualsign++; /* skip '=' */
1007 /* First strip leading & trailing spaces (TC decided to tolerate these) */
1008 while(*nextEqualsign == ' ') {
1009 nextEqualsign++;
1010 }
1011 keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
1012 while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
1013 keyValueTail--;
1014 }
1015 if (nextEqualsign == keyValueTail) {
1016 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
1017 return 0;
1018 }
1019
1020 rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1021 if(rc == 0) {
1022 /* Current entry matches the input keyword. Update the entry */
1023 if(keywordValueLen > 0) { /* updating a value */
1024 updatedKeysAndValues.append(keyValuePrefix, *status);
1025 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1026 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1027 updatedKeysAndValues.append('=', *status);
1028 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1029 } /* else removing this entry, don't emit anything */
1030 handledInputKeyAndValue = TRUE;
1031 } else {
1032 /* input keyword sorts earlier than current entry, add before current entry */
1033 if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
1034 /* insert new entry at this location */
1035 updatedKeysAndValues.append(keyValuePrefix, *status);
1036 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1037 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1038 updatedKeysAndValues.append('=', *status);
1039 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1040 handledInputKeyAndValue = TRUE;
1041 }
1042 /* copy the current entry */
1043 updatedKeysAndValues.append(keyValuePrefix, *status);
1044 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1045 updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
1046 updatedKeysAndValues.append('=', *status);
1047 updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), *status);
1048 }
1049 if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
1050 /* append new entry at the end, it sorts later than existing entries */
1051 updatedKeysAndValues.append(keyValuePrefix, *status);
1052 /* skip keyValuePrefix update, no subsequent key-value pair */
1053 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1054 updatedKeysAndValues.append('=', *status);
1055 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1056 handledInputKeyAndValue = TRUE;
1057 }
1058 keywordStart = nextSeparator;
1059 } /* end loop searching */
1060
1061 /* Any error from updatedKeysAndValues.append above would be internal and not due to
1062 * problems with the passed-in locale. So if we did encounter problems with the
1063 * passed-in locale above, those errors took precedence and overrode any error
1064 * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1065 * are errors here they are from updatedKeysAndValues.append; they do cause an
1066 * error return but the passed-in locale is unmodified and the original bufLen is
1067 * returned.
1068 */
1069 if (!handledInputKeyAndValue || U_FAILURE(*status)) {
1070 /* if input key/value specified removal of a keyword not present in locale, or
1071 * there was an error in CharString.append, leave original locale alone. */
1072 return bufLen;
1073 }
1074
1075 updatedKeysAndValuesLen = updatedKeysAndValues.length();
1076 /* needLen = length of the part before '@' + length of updated key-value part including '@' */
1077 needLen = (int32_t)(startSearchHere - buffer) + updatedKeysAndValuesLen;
1078 if(needLen >= bufferCapacity) {
1079 *status = U_BUFFER_OVERFLOW_ERROR;
1080 return needLen; /* no change */
1081 }
1082 if (updatedKeysAndValuesLen > 0) {
1083 uprv_strncpy(startSearchHere, updatedKeysAndValues.data(), updatedKeysAndValuesLen);
1084 }
1085 buffer[needLen]=0;
1086 return needLen;
1087}
1088
1089/* ### ID parsing implementation **************************************************/
1090
/* TRUE if 'a' is one of the letters 'x', 'X', 'i' or 'I'.
 * Note: the argument is evaluated more than once. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/* TRUE if one of the special prefixes is here (s=string):
   'x' or 'i' (either case) followed by an ID separator, e.g. "x-" or "i-" */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* TRUE if 'a' ends a language/script/country/variant subtag: NUL, '.' or '@'.
 * Dot terminates it because of POSIX form where dot precedes the codepage
 * except for variant.
 * Note: the argument is evaluated more than once.
 */
#define _isTerminator(a) (((a)==0)||((a)=='.')||((a)=='@'))
1101
1102/**
1103 * Lookup 'key' in the array 'list'. The array 'list' should contain
1104 * a NULL entry, followed by more entries, and a second NULL entry.
1105 *
1106 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1107 * COUNTRIES_3.
1108 */
1109static int16_t _findIndex(const char* const* list, const char* key)
1110{
1111 const char* const* anchor = list;
1112 int32_t pass = 0;
1113
1114 /* Make two passes through two NULL-terminated arrays at 'list' */
1115 while (pass++ < 2) {
1116 while (*list) {
1117 if (uprv_strcmp(key, *list) == 0) {
1118 return (int16_t)(list - anchor);
1119 }
1120 list++;
1121 }
1122 ++list; /* skip final NULL *CWB*/
1123 }
1124 return -1;
1125}
1126
1127U_CFUNC const char*
1128uloc_getCurrentCountryID(const char* oldID){
1129 int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1130 if (offset >= 0) {
1131 return REPLACEMENT_COUNTRIES[offset];
1132 }
1133 return oldID;
1134}
1135U_CFUNC const char*
1136uloc_getCurrentLanguageID(const char* oldID){
1137 int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1138 if (offset >= 0) {
1139 return REPLACEMENT_LANGUAGES[offset];
1140 }
1141 return oldID;
1142}
1143/*
1144 * the internal functions _getLanguage(), _getCountry(), _getVariant()
1145 * avoid duplicating code to handle the earlier locale ID pieces
1146 * in the functions for the later ones by
1147 * setting the *pEnd pointer to where they stopped parsing
1148 *
1149 * TODO try to use this in Locale
1150 */
1151static CharString
1152ulocimp_getLanguage(const char *localeID,
1153 const char **pEnd,
1154 UErrorCode &status) {
1155 CharString result;
1156
1157 if (uprv_stricmp(localeID, "root") == 0) {
1158 localeID += 4;
1159 } else if (uprv_strnicmp(localeID, "und", 3) == 0 &&
1160 (localeID[3] == '\0' ||
1161 localeID[3] == '-' ||
1162 localeID[3] == '_' ||
1163 localeID[3] == '@')) {
1164 localeID += 3;
1165 }
1166
1167 /* if it starts with i- or x- then copy that prefix */
1168 if(_isIDPrefix(localeID)) {
1169 result.append((char)uprv_tolower(*localeID), status);
1170 result.append('-', status);
1171 localeID+=2;
1172 }
1173
1174 /* copy the language as far as possible and count its length */
1175 while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1176 result.append((char)uprv_tolower(*localeID), status);
1177 localeID++;
1178 }
1179
1180 if(result.length()==3) {
1181 /* convert 3 character code to 2 character code if possible *CWB*/
1182 int32_t offset = _findIndex(LANGUAGES_3, result.data());
1183 if(offset>=0) {
1184 result.clear();
1185 result.append(LANGUAGES[offset], status);
1186 }
1187 }
1188
1189 if(pEnd!=NULL) {
1190 *pEnd=localeID;
1191 }
1192
1193 return result;
1194}
1195
1196U_CFUNC int32_t
1197ulocimp_getLanguage(const char *localeID,
1198 char *language, int32_t languageCapacity,
1199 const char **pEnd) {
1200 ErrorCode status;
1201 CharString result = ulocimp_getLanguage(localeID, pEnd, status);
1202 if (status.isFailure()) {
1203 return 0;
1204 }
1205 int32_t reslen = result.length();
1206 uprv_memcpy(language, result.data(), std::min(reslen, languageCapacity));
1207 return reslen;
1208}
1209
1210static CharString
1211ulocimp_getScript(const char *localeID,
1212 const char **pEnd,
1213 UErrorCode &status) {
1214 CharString result;
1215 int32_t idLen = 0;
1216
1217 if (pEnd != NULL) {
1218 *pEnd = localeID;
1219 }
1220
1221 /* copy the second item as far as possible and count its length */
1222 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1223 && uprv_isASCIILetter(localeID[idLen])) {
1224 idLen++;
1225 }
1226
1227 /* If it's exactly 4 characters long, then it's a script and not a country. */
1228 if (idLen == 4) {
1229 int32_t i;
1230 if (pEnd != NULL) {
1231 *pEnd = localeID+idLen;
1232 }
1233 if (idLen >= 1) {
1234 result.append((char)uprv_toupper(*(localeID++)), status);
1235 }
1236 for (i = 1; i < idLen; i++) {
1237 result.append((char)uprv_tolower(*(localeID++)), status);
1238 }
1239 }
1240
1241 return result;
1242}
1243
1244U_CFUNC int32_t
1245ulocimp_getScript(const char *localeID,
1246 char *script, int32_t scriptCapacity,
1247 const char **pEnd) {
1248 ErrorCode status;
1249 CharString result = ulocimp_getScript(localeID, pEnd, status);
1250 if (status.isFailure()) {
1251 return 0;
1252 }
1253 int32_t reslen = result.length();
1254 uprv_memcpy(script, result.data(), std::min(reslen, scriptCapacity));
1255 return reslen;
1256}
1257
1258static CharString
1259ulocimp_getCountry(const char *localeID,
1260 const char **pEnd,
1261 UErrorCode &status) {
1262 CharString result;
1263 int32_t idLen=0;
1264
1265 /* copy the country as far as possible and count its length */
1266 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1267 result.append((char)uprv_toupper(localeID[idLen]), status);
1268 idLen++;
1269 }
1270
1271 /* the country should be either length 2 or 3 */
1272 if (idLen == 2 || idLen == 3) {
1273 /* convert 3 character code to 2 character code if possible *CWB*/
1274 if(idLen==3) {
1275 int32_t offset = _findIndex(COUNTRIES_3, result.data());
1276 if(offset>=0) {
1277 result.clear();
1278 result.append(COUNTRIES[offset], status);
1279 }
1280 }
1281 localeID+=idLen;
1282 } else {
1283 result.clear();
1284 }
1285
1286 if(pEnd!=NULL) {
1287 *pEnd=localeID;
1288 }
1289
1290 return result;
1291}
1292
1293U_CFUNC int32_t
1294ulocimp_getCountry(const char *localeID,
1295 char *country, int32_t countryCapacity,
1296 const char **pEnd) {
1297 ErrorCode status;
1298 CharString result = ulocimp_getCountry(localeID, pEnd, status);
1299 if (status.isFailure()) {
1300 return 0;
1301 }
1302 int32_t reslen = result.length();
1303 uprv_memcpy(country, result.data(), std::min(reslen, countryCapacity));
1304 return reslen;
1305}
1306
1307/**
1308 * @param needSeparator if true, then add leading '_' if any variants
1309 * are added to 'variant'
1310 */
1311static void
1312_getVariantEx(const char *localeID,
1313 char prev,
1314 ByteSink& sink,
1315 UBool needSeparator) {
1316 UBool hasVariant = FALSE;
1317
1318 /* get one or more variant tags and separate them with '_' */
1319 if(_isIDSeparator(prev)) {
1320 /* get a variant string after a '-' or '_' */
1321 while(!_isTerminator(*localeID)) {
1322 if (needSeparator) {
1323 sink.Append("_", 1);
1324 needSeparator = FALSE;
1325 }
1326 char c = (char)uprv_toupper(*localeID);
1327 if (c == '-') c = '_';
1328 sink.Append(&c, 1);
1329 hasVariant = TRUE;
1330 localeID++;
1331 }
1332 }
1333
1334 /* if there is no variant tag after a '-' or '_' then look for '@' */
1335 if(!hasVariant) {
1336 if(prev=='@') {
1337 /* keep localeID */
1338 } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1339 ++localeID; /* point after the '@' */
1340 } else {
1341 return;
1342 }
1343 while(!_isTerminator(*localeID)) {
1344 if (needSeparator) {
1345 sink.Append("_", 1);
1346 needSeparator = FALSE;
1347 }
1348 char c = (char)uprv_toupper(*localeID);
1349 if (c == '-' || c == ',') c = '_';
1350 sink.Append(&c, 1);
1351 localeID++;
1352 }
1353 }
1354}
1355
1356static int32_t
1357_getVariantEx(const char *localeID,
1358 char prev,
1359 char *variant, int32_t variantCapacity,
1360 UBool needSeparator) {
1361 CheckedArrayByteSink sink(variant, variantCapacity);
1362 _getVariantEx(localeID, prev, sink, needSeparator);
1363 return sink.NumberOfBytesAppended();
1364}
1365
1366static int32_t
1367_getVariant(const char *localeID,
1368 char prev,
1369 char *variant, int32_t variantCapacity) {
1370 return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1371}
1372
1373/* Keyword enumeration */
1374
/**
 * Per-enumeration state of a keyword UEnumeration.
 * The list is a sequence of NUL-terminated names followed by an extra NUL.
 */
typedef struct UKeywordsContext {
    char* keywords;  /* start of the list; released in uloc_kw_closeKeywords() */
    char* current;   /* name that the next call to uloc_kw_nextKeyword() returns */
} UKeywordsContext;
1379
1380U_CDECL_BEGIN
1381
1382static void U_CALLCONV
1383uloc_kw_closeKeywords(UEnumeration *enumerator) {
1384 uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1385 uprv_free(enumerator->context);
1386 uprv_free(enumerator);
1387}
1388
1389static int32_t U_CALLCONV
1390uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1391 char *kw = ((UKeywordsContext *)en->context)->keywords;
1392 int32_t result = 0;
1393 while(*kw) {
1394 result++;
1395 kw += uprv_strlen(kw)+1;
1396 }
1397 return result;
1398}
1399
1400static const char * U_CALLCONV
1401uloc_kw_nextKeyword(UEnumeration* en,
1402 int32_t* resultLength,
1403 UErrorCode* /*status*/) {
1404 const char* result = ((UKeywordsContext *)en->context)->current;
1405 int32_t len = 0;
1406 if(*result) {
1407 len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1408 ((UKeywordsContext *)en->context)->current += len+1;
1409 } else {
1410 result = NULL;
1411 }
1412 if (resultLength) {
1413 *resultLength = len;
1414 }
1415 return result;
1416}
1417
1418static void U_CALLCONV
1419uloc_kw_resetKeywords(UEnumeration* en,
1420 UErrorCode* /*status*/) {
1421 ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1422}
1423
1424U_CDECL_END
1425
1426
1427static const UEnumeration gKeywordsEnum = {
1428 NULL,
1429 NULL,
1430 uloc_kw_closeKeywords,
1431 uloc_kw_countKeywords,
1432 uenum_unextDefault,
1433 uloc_kw_nextKeyword,
1434 uloc_kw_resetKeywords
1435};
1436
1437U_CAPI UEnumeration* U_EXPORT2
1438uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1439{
1440 LocalMemory<UKeywordsContext> myContext;
1441 LocalMemory<UEnumeration> result;
1442
1443 if (U_FAILURE(*status)) {
1444 return nullptr;
1445 }
1446 myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
1447 result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
1448 if (myContext.isNull() || result.isNull()) {
1449 *status = U_MEMORY_ALLOCATION_ERROR;
1450 return nullptr;
1451 }
1452 uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));
1453 myContext->keywords = static_cast<char *>(uprv_malloc(keywordListSize+1));
1454 if (myContext->keywords == nullptr) {
1455 *status = U_MEMORY_ALLOCATION_ERROR;
1456 return nullptr;
1457 }
1458 uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1459 myContext->keywords[keywordListSize] = 0;
1460 myContext->current = myContext->keywords;
1461 result->context = myContext.orphan();
1462 return result.orphan();
1463}
1464
1465U_CAPI UEnumeration* U_EXPORT2
1466uloc_openKeywords(const char* localeID,
1467 UErrorCode* status)
1468{
1469 int32_t i=0;
1470 char keywords[256];
1471 int32_t keywordsCapacity = 256;
1472 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1473 const char* tmpLocaleID;
1474
1475 if(status==NULL || U_FAILURE(*status)) {
1476 return 0;
1477 }
1478
1479 if (_hasBCP47Extension(localeID)) {
1480 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1481 } else {
1482 if (localeID==NULL) {
1483 localeID=uloc_getDefault();
1484 }
1485 tmpLocaleID=localeID;
1486 }
1487
1488 /* Skip the language */
1489 ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1490 if(_isIDSeparator(*tmpLocaleID)) {
1491 const char *scriptID;
1492 /* Skip the script if available */
1493 ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1494 if(scriptID != tmpLocaleID+1) {
1495 /* Found optional script */
1496 tmpLocaleID = scriptID;
1497 }
1498 /* Skip the Country */
1499 if (_isIDSeparator(*tmpLocaleID)) {
1500 ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1501 if(_isIDSeparator(*tmpLocaleID)) {
1502 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1503 }
1504 }
1505 }
1506
1507 /* keywords are located after '@' */
1508 if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1509 i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, FALSE, status);
1510 }
1511
1512 if(i) {
1513 return uloc_openKeywordList(keywords, i, status);
1514 } else {
1515 return NULL;
1516 }
1517}
1518
1519
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* TRUE if any bit of 'mask' is set in 'options'. Both arguments are
 * parenthesized so that compound expressions such as (a | b) work. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The tag "i-default"; deliberately not NUL-terminated, always used together
 * with I_DEFAULT_LENGTH. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1528
1529/**
1530 * Canonicalize the given localeID, to level 1 or to level 2,
1531 * depending on the options. To specify level 1, pass in options=0.
1532 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1533 *
1534 * This is the code underlying uloc_getName and uloc_canonicalize.
1535 */
1536static void
1537_canonicalize(const char* localeID,
1538 ByteSink& sink,
1539 uint32_t options,
1540 UErrorCode* err) {
1541 int32_t j, fieldCount=0, scriptSize=0, variantSize=0;
1542 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1543 const char* origLocaleID;
1544 const char* tmpLocaleID;
1545 const char* keywordAssign = NULL;
1546 const char* separatorIndicator = NULL;
1547
1548 if (U_FAILURE(*err)) {
1549 return;
1550 }
1551
1552 if (_hasBCP47Extension(localeID)) {
1553 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1554 } else {
1555 if (localeID==NULL) {
1556 localeID=uloc_getDefault();
1557 }
1558 tmpLocaleID=localeID;
1559 }
1560
1561 origLocaleID=tmpLocaleID;
1562
1563 /* get all pieces, one after another, and separate with '_' */
1564 CharString tag = ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);
1565
1566 if (tag.length() == I_DEFAULT_LENGTH &&
1567 uprv_strncmp(origLocaleID, i_default, I_DEFAULT_LENGTH) == 0) {
1568 tag.clear();
1569 tag.append(uloc_getDefault(), *err);
1570 } else if(_isIDSeparator(*tmpLocaleID)) {
1571 const char *scriptID;
1572
1573 ++fieldCount;
1574 tag.append('_', *err);
1575
1576 CharString script = ulocimp_getScript(tmpLocaleID+1, &scriptID, *err);
1577 tag.append(script, *err);
1578 scriptSize = script.length();
1579 if(scriptSize > 0) {
1580 /* Found optional script */
1581 tmpLocaleID = scriptID;
1582 ++fieldCount;
1583 if (_isIDSeparator(*tmpLocaleID)) {
1584 /* If there is something else, then we add the _ */
1585 tag.append('_', *err);
1586 }
1587 }
1588
1589 if (_isIDSeparator(*tmpLocaleID)) {
1590 const char *cntryID;
1591
1592 CharString country = ulocimp_getCountry(tmpLocaleID+1, &cntryID, *err);
1593 tag.append(country, *err);
1594 if (!country.isEmpty()) {
1595 /* Found optional country */
1596 tmpLocaleID = cntryID;
1597 }
1598 if(_isIDSeparator(*tmpLocaleID)) {
1599 /* If there is something else, then we add the _ if we found country before. */
1600 if (!_isIDSeparator(*(tmpLocaleID+1))) {
1601 ++fieldCount;
1602 tag.append('_', *err);
1603 }
1604
1605 variantSize = -tag.length();
1606 {
1607 CharStringByteSink s(&tag);
1608 _getVariantEx(tmpLocaleID+1, *tmpLocaleID, s, FALSE);
1609 }
1610 variantSize += tag.length();
1611 if (variantSize > 0) {
1612 tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1613 }
1614 }
1615 }
1616 }
1617
1618 /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1619 if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1620 UBool done = FALSE;
1621 do {
1622 char c = *tmpLocaleID;
1623 switch (c) {
1624 case 0:
1625 case '@':
1626 done = TRUE;
1627 break;
1628 default:
1629 tag.append(c, *err);
1630 ++tmpLocaleID;
1631 break;
1632 }
1633 } while (!done);
1634 }
1635
1636 /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1637 After this, tmpLocaleID either points to '@' or is NULL */
1638 if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1639 keywordAssign = uprv_strchr(tmpLocaleID, '=');
1640 separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1641 }
1642
1643 /* Copy POSIX-style variant, if any [mr@FOO] */
1644 if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1645 tmpLocaleID != NULL && keywordAssign == NULL) {
1646 for (;;) {
1647 char c = *tmpLocaleID;
1648 if (c == 0) {
1649 break;
1650 }
1651 tag.append(c, *err);
1652 ++tmpLocaleID;
1653 }
1654 }
1655
1656 if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1657 /* Handle @FOO variant if @ is present and not followed by = */
1658 if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1659 /* Add missing '_' if needed */
1660 if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1661 do {
1662 tag.append('_', *err);
1663 ++fieldCount;
1664 } while(fieldCount<2);
1665 }
1666
1667 int32_t posixVariantSize = -tag.length();
1668 {
1669 CharStringByteSink s(&tag);
1670 _getVariantEx(tmpLocaleID+1, '@', s, (UBool)(variantSize > 0));
1671 }
1672 posixVariantSize += tag.length();
1673 if (posixVariantSize > 0) {
1674 variantSize += posixVariantSize;
1675 }
1676 }
1677
1678 /* Look up the ID in the canonicalization map */
1679 for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1680 StringPiece id(CANONICALIZE_MAP[j].id);
1681 if (tag == id) {
1682 if (id.empty() && tmpLocaleID != NULL) {
1683 break; /* Don't remap "" if keywords present */
1684 }
1685 tag.clear();
1686 tag.append(CANONICALIZE_MAP[j].canonicalID, *err);
1687 break;
1688 }
1689 }
1690 }
1691
1692 sink.Append(tag.data(), tag.length());
1693
1694 if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1695 if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1696 (!separatorIndicator || separatorIndicator > keywordAssign)) {
1697 sink.Append("@", 1);
1698 ++fieldCount;
1699 _getKeywords(tmpLocaleID+1, '@', sink, TRUE, err);
1700 }
1701 }
1702}
1703
1704/* ### ID parsing API **************************************************/
1705
1706U_CAPI int32_t U_EXPORT2
1707uloc_getParent(const char* localeID,
1708 char* parent,
1709 int32_t parentCapacity,
1710 UErrorCode* err)
1711{
1712 const char *lastUnderscore;
1713 int32_t i;
1714
1715 if (U_FAILURE(*err))
1716 return 0;
1717
1718 if (localeID == NULL)
1719 localeID = uloc_getDefault();
1720
1721 lastUnderscore=uprv_strrchr(localeID, '_');
1722 if(lastUnderscore!=NULL) {
1723 i=(int32_t)(lastUnderscore-localeID);
1724 } else {
1725 i=0;
1726 }
1727
1728 if (i > 0) {
1729 if (uprv_strnicmp(localeID, "und_", 4) == 0) {
1730 localeID += 3;
1731 i -= 3;
1732 uprv_memmove(parent, localeID, uprv_min(i, parentCapacity));
1733 } else if (parent != localeID) {
1734 uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1735 }
1736 }
1737
1738 return u_terminateChars(parent, parentCapacity, i, err);
1739}
1740
1741U_CAPI int32_t U_EXPORT2
1742uloc_getLanguage(const char* localeID,
1743 char* language,
1744 int32_t languageCapacity,
1745 UErrorCode* err)
1746{
1747 /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1748 int32_t i=0;
1749
1750 if (err==NULL || U_FAILURE(*err)) {
1751 return 0;
1752 }
1753
1754 if(localeID==NULL) {
1755 localeID=uloc_getDefault();
1756 }
1757
1758 i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1759 return u_terminateChars(language, languageCapacity, i, err);
1760}
1761
1762U_CAPI int32_t U_EXPORT2
1763uloc_getScript(const char* localeID,
1764 char* script,
1765 int32_t scriptCapacity,
1766 UErrorCode* err)
1767{
1768 int32_t i=0;
1769
1770 if(err==NULL || U_FAILURE(*err)) {
1771 return 0;
1772 }
1773
1774 if(localeID==NULL) {
1775 localeID=uloc_getDefault();
1776 }
1777
1778 /* skip the language */
1779 ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1780 if(_isIDSeparator(*localeID)) {
1781 i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1782 }
1783 return u_terminateChars(script, scriptCapacity, i, err);
1784}
1785
1786U_CAPI int32_t U_EXPORT2
1787uloc_getCountry(const char* localeID,
1788 char* country,
1789 int32_t countryCapacity,
1790 UErrorCode* err)
1791{
1792 int32_t i=0;
1793
1794 if(err==NULL || U_FAILURE(*err)) {
1795 return 0;
1796 }
1797
1798 if(localeID==NULL) {
1799 localeID=uloc_getDefault();
1800 }
1801
1802 /* Skip the language */
1803 ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1804 if(_isIDSeparator(*localeID)) {
1805 const char *scriptID;
1806 /* Skip the script if available */
1807 ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1808 if(scriptID != localeID+1) {
1809 /* Found optional script */
1810 localeID = scriptID;
1811 }
1812 if(_isIDSeparator(*localeID)) {
1813 i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1814 }
1815 }
1816 return u_terminateChars(country, countryCapacity, i, err);
1817}
1818
/**
 * Copies the variant portion of a locale ID into a caller-supplied buffer.
 *
 * The ID is walked subtag by subtag: language, optional script, optional
 * country, and then whatever follows the next separator is handed to
 * _getVariant().
 *
 * @param localeID        locale ID to inspect; NULL selects uloc_getDefault()
 * @param variant         destination buffer for the variant
 * @param variantCapacity capacity of the destination buffer
 * @param err             in/out error code; the function returns 0 at once if
 *                        it is NULL or already indicates a failure
 * @return the result of u_terminateChars() applied to the variant length
 *         (the length is 0 when no variant position is reached)
 */
U_CAPI int32_t U_EXPORT2
uloc_getVariant(const char* localeID,
                char* variant,
                int32_t variantCapacity,
                UErrorCode* err)
{
    char tempBuffer[ULOC_FULLNAME_CAPACITY];
    const char* tmpLocaleID;
    int32_t i=0;

    if(err==NULL || U_FAILURE(*err)) {
        return 0;
    }

    /* NOTE(review): both macros are defined outside this chunk; presumably
       _ConvertBCP47 converts a BCP 47 tag into tempBuffer and sets
       tmpLocaleID to the ID to parse -- confirm against the macro. */
    if (_hasBCP47Extension(localeID)) {
        _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
    } else {
        if (localeID==NULL) {
            localeID=uloc_getDefault();
        }
        tmpLocaleID=localeID;
    }

    /* Skip the language */
    ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
    if(_isIDSeparator(*tmpLocaleID)) {
        const char *scriptID;
        /* Skip the script if available */
        ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
        if(scriptID != tmpLocaleID+1) {
            /* Found optional script */
            tmpLocaleID = scriptID;
        }
        /* Skip the Country */
        if (_isIDSeparator(*tmpLocaleID)) {
            const char *cntryID;
            ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
            if (cntryID != tmpLocaleID+1) {
                /* Found optional country */
                tmpLocaleID = cntryID;
            }
            if(_isIDSeparator(*tmpLocaleID)) {
                /* If there was no country ID, skip a possible extra IDSeparator */
                /* (tmpLocaleID != cntryID means the cursor was not advanced
                   over a country just above) */
                if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
                    tmpLocaleID++;
                }
                /* The separator character itself is passed as the second
                   argument of _getVariant(). */
                i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
            }
        }
    }

    /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
    /* if we do not have a variant tag yet then try a POSIX variant after '@' */
/*
    if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
        i=_getVariant(localeID+1, '@', variant, variantCapacity);
    }
*/
    return u_terminateChars(variant, variantCapacity, i, err);
}
1879
1880U_CAPI int32_t U_EXPORT2
1881uloc_getName(const char* localeID,
1882 char* name,
1883 int32_t nameCapacity,
1884 UErrorCode* err)
1885{
1886 if (U_FAILURE(*err)) {
1887 return 0;
1888 }
1889
1890 CheckedArrayByteSink sink(name, nameCapacity);
1891 ulocimp_getName(localeID, sink, err);
1892
1893 int32_t reslen = sink.NumberOfBytesAppended();
1894
1895 if (U_FAILURE(*err)) {
1896 return reslen;
1897 }
1898
1899 if (sink.Overflowed()) {
1900 *err = U_BUFFER_OVERFLOW_ERROR;
1901 } else {
1902 u_terminateChars(name, nameCapacity, reslen, err);
1903 }
1904
1905 return reslen;
1906}
1907
/**
 * ByteSink variant used by uloc_getName(): forwards to _canonicalize() with
 * no option flags (0), so the result is appended to sink and errors are
 * reported through err.
 */
U_STABLE void U_EXPORT2
ulocimp_getName(const char* localeID,
                ByteSink& sink,
                UErrorCode* err)
{
    _canonicalize(localeID, sink, 0, err);
}
1915
1916U_CAPI int32_t U_EXPORT2
1917uloc_getBaseName(const char* localeID,
1918 char* name,
1919 int32_t nameCapacity,
1920 UErrorCode* err)
1921{
1922 if (U_FAILURE(*err)) {
1923 return 0;
1924 }
1925
1926 CheckedArrayByteSink sink(name, nameCapacity);
1927 ulocimp_getBaseName(localeID, sink, err);
1928
1929 int32_t reslen = sink.NumberOfBytesAppended();
1930
1931 if (U_FAILURE(*err)) {
1932 return reslen;
1933 }
1934
1935 if (sink.Overflowed()) {
1936 *err = U_BUFFER_OVERFLOW_ERROR;
1937 } else {
1938 u_terminateChars(name, nameCapacity, reslen, err);
1939 }
1940
1941 return reslen;
1942}
1943
/**
 * ByteSink variant used by uloc_getBaseName(): forwards to _canonicalize()
 * with the _ULOC_STRIP_KEYWORDS option, so the result is appended to sink and
 * errors are reported through err.
 */
U_STABLE void U_EXPORT2
ulocimp_getBaseName(const char* localeID,
                    ByteSink& sink,
                    UErrorCode* err)
{
    _canonicalize(localeID, sink, _ULOC_STRIP_KEYWORDS, err);
}
1951
1952U_CAPI int32_t U_EXPORT2
1953uloc_canonicalize(const char* localeID,
1954 char* name,
1955 int32_t nameCapacity,
1956 UErrorCode* err)
1957{
1958 if (U_FAILURE(*err)) {
1959 return 0;
1960 }
1961
1962 CheckedArrayByteSink sink(name, nameCapacity);
1963 ulocimp_canonicalize(localeID, sink, err);
1964
1965 int32_t reslen = sink.NumberOfBytesAppended();
1966
1967 if (U_FAILURE(*err)) {
1968 return reslen;
1969 }
1970
1971 if (sink.Overflowed()) {
1972 *err = U_BUFFER_OVERFLOW_ERROR;
1973 } else {
1974 u_terminateChars(name, nameCapacity, reslen, err);
1975 }
1976
1977 return reslen;
1978}
1979
/**
 * ByteSink variant used by uloc_canonicalize(): forwards to _canonicalize()
 * with the _ULOC_CANONICALIZE option, so the result is appended to sink and
 * errors are reported through err.
 */
U_STABLE void U_EXPORT2
ulocimp_canonicalize(const char* localeID,
                     ByteSink& sink,
                     UErrorCode* err)
{
    _canonicalize(localeID, sink, _ULOC_CANONICALIZE, err);
}
1987
1988U_CAPI const char* U_EXPORT2
1989uloc_getISO3Language(const char* localeID)
1990{
1991 int16_t offset;
1992 char lang[ULOC_LANG_CAPACITY];
1993 UErrorCode err = U_ZERO_ERROR;
1994
1995 if (localeID == NULL)
1996 {
1997 localeID = uloc_getDefault();
1998 }
1999 uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2000 if (U_FAILURE(err))
2001 return "";
2002 offset = _findIndex(LANGUAGES, lang);
2003 if (offset < 0)
2004 return "";
2005 return LANGUAGES_3[offset];
2006}
2007
2008U_CAPI const char* U_EXPORT2
2009uloc_getISO3Country(const char* localeID)
2010{
2011 int16_t offset;
2012 char cntry[ULOC_LANG_CAPACITY];
2013 UErrorCode err = U_ZERO_ERROR;
2014
2015 if (localeID == NULL)
2016 {
2017 localeID = uloc_getDefault();
2018 }
2019 uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2020 if (U_FAILURE(err))
2021 return "";
2022 offset = _findIndex(COUNTRIES, cntry);
2023 if (offset < 0)
2024 return "";
2025
2026 return COUNTRIES_3[offset];
2027}
2028
2029U_CAPI uint32_t U_EXPORT2
2030uloc_getLCID(const char* localeID)
2031{
2032 UErrorCode status = U_ZERO_ERROR;
2033 char langID[ULOC_FULLNAME_CAPACITY];
2034 uint32_t lcid = 0;
2035
2036 /* Check for incomplete id. */
2037 if (!localeID || uprv_strlen(localeID) < 2) {
2038 return 0;
2039 }
2040
2041 // First, attempt Windows platform lookup if available, but fall
2042 // through to catch any special cases (ICU vs Windows name differences).
2043 lcid = uprv_convertToLCIDPlatform(localeID, &status);
2044 if (U_FAILURE(status)) {
2045 return 0;
2046 }
2047 if (lcid > 0) {
2048 // Windows found an LCID, return that
2049 return lcid;
2050 }
2051
2052 uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2053 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) {
2054 return 0;
2055 }
2056
2057 if (uprv_strchr(localeID, '@')) {
2058 // uprv_convertToLCID does not support keywords other than collation.
2059 // Remove all keywords except collation.
2060 int32_t len;
2061 char collVal[ULOC_KEYWORDS_CAPACITY];
2062 char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2063
2064 len = uloc_getKeywordValue(localeID, "collation", collVal,
2065 UPRV_LENGTHOF(collVal) - 1, &status);
2066
2067 if (U_SUCCESS(status) && len > 0) {
2068 collVal[len] = 0;
2069
2070 len = uloc_getBaseName(localeID, tmpLocaleID,
2071 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
2072
2073 if (U_SUCCESS(status) && len > 0) {
2074 tmpLocaleID[len] = 0;
2075
2076 len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
2077 UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
2078
2079 if (U_SUCCESS(status) && len > 0) {
2080 tmpLocaleID[len] = 0;
2081 return uprv_convertToLCID(langID, tmpLocaleID, &status);
2082 }
2083 }
2084 }
2085
2086 // fall through - all keywords are simply ignored
2087 status = U_ZERO_ERROR;
2088 }
2089
2090 return uprv_convertToLCID(langID, localeID, &status);
2091}
2092
/**
 * Thin wrapper: passes all arguments straight to uprv_convertToPosix() and
 * returns its result unchanged.
 */
U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
                UErrorCode *status)
{
    return uprv_convertToPosix(hostid, locale, localeCapacity, status);
}
2099
2100/* ### Default locale **************************************************/
2101
/**
 * Returns the default locale ID as reported by locale_get_default()
 * (declared near the top of this file as coming from locid.cpp).
 */
U_CAPI const char* U_EXPORT2
uloc_getDefault()
{
    return locale_get_default();
}
2107
2108U_CAPI void U_EXPORT2
2109uloc_setDefault(const char* newDefaultLocale,
2110 UErrorCode* err)
2111{
2112 if (U_FAILURE(*err))
2113 return;
2114 /* the error code isn't currently used for anything by this function*/
2115
2116 /* propagate change to C++ */
2117 locale_set_default(newDefaultLocale);
2118}
2119
2120/**
2121 * Returns a list of all 2-letter language codes defined in ISO 639. This is a pointer
2122 * to an array of pointers to arrays of char. All of these pointers are owned
2123 * by ICU-- do not delete them, and do not write through them. The array is
2124 * terminated with a null pointer.
2125 */
2126U_CAPI const char* const* U_EXPORT2
2127uloc_getISOLanguages()
2128{
2129 return LANGUAGES;
2130}
2131
2132/**
2133 * Returns a list of all 2-letter country codes defined in ISO 639. This is a
2134 * pointer to an array of pointers to arrays of char. All of these pointers are
2135 * owned by ICU-- do not delete them, and do not write through them. The array is
2136 * terminated with a null pointer.
2137 */
2138U_CAPI const char* const* U_EXPORT2
2139uloc_getISOCountries()
2140{
2141 return COUNTRIES;
2142}
2143
2144U_CAPI const char* U_EXPORT2
2145uloc_toUnicodeLocaleKey(const char* keyword)
2146{
2147 const char* bcpKey = ulocimp_toBcpKey(keyword);
2148 if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2149 // unknown keyword, but syntax is fine..
2150 return keyword;
2151 }
2152 return bcpKey;
2153}
2154
2155U_CAPI const char* U_EXPORT2
2156uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2157{
2158 const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2159 if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2160 // unknown keyword, but syntax is fine..
2161 return value;
2162 }
2163 return bcpType;
2164}
2165
2166static UBool
2167isWellFormedLegacyKey(const char* legacyKey)
2168{
2169 const char* p = legacyKey;
2170 while (*p) {
2171 if (!UPRV_ISALPHANUM(*p)) {
2172 return FALSE;
2173 }
2174 p++;
2175 }
2176 return TRUE;
2177}
2178
2179static UBool
2180isWellFormedLegacyType(const char* legacyType)
2181{
2182 const char* p = legacyType;
2183 int32_t alphaNumLen = 0;
2184 while (*p) {
2185 if (*p == '_' || *p == '/' || *p == '-') {
2186 if (alphaNumLen == 0) {
2187 return FALSE;
2188 }
2189 alphaNumLen = 0;
2190 } else if (UPRV_ISALPHANUM(*p)) {
2191 alphaNumLen++;
2192 } else {
2193 return FALSE;
2194 }
2195 p++;
2196 }
2197 return (alphaNumLen != 0);
2198}
2199
2200U_CAPI const char* U_EXPORT2
2201uloc_toLegacyKey(const char* keyword)
2202{
2203 const char* legacyKey = ulocimp_toLegacyKey(keyword);
2204 if (legacyKey == NULL) {
2205 // Checks if the specified locale key is well-formed with the legacy locale syntax.
2206 //
2207 // Note:
2208 // LDML/CLDR provides some definition of keyword syntax in
2209 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2210 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2211 // Keys can only consist of [0-9a-zA-Z].
2212 if (isWellFormedLegacyKey(keyword)) {
2213 return keyword;
2214 }
2215 }
2216 return legacyKey;
2217}
2218
2219U_CAPI const char* U_EXPORT2
2220uloc_toLegacyType(const char* keyword, const char* value)
2221{
2222 const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2223 if (legacyType == NULL) {
2224 // Checks if the specified locale type is well-formed with the legacy locale syntax.
2225 //
2226 // Note:
2227 // LDML/CLDR provides some definition of keyword syntax in
2228 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2229 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2230 // Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2231 // we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2232 if (isWellFormedLegacyType(value)) {
2233 return value;
2234 }
2235 }
2236 return legacyType;
2237}
2238
2239/*eof*/
2240