1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (C) 1997-2016, International Business Machines
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8*
9* File ULOC.CPP
10*
11* Modification History:
12*
13* Date Name Description
14* 04/01/97 aliu Creation.
15* 08/21/98 stephen JDK 1.2 sync
16* 12/08/98 rtg New Locale implementation and C API
17* 03/15/99 damiba overhaul.
18* 04/06/99 stephen changed setDefault() to realloc and copy
19* 06/14/99 stephen Changed calls to ures_open for new params
20* 07/21/99 stephen Modified setDefault() to propagate to C++
21* 05/14/04 alan 7 years later: refactored, cleaned up, fixed bugs,
22* brought canonicalization code into line with spec
23*****************************************************************************/
24
25/*
26 POSIX's locale format, from putil.c: [no spaces]
27
28 ll [ _CC ] [ . MM ] [ @ VV]
29
30 l = lang, C = ctry, M = charmap, V = variant
31*/
32
33#include "unicode/utypes.h"
34#include "unicode/ustring.h"
35#include "unicode/uloc.h"
36
37#include "putilimp.h"
38#include "ustr_imp.h"
39#include "ulocimp.h"
40#include "umutex.h"
41#include "cstring.h"
42#include "cmemory.h"
43#include "locmap.h"
44#include "uarrsort.h"
45#include "uenumimp.h"
46#include "uassert.h"
47#include "charstr.h"
48
49#include <stdio.h> /* for sprintf */
50
51U_NAMESPACE_USE
52
53/* ### Declarations **************************************************/
54
55/* Locale stuff from locid.cpp */
56U_CFUNC void locale_set_default(const char *id);
57U_CFUNC const char *locale_get_default(void);
58U_CFUNC int32_t
59locale_getKeywords(const char *localeID,
60 char prev,
61 char *keywords, int32_t keywordCapacity,
62 char *values, int32_t valuesCapacity, int32_t *valLen,
63 UBool valuesToo,
64 UErrorCode *status);
65
66/* ### Data tables **************************************************/
67
68/**
69 * Table of language codes, both 2- and 3-letter, with preference
70 * given to 2-letter codes where possible. Includes 3-letter codes
71 * that lack a 2-letter equivalent.
72 *
73 * This list must be in sorted order. This list is returned directly
74 * to the user by some API.
75 *
76 * This list must be kept in sync with LANGUAGES_3, with corresponding
77 * entries matched.
78 *
79 * This table should be terminated with a NULL entry, followed by a
80 * second list, and another NULL entry. The first list is visible to
81 * user code when this array is returned by API. The second list
82 * contains codes we support, but do not expose through user API.
83 *
84 * Notes
85 *
86 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
87 * include the revisions up to 2001/7/27 *CWB*
88 *
89 * The 3 character codes are the terminology codes like RFC 3066. This
90 * is compatible with prior ICU codes
91 *
92 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
93 * table but now at the end of the table because 3 character codes are
94 * duplicates. This avoids bad searches going from 3 to 2 character
95 * codes.
96 *
97 * The range qaa-qtz is reserved for local use
98 */
99/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
100/* ISO639 table version is 20150505 */
101/* Subsequent hand addition of selected languages */
static const char * const LANGUAGES[] = {
    /* ---- Primary list: sorted, exposed to callers. Index-aligned with LANGUAGES_3. ---- */
    /* a */
    "aa", "ab", "ace", "ach", "ada", "ady", "ae", "aeb",
    "af", "afh", "agq", "ain", "ak", "akk", "akz", "ale",
    "aln", "alt", "am", "an", "ang", "anp", "ar", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av", "avk", "awa", "ay", "az",
    /* b */
    "ba", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be", "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgn", "bho", "bi", "bik", "bin", "bjn", "bkm", "bla",
    "bm", "bn", "bo", "bpy", "bqi", "br", "bra", "brh",
    "brx", "bs", "bss", "bua", "bug", "bum", "byn", "byv",
    /* c */
    "ca", "cad", "car", "cay", "cch", "ccp", "ce", "ceb", "cgg",
    "ch", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co", "cop", "cps", "cr", "crh",
    "cs", "csb", "cu", "cv", "cy",
    /* d */
    "da", "dak", "dar", "dav", "de", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz", "dzg",
    /* e */
    "ebu", "ee", "efi", "egl", "egy", "eka", "el", "elx",
    "en", "enm", "eo", "es", "esu", "et", "eu", "ewo",
    "ext",
    /* f */
    "fa", "fan", "fat", "ff", "fi", "fil", "fit", "fj",
    "fo", "fon", "fr", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    /* g */
    "ga", "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl", "glk", "gmh", "gn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu", "guc",
    "gur", "guz", "gv", "gwi",
    /* h */
    "ha", "hai", "hak", "haw", "he", "hi", "hif", "hil",
    "hit", "hmn", "ho", "hr", "hsb", "hsn", "ht", "hu",
    "hup", "hy", "hz",
    /* i */
    "ia", "iba", "ibb", "id", "ie", "ig", "ii", "ik",
    "ilo", "inh", "io", "is", "it", "iu", "izh",
    /* j */
    "ja", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    /* k */
    "ka", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg", "kgp",
    "kha", "kho", "khq", "khw", "ki", "kiu", "kj", "kk",
    "kkj", "kl", "kln", "km", "kmb", "kn", "ko", "koi",
    "kok", "kos", "kpe", "kr", "krc", "kri", "krj", "krl",
    "kru", "ks", "ksb", "ksf", "ksh", "ku", "kum", "kut",
    "kv", "kw", "ky",
    /* l */
    "la", "lad", "lag", "lah", "lam", "lb", "lez", "lfn",
    "lg", "li", "lij", "liv", "lkt", "lmo", "ln", "lo",
    "lol", "loz", "lrc", "lt", "ltg", "lu", "lua", "lui",
    "lun", "luo", "lus", "luy", "lv", "lzh", "lzz",
    /* m */
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg", "mga",
    "mgh", "mgo", "mh", "mi", "mic", "min", "mis", "mk",
    "ml", "mn", "mnc", "mni", "mo",
    "moh", "mos", "mr", "mrj",
    "ms", "mt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my", "mye", "myv", "mzn",
    /* n */
    "na", "nan", "nap", "naq", "nb", "nd", "nds", "ne",
    "new", "ng", "nia", "niu", "njo", "nl", "nmg", "nn",
    "nnh", "no", "nog", "non", "nov", "nqo", "nr", "nso",
    "nus", "nv", "nwc", "ny", "nym", "nyn", "nyo", "nzi",
    /* o */
    "oc", "oj", "om", "or", "os", "osa", "ota",
    /* p */
    "pa", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    "pdt", "peo", "pfl", "phn", "pi", "pl", "pms", "pnt",
    "pon", "prg", "pro", "ps", "pt",
    /* q */
    "qu", "quc", "qug",
    /* r */
    "raj", "rap", "rar", "rgn", "rif", "rm", "rn", "ro",
    "rof", "rom", "rtm", "ru", "rue", "rug", "rup",
    "rw", "rwk",
    /* s */
    "sa", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc", "scn", "sco", "sd", "sdc", "sdh",
    "se", "see", "seh", "sei", "sel", "ses", "sg", "sga",
    "sgs", "shi", "shn", "shu", "si", "sid", "sk",
    "sl", "sli", "sly", "sm", "sma", "smj", "smn", "sms",
    "sn", "snk", "so", "sog", "sq", "sr", "srn", "srr",
    "ss", "ssy", "st", "stq", "su", "suk", "sus", "sux",
    "sv", "sw", "swb", "swc", "syc", "syr", "szl",
    /* t */
    "ta", "tcy", "te", "tem", "teo", "ter", "tet", "tg",
    "th", "ti", "tig", "tiv", "tk", "tkl", "tkr", "tl",
    "tlh", "tli", "tly", "tmh", "tn", "to", "tog", "tpi",
    "tr", "tru", "trv", "ts", "tsd", "tsi", "tt", "ttt",
    "tum", "tvl", "tw", "twq", "ty", "tyv", "tzm",
    /* u */
    "udm", "ug", "uga", "uk", "umb", "und", "ur", "uz",
    /* v */
    "vai", "ve", "vec", "vep", "vi", "vls", "vmf", "vo",
    "vot", "vro", "vun",
    /* w */
    "wa", "wae", "wal", "war", "was", "wbp", "wo", "wuu",
    /* x */
    "xal", "xh", "xmf", "xog",
    /* y */
    "yao", "yap", "yav", "ybb", "yi", "yo", "yrl", "yue",
    /* z */
    "za", "zap", "zbl", "zea", "zen", "zgh", "zh", "zu",
    "zun", "zxx", "zza",
    NULL, /* end of the primary list */

    /* ---- Secondary list: obsolete language codes, kept after the first terminator. ---- */
    "in", "iw", "ji", "jw", "sh",
    NULL  /* end of the secondary list */
};
192
/*
 * Deprecated language codes and their replacements. The two tables are
 * index-aligned: REPLACEMENT_LANGUAGES[i] is the current code for
 * DEPRECATED_LANGUAGES[i]. Both end with two NULL entries.
 */
static const char* const DEPRECATED_LANGUAGES[]={
    "in",
    "iw",
    "ji",
    "jw",
    NULL,
    NULL
};
static const char* const REPLACEMENT_LANGUAGES[]={
    "id", /* in */
    "he", /* iw */
    "yi", /* ji */
    "jv", /* jw */
    NULL,
    NULL
};
199
200/**
201 * Table of 3-letter language codes.
202 *
203 * This is a lookup table used to convert 3-letter language codes to
204 * their 2-letter equivalent, where possible. It must be kept in sync
205 * with LANGUAGES. For all valid i, LANGUAGES[i] must refer to the
 * same language as LANGUAGES_3[i]. The commented-out line in the
 * obsolete-code section is copied from LANGUAGES to make visual
 * cross-checking easier.
208 *
209 * Where a 3-letter language code has no 2-letter equivalent, the
210 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
211 *
212 * This table should be terminated with a NULL entry, followed by a
213 * second list, and another NULL entry. The two lists correspond to
214 * the two lists in LANGUAGES.
215 */
216/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
217/* ISO639 table version is 20150505 */
218/* Subsequent hand addition of selected languages */
static const char * const LANGUAGES_3[] = {
    /*
     * ---- Primary list: index-aligned with the primary list of LANGUAGES. ----
     * Group labels below are the first letter of the matching LANGUAGES
     * entries, not necessarily of the 3-letter codes themselves.
     */
    /* a */
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    /* b */
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    /* c */
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "chu", "chv", "cym",
    /* d */
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    /* e */
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    /* f */
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    /* g */
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    /* h */
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    /* i */
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    /* j */
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    /* k */
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kir",
    /* l */
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    /* m */
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni", "mol",
    "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    /* n */
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    /* o */
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    /* p */
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    /* q */
    "que", "quc", "qug",
    /* r */
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    /* s */
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "swc", "syc", "syr", "szl",
    /* t */
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    /* u */
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    /* v */
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
    "vot", "vro", "vun",
    /* w */
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    /* x */
    "xal", "xho", "xmf", "xog",
    /* y */
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    /* z */
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
    NULL, /* end of the primary list */

    /* ---- Secondary list: matches the obsolete codes "in", "iw", "ji", "jw", "sh". ---- */
    "ind", "heb", "yid", "jaw", "srp",
    NULL  /* end of the secondary list */
};
310
311/**
312 * Table of 2-letter country codes.
313 *
314 * This list must be in sorted order. This list is returned directly
315 * to the user by some API.
316 *
317 * This list must be kept in sync with COUNTRIES_3, with corresponding
318 * entries matched.
319 *
320 * This table should be terminated with a NULL entry, followed by a
321 * second list, and another NULL entry. The first list is visible to
322 * user code when this array is returned by API. The second list
323 * contains codes we support, but do not expose through user API.
324 *
325 * Notes:
326 *
327 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
328 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
329 * new codes keeping the old ones for compatibility updated to include
330 * 1999/12/03 revisions *CWB*
331 *
332 * RO(ROM) is now RO(ROU) according to
333 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
334 */
static const char * const COUNTRIES[] = {
    /* ---- Primary list: sorted, exposed to callers. Index-aligned with COUNTRIES_3. ---- */
    "AD", "AE", "AF", "AG", "AI", "AL", "AM",
    "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ",
    "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI",
    "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV",
    "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG",
    "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CR",
    "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DJ", "DK",
    "DM", "DO", "DZ", "EC", "EE", "EG", "EH", "ER",
    "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR",
    "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL",
    "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU",
    "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU",
    "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS",
    "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI",
    "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA",
    "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU",
    "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK",
    "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS",
    "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA",
    "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP",
    "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG",
    "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT",
    "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA",
    "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ",
    "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV",
    "SX", "SY", "SZ", "TC", "TD", "TF", "TG", "TH", "TJ",
    "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV",
    "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ",
    "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF",
    "WS", "YE", "YT", "ZA", "ZM", "ZW",
    NULL, /* end of the primary list */

    /* ---- Secondary list: obsolete country codes, kept after the first terminator. ---- */
    "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",
    NULL  /* end of the secondary list */
};
370
/*
 * Deprecated country codes and their replacements. The two tables are
 * index-aligned: REPLACEMENT_COUNTRIES[i] is the current code for
 * DEPRECATED_COUNTRIES[i]. Both end with two NULL entries.
 */
static const char* const DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD",
    "DY", "FX", "HV", "NH",
    "RH", "SU", "TP", "UK",
    "VD", "YD", "YU", "ZR",
    NULL, NULL
};
static const char* const REPLACEMENT_COUNTRIES[] = {
 /* "AN", "BU", "CS", "DD", */
    "CW", "MM", "RS", "DE",
 /* "DY", "FX", "HV", "NH", */
    "BJ", "FR", "BF", "VU",
 /* "RH", "SU", "TP", "UK", */
    "ZW", "RU", "TL", "GB",
 /* "VD", "YD", "YU", "ZR"  */
    "VN", "YE", "RS", "CD",
    NULL, NULL
};
378
379/**
380 * Table of 3-letter country codes.
381 *
382 * This is a lookup table used to convert 3-letter country codes to
383 * their 2-letter equivalent. It must be kept in sync with COUNTRIES.
384 * For all valid i, COUNTRIES[i] must refer to the same country as
385 * COUNTRIES_3[i]. The commented-out lines are copied from COUNTRIES
386 * to make eyeballing this baby easier.
387 *
388 * This table should be terminated with a NULL entry, followed by a
389 * second list, and another NULL entry. The two lists correspond to
390 * the two lists in COUNTRIES.
391 */
static const char * const COUNTRIES_3[] = {
    /* ---- Primary list: each row is preceded by the matching COUNTRIES row. ---- */
 /* "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM", */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
 /* "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ", */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
 /* "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI", */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
 /* "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV", */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
 /* "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG", */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
 /* "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR", */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
 /* "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK", */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
 /* "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER", */
    "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
 /* "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR", */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
 /* "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL", */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
 /* "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU", */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
 /* "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU", */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
 /* "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS", */
    "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
 /* "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI", */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
 /* "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA", */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
 /* "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU", */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
 /* "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK", */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
 /* "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS", */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
 /* "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA", */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
 /* "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP", */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
 /* "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG", */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
 /* "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT", */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
 /* "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA", */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
 /* "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ", */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
 /* "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV", */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
 /* "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ", */
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
 /* "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV", */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
 /* "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ", */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
 /* "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF", */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
 /* "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW", */
    "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
    NULL, /* end of the primary list */

    /* ---- Secondary list: matches the obsolete codes of COUNTRIES. ---- */
 /* "AN",  "BU",  "CS",  "FX",  "RO",  "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
    NULL  /* end of the secondary list */
};
458
/** One row of a locale-ID canonicalization table: maps an input ID to its canonical form. */
typedef struct CanonicalizationMap {
    /** Locale ID as it may appear on input. */
    const char *id;
    /** Canonicalized ID to produce for that input. */
    const char *canonicalID;
} CanonicalizationMap;
463
464/**
465 * A map to canonicalize locale IDs. This handles a variety of
466 * different semantic kinds of transformations.
467 */
468static const CanonicalizationMap CANONICALIZE_MAP[] = {
469 { "art_LOJBAN", "jbo" }, /* registered name */
470 { "hy__AREVELA", "hy" }, /* Registered IANA variant */
471 { "hy__AREVMDA", "hyw" }, /* Registered IANA variant */
472 { "zh_GAN", "gan" }, /* registered name */
473 { "zh_GUOYU", "zh" }, /* registered name */
474 { "zh_HAKKA", "hak" }, /* registered name */
475 { "zh_MIN_NAN", "nan" }, /* registered name */
476 { "zh_WUU", "wuu" }, /* registered name */
477 { "zh_XIANG", "hsn" }, /* registered name */
478 { "zh_YUE", "yue" }, /* registered name */
479};
480
481/* ### BCP47 Conversion *******************************************/
/*
 * Tests whether a locale ID looks like a BCP47 tag carrying an extension:
 * it is non-NULL, contains no '@' (so it has no legacy keyword section),
 * and getShortestSubtagLength() reports a one-character subtag, i.e. an
 * extension singleton such as the "u" in "en-u-ca-buddhist".
 *
 * The argument is evaluated more than once, so it must be free of side
 * effects. The macro only refers to its own parameter; it does not rely on
 * the caller having a variable named localeID.
 */
#define _hasBCP47Extension(id) ((id) != NULL && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts a BCP47 language tag to a Unicode locale ID.
 *
 * uloc_forLanguageTag() is asked to write the converted ID into buffer. When
 * it returns a positive length, *err is a success code, and the result was
 * NUL-terminated, finalID is set to buffer. In every other case finalID is
 * set to the unconverted id; a U_STRING_NOT_TERMINATED_WARNING is turned
 * into U_BUFFER_OVERFLOW_ERROR on the way.
 */
#define _ConvertBCP47(finalID, id, buffer, length,err) UPRV_BLOCK_MACRO_BEGIN { \
    if (uloc_forLanguageTag(id, buffer, length, NULL, err) > 0 && \
            U_SUCCESS(*err) && *err != U_STRING_NOT_TERMINATED_WARNING) { \
        finalID=buffer; \
    } else { \
        if (*err == U_STRING_NOT_TERMINATED_WARNING) { *err = U_BUFFER_OVERFLOW_ERROR; } \
        finalID=id; \
    } \
} UPRV_BLOCK_MACRO_END
494/* Gets the size of the shortest subtag in the given localeID. */
495static int32_t getShortestSubtagLength(const char *localeID) {
496 int32_t localeIDLength = static_cast<int32_t>(uprv_strlen(localeID));
497 int32_t length = localeIDLength;
498 int32_t tmpLength = 0;
499 int32_t i;
500 UBool reset = TRUE;
501
502 for (i = 0; i < localeIDLength; i++) {
503 if (localeID[i] != '_' && localeID[i] != '-') {
504 if (reset) {
505 tmpLength = 0;
506 reset = FALSE;
507 }
508 tmpLength++;
509 } else {
510 if (tmpLength != 0 && tmpLength < length) {
511 length = tmpLength;
512 }
513 reset = TRUE;
514 }
515 }
516
517 return length;
518}
519
520/* ### Keywords **************************************************/
/* True when c is an ASCII decimal digit. The argument is evaluated twice. */
#define UPRV_ISDIGIT(c) ((c) >= '0' && (c) <= '9')
/* True when c is an ASCII letter or an ASCII decimal digit. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c))
/* Punctuation/symbols allowed in legacy key values: '_', '-', '+' and '/'. */
#define UPRV_OK_VALUE_PUNCTUATION(c) \
    ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')

/* Size in bytes of the buffers that hold a keyword name, terminator included. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Maximum number of distinct keywords collected from one locale ID. */
#define ULOC_MAX_NO_KEYWORDS 25
528
529U_CAPI const char * U_EXPORT2
530locale_getKeywordsStart(const char *localeID) {
531 const char *result = NULL;
532 if((result = uprv_strchr(localeID, '@')) != NULL) {
533 return result;
534 }
535#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
536 else {
537 /* We do this because the @ sign is variant, and the @ sign used on one
538 EBCDIC machine won't be compiled the same way on other EBCDIC based
539 machines. */
540 static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
541 const uint8_t *charToFind = ebcdicSigns;
542 while(*charToFind) {
543 if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
544 return result;
545 }
546 charToFind++;
547 }
548 }
549#endif
550 return NULL;
551}
552
553/**
554 * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
555 * @param keywordName incoming name to be canonicalized
556 * @param status return status (keyword too long)
557 * @return length of the keyword name
558 */
559static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
560{
561 int32_t keywordNameLen = 0;
562
563 for (; *keywordName != 0; keywordName++) {
564 if (!UPRV_ISALPHANUM(*keywordName)) {
565 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
566 return 0;
567 }
568 if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
569 buf[keywordNameLen++] = uprv_tolower(*keywordName);
570 } else {
571 /* keyword name too long for internal buffer */
572 *status = U_INTERNAL_PROGRAM_ERROR;
573 return 0;
574 }
575 }
576 if (keywordNameLen == 0) {
577 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
578 return 0;
579 }
580 buf[keywordNameLen] = 0; /* terminate */
581
582 return keywordNameLen;
583}
584
585typedef struct {
586 char keyword[ULOC_KEYWORD_BUFFER_LEN];
587 int32_t keywordLen;
588 const char *valueStart;
589 int32_t valueLen;
590} KeywordStruct;
591
592static int32_t U_CALLCONV
593compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
594 const char* leftString = ((const KeywordStruct *)left)->keyword;
595 const char* rightString = ((const KeywordStruct *)right)->keyword;
596 return uprv_strcmp(leftString, rightString);
597}
598
599static int32_t
600_getKeywords(const char *localeID,
601 char prev,
602 char *keywords, int32_t keywordCapacity,
603 char *values, int32_t valuesCapacity, int32_t *valLen,
604 UBool valuesToo,
605 UErrorCode *status)
606{
607 KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
608
609 int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
610 int32_t numKeywords = 0;
611 const char* pos = localeID;
612 const char* equalSign = NULL;
613 const char* semicolon = NULL;
614 int32_t i = 0, j, n;
615 int32_t keywordsLen = 0;
616 int32_t valuesLen = 0;
617
618 if(prev == '@') { /* start of keyword definition */
619 /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
620 do {
621 UBool duplicate = FALSE;
622 /* skip leading spaces */
623 while(*pos == ' ') {
624 pos++;
625 }
626 if (!*pos) { /* handle trailing "; " */
627 break;
628 }
629 if(numKeywords == maxKeywords) {
630 *status = U_INTERNAL_PROGRAM_ERROR;
631 return 0;
632 }
633 equalSign = uprv_strchr(pos, '=');
634 semicolon = uprv_strchr(pos, ';');
635 /* lack of '=' [foo@currency] is illegal */
636 /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
637 if(!equalSign || (semicolon && semicolon<equalSign)) {
638 *status = U_INVALID_FORMAT_ERROR;
639 return 0;
640 }
641 /* need to normalize both keyword and keyword name */
642 if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
643 /* keyword name too long for internal buffer */
644 *status = U_INTERNAL_PROGRAM_ERROR;
645 return 0;
646 }
647 for(i = 0, n = 0; i < equalSign - pos; ++i) {
648 if (pos[i] != ' ') {
649 keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
650 }
651 }
652
653 /* zero-length keyword is an error. */
654 if (n == 0) {
655 *status = U_INVALID_FORMAT_ERROR;
656 return 0;
657 }
658
659 keywordList[numKeywords].keyword[n] = 0;
660 keywordList[numKeywords].keywordLen = n;
661 /* now grab the value part. First we skip the '=' */
662 equalSign++;
663 /* then we leading spaces */
664 while(*equalSign == ' ') {
665 equalSign++;
666 }
667
668 /* Premature end or zero-length value */
669 if (!*equalSign || equalSign == semicolon) {
670 *status = U_INVALID_FORMAT_ERROR;
671 return 0;
672 }
673
674 keywordList[numKeywords].valueStart = equalSign;
675
676 pos = semicolon;
677 i = 0;
678 if(pos) {
679 while(*(pos - i - 1) == ' ') {
680 i++;
681 }
682 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
683 pos++;
684 } else {
685 i = (int32_t)uprv_strlen(equalSign);
686 while(i && equalSign[i-1] == ' ') {
687 i--;
688 }
689 keywordList[numKeywords].valueLen = i;
690 }
691 /* If this is a duplicate keyword, then ignore it */
692 for (j=0; j<numKeywords; ++j) {
693 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
694 duplicate = TRUE;
695 break;
696 }
697 }
698 if (!duplicate) {
699 ++numKeywords;
700 }
701 } while(pos);
702
703 /* now we have a list of keywords */
704 /* we need to sort it */
705 uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
706
707 /* Now construct the keyword part */
708 for(i = 0; i < numKeywords; i++) {
709 if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
710 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
711 if(valuesToo) {
712 keywords[keywordsLen + keywordList[i].keywordLen] = '=';
713 } else {
714 keywords[keywordsLen + keywordList[i].keywordLen] = 0;
715 }
716 }
717 keywordsLen += keywordList[i].keywordLen + 1;
718 if(valuesToo) {
719 if(keywordsLen + keywordList[i].valueLen <= keywordCapacity) {
720 uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
721 }
722 keywordsLen += keywordList[i].valueLen;
723
724 if(i < numKeywords - 1) {
725 if(keywordsLen < keywordCapacity) {
726 keywords[keywordsLen] = ';';
727 }
728 keywordsLen++;
729 }
730 }
731 if(values) {
732 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
733 uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
734 values[valuesLen + keywordList[i].valueLen] = 0;
735 }
736 valuesLen += keywordList[i].valueLen + 1;
737 }
738 }
739 if(values) {
740 values[valuesLen] = 0;
741 if(valLen) {
742 *valLen = valuesLen;
743 }
744 }
745 return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
746 } else {
747 return 0;
748 }
749}
750
751U_CFUNC int32_t
752locale_getKeywords(const char *localeID,
753 char prev,
754 char *keywords, int32_t keywordCapacity,
755 char *values, int32_t valuesCapacity, int32_t *valLen,
756 UBool valuesToo,
757 UErrorCode *status) {
758 return _getKeywords(localeID, prev, keywords, keywordCapacity,
759 values, valuesCapacity, valLen, valuesToo,
760 status);
761}
762
763U_CAPI int32_t U_EXPORT2
764uloc_getKeywordValue(const char* localeID,
765 const char* keywordName,
766 char* buffer, int32_t bufferCapacity,
767 UErrorCode* status)
768{
769 if (buffer != nullptr) {
770 buffer[0] = '\0';
771 }
772 const char* startSearchHere = NULL;
773 const char* nextSeparator = NULL;
774 char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
775 char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
776 int32_t result = 0;
777
778 if(status && U_SUCCESS(*status) && localeID) {
779 char tempBuffer[ULOC_FULLNAME_CAPACITY];
780 const char* tmpLocaleID;
781
782 if (keywordName == NULL || keywordName[0] == 0) {
783 *status = U_ILLEGAL_ARGUMENT_ERROR;
784 return 0;
785 }
786
787 locale_canonKeywordName(keywordNameBuffer, keywordName, status);
788 if(U_FAILURE(*status)) {
789 return 0;
790 }
791
792 if (_hasBCP47Extension(localeID)) {
793 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
794 } else {
795 tmpLocaleID=localeID;
796 }
797
798 startSearchHere = locale_getKeywordsStart(tmpLocaleID);
799 if(startSearchHere == NULL) {
800 /* no keywords, return at once */
801 return 0;
802 }
803
804 /* find the first keyword */
805 while(startSearchHere) {
806 const char* keyValueTail;
807 int32_t keyValueLen;
808
809 startSearchHere++; /* skip @ or ; */
810 nextSeparator = uprv_strchr(startSearchHere, '=');
811 if(!nextSeparator) {
812 *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
813 return 0;
814 }
815 /* strip leading & trailing spaces (TC decided to tolerate these) */
816 while(*startSearchHere == ' ') {
817 startSearchHere++;
818 }
819 keyValueTail = nextSeparator;
820 while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
821 keyValueTail--;
822 }
823 /* now keyValueTail points to first char after the keyName */
824 /* copy & normalize keyName from locale */
825 if (startSearchHere == keyValueTail) {
826 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
827 return 0;
828 }
829 keyValueLen = 0;
830 while (startSearchHere < keyValueTail) {
831 if (!UPRV_ISALPHANUM(*startSearchHere)) {
832 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
833 return 0;
834 }
835 if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
836 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
837 } else {
838 /* keyword name too long for internal buffer */
839 *status = U_INTERNAL_PROGRAM_ERROR;
840 return 0;
841 }
842 }
843 localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
844
845 startSearchHere = uprv_strchr(nextSeparator, ';');
846
847 if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
848 /* current entry matches the keyword. */
849 nextSeparator++; /* skip '=' */
850 /* First strip leading & trailing spaces (TC decided to tolerate these) */
851 while(*nextSeparator == ' ') {
852 nextSeparator++;
853 }
854 keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
855 while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
856 keyValueTail--;
857 }
858 /* Now copy the value, but check well-formedness */
859 if (nextSeparator == keyValueTail) {
860 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
861 return 0;
862 }
863 keyValueLen = 0;
864 while (nextSeparator < keyValueTail) {
865 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
866 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
867 return 0;
868 }
869 if (keyValueLen < bufferCapacity) {
870 /* Should we lowercase value to return here? Tests expect as-is. */
871 buffer[keyValueLen++] = *nextSeparator++;
872 } else { /* keep advancing so we return correct length in case of overflow */
873 keyValueLen++;
874 nextSeparator++;
875 }
876 }
877 result = u_terminateChars(buffer, bufferCapacity, keyValueLen, status);
878 return result;
879 }
880 }
881 }
882 return 0;
883}
884
885U_CAPI int32_t U_EXPORT2
886uloc_setKeywordValue(const char* keywordName,
887 const char* keywordValue,
888 char* buffer, int32_t bufferCapacity,
889 UErrorCode* status)
890{
891 /* TODO: sorting. removal. */
892 int32_t keywordNameLen;
893 int32_t keywordValueLen;
894 int32_t bufLen;
895 int32_t needLen = 0;
896 char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
897 char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];
898 char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
899 int32_t rc;
900 char* nextSeparator = NULL;
901 char* nextEqualsign = NULL;
902 char* startSearchHere = NULL;
903 char* keywordStart = NULL;
904 CharString updatedKeysAndValues;
905 int32_t updatedKeysAndValuesLen;
906 UBool handledInputKeyAndValue = FALSE;
907 char keyValuePrefix = '@';
908
909 if(U_FAILURE(*status)) {
910 return -1;
911 }
912 if (keywordName == NULL || keywordName[0] == 0 || bufferCapacity <= 1) {
913 *status = U_ILLEGAL_ARGUMENT_ERROR;
914 return 0;
915 }
916 bufLen = (int32_t)uprv_strlen(buffer);
917 if(bufferCapacity<bufLen) {
918 /* The capacity is less than the length?! Is this NULL terminated? */
919 *status = U_ILLEGAL_ARGUMENT_ERROR;
920 return 0;
921 }
922 keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
923 if(U_FAILURE(*status)) {
924 return 0;
925 }
926
927 keywordValueLen = 0;
928 if(keywordValue) {
929 while (*keywordValue != 0) {
930 if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
931 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
932 return 0;
933 }
934 if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
935 /* Should we force lowercase in value to set? */
936 keywordValueBuffer[keywordValueLen++] = *keywordValue++;
937 } else {
938 /* keywordValue too long for internal buffer */
939 *status = U_INTERNAL_PROGRAM_ERROR;
940 return 0;
941 }
942 }
943 }
944 keywordValueBuffer[keywordValueLen] = 0; /* terminate */
945
946 startSearchHere = (char*)locale_getKeywordsStart(buffer);
947 if(startSearchHere == NULL || (startSearchHere[1]==0)) {
948 if(keywordValueLen == 0) { /* no keywords = nothing to remove */
949 return bufLen;
950 }
951
952 needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
953 if(startSearchHere) { /* had a single @ */
954 needLen--; /* already had the @ */
955 /* startSearchHere points at the @ */
956 } else {
957 startSearchHere=buffer+bufLen;
958 }
959 if(needLen >= bufferCapacity) {
960 *status = U_BUFFER_OVERFLOW_ERROR;
961 return needLen; /* no change */
962 }
963 *startSearchHere++ = '@';
964 uprv_strcpy(startSearchHere, keywordNameBuffer);
965 startSearchHere += keywordNameLen;
966 *startSearchHere++ = '=';
967 uprv_strcpy(startSearchHere, keywordValueBuffer);
968 return needLen;
969 } /* end shortcut - no @ */
970
971 keywordStart = startSearchHere;
972 /* search for keyword */
973 while(keywordStart) {
974 const char* keyValueTail;
975 int32_t keyValueLen;
976
977 keywordStart++; /* skip @ or ; */
978 nextEqualsign = uprv_strchr(keywordStart, '=');
979 if (!nextEqualsign) {
980 *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
981 return 0;
982 }
983 /* strip leading & trailing spaces (TC decided to tolerate these) */
984 while(*keywordStart == ' ') {
985 keywordStart++;
986 }
987 keyValueTail = nextEqualsign;
988 while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
989 keyValueTail--;
990 }
991 /* now keyValueTail points to first char after the keyName */
992 /* copy & normalize keyName from locale */
993 if (keywordStart == keyValueTail) {
994 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
995 return 0;
996 }
997 keyValueLen = 0;
998 while (keywordStart < keyValueTail) {
999 if (!UPRV_ISALPHANUM(*keywordStart)) {
1000 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
1001 return 0;
1002 }
1003 if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
1004 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
1005 } else {
1006 /* keyword name too long for internal buffer */
1007 *status = U_INTERNAL_PROGRAM_ERROR;
1008 return 0;
1009 }
1010 }
1011 localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
1012
1013 nextSeparator = uprv_strchr(nextEqualsign, ';');
1014
1015 /* start processing the value part */
1016 nextEqualsign++; /* skip '=' */
1017 /* First strip leading & trailing spaces (TC decided to tolerate these) */
1018 while(*nextEqualsign == ' ') {
1019 nextEqualsign++;
1020 }
1021 keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
1022 while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
1023 keyValueTail--;
1024 }
1025 if (nextEqualsign == keyValueTail) {
1026 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
1027 return 0;
1028 }
1029
1030 rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1031 if(rc == 0) {
1032 /* Current entry matches the input keyword. Update the entry */
1033 if(keywordValueLen > 0) { /* updating a value */
1034 updatedKeysAndValues.append(keyValuePrefix, *status);
1035 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1036 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1037 updatedKeysAndValues.append('=', *status);
1038 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1039 } /* else removing this entry, don't emit anything */
1040 handledInputKeyAndValue = TRUE;
1041 } else {
1042 /* input keyword sorts earlier than current entry, add before current entry */
1043 if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
1044 /* insert new entry at this location */
1045 updatedKeysAndValues.append(keyValuePrefix, *status);
1046 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1047 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1048 updatedKeysAndValues.append('=', *status);
1049 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1050 handledInputKeyAndValue = TRUE;
1051 }
1052 /* copy the current entry */
1053 updatedKeysAndValues.append(keyValuePrefix, *status);
1054 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1055 updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
1056 updatedKeysAndValues.append('=', *status);
1057 updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), *status);
1058 }
1059 if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
1060 /* append new entry at the end, it sorts later than existing entries */
1061 updatedKeysAndValues.append(keyValuePrefix, *status);
1062 /* skip keyValuePrefix update, no subsequent key-value pair */
1063 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1064 updatedKeysAndValues.append('=', *status);
1065 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1066 handledInputKeyAndValue = TRUE;
1067 }
1068 keywordStart = nextSeparator;
1069 } /* end loop searching */
1070
1071 /* Any error from updatedKeysAndValues.append above would be internal and not due to
1072 * problems with the passed-in locale. So if we did encounter problems with the
1073 * passed-in locale above, those errors took precedence and overrode any error
1074 * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1075 * are errors here they are from updatedKeysAndValues.append; they do cause an
1076 * error return but the passed-in locale is unmodified and the original bufLen is
1077 * returned.
1078 */
1079 if (!handledInputKeyAndValue || U_FAILURE(*status)) {
1080 /* if input key/value specified removal of a keyword not present in locale, or
1081 * there was an error in CharString.append, leave original locale alone. */
1082 return bufLen;
1083 }
1084
1085 updatedKeysAndValuesLen = updatedKeysAndValues.length();
1086 /* needLen = length of the part before '@' + length of updated key-value part including '@' */
1087 needLen = (int32_t)(startSearchHere - buffer) + updatedKeysAndValuesLen;
1088 if(needLen >= bufferCapacity) {
1089 *status = U_BUFFER_OVERFLOW_ERROR;
1090 return needLen; /* no change */
1091 }
1092 if (updatedKeysAndValuesLen > 0) {
1093 uprv_strncpy(startSearchHere, updatedKeysAndValues.data(), updatedKeysAndValuesLen);
1094 }
1095 buffer[needLen]=0;
1096 return needLen;
1097}
1098
1099/* ### ID parsing implementation **************************************************/
1100
/* TRUE if 'a' is one of the letters that may start a special prefix:
 * 'x', 'X', 'i' or 'I'. The argument is parenthesized so that any
 * expression can be passed safely. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/* returns TRUE if one of the special prefixes is here (s=string)
 'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a) (((a)==0)||((a)=='.')||((a)=='@'))
1111
1112/**
1113 * Lookup 'key' in the array 'list'. The array 'list' should contain
1114 * a NULL entry, followed by more entries, and a second NULL entry.
1115 *
1116 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1117 * COUNTRIES_3.
1118 */
1119static int16_t _findIndex(const char* const* list, const char* key)
1120{
1121 const char* const* anchor = list;
1122 int32_t pass = 0;
1123
1124 /* Make two passes through two NULL-terminated arrays at 'list' */
1125 while (pass++ < 2) {
1126 while (*list) {
1127 if (uprv_strcmp(key, *list) == 0) {
1128 return (int16_t)(list - anchor);
1129 }
1130 list++;
1131 }
1132 ++list; /* skip final NULL *CWB*/
1133 }
1134 return -1;
1135}
1136
1137/* count the length of src while copying it to dest; return strlen(src) */
1138static inline int32_t
1139_copyCount(char *dest, int32_t destCapacity, const char *src) {
1140 const char *anchor;
1141 char c;
1142
1143 anchor=src;
1144 for(;;) {
1145 if((c=*src)==0) {
1146 return (int32_t)(src-anchor);
1147 }
1148 if(destCapacity<=0) {
1149 return (int32_t)((src-anchor)+uprv_strlen(src));
1150 }
1151 ++src;
1152 *dest++=c;
1153 --destCapacity;
1154 }
1155}
1156
1157U_CFUNC const char*
1158uloc_getCurrentCountryID(const char* oldID){
1159 int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1160 if (offset >= 0) {
1161 return REPLACEMENT_COUNTRIES[offset];
1162 }
1163 return oldID;
1164}
1165U_CFUNC const char*
1166uloc_getCurrentLanguageID(const char* oldID){
1167 int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1168 if (offset >= 0) {
1169 return REPLACEMENT_LANGUAGES[offset];
1170 }
1171 return oldID;
1172}
1173/*
1174 * the internal functions _getLanguage(), _getCountry(), _getVariant()
1175 * avoid duplicating code to handle the earlier locale ID pieces
1176 * in the functions for the later ones by
1177 * setting the *pEnd pointer to where they stopped parsing
1178 *
1179 * TODO try to use this in Locale
1180 */
1181U_CFUNC int32_t
1182ulocimp_getLanguage(const char *localeID,
1183 char *language, int32_t languageCapacity,
1184 const char **pEnd) {
1185 int32_t i=0;
1186 int32_t offset;
1187 char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1188
1189 if (uprv_stricmp(localeID, "root") == 0) {
1190 localeID += 4;
1191 } else if (uprv_strnicmp(localeID, "und", 3) == 0 &&
1192 (localeID[3] == '\0' ||
1193 localeID[3] == '-' ||
1194 localeID[3] == '_' ||
1195 localeID[3] == '@')) {
1196 localeID += 3;
1197 }
1198
1199 /* if it starts with i- or x- then copy that prefix */
1200 if(_isIDPrefix(localeID)) {
1201 if(i<languageCapacity) {
1202 language[i]=(char)uprv_tolower(*localeID);
1203 }
1204 if(i<languageCapacity) {
1205 language[i+1]='-';
1206 }
1207 i+=2;
1208 localeID+=2;
1209 }
1210
1211 /* copy the language as far as possible and count its length */
1212 while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1213 if(i<languageCapacity) {
1214 language[i]=(char)uprv_tolower(*localeID);
1215 }
1216 if(i<3) {
1217 U_ASSERT(i>=0);
1218 lang[i]=(char)uprv_tolower(*localeID);
1219 }
1220 i++;
1221 localeID++;
1222 }
1223
1224 if(i==3) {
1225 /* convert 3 character code to 2 character code if possible *CWB*/
1226 offset=_findIndex(LANGUAGES_3, lang);
1227 if(offset>=0) {
1228 i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1229 }
1230 }
1231
1232 if(pEnd!=NULL) {
1233 *pEnd=localeID;
1234 }
1235 return i;
1236}
1237
1238U_CFUNC int32_t
1239ulocimp_getScript(const char *localeID,
1240 char *script, int32_t scriptCapacity,
1241 const char **pEnd)
1242{
1243 int32_t idLen = 0;
1244
1245 if (pEnd != NULL) {
1246 *pEnd = localeID;
1247 }
1248
1249 /* copy the second item as far as possible and count its length */
1250 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1251 && uprv_isASCIILetter(localeID[idLen])) {
1252 idLen++;
1253 }
1254
1255 /* If it's exactly 4 characters long, then it's a script and not a country. */
1256 if (idLen == 4) {
1257 int32_t i;
1258 if (pEnd != NULL) {
1259 *pEnd = localeID+idLen;
1260 }
1261 if(idLen > scriptCapacity) {
1262 idLen = scriptCapacity;
1263 }
1264 if (idLen >= 1) {
1265 script[0]=(char)uprv_toupper(*(localeID++));
1266 }
1267 for (i = 1; i < idLen; i++) {
1268 script[i]=(char)uprv_tolower(*(localeID++));
1269 }
1270 }
1271 else {
1272 idLen = 0;
1273 }
1274 return idLen;
1275}
1276
1277U_CFUNC int32_t
1278ulocimp_getCountry(const char *localeID,
1279 char *country, int32_t countryCapacity,
1280 const char **pEnd)
1281{
1282 int32_t idLen=0;
1283 char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1284 int32_t offset;
1285
1286 /* copy the country as far as possible and count its length */
1287 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1288 if(idLen<(ULOC_COUNTRY_CAPACITY-1)) { /*CWB*/
1289 cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1290 }
1291 idLen++;
1292 }
1293
1294 /* the country should be either length 2 or 3 */
1295 if (idLen == 2 || idLen == 3) {
1296 UBool gotCountry = FALSE;
1297 /* convert 3 character code to 2 character code if possible *CWB*/
1298 if(idLen==3) {
1299 offset=_findIndex(COUNTRIES_3, cnty);
1300 if(offset>=0) {
1301 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1302 gotCountry = TRUE;
1303 }
1304 }
1305 if (!gotCountry) {
1306 int32_t i = 0;
1307 for (i = 0; i < idLen; i++) {
1308 if (i < countryCapacity) {
1309 country[i]=(char)uprv_toupper(localeID[i]);
1310 }
1311 }
1312 }
1313 localeID+=idLen;
1314 } else {
1315 idLen = 0;
1316 }
1317
1318 if(pEnd!=NULL) {
1319 *pEnd=localeID;
1320 }
1321
1322 return idLen;
1323}
1324
1325/**
1326 * @param needSeparator if true, then add leading '_' if any variants
1327 * are added to 'variant'
1328 */
1329static int32_t
1330_getVariantEx(const char *localeID,
1331 char prev,
1332 char *variant, int32_t variantCapacity,
1333 UBool needSeparator) {
1334 int32_t i=0;
1335
1336 /* get one or more variant tags and separate them with '_' */
1337 if(_isIDSeparator(prev)) {
1338 /* get a variant string after a '-' or '_' */
1339 while(!_isTerminator(*localeID)) {
1340 if (needSeparator) {
1341 if (i<variantCapacity) {
1342 variant[i] = '_';
1343 }
1344 ++i;
1345 needSeparator = FALSE;
1346 }
1347 if(i<variantCapacity) {
1348 variant[i]=(char)uprv_toupper(*localeID);
1349 if(variant[i]=='-') {
1350 variant[i]='_';
1351 }
1352 }
1353 i++;
1354 localeID++;
1355 }
1356 }
1357
1358 /* if there is no variant tag after a '-' or '_' then look for '@' */
1359 if(i==0) {
1360 if(prev=='@') {
1361 /* keep localeID */
1362 } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1363 ++localeID; /* point after the '@' */
1364 } else {
1365 return 0;
1366 }
1367 while(!_isTerminator(*localeID)) {
1368 if (needSeparator) {
1369 if (i<variantCapacity) {
1370 variant[i] = '_';
1371 }
1372 ++i;
1373 needSeparator = FALSE;
1374 }
1375 if(i<variantCapacity) {
1376 variant[i]=(char)uprv_toupper(*localeID);
1377 if(variant[i]=='-' || variant[i]==',') {
1378 variant[i]='_';
1379 }
1380 }
1381 i++;
1382 localeID++;
1383 }
1384 }
1385
1386 return i;
1387}
1388
1389static int32_t
1390_getVariant(const char *localeID,
1391 char prev,
1392 char *variant, int32_t variantCapacity) {
1393 return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1394}
1395
/* Keyword enumeration */

/* State behind a keyword UEnumeration. */
typedef struct UKeywordsContext {
    char *keywords; /* owned copy of the keyword list: NUL-separated names, ended by an empty string */
    char *current;  /* iteration cursor inside 'keywords' */
} UKeywordsContext;
1402
1403U_CDECL_BEGIN
1404
1405static void U_CALLCONV
1406uloc_kw_closeKeywords(UEnumeration *enumerator) {
1407 uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1408 uprv_free(enumerator->context);
1409 uprv_free(enumerator);
1410}
1411
1412static int32_t U_CALLCONV
1413uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1414 char *kw = ((UKeywordsContext *)en->context)->keywords;
1415 int32_t result = 0;
1416 while(*kw) {
1417 result++;
1418 kw += uprv_strlen(kw)+1;
1419 }
1420 return result;
1421}
1422
1423static const char * U_CALLCONV
1424uloc_kw_nextKeyword(UEnumeration* en,
1425 int32_t* resultLength,
1426 UErrorCode* /*status*/) {
1427 const char* result = ((UKeywordsContext *)en->context)->current;
1428 int32_t len = 0;
1429 if(*result) {
1430 len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1431 ((UKeywordsContext *)en->context)->current += len+1;
1432 } else {
1433 result = NULL;
1434 }
1435 if (resultLength) {
1436 *resultLength = len;
1437 }
1438 return result;
1439}
1440
1441static void U_CALLCONV
1442uloc_kw_resetKeywords(UEnumeration* en,
1443 UErrorCode* /*status*/) {
1444 ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1445}
1446
1447U_CDECL_END
1448
1449
1450static const UEnumeration gKeywordsEnum = {
1451 NULL,
1452 NULL,
1453 uloc_kw_closeKeywords,
1454 uloc_kw_countKeywords,
1455 uenum_unextDefault,
1456 uloc_kw_nextKeyword,
1457 uloc_kw_resetKeywords
1458};
1459
1460U_CAPI UEnumeration* U_EXPORT2
1461uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1462{
1463 LocalMemory<UKeywordsContext> myContext;
1464 LocalMemory<UEnumeration> result;
1465
1466 if (U_FAILURE(*status)) {
1467 return nullptr;
1468 }
1469 myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
1470 result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
1471 if (myContext.isNull() || result.isNull()) {
1472 *status = U_MEMORY_ALLOCATION_ERROR;
1473 return nullptr;
1474 }
1475 uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));
1476 myContext->keywords = static_cast<char *>(uprv_malloc(keywordListSize+1));
1477 if (myContext->keywords == nullptr) {
1478 *status = U_MEMORY_ALLOCATION_ERROR;
1479 return nullptr;
1480 }
1481 uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1482 myContext->keywords[keywordListSize] = 0;
1483 myContext->current = myContext->keywords;
1484 result->context = myContext.orphan();
1485 return result.orphan();
1486}
1487
1488U_CAPI UEnumeration* U_EXPORT2
1489uloc_openKeywords(const char* localeID,
1490 UErrorCode* status)
1491{
1492 int32_t i=0;
1493 char keywords[256];
1494 int32_t keywordsCapacity = 256;
1495 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1496 const char* tmpLocaleID;
1497
1498 if(status==NULL || U_FAILURE(*status)) {
1499 return 0;
1500 }
1501
1502 if (_hasBCP47Extension(localeID)) {
1503 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1504 } else {
1505 if (localeID==NULL) {
1506 localeID=uloc_getDefault();
1507 }
1508 tmpLocaleID=localeID;
1509 }
1510
1511 /* Skip the language */
1512 ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1513 if(_isIDSeparator(*tmpLocaleID)) {
1514 const char *scriptID;
1515 /* Skip the script if available */
1516 ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1517 if(scriptID != tmpLocaleID+1) {
1518 /* Found optional script */
1519 tmpLocaleID = scriptID;
1520 }
1521 /* Skip the Country */
1522 if (_isIDSeparator(*tmpLocaleID)) {
1523 ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1524 if(_isIDSeparator(*tmpLocaleID)) {
1525 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1526 }
1527 }
1528 }
1529
1530 /* keywords are located after '@' */
1531 if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1532 i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1533 }
1534
1535 if(i) {
1536 return uloc_openKeywordList(keywords, i, status);
1537 } else {
1538 return NULL;
1539 }
1540}
1541
1542
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* TRUE if any bit of 'mask' is set in 'options'. Both arguments are
 * parenthesized so that compound expressions such as (A | B) work. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The 9 characters of "i-default", deliberately without a terminating NUL. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1551
1552/**
1553 * Canonicalize the given localeID, to level 1 or to level 2,
1554 * depending on the options. To specify level 1, pass in options=0.
1555 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1556 *
1557 * This is the code underlying uloc_getName and uloc_canonicalize.
1558 */
1559static int32_t
1560_canonicalize(const char* localeID,
1561 char* result,
1562 int32_t resultCapacity,
1563 uint32_t options,
1564 UErrorCode* err) {
1565 int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1566 char localeBuffer[ULOC_FULLNAME_CAPACITY];
1567 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1568 const char* origLocaleID;
1569 const char* tmpLocaleID;
1570 const char* keywordAssign = NULL;
1571 const char* separatorIndicator = NULL;
1572 char* name;
1573 char* variant = NULL; /* pointer into name, or NULL */
1574
1575 if (U_FAILURE(*err)) {
1576 return 0;
1577 }
1578
1579 if (_hasBCP47Extension(localeID)) {
1580 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1581 } else {
1582 if (localeID==NULL) {
1583 localeID=uloc_getDefault();
1584 }
1585 tmpLocaleID=localeID;
1586 }
1587
1588 origLocaleID=tmpLocaleID;
1589
1590 /* if we are doing a full canonicalization, then put results in
1591 localeBuffer, if necessary; otherwise send them to result. */
1592 if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1593 (result == NULL || resultCapacity < (int32_t)sizeof(localeBuffer))) {
1594 name = localeBuffer;
1595 nameCapacity = (int32_t)sizeof(localeBuffer);
1596 } else {
1597 name = result;
1598 nameCapacity = resultCapacity;
1599 }
1600
1601 /* get all pieces, one after another, and separate with '_' */
1602 len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1603
1604 if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1605 const char *d = uloc_getDefault();
1606
1607 len = (int32_t)uprv_strlen(d);
1608
1609 if (name != NULL) {
1610 uprv_memcpy(name, d, len);
1611 }
1612 } else if(_isIDSeparator(*tmpLocaleID)) {
1613 const char *scriptID;
1614
1615 ++fieldCount;
1616 if(len<nameCapacity) {
1617 name[len]='_';
1618 }
1619 ++len;
1620
1621 scriptSize=ulocimp_getScript(tmpLocaleID+1,
1622 (len<nameCapacity ? name+len : NULL), nameCapacity-len, &scriptID);
1623 if(scriptSize > 0) {
1624 /* Found optional script */
1625 tmpLocaleID = scriptID;
1626 ++fieldCount;
1627 len+=scriptSize;
1628 if (_isIDSeparator(*tmpLocaleID)) {
1629 /* If there is something else, then we add the _ */
1630 if(len<nameCapacity) {
1631 name[len]='_';
1632 }
1633 ++len;
1634 }
1635 }
1636
1637 if (_isIDSeparator(*tmpLocaleID)) {
1638 const char *cntryID;
1639 int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1,
1640 (len<nameCapacity ? name+len : NULL), nameCapacity-len, &cntryID);
1641 if (cntrySize > 0) {
1642 /* Found optional country */
1643 tmpLocaleID = cntryID;
1644 len+=cntrySize;
1645 }
1646 if(_isIDSeparator(*tmpLocaleID)) {
1647 /* If there is something else, then we add the _ if we found country before. */
1648 if (cntrySize >= 0 && ! _isIDSeparator(*(tmpLocaleID+1)) ) {
1649 ++fieldCount;
1650 if(len<nameCapacity) {
1651 name[len]='_';
1652 }
1653 ++len;
1654 }
1655
1656 variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID,
1657 (len<nameCapacity ? name+len : NULL), nameCapacity-len);
1658 if (variantSize > 0) {
1659 variant = len<nameCapacity ? name+len : NULL;
1660 len += variantSize;
1661 tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1662 }
1663 }
1664 }
1665 }
1666
1667 /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1668 if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1669 UBool done = FALSE;
1670 do {
1671 char c = *tmpLocaleID;
1672 switch (c) {
1673 case 0:
1674 case '@':
1675 done = TRUE;
1676 break;
1677 default:
1678 if (len<nameCapacity) {
1679 name[len] = c;
1680 }
1681 ++len;
1682 ++tmpLocaleID;
1683 break;
1684 }
1685 } while (!done);
1686 }
1687
1688 /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1689 After this, tmpLocaleID either points to '@' or is NULL */
1690 if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1691 keywordAssign = uprv_strchr(tmpLocaleID, '=');
1692 separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1693 }
1694
1695 /* Copy POSIX-style variant, if any [mr@FOO] */
1696 if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1697 tmpLocaleID != NULL && keywordAssign == NULL) {
1698 for (;;) {
1699 char c = *tmpLocaleID;
1700 if (c == 0) {
1701 break;
1702 }
1703 if (len<nameCapacity) {
1704 name[len] = c;
1705 }
1706 ++len;
1707 ++tmpLocaleID;
1708 }
1709 }
1710
1711 if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1712 /* Handle @FOO variant if @ is present and not followed by = */
1713 if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1714 int32_t posixVariantSize;
1715 /* Add missing '_' if needed */
1716 if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1717 do {
1718 if(len<nameCapacity) {
1719 name[len]='_';
1720 }
1721 ++len;
1722 ++fieldCount;
1723 } while(fieldCount<2);
1724 }
1725 posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1726 (UBool)(variantSize > 0));
1727 if (posixVariantSize > 0) {
1728 if (variant == NULL) {
1729 variant = name+len;
1730 }
1731 len += posixVariantSize;
1732 variantSize += posixVariantSize;
1733 }
1734 }
1735
1736 /* Look up the ID in the canonicalization map */
1737 for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1738 const char* id = CANONICALIZE_MAP[j].id;
1739 int32_t n = (int32_t)uprv_strlen(id);
1740 if (len == n && uprv_strncmp(name, id, n) == 0) {
1741 if (n == 0 && tmpLocaleID != NULL) {
1742 break; /* Don't remap "" if keywords present */
1743 }
1744 len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1745 break;
1746 }
1747 }
1748 }
1749
1750 if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1751 if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1752 (!separatorIndicator || separatorIndicator > keywordAssign)) {
1753 if(len<nameCapacity) {
1754 name[len]='@';
1755 }
1756 ++len;
1757 ++fieldCount;
1758 len += _getKeywords(tmpLocaleID+1, '@', (len<nameCapacity ? name+len : NULL), nameCapacity-len,
1759 NULL, 0, NULL, TRUE, err);
1760 }
1761 }
1762
1763 if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1764 uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1765 }
1766
1767 return u_terminateChars(result, resultCapacity, len, err);
1768}
1769
1770/* ### ID parsing API **************************************************/
1771
1772U_CAPI int32_t U_EXPORT2
1773uloc_getParent(const char* localeID,
1774 char* parent,
1775 int32_t parentCapacity,
1776 UErrorCode* err)
1777{
1778 const char *lastUnderscore;
1779 int32_t i;
1780
1781 if (U_FAILURE(*err))
1782 return 0;
1783
1784 if (localeID == NULL)
1785 localeID = uloc_getDefault();
1786
1787 lastUnderscore=uprv_strrchr(localeID, '_');
1788 if(lastUnderscore!=NULL) {
1789 i=(int32_t)(lastUnderscore-localeID);
1790 } else {
1791 i=0;
1792 }
1793
1794 if (i > 0) {
1795 if (uprv_strnicmp(localeID, "und_", 4) == 0) {
1796 localeID += 3;
1797 i -= 3;
1798 uprv_memmove(parent, localeID, uprv_min(i, parentCapacity));
1799 } else if (parent != localeID) {
1800 uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1801 }
1802 }
1803
1804 return u_terminateChars(parent, parentCapacity, i, err);
1805}
1806
1807U_CAPI int32_t U_EXPORT2
1808uloc_getLanguage(const char* localeID,
1809 char* language,
1810 int32_t languageCapacity,
1811 UErrorCode* err)
1812{
1813 /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1814 int32_t i=0;
1815
1816 if (err==NULL || U_FAILURE(*err)) {
1817 return 0;
1818 }
1819
1820 if(localeID==NULL) {
1821 localeID=uloc_getDefault();
1822 }
1823
1824 i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1825 return u_terminateChars(language, languageCapacity, i, err);
1826}
1827
1828U_CAPI int32_t U_EXPORT2
1829uloc_getScript(const char* localeID,
1830 char* script,
1831 int32_t scriptCapacity,
1832 UErrorCode* err)
1833{
1834 int32_t i=0;
1835
1836 if(err==NULL || U_FAILURE(*err)) {
1837 return 0;
1838 }
1839
1840 if(localeID==NULL) {
1841 localeID=uloc_getDefault();
1842 }
1843
1844 /* skip the language */
1845 ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1846 if(_isIDSeparator(*localeID)) {
1847 i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1848 }
1849 return u_terminateChars(script, scriptCapacity, i, err);
1850}
1851
1852U_CAPI int32_t U_EXPORT2
1853uloc_getCountry(const char* localeID,
1854 char* country,
1855 int32_t countryCapacity,
1856 UErrorCode* err)
1857{
1858 int32_t i=0;
1859
1860 if(err==NULL || U_FAILURE(*err)) {
1861 return 0;
1862 }
1863
1864 if(localeID==NULL) {
1865 localeID=uloc_getDefault();
1866 }
1867
1868 /* Skip the language */
1869 ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1870 if(_isIDSeparator(*localeID)) {
1871 const char *scriptID;
1872 /* Skip the script if available */
1873 ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1874 if(scriptID != localeID+1) {
1875 /* Found optional script */
1876 localeID = scriptID;
1877 }
1878 if(_isIDSeparator(*localeID)) {
1879 i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1880 }
1881 }
1882 return u_terminateChars(country, countryCapacity, i, err);
1883}
1884
1885U_CAPI int32_t U_EXPORT2
1886uloc_getVariant(const char* localeID,
1887 char* variant,
1888 int32_t variantCapacity,
1889 UErrorCode* err)
1890{
1891 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1892 const char* tmpLocaleID;
1893 int32_t i=0;
1894
1895 if(err==NULL || U_FAILURE(*err)) {
1896 return 0;
1897 }
1898
1899 if (_hasBCP47Extension(localeID)) {
1900 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1901 } else {
1902 if (localeID==NULL) {
1903 localeID=uloc_getDefault();
1904 }
1905 tmpLocaleID=localeID;
1906 }
1907
1908 /* Skip the language */
1909 ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1910 if(_isIDSeparator(*tmpLocaleID)) {
1911 const char *scriptID;
1912 /* Skip the script if available */
1913 ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1914 if(scriptID != tmpLocaleID+1) {
1915 /* Found optional script */
1916 tmpLocaleID = scriptID;
1917 }
1918 /* Skip the Country */
1919 if (_isIDSeparator(*tmpLocaleID)) {
1920 const char *cntryID;
1921 ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
1922 if (cntryID != tmpLocaleID+1) {
1923 /* Found optional country */
1924 tmpLocaleID = cntryID;
1925 }
1926 if(_isIDSeparator(*tmpLocaleID)) {
1927 /* If there was no country ID, skip a possible extra IDSeparator */
1928 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
1929 tmpLocaleID++;
1930 }
1931 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
1932 }
1933 }
1934 }
1935
1936 /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
1937 /* if we do not have a variant tag yet then try a POSIX variant after '@' */
1938/*
1939 if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
1940 i=_getVariant(localeID+1, '@', variant, variantCapacity);
1941 }
1942*/
1943 return u_terminateChars(variant, variantCapacity, i, err);
1944}
1945
1946U_CAPI int32_t U_EXPORT2
1947uloc_getName(const char* localeID,
1948 char* name,
1949 int32_t nameCapacity,
1950 UErrorCode* err)
1951{
1952 return _canonicalize(localeID, name, nameCapacity, 0, err);
1953}
1954
1955U_CAPI int32_t U_EXPORT2
1956uloc_getBaseName(const char* localeID,
1957 char* name,
1958 int32_t nameCapacity,
1959 UErrorCode* err)
1960{
1961 return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
1962}
1963
1964U_CAPI int32_t U_EXPORT2
1965uloc_canonicalize(const char* localeID,
1966 char* name,
1967 int32_t nameCapacity,
1968 UErrorCode* err)
1969{
1970 return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
1971}
1972
1973U_CAPI const char* U_EXPORT2
1974uloc_getISO3Language(const char* localeID)
1975{
1976 int16_t offset;
1977 char lang[ULOC_LANG_CAPACITY];
1978 UErrorCode err = U_ZERO_ERROR;
1979
1980 if (localeID == NULL)
1981 {
1982 localeID = uloc_getDefault();
1983 }
1984 uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
1985 if (U_FAILURE(err))
1986 return "";
1987 offset = _findIndex(LANGUAGES, lang);
1988 if (offset < 0)
1989 return "";
1990 return LANGUAGES_3[offset];
1991}
1992
1993U_CAPI const char* U_EXPORT2
1994uloc_getISO3Country(const char* localeID)
1995{
1996 int16_t offset;
1997 char cntry[ULOC_LANG_CAPACITY];
1998 UErrorCode err = U_ZERO_ERROR;
1999
2000 if (localeID == NULL)
2001 {
2002 localeID = uloc_getDefault();
2003 }
2004 uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2005 if (U_FAILURE(err))
2006 return "";
2007 offset = _findIndex(COUNTRIES, cntry);
2008 if (offset < 0)
2009 return "";
2010
2011 return COUNTRIES_3[offset];
2012}
2013
2014U_CAPI uint32_t U_EXPORT2
2015uloc_getLCID(const char* localeID)
2016{
2017 UErrorCode status = U_ZERO_ERROR;
2018 char langID[ULOC_FULLNAME_CAPACITY];
2019 uint32_t lcid = 0;
2020
2021 /* Check for incomplete id. */
2022 if (!localeID || uprv_strlen(localeID) < 2) {
2023 return 0;
2024 }
2025
2026 // First, attempt Windows platform lookup if available, but fall
2027 // through to catch any special cases (ICU vs Windows name differences).
2028 lcid = uprv_convertToLCIDPlatform(localeID, &status);
2029 if (U_FAILURE(status)) {
2030 return 0;
2031 }
2032 if (lcid > 0) {
2033 // Windows found an LCID, return that
2034 return lcid;
2035 }
2036
2037 uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2038 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) {
2039 return 0;
2040 }
2041
2042 if (uprv_strchr(localeID, '@')) {
2043 // uprv_convertToLCID does not support keywords other than collation.
2044 // Remove all keywords except collation.
2045 int32_t len;
2046 char collVal[ULOC_KEYWORDS_CAPACITY];
2047 char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2048
2049 len = uloc_getKeywordValue(localeID, "collation", collVal,
2050 UPRV_LENGTHOF(collVal) - 1, &status);
2051
2052 if (U_SUCCESS(status) && len > 0) {
2053 collVal[len] = 0;
2054
2055 len = uloc_getBaseName(localeID, tmpLocaleID,
2056 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
2057
2058 if (U_SUCCESS(status) && len > 0) {
2059 tmpLocaleID[len] = 0;
2060
2061 len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
2062 UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
2063
2064 if (U_SUCCESS(status) && len > 0) {
2065 tmpLocaleID[len] = 0;
2066 return uprv_convertToLCID(langID, tmpLocaleID, &status);
2067 }
2068 }
2069 }
2070
2071 // fall through - all keywords are simply ignored
2072 status = U_ZERO_ERROR;
2073 }
2074
2075 return uprv_convertToLCID(langID, localeID, &status);
2076}
2077
2078U_CAPI int32_t U_EXPORT2
2079uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2080 UErrorCode *status)
2081{
2082 return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2083}
2084
2085/* ### Default locale **************************************************/
2086
2087U_CAPI const char* U_EXPORT2
2088uloc_getDefault()
2089{
2090 return locale_get_default();
2091}
2092
2093U_CAPI void U_EXPORT2
2094uloc_setDefault(const char* newDefaultLocale,
2095 UErrorCode* err)
2096{
2097 if (U_FAILURE(*err))
2098 return;
2099 /* the error code isn't currently used for anything by this function*/
2100
2101 /* propagate change to C++ */
2102 locale_set_default(newDefaultLocale);
2103}
2104
2105/**
2106 * Returns a list of all 2-letter language codes defined in ISO 639. This is a pointer
2107 * to an array of pointers to arrays of char. All of these pointers are owned
2108 * by ICU-- do not delete them, and do not write through them. The array is
2109 * terminated with a null pointer.
2110 */
2111U_CAPI const char* const* U_EXPORT2
2112uloc_getISOLanguages()
2113{
2114 return LANGUAGES;
2115}
2116
/**
 * Returns a list of all 2-letter country codes defined in ISO 3166. This is a
 * pointer to an array of pointers to arrays of char. All of these pointers are
 * owned by ICU -- do not delete them, and do not write through them. The array is
 * terminated with a null pointer.
 */
2123U_CAPI const char* const* U_EXPORT2
2124uloc_getISOCountries()
2125{
2126 return COUNTRIES;
2127}
2128
2129
2130/* this function to be moved into cstring.c later */
2131static char gDecimal = 0;
2132
2133static /* U_CAPI */
2134double
2135/* U_EXPORT2 */
2136_uloc_strtod(const char *start, char **end) {
2137 char *decimal;
2138 char *myEnd;
2139 char buf[30];
2140 double rv;
2141 if (!gDecimal) {
2142 char rep[5];
2143 /* For machines that decide to change the decimal on you,
2144 and try to be too smart with localization.
2145 This normally should be just a '.'. */
2146 sprintf(rep, "%+1.1f", 1.0);
2147 gDecimal = rep[2];
2148 }
2149
2150 if(gDecimal == '.') {
2151 return uprv_strtod(start, end); /* fall through to OS */
2152 } else {
2153 uprv_strncpy(buf, start, 29);
2154 buf[29]=0;
2155 decimal = uprv_strchr(buf, '.');
2156 if(decimal) {
2157 *decimal = gDecimal;
2158 } else {
2159 return uprv_strtod(start, end); /* no decimal point */
2160 }
2161 rv = uprv_strtod(buf, &myEnd);
2162 if(end) {
2163 *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
2164 }
2165 return rv;
2166 }
2167}
2168
2169typedef struct {
2170 float q;
2171 int32_t dummy; /* to avoid uninitialized memory copy from qsort */
2172 char locale[ULOC_FULLNAME_CAPACITY+1];
2173} _acceptLangItem;
2174
2175static int32_t U_CALLCONV
2176uloc_acceptLanguageCompare(const void * /*context*/, const void *a, const void *b)
2177{
2178 const _acceptLangItem *aa = (const _acceptLangItem*)a;
2179 const _acceptLangItem *bb = (const _acceptLangItem*)b;
2180
2181 int32_t rc = 0;
2182 if(bb->q < aa->q) {
2183 rc = -1; /* A > B */
2184 } else if(bb->q > aa->q) {
2185 rc = 1; /* A < B */
2186 } else {
2187 rc = 0; /* A = B */
2188 }
2189
2190 if(rc==0) {
2191 rc = uprv_stricmp(aa->locale, bb->locale);
2192 }
2193
2194#if defined(ULOC_DEBUG)
2195 /* fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2196 aa->locale, aa->q,
2197 bb->locale, bb->q,
2198 rc);*/
2199#endif
2200
2201 return rc;
2202}
2203
/*
Example of an HTTP Accept-Language header value (entries are not required to be in q order):

mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
*/
2207
2208U_CAPI int32_t U_EXPORT2
2209uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2210 const char *httpAcceptLanguage,
2211 UEnumeration* availableLocales,
2212 UErrorCode *status)
2213{
2214 MaybeStackArray<_acceptLangItem, 4> items; // Struct for collecting items.
2215 char tmp[ULOC_FULLNAME_CAPACITY +1];
2216 int32_t n = 0;
2217 const char *itemEnd;
2218 const char *paramEnd;
2219 const char *s;
2220 const char *t;
2221 int32_t res;
2222 int32_t i;
2223 int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2224
2225 if(U_FAILURE(*status)) {
2226 return -1;
2227 }
2228
2229 for(s=httpAcceptLanguage;s&&*s;) {
2230 while(isspace(*s)) /* eat space at the beginning */
2231 s++;
2232 itemEnd=uprv_strchr(s,',');
2233 paramEnd=uprv_strchr(s,';');
2234 if(!itemEnd) {
2235 itemEnd = httpAcceptLanguage+l; /* end of string */
2236 }
2237 if(paramEnd && paramEnd<itemEnd) {
2238 /* semicolon (;) is closer than end (,) */
2239 t = paramEnd+1;
2240 if(*t=='q') {
2241 t++;
2242 }
2243 while(isspace(*t)) {
2244 t++;
2245 }
2246 if(*t=='=') {
2247 t++;
2248 }
2249 while(isspace(*t)) {
2250 t++;
2251 }
2252 items[n].q = (float)_uloc_strtod(t,NULL);
2253 } else {
2254 /* no semicolon - it's 1.0 */
2255 items[n].q = 1.0f;
2256 paramEnd = itemEnd;
2257 }
2258 items[n].dummy=0;
2259 /* eat spaces prior to semi */
2260 for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2261 ;
2262 int32_t slen = static_cast<int32_t>(((t+1)-s));
2263 if(slen > ULOC_FULLNAME_CAPACITY) {
2264 *status = U_BUFFER_OVERFLOW_ERROR;
2265 return -1; // too big
2266 }
2267 uprv_strncpy(items[n].locale, s, slen);
2268 items[n].locale[slen]=0; // terminate
2269 int32_t clen = uloc_canonicalize(items[n].locale, tmp, UPRV_LENGTHOF(tmp)-1, status);
2270 if(U_FAILURE(*status)) return -1;
2271 if((clen!=slen) || (uprv_strncmp(items[n].locale, tmp, slen))) {
2272 // canonicalization had an effect- copy back
2273 uprv_strncpy(items[n].locale, tmp, clen);
2274 items[n].locale[clen] = 0; // terminate
2275 }
2276#if defined(ULOC_DEBUG)
2277 /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2278#endif
2279 n++;
2280 s = itemEnd;
2281 while(*s==',') { /* eat duplicate commas */
2282 s++;
2283 }
2284 if(n>=items.getCapacity()) { // If we need more items
2285 if(NULL == items.resize(items.getCapacity()*2, items.getCapacity())) {
2286 *status = U_MEMORY_ALLOCATION_ERROR;
2287 return -1;
2288 }
2289#if defined(ULOC_DEBUG)
2290 fprintf(stderr,"malloced at size %d\n", items.getCapacity());
2291#endif
2292 }
2293 }
2294 uprv_sortArray(items.getAlias(), n, sizeof(items[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2295 if (U_FAILURE(*status)) {
2296 return -1;
2297 }
2298 LocalMemory<const char*> strs(NULL);
2299 if (strs.allocateInsteadAndReset(n) == NULL) {
2300 *status = U_MEMORY_ALLOCATION_ERROR;
2301 return -1;
2302 }
2303 for(i=0;i<n;i++) {
2304#if defined(ULOC_DEBUG)
2305 /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2306#endif
2307 strs[i]=items[i].locale;
2308 }
2309 res = uloc_acceptLanguage(result, resultAvailable, outResult,
2310 strs.getAlias(), n, availableLocales, status);
2311 return res;
2312}
2313
2314
2315U_CAPI int32_t U_EXPORT2
2316uloc_acceptLanguage(char *result, int32_t resultAvailable,
2317 UAcceptResult *outResult, const char **acceptList,
2318 int32_t acceptListCount,
2319 UEnumeration* availableLocales,
2320 UErrorCode *status)
2321{
2322 int32_t i,j;
2323 int32_t len;
2324 int32_t maxLen=0;
2325 char tmp[ULOC_FULLNAME_CAPACITY+1];
2326 const char *l;
2327 char **fallbackList;
2328 if(U_FAILURE(*status)) {
2329 return -1;
2330 }
2331 fallbackList = static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount)));
2332 if(fallbackList==NULL) {
2333 *status = U_MEMORY_ALLOCATION_ERROR;
2334 return -1;
2335 }
2336 for(i=0;i<acceptListCount;i++) {
2337#if defined(ULOC_DEBUG)
2338 fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2339#endif
2340 while((l=uenum_next(availableLocales, NULL, status)) != NULL) {
2341#if defined(ULOC_DEBUG)
2342 fprintf(stderr," %s\n", l);
2343#endif
2344 len = (int32_t)uprv_strlen(l);
2345 if(!uprv_strcmp(acceptList[i], l)) {
2346 if(outResult) {
2347 *outResult = ULOC_ACCEPT_VALID;
2348 }
2349#if defined(ULOC_DEBUG)
2350 fprintf(stderr, "MATCH! %s\n", l);
2351#endif
2352 if(len>0) {
2353 uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2354 }
2355 for(j=0;j<i;j++) {
2356 uprv_free(fallbackList[j]);
2357 }
2358 uprv_free(fallbackList);
2359 return u_terminateChars(result, resultAvailable, len, status);
2360 }
2361 if(len>maxLen) {
2362 maxLen = len;
2363 }
2364 }
2365 uenum_reset(availableLocales, status);
2366 /* save off parent info */
2367 if(uloc_getParent(acceptList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2368 fallbackList[i] = uprv_strdup(tmp);
2369 } else {
2370 fallbackList[i]=0;
2371 }
2372 }
2373
2374 for(maxLen--;maxLen>0;maxLen--) {
2375 for(i=0;i<acceptListCount;i++) {
2376 if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2377#if defined(ULOC_DEBUG)
2378 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2379#endif
2380 while((l=uenum_next(availableLocales, NULL, status)) != NULL) {
2381#if defined(ULOC_DEBUG)
2382 fprintf(stderr," %s\n", l);
2383#endif
2384 len = (int32_t)uprv_strlen(l);
2385 if(!uprv_strcmp(fallbackList[i], l)) {
2386 if(outResult) {
2387 *outResult = ULOC_ACCEPT_FALLBACK;
2388 }
2389#if defined(ULOC_DEBUG)
2390 fprintf(stderr, "fallback MATCH! %s\n", l);
2391#endif
2392 if(len>0) {
2393 uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2394 }
2395 for(j=0;j<acceptListCount;j++) {
2396 uprv_free(fallbackList[j]);
2397 }
2398 uprv_free(fallbackList);
2399 return u_terminateChars(result, resultAvailable, len, status);
2400 }
2401 }
2402 uenum_reset(availableLocales, status);
2403
2404 if(uloc_getParent(fallbackList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2405 uprv_free(fallbackList[i]);
2406 fallbackList[i] = uprv_strdup(tmp);
2407 } else {
2408 uprv_free(fallbackList[i]);
2409 fallbackList[i]=0;
2410 }
2411 }
2412 }
2413 if(outResult) {
2414 *outResult = ULOC_ACCEPT_FAILED;
2415 }
2416 }
2417 for(i=0;i<acceptListCount;i++) {
2418 uprv_free(fallbackList[i]);
2419 }
2420 uprv_free(fallbackList);
2421 return -1;
2422}
2423
2424U_CAPI const char* U_EXPORT2
2425uloc_toUnicodeLocaleKey(const char* keyword)
2426{
2427 const char* bcpKey = ulocimp_toBcpKey(keyword);
2428 if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2429 // unknown keyword, but syntax is fine..
2430 return keyword;
2431 }
2432 return bcpKey;
2433}
2434
2435U_CAPI const char* U_EXPORT2
2436uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2437{
2438 const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2439 if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2440 // unknown keyword, but syntax is fine..
2441 return value;
2442 }
2443 return bcpType;
2444}
2445
2446static UBool
2447isWellFormedLegacyKey(const char* legacyKey)
2448{
2449 const char* p = legacyKey;
2450 while (*p) {
2451 if (!UPRV_ISALPHANUM(*p)) {
2452 return FALSE;
2453 }
2454 p++;
2455 }
2456 return TRUE;
2457}
2458
2459static UBool
2460isWellFormedLegacyType(const char* legacyType)
2461{
2462 const char* p = legacyType;
2463 int32_t alphaNumLen = 0;
2464 while (*p) {
2465 if (*p == '_' || *p == '/' || *p == '-') {
2466 if (alphaNumLen == 0) {
2467 return FALSE;
2468 }
2469 alphaNumLen = 0;
2470 } else if (UPRV_ISALPHANUM(*p)) {
2471 alphaNumLen++;
2472 } else {
2473 return FALSE;
2474 }
2475 p++;
2476 }
2477 return (alphaNumLen != 0);
2478}
2479
2480U_CAPI const char* U_EXPORT2
2481uloc_toLegacyKey(const char* keyword)
2482{
2483 const char* legacyKey = ulocimp_toLegacyKey(keyword);
2484 if (legacyKey == NULL) {
2485 // Checks if the specified locale key is well-formed with the legacy locale syntax.
2486 //
2487 // Note:
2488 // LDML/CLDR provides some definition of keyword syntax in
2489 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2490 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2491 // Keys can only consist of [0-9a-zA-Z].
2492 if (isWellFormedLegacyKey(keyword)) {
2493 return keyword;
2494 }
2495 }
2496 return legacyKey;
2497}
2498
2499U_CAPI const char* U_EXPORT2
2500uloc_toLegacyType(const char* keyword, const char* value)
2501{
2502 const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2503 if (legacyType == NULL) {
2504 // Checks if the specified locale type is well-formed with the legacy locale syntax.
2505 //
2506 // Note:
2507 // LDML/CLDR provides some definition of keyword syntax in
2508 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2509 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2510 // Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2511 // we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2512 if (isWellFormedLegacyType(value)) {
2513 return value;
2514 }
2515 }
2516 return legacyType;
2517}
2518
2519/*eof*/
2520