1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (C) 2009-2015, International Business Machines
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8*/
9
10#include "unicode/bytestream.h"
11#include "unicode/utypes.h"
12#include "unicode/ures.h"
13#include "unicode/localpointer.h"
14#include "unicode/putil.h"
15#include "unicode/uenum.h"
16#include "unicode/uloc.h"
17#include "ustr_imp.h"
18#include "bytesinkutil.h"
19#include "charstr.h"
20#include "cmemory.h"
21#include "cstring.h"
22#include "putilimp.h"
23#include "uinvchar.h"
24#include "ulocimp.h"
25#include "uassert.h"
26
27
/*
 * Node of a singly linked list of variant subtags.
 * `variant` is a borrowed pointer to the subtag text (the list code in this
 * file never frees it); `next` is the following node or nullptr at the tail.
 */
struct VariantListEntry {
    const char* variant;
    VariantListEntry* next;
};
33
34/* struct holding a single attribute value */
35struct AttributeListEntry : public icu::UMemory {
36 const char *attribute;
37 struct AttributeListEntry *next;
38};
39
40/* struct holding a single extension */
41struct ExtensionListEntry : public icu::UMemory {
42 const char *key;
43 const char *value;
44 struct ExtensionListEntry *next;
45};
46
47#define MAXEXTLANG 3
48typedef struct ULanguageTag {
49 char *buf; /* holding parsed subtags */
50 const char *language;
51 const char *extlang[MAXEXTLANG];
52 const char *script;
53 const char *region;
54 VariantListEntry *variants;
55 ExtensionListEntry *extensions;
56 const char *privateuse;
57 const char *legacy;
58} ULanguageTag;
59
/* NOTE(review): MINLEN is not referenced in the visible part of this file. */
#define MINLEN 2
/* Subtag separator of a BCP 47 language tag. */
#define SEP '-'
/* Private-use singleton; rejected by _isExtensionSingleton(). */
#define PRIVATEUSE 'x'
/* Singleton used by _addExtensionToList() to order multi-character keys against singletons. */
#define LDMLEXT 'u'

/* Separators of the ICU locale ID syntax. */
#define LOCALE_SEP '_'
#define LOCALE_EXT_SEP '@'
#define LOCALE_KEYWORD_SEP ';'
#define LOCALE_KEY_TYPE_SEP '='

/* Character class helpers. ISNUMERIC evaluates its argument twice. */
#define ISALPHA(c) uprv_isASCIILetter(c)
#define ISNUMERIC(c) ((c)>='0' && (c)<='9')

/* Shared string constants. */
static const char EMPTY[] = "";
static const char LANG_UND[] = "und";
static const char PRIVATEUSE_KEY[] = "x";
static const char _POSIX[] = "_POSIX";
static const char POSIX_KEY[] = "va";
static const char POSIX_VALUE[] = "posix";
static const char LOCALE_ATTRIBUTE_KEY[] = "attribute";
static const char PRIVUSE_VARIANT_PREFIX[] = "lvariant";
static const char LOCALE_TYPE_YES[] = "yes";

/* Length of LANG_UND without the terminating NUL. */
#define LANG_UND_LEN 3
84
85/*
86 Updated on 2018-09-12 from
87 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
88
89 This table has 2 parts. The part for
90 legacy language tags (marked as “Type: grandfathered” in BCP 47)
91 is generated by the following scripts from the IANA language tag registry.
92
93 curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
94 egrep -A 7 'Type: grandfathered' | \
95 egrep 'Tag|Prefe' | grep -B1 'Preferred' | grep -v '^--' | \
96 awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |\
97 tr 'A-Z' 'a-z'
98
99
 The 2nd part is made of four ICU-specific entries. They're kept for
 backward compatibility for now, even though they have no preferred
 values in the registry. They may have to be removed for strict BCP 47 compliance.
103
104*/
/*
 * Flat list of string pairs: each even index holds a legacy tag and the
 * following odd index holds its replacement.
 */
static const char* const LEGACY[] = {
/*  legacy          preferred */
    "art-lojban",   "jbo",
    "en-gb-oed",    "en-gb-oxendict",
    "i-ami",        "ami",
    "i-bnn",        "bnn",
    "i-hak",        "hak",
    "i-klingon",    "tlh",
    "i-lux",        "lb",
    "i-navajo",     "nv",
    "i-pwn",        "pwn",
    "i-tao",        "tao",
    "i-tay",        "tay",
    "i-tsu",        "tsu",
    "no-bok",       "nb",
    "no-nyn",       "nn",
    "sgn-be-fr",    "sfb",
    "sgn-be-nl",    "vgt",
    "sgn-ch-de",    "sgg",
    "zh-guoyu",     "cmn",
    "zh-hakka",     "hak",
    "zh-min-nan",   "nan",
    "zh-xiang",     "hsn",

    // Legacy tags with no preferred value in the IANA
    // registry. Kept for now for the backward compatibility
    // because ICU has mapped them this way.
    "i-default",    "en-x-i-default",
    "i-enochian",   "und-x-i-enochian",
    "i-mingo",      "see-x-i-mingo",
    "zh-min",       "nan-x-zh-min",
};
137
138/*
139 Updated on 2018-09-12 from
140 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
141
142 The table lists redundant tags with preferred value in the IANA language tag registry.
143 It's generated with the following command:
144
145 curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
146 grep 'Type: redundant' -A 5 | egrep '^(Tag:|Prefer)' | grep -B1 'Preferred' | \
147 awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' | \
148 tr 'A-Z' 'a-z'
149
 In addition, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 because
 the variant sequence 'hepburn-heploc' has the preferred value 'alalc97'.
152*/
153
/*
 * Flat list of string pairs: each even index holds a redundant tag and the
 * following odd index holds its preferred replacement.
 */
static const char* const REDUNDANT[] = {
//  redundant       preferred
    "sgn-br",       "bzs",
    "sgn-co",       "csn",
    "sgn-de",       "gsg",
    "sgn-dk",       "dsl",
    "sgn-es",       "ssp",
    "sgn-fr",       "fsl",
    "sgn-gb",       "bfi",
    "sgn-gr",       "gss",
    "sgn-ie",       "isg",
    "sgn-it",       "ise",
    "sgn-jp",       "jsl",
    "sgn-mx",       "mfs",
    "sgn-ni",       "ncs",
    "sgn-nl",       "dse",
    "sgn-no",       "nsl",
    "sgn-pt",       "psr",
    "sgn-se",       "swl",
    "sgn-us",       "ase",
    "sgn-za",       "sfs",
    "zh-cmn",       "cmn",
    "zh-cmn-hans",  "cmn-hans",
    "zh-cmn-hant",  "cmn-hant",
    "zh-gan",       "gan",
    "zh-wuu",       "wuu",
    "zh-yue",       "yue",

    // variant tag with preferred value
    "ja-latn-hepburn-heploc", "ja-latn-alalc97",
};
185
186/*
187 Updated on 2018-09-12 from
188 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
189
190 grep 'Type: language' -A 7 language-subtag-registry | egrep 'Subtag|Prefe' | \
191 grep -B1 'Preferred' | grep -v '^--' | \
192 awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
193
194 Make sure that 2-letter language subtags come before 3-letter subtags.
195*/
/*
 * Flat list of pairs: even index = deprecated language subtag, odd index =
 * its replacement. _appendLanguageToLanguageTag() walks it two entries at a
 * time and stops early at the first entry longer than its input, which is
 * why the 2-letter subtags must stay ahead of the 3-letter ones.
 */
static const char DEPRECATEDLANGS[][4] = {
/*  deprecated  new */
    "in",       "id",
    "iw",       "he",
    "ji",       "yi",
    "jw",       "jv",
    "mo",       "ro",
    "aam",      "aas",
    "adp",      "dz",
    "aue",      "ktz",
    "ayx",      "nun",
    "bgm",      "bcg",
    "bjd",      "drl",
    "ccq",      "rki",
    "cjr",      "mom",
    "cka",      "cmr",
    "cmk",      "xch",
    "coy",      "pij",
    "cqu",      "quh",
    "drh",      "khk",
    "drw",      "prs",
    "gav",      "dev",
    "gfx",      "vaj",
    "ggn",      "gvr",
    "gti",      "nyc",
    "guv",      "duz",
    "hrr",      "jal",
    "ibi",      "opa",
    "ilw",      "gal",
    "jeg",      "oyb",
    "kgc",      "tdf",
    "kgh",      "kml",
    "koj",      "kwv",
    "krm",      "bmf",
    "ktr",      "dtp",
    "kvs",      "gdj",
    "kwq",      "yam",
    "kxe",      "tvd",
    "kzj",      "dtp",
    "kzt",      "dtp",
    "lii",      "raq",
    "lmm",      "rmx",
    "meg",      "cir",
    "mst",      "mry",
    "mwj",      "vaj",
    "myt",      "mry",
    "nad",      "xny",
    "ncp",      "kdz",
    "nnx",      "ngv",
    "nts",      "pij",
    "oun",      "vaj",
    "pcr",      "adx",
    "pmc",      "huw",
    "pmu",      "phr",
    "ppa",      "bfy",
    "ppr",      "lcq",
    "pry",      "prt",
    "puz",      "pub",
    "sca",      "hle",
    "skk",      "oyb",
    "tdu",      "dtp",
    "thc",      "tpo",
    "thx",      "oyb",
    "tie",      "ras",
    "tkk",      "twm",
    "tlw",      "weo",
    "tmp",      "tyj",
    "tne",      "kak",
    "tnf",      "prs",
    "tsf",      "taj",
    "uok",      "ema",
    "xba",      "cax",
    "xia",      "acn",
    "xkh",      "waw",
    "xsj",      "suj",
    "ybd",      "rki",
    "yma",      "lrr",
    "ymt",      "mtm",
    "yos",      "zom",
    "yuu",      "yug",
};
277
278/*
279 Updated on 2018-04-24 from
280
281 curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | \
282 grep 'Type: region' -A 7 | egrep 'Subtag|Prefe' | \
283 grep -B1 'Preferred' | \
284 awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
285*/
/*
 * Flat list of pairs: even index = deprecated region subtag, odd index = its
 * replacement. _appendRegionToLanguageTag() scans it two entries at a time.
 */
static const char DEPRECATEDREGIONS[][3] = {
/*  deprecated  new */
    "BU",       "MM",
    "DD",       "DE",
    "FX",       "FR",
    "TP",       "TL",
    "YD",       "YE",
    "ZR",       "CD",
};
295
296/*
297* -------------------------------------------------
298*
299* These ultag_ functions may be exposed as APIs later
300*
301* -------------------------------------------------
302*/
303
/*
 * Forward declarations of the file-local ultag_* parser and accessors.
 * Their definitions are not part of this section of the file. The two
 * declarations wrapped in `#if 0` are compiled out.
 */

/* Parser entry point; the returned object is released with ultag_close(). */
static ULanguageTag*
ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status);

static void
ultag_close(ULanguageTag* langtag);

static const char*
ultag_getLanguage(const ULanguageTag* langtag);

#if 0
static const char*
ultag_getJDKLanguage(const ULanguageTag* langtag);
#endif

static const char*
ultag_getExtlang(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getExtlangSize(const ULanguageTag* langtag);

static const char*
ultag_getScript(const ULanguageTag* langtag);

static const char*
ultag_getRegion(const ULanguageTag* langtag);

static const char*
ultag_getVariant(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getVariantsSize(const ULanguageTag* langtag);

static const char*
ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx);

static const char*
ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getExtensionsSize(const ULanguageTag* langtag);

static const char*
ultag_getPrivateUse(const ULanguageTag* langtag);

#if 0
static const char*
ultag_getLegacy(const ULanguageTag* langtag);
#endif
352
U_NAMESPACE_BEGIN

/**
 * \class LocalULanguageTagPointer
 * "Smart pointer" class, closes a ULanguageTag via ultag_close().
 * For most methods see the LocalPointerBase base class.
 * The class itself is generated by the U_DEFINE_LOCAL_OPEN_POINTER macro.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @internal
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalULanguageTagPointer, ULanguageTag, ultag_close);

U_NAMESPACE_END
367
368/*
369* -------------------------------------------------
370*
371* Language subtag syntax validation functions
372*
373* -------------------------------------------------
374*/
375
376static UBool
377_isAlphaString(const char* s, int32_t len) {
378 int32_t i;
379 for (i = 0; i < len; i++) {
380 if (!ISALPHA(*(s + i))) {
381 return false;
382 }
383 }
384 return true;
385}
386
387static UBool
388_isNumericString(const char* s, int32_t len) {
389 int32_t i;
390 for (i = 0; i < len; i++) {
391 if (!ISNUMERIC(*(s + i))) {
392 return false;
393 }
394 }
395 return true;
396}
397
398static UBool
399_isAlphaNumericString(const char* s, int32_t len) {
400 int32_t i;
401 for (i = 0; i < len; i++) {
402 if (!ISALPHA(*(s + i)) && !ISNUMERIC(*(s + i))) {
403 return false;
404 }
405 }
406 return true;
407}
408
409static UBool
410_isAlphaNumericStringLimitedLength(const char* s, int32_t len, int32_t min, int32_t max) {
411 if (len < 0) {
412 len = (int32_t)uprv_strlen(s);
413 }
414 if (len >= min && len <= max && _isAlphaNumericString(s, len)) {
415 return true;
416 }
417 return false;
418}
419
420U_CFUNC UBool
421ultag_isLanguageSubtag(const char* s, int32_t len) {
422 /*
423 * unicode_language_subtag = alpha{2,3} | alpha{5,8};
424 * NOTE: Per ICUTC 2019/01/23- accepting alpha 4
425 * See ICU-20372
426 */
427 if (len < 0) {
428 len = (int32_t)uprv_strlen(s);
429 }
430 if (len >= 2 && len <= 8 && _isAlphaString(s, len)) {
431 return true;
432 }
433 return false;
434}
435
436static UBool
437_isExtlangSubtag(const char* s, int32_t len) {
438 /*
439 * extlang = 3ALPHA ; selected ISO 639 codes
440 * *2("-" 3ALPHA) ; permanently reserved
441 */
442 if (len < 0) {
443 len = (int32_t)uprv_strlen(s);
444 }
445 if (len == 3 && _isAlphaString(s, len)) {
446 return true;
447 }
448 return false;
449}
450
451U_CFUNC UBool
452ultag_isScriptSubtag(const char* s, int32_t len) {
453 /*
454 * script = 4ALPHA ; ISO 15924 code
455 */
456 if (len < 0) {
457 len = (int32_t)uprv_strlen(s);
458 }
459 if (len == 4 && _isAlphaString(s, len)) {
460 return true;
461 }
462 return false;
463}
464
465U_CFUNC UBool
466ultag_isRegionSubtag(const char* s, int32_t len) {
467 /*
468 * region = 2ALPHA ; ISO 3166-1 code
469 * / 3DIGIT ; UN M.49 code
470 */
471 if (len < 0) {
472 len = (int32_t)uprv_strlen(s);
473 }
474 if (len == 2 && _isAlphaString(s, len)) {
475 return true;
476 }
477 if (len == 3 && _isNumericString(s, len)) {
478 return true;
479 }
480 return false;
481}
482
483static UBool
484_isVariantSubtag(const char* s, int32_t len) {
485 /*
486 * variant = 5*8alphanum ; registered variants
487 * / (DIGIT 3alphanum)
488 */
489 if (len < 0) {
490 len = (int32_t)uprv_strlen(s);
491 }
492 if (_isAlphaNumericStringLimitedLength(s, len, 5, 8)) {
493 return true;
494 }
495 if (len == 4 && ISNUMERIC(*s) && _isAlphaNumericString(s + 1, 3)) {
496 return true;
497 }
498 return false;
499}
500
501static UBool
502_isSepListOf(UBool (*test)(const char*, int32_t), const char* s, int32_t len) {
503 const char *p = s;
504 const char *pSubtag = nullptr;
505
506 if (len < 0) {
507 len = (int32_t)uprv_strlen(s);
508 }
509
510 while ((p - s) < len) {
511 if (*p == SEP) {
512 if (pSubtag == nullptr) {
513 return false;
514 }
515 if (!test(pSubtag, (int32_t)(p - pSubtag))) {
516 return false;
517 }
518 pSubtag = nullptr;
519 } else if (pSubtag == nullptr) {
520 pSubtag = p;
521 }
522 p++;
523 }
524 if (pSubtag == nullptr) {
525 return false;
526 }
527 return test(pSubtag, (int32_t)(p - pSubtag));
528}
529
530U_CFUNC UBool
531ultag_isVariantSubtags(const char* s, int32_t len) {
532 return _isSepListOf(&_isVariantSubtag, s, len);
533}
534
535// This is for the ICU-specific "lvariant" handling.
536static UBool
537_isPrivateuseVariantSubtag(const char* s, int32_t len) {
538 /*
539 * variant = 1*8alphanum ; registered variants
540 * / (DIGIT 3alphanum)
541 */
542 return _isAlphaNumericStringLimitedLength(s, len , 1, 8);
543}
544
545static UBool
546_isExtensionSingleton(const char* s, int32_t len) {
547 /*
548 * extension = singleton 1*("-" (2*8alphanum))
549 *
550 * singleton = DIGIT ; 0 - 9
551 * / %x41-57 ; A - W
552 * / %x59-5A ; Y - Z
553 * / %x61-77 ; a - w
554 * / %x79-7A ; y - z
555 */
556 if (len < 0) {
557 len = (int32_t)uprv_strlen(s);
558 }
559 if (len == 1 && (ISALPHA(*s) || ISNUMERIC(*s)) && (uprv_tolower(*s) != PRIVATEUSE)) {
560 return true;
561 }
562 return false;
563}
564
565static UBool
566_isExtensionSubtag(const char* s, int32_t len) {
567 /*
568 * extension = singleton 1*("-" (2*8alphanum))
569 */
570 return _isAlphaNumericStringLimitedLength(s, len, 2, 8);
571}
572
573U_CFUNC UBool
574ultag_isExtensionSubtags(const char* s, int32_t len) {
575 return _isSepListOf(&_isExtensionSubtag, s, len);
576}
577
578static UBool
579_isPrivateuseValueSubtag(const char* s, int32_t len) {
580 /*
581 * privateuse = "x" 1*("-" (1*8alphanum))
582 */
583 return _isAlphaNumericStringLimitedLength(s, len, 1, 8);
584}
585
586U_CFUNC UBool
587ultag_isPrivateuseValueSubtags(const char* s, int32_t len) {
588 return _isSepListOf(&_isPrivateuseValueSubtag, s, len);
589}
590
591U_CFUNC UBool
592ultag_isUnicodeLocaleAttribute(const char* s, int32_t len) {
593 /*
594 * attribute = alphanum{3,8} ;
595 */
596 return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
597}
598
599U_CFUNC UBool
600ultag_isUnicodeLocaleAttributes(const char* s, int32_t len) {
601 return _isSepListOf(&ultag_isUnicodeLocaleAttribute, s, len);
602}
603
604U_CFUNC UBool
605ultag_isUnicodeLocaleKey(const char* s, int32_t len) {
606 /*
607 * key = alphanum alpha ;
608 */
609 if (len < 0) {
610 len = (int32_t)uprv_strlen(s);
611 }
612 if (len == 2 && (ISALPHA(*s) || ISNUMERIC(*s)) && ISALPHA(s[1])) {
613 return true;
614 }
615 return false;
616}
617
618U_CFUNC UBool
619_isUnicodeLocaleTypeSubtag(const char*s, int32_t len) {
620 /*
621 * alphanum{3,8}
622 */
623 return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
624}
625
626U_CFUNC UBool
627ultag_isUnicodeLocaleType(const char*s, int32_t len) {
628 /*
629 * type = alphanum{3,8} (sep alphanum{3,8})* ;
630 */
631 return _isSepListOf(&_isUnicodeLocaleTypeSubtag, s, len);
632}
633
634static UBool
635_isTKey(const char* s, int32_t len)
636{
637 /*
638 * tkey = alpha digit ;
639 */
640 if (len < 0) {
641 len = (int32_t)uprv_strlen(s);
642 }
643 if (len == 2 && ISALPHA(*s) && ISNUMERIC(*(s + 1))) {
644 return true;
645 }
646 return false;
647}
648
649U_CAPI const char * U_EXPORT2
650ultag_getTKeyStart(const char *localeID) {
651 const char *result = localeID;
652 const char *sep;
653 while((sep = uprv_strchr(result, SEP)) != nullptr) {
654 if (_isTKey(result, static_cast<int32_t>(sep - result))) {
655 return result;
656 }
657 result = ++sep;
658 }
659 if (_isTKey(result, -1)) {
660 return result;
661 }
662 return nullptr;
663}
664
665static UBool
666_isTValue(const char* s, int32_t len)
667{
668 /*
669 * tvalue = (sep alphanum{3,8})+ ;
670 */
671 return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
672}
673
674static UBool
675_isTransformedExtensionSubtag(int32_t& state, const char* s, int32_t len)
676{
677 const int32_t kStart = 0; // Start, wait for unicode_language_subtag, tkey or end
678 const int32_t kGotLanguage = 1; // Got unicode_language_subtag, wait for unicode_script_subtag,
679 // unicode_region_subtag, unicode_variant_subtag, tkey or end
680 const int32_t kGotScript = 2; // Got unicode_script_subtag, wait for unicode_region_subtag,
681 // unicode_variant_subtag, tkey, or end
682 const int32_t kGotRegion = 3; // Got unicode_region_subtag, wait for unicode_variant_subtag,
683 // tkey, or end.
684 const int32_t kGotVariant = 4; // Got unicode_variant_subtag, wait for unicode_variant_subtag
685 // tkey or end.
686 const int32_t kGotTKey = -1; // Got tkey, wait for tvalue. ERROR if stop here.
687 const int32_t kGotTValue = 6; // Got tvalue, wait for tkey, tvalue or end
688
689
690 if (len < 0) {
691 len = (int32_t)uprv_strlen(s);
692 }
693 switch (state) {
694 case kStart:
695 if (ultag_isLanguageSubtag(s, len) && len != 4) {
696 state = kGotLanguage;
697 return true;
698 }
699 if (_isTKey(s, len)) {
700 state = kGotTKey;
701 return true;
702 }
703 return false;
704 case kGotLanguage:
705 if (ultag_isScriptSubtag(s, len)) {
706 state = kGotScript;
707 return true;
708 }
709 U_FALLTHROUGH;
710 case kGotScript:
711 if (ultag_isRegionSubtag(s, len)) {
712 state = kGotRegion;
713 return true;
714 }
715 U_FALLTHROUGH;
716 case kGotRegion:
717 U_FALLTHROUGH;
718 case kGotVariant:
719 if (_isVariantSubtag(s, len)) {
720 state = kGotVariant;
721 return true;
722 }
723 if (_isTKey(s, len)) {
724 state = kGotTKey;
725 return true;
726 }
727 return false;
728 case kGotTKey:
729 if (_isTValue(s, len)) {
730 state = kGotTValue;
731 return true;
732 }
733 return false;
734 case kGotTValue:
735 if (_isTKey(s, len)) {
736 state = kGotTKey;
737 return true;
738 }
739 if (_isTValue(s, len)) {
740 return true;
741 }
742 return false;
743 }
744 return false;
745}
746
747static UBool
748_isUnicodeExtensionSubtag(int32_t& state, const char* s, int32_t len)
749{
750 const int32_t kStart = 0; // Start, wait for a key or attribute or end
751 const int32_t kGotKey = 1; // Got a key, wait for type or key or end
752 const int32_t kGotType = 2; // Got a type, wait for key or end
753
754 switch (state) {
755 case kStart:
756 if (ultag_isUnicodeLocaleKey(s, len)) {
757 state = kGotKey;
758 return true;
759 }
760 if (ultag_isUnicodeLocaleAttribute(s, len)) {
761 return true;
762 }
763 return false;
764 case kGotKey:
765 if (ultag_isUnicodeLocaleKey(s, len)) {
766 return true;
767 }
768 if (_isUnicodeLocaleTypeSubtag(s, len)) {
769 state = kGotType;
770 return true;
771 }
772 return false;
773 case kGotType:
774 if (ultag_isUnicodeLocaleKey(s, len)) {
775 state = kGotKey;
776 return true;
777 }
778 if (_isUnicodeLocaleTypeSubtag(s, len)) {
779 return true;
780 }
781 return false;
782 }
783 return false;
784}
785
786static UBool
787_isStatefulSepListOf(UBool (*test)(int32_t&, const char*, int32_t), const char* s, int32_t len)
788{
789 int32_t state = 0;
790 const char* p;
791 const char* start = s;
792 int32_t subtagLen = 0;
793
794 if (len < 0) {
795 len = (int32_t)uprv_strlen(s);
796 }
797
798 for (p = s; len > 0; p++, len--) {
799 if (*p == SEP) {
800 if (!test(state, start, subtagLen)) {
801 return false;
802 }
803 subtagLen = 0;
804 start = p + 1;
805 } else {
806 subtagLen++;
807 }
808 }
809
810 if (test(state, start, subtagLen) && state >= 0) {
811 return true;
812 }
813 return false;
814}
815
816U_CFUNC UBool
817ultag_isTransformedExtensionSubtags(const char* s, int32_t len)
818{
819 return _isStatefulSepListOf(&_isTransformedExtensionSubtag, s, len);
820}
821
822U_CFUNC UBool
823ultag_isUnicodeExtensionSubtags(const char* s, int32_t len) {
824 return _isStatefulSepListOf(&_isUnicodeExtensionSubtag, s, len);
825}
826
827
828/*
829* -------------------------------------------------
830*
831* Helper functions
832*
833* -------------------------------------------------
834*/
835
836static UBool
837_addVariantToList(VariantListEntry **first, VariantListEntry *var) {
838 UBool bAdded = true;
839
840 if (*first == nullptr) {
841 var->next = nullptr;
842 *first = var;
843 } else {
844 VariantListEntry *prev, *cur;
845 int32_t cmp;
846
847 /* variants order should be preserved */
848 prev = nullptr;
849 cur = *first;
850 while (true) {
851 if (cur == nullptr) {
852 prev->next = var;
853 var->next = nullptr;
854 break;
855 }
856
857 /* Checking for duplicate variant */
858 cmp = uprv_compareInvCharsAsAscii(var->variant, cur->variant);
859 if (cmp == 0) {
860 /* duplicated variant */
861 bAdded = false;
862 break;
863 }
864 prev = cur;
865 cur = cur->next;
866 }
867 }
868
869 return bAdded;
870}
871
872static UBool
873_addAttributeToList(AttributeListEntry **first, AttributeListEntry *attr) {
874 UBool bAdded = true;
875
876 if (*first == nullptr) {
877 attr->next = nullptr;
878 *first = attr;
879 } else {
880 AttributeListEntry *prev, *cur;
881 int32_t cmp;
882
883 /* reorder variants in alphabetical order */
884 prev = nullptr;
885 cur = *first;
886 while (true) {
887 if (cur == nullptr) {
888 prev->next = attr;
889 attr->next = nullptr;
890 break;
891 }
892 cmp = uprv_compareInvCharsAsAscii(attr->attribute, cur->attribute);
893 if (cmp < 0) {
894 if (prev == nullptr) {
895 *first = attr;
896 } else {
897 prev->next = attr;
898 }
899 attr->next = cur;
900 break;
901 }
902 if (cmp == 0) {
903 /* duplicated variant */
904 bAdded = false;
905 break;
906 }
907 prev = cur;
908 cur = cur->next;
909 }
910 }
911
912 return bAdded;
913}
914
915
916static UBool
917_addExtensionToList(ExtensionListEntry **first, ExtensionListEntry *ext, UBool localeToBCP) {
918 UBool bAdded = true;
919
920 if (*first == nullptr) {
921 ext->next = nullptr;
922 *first = ext;
923 } else {
924 ExtensionListEntry *prev, *cur;
925 int32_t cmp;
926
927 /* reorder variants in alphabetical order */
928 prev = nullptr;
929 cur = *first;
930 while (true) {
931 if (cur == nullptr) {
932 prev->next = ext;
933 ext->next = nullptr;
934 break;
935 }
936 if (localeToBCP) {
937 /* special handling for locale to bcp conversion */
938 int32_t len, curlen;
939
940 len = (int32_t)uprv_strlen(ext->key);
941 curlen = (int32_t)uprv_strlen(cur->key);
942
943 if (len == 1 && curlen == 1) {
944 if (*(ext->key) == *(cur->key)) {
945 cmp = 0;
946 } else if (*(ext->key) == PRIVATEUSE) {
947 cmp = 1;
948 } else if (*(cur->key) == PRIVATEUSE) {
949 cmp = -1;
950 } else {
951 cmp = *(ext->key) - *(cur->key);
952 }
953 } else if (len == 1) {
954 cmp = *(ext->key) - LDMLEXT;
955 } else if (curlen == 1) {
956 cmp = LDMLEXT - *(cur->key);
957 } else {
958 cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key);
959 /* Both are u extension keys - we need special handling for 'attribute' */
960 if (cmp != 0) {
961 if (uprv_strcmp(cur->key, LOCALE_ATTRIBUTE_KEY) == 0) {
962 cmp = 1;
963 } else if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) {
964 cmp = -1;
965 }
966 }
967 }
968 } else {
969 cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key);
970 }
971 if (cmp < 0) {
972 if (prev == nullptr) {
973 *first = ext;
974 } else {
975 prev->next = ext;
976 }
977 ext->next = cur;
978 break;
979 }
980 if (cmp == 0) {
981 /* duplicated extension key */
982 bAdded = false;
983 break;
984 }
985 prev = cur;
986 cur = cur->next;
987 }
988 }
989
990 return bAdded;
991}
992
993static void
994_initializeULanguageTag(ULanguageTag* langtag) {
995 int32_t i;
996
997 langtag->buf = nullptr;
998
999 langtag->language = EMPTY;
1000 for (i = 0; i < MAXEXTLANG; i++) {
1001 langtag->extlang[i] = nullptr;
1002 }
1003
1004 langtag->script = EMPTY;
1005 langtag->region = EMPTY;
1006
1007 langtag->variants = nullptr;
1008 langtag->extensions = nullptr;
1009
1010 langtag->legacy = EMPTY;
1011 langtag->privateuse = EMPTY;
1012}
1013
1014static void
1015_appendLanguageToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
1016 char buf[ULOC_LANG_CAPACITY];
1017 UErrorCode tmpStatus = U_ZERO_ERROR;
1018 int32_t len, i;
1019
1020 if (U_FAILURE(*status)) {
1021 return;
1022 }
1023
1024 len = uloc_getLanguage(localeID, buf, sizeof(buf), &tmpStatus);
1025 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1026 if (strict) {
1027 *status = U_ILLEGAL_ARGUMENT_ERROR;
1028 return;
1029 }
1030 len = 0;
1031 }
1032
1033 /* Note: returned language code is in lower case letters */
1034
1035 if (len == 0) {
1036 sink.Append(LANG_UND, LANG_UND_LEN);
1037 } else if (!ultag_isLanguageSubtag(buf, len)) {
1038 /* invalid language code */
1039 if (strict) {
1040 *status = U_ILLEGAL_ARGUMENT_ERROR;
1041 return;
1042 }
1043 sink.Append(LANG_UND, LANG_UND_LEN);
1044 } else {
1045 /* resolve deprecated */
1046 for (i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) {
1047 // 2-letter deprecated subtags are listede before 3-letter
1048 // ones in DEPRECATEDLANGS[]. Get out of loop on coming
1049 // across the 1st 3-letter subtag, if the input is a 2-letter code.
1050 // to avoid continuing to try when there's no match.
1051 if (uprv_strlen(buf) < uprv_strlen(DEPRECATEDLANGS[i])) break;
1052 if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDLANGS[i]) == 0) {
1053 uprv_strcpy(buf, DEPRECATEDLANGS[i + 1]);
1054 len = (int32_t)uprv_strlen(buf);
1055 break;
1056 }
1057 }
1058 sink.Append(buf, len);
1059 }
1060}
1061
1062static void
1063_appendScriptToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
1064 char buf[ULOC_SCRIPT_CAPACITY];
1065 UErrorCode tmpStatus = U_ZERO_ERROR;
1066 int32_t len;
1067
1068 if (U_FAILURE(*status)) {
1069 return;
1070 }
1071
1072 len = uloc_getScript(localeID, buf, sizeof(buf), &tmpStatus);
1073 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1074 if (strict) {
1075 *status = U_ILLEGAL_ARGUMENT_ERROR;
1076 }
1077 return;
1078 }
1079
1080 if (len > 0) {
1081 if (!ultag_isScriptSubtag(buf, len)) {
1082 /* invalid script code */
1083 if (strict) {
1084 *status = U_ILLEGAL_ARGUMENT_ERROR;
1085 }
1086 return;
1087 } else {
1088 sink.Append("-", 1);
1089 sink.Append(buf, len);
1090 }
1091 }
1092}
1093
1094static void
1095_appendRegionToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
1096 char buf[ULOC_COUNTRY_CAPACITY];
1097 UErrorCode tmpStatus = U_ZERO_ERROR;
1098 int32_t len;
1099
1100 if (U_FAILURE(*status)) {
1101 return;
1102 }
1103
1104 len = uloc_getCountry(localeID, buf, sizeof(buf), &tmpStatus);
1105 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1106 if (strict) {
1107 *status = U_ILLEGAL_ARGUMENT_ERROR;
1108 }
1109 return;
1110 }
1111
1112 if (len > 0) {
1113 if (!ultag_isRegionSubtag(buf, len)) {
1114 /* invalid region code */
1115 if (strict) {
1116 *status = U_ILLEGAL_ARGUMENT_ERROR;
1117 }
1118 return;
1119 } else {
1120 sink.Append("-", 1);
1121 /* resolve deprecated */
1122 for (int i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) {
1123 if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDREGIONS[i]) == 0) {
1124 uprv_strcpy(buf, DEPRECATEDREGIONS[i + 1]);
1125 len = (int32_t)uprv_strlen(buf);
1126 break;
1127 }
1128 }
1129 sink.Append(buf, len);
1130 }
1131 }
1132}
1133
1134static void _sortVariants(VariantListEntry* first) {
1135 for (VariantListEntry* var1 = first; var1 != nullptr; var1 = var1->next) {
1136 for (VariantListEntry* var2 = var1->next; var2 != nullptr; var2 = var2->next) {
1137 // Swap var1->variant and var2->variant.
1138 if (uprv_compareInvCharsAsAscii(var1->variant, var2->variant) > 0) {
1139 const char* temp = var1->variant;
1140 var1->variant = var2->variant;
1141 var2->variant = temp;
1142 }
1143 }
1144 }
1145}
1146
1147static void
1148_appendVariantsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool *hadPosix, UErrorCode* status) {
1149 char buf[ULOC_FULLNAME_CAPACITY];
1150 UErrorCode tmpStatus = U_ZERO_ERROR;
1151 int32_t len, i;
1152
1153 if (U_FAILURE(*status)) {
1154 return;
1155 }
1156
1157 len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
1158 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1159 if (strict) {
1160 *status = U_ILLEGAL_ARGUMENT_ERROR;
1161 }
1162 return;
1163 }
1164
1165 if (len > 0) {
1166 char *p, *pVar;
1167 UBool bNext = true;
1168 VariantListEntry *var;
1169 VariantListEntry *varFirst = nullptr;
1170
1171 pVar = nullptr;
1172 p = buf;
1173 while (bNext) {
1174 if (*p == SEP || *p == LOCALE_SEP || *p == 0) {
1175 if (*p == 0) {
1176 bNext = false;
1177 } else {
1178 *p = 0; /* terminate */
1179 }
1180 if (pVar == nullptr) {
1181 if (strict) {
1182 *status = U_ILLEGAL_ARGUMENT_ERROR;
1183 break;
1184 }
1185 /* ignore empty variant */
1186 } else {
1187 /* ICU uses upper case letters for variants, but
1188 the canonical format is lowercase in BCP47 */
1189 for (i = 0; *(pVar + i) != 0; i++) {
1190 *(pVar + i) = uprv_tolower(*(pVar + i));
1191 }
1192
1193 /* validate */
1194 if (_isVariantSubtag(pVar, -1)) {
1195 if (uprv_strcmp(pVar,POSIX_VALUE) || len != (int32_t)uprv_strlen(POSIX_VALUE)) {
1196 /* emit the variant to the list */
1197 var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
1198 if (var == nullptr) {
1199 *status = U_MEMORY_ALLOCATION_ERROR;
1200 break;
1201 }
1202 var->variant = pVar;
1203 if (!_addVariantToList(&varFirst, var)) {
1204 /* duplicated variant */
1205 uprv_free(var);
1206 if (strict) {
1207 *status = U_ILLEGAL_ARGUMENT_ERROR;
1208 break;
1209 }
1210 }
1211 } else {
1212 /* Special handling for POSIX variant, need to remember that we had it and then */
1213 /* treat it like an extension later. */
1214 *hadPosix = true;
1215 }
1216 } else if (strict) {
1217 *status = U_ILLEGAL_ARGUMENT_ERROR;
1218 break;
1219 } else if (_isPrivateuseValueSubtag(pVar, -1)) {
1220 /* Handle private use subtags separately */
1221 break;
1222 }
1223 }
1224 /* reset variant starting position */
1225 pVar = nullptr;
1226 } else if (pVar == nullptr) {
1227 pVar = p;
1228 }
1229 p++;
1230 }
1231
1232 if (U_SUCCESS(*status)) {
1233 if (varFirst != nullptr) {
1234 int32_t varLen;
1235
1236 /* per UTS35, we should sort the variants */
1237 _sortVariants(varFirst);
1238
1239 /* write out validated/normalized variants to the target */
1240 var = varFirst;
1241 while (var != nullptr) {
1242 sink.Append("-", 1);
1243 varLen = (int32_t)uprv_strlen(var->variant);
1244 sink.Append(var->variant, varLen);
1245 var = var->next;
1246 }
1247 }
1248 }
1249
1250 /* clean up */
1251 var = varFirst;
1252 while (var != nullptr) {
1253 VariantListEntry *tmpVar = var->next;
1254 uprv_free(var);
1255 var = tmpVar;
1256 }
1257
1258 if (U_FAILURE(*status)) {
1259 return;
1260 }
1261 }
1262}
1263
1264static void
1265_appendKeywordsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
1266 char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY] = { 0 };
1267 int32_t attrBufLength = 0;
1268
1269 icu::MemoryPool<AttributeListEntry> attrPool;
1270 icu::MemoryPool<ExtensionListEntry> extPool;
1271 icu::MemoryPool<icu::CharString> strPool;
1272
1273 icu::LocalUEnumerationPointer keywordEnum(uloc_openKeywords(localeID, status));
1274 if (U_FAILURE(*status) && !hadPosix) {
1275 return;
1276 }
1277 if (keywordEnum.isValid() || hadPosix) {
1278 /* reorder extensions */
1279 int32_t len;
1280 const char *key;
1281 ExtensionListEntry *firstExt = nullptr;
1282 ExtensionListEntry *ext;
1283 AttributeListEntry *firstAttr = nullptr;
1284 AttributeListEntry *attr;
1285 icu::MemoryPool<icu::CharString> extBufPool;
1286 const char *bcpKey=nullptr, *bcpValue=nullptr;
1287 UErrorCode tmpStatus = U_ZERO_ERROR;
1288 int32_t keylen;
1289 UBool isBcpUExt;
1290
1291 while (true) {
1292 key = uenum_next(keywordEnum.getAlias(), nullptr, status);
1293 if (key == nullptr) {
1294 break;
1295 }
1296
1297 icu::CharString buf;
1298 {
1299 icu::CharStringByteSink sink(&buf);
1300 ulocimp_getKeywordValue(localeID, key, sink, &tmpStatus);
1301 }
1302 len = buf.length();
1303
1304 if (U_FAILURE(tmpStatus)) {
1305 if (tmpStatus == U_MEMORY_ALLOCATION_ERROR) {
1306 *status = U_MEMORY_ALLOCATION_ERROR;
1307 break;
1308 }
1309 if (strict) {
1310 *status = U_ILLEGAL_ARGUMENT_ERROR;
1311 break;
1312 }
1313 /* ignore this keyword */
1314 tmpStatus = U_ZERO_ERROR;
1315 continue;
1316 }
1317
1318 keylen = (int32_t)uprv_strlen(key);
1319 isBcpUExt = (keylen > 1);
1320
1321 /* special keyword used for representing Unicode locale attributes */
1322 if (uprv_strcmp(key, LOCALE_ATTRIBUTE_KEY) == 0) {
1323 if (len > 0) {
1324 int32_t i = 0;
1325 while (true) {
1326 attrBufLength = 0;
1327 for (; i < len; i++) {
1328 if (buf[i] != '-') {
1329 attrBuf[attrBufLength++] = buf[i];
1330 } else {
1331 i++;
1332 break;
1333 }
1334 }
1335 if (attrBufLength > 0) {
1336 attrBuf[attrBufLength] = 0;
1337
1338 } else if (i >= len){
1339 break;
1340 }
1341
1342 /* create AttributeListEntry */
1343 attr = attrPool.create();
1344 if (attr == nullptr) {
1345 *status = U_MEMORY_ALLOCATION_ERROR;
1346 break;
1347 }
1348 icu::CharString* attrValue =
1349 strPool.create(attrBuf, attrBufLength, *status);
1350 if (attrValue == nullptr) {
1351 *status = U_MEMORY_ALLOCATION_ERROR;
1352 break;
1353 }
1354 if (U_FAILURE(*status)) {
1355 break;
1356 }
1357 attr->attribute = attrValue->data();
1358
1359 if (!_addAttributeToList(&firstAttr, attr)) {
1360 if (strict) {
1361 *status = U_ILLEGAL_ARGUMENT_ERROR;
1362 break;
1363 }
1364 }
1365 }
1366 /* for a place holder ExtensionListEntry */
1367 bcpKey = LOCALE_ATTRIBUTE_KEY;
1368 bcpValue = nullptr;
1369 }
1370 } else if (isBcpUExt) {
1371 bcpKey = uloc_toUnicodeLocaleKey(key);
1372 if (bcpKey == nullptr) {
1373 if (strict) {
1374 *status = U_ILLEGAL_ARGUMENT_ERROR;
1375 break;
1376 }
1377 continue;
1378 }
1379
1380 /* we've checked buf is null-terminated above */
1381 bcpValue = uloc_toUnicodeLocaleType(key, buf.data());
1382 if (bcpValue == nullptr) {
1383 if (strict) {
1384 *status = U_ILLEGAL_ARGUMENT_ERROR;
1385 break;
1386 }
1387 continue;
1388 }
1389 if (bcpValue == buf.data()) {
1390 /*
1391 When uloc_toUnicodeLocaleType(key, buf) returns the
1392 input value as is, the value is well-formed, but has
1393 no known mapping. This implementation normalizes the
1394 value to lower case
1395 */
1396 icu::CharString* extBuf = extBufPool.create(buf, tmpStatus);
1397
1398 if (extBuf == nullptr) {
1399 *status = U_MEMORY_ALLOCATION_ERROR;
1400 break;
1401 }
1402 if (U_FAILURE(tmpStatus)) {
1403 *status = tmpStatus;
1404 break;
1405 }
1406
1407 T_CString_toLowerCase(extBuf->data());
1408 bcpValue = extBuf->data();
1409 }
1410 } else {
1411 if (*key == PRIVATEUSE) {
1412 if (!ultag_isPrivateuseValueSubtags(buf.data(), len)) {
1413 if (strict) {
1414 *status = U_ILLEGAL_ARGUMENT_ERROR;
1415 break;
1416 }
1417 continue;
1418 }
1419 } else {
1420 if (!_isExtensionSingleton(key, keylen) || !ultag_isExtensionSubtags(buf.data(), len)) {
1421 if (strict) {
1422 *status = U_ILLEGAL_ARGUMENT_ERROR;
1423 break;
1424 }
1425 continue;
1426 }
1427 }
1428 bcpKey = key;
1429 icu::CharString* extBuf =
1430 extBufPool.create(buf.data(), len, tmpStatus);
1431 if (extBuf == nullptr) {
1432 *status = U_MEMORY_ALLOCATION_ERROR;
1433 break;
1434 }
1435 if (U_FAILURE(tmpStatus)) {
1436 *status = tmpStatus;
1437 break;
1438 }
1439 bcpValue = extBuf->data();
1440 }
1441
1442 /* create ExtensionListEntry */
1443 ext = extPool.create();
1444 if (ext == nullptr) {
1445 *status = U_MEMORY_ALLOCATION_ERROR;
1446 break;
1447 }
1448 ext->key = bcpKey;
1449 ext->value = bcpValue;
1450
1451 if (!_addExtensionToList(&firstExt, ext, true)) {
1452 if (strict) {
1453 *status = U_ILLEGAL_ARGUMENT_ERROR;
1454 break;
1455 }
1456 }
1457 }
1458
1459 /* Special handling for POSIX variant - add the keywords for POSIX */
1460 if (hadPosix) {
1461 /* create ExtensionListEntry for POSIX */
1462 ext = extPool.create();
1463 if (ext == nullptr) {
1464 *status = U_MEMORY_ALLOCATION_ERROR;
1465 return;
1466 }
1467 ext->key = POSIX_KEY;
1468 ext->value = POSIX_VALUE;
1469
1470 if (!_addExtensionToList(&firstExt, ext, true)) {
1471 // Silently ignore errors.
1472 }
1473 }
1474
1475 if (U_SUCCESS(*status) && (firstExt != nullptr || firstAttr != nullptr)) {
1476 UBool startLDMLExtension = false;
1477 for (ext = firstExt; ext; ext = ext->next) {
1478 if (!startLDMLExtension && uprv_strlen(ext->key) > 1) {
1479 /* first LDML u singlton extension */
1480 sink.Append("-u", 2);
1481 startLDMLExtension = true;
1482 }
1483
1484 /* write out the sorted BCP47 attributes, extensions and private use */
1485 if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) {
1486 /* write the value for the attributes */
1487 for (attr = firstAttr; attr; attr = attr->next) {
1488 sink.Append("-", 1);
1489 sink.Append(
1490 attr->attribute, static_cast<int32_t>(uprv_strlen(attr->attribute)));
1491 }
1492 } else {
1493 sink.Append("-", 1);
1494 sink.Append(ext->key, static_cast<int32_t>(uprv_strlen(ext->key)));
1495 if (uprv_strcmp(ext->value, "true") != 0 &&
1496 uprv_strcmp(ext->value, "yes") != 0) {
1497 sink.Append("-", 1);
1498 sink.Append(ext->value, static_cast<int32_t>(uprv_strlen(ext->value)));
1499 }
1500 }
1501 }
1502 }
1503 }
1504}
1505
1506/**
1507 * Append keywords parsed from LDML extension value
1508 * e.g. "u-ca-gregory-co-trad" -> {calendar = gregorian} {collation = traditional}
1509 * Note: char* buf is used for storing keywords
1510 */
1511static void
1512_appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendTo, icu::MemoryPool<ExtensionListEntry>& extPool, icu::MemoryPool<icu::CharString>& kwdBuf, UBool *posixVariant, UErrorCode *status) {
1513 const char *pTag; /* beginning of current subtag */
1514 const char *pKwds; /* beginning of key-type pairs */
1515 UBool variantExists = *posixVariant;
1516
1517 ExtensionListEntry *kwdFirst = nullptr; /* first LDML keyword */
1518 ExtensionListEntry *kwd, *nextKwd;
1519
1520 int32_t len;
1521
1522 /* Reset the posixVariant value */
1523 *posixVariant = false;
1524
1525 pTag = ldmlext;
1526 pKwds = nullptr;
1527
1528 {
1529 AttributeListEntry *attrFirst = nullptr; /* first attribute */
1530 AttributeListEntry *attr, *nextAttr;
1531
1532 char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
1533 int32_t attrBufIdx = 0;
1534
1535 icu::MemoryPool<AttributeListEntry> attrPool;
1536
1537 /* Iterate through u extension attributes */
1538 while (*pTag) {
1539 /* locate next separator char */
1540 for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++);
1541
1542 if (ultag_isUnicodeLocaleKey(pTag, len)) {
1543 pKwds = pTag;
1544 break;
1545 }
1546
1547 /* add this attribute to the list */
1548 attr = attrPool.create();
1549 if (attr == nullptr) {
1550 *status = U_MEMORY_ALLOCATION_ERROR;
1551 return;
1552 }
1553
1554 if (len < (int32_t)sizeof(attrBuf) - attrBufIdx) {
1555 uprv_memcpy(&attrBuf[attrBufIdx], pTag, len);
1556 attrBuf[attrBufIdx + len] = 0;
1557 attr->attribute = &attrBuf[attrBufIdx];
1558 attrBufIdx += (len + 1);
1559 } else {
1560 *status = U_ILLEGAL_ARGUMENT_ERROR;
1561 return;
1562 }
1563
1564 // duplicate attribute is ignored, causes no error.
1565 _addAttributeToList(&attrFirst, attr);
1566
1567 /* next tag */
1568 pTag += len;
1569 if (*pTag) {
1570 /* next to the separator */
1571 pTag++;
1572 }
1573 }
1574
1575 if (attrFirst) {
1576 /* emit attributes as an LDML keyword, e.g. attribute=attr1-attr2 */
1577
1578 kwd = extPool.create();
1579 if (kwd == nullptr) {
1580 *status = U_MEMORY_ALLOCATION_ERROR;
1581 return;
1582 }
1583
1584 icu::CharString* value = kwdBuf.create();
1585 if (value == nullptr) {
1586 *status = U_MEMORY_ALLOCATION_ERROR;
1587 return;
1588 }
1589
1590 /* attribute subtags sorted in alphabetical order as type */
1591 attr = attrFirst;
1592 while (attr != nullptr) {
1593 nextAttr = attr->next;
1594 if (attr != attrFirst) {
1595 value->append('-', *status);
1596 }
1597 value->append(attr->attribute, *status);
1598 attr = nextAttr;
1599 }
1600 if (U_FAILURE(*status)) {
1601 return;
1602 }
1603
1604 kwd->key = LOCALE_ATTRIBUTE_KEY;
1605 kwd->value = value->data();
1606
1607 if (!_addExtensionToList(&kwdFirst, kwd, false)) {
1608 *status = U_ILLEGAL_ARGUMENT_ERROR;
1609 return;
1610 }
1611 }
1612 }
1613
1614 if (pKwds) {
1615 const char *pBcpKey = nullptr; /* u extension key subtag */
1616 const char *pBcpType = nullptr; /* beginning of u extension type subtag(s) */
1617 int32_t bcpKeyLen = 0;
1618 int32_t bcpTypeLen = 0;
1619 UBool isDone = false;
1620
1621 pTag = pKwds;
1622 /* BCP47 representation of LDML key/type pairs */
1623 while (!isDone) {
1624 const char *pNextBcpKey = nullptr;
1625 int32_t nextBcpKeyLen = 0;
1626 UBool emitKeyword = false;
1627
1628 if (*pTag) {
1629 /* locate next separator char */
1630 for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++);
1631
1632 if (ultag_isUnicodeLocaleKey(pTag, len)) {
1633 if (pBcpKey) {
1634 emitKeyword = true;
1635 pNextBcpKey = pTag;
1636 nextBcpKeyLen = len;
1637 } else {
1638 pBcpKey = pTag;
1639 bcpKeyLen = len;
1640 }
1641 } else {
1642 U_ASSERT(pBcpKey != nullptr);
1643 /* within LDML type subtags */
1644 if (pBcpType) {
1645 bcpTypeLen += (len + 1);
1646 } else {
1647 pBcpType = pTag;
1648 bcpTypeLen = len;
1649 }
1650 }
1651
1652 /* next tag */
1653 pTag += len;
1654 if (*pTag) {
1655 /* next to the separator */
1656 pTag++;
1657 }
1658 } else {
1659 /* processing last one */
1660 emitKeyword = true;
1661 isDone = true;
1662 }
1663
1664 if (emitKeyword) {
1665 const char *pKey = nullptr; /* LDML key */
1666 const char *pType = nullptr; /* LDML type */
1667
1668 char bcpKeyBuf[3]; /* BCP key length is always 2 for now */
1669
1670 U_ASSERT(pBcpKey != nullptr);
1671
1672 if (bcpKeyLen >= (int32_t)sizeof(bcpKeyBuf)) {
1673 /* the BCP key is invalid */
1674 *status = U_ILLEGAL_ARGUMENT_ERROR;
1675 return;
1676 }
1677 U_ASSERT(bcpKeyLen <= 2);
1678
1679 uprv_strncpy(bcpKeyBuf, pBcpKey, bcpKeyLen);
1680 bcpKeyBuf[bcpKeyLen] = 0;
1681
1682 /* u extension key to LDML key */
1683 pKey = uloc_toLegacyKey(bcpKeyBuf);
1684 if (pKey == nullptr) {
1685 *status = U_ILLEGAL_ARGUMENT_ERROR;
1686 return;
1687 }
1688 if (pKey == bcpKeyBuf) {
1689 /*
1690 The key returned by toLegacyKey points to the input buffer.
1691 We normalize the result key to lower case.
1692 */
1693 T_CString_toLowerCase(bcpKeyBuf);
1694 icu::CharString* key = kwdBuf.create(bcpKeyBuf, bcpKeyLen, *status);
1695 if (key == nullptr) {
1696 *status = U_MEMORY_ALLOCATION_ERROR;
1697 return;
1698 }
1699 if (U_FAILURE(*status)) {
1700 return;
1701 }
1702 pKey = key->data();
1703 }
1704
1705 if (pBcpType) {
1706 char bcpTypeBuf[128]; /* practically long enough even considering multiple subtag type */
1707 if (bcpTypeLen >= (int32_t)sizeof(bcpTypeBuf)) {
1708 /* the BCP type is too long */
1709 *status = U_ILLEGAL_ARGUMENT_ERROR;
1710 return;
1711 }
1712
1713 uprv_strncpy(bcpTypeBuf, pBcpType, bcpTypeLen);
1714 bcpTypeBuf[bcpTypeLen] = 0;
1715
1716 /* BCP type to locale type */
1717 pType = uloc_toLegacyType(pKey, bcpTypeBuf);
1718 if (pType == nullptr) {
1719 *status = U_ILLEGAL_ARGUMENT_ERROR;
1720 return;
1721 }
1722 if (pType == bcpTypeBuf) {
1723 /*
1724 The type returned by toLegacyType points to the input buffer.
1725 We normalize the result type to lower case.
1726 */
1727 /* normalize to lower case */
1728 T_CString_toLowerCase(bcpTypeBuf);
1729 icu::CharString* type = kwdBuf.create(bcpTypeBuf, bcpTypeLen, *status);
1730 if (type == nullptr) {
1731 *status = U_MEMORY_ALLOCATION_ERROR;
1732 return;
1733 }
1734 if (U_FAILURE(*status)) {
1735 return;
1736 }
1737 pType = type->data();
1738 }
1739 } else {
1740 /* typeless - default type value is "yes" */
1741 pType = LOCALE_TYPE_YES;
1742 }
1743
1744 /* Special handling for u-va-posix, since we want to treat this as a variant,
1745 not as a keyword */
1746 if (!variantExists && !uprv_strcmp(pKey, POSIX_KEY) && !uprv_strcmp(pType, POSIX_VALUE) ) {
1747 *posixVariant = true;
1748 } else {
1749 /* create an ExtensionListEntry for this keyword */
1750 kwd = extPool.create();
1751 if (kwd == nullptr) {
1752 *status = U_MEMORY_ALLOCATION_ERROR;
1753 return;
1754 }
1755
1756 kwd->key = pKey;
1757 kwd->value = pType;
1758
1759 if (!_addExtensionToList(&kwdFirst, kwd, false)) {
1760 // duplicate keyword is allowed, Only the first
1761 // is honored.
1762 }
1763 }
1764
1765 pBcpKey = pNextBcpKey;
1766 bcpKeyLen = pNextBcpKey != nullptr ? nextBcpKeyLen : 0;
1767 pBcpType = nullptr;
1768 bcpTypeLen = 0;
1769 }
1770 }
1771 }
1772
1773 kwd = kwdFirst;
1774 while (kwd != nullptr) {
1775 nextKwd = kwd->next;
1776 _addExtensionToList(appendTo, kwd, false);
1777 kwd = nextKwd;
1778 }
1779}
1780
1781
1782static void
1783_appendKeywords(ULanguageTag* langtag, icu::ByteSink& sink, UErrorCode* status) {
1784 int32_t i, n;
1785 int32_t len;
1786 ExtensionListEntry *kwdFirst = nullptr;
1787 ExtensionListEntry *kwd;
1788 const char *key, *type;
1789 icu::MemoryPool<ExtensionListEntry> extPool;
1790 icu::MemoryPool<icu::CharString> kwdBuf;
1791 UBool posixVariant = false;
1792
1793 if (U_FAILURE(*status)) {
1794 return;
1795 }
1796
1797 n = ultag_getExtensionsSize(langtag);
1798
1799 /* resolve locale keywords and reordering keys */
1800 for (i = 0; i < n; i++) {
1801 key = ultag_getExtensionKey(langtag, i);
1802 type = ultag_getExtensionValue(langtag, i);
1803 if (*key == LDMLEXT) {
1804 /* Determine if variants already exists */
1805 if (ultag_getVariantsSize(langtag)) {
1806 posixVariant = true;
1807 }
1808
1809 _appendLDMLExtensionAsKeywords(type, &kwdFirst, extPool, kwdBuf, &posixVariant, status);
1810 if (U_FAILURE(*status)) {
1811 break;
1812 }
1813 } else {
1814 kwd = extPool.create();
1815 if (kwd == nullptr) {
1816 *status = U_MEMORY_ALLOCATION_ERROR;
1817 break;
1818 }
1819 kwd->key = key;
1820 kwd->value = type;
1821 if (!_addExtensionToList(&kwdFirst, kwd, false)) {
1822 *status = U_ILLEGAL_ARGUMENT_ERROR;
1823 break;
1824 }
1825 }
1826 }
1827
1828 if (U_SUCCESS(*status)) {
1829 type = ultag_getPrivateUse(langtag);
1830 if ((int32_t)uprv_strlen(type) > 0) {
1831 /* add private use as a keyword */
1832 kwd = extPool.create();
1833 if (kwd == nullptr) {
1834 *status = U_MEMORY_ALLOCATION_ERROR;
1835 } else {
1836 kwd->key = PRIVATEUSE_KEY;
1837 kwd->value = type;
1838 if (!_addExtensionToList(&kwdFirst, kwd, false)) {
1839 *status = U_ILLEGAL_ARGUMENT_ERROR;
1840 }
1841 }
1842 }
1843 }
1844
1845 /* If a POSIX variant was in the extensions, write it out before writing the keywords. */
1846
1847 if (U_SUCCESS(*status) && posixVariant) {
1848 len = (int32_t) uprv_strlen(_POSIX);
1849 sink.Append(_POSIX, len);
1850 }
1851
1852 if (U_SUCCESS(*status) && kwdFirst != nullptr) {
1853 /* write out the sorted keywords */
1854 UBool firstValue = true;
1855 kwd = kwdFirst;
1856 do {
1857 if (firstValue) {
1858 sink.Append("@", 1);
1859 firstValue = false;
1860 } else {
1861 sink.Append(";", 1);
1862 }
1863
1864 /* key */
1865 len = (int32_t)uprv_strlen(kwd->key);
1866 sink.Append(kwd->key, len);
1867 sink.Append("=", 1);
1868
1869 /* type */
1870 len = (int32_t)uprv_strlen(kwd->value);
1871 sink.Append(kwd->value, len);
1872
1873 kwd = kwd->next;
1874 } while (kwd);
1875 }
1876}
1877
1878static void
1879_appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
1880 (void)hadPosix;
1881 char buf[ULOC_FULLNAME_CAPACITY];
1882 char tmpAppend[ULOC_FULLNAME_CAPACITY];
1883 UErrorCode tmpStatus = U_ZERO_ERROR;
1884 int32_t len, i;
1885 int32_t reslen = 0;
1886 int32_t capacity = sizeof tmpAppend;
1887
1888 if (U_FAILURE(*status)) {
1889 return;
1890 }
1891
1892 len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
1893 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1894 if (strict) {
1895 *status = U_ILLEGAL_ARGUMENT_ERROR;
1896 }
1897 return;
1898 }
1899
1900 if (len > 0) {
1901 char *p, *pPriv;
1902 UBool bNext = true;
1903 UBool firstValue = true;
1904 UBool writeValue;
1905
1906 pPriv = nullptr;
1907 p = buf;
1908 while (bNext) {
1909 writeValue = false;
1910 if (*p == SEP || *p == LOCALE_SEP || *p == 0) {
1911 if (*p == 0) {
1912 bNext = false;
1913 } else {
1914 *p = 0; /* terminate */
1915 }
1916 if (pPriv != nullptr) {
1917 /* Private use in the canonical format is lowercase in BCP47 */
1918 for (i = 0; *(pPriv + i) != 0; i++) {
1919 *(pPriv + i) = uprv_tolower(*(pPriv + i));
1920 }
1921
1922 /* validate */
1923 if (_isPrivateuseValueSubtag(pPriv, -1)) {
1924 if (firstValue) {
1925 if (!_isVariantSubtag(pPriv, -1)) {
1926 writeValue = true;
1927 }
1928 } else {
1929 writeValue = true;
1930 }
1931 } else if (strict) {
1932 *status = U_ILLEGAL_ARGUMENT_ERROR;
1933 break;
1934 } else {
1935 break;
1936 }
1937
1938 if (writeValue) {
1939 if (reslen < capacity) {
1940 tmpAppend[reslen++] = SEP;
1941 }
1942
1943 if (firstValue) {
1944 if (reslen < capacity) {
1945 tmpAppend[reslen++] = *PRIVATEUSE_KEY;
1946 }
1947
1948 if (reslen < capacity) {
1949 tmpAppend[reslen++] = SEP;
1950 }
1951
1952 len = (int32_t)uprv_strlen(PRIVUSE_VARIANT_PREFIX);
1953 if (reslen < capacity) {
1954 uprv_memcpy(tmpAppend + reslen, PRIVUSE_VARIANT_PREFIX, uprv_min(len, capacity - reslen));
1955 }
1956 reslen += len;
1957
1958 if (reslen < capacity) {
1959 tmpAppend[reslen++] = SEP;
1960 }
1961
1962 firstValue = false;
1963 }
1964
1965 len = (int32_t)uprv_strlen(pPriv);
1966 if (reslen < capacity) {
1967 uprv_memcpy(tmpAppend + reslen, pPriv, uprv_min(len, capacity - reslen));
1968 }
1969 reslen += len;
1970 }
1971 }
1972 /* reset private use starting position */
1973 pPriv = nullptr;
1974 } else if (pPriv == nullptr) {
1975 pPriv = p;
1976 }
1977 p++;
1978 }
1979
1980 if (U_FAILURE(*status)) {
1981 return;
1982 }
1983 }
1984
1985 if (U_SUCCESS(*status)) {
1986 len = reslen;
1987 sink.Append(tmpAppend, len);
1988 }
1989}
1990
1991/*
1992* -------------------------------------------------
1993*
1994* ultag_ functions
1995*
1996* -------------------------------------------------
1997*/
1998
/*
 * Bit flags used by the parser. The parser keeps a mask of these values
 * naming the kinds of subtag that are acceptable at the current position.
 */
#define LANG 0x0001     /* language subtag */
#define EXTL 0x0002     /* extlang subtag */
#define SCRT 0x0004     /* script subtag */
#define REGN 0x0008     /* region subtag */
#define VART 0x0010     /* variant subtag */
#define EXTS 0x0020     /* extension singleton */
#define EXTV 0x0040     /* extension value subtag */
#define PRIV 0x0080     /* private use singleton */
2008
2009/**
2010 * Ticket #12705 - The optimizer in Visual Studio 2015 Update 3 has problems optimizing this function.
2011 * As a work-around, optimization is disabled for this function on VS2015 and VS2017.
2012 * This work-around should be removed once the following versions of Visual Studio are no
2013 * longer supported: All versions of VS2015/VS2017, and versions of VS2019 below 16.4.
2014 */
2015#if defined(_MSC_VER) && (_MSC_VER >= 1900) && (_MSC_VER < 1924)
2016#pragma optimize( "", off )
2017#endif
2018
2019static ULanguageTag*
2020ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status) {
2021 char *tagBuf;
2022 int16_t next;
2023 char *pSubtag, *pNext, *pLastGoodPosition;
2024 int32_t subtagLen;
2025 int32_t extlangIdx;
2026 ExtensionListEntry *pExtension;
2027 char *pExtValueSubtag, *pExtValueSubtagEnd;
2028 int32_t i;
2029 UBool privateuseVar = false;
2030 int32_t legacyLen = 0;
2031
2032 if (parsedLen != nullptr) {
2033 *parsedLen = 0;
2034 }
2035
2036 if (U_FAILURE(*status)) {
2037 return nullptr;
2038 }
2039
2040 if (tagLen < 0) {
2041 tagLen = (int32_t)uprv_strlen(tag);
2042 }
2043
2044 /* copy the entire string */
2045 tagBuf = (char*)uprv_malloc(tagLen + 1);
2046 if (tagBuf == nullptr) {
2047 *status = U_MEMORY_ALLOCATION_ERROR;
2048 return nullptr;
2049 }
2050
2051 if (tagLen > 0) {
2052 uprv_memcpy(tagBuf, tag, tagLen);
2053 }
2054 *(tagBuf + tagLen) = 0;
2055
2056 /* create a ULanguageTag */
2057 icu::LocalULanguageTagPointer t(
2058 (ULanguageTag*)uprv_malloc(sizeof(ULanguageTag)));
2059 if (t.isNull()) {
2060 uprv_free(tagBuf);
2061 *status = U_MEMORY_ALLOCATION_ERROR;
2062 return nullptr;
2063 }
2064 _initializeULanguageTag(t.getAlias());
2065 t->buf = tagBuf;
2066
2067 if (tagLen < MINLEN) {
2068 /* the input tag is too short - return empty ULanguageTag */
2069 return t.orphan();
2070 }
2071
2072 size_t parsedLenDelta = 0;
2073 // Legacy tag will be consider together. Legacy tag with intervening
2074 // script and region such as art-DE-lojban or art-Latn-lojban won't be
2075 // matched.
2076 /* check if the tag is legacy */
2077 for (i = 0; i < UPRV_LENGTHOF(LEGACY); i += 2) {
2078 int32_t checkLegacyLen = static_cast<int32_t>(uprv_strlen(LEGACY[i]));
2079 if (tagLen < checkLegacyLen) {
2080 continue;
2081 }
2082 if (tagLen > checkLegacyLen && tagBuf[checkLegacyLen] != '-') {
2083 // make sure next char is '-'.
2084 continue;
2085 }
2086 if (uprv_strnicmp(LEGACY[i], tagBuf, checkLegacyLen) == 0) {
2087 int32_t newTagLength;
2088
2089 legacyLen = checkLegacyLen; /* back up for output parsedLen */
2090 int32_t replacementLen = static_cast<int32_t>(uprv_strlen(LEGACY[i+1]));
2091 newTagLength = replacementLen + tagLen - checkLegacyLen;
2092 int32_t oldTagLength = tagLen;
2093 if (tagLen < newTagLength) {
2094 uprv_free(tagBuf);
2095 tagBuf = (char*)uprv_malloc(newTagLength + 1);
2096 if (tagBuf == nullptr) {
2097 *status = U_MEMORY_ALLOCATION_ERROR;
2098 return nullptr;
2099 }
2100 t->buf = tagBuf;
2101 tagLen = newTagLength;
2102 }
2103 parsedLenDelta = checkLegacyLen - replacementLen;
2104 uprv_strcpy(t->buf, LEGACY[i + 1]);
2105 if (checkLegacyLen != tagLen) {
2106 uprv_memcpy(t->buf + replacementLen, tag + checkLegacyLen,
2107 oldTagLength - checkLegacyLen);
2108 // NUL-terminate after memcpy().
2109 t->buf[replacementLen + oldTagLength - checkLegacyLen] = 0;
2110 }
2111 break;
2112 }
2113 }
2114
2115 if (legacyLen == 0) {
2116 for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) {
2117 const char* redundantTag = REDUNDANT[i];
2118 size_t redundantTagLen = uprv_strlen(redundantTag);
2119 // The preferred tag for a redundant tag is always shorter than redundant
2120 // tag. A redundant tag may or may not be followed by other subtags.
2121 // (i.e. "zh-yue" or "zh-yue-u-co-pinyin").
2122 if (uprv_strnicmp(redundantTag, tagBuf, static_cast<uint32_t>(redundantTagLen)) == 0) {
2123 const char* redundantTagEnd = tagBuf + redundantTagLen;
2124 if (*redundantTagEnd == '\0' || *redundantTagEnd == SEP) {
2125 const char* preferredTag = REDUNDANT[i + 1];
2126 size_t preferredTagLen = uprv_strlen(preferredTag);
2127 uprv_memcpy(t->buf, preferredTag, preferredTagLen);
2128 if (*redundantTagEnd == SEP) {
2129 uprv_memmove(tagBuf + preferredTagLen,
2130 redundantTagEnd,
2131 tagLen - redundantTagLen + 1);
2132 } else {
2133 tagBuf[preferredTagLen] = '\0';
2134 }
2135 // parsedLen should be the length of the input
2136 // before redundantTag is replaced by preferredTag.
2137 // Save the delta to add it back later.
2138 parsedLenDelta = redundantTagLen - preferredTagLen;
2139 break;
2140 }
2141 }
2142 }
2143 }
2144
2145 /*
2146 * langtag = language
2147 * ["-" script]
2148 * ["-" region]
2149 * *("-" variant)
2150 * *("-" extension)
2151 * ["-" privateuse]
2152 */
2153
2154 next = LANG | PRIV;
2155 pNext = pLastGoodPosition = tagBuf;
2156 extlangIdx = 0;
2157 pExtension = nullptr;
2158 pExtValueSubtag = nullptr;
2159 pExtValueSubtagEnd = nullptr;
2160
2161 while (pNext) {
2162 char *pSep;
2163
2164 pSubtag = pNext;
2165
2166 /* locate next separator char */
2167 pSep = pSubtag;
2168 while (*pSep) {
2169 if (*pSep == SEP) {
2170 break;
2171 }
2172 pSep++;
2173 }
2174 if (*pSep == 0) {
2175 /* last subtag */
2176 pNext = nullptr;
2177 } else {
2178 pNext = pSep + 1;
2179 }
2180 subtagLen = (int32_t)(pSep - pSubtag);
2181
2182 if (next & LANG) {
2183 if (ultag_isLanguageSubtag(pSubtag, subtagLen)) {
2184 *pSep = 0; /* terminate */
2185 // TODO: move deprecated language code handling here.
2186 t->language = T_CString_toLowerCase(pSubtag);
2187
2188 pLastGoodPosition = pSep;
2189 next = SCRT | REGN | VART | EXTS | PRIV;
2190 if (subtagLen <= 3)
2191 next |= EXTL;
2192 continue;
2193 }
2194 }
2195 if (next & EXTL) {
2196 if (_isExtlangSubtag(pSubtag, subtagLen)) {
2197 *pSep = 0;
2198 t->extlang[extlangIdx++] = T_CString_toLowerCase(pSubtag);
2199
2200 pLastGoodPosition = pSep;
2201 if (extlangIdx < 3) {
2202 next = EXTL | SCRT | REGN | VART | EXTS | PRIV;
2203 } else {
2204 next = SCRT | REGN | VART | EXTS | PRIV;
2205 }
2206 continue;
2207 }
2208 }
2209 if (next & SCRT) {
2210 if (ultag_isScriptSubtag(pSubtag, subtagLen)) {
2211 char *p = pSubtag;
2212
2213 *pSep = 0;
2214
2215 /* to title case */
2216 *p = uprv_toupper(*p);
2217 p++;
2218 for (; *p; p++) {
2219 *p = uprv_tolower(*p);
2220 }
2221
2222 t->script = pSubtag;
2223
2224 pLastGoodPosition = pSep;
2225 next = REGN | VART | EXTS | PRIV;
2226 continue;
2227 }
2228 }
2229 if (next & REGN) {
2230 if (ultag_isRegionSubtag(pSubtag, subtagLen)) {
2231 *pSep = 0;
2232 // TODO: move deprecated region code handling here.
2233 t->region = T_CString_toUpperCase(pSubtag);
2234
2235 pLastGoodPosition = pSep;
2236 next = VART | EXTS | PRIV;
2237 continue;
2238 }
2239 }
2240 if (next & VART) {
2241 if (_isVariantSubtag(pSubtag, subtagLen) ||
2242 (privateuseVar && _isPrivateuseVariantSubtag(pSubtag, subtagLen))) {
2243 VariantListEntry *var;
2244 UBool isAdded;
2245
2246 var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
2247 if (var == nullptr) {
2248 *status = U_MEMORY_ALLOCATION_ERROR;
2249 return nullptr;
2250 }
2251 *pSep = 0;
2252 var->variant = T_CString_toUpperCase(pSubtag);
2253 isAdded = _addVariantToList(&(t->variants), var);
2254 if (!isAdded) {
2255 /* duplicated variant entry */
2256 uprv_free(var);
2257 break;
2258 }
2259 pLastGoodPosition = pSep;
2260 next = VART | EXTS | PRIV;
2261 continue;
2262 }
2263 }
2264 if (next & EXTS) {
2265 if (_isExtensionSingleton(pSubtag, subtagLen)) {
2266 if (pExtension != nullptr) {
2267 if (pExtValueSubtag == nullptr || pExtValueSubtagEnd == nullptr) {
2268 /* the previous extension is incomplete */
2269 uprv_free(pExtension);
2270 pExtension = nullptr;
2271 break;
2272 }
2273
2274 /* terminate the previous extension value */
2275 *pExtValueSubtagEnd = 0;
2276 pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2277
2278 /* insert the extension to the list */
2279 if (_addExtensionToList(&(t->extensions), pExtension, false)) {
2280 pLastGoodPosition = pExtValueSubtagEnd;
2281 } else {
2282 /* stop parsing here */
2283 uprv_free(pExtension);
2284 pExtension = nullptr;
2285 break;
2286 }
2287 }
2288
2289 /* create a new extension */
2290 pExtension = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry));
2291 if (pExtension == nullptr) {
2292 *status = U_MEMORY_ALLOCATION_ERROR;
2293 return nullptr;
2294 }
2295 *pSep = 0;
2296 pExtension->key = T_CString_toLowerCase(pSubtag);
2297 pExtension->value = nullptr; /* will be set later */
2298
2299 /*
2300 * reset the start and the end location of extension value
2301 * subtags for this extension
2302 */
2303 pExtValueSubtag = nullptr;
2304 pExtValueSubtagEnd = nullptr;
2305
2306 next = EXTV;
2307 continue;
2308 }
2309 }
2310 if (next & EXTV) {
2311 if (_isExtensionSubtag(pSubtag, subtagLen)) {
2312 if (pExtValueSubtag == nullptr) {
2313 /* if the start position of this extension's value is not yet,
2314 this one is the first value subtag */
2315 pExtValueSubtag = pSubtag;
2316 }
2317
2318 /* Mark the end of this subtag */
2319 pExtValueSubtagEnd = pSep;
2320 next = EXTS | EXTV | PRIV;
2321
2322 continue;
2323 }
2324 }
2325 if (next & PRIV) {
2326 if (uprv_tolower(*pSubtag) == PRIVATEUSE && subtagLen == 1) {
2327 char *pPrivuseVal;
2328
2329 if (pExtension != nullptr) {
2330 /* Process the last extension */
2331 if (pExtValueSubtag == nullptr || pExtValueSubtagEnd == nullptr) {
2332 /* the previous extension is incomplete */
2333 uprv_free(pExtension);
2334 pExtension = nullptr;
2335 break;
2336 } else {
2337 /* terminate the previous extension value */
2338 *pExtValueSubtagEnd = 0;
2339 pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2340
2341 /* insert the extension to the list */
2342 if (_addExtensionToList(&(t->extensions), pExtension, false)) {
2343 pLastGoodPosition = pExtValueSubtagEnd;
2344 pExtension = nullptr;
2345 } else {
2346 /* stop parsing here */
2347 uprv_free(pExtension);
2348 pExtension = nullptr;
2349 break;
2350 }
2351 }
2352 }
2353
2354 /* The rest of part will be private use value subtags */
2355 if (pNext == nullptr) {
2356 /* empty private use subtag */
2357 break;
2358 }
2359 /* back up the private use value start position */
2360 pPrivuseVal = pNext;
2361
2362 /* validate private use value subtags */
2363 while (pNext) {
2364 pSubtag = pNext;
2365 pSep = pSubtag;
2366 while (*pSep) {
2367 if (*pSep == SEP) {
2368 break;
2369 }
2370 pSep++;
2371 }
2372 if (*pSep == 0) {
2373 /* last subtag */
2374 pNext = nullptr;
2375 } else {
2376 pNext = pSep + 1;
2377 }
2378 subtagLen = (int32_t)(pSep - pSubtag);
2379
2380 if (uprv_strncmp(pSubtag, PRIVUSE_VARIANT_PREFIX, uprv_strlen(PRIVUSE_VARIANT_PREFIX)) == 0) {
2381 *pSep = 0;
2382 next = VART;
2383 privateuseVar = true;
2384 break;
2385 } else if (_isPrivateuseValueSubtag(pSubtag, subtagLen)) {
2386 pLastGoodPosition = pSep;
2387 } else {
2388 break;
2389 }
2390 }
2391
2392 if (next == VART) {
2393 continue;
2394 }
2395
2396 if (pLastGoodPosition - pPrivuseVal > 0) {
2397 *pLastGoodPosition = 0;
2398 t->privateuse = T_CString_toLowerCase(pPrivuseVal);
2399 }
2400 /* No more subtags, exiting the parse loop */
2401 break;
2402 }
2403 break;
2404 }
2405
2406 /* If we fell through here, it means this subtag is illegal - quit parsing */
2407 break;
2408 }
2409
2410 if (pExtension != nullptr) {
2411 /* Process the last extension */
2412 if (pExtValueSubtag == nullptr || pExtValueSubtagEnd == nullptr) {
2413 /* the previous extension is incomplete */
2414 uprv_free(pExtension);
2415 } else {
2416 /* terminate the previous extension value */
2417 *pExtValueSubtagEnd = 0;
2418 pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2419 /* insert the extension to the list */
2420 if (_addExtensionToList(&(t->extensions), pExtension, false)) {
2421 pLastGoodPosition = pExtValueSubtagEnd;
2422 } else {
2423 uprv_free(pExtension);
2424 }
2425 }
2426 }
2427
2428 if (parsedLen != nullptr) {
2429 *parsedLen = (int32_t)(pLastGoodPosition - t->buf + parsedLenDelta);
2430 }
2431
2432 return t.orphan();
2433}
2434
2435// Ticket #12705 - Turn optimization back on.
2436#if defined(_MSC_VER) && (_MSC_VER >= 1900) && (_MSC_VER < 1924)
2437#pragma optimize( "", on )
2438#endif
2439
2440static void
2441ultag_close(ULanguageTag* langtag) {
2442
2443 if (langtag == nullptr) {
2444 return;
2445 }
2446
2447 uprv_free(langtag->buf);
2448
2449 if (langtag->variants) {
2450 VariantListEntry *curVar = langtag->variants;
2451 while (curVar) {
2452 VariantListEntry *nextVar = curVar->next;
2453 uprv_free(curVar);
2454 curVar = nextVar;
2455 }
2456 }
2457
2458 if (langtag->extensions) {
2459 ExtensionListEntry *curExt = langtag->extensions;
2460 while (curExt) {
2461 ExtensionListEntry *nextExt = curExt->next;
2462 uprv_free(curExt);
2463 curExt = nextExt;
2464 }
2465 }
2466
2467 uprv_free(langtag);
2468}
2469
/*
 * Returns the tag's language field exactly as stored in the struct.
 * No copy is made and langtag is not checked for nullptr.
 */
static const char*
ultag_getLanguage(const ULanguageTag* langtag) {
    return langtag->language;
}
2474
#if 0
/*
 * Disabled: this function is compiled out by the enclosing `#if 0`.
 *
 * Walks DEPRECATEDLANGS two entries at a time until a nullptr entry is
 * reached. When the entry at an even index compares equal to the tag's
 * language (invariant-character comparison), the entry immediately after
 * it is returned. If no entry matches, the tag's own language is returned.
 */
static const char*
ultag_getJDKLanguage(const ULanguageTag* langtag) {
    int32_t i;
    for (i = 0; DEPRECATEDLANGS[i] != nullptr; i += 2) {
        if (uprv_compareInvCharsAsAscii(DEPRECATEDLANGS[i], langtag->language) == 0) {
            return DEPRECATEDLANGS[i + 1];
        }
    }
    return langtag->language;
}
#endif
2487
2488static const char*
2489ultag_getExtlang(const ULanguageTag* langtag, int32_t idx) {
2490 if (idx >= 0 && idx < MAXEXTLANG) {
2491 return langtag->extlang[idx];
2492 }
2493 return nullptr;
2494}
2495
2496static int32_t
2497ultag_getExtlangSize(const ULanguageTag* langtag) {
2498 int32_t size = 0;
2499 int32_t i;
2500 for (i = 0; i < MAXEXTLANG; i++) {
2501 if (langtag->extlang[i]) {
2502 size++;
2503 }
2504 }
2505 return size;
2506}
2507
/*
 * Returns the tag's script field exactly as stored in the struct.
 * No copy is made and langtag is not checked for nullptr.
 */
static const char*
ultag_getScript(const ULanguageTag* langtag) {
    return langtag->script;
}
2512
/*
 * Returns the tag's region field exactly as stored in the struct.
 * No copy is made and langtag is not checked for nullptr.
 */
static const char*
ultag_getRegion(const ULanguageTag* langtag) {
    return langtag->region;
}
2517
2518static const char*
2519ultag_getVariant(const ULanguageTag* langtag, int32_t idx) {
2520 const char *var = nullptr;
2521 VariantListEntry *cur = langtag->variants;
2522 int32_t i = 0;
2523 while (cur) {
2524 if (i == idx) {
2525 var = cur->variant;
2526 break;
2527 }
2528 cur = cur->next;
2529 i++;
2530 }
2531 return var;
2532}
2533
2534static int32_t
2535ultag_getVariantsSize(const ULanguageTag* langtag) {
2536 int32_t size = 0;
2537 VariantListEntry *cur = langtag->variants;
2538 while (true) {
2539 if (cur == nullptr) {
2540 break;
2541 }
2542 size++;
2543 cur = cur->next;
2544 }
2545 return size;
2546}
2547
2548static const char*
2549ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx) {
2550 const char *key = nullptr;
2551 ExtensionListEntry *cur = langtag->extensions;
2552 int32_t i = 0;
2553 while (cur) {
2554 if (i == idx) {
2555 key = cur->key;
2556 break;
2557 }
2558 cur = cur->next;
2559 i++;
2560 }
2561 return key;
2562}
2563
2564static const char*
2565ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx) {
2566 const char *val = nullptr;
2567 ExtensionListEntry *cur = langtag->extensions;
2568 int32_t i = 0;
2569 while (cur) {
2570 if (i == idx) {
2571 val = cur->value;
2572 break;
2573 }
2574 cur = cur->next;
2575 i++;
2576 }
2577 return val;
2578}
2579
2580static int32_t
2581ultag_getExtensionsSize(const ULanguageTag* langtag) {
2582 int32_t size = 0;
2583 ExtensionListEntry *cur = langtag->extensions;
2584 while (true) {
2585 if (cur == nullptr) {
2586 break;
2587 }
2588 size++;
2589 cur = cur->next;
2590 }
2591 return size;
2592}
2593
/*
 * Returns the tag's privateuse field exactly as stored in the struct.
 * No copy is made and langtag is not checked for nullptr.
 */
static const char*
ultag_getPrivateUse(const ULanguageTag* langtag) {
    return langtag->privateuse;
}
2598
#if 0
/*
 * Disabled: this function is compiled out by the enclosing `#if 0`.
 * Returns the tag's legacy field exactly as stored in the struct.
 */
static const char*
ultag_getLegacy(const ULanguageTag* langtag) {
    return langtag->legacy;
}
#endif
2605
2606
2607/*
2608* -------------------------------------------------
2609*
2610* Locale/BCP47 conversion APIs, exposed as uloc_*
2611*
2612* -------------------------------------------------
2613*/
2614U_CAPI int32_t U_EXPORT2
2615uloc_toLanguageTag(const char* localeID,
2616 char* langtag,
2617 int32_t langtagCapacity,
2618 UBool strict,
2619 UErrorCode* status) {
2620 if (U_FAILURE(*status)) {
2621 return 0;
2622 }
2623
2624 icu::CheckedArrayByteSink sink(langtag, langtagCapacity);
2625 ulocimp_toLanguageTag(localeID, sink, strict, status);
2626
2627 int32_t reslen = sink.NumberOfBytesAppended();
2628
2629 if (U_FAILURE(*status)) {
2630 return reslen;
2631 }
2632
2633 if (sink.Overflowed()) {
2634 *status = U_BUFFER_OVERFLOW_ERROR;
2635 } else {
2636 u_terminateChars(langtag, langtagCapacity, reslen, status);
2637 }
2638
2639 return reslen;
2640}
2641
2642
/*
 * Converts localeID into a language tag and appends the result to sink.
 *
 * Steps:
 *   1. If localeID is non-empty, canonicalize it into `canonical`
 *      (an empty localeID leaves `canonical` empty).
 *   2. Special case: when the keywords start at the very beginning of the
 *      canonical ID and there is exactly one keyword whose key is the
 *      private-use singleton, emit "und-x-<value>" and return.
 *   3. Otherwise append language, script, region, variants, keywords and
 *      private use, in that order, through the _append* helpers.
 *
 * localeID: locale ID to convert; passed to uprv_strlen() unchecked.
 * sink:     receives the resulting tag bytes.
 * strict:   forwarded to the _append* helpers; in the private-use special
 *           case it turns an invalid value into U_ILLEGAL_ARGUMENT_ERROR.
 * status:   receives errors. NOTE(review): *status is not tested on entry
 *           in this function; uloc_toLanguageTag() tests it before calling.
 */
U_CAPI void U_EXPORT2
ulocimp_toLanguageTag(const char* localeID,
                      icu::ByteSink& sink,
                      UBool strict,
                      UErrorCode* status) {
    icu::CharString canonical;
    int32_t reslen;
    /* Scratch status so intermediate results can be remapped before they reach *status. */
    UErrorCode tmpStatus = U_ZERO_ERROR;
    UBool hadPosix = false;
    const char* pKeywordStart;

    /* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "".  See #6835 */
    int32_t resultCapacity = static_cast<int32_t>(uprv_strlen(localeID));
    if (resultCapacity > 0) {
        char* buffer;

        /*
         * Start with a buffer as long as the input. On overflow, retry with
         * the length that uloc_canonicalize reported.
         */
        for (;;) {
            buffer = canonical.getAppendBuffer(
                    /*minCapacity=*/resultCapacity,
                    /*desiredCapacityHint=*/resultCapacity,
                    resultCapacity,
                    tmpStatus);

            if (U_FAILURE(tmpStatus)) {
                *status = tmpStatus;
                return;
            }

            reslen =
                uloc_canonicalize(localeID, buffer, resultCapacity, &tmpStatus);

            if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) {
                break;
            }

            resultCapacity = reslen;
            tmpStatus = U_ZERO_ERROR;
        }

        /* Any other canonicalization failure is reported as an illegal argument. */
        if (U_FAILURE(tmpStatus)) {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }

        /* Commit the bytes that uloc_canonicalize wrote into the append buffer. */
        canonical.append(buffer, reslen, tmpStatus);
        if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
            tmpStatus = U_ZERO_ERROR;  // Terminators provided by CharString.
        }

        if (U_FAILURE(tmpStatus)) {
            *status = tmpStatus;
            return;
        }
    }

    /* For handling special case - private use only tag */
    pKeywordStart = locale_getKeywordsStart(canonical.data());
    /* True only when nothing precedes the keyword section in the canonical ID. */
    if (pKeywordStart == canonical.data()) {
        int kwdCnt = 0;
        UBool done = false;

        icu::LocalUEnumerationPointer kwdEnum(uloc_openKeywords(canonical.data(), &tmpStatus));
        if (U_SUCCESS(tmpStatus)) {
            kwdCnt = uenum_count(kwdEnum.getAlias(), &tmpStatus);
            if (kwdCnt == 1) {
                const char *key;
                int32_t len = 0;

                key = uenum_next(kwdEnum.getAlias(), &len, &tmpStatus);
                if (len == 1 && *key == PRIVATEUSE) {
                    icu::CharString buf;
                    {
                        /*
                         * This local `sink` shadows the function parameter
                         * inside these braces only. The value is read from
                         * the original localeID, not from `canonical`.
                         */
                        icu::CharStringByteSink sink(&buf);
                        ulocimp_getKeywordValue(localeID, key, sink, &tmpStatus);
                    }
                    if (U_SUCCESS(tmpStatus)) {
                        if (ultag_isPrivateuseValueSubtags(buf.data(), buf.length())) {
                            /* return private use only tag */
                            sink.Append("und-x-", 6);
                            sink.Append(buf.data(), buf.length());
                            done = true;
                        } else if (strict) {
                            *status = U_ILLEGAL_ARGUMENT_ERROR;
                            done = true;
                        }
                        /* if not strict mode, then "und" will be returned */
                    } else {
                        *status = U_ILLEGAL_ARGUMENT_ERROR;
                        done = true;
                    }
                }
            }
            if (done) {
                return;
            }
        }
    }

    /*
     * General path. hadPosix is an output of the variants step and an input
     * to the keywords and private-use steps, so the call order matters.
     */
    _appendLanguageToLanguageTag(canonical.data(), sink, strict, status);
    _appendScriptToLanguageTag(canonical.data(), sink, strict, status);
    _appendRegionToLanguageTag(canonical.data(), sink, strict, status);
    _appendVariantsToLanguageTag(canonical.data(), sink, strict, &hadPosix, status);
    _appendKeywordsToLanguageTag(canonical.data(), sink, strict, hadPosix, status);
    _appendPrivateuseToLanguageTag(canonical.data(), sink, strict, hadPosix, status);
}
2748
2749
2750U_CAPI int32_t U_EXPORT2
2751uloc_forLanguageTag(const char* langtag,
2752 char* localeID,
2753 int32_t localeIDCapacity,
2754 int32_t* parsedLength,
2755 UErrorCode* status) {
2756 if (U_FAILURE(*status)) {
2757 return 0;
2758 }
2759
2760 icu::CheckedArrayByteSink sink(localeID, localeIDCapacity);
2761 ulocimp_forLanguageTag(langtag, -1, sink, parsedLength, status);
2762
2763 int32_t reslen = sink.NumberOfBytesAppended();
2764
2765 if (U_FAILURE(*status)) {
2766 return reslen;
2767 }
2768
2769 if (sink.Overflowed()) {
2770 *status = U_BUFFER_OVERFLOW_ERROR;
2771 } else {
2772 u_terminateChars(localeID, localeIDCapacity, reslen, status);
2773 }
2774
2775 return reslen;
2776}
2777
2778
2779U_CAPI void U_EXPORT2
2780ulocimp_forLanguageTag(const char* langtag,
2781 int32_t tagLen,
2782 icu::ByteSink& sink,
2783 int32_t* parsedLength,
2784 UErrorCode* status) {
2785 UBool isEmpty = true;
2786 const char *subtag, *p;
2787 int32_t len;
2788 int32_t i, n;
2789 UBool noRegion = true;
2790
2791 icu::LocalULanguageTagPointer lt(ultag_parse(langtag, tagLen, parsedLength, status));
2792 if (U_FAILURE(*status)) {
2793 return;
2794 }
2795
2796 /* language */
2797 subtag = ultag_getExtlangSize(lt.getAlias()) > 0 ? ultag_getExtlang(lt.getAlias(), 0) : ultag_getLanguage(lt.getAlias());
2798 if (uprv_compareInvCharsAsAscii(subtag, LANG_UND) != 0) {
2799 len = (int32_t)uprv_strlen(subtag);
2800 if (len > 0) {
2801 sink.Append(subtag, len);
2802 isEmpty = false;
2803 }
2804 }
2805
2806 /* script */
2807 subtag = ultag_getScript(lt.getAlias());
2808 len = (int32_t)uprv_strlen(subtag);
2809 if (len > 0) {
2810 sink.Append("_", 1);
2811 isEmpty = false;
2812
2813 /* write out the script in title case */
2814 char c = uprv_toupper(*subtag);
2815 sink.Append(&c, 1);
2816 sink.Append(subtag + 1, len - 1);
2817 }
2818
2819 /* region */
2820 subtag = ultag_getRegion(lt.getAlias());
2821 len = (int32_t)uprv_strlen(subtag);
2822 if (len > 0) {
2823 sink.Append("_", 1);
2824 isEmpty = false;
2825
2826 /* write out the region in upper case */
2827 p = subtag;
2828 while (*p) {
2829 char c = uprv_toupper(*p);
2830 sink.Append(&c, 1);
2831 p++;
2832 }
2833 noRegion = false;
2834 }
2835
2836 /* variants */
2837 _sortVariants(lt.getAlias()->variants);
2838 n = ultag_getVariantsSize(lt.getAlias());
2839 if (n > 0) {
2840 if (noRegion) {
2841 sink.Append("_", 1);
2842 isEmpty = false;
2843 }
2844
2845 for (i = 0; i < n; i++) {
2846 subtag = ultag_getVariant(lt.getAlias(), i);
2847 sink.Append("_", 1);
2848
2849 /* write out the variant in upper case */
2850 p = subtag;
2851 while (*p) {
2852 char c = uprv_toupper(*p);
2853 sink.Append(&c, 1);
2854 p++;
2855 }
2856 }
2857 }
2858
2859 /* keywords */
2860 n = ultag_getExtensionsSize(lt.getAlias());
2861 subtag = ultag_getPrivateUse(lt.getAlias());
2862 if (n > 0 || uprv_strlen(subtag) > 0) {
2863 if (isEmpty && n > 0) {
2864 /* need a language */
2865 sink.Append(LANG_UND, LANG_UND_LEN);
2866 }
2867 _appendKeywords(lt.getAlias(), sink, status);
2868 }
2869}
2870