1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 1997-2016, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: loclikely.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2010feb25
16* created by: Markus W. Scherer
17*
18* Code for likely and minimized locale subtags, separated out from other .cpp files
19* that then do not depend on resource bundle code and likely-subtags data.
20*/
21
22#include "unicode/bytestream.h"
23#include "unicode/utypes.h"
24#include "unicode/locid.h"
25#include "unicode/putil.h"
26#include "unicode/uchar.h"
27#include "unicode/uloc.h"
28#include "unicode/ures.h"
29#include "unicode/uscript.h"
30#include "bytesinkutil.h"
31#include "charstr.h"
32#include "cmemory.h"
33#include "cstring.h"
34#include "ulocimp.h"
35#include "ustr_imp.h"
36
37/**
38 * These are the canonical strings for unknown languages, scripts and regions.
39 **/
40static const char* const unknownLanguage = "und";
41static const char* const unknownScript = "Zzzz";
42static const char* const unknownRegion = "ZZ";
43
44/**
45 * This function looks for the localeID in the likelySubtags resource.
46 *
47 * @param localeID The tag to find.
48 * @param buffer A buffer to hold the matching entry
49 * @param bufferLength The length of the output buffer
50 * @return A pointer to "buffer" if found, or a null pointer if not.
51 */
52static const char* U_CALLCONV
53findLikelySubtags(const char* localeID,
54 char* buffer,
55 int32_t bufferLength,
56 UErrorCode* err) {
57 const char* result = nullptr;
58
59 if (!U_FAILURE(*err)) {
60 int32_t resLen = 0;
61 const char16_t* s = nullptr;
62 UErrorCode tmpErr = U_ZERO_ERROR;
63 icu::LocalUResourceBundlePointer subtags(ures_openDirect(nullptr, "likelySubtags", &tmpErr));
64 if (U_SUCCESS(tmpErr)) {
65 icu::CharString und;
66 if (localeID != nullptr) {
67 if (*localeID == '\0') {
68 localeID = unknownLanguage;
69 } else if (*localeID == '_') {
70 und.append(unknownLanguage, *err);
71 und.append(localeID, *err);
72 if (U_FAILURE(*err)) {
73 return nullptr;
74 }
75 localeID = und.data();
76 }
77 }
78 s = ures_getStringByKey(subtags.getAlias(), localeID, &resLen, &tmpErr);
79
80 if (U_FAILURE(tmpErr)) {
81 /*
82 * If a resource is missing, it's not really an error, it's
83 * just that we don't have any data for that particular locale ID.
84 */
85 if (tmpErr != U_MISSING_RESOURCE_ERROR) {
86 *err = tmpErr;
87 }
88 }
89 else if (resLen >= bufferLength) {
90 /* The buffer should never overflow. */
91 *err = U_INTERNAL_PROGRAM_ERROR;
92 }
93 else {
94 u_UCharsToChars(s, buffer, resLen + 1);
95 if (resLen >= 3 &&
96 uprv_strnicmp(buffer, unknownLanguage, 3) == 0 &&
97 (resLen == 3 || buffer[3] == '_')) {
98 uprv_memmove(buffer, buffer + 3, resLen - 3 + 1);
99 }
100 result = buffer;
101 }
102 } else {
103 *err = tmpErr;
104 }
105 }
106
107 return result;
108}
109
110/**
111 * Append a tag to a buffer, adding the separator if necessary. The buffer
112 * must be large enough to contain the resulting tag plus any separator
113 * necessary. The tag must not be a zero-length string.
114 *
115 * @param tag The tag to add.
116 * @param tagLength The length of the tag.
117 * @param buffer The output buffer.
118 * @param bufferLength The length of the output buffer. This is an input/output parameter.
119 **/
120static void U_CALLCONV
121appendTag(
122 const char* tag,
123 int32_t tagLength,
124 char* buffer,
125 int32_t* bufferLength,
126 UBool withSeparator) {
127
128 if (withSeparator) {
129 buffer[*bufferLength] = '_';
130 ++(*bufferLength);
131 }
132
133 uprv_memmove(
134 &buffer[*bufferLength],
135 tag,
136 tagLength);
137
138 *bufferLength += tagLength;
139}
140
141/**
142 * Create a tag string from the supplied parameters. The lang, script and region
143 * parameters may be nullptr pointers. If they are, their corresponding length parameters
144 * must be less than or equal to 0.
145 *
146 * If any of the language, script or region parameters are empty, and the alternateTags
147 * parameter is not nullptr, it will be parsed for potential language, script and region tags
148 * to be used when constructing the new tag. If the alternateTags parameter is nullptr, or
149 * it contains no language tag, the default tag for the unknown language is used.
150 *
151 * If the length of the new string exceeds the capacity of the output buffer,
152 * the function copies as many bytes to the output buffer as it can, and returns
153 * the error U_BUFFER_OVERFLOW_ERROR.
154 *
155 * If an illegal argument is provided, the function returns the error
156 * U_ILLEGAL_ARGUMENT_ERROR.
157 *
158 * Note that this function can return the warning U_STRING_NOT_TERMINATED_WARNING if
159 * the tag string fits in the output buffer, but the null terminator doesn't.
160 *
161 * @param lang The language tag to use.
162 * @param langLength The length of the language tag.
163 * @param script The script tag to use.
164 * @param scriptLength The length of the script tag.
165 * @param region The region tag to use.
166 * @param regionLength The length of the region tag.
167 * @param trailing Any trailing data to append to the new tag.
168 * @param trailingLength The length of the trailing data.
169 * @param alternateTags A string containing any alternate tags.
170 * @param sink The output sink receiving the tag string.
171 * @param err A pointer to a UErrorCode for error reporting.
172 **/
173static void U_CALLCONV
174createTagStringWithAlternates(
175 const char* lang,
176 int32_t langLength,
177 const char* script,
178 int32_t scriptLength,
179 const char* region,
180 int32_t regionLength,
181 const char* trailing,
182 int32_t trailingLength,
183 const char* alternateTags,
184 icu::ByteSink& sink,
185 UErrorCode* err) {
186
187 if (U_FAILURE(*err)) {
188 goto error;
189 }
190 else if (langLength >= ULOC_LANG_CAPACITY ||
191 scriptLength >= ULOC_SCRIPT_CAPACITY ||
192 regionLength >= ULOC_COUNTRY_CAPACITY) {
193 goto error;
194 }
195 else {
196 /**
197 * ULOC_FULLNAME_CAPACITY will provide enough capacity
198 * that we can build a string that contains the language,
199 * script and region code without worrying about overrunning
200 * the user-supplied buffer.
201 **/
202 char tagBuffer[ULOC_FULLNAME_CAPACITY];
203 int32_t tagLength = 0;
204 UBool regionAppended = false;
205
206 if (langLength > 0) {
207 appendTag(
208 lang,
209 langLength,
210 tagBuffer,
211 &tagLength,
212 /*withSeparator=*/false);
213 }
214 else if (alternateTags == nullptr) {
215 /*
216 * Use the empty string for an unknown language, if
217 * we found no language.
218 */
219 }
220 else {
221 /*
222 * Parse the alternateTags string for the language.
223 */
224 char alternateLang[ULOC_LANG_CAPACITY];
225 int32_t alternateLangLength = sizeof(alternateLang);
226
227 alternateLangLength =
228 uloc_getLanguage(
229 alternateTags,
230 alternateLang,
231 alternateLangLength,
232 err);
233 if(U_FAILURE(*err) ||
234 alternateLangLength >= ULOC_LANG_CAPACITY) {
235 goto error;
236 }
237 else if (alternateLangLength == 0) {
238 /*
239 * Use the empty string for an unknown language, if
240 * we found no language.
241 */
242 }
243 else {
244 appendTag(
245 alternateLang,
246 alternateLangLength,
247 tagBuffer,
248 &tagLength,
249 /*withSeparator=*/false);
250 }
251 }
252
253 if (scriptLength > 0) {
254 appendTag(
255 script,
256 scriptLength,
257 tagBuffer,
258 &tagLength,
259 /*withSeparator=*/true);
260 }
261 else if (alternateTags != nullptr) {
262 /*
263 * Parse the alternateTags string for the script.
264 */
265 char alternateScript[ULOC_SCRIPT_CAPACITY];
266
267 const int32_t alternateScriptLength =
268 uloc_getScript(
269 alternateTags,
270 alternateScript,
271 sizeof(alternateScript),
272 err);
273
274 if (U_FAILURE(*err) ||
275 alternateScriptLength >= ULOC_SCRIPT_CAPACITY) {
276 goto error;
277 }
278 else if (alternateScriptLength > 0) {
279 appendTag(
280 alternateScript,
281 alternateScriptLength,
282 tagBuffer,
283 &tagLength,
284 /*withSeparator=*/true);
285 }
286 }
287
288 if (regionLength > 0) {
289 appendTag(
290 region,
291 regionLength,
292 tagBuffer,
293 &tagLength,
294 /*withSeparator=*/true);
295
296 regionAppended = true;
297 }
298 else if (alternateTags != nullptr) {
299 /*
300 * Parse the alternateTags string for the region.
301 */
302 char alternateRegion[ULOC_COUNTRY_CAPACITY];
303
304 const int32_t alternateRegionLength =
305 uloc_getCountry(
306 alternateTags,
307 alternateRegion,
308 sizeof(alternateRegion),
309 err);
310 if (U_FAILURE(*err) ||
311 alternateRegionLength >= ULOC_COUNTRY_CAPACITY) {
312 goto error;
313 }
314 else if (alternateRegionLength > 0) {
315 appendTag(
316 alternateRegion,
317 alternateRegionLength,
318 tagBuffer,
319 &tagLength,
320 /*withSeparator=*/true);
321
322 regionAppended = true;
323 }
324 }
325
326 /**
327 * Copy the partial tag from our internal buffer to the supplied
328 * target.
329 **/
330 sink.Append(tagBuffer, tagLength);
331
332 if (trailingLength > 0) {
333 if (*trailing != '@') {
334 sink.Append("_", 1);
335 if (!regionAppended) {
336 /* extra separator is required */
337 sink.Append("_", 1);
338 }
339 }
340
341 /*
342 * Copy the trailing data into the supplied buffer.
343 */
344 sink.Append(trailing, trailingLength);
345 }
346
347 return;
348 }
349
350error:
351
352 /**
353 * An overflow indicates the locale ID passed in
354 * is ill-formed. If we got here, and there was
355 * no previous error, it's an implicit overflow.
356 **/
357 if (*err == U_BUFFER_OVERFLOW_ERROR ||
358 U_SUCCESS(*err)) {
359 *err = U_ILLEGAL_ARGUMENT_ERROR;
360 }
361}
362
363/**
364 * Create a tag string from the supplied parameters. The lang, script and region
365 * parameters may be nullptr pointers. If they are, their corresponding length parameters
366 * must be less than or equal to 0. If the lang parameter is an empty string, the
367 * default value for an unknown language is written to the output buffer.
368 *
369 * If the length of the new string exceeds the capacity of the output buffer,
370 * the function copies as many bytes to the output buffer as it can, and returns
371 * the error U_BUFFER_OVERFLOW_ERROR.
372 *
373 * If an illegal argument is provided, the function returns the error
374 * U_ILLEGAL_ARGUMENT_ERROR.
375 *
376 * @param lang The language tag to use.
377 * @param langLength The length of the language tag.
378 * @param script The script tag to use.
379 * @param scriptLength The length of the script tag.
380 * @param region The region tag to use.
381 * @param regionLength The length of the region tag.
382 * @param trailing Any trailing data to append to the new tag.
383 * @param trailingLength The length of the trailing data.
384 * @param sink The output sink receiving the tag string.
385 * @param err A pointer to a UErrorCode for error reporting.
386 **/
387static void U_CALLCONV
388createTagString(
389 const char* lang,
390 int32_t langLength,
391 const char* script,
392 int32_t scriptLength,
393 const char* region,
394 int32_t regionLength,
395 const char* trailing,
396 int32_t trailingLength,
397 icu::ByteSink& sink,
398 UErrorCode* err)
399{
400 createTagStringWithAlternates(
401 lang,
402 langLength,
403 script,
404 scriptLength,
405 region,
406 regionLength,
407 trailing,
408 trailingLength,
409 nullptr,
410 sink,
411 err);
412}
413
414/**
415 * Parse the language, script, and region subtags from a tag string, and copy the
416 * results into the corresponding output parameters. The buffers are null-terminated,
417 * unless overflow occurs.
418 *
419 * The langLength, scriptLength, and regionLength parameters are input/output
420 * parameters, and must contain the capacity of their corresponding buffers on
421 * input. On output, they will contain the actual length of the buffers, not
422 * including the null terminator.
423 *
424 * If the length of any of the output subtags exceeds the capacity of the corresponding
425 * buffer, the function copies as many bytes to the output buffer as it can, and returns
426 * the error U_BUFFER_OVERFLOW_ERROR. It will not parse any more subtags once overflow
427 * occurs.
428 *
429 * If an illegal argument is provided, the function returns the error
430 * U_ILLEGAL_ARGUMENT_ERROR.
431 *
432 * @param localeID The locale ID to parse.
433 * @param lang The language tag buffer.
434 * @param langLength The length of the language tag.
435 * @param script The script tag buffer.
436 * @param scriptLength The length of the script tag.
437 * @param region The region tag buffer.
438 * @param regionLength The length of the region tag.
439 * @param err A pointer to a UErrorCode for error reporting.
440 * @return The number of chars of the localeID parameter consumed.
441 **/
442static int32_t U_CALLCONV
443parseTagString(
444 const char* localeID,
445 char* lang,
446 int32_t* langLength,
447 char* script,
448 int32_t* scriptLength,
449 char* region,
450 int32_t* regionLength,
451 UErrorCode* err)
452{
453 const char* position = localeID;
454 int32_t subtagLength = 0;
455
456 if(U_FAILURE(*err) ||
457 localeID == nullptr ||
458 lang == nullptr ||
459 langLength == nullptr ||
460 script == nullptr ||
461 scriptLength == nullptr ||
462 region == nullptr ||
463 regionLength == nullptr) {
464 goto error;
465 }
466
467 subtagLength = ulocimp_getLanguage(position, &position, *err).extract(lang, *langLength, *err);
468
469 /*
470 * Note that we explicit consider U_STRING_NOT_TERMINATED_WARNING
471 * to be an error, because it indicates the user-supplied tag is
472 * not well-formed.
473 */
474 if(U_FAILURE(*err)) {
475 goto error;
476 }
477
478 *langLength = subtagLength;
479
480 /*
481 * If no language was present, use the empty string instead.
482 * Otherwise, move past any separator.
483 */
484 if (_isIDSeparator(*position)) {
485 ++position;
486 }
487
488 subtagLength = ulocimp_getScript(position, &position, *err).extract(script, *scriptLength, *err);
489
490 if(U_FAILURE(*err)) {
491 goto error;
492 }
493
494 *scriptLength = subtagLength;
495
496 if (*scriptLength > 0) {
497 if (uprv_strnicmp(script, unknownScript, *scriptLength) == 0) {
498 /**
499 * If the script part is the "unknown" script, then don't return it.
500 **/
501 *scriptLength = 0;
502 }
503
504 /*
505 * Move past any separator.
506 */
507 if (_isIDSeparator(*position)) {
508 ++position;
509 }
510 }
511
512 subtagLength = ulocimp_getCountry(position, &position, *err).extract(region, *regionLength, *err);
513
514 if(U_FAILURE(*err)) {
515 goto error;
516 }
517
518 *regionLength = subtagLength;
519
520 if (*regionLength > 0) {
521 if (uprv_strnicmp(region, unknownRegion, *regionLength) == 0) {
522 /**
523 * If the region part is the "unknown" region, then don't return it.
524 **/
525 *regionLength = 0;
526 }
527 } else if (*position != 0 && *position != '@') {
528 /* back up over consumed trailing separator */
529 --position;
530 }
531
532exit:
533
534 return (int32_t)(position - localeID);
535
536error:
537
538 /**
539 * If we get here, we have no explicit error, it's the result of an
540 * illegal argument.
541 **/
542 if (!U_FAILURE(*err)) {
543 *err = U_ILLEGAL_ARGUMENT_ERROR;
544 }
545
546 goto exit;
547}
548
549static UBool U_CALLCONV
550createLikelySubtagsString(
551 const char* lang,
552 int32_t langLength,
553 const char* script,
554 int32_t scriptLength,
555 const char* region,
556 int32_t regionLength,
557 const char* variants,
558 int32_t variantsLength,
559 icu::ByteSink& sink,
560 UErrorCode* err) {
561 /**
562 * ULOC_FULLNAME_CAPACITY will provide enough capacity
563 * that we can build a string that contains the language,
564 * script and region code without worrying about overrunning
565 * the user-supplied buffer.
566 **/
567 char likelySubtagsBuffer[ULOC_FULLNAME_CAPACITY];
568
569 if(U_FAILURE(*err)) {
570 goto error;
571 }
572
573 /**
574 * Try the language with the script and region first.
575 **/
576 if (scriptLength > 0 && regionLength > 0) {
577
578 const char* likelySubtags = nullptr;
579
580 icu::CharString tagBuffer;
581 {
582 icu::CharStringByteSink sink(&tagBuffer);
583 createTagString(
584 lang,
585 langLength,
586 script,
587 scriptLength,
588 region,
589 regionLength,
590 nullptr,
591 0,
592 sink,
593 err);
594 }
595 if(U_FAILURE(*err)) {
596 goto error;
597 }
598
599 likelySubtags =
600 findLikelySubtags(
601 tagBuffer.data(),
602 likelySubtagsBuffer,
603 sizeof(likelySubtagsBuffer),
604 err);
605 if(U_FAILURE(*err)) {
606 goto error;
607 }
608
609 if (likelySubtags != nullptr) {
610 /* Always use the language tag from the
611 maximal string, since it may be more
612 specific than the one provided. */
613 createTagStringWithAlternates(
614 nullptr,
615 0,
616 nullptr,
617 0,
618 nullptr,
619 0,
620 variants,
621 variantsLength,
622 likelySubtags,
623 sink,
624 err);
625 return true;
626 }
627 }
628
629 /**
630 * Try the language with just the script.
631 **/
632 if (scriptLength > 0) {
633
634 const char* likelySubtags = nullptr;
635
636 icu::CharString tagBuffer;
637 {
638 icu::CharStringByteSink sink(&tagBuffer);
639 createTagString(
640 lang,
641 langLength,
642 script,
643 scriptLength,
644 nullptr,
645 0,
646 nullptr,
647 0,
648 sink,
649 err);
650 }
651 if(U_FAILURE(*err)) {
652 goto error;
653 }
654
655 likelySubtags =
656 findLikelySubtags(
657 tagBuffer.data(),
658 likelySubtagsBuffer,
659 sizeof(likelySubtagsBuffer),
660 err);
661 if(U_FAILURE(*err)) {
662 goto error;
663 }
664
665 if (likelySubtags != nullptr) {
666 /* Always use the language tag from the
667 maximal string, since it may be more
668 specific than the one provided. */
669 createTagStringWithAlternates(
670 nullptr,
671 0,
672 nullptr,
673 0,
674 region,
675 regionLength,
676 variants,
677 variantsLength,
678 likelySubtags,
679 sink,
680 err);
681 return true;
682 }
683 }
684
685 /**
686 * Try the language with just the region.
687 **/
688 if (regionLength > 0) {
689
690 const char* likelySubtags = nullptr;
691
692 icu::CharString tagBuffer;
693 {
694 icu::CharStringByteSink sink(&tagBuffer);
695 createTagString(
696 lang,
697 langLength,
698 nullptr,
699 0,
700 region,
701 regionLength,
702 nullptr,
703 0,
704 sink,
705 err);
706 }
707 if(U_FAILURE(*err)) {
708 goto error;
709 }
710
711 likelySubtags =
712 findLikelySubtags(
713 tagBuffer.data(),
714 likelySubtagsBuffer,
715 sizeof(likelySubtagsBuffer),
716 err);
717 if(U_FAILURE(*err)) {
718 goto error;
719 }
720
721 if (likelySubtags != nullptr) {
722 /* Always use the language tag from the
723 maximal string, since it may be more
724 specific than the one provided. */
725 createTagStringWithAlternates(
726 nullptr,
727 0,
728 script,
729 scriptLength,
730 nullptr,
731 0,
732 variants,
733 variantsLength,
734 likelySubtags,
735 sink,
736 err);
737 return true;
738 }
739 }
740
741 /**
742 * Finally, try just the language.
743 **/
744 {
745 const char* likelySubtags = nullptr;
746
747 icu::CharString tagBuffer;
748 {
749 icu::CharStringByteSink sink(&tagBuffer);
750 createTagString(
751 lang,
752 langLength,
753 nullptr,
754 0,
755 nullptr,
756 0,
757 nullptr,
758 0,
759 sink,
760 err);
761 }
762 if(U_FAILURE(*err)) {
763 goto error;
764 }
765
766 likelySubtags =
767 findLikelySubtags(
768 tagBuffer.data(),
769 likelySubtagsBuffer,
770 sizeof(likelySubtagsBuffer),
771 err);
772 if(U_FAILURE(*err)) {
773 goto error;
774 }
775
776 if (likelySubtags != nullptr) {
777 /* Always use the language tag from the
778 maximal string, since it may be more
779 specific than the one provided. */
780 createTagStringWithAlternates(
781 nullptr,
782 0,
783 script,
784 scriptLength,
785 region,
786 regionLength,
787 variants,
788 variantsLength,
789 likelySubtags,
790 sink,
791 err);
792 return true;
793 }
794 }
795
796 return false;
797
798error:
799
800 if (!U_FAILURE(*err)) {
801 *err = U_ILLEGAL_ARGUMENT_ERROR;
802 }
803
804 return false;
805}
806
/*
 * Scan the trailing part of a locale ID and jump to the enclosing function's
 * `error` label if any subtag is too long. Subtags are delimited by '-' or
 * '_'; scanning stops at '@' (the start of keywords) or after trailingLength
 * characters. The counter is tested before it is incremented, so a subtag is
 * rejected when its tenth character is reached.
 *
 * The function using this macro must define a label named `error`.
 */
#define CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t count = 0; \
    int32_t i; \
    for (i = 0; i < (trailingLength); i++) { \
        if ((trailing)[i] == '-' || (trailing)[i] == '_') { \
            count = 0; \
        } else if ((trailing)[i] == '@') { \
            break; \
        } else if (count > 8) { \
            goto error; \
        } else { \
            count++; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
825
826static UBool
827_uloc_addLikelySubtags(const char* localeID,
828 icu::ByteSink& sink,
829 UErrorCode* err) {
830 char lang[ULOC_LANG_CAPACITY];
831 int32_t langLength = sizeof(lang);
832 char script[ULOC_SCRIPT_CAPACITY];
833 int32_t scriptLength = sizeof(script);
834 char region[ULOC_COUNTRY_CAPACITY];
835 int32_t regionLength = sizeof(region);
836 const char* trailing = "";
837 int32_t trailingLength = 0;
838 int32_t trailingIndex = 0;
839 UBool success = false;
840
841 if(U_FAILURE(*err)) {
842 goto error;
843 }
844 if (localeID == nullptr) {
845 goto error;
846 }
847
848 trailingIndex = parseTagString(
849 localeID,
850 lang,
851 &langLength,
852 script,
853 &scriptLength,
854 region,
855 &regionLength,
856 err);
857 if(U_FAILURE(*err)) {
858 /* Overflow indicates an illegal argument error */
859 if (*err == U_BUFFER_OVERFLOW_ERROR) {
860 *err = U_ILLEGAL_ARGUMENT_ERROR;
861 }
862
863 goto error;
864 }
865
866 /* Find the length of the trailing portion. */
867 while (_isIDSeparator(localeID[trailingIndex])) {
868 trailingIndex++;
869 }
870 trailing = &localeID[trailingIndex];
871 trailingLength = (int32_t)uprv_strlen(trailing);
872
873 CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
874
875 success =
876 createLikelySubtagsString(
877 lang,
878 langLength,
879 script,
880 scriptLength,
881 region,
882 regionLength,
883 trailing,
884 trailingLength,
885 sink,
886 err);
887
888 if (!success) {
889 const int32_t localIDLength = (int32_t)uprv_strlen(localeID);
890
891 /*
892 * If we get here, we need to return localeID.
893 */
894 sink.Append(localeID, localIDLength);
895 }
896
897 return success;
898
899error:
900
901 if (!U_FAILURE(*err)) {
902 *err = U_ILLEGAL_ARGUMENT_ERROR;
903 }
904 return false;
905}
906
// Forward declaration; the definition appears later in this file.
// Adds likely subtags to the locale ID and writes the result to the sink.
// Returns true if the value in the sink was produced by a match during the
// lookup.
// Returns false if no match was found (the sink then receives the input
// locale ID as seen by the lookup) or if an error is reported.
static UBool _ulocimp_addLikelySubtags(const char*, icu::ByteSink&, UErrorCode*);
912
/**
 * Minimize the subtags of a locale ID and append the result to a sink.
 *
 * The language/script/region part of localeID is first maximized with
 * _ulocimp_addLikelySubtags(). Shorter candidates are then expanded with
 * createLikelySubtagsString() in the order language, language+region,
 * language+script; the first candidate whose expansion matches the start of
 * the maximized tag (case-insensitively) is emitted, followed by the
 * trailing part of localeID. If none matches, the maximized subtags plus the
 * trailing part are emitted. If maximization finds no match at all,
 * localeID is emitted unchanged.
 *
 * @param localeID The locale ID to minimize; nullptr is an illegal argument.
 * @param sink The output sink.
 * @param err A pointer to a UErrorCode for error reporting.
 */
static void
_uloc_minimizeSubtags(const char* localeID,
                      icu::ByteSink& sink,
                      UErrorCode* err) {
    icu::CharString maximizedTagBuffer;

    /* All locals are declared before the first goto so that no jump to
       `error` crosses an initialization. */
    char lang[ULOC_LANG_CAPACITY];
    int32_t langLength = sizeof(lang);
    char script[ULOC_SCRIPT_CAPACITY];
    int32_t scriptLength = sizeof(script);
    char region[ULOC_COUNTRY_CAPACITY];
    int32_t regionLength = sizeof(region);
    const char* trailing = "";
    int32_t trailingLength = 0;
    int32_t trailingIndex = 0;
    UBool successGetMax = false;

    if(U_FAILURE(*err)) {
        goto error;
    }
    else if (localeID == nullptr) {
        goto error;
    }

    /* Split the input into language, script and region; the return value is
       the offset at which the rest of the ID begins. */
    trailingIndex =
        parseTagString(
            localeID,
            lang,
            &langLength,
            script,
            &scriptLength,
            region,
            &regionLength,
            err);
    if(U_FAILURE(*err)) {

        /* Overflow indicates an illegal argument error */
        if (*err == U_BUFFER_OVERFLOW_ERROR) {
            *err = U_ILLEGAL_ARGUMENT_ERROR;
        }

        goto error;
    }

    /* Find the spot where the variants or the keywords begin, if any. */
    while (_isIDSeparator(localeID[trailingIndex])) {
        trailingIndex++;
    }
    trailing = &localeID[trailingIndex];
    trailingLength = (int32_t)uprv_strlen(trailing);

    /* Jumps to `error` when a trailing subtag is too long. */
    CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);

    {
        /* Rebuild the ID from the parsed subtags only, without the
           trailing part. */
        icu::CharString base;
        {
            icu::CharStringByteSink baseSink(&base);
            createTagString(
                lang,
                langLength,
                script,
                scriptLength,
                region,
                regionLength,
                nullptr,
                0,
                baseSink,
                err);
        }

        /**
         * First, we need to get the maximization
         * from AddLikelySubtags.
         **/
        {
            icu::CharStringByteSink maxSink(&maximizedTagBuffer);
            successGetMax = _ulocimp_addLikelySubtags(base.data(), maxSink, err);
        }
    }

    if(U_FAILURE(*err)) {
        goto error;
    }

    if (!successGetMax) {
        /**
         * If we got here, return the locale ID parameter unchanged.
         **/
        const int32_t localeIDLength = (int32_t)uprv_strlen(localeID);
        sink.Append(localeID, localeIDLength);
        return;
    }

    // In the following, the lang, script, region are referring to those in
    // the maximizedTagBuffer, not the one in the localeID.
    // The length variables are reset to the buffer capacities because
    // parseTagString() treats them as input/output parameters.
    langLength = sizeof(lang);
    scriptLength = sizeof(script);
    regionLength = sizeof(region);
    parseTagString(
        maximizedTagBuffer.data(),
        lang,
        &langLength,
        script,
        &scriptLength,
        region,
        &regionLength,
        err);
    if(U_FAILURE(*err)) {
        goto error;
    }

    /**
     * Start first with just the language.
     **/
    {
        icu::CharString tagBuffer;
        {
            icu::CharStringByteSink tagSink(&tagBuffer);
            createLikelySubtagsString(
                lang,
                langLength,
                nullptr,
                0,
                nullptr,
                0,
                nullptr,
                0,
                tagSink,
                err);
        }

        if(U_FAILURE(*err)) {
            goto error;
        }
        /* The comparison covers only tagBuffer.length() characters, i.e. it
           checks that the maximized tag starts with this expansion. */
        else if (!tagBuffer.isEmpty() &&
                 uprv_strnicmp(
                    maximizedTagBuffer.data(),
                    tagBuffer.data(),
                    tagBuffer.length()) == 0) {

            createTagString(
                lang,
                langLength,
                nullptr,
                0,
                nullptr,
                0,
                trailing,
                trailingLength,
                sink,
                err);
            return;
        }
    }

    /**
     * Next, try the language and region.
     **/
    if (regionLength > 0) {

        icu::CharString tagBuffer;
        {
            icu::CharStringByteSink tagSink(&tagBuffer);
            createLikelySubtagsString(
                lang,
                langLength,
                nullptr,
                0,
                region,
                regionLength,
                nullptr,
                0,
                tagSink,
                err);
        }

        if(U_FAILURE(*err)) {
            goto error;
        }
        else if (!tagBuffer.isEmpty() &&
                 uprv_strnicmp(
                    maximizedTagBuffer.data(),
                    tagBuffer.data(),
                    tagBuffer.length()) == 0) {

            createTagString(
                lang,
                langLength,
                nullptr,
                0,
                region,
                regionLength,
                trailing,
                trailingLength,
                sink,
                err);
            return;
        }
    }

    /**
     * Finally, try the language and script.  This is our last chance,
     * since trying with all three subtags would only yield the
     * maximal version that we already have.
     **/
    if (scriptLength > 0) {
        icu::CharString tagBuffer;
        {
            icu::CharStringByteSink tagSink(&tagBuffer);
            createLikelySubtagsString(
                lang,
                langLength,
                script,
                scriptLength,
                nullptr,
                0,
                nullptr,
                0,
                tagSink,
                err);
        }

        if(U_FAILURE(*err)) {
            goto error;
        }
        else if (!tagBuffer.isEmpty() &&
                 uprv_strnicmp(
                    maximizedTagBuffer.data(),
                    tagBuffer.data(),
                    tagBuffer.length()) == 0) {

            createTagString(
                lang,
                langLength,
                script,
                scriptLength,
                nullptr,
                0,
                trailing,
                trailingLength,
                sink,
                err);
            return;
        }
    }

    {
        /**
         * If we got here, return the max + trail.
         **/
        createTagString(
            lang,
            langLength,
            script,
            scriptLength,
            region,
            regionLength,
            trailing,
            trailingLength,
            sink,
            err);
        return;
    }

error:

    /* Reaching this label without a recorded failure means an illegal
       argument (null localeID or an over-long trailing subtag). */
    if (!U_FAILURE(*err)) {
        *err = U_ILLEGAL_ARGUMENT_ERROR;
    }
}
1183
1184static int32_t
1185do_canonicalize(const char* localeID,
1186 char* buffer,
1187 int32_t bufferCapacity,
1188 UErrorCode* err)
1189{
1190 int32_t canonicalizedSize = uloc_canonicalize(
1191 localeID,
1192 buffer,
1193 bufferCapacity,
1194 err);
1195
1196 if (*err == U_STRING_NOT_TERMINATED_WARNING ||
1197 *err == U_BUFFER_OVERFLOW_ERROR) {
1198 return canonicalizedSize;
1199 }
1200 else if (U_FAILURE(*err)) {
1201
1202 return -1;
1203 }
1204 else {
1205 return canonicalizedSize;
1206 }
1207}
1208
1209U_CAPI int32_t U_EXPORT2
1210uloc_addLikelySubtags(const char* localeID,
1211 char* maximizedLocaleID,
1212 int32_t maximizedLocaleIDCapacity,
1213 UErrorCode* status) {
1214 if (U_FAILURE(*status)) {
1215 return 0;
1216 }
1217
1218 icu::CheckedArrayByteSink sink(
1219 maximizedLocaleID, maximizedLocaleIDCapacity);
1220
1221 ulocimp_addLikelySubtags(localeID, sink, status);
1222 int32_t reslen = sink.NumberOfBytesAppended();
1223
1224 if (U_FAILURE(*status)) {
1225 return sink.Overflowed() ? reslen : -1;
1226 }
1227
1228 if (sink.Overflowed()) {
1229 *status = U_BUFFER_OVERFLOW_ERROR;
1230 } else {
1231 u_terminateChars(
1232 maximizedLocaleID, maximizedLocaleIDCapacity, reslen, status);
1233 }
1234
1235 return reslen;
1236}
1237
1238static UBool
1239_ulocimp_addLikelySubtags(const char* localeID,
1240 icu::ByteSink& sink,
1241 UErrorCode* status) {
1242 PreflightingLocaleIDBuffer localeBuffer;
1243 do {
1244 localeBuffer.requestedCapacity = do_canonicalize(localeID, localeBuffer.getBuffer(),
1245 localeBuffer.getCapacity(), status);
1246 } while (localeBuffer.needToTryAgain(status));
1247
1248 if (U_SUCCESS(*status)) {
1249 return _uloc_addLikelySubtags(localeBuffer.getBuffer(), sink, status);
1250 } else {
1251 return false;
1252 }
1253}
1254
1255U_CAPI void U_EXPORT2
1256ulocimp_addLikelySubtags(const char* localeID,
1257 icu::ByteSink& sink,
1258 UErrorCode* status) {
1259 _ulocimp_addLikelySubtags(localeID, sink, status);
1260}
1261
1262U_CAPI int32_t U_EXPORT2
1263uloc_minimizeSubtags(const char* localeID,
1264 char* minimizedLocaleID,
1265 int32_t minimizedLocaleIDCapacity,
1266 UErrorCode* status) {
1267 if (U_FAILURE(*status)) {
1268 return 0;
1269 }
1270
1271 icu::CheckedArrayByteSink sink(
1272 minimizedLocaleID, minimizedLocaleIDCapacity);
1273
1274 ulocimp_minimizeSubtags(localeID, sink, status);
1275 int32_t reslen = sink.NumberOfBytesAppended();
1276
1277 if (U_FAILURE(*status)) {
1278 return sink.Overflowed() ? reslen : -1;
1279 }
1280
1281 if (sink.Overflowed()) {
1282 *status = U_BUFFER_OVERFLOW_ERROR;
1283 } else {
1284 u_terminateChars(
1285 minimizedLocaleID, minimizedLocaleIDCapacity, reslen, status);
1286 }
1287
1288 return reslen;
1289}
1290
1291U_CAPI void U_EXPORT2
1292ulocimp_minimizeSubtags(const char* localeID,
1293 icu::ByteSink& sink,
1294 UErrorCode* status) {
1295 PreflightingLocaleIDBuffer localeBuffer;
1296 do {
1297 localeBuffer.requestedCapacity = do_canonicalize(localeID, localeBuffer.getBuffer(),
1298 localeBuffer.getCapacity(), status);
1299 } while (localeBuffer.needToTryAgain(status));
1300
1301 _uloc_minimizeSubtags(localeBuffer.getBuffer(), sink, status);
1302}
1303
1304// Pairs of (language subtag, + or -) for finding out fast if common languages
1305// are LTR (minus) or RTL (plus).
1306static const char LANG_DIR_STRING[] =
1307 "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";
1308
1309// Implemented here because this calls ulocimp_addLikelySubtags().
1310U_CAPI UBool U_EXPORT2
1311uloc_isRightToLeft(const char *locale) {
1312 UErrorCode errorCode = U_ZERO_ERROR;
1313 char script[8];
1314 int32_t scriptLength = uloc_getScript(locale, script, UPRV_LENGTHOF(script), &errorCode);
1315 if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING ||
1316 scriptLength == 0) {
1317 // Fastpath: We know the likely scripts and their writing direction
1318 // for some common languages.
1319 errorCode = U_ZERO_ERROR;
1320 char lang[8];
1321 int32_t langLength = uloc_getLanguage(locale, lang, UPRV_LENGTHOF(lang), &errorCode);
1322 if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
1323 return false;
1324 }
1325 if (langLength > 0) {
1326 const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang);
1327 if (langPtr != nullptr) {
1328 switch (langPtr[langLength]) {
1329 case '-': return false;
1330 case '+': return true;
1331 default: break; // partial match of a longer code
1332 }
1333 }
1334 }
1335 // Otherwise, find the likely script.
1336 errorCode = U_ZERO_ERROR;
1337 icu::CharString likely;
1338 {
1339 icu::CharStringByteSink sink(&likely);
1340 ulocimp_addLikelySubtags(locale, sink, &errorCode);
1341 }
1342 if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
1343 return false;
1344 }
1345 scriptLength = uloc_getScript(likely.data(), script, UPRV_LENGTHOF(script), &errorCode);
1346 if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING ||
1347 scriptLength == 0) {
1348 return false;
1349 }
1350 }
1351 UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script);
1352 return uscript_isRightToLeft(scriptCode);
1353}
1354
1355U_NAMESPACE_BEGIN
1356
1357UBool
1358Locale::isRightToLeft() const {
1359 return uloc_isRightToLeft(getBaseName());
1360}
1361
1362U_NAMESPACE_END
1363
1364// The following must at least allow for rg key value (6) plus terminator (1).
1365#define ULOC_RG_BUFLEN 8
1366
1367U_CAPI int32_t U_EXPORT2
1368ulocimp_getRegionForSupplementalData(const char *localeID, UBool inferRegion,
1369 char *region, int32_t regionCapacity, UErrorCode* status) {
1370 if (U_FAILURE(*status)) {
1371 return 0;
1372 }
1373 char rgBuf[ULOC_RG_BUFLEN];
1374 UErrorCode rgStatus = U_ZERO_ERROR;
1375
1376 // First check for rg keyword value
1377 int32_t rgLen = uloc_getKeywordValue(localeID, "rg", rgBuf, ULOC_RG_BUFLEN, &rgStatus);
1378 if (U_FAILURE(rgStatus) || rgLen != 6) {
1379 rgLen = 0;
1380 } else {
1381 // rgBuf guaranteed to be zero terminated here, with text len 6
1382 char *rgPtr = rgBuf;
1383 for (; *rgPtr!= 0; rgPtr++) {
1384 *rgPtr = uprv_toupper(*rgPtr);
1385 }
1386 rgLen = (uprv_strcmp(rgBuf+2, "ZZZZ") == 0)? 2: 0;
1387 }
1388
1389 if (rgLen == 0) {
1390 // No valid rg keyword value, try for unicode_region_subtag
1391 rgLen = uloc_getCountry(localeID, rgBuf, ULOC_RG_BUFLEN, status);
1392 if (U_FAILURE(*status)) {
1393 rgLen = 0;
1394 } else if (rgLen == 0 && inferRegion) {
1395 // no unicode_region_subtag but inferRegion true, try likely subtags
1396 rgStatus = U_ZERO_ERROR;
1397 icu::CharString locBuf;
1398 {
1399 icu::CharStringByteSink sink(&locBuf);
1400 ulocimp_addLikelySubtags(localeID, sink, &rgStatus);
1401 }
1402 if (U_SUCCESS(rgStatus)) {
1403 rgLen = uloc_getCountry(locBuf.data(), rgBuf, ULOC_RG_BUFLEN, status);
1404 if (U_FAILURE(*status)) {
1405 rgLen = 0;
1406 }
1407 }
1408 }
1409 }
1410
1411 rgBuf[rgLen] = 0;
1412 uprv_strncpy(region, rgBuf, regionCapacity);
1413 return u_terminateChars(region, regionCapacity, rgLen, status);
1414}
1415
1416