1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 1997-2016, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: loclikely.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2010feb25
16* created by: Markus W. Scherer
17*
18* Code for likely and minimized locale subtags, separated out from other .cpp files
19* that then do not depend on resource bundle code and likely-subtags data.
20*/
21
22#include "unicode/bytestream.h"
23#include "unicode/utypes.h"
24#include "unicode/locid.h"
25#include "unicode/putil.h"
26#include "unicode/uchar.h"
27#include "unicode/uloc.h"
28#include "unicode/ures.h"
29#include "unicode/uscript.h"
30#include "bytesinkutil.h"
31#include "charstr.h"
32#include "cmemory.h"
33#include "cstring.h"
34#include "ulocimp.h"
35#include "ustr_imp.h"
36
/**
 * Canonical spellings of the "unknown" subtags used throughout this file:
 * one each for language, script and region.
 **/
static const char* const unknownLanguage = "und";   /* unknown language */
static const char* const unknownScript   = "Zzzz";  /* unknown script   */
static const char* const unknownRegion   = "ZZ";    /* unknown region   */
43
44/**
45 * This function looks for the localeID in the likelySubtags resource.
46 *
47 * @param localeID The tag to find.
48 * @param buffer A buffer to hold the matching entry
49 * @param bufferLength The length of the output buffer
50 * @return A pointer to "buffer" if found, or a null pointer if not.
51 */
52static const char* U_CALLCONV
53findLikelySubtags(const char* localeID,
54 char* buffer,
55 int32_t bufferLength,
56 UErrorCode* err) {
57 const char* result = NULL;
58
59 if (!U_FAILURE(*err)) {
60 int32_t resLen = 0;
61 const UChar* s = NULL;
62 UErrorCode tmpErr = U_ZERO_ERROR;
63 icu::LocalUResourceBundlePointer subtags(ures_openDirect(NULL, "likelySubtags", &tmpErr));
64 if (U_SUCCESS(tmpErr)) {
65 icu::CharString und;
66 if (localeID != NULL) {
67 if (*localeID == '\0') {
68 localeID = unknownLanguage;
69 } else if (*localeID == '_') {
70 und.append(unknownLanguage, *err);
71 und.append(localeID, *err);
72 if (U_FAILURE(*err)) {
73 return NULL;
74 }
75 localeID = und.data();
76 }
77 }
78 s = ures_getStringByKey(subtags.getAlias(), localeID, &resLen, &tmpErr);
79
80 if (U_FAILURE(tmpErr)) {
81 /*
82 * If a resource is missing, it's not really an error, it's
83 * just that we don't have any data for that particular locale ID.
84 */
85 if (tmpErr != U_MISSING_RESOURCE_ERROR) {
86 *err = tmpErr;
87 }
88 }
89 else if (resLen >= bufferLength) {
90 /* The buffer should never overflow. */
91 *err = U_INTERNAL_PROGRAM_ERROR;
92 }
93 else {
94 u_UCharsToChars(s, buffer, resLen + 1);
95 if (resLen >= 3 &&
96 uprv_strnicmp(buffer, unknownLanguage, 3) == 0 &&
97 (resLen == 3 || buffer[3] == '_')) {
98 uprv_memmove(buffer, buffer + 3, resLen - 3 + 1);
99 }
100 result = buffer;
101 }
102 } else {
103 *err = tmpErr;
104 }
105 }
106
107 return result;
108}
109
110/**
111 * Append a tag to a buffer, adding the separator if necessary. The buffer
112 * must be large enough to contain the resulting tag plus any separator
113 * necessary. The tag must not be a zero-length string.
114 *
115 * @param tag The tag to add.
116 * @param tagLength The length of the tag.
117 * @param buffer The output buffer.
118 * @param bufferLength The length of the output buffer. This is an input/ouput parameter.
119 **/
120static void U_CALLCONV
121appendTag(
122 const char* tag,
123 int32_t tagLength,
124 char* buffer,
125 int32_t* bufferLength,
126 UBool withSeparator) {
127
128 if (withSeparator) {
129 buffer[*bufferLength] = '_';
130 ++(*bufferLength);
131 }
132
133 uprv_memmove(
134 &buffer[*bufferLength],
135 tag,
136 tagLength);
137
138 *bufferLength += tagLength;
139}
140
141/**
142 * Create a tag string from the supplied parameters. The lang, script and region
143 * parameters may be NULL pointers. If they are, their corresponding length parameters
144 * must be less than or equal to 0.
145 *
146 * If any of the language, script or region parameters are empty, and the alternateTags
147 * parameter is not NULL, it will be parsed for potential language, script and region tags
148 * to be used when constructing the new tag. If the alternateTags parameter is NULL, or
149 * it contains no language tag, the default tag for the unknown language is used.
150 *
151 * If the length of the new string exceeds the capacity of the output buffer,
152 * the function copies as many bytes to the output buffer as it can, and returns
153 * the error U_BUFFER_OVERFLOW_ERROR.
154 *
155 * If an illegal argument is provided, the function returns the error
156 * U_ILLEGAL_ARGUMENT_ERROR.
157 *
158 * Note that this function can return the warning U_STRING_NOT_TERMINATED_WARNING if
159 * the tag string fits in the output buffer, but the null terminator doesn't.
160 *
161 * @param lang The language tag to use.
162 * @param langLength The length of the language tag.
163 * @param script The script tag to use.
164 * @param scriptLength The length of the script tag.
165 * @param region The region tag to use.
166 * @param regionLength The length of the region tag.
167 * @param trailing Any trailing data to append to the new tag.
168 * @param trailingLength The length of the trailing data.
169 * @param alternateTags A string containing any alternate tags.
170 * @param sink The output sink receiving the tag string.
171 * @param err A pointer to a UErrorCode for error reporting.
172 **/
173static void U_CALLCONV
174createTagStringWithAlternates(
175 const char* lang,
176 int32_t langLength,
177 const char* script,
178 int32_t scriptLength,
179 const char* region,
180 int32_t regionLength,
181 const char* trailing,
182 int32_t trailingLength,
183 const char* alternateTags,
184 icu::ByteSink& sink,
185 UErrorCode* err) {
186
187 if (U_FAILURE(*err)) {
188 goto error;
189 }
190 else if (langLength >= ULOC_LANG_CAPACITY ||
191 scriptLength >= ULOC_SCRIPT_CAPACITY ||
192 regionLength >= ULOC_COUNTRY_CAPACITY) {
193 goto error;
194 }
195 else {
196 /**
197 * ULOC_FULLNAME_CAPACITY will provide enough capacity
198 * that we can build a string that contains the language,
199 * script and region code without worrying about overrunning
200 * the user-supplied buffer.
201 **/
202 char tagBuffer[ULOC_FULLNAME_CAPACITY];
203 int32_t tagLength = 0;
204 UBool regionAppended = FALSE;
205
206 if (langLength > 0) {
207 appendTag(
208 lang,
209 langLength,
210 tagBuffer,
211 &tagLength,
212 /*withSeparator=*/FALSE);
213 }
214 else if (alternateTags == NULL) {
215 /*
216 * Use the empty string for an unknown language, if
217 * we found no language.
218 */
219 }
220 else {
221 /*
222 * Parse the alternateTags string for the language.
223 */
224 char alternateLang[ULOC_LANG_CAPACITY];
225 int32_t alternateLangLength = sizeof(alternateLang);
226
227 alternateLangLength =
228 uloc_getLanguage(
229 alternateTags,
230 alternateLang,
231 alternateLangLength,
232 err);
233 if(U_FAILURE(*err) ||
234 alternateLangLength >= ULOC_LANG_CAPACITY) {
235 goto error;
236 }
237 else if (alternateLangLength == 0) {
238 /*
239 * Use the empty string for an unknown language, if
240 * we found no language.
241 */
242 }
243 else {
244 appendTag(
245 alternateLang,
246 alternateLangLength,
247 tagBuffer,
248 &tagLength,
249 /*withSeparator=*/FALSE);
250 }
251 }
252
253 if (scriptLength > 0) {
254 appendTag(
255 script,
256 scriptLength,
257 tagBuffer,
258 &tagLength,
259 /*withSeparator=*/TRUE);
260 }
261 else if (alternateTags != NULL) {
262 /*
263 * Parse the alternateTags string for the script.
264 */
265 char alternateScript[ULOC_SCRIPT_CAPACITY];
266
267 const int32_t alternateScriptLength =
268 uloc_getScript(
269 alternateTags,
270 alternateScript,
271 sizeof(alternateScript),
272 err);
273
274 if (U_FAILURE(*err) ||
275 alternateScriptLength >= ULOC_SCRIPT_CAPACITY) {
276 goto error;
277 }
278 else if (alternateScriptLength > 0) {
279 appendTag(
280 alternateScript,
281 alternateScriptLength,
282 tagBuffer,
283 &tagLength,
284 /*withSeparator=*/TRUE);
285 }
286 }
287
288 if (regionLength > 0) {
289 appendTag(
290 region,
291 regionLength,
292 tagBuffer,
293 &tagLength,
294 /*withSeparator=*/TRUE);
295
296 regionAppended = TRUE;
297 }
298 else if (alternateTags != NULL) {
299 /*
300 * Parse the alternateTags string for the region.
301 */
302 char alternateRegion[ULOC_COUNTRY_CAPACITY];
303
304 const int32_t alternateRegionLength =
305 uloc_getCountry(
306 alternateTags,
307 alternateRegion,
308 sizeof(alternateRegion),
309 err);
310 if (U_FAILURE(*err) ||
311 alternateRegionLength >= ULOC_COUNTRY_CAPACITY) {
312 goto error;
313 }
314 else if (alternateRegionLength > 0) {
315 appendTag(
316 alternateRegion,
317 alternateRegionLength,
318 tagBuffer,
319 &tagLength,
320 /*withSeparator=*/TRUE);
321
322 regionAppended = TRUE;
323 }
324 }
325
326 /**
327 * Copy the partial tag from our internal buffer to the supplied
328 * target.
329 **/
330 sink.Append(tagBuffer, tagLength);
331
332 if (trailingLength > 0) {
333 if (*trailing != '@') {
334 sink.Append("_", 1);
335 if (!regionAppended) {
336 /* extra separator is required */
337 sink.Append("_", 1);
338 }
339 }
340
341 /*
342 * Copy the trailing data into the supplied buffer.
343 */
344 sink.Append(trailing, trailingLength);
345 }
346
347 return;
348 }
349
350error:
351
352 /**
353 * An overflow indicates the locale ID passed in
354 * is ill-formed. If we got here, and there was
355 * no previous error, it's an implicit overflow.
356 **/
357 if (*err == U_BUFFER_OVERFLOW_ERROR ||
358 U_SUCCESS(*err)) {
359 *err = U_ILLEGAL_ARGUMENT_ERROR;
360 }
361}
362
363/**
364 * Create a tag string from the supplied parameters. The lang, script and region
365 * parameters may be NULL pointers. If they are, their corresponding length parameters
366 * must be less than or equal to 0. If the lang parameter is an empty string, the
367 * default value for an unknown language is written to the output buffer.
368 *
369 * If the length of the new string exceeds the capacity of the output buffer,
370 * the function copies as many bytes to the output buffer as it can, and returns
371 * the error U_BUFFER_OVERFLOW_ERROR.
372 *
373 * If an illegal argument is provided, the function returns the error
374 * U_ILLEGAL_ARGUMENT_ERROR.
375 *
376 * @param lang The language tag to use.
377 * @param langLength The length of the language tag.
378 * @param script The script tag to use.
379 * @param scriptLength The length of the script tag.
380 * @param region The region tag to use.
381 * @param regionLength The length of the region tag.
382 * @param trailing Any trailing data to append to the new tag.
383 * @param trailingLength The length of the trailing data.
384 * @param sink The output sink receiving the tag string.
385 * @param err A pointer to a UErrorCode for error reporting.
386 **/
387static void U_CALLCONV
388createTagString(
389 const char* lang,
390 int32_t langLength,
391 const char* script,
392 int32_t scriptLength,
393 const char* region,
394 int32_t regionLength,
395 const char* trailing,
396 int32_t trailingLength,
397 icu::ByteSink& sink,
398 UErrorCode* err)
399{
400 createTagStringWithAlternates(
401 lang,
402 langLength,
403 script,
404 scriptLength,
405 region,
406 regionLength,
407 trailing,
408 trailingLength,
409 NULL,
410 sink,
411 err);
412}
413
414/**
415 * Parse the language, script, and region subtags from a tag string, and copy the
416 * results into the corresponding output parameters. The buffers are null-terminated,
417 * unless overflow occurs.
418 *
419 * The langLength, scriptLength, and regionLength parameters are input/output
420 * parameters, and must contain the capacity of their corresponding buffers on
421 * input. On output, they will contain the actual length of the buffers, not
422 * including the null terminator.
423 *
424 * If the length of any of the output subtags exceeds the capacity of the corresponding
425 * buffer, the function copies as many bytes to the output buffer as it can, and returns
426 * the error U_BUFFER_OVERFLOW_ERROR. It will not parse any more subtags once overflow
427 * occurs.
428 *
429 * If an illegal argument is provided, the function returns the error
430 * U_ILLEGAL_ARGUMENT_ERROR.
431 *
432 * @param localeID The locale ID to parse.
433 * @param lang The language tag buffer.
434 * @param langLength The length of the language tag.
435 * @param script The script tag buffer.
436 * @param scriptLength The length of the script tag.
437 * @param region The region tag buffer.
438 * @param regionLength The length of the region tag.
439 * @param err A pointer to a UErrorCode for error reporting.
440 * @return The number of chars of the localeID parameter consumed.
441 **/
442static int32_t U_CALLCONV
443parseTagString(
444 const char* localeID,
445 char* lang,
446 int32_t* langLength,
447 char* script,
448 int32_t* scriptLength,
449 char* region,
450 int32_t* regionLength,
451 UErrorCode* err)
452{
453 const char* position = localeID;
454 int32_t subtagLength = 0;
455
456 if(U_FAILURE(*err) ||
457 localeID == NULL ||
458 lang == NULL ||
459 langLength == NULL ||
460 script == NULL ||
461 scriptLength == NULL ||
462 region == NULL ||
463 regionLength == NULL) {
464 goto error;
465 }
466
467 subtagLength = ulocimp_getLanguage(position, lang, *langLength, &position);
468 u_terminateChars(lang, *langLength, subtagLength, err);
469
470 /*
471 * Note that we explicit consider U_STRING_NOT_TERMINATED_WARNING
472 * to be an error, because it indicates the user-supplied tag is
473 * not well-formed.
474 */
475 if(U_FAILURE(*err)) {
476 goto error;
477 }
478
479 *langLength = subtagLength;
480
481 /*
482 * If no language was present, use the empty string instead.
483 * Otherwise, move past any separator.
484 */
485 if (_isIDSeparator(*position)) {
486 ++position;
487 }
488
489 subtagLength = ulocimp_getScript(position, script, *scriptLength, &position);
490 u_terminateChars(script, *scriptLength, subtagLength, err);
491
492 if(U_FAILURE(*err)) {
493 goto error;
494 }
495
496 *scriptLength = subtagLength;
497
498 if (*scriptLength > 0) {
499 if (uprv_strnicmp(script, unknownScript, *scriptLength) == 0) {
500 /**
501 * If the script part is the "unknown" script, then don't return it.
502 **/
503 *scriptLength = 0;
504 }
505
506 /*
507 * Move past any separator.
508 */
509 if (_isIDSeparator(*position)) {
510 ++position;
511 }
512 }
513
514 subtagLength = ulocimp_getCountry(position, region, *regionLength, &position);
515 u_terminateChars(region, *regionLength, subtagLength, err);
516
517 if(U_FAILURE(*err)) {
518 goto error;
519 }
520
521 *regionLength = subtagLength;
522
523 if (*regionLength > 0) {
524 if (uprv_strnicmp(region, unknownRegion, *regionLength) == 0) {
525 /**
526 * If the region part is the "unknown" region, then don't return it.
527 **/
528 *regionLength = 0;
529 }
530 } else if (*position != 0 && *position != '@') {
531 /* back up over consumed trailing separator */
532 --position;
533 }
534
535exit:
536
537 return (int32_t)(position - localeID);
538
539error:
540
541 /**
542 * If we get here, we have no explicit error, it's the result of an
543 * illegal argument.
544 **/
545 if (!U_FAILURE(*err)) {
546 *err = U_ILLEGAL_ARGUMENT_ERROR;
547 }
548
549 goto exit;
550}
551
552static UBool U_CALLCONV
553createLikelySubtagsString(
554 const char* lang,
555 int32_t langLength,
556 const char* script,
557 int32_t scriptLength,
558 const char* region,
559 int32_t regionLength,
560 const char* variants,
561 int32_t variantsLength,
562 icu::ByteSink& sink,
563 UErrorCode* err) {
564 /**
565 * ULOC_FULLNAME_CAPACITY will provide enough capacity
566 * that we can build a string that contains the language,
567 * script and region code without worrying about overrunning
568 * the user-supplied buffer.
569 **/
570 char likelySubtagsBuffer[ULOC_FULLNAME_CAPACITY];
571
572 if(U_FAILURE(*err)) {
573 goto error;
574 }
575
576 /**
577 * Try the language with the script and region first.
578 **/
579 if (scriptLength > 0 && regionLength > 0) {
580
581 const char* likelySubtags = NULL;
582
583 icu::CharString tagBuffer;
584 {
585 icu::CharStringByteSink sink(&tagBuffer);
586 createTagString(
587 lang,
588 langLength,
589 script,
590 scriptLength,
591 region,
592 regionLength,
593 NULL,
594 0,
595 sink,
596 err);
597 }
598 if(U_FAILURE(*err)) {
599 goto error;
600 }
601
602 likelySubtags =
603 findLikelySubtags(
604 tagBuffer.data(),
605 likelySubtagsBuffer,
606 sizeof(likelySubtagsBuffer),
607 err);
608 if(U_FAILURE(*err)) {
609 goto error;
610 }
611
612 if (likelySubtags != NULL) {
613 /* Always use the language tag from the
614 maximal string, since it may be more
615 specific than the one provided. */
616 createTagStringWithAlternates(
617 NULL,
618 0,
619 NULL,
620 0,
621 NULL,
622 0,
623 variants,
624 variantsLength,
625 likelySubtags,
626 sink,
627 err);
628 return TRUE;
629 }
630 }
631
632 /**
633 * Try the language with just the script.
634 **/
635 if (scriptLength > 0) {
636
637 const char* likelySubtags = NULL;
638
639 icu::CharString tagBuffer;
640 {
641 icu::CharStringByteSink sink(&tagBuffer);
642 createTagString(
643 lang,
644 langLength,
645 script,
646 scriptLength,
647 NULL,
648 0,
649 NULL,
650 0,
651 sink,
652 err);
653 }
654 if(U_FAILURE(*err)) {
655 goto error;
656 }
657
658 likelySubtags =
659 findLikelySubtags(
660 tagBuffer.data(),
661 likelySubtagsBuffer,
662 sizeof(likelySubtagsBuffer),
663 err);
664 if(U_FAILURE(*err)) {
665 goto error;
666 }
667
668 if (likelySubtags != NULL) {
669 /* Always use the language tag from the
670 maximal string, since it may be more
671 specific than the one provided. */
672 createTagStringWithAlternates(
673 NULL,
674 0,
675 NULL,
676 0,
677 region,
678 regionLength,
679 variants,
680 variantsLength,
681 likelySubtags,
682 sink,
683 err);
684 return TRUE;
685 }
686 }
687
688 /**
689 * Try the language with just the region.
690 **/
691 if (regionLength > 0) {
692
693 const char* likelySubtags = NULL;
694
695 icu::CharString tagBuffer;
696 {
697 icu::CharStringByteSink sink(&tagBuffer);
698 createTagString(
699 lang,
700 langLength,
701 NULL,
702 0,
703 region,
704 regionLength,
705 NULL,
706 0,
707 sink,
708 err);
709 }
710 if(U_FAILURE(*err)) {
711 goto error;
712 }
713
714 likelySubtags =
715 findLikelySubtags(
716 tagBuffer.data(),
717 likelySubtagsBuffer,
718 sizeof(likelySubtagsBuffer),
719 err);
720 if(U_FAILURE(*err)) {
721 goto error;
722 }
723
724 if (likelySubtags != NULL) {
725 /* Always use the language tag from the
726 maximal string, since it may be more
727 specific than the one provided. */
728 createTagStringWithAlternates(
729 NULL,
730 0,
731 script,
732 scriptLength,
733 NULL,
734 0,
735 variants,
736 variantsLength,
737 likelySubtags,
738 sink,
739 err);
740 return TRUE;
741 }
742 }
743
744 /**
745 * Finally, try just the language.
746 **/
747 {
748 const char* likelySubtags = NULL;
749
750 icu::CharString tagBuffer;
751 {
752 icu::CharStringByteSink sink(&tagBuffer);
753 createTagString(
754 lang,
755 langLength,
756 NULL,
757 0,
758 NULL,
759 0,
760 NULL,
761 0,
762 sink,
763 err);
764 }
765 if(U_FAILURE(*err)) {
766 goto error;
767 }
768
769 likelySubtags =
770 findLikelySubtags(
771 tagBuffer.data(),
772 likelySubtagsBuffer,
773 sizeof(likelySubtagsBuffer),
774 err);
775 if(U_FAILURE(*err)) {
776 goto error;
777 }
778
779 if (likelySubtags != NULL) {
780 /* Always use the language tag from the
781 maximal string, since it may be more
782 specific than the one provided. */
783 createTagStringWithAlternates(
784 NULL,
785 0,
786 script,
787 scriptLength,
788 region,
789 regionLength,
790 variants,
791 variantsLength,
792 likelySubtags,
793 sink,
794 err);
795 return TRUE;
796 }
797 }
798
799 return FALSE;
800
801error:
802
803 if (!U_FAILURE(*err)) {
804 *err = U_ILLEGAL_ARGUMENT_ERROR;
805 }
806
807 return FALSE;
808}
809
/**
 * Scans the trailing part of a locale ID and jumps to the enclosing
 * function's "error" label if a subtag is too long.
 *
 * A subtag is a run of chars between '-' or '_' separators. Scanning stops
 * at the first '@'; nothing after it is checked.
 *
 * NOTE(review): the check fires when a char is seen after 9 chars have
 * already been counted, so subtags of up to 9 chars are accepted, not 8.
 * This looks like an off-by-one, but it is kept as-is to preserve the
 * accepted inputs; confirm before tightening.
 *
 * The function using this macro must define an "error:" label.
 *
 * @param trailing        The trailing part of the locale ID.
 * @param trailingLength  The number of chars in trailing.
 */
#define CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t count = 0; \
    int32_t i; \
    for (i = 0; i < trailingLength; i++) { \
        if (trailing[i] == '-' || trailing[i] == '_') { \
            /* A separator starts a new subtag. */ \
            count = 0; \
        } else if (trailing[i] == '@') { \
            break; \
        } else if (count > 8) { \
            goto error; \
        } else { \
            count++; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
828
829static UBool
830_uloc_addLikelySubtags(const char* localeID,
831 icu::ByteSink& sink,
832 UErrorCode* err) {
833 char lang[ULOC_LANG_CAPACITY];
834 int32_t langLength = sizeof(lang);
835 char script[ULOC_SCRIPT_CAPACITY];
836 int32_t scriptLength = sizeof(script);
837 char region[ULOC_COUNTRY_CAPACITY];
838 int32_t regionLength = sizeof(region);
839 const char* trailing = "";
840 int32_t trailingLength = 0;
841 int32_t trailingIndex = 0;
842 UBool success = FALSE;
843
844 if(U_FAILURE(*err)) {
845 goto error;
846 }
847 if (localeID == NULL) {
848 goto error;
849 }
850
851 trailingIndex = parseTagString(
852 localeID,
853 lang,
854 &langLength,
855 script,
856 &scriptLength,
857 region,
858 &regionLength,
859 err);
860 if(U_FAILURE(*err)) {
861 /* Overflow indicates an illegal argument error */
862 if (*err == U_BUFFER_OVERFLOW_ERROR) {
863 *err = U_ILLEGAL_ARGUMENT_ERROR;
864 }
865
866 goto error;
867 }
868
869 /* Find the length of the trailing portion. */
870 while (_isIDSeparator(localeID[trailingIndex])) {
871 trailingIndex++;
872 }
873 trailing = &localeID[trailingIndex];
874 trailingLength = (int32_t)uprv_strlen(trailing);
875
876 CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
877
878 success =
879 createLikelySubtagsString(
880 lang,
881 langLength,
882 script,
883 scriptLength,
884 region,
885 regionLength,
886 trailing,
887 trailingLength,
888 sink,
889 err);
890
891 if (!success) {
892 const int32_t localIDLength = (int32_t)uprv_strlen(localeID);
893
894 /*
895 * If we get here, we need to return localeID.
896 */
897 sink.Append(localeID, localIDLength);
898 }
899
900 return success;
901
902error:
903
904 if (!U_FAILURE(*err)) {
905 *err = U_ILLEGAL_ARGUMENT_ERROR;
906 }
907 return FALSE;
908}
909
910// Add likely subtags to the sink
911// return true if the value in the sink is produced by a match during the lookup
912// return false if the value in the sink is the same as input because there are
913// no match after the lookup.
914static UBool _ulocimp_addLikelySubtags(const char*, icu::ByteSink&, UErrorCode*);
915
916static void
917_uloc_minimizeSubtags(const char* localeID,
918 icu::ByteSink& sink,
919 UErrorCode* err) {
920 icu::CharString maximizedTagBuffer;
921
922 char lang[ULOC_LANG_CAPACITY];
923 int32_t langLength = sizeof(lang);
924 char script[ULOC_SCRIPT_CAPACITY];
925 int32_t scriptLength = sizeof(script);
926 char region[ULOC_COUNTRY_CAPACITY];
927 int32_t regionLength = sizeof(region);
928 const char* trailing = "";
929 int32_t trailingLength = 0;
930 int32_t trailingIndex = 0;
931 UBool successGetMax = FALSE;
932
933 if(U_FAILURE(*err)) {
934 goto error;
935 }
936 else if (localeID == NULL) {
937 goto error;
938 }
939
940 trailingIndex =
941 parseTagString(
942 localeID,
943 lang,
944 &langLength,
945 script,
946 &scriptLength,
947 region,
948 &regionLength,
949 err);
950 if(U_FAILURE(*err)) {
951
952 /* Overflow indicates an illegal argument error */
953 if (*err == U_BUFFER_OVERFLOW_ERROR) {
954 *err = U_ILLEGAL_ARGUMENT_ERROR;
955 }
956
957 goto error;
958 }
959
960 /* Find the spot where the variants or the keywords begin, if any. */
961 while (_isIDSeparator(localeID[trailingIndex])) {
962 trailingIndex++;
963 }
964 trailing = &localeID[trailingIndex];
965 trailingLength = (int32_t)uprv_strlen(trailing);
966
967 CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
968
969 {
970 icu::CharString base;
971 {
972 icu::CharStringByteSink baseSink(&base);
973 createTagString(
974 lang,
975 langLength,
976 script,
977 scriptLength,
978 region,
979 regionLength,
980 NULL,
981 0,
982 baseSink,
983 err);
984 }
985
986 /**
987 * First, we need to first get the maximization
988 * from AddLikelySubtags.
989 **/
990 {
991 icu::CharStringByteSink maxSink(&maximizedTagBuffer);
992 successGetMax = _ulocimp_addLikelySubtags(base.data(), maxSink, err);
993 }
994 }
995
996 if(U_FAILURE(*err)) {
997 goto error;
998 }
999
1000 if (!successGetMax) {
1001 /**
1002 * If we got here, return the locale ID parameter unchanged.
1003 **/
1004 const int32_t localeIDLength = (int32_t)uprv_strlen(localeID);
1005 sink.Append(localeID, localeIDLength);
1006 return;
1007 }
1008
1009 // In the following, the lang, script, region are referring to those in
1010 // the maximizedTagBuffer, not the one in the localeID.
1011 langLength = sizeof(lang);
1012 scriptLength = sizeof(script);
1013 regionLength = sizeof(region);
1014 parseTagString(
1015 maximizedTagBuffer.data(),
1016 lang,
1017 &langLength,
1018 script,
1019 &scriptLength,
1020 region,
1021 &regionLength,
1022 err);
1023 if(U_FAILURE(*err)) {
1024 goto error;
1025 }
1026
1027 /**
1028 * Start first with just the language.
1029 **/
1030 {
1031 icu::CharString tagBuffer;
1032 {
1033 icu::CharStringByteSink tagSink(&tagBuffer);
1034 createLikelySubtagsString(
1035 lang,
1036 langLength,
1037 NULL,
1038 0,
1039 NULL,
1040 0,
1041 NULL,
1042 0,
1043 tagSink,
1044 err);
1045 }
1046
1047 if(U_FAILURE(*err)) {
1048 goto error;
1049 }
1050 else if (!tagBuffer.isEmpty() &&
1051 uprv_strnicmp(
1052 maximizedTagBuffer.data(),
1053 tagBuffer.data(),
1054 tagBuffer.length()) == 0) {
1055
1056 createTagString(
1057 lang,
1058 langLength,
1059 NULL,
1060 0,
1061 NULL,
1062 0,
1063 trailing,
1064 trailingLength,
1065 sink,
1066 err);
1067 return;
1068 }
1069 }
1070
1071 /**
1072 * Next, try the language and region.
1073 **/
1074 if (regionLength > 0) {
1075
1076 icu::CharString tagBuffer;
1077 {
1078 icu::CharStringByteSink tagSink(&tagBuffer);
1079 createLikelySubtagsString(
1080 lang,
1081 langLength,
1082 NULL,
1083 0,
1084 region,
1085 regionLength,
1086 NULL,
1087 0,
1088 tagSink,
1089 err);
1090 }
1091
1092 if(U_FAILURE(*err)) {
1093 goto error;
1094 }
1095 else if (!tagBuffer.isEmpty() &&
1096 uprv_strnicmp(
1097 maximizedTagBuffer.data(),
1098 tagBuffer.data(),
1099 tagBuffer.length()) == 0) {
1100
1101 createTagString(
1102 lang,
1103 langLength,
1104 NULL,
1105 0,
1106 region,
1107 regionLength,
1108 trailing,
1109 trailingLength,
1110 sink,
1111 err);
1112 return;
1113 }
1114 }
1115
1116 /**
1117 * Finally, try the language and script. This is our last chance,
1118 * since trying with all three subtags would only yield the
1119 * maximal version that we already have.
1120 **/
1121 if (scriptLength > 0) {
1122 icu::CharString tagBuffer;
1123 {
1124 icu::CharStringByteSink tagSink(&tagBuffer);
1125 createLikelySubtagsString(
1126 lang,
1127 langLength,
1128 script,
1129 scriptLength,
1130 NULL,
1131 0,
1132 NULL,
1133 0,
1134 tagSink,
1135 err);
1136 }
1137
1138 if(U_FAILURE(*err)) {
1139 goto error;
1140 }
1141 else if (!tagBuffer.isEmpty() &&
1142 uprv_strnicmp(
1143 maximizedTagBuffer.data(),
1144 tagBuffer.data(),
1145 tagBuffer.length()) == 0) {
1146
1147 createTagString(
1148 lang,
1149 langLength,
1150 script,
1151 scriptLength,
1152 NULL,
1153 0,
1154 trailing,
1155 trailingLength,
1156 sink,
1157 err);
1158 return;
1159 }
1160 }
1161
1162 {
1163 /**
1164 * If we got here, return the max + trail.
1165 **/
1166 createTagString(
1167 lang,
1168 langLength,
1169 script,
1170 scriptLength,
1171 region,
1172 regionLength,
1173 trailing,
1174 trailingLength,
1175 sink,
1176 err);
1177 return;
1178 }
1179
1180error:
1181
1182 if (!U_FAILURE(*err)) {
1183 *err = U_ILLEGAL_ARGUMENT_ERROR;
1184 }
1185}
1186
1187static UBool
1188do_canonicalize(const char* localeID,
1189 char* buffer,
1190 int32_t bufferCapacity,
1191 UErrorCode* err)
1192{
1193 uloc_canonicalize(
1194 localeID,
1195 buffer,
1196 bufferCapacity,
1197 err);
1198
1199 if (*err == U_STRING_NOT_TERMINATED_WARNING ||
1200 *err == U_BUFFER_OVERFLOW_ERROR) {
1201 *err = U_ILLEGAL_ARGUMENT_ERROR;
1202
1203 return FALSE;
1204 }
1205 else if (U_FAILURE(*err)) {
1206
1207 return FALSE;
1208 }
1209 else {
1210 return TRUE;
1211 }
1212}
1213
1214U_CAPI int32_t U_EXPORT2
1215uloc_addLikelySubtags(const char* localeID,
1216 char* maximizedLocaleID,
1217 int32_t maximizedLocaleIDCapacity,
1218 UErrorCode* status) {
1219 if (U_FAILURE(*status)) {
1220 return 0;
1221 }
1222
1223 icu::CheckedArrayByteSink sink(
1224 maximizedLocaleID, maximizedLocaleIDCapacity);
1225
1226 ulocimp_addLikelySubtags(localeID, sink, status);
1227 int32_t reslen = sink.NumberOfBytesAppended();
1228
1229 if (U_FAILURE(*status)) {
1230 return sink.Overflowed() ? reslen : -1;
1231 }
1232
1233 if (sink.Overflowed()) {
1234 *status = U_BUFFER_OVERFLOW_ERROR;
1235 } else {
1236 u_terminateChars(
1237 maximizedLocaleID, maximizedLocaleIDCapacity, reslen, status);
1238 }
1239
1240 return reslen;
1241}
1242
1243static UBool
1244_ulocimp_addLikelySubtags(const char* localeID,
1245 icu::ByteSink& sink,
1246 UErrorCode* status) {
1247 char localeBuffer[ULOC_FULLNAME_CAPACITY];
1248
1249 if (do_canonicalize(localeID, localeBuffer, sizeof localeBuffer, status)) {
1250 return _uloc_addLikelySubtags(localeBuffer, sink, status);
1251 }
1252 return FALSE;
1253}
1254
1255U_CAPI void U_EXPORT2
1256ulocimp_addLikelySubtags(const char* localeID,
1257 icu::ByteSink& sink,
1258 UErrorCode* status) {
1259 _ulocimp_addLikelySubtags(localeID, sink, status);
1260}
1261
1262U_CAPI int32_t U_EXPORT2
1263uloc_minimizeSubtags(const char* localeID,
1264 char* minimizedLocaleID,
1265 int32_t minimizedLocaleIDCapacity,
1266 UErrorCode* status) {
1267 if (U_FAILURE(*status)) {
1268 return 0;
1269 }
1270
1271 icu::CheckedArrayByteSink sink(
1272 minimizedLocaleID, minimizedLocaleIDCapacity);
1273
1274 ulocimp_minimizeSubtags(localeID, sink, status);
1275 int32_t reslen = sink.NumberOfBytesAppended();
1276
1277 if (U_FAILURE(*status)) {
1278 return sink.Overflowed() ? reslen : -1;
1279 }
1280
1281 if (sink.Overflowed()) {
1282 *status = U_BUFFER_OVERFLOW_ERROR;
1283 } else {
1284 u_terminateChars(
1285 minimizedLocaleID, minimizedLocaleIDCapacity, reslen, status);
1286 }
1287
1288 return reslen;
1289}
1290
1291U_CAPI void U_EXPORT2
1292ulocimp_minimizeSubtags(const char* localeID,
1293 icu::ByteSink& sink,
1294 UErrorCode* status) {
1295 char localeBuffer[ULOC_FULLNAME_CAPACITY];
1296
1297 if (do_canonicalize(localeID, localeBuffer, sizeof localeBuffer, status)) {
1298 _uloc_minimizeSubtags(localeBuffer, sink, status);
1299 }
1300}
1301
1302// Pairs of (language subtag, + or -) for finding out fast if common languages
1303// are LTR (minus) or RTL (plus).
1304static const char LANG_DIR_STRING[] =
1305 "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";
1306
1307// Implemented here because this calls ulocimp_addLikelySubtags().
1308U_CAPI UBool U_EXPORT2
1309uloc_isRightToLeft(const char *locale) {
1310 UErrorCode errorCode = U_ZERO_ERROR;
1311 char script[8];
1312 int32_t scriptLength = uloc_getScript(locale, script, UPRV_LENGTHOF(script), &errorCode);
1313 if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING ||
1314 scriptLength == 0) {
1315 // Fastpath: We know the likely scripts and their writing direction
1316 // for some common languages.
1317 errorCode = U_ZERO_ERROR;
1318 char lang[8];
1319 int32_t langLength = uloc_getLanguage(locale, lang, UPRV_LENGTHOF(lang), &errorCode);
1320 if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
1321 return FALSE;
1322 }
1323 if (langLength > 0) {
1324 const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang);
1325 if (langPtr != NULL) {
1326 switch (langPtr[langLength]) {
1327 case '-': return FALSE;
1328 case '+': return TRUE;
1329 default: break; // partial match of a longer code
1330 }
1331 }
1332 }
1333 // Otherwise, find the likely script.
1334 errorCode = U_ZERO_ERROR;
1335 icu::CharString likely;
1336 {
1337 icu::CharStringByteSink sink(&likely);
1338 ulocimp_addLikelySubtags(locale, sink, &errorCode);
1339 }
1340 if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
1341 return FALSE;
1342 }
1343 scriptLength = uloc_getScript(likely.data(), script, UPRV_LENGTHOF(script), &errorCode);
1344 if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING ||
1345 scriptLength == 0) {
1346 return FALSE;
1347 }
1348 }
1349 UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script);
1350 return uscript_isRightToLeft(scriptCode);
1351}
1352
1353U_NAMESPACE_BEGIN
1354
1355UBool
1356Locale::isRightToLeft() const {
1357 return uloc_isRightToLeft(getBaseName());
1358}
1359
1360U_NAMESPACE_END
1361
1362// The following must at least allow for rg key value (6) plus terminator (1).
1363#define ULOC_RG_BUFLEN 8
1364
1365U_CAPI int32_t U_EXPORT2
1366ulocimp_getRegionForSupplementalData(const char *localeID, UBool inferRegion,
1367 char *region, int32_t regionCapacity, UErrorCode* status) {
1368 if (U_FAILURE(*status)) {
1369 return 0;
1370 }
1371 char rgBuf[ULOC_RG_BUFLEN];
1372 UErrorCode rgStatus = U_ZERO_ERROR;
1373
1374 // First check for rg keyword value
1375 int32_t rgLen = uloc_getKeywordValue(localeID, "rg", rgBuf, ULOC_RG_BUFLEN, &rgStatus);
1376 if (U_FAILURE(rgStatus) || rgLen != 6) {
1377 rgLen = 0;
1378 } else {
1379 // rgBuf guaranteed to be zero terminated here, with text len 6
1380 char *rgPtr = rgBuf;
1381 for (; *rgPtr!= 0; rgPtr++) {
1382 *rgPtr = uprv_toupper(*rgPtr);
1383 }
1384 rgLen = (uprv_strcmp(rgBuf+2, "ZZZZ") == 0)? 2: 0;
1385 }
1386
1387 if (rgLen == 0) {
1388 // No valid rg keyword value, try for unicode_region_subtag
1389 rgLen = uloc_getCountry(localeID, rgBuf, ULOC_RG_BUFLEN, status);
1390 if (U_FAILURE(*status)) {
1391 rgLen = 0;
1392 } else if (rgLen == 0 && inferRegion) {
1393 // no unicode_region_subtag but inferRegion TRUE, try likely subtags
1394 rgStatus = U_ZERO_ERROR;
1395 icu::CharString locBuf;
1396 {
1397 icu::CharStringByteSink sink(&locBuf);
1398 ulocimp_addLikelySubtags(localeID, sink, &rgStatus);
1399 }
1400 if (U_SUCCESS(rgStatus)) {
1401 rgLen = uloc_getCountry(locBuf.data(), rgBuf, ULOC_RG_BUFLEN, status);
1402 if (U_FAILURE(*status)) {
1403 rgLen = 0;
1404 }
1405 }
1406 }
1407 }
1408
1409 rgBuf[rgLen] = 0;
1410 uprv_strncpy(region, rgBuf, regionCapacity);
1411 return u_terminateChars(region, regionCapacity, rgLen, status);
1412}
1413
1414