1// © 2019 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3
4// locdistance.cpp
5// created: 2019may08 Markus W. Scherer
6
7#include "unicode/utypes.h"
8#include "unicode/bytestrie.h"
9#include "unicode/localematcher.h"
10#include "unicode/locid.h"
11#include "unicode/uobject.h"
12#include "unicode/ures.h"
13#include "cstring.h"
14#include "locdistance.h"
15#include "loclikelysubtags.h"
16#include "uassert.h"
17#include "ucln_cmn.h"
18#include "uinvchar.h"
19#include "umutex.h"
20
21U_NAMESPACE_BEGIN
22
23namespace {
24
25/**
26 * Bit flag used on the last character of a subtag in the trie.
27 * Must be set consistently by the builder and the lookup code.
28 */
29constexpr int32_t END_OF_SUBTAG = 0x80;
30/** Distance value bit flag, set by the builder. */
31constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80;
32/** Distance value bit flag, set by trieNext(). */
33constexpr int32_t DISTANCE_IS_FINAL = 0x100;
34constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT;
35
36constexpr int32_t ABOVE_THRESHOLD = 100;
37
38// Indexes into array of distances.
39enum {
40 IX_DEF_LANG_DISTANCE,
41 IX_DEF_SCRIPT_DISTANCE,
42 IX_DEF_REGION_DISTANCE,
43 IX_MIN_REGION_DISTANCE,
44 IX_LIMIT
45};
46
47LocaleDistance *gLocaleDistance = nullptr;
48UInitOnce gInitOnce = U_INITONCE_INITIALIZER;
49
50UBool U_CALLCONV cleanup() {
51 delete gLocaleDistance;
52 gLocaleDistance = nullptr;
53 gInitOnce.reset();
54 return TRUE;
55}
56
57} // namespace
58
59void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) {
60 // This function is invoked only via umtx_initOnce().
61 U_ASSERT(gLocaleDistance == nullptr);
62 const XLikelySubtags &likely = *XLikelySubtags::getSingleton(errorCode);
63 if (U_FAILURE(errorCode)) { return; }
64 const LocaleDistanceData &data = likely.getDistanceData();
65 if (data.distanceTrieBytes == nullptr ||
66 data.regionToPartitions == nullptr || data.partitions == nullptr ||
67 // ok if no paradigms
68 data.distances == nullptr) {
69 errorCode = U_MISSING_RESOURCE_ERROR;
70 return;
71 }
72 gLocaleDistance = new LocaleDistance(data);
73 if (gLocaleDistance == nullptr) {
74 errorCode = U_MEMORY_ALLOCATION_ERROR;
75 return;
76 }
77 ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup);
78}
79
80const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) {
81 if (U_FAILURE(errorCode)) { return nullptr; }
82 umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode);
83 return gLocaleDistance;
84}
85
86LocaleDistance::LocaleDistance(const LocaleDistanceData &data) :
87 trie(data.distanceTrieBytes),
88 regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions),
89 paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength),
90 defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]),
91 defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]),
92 defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]),
93 minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) {
94 // For the default demotion value, use the
95 // default region distance between unrelated Englishes.
96 // Thus, unless demotion is turned off,
97 // a mere region difference for one desired locale
98 // is as good as a perfect match for the next following desired locale.
99 // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
100 LSR en("en", "Latn", "US");
101 LSR enGB("en", "Latn", "GB");
102 const LSR *p_enGB = &enGB;
103 defaultDemotionPerDesiredLocale = getBestIndexAndDistance(en, &p_enGB, 1,
104 50, ULOCMATCH_FAVOR_LANGUAGE) & 0xff;
105}
106
107int32_t LocaleDistance::getBestIndexAndDistance(
108 const LSR &desired,
109 const LSR **supportedLSRs, int32_t supportedLSRsLength,
110 int32_t threshold, ULocMatchFavorSubtag favorSubtag) const {
111 BytesTrie iter(trie);
112 // Look up the desired language only once for all supported LSRs.
113 // Its "distance" is either a match point value of 0, or a non-match negative value.
114 // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
115 int32_t desLangDistance = trieNext(iter, desired.language, false);
116 uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;
117 // Index of the supported LSR with the lowest distance.
118 int32_t bestIndex = -1;
119 for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) {
120 const LSR &supported = *supportedLSRs[slIndex];
121 bool star = false;
122 int32_t distance = desLangDistance;
123 if (distance >= 0) {
124 U_ASSERT((distance & DISTANCE_IS_FINAL) == 0);
125 if (slIndex != 0) {
126 iter.resetToState64(desLangState);
127 }
128 distance = trieNext(iter, supported.language, true);
129 }
130 // Note: The data builder verifies that there are no rules with "any" (*) language and
131 // real (non *) script or region subtags.
132 // This means that if the lookup for either language fails we can use
133 // the default distances without further lookups.
134 int32_t flags;
135 if (distance >= 0) {
136 flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
137 distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
138 } else { // <*, *>
139 if (uprv_strcmp(desired.language, supported.language) == 0) {
140 distance = 0;
141 } else {
142 distance = defaultLanguageDistance;
143 }
144 flags = 0;
145 star = true;
146 }
147 U_ASSERT(0 <= distance && distance <= 100);
148 // We implement "favor subtag" by reducing the language subtag distance
149 // (unscientifically reducing it to a quarter of the normal value),
150 // so that the script distance is relatively more important.
151 // For example, given a default language distance of 80, we reduce it to 20,
152 // which is below the default threshold of 50, which is the default script distance.
153 if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) {
154 distance >>= 2;
155 }
156 if (distance >= threshold) {
157 continue;
158 }
159
160 int32_t scriptDistance;
161 if (star || flags != 0) {
162 if (uprv_strcmp(desired.script, supported.script) == 0) {
163 scriptDistance = 0;
164 } else {
165 scriptDistance = defaultScriptDistance;
166 }
167 } else {
168 scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(),
169 desired.script, supported.script);
170 flags = scriptDistance & DISTANCE_IS_FINAL;
171 scriptDistance &= ~DISTANCE_IS_FINAL;
172 }
173 distance += scriptDistance;
174 if (distance >= threshold) {
175 continue;
176 }
177
178 if (uprv_strcmp(desired.region, supported.region) == 0) {
179 // regionDistance = 0
180 } else if (star || (flags & DISTANCE_IS_FINAL) != 0) {
181 distance += defaultRegionDistance;
182 } else {
183 int32_t remainingThreshold = threshold - distance;
184 if (minRegionDistance >= remainingThreshold) {
185 continue;
186 }
187
188 // From here on we know the regions are not equal.
189 // Map each region to zero or more partitions. (zero = one non-matching string)
190 // (Each array of single-character partition strings is encoded as one string.)
191 // If either side has more than one, then we find the maximum distance.
192 // This could be optimized by adding some more structure, but probably not worth it.
193 distance += getRegionPartitionsDistance(
194 iter, iter.getState64(),
195 partitionsForRegion(desired),
196 partitionsForRegion(supported),
197 remainingThreshold);
198 }
199 if (distance < threshold) {
200 if (distance == 0) {
201 return slIndex << 8;
202 }
203 bestIndex = slIndex;
204 threshold = distance;
205 }
206 }
207 return bestIndex >= 0 ? (bestIndex << 8) | threshold : 0xffffff00 | ABOVE_THRESHOLD;
208}
209
210int32_t LocaleDistance::getDesSuppScriptDistance(
211 BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) {
212 // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
213 int32_t distance = trieNext(iter, desired, false);
214 if (distance >= 0) {
215 distance = trieNext(iter, supported, true);
216 }
217 if (distance < 0) {
218 UStringTrieResult result = iter.resetToState64(startState).next(u'*'); // <*, *>
219 U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
220 if (uprv_strcmp(desired, supported) == 0) {
221 distance = 0; // same script
222 } else {
223 distance = iter.getValue();
224 U_ASSERT(distance >= 0);
225 }
226 if (result == USTRINGTRIE_FINAL_VALUE) {
227 distance |= DISTANCE_IS_FINAL;
228 }
229 }
230 return distance;
231}
232
233int32_t LocaleDistance::getRegionPartitionsDistance(
234 BytesTrie &iter, uint64_t startState,
235 const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) {
236 char desired = *desiredPartitions++;
237 char supported = *supportedPartitions++;
238 U_ASSERT(desired != 0 && supported != 0);
239 // See if we have single desired/supported partitions, from NUL-terminated
240 // partition strings without explicit length.
241 bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character
242 // equivalent to: if (desLength == 1 && suppLength == 1)
243 if (*desiredPartitions == 0 && !suppLengthGt1) {
244 // Fastpath for single desired/supported partitions.
245 UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
246 if (USTRINGTRIE_HAS_NEXT(result)) {
247 result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
248 if (USTRINGTRIE_HAS_VALUE(result)) {
249 return iter.getValue();
250 }
251 }
252 return getFallbackRegionDistance(iter, startState);
253 }
254
255 const char *supportedStart = supportedPartitions - 1; // for restart of inner loop
256 int32_t regionDistance = 0;
257 // Fall back to * only once, not for each pair of partition strings.
258 bool star = false;
259 for (;;) {
260 // Look up each desired-partition string only once,
261 // not for each (desired, supported) pair.
262 UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
263 if (USTRINGTRIE_HAS_NEXT(result)) {
264 uint64_t desState = suppLengthGt1 ? iter.getState64() : 0;
265 for (;;) {
266 result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
267 int32_t d;
268 if (USTRINGTRIE_HAS_VALUE(result)) {
269 d = iter.getValue();
270 } else if (star) {
271 d = 0;
272 } else {
273 d = getFallbackRegionDistance(iter, startState);
274 star = true;
275 }
276 if (d >= threshold) {
277 return d;
278 } else if (regionDistance < d) {
279 regionDistance = d;
280 }
281 if ((supported = *supportedPartitions++) != 0) {
282 iter.resetToState64(desState);
283 } else {
284 break;
285 }
286 }
287 } else if (!star) {
288 int32_t d = getFallbackRegionDistance(iter, startState);
289 if (d >= threshold) {
290 return d;
291 } else if (regionDistance < d) {
292 regionDistance = d;
293 }
294 star = true;
295 }
296 if ((desired = *desiredPartitions++) != 0) {
297 iter.resetToState64(startState);
298 supportedPartitions = supportedStart;
299 supported = *supportedPartitions++;
300 } else {
301 break;
302 }
303 }
304 return regionDistance;
305}
306
307int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) {
308#if U_DEBUG
309 UStringTrieResult result =
310#endif
311 iter.resetToState64(startState).next(u'*'); // <*, *>
312 U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
313 int32_t distance = iter.getValue();
314 U_ASSERT(distance >= 0);
315 return distance;
316}
317
318int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) {
319 uint8_t c;
320 if ((c = *s) == 0) {
321 return -1; // no empty subtags in the distance data
322 }
323 for (;;) {
324 c = uprv_invCharToAscii(c);
325 // EBCDIC: If *s is not an invariant character,
326 // then c is now 0 and will simply not match anything, which is harmless.
327 uint8_t next = *++s;
328 if (next != 0) {
329 if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
330 return -1;
331 }
332 } else {
333 // last character of this subtag
334 UStringTrieResult result = iter.next(c | END_OF_SUBTAG);
335 if (wantValue) {
336 if (USTRINGTRIE_HAS_VALUE(result)) {
337 int32_t value = iter.getValue();
338 if (result == USTRINGTRIE_FINAL_VALUE) {
339 value |= DISTANCE_IS_FINAL;
340 }
341 return value;
342 }
343 } else {
344 if (USTRINGTRIE_HAS_NEXT(result)) {
345 return 0;
346 }
347 }
348 return -1;
349 }
350 c = next;
351 }
352}
353
354UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const {
355 // Linear search for a very short list (length 6 as of 2019).
356 // If there are many paradigm LSRs we should use a hash set.
357 U_ASSERT(paradigmLSRsLength <= 15);
358 for (int32_t i = 0; i < paradigmLSRsLength; ++i) {
359 if (lsr == paradigmLSRs[i]) { return true; }
360 }
361 return false;
362}
363
364U_NAMESPACE_END
365