1 | // © 2019 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | |
4 | // locdistance.cpp |
5 | // created: 2019may08 Markus W. Scherer |
6 | |
7 | #include "unicode/utypes.h" |
8 | #include "unicode/bytestrie.h" |
9 | #include "unicode/localematcher.h" |
10 | #include "unicode/locid.h" |
11 | #include "unicode/uobject.h" |
12 | #include "unicode/ures.h" |
13 | #include "cstring.h" |
14 | #include "locdistance.h" |
15 | #include "loclikelysubtags.h" |
16 | #include "uassert.h" |
17 | #include "ucln_cmn.h" |
18 | #include "uinvchar.h" |
19 | #include "umutex.h" |
20 | |
21 | U_NAMESPACE_BEGIN |
22 | |
23 | namespace { |
24 | |
25 | /** |
26 | * Bit flag used on the last character of a subtag in the trie. |
27 | * Must be set consistently by the builder and the lookup code. |
28 | */ |
29 | constexpr int32_t END_OF_SUBTAG = 0x80; |
30 | /** Distance value bit flag, set by the builder. */ |
31 | constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80; |
32 | /** Distance value bit flag, set by trieNext(). */ |
33 | constexpr int32_t DISTANCE_IS_FINAL = 0x100; |
34 | constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT; |
35 | |
36 | constexpr int32_t ABOVE_THRESHOLD = 100; |
37 | |
38 | // Indexes into array of distances. |
39 | enum { |
40 | IX_DEF_LANG_DISTANCE, |
41 | IX_DEF_SCRIPT_DISTANCE, |
42 | IX_DEF_REGION_DISTANCE, |
43 | IX_MIN_REGION_DISTANCE, |
44 | IX_LIMIT |
45 | }; |
46 | |
47 | LocaleDistance *gLocaleDistance = nullptr; |
48 | UInitOnce gInitOnce {}; |
49 | |
50 | UBool U_CALLCONV cleanup() { |
51 | delete gLocaleDistance; |
52 | gLocaleDistance = nullptr; |
53 | gInitOnce.reset(); |
54 | return true; |
55 | } |
56 | |
57 | } // namespace |
58 | |
59 | void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) { |
60 | // This function is invoked only via umtx_initOnce(). |
61 | U_ASSERT(gLocaleDistance == nullptr); |
62 | const XLikelySubtags &likely = *XLikelySubtags::getSingleton(errorCode); |
63 | if (U_FAILURE(errorCode)) { return; } |
64 | const LocaleDistanceData &data = likely.getDistanceData(); |
65 | if (data.distanceTrieBytes == nullptr || |
66 | data.regionToPartitions == nullptr || data.partitions == nullptr || |
67 | // ok if no paradigms |
68 | data.distances == nullptr) { |
69 | errorCode = U_MISSING_RESOURCE_ERROR; |
70 | return; |
71 | } |
72 | gLocaleDistance = new LocaleDistance(data, likely); |
73 | if (gLocaleDistance == nullptr) { |
74 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
75 | return; |
76 | } |
77 | ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup); |
78 | } |
79 | |
80 | const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) { |
81 | if (U_FAILURE(errorCode)) { return nullptr; } |
82 | umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode); |
83 | return gLocaleDistance; |
84 | } |
85 | |
86 | LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely) : |
87 | likelySubtags(likely), |
88 | trie(data.distanceTrieBytes), |
89 | regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions), |
90 | paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength), |
91 | defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]), |
92 | defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]), |
93 | defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]), |
94 | minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) { |
95 | // For the default demotion value, use the |
96 | // default region distance between unrelated Englishes. |
97 | // Thus, unless demotion is turned off, |
98 | // a mere region difference for one desired locale |
99 | // is as good as a perfect match for the next following desired locale. |
100 | // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>. |
101 | LSR en("en" , "Latn" , "US" , LSR::EXPLICIT_LSR); |
102 | LSR enGB("en" , "Latn" , "GB" , LSR::EXPLICIT_LSR); |
103 | const LSR *p_enGB = &enGB; |
104 | int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1, |
105 | shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE, ULOCMATCH_DIRECTION_WITH_ONE_WAY); |
106 | defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance); |
107 | } |
108 | |
109 | int32_t LocaleDistance::getBestIndexAndDistance( |
110 | const LSR &desired, |
111 | const LSR **supportedLSRs, int32_t supportedLSRsLength, |
112 | int32_t shiftedThreshold, |
113 | ULocMatchFavorSubtag favorSubtag, ULocMatchDirection direction) const { |
114 | BytesTrie iter(trie); |
115 | // Look up the desired language only once for all supported LSRs. |
116 | // Its "distance" is either a match point value of 0, or a non-match negative value. |
117 | // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. |
118 | int32_t desLangDistance = trieNext(iter, desired.language, false); |
119 | uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0; |
120 | // Index of the supported LSR with the lowest distance. |
121 | int32_t bestIndex = -1; |
122 | // Cached lookup info from XLikelySubtags.compareLikely(). |
123 | int32_t bestLikelyInfo = -1; |
124 | for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) { |
125 | const LSR &supported = *supportedLSRs[slIndex]; |
126 | bool star = false; |
127 | int32_t distance = desLangDistance; |
128 | if (distance >= 0) { |
129 | U_ASSERT((distance & DISTANCE_IS_FINAL) == 0); |
130 | if (slIndex != 0) { |
131 | iter.resetToState64(desLangState); |
132 | } |
133 | distance = trieNext(iter, supported.language, true); |
134 | } |
135 | // Note: The data builder verifies that there are no rules with "any" (*) language and |
136 | // real (non *) script or region subtags. |
137 | // This means that if the lookup for either language fails we can use |
138 | // the default distances without further lookups. |
139 | int32_t flags; |
140 | if (distance >= 0) { |
141 | flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; |
142 | distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; |
143 | } else { // <*, *> |
144 | if (uprv_strcmp(desired.language, supported.language) == 0) { |
145 | distance = 0; |
146 | } else { |
147 | distance = defaultLanguageDistance; |
148 | } |
149 | flags = 0; |
150 | star = true; |
151 | } |
152 | U_ASSERT(0 <= distance && distance <= 100); |
153 | // Round up the shifted threshold (if fraction bits are not 0) |
154 | // for comparison with un-shifted distances until we need fraction bits. |
155 | // (If we simply shifted non-zero fraction bits away, then we might ignore a language |
156 | // when it's really still a micro distance below the threshold.) |
157 | int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT; |
158 | // We implement "favor subtag" by reducing the language subtag distance |
159 | // (unscientifically reducing it to a quarter of the normal value), |
160 | // so that the script distance is relatively more important. |
161 | // For example, given a default language distance of 80, we reduce it to 20, |
162 | // which is below the default threshold of 50, which is the default script distance. |
163 | if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) { |
164 | distance >>= 2; |
165 | } |
166 | // Let distance == roundedThreshold pass until the tie-breaker logic |
167 | // at the end of the loop. |
168 | if (distance > roundedThreshold) { |
169 | continue; |
170 | } |
171 | |
172 | int32_t scriptDistance; |
173 | if (star || flags != 0) { |
174 | if (uprv_strcmp(desired.script, supported.script) == 0) { |
175 | scriptDistance = 0; |
176 | } else { |
177 | scriptDistance = defaultScriptDistance; |
178 | } |
179 | } else { |
180 | scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(), |
181 | desired.script, supported.script); |
182 | flags = scriptDistance & DISTANCE_IS_FINAL; |
183 | scriptDistance &= ~DISTANCE_IS_FINAL; |
184 | } |
185 | distance += scriptDistance; |
186 | if (distance > roundedThreshold) { |
187 | continue; |
188 | } |
189 | |
190 | if (uprv_strcmp(desired.region, supported.region) == 0) { |
191 | // regionDistance = 0 |
192 | } else if (star || (flags & DISTANCE_IS_FINAL) != 0) { |
193 | distance += defaultRegionDistance; |
194 | } else { |
195 | int32_t remainingThreshold = roundedThreshold - distance; |
196 | if (minRegionDistance > remainingThreshold) { |
197 | continue; |
198 | } |
199 | |
200 | // From here on we know the regions are not equal. |
201 | // Map each region to zero or more partitions. (zero = one non-matching string) |
202 | // (Each array of single-character partition strings is encoded as one string.) |
203 | // If either side has more than one, then we find the maximum distance. |
204 | // This could be optimized by adding some more structure, but probably not worth it. |
205 | distance += getRegionPartitionsDistance( |
206 | iter, iter.getState64(), |
207 | partitionsForRegion(desired), |
208 | partitionsForRegion(supported), |
209 | remainingThreshold); |
210 | } |
211 | int32_t shiftedDistance = shiftDistance(distance); |
212 | if (shiftedDistance == 0) { |
213 | // Distinguish between equivalent but originally unequal locales via an |
214 | // additional micro distance. |
215 | shiftedDistance |= (desired.flags ^ supported.flags); |
216 | if (shiftedDistance < shiftedThreshold) { |
217 | if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || |
218 | // Is there also a match when we swap desired/supported? |
219 | isMatch(supported, desired, shiftedThreshold, favorSubtag)) { |
220 | if (shiftedDistance == 0) { |
221 | return slIndex << INDEX_SHIFT; |
222 | } |
223 | bestIndex = slIndex; |
224 | shiftedThreshold = shiftedDistance; |
225 | bestLikelyInfo = -1; |
226 | } |
227 | } |
228 | } else { |
229 | if (shiftedDistance < shiftedThreshold) { |
230 | if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || |
231 | // Is there also a match when we swap desired/supported? |
232 | isMatch(supported, desired, shiftedThreshold, favorSubtag)) { |
233 | bestIndex = slIndex; |
234 | shiftedThreshold = shiftedDistance; |
235 | bestLikelyInfo = -1; |
236 | } |
237 | } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) { |
238 | if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || |
239 | // Is there also a match when we swap desired/supported? |
240 | isMatch(supported, desired, shiftedThreshold, favorSubtag)) { |
241 | bestLikelyInfo = likelySubtags.compareLikely( |
242 | supported, *supportedLSRs[bestIndex], bestLikelyInfo); |
243 | if ((bestLikelyInfo & 1) != 0) { |
244 | // This supported locale matches as well as the previous best match, |
245 | // and neither matches perfectly, |
246 | // but this one is "more likely" (has more-default subtags). |
247 | bestIndex = slIndex; |
248 | } |
249 | } |
250 | } |
251 | } |
252 | } |
253 | return bestIndex >= 0 ? |
254 | (bestIndex << INDEX_SHIFT) | shiftedThreshold : |
255 | INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD); |
256 | } |
257 | |
258 | int32_t LocaleDistance::getDesSuppScriptDistance( |
259 | BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) { |
260 | // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. |
261 | int32_t distance = trieNext(iter, desired, false); |
262 | if (distance >= 0) { |
263 | distance = trieNext(iter, supported, true); |
264 | } |
265 | if (distance < 0) { |
266 | UStringTrieResult result = iter.resetToState64(startState).next(u'*'); // <*, *> |
267 | U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); |
268 | if (uprv_strcmp(desired, supported) == 0) { |
269 | distance = 0; // same script |
270 | } else { |
271 | distance = iter.getValue(); |
272 | U_ASSERT(distance >= 0); |
273 | } |
274 | if (result == USTRINGTRIE_FINAL_VALUE) { |
275 | distance |= DISTANCE_IS_FINAL; |
276 | } |
277 | } |
278 | return distance; |
279 | } |
280 | |
281 | int32_t LocaleDistance::getRegionPartitionsDistance( |
282 | BytesTrie &iter, uint64_t startState, |
283 | const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) { |
284 | char desired = *desiredPartitions++; |
285 | char supported = *supportedPartitions++; |
286 | U_ASSERT(desired != 0 && supported != 0); |
287 | // See if we have single desired/supported partitions, from NUL-terminated |
288 | // partition strings without explicit length. |
289 | bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character |
290 | // equivalent to: if (desLength == 1 && suppLength == 1) |
291 | if (*desiredPartitions == 0 && !suppLengthGt1) { |
292 | // Fastpath for single desired/supported partitions. |
293 | UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); |
294 | if (USTRINGTRIE_HAS_NEXT(result)) { |
295 | result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); |
296 | if (USTRINGTRIE_HAS_VALUE(result)) { |
297 | return iter.getValue(); |
298 | } |
299 | } |
300 | return getFallbackRegionDistance(iter, startState); |
301 | } |
302 | |
303 | const char *supportedStart = supportedPartitions - 1; // for restart of inner loop |
304 | int32_t regionDistance = 0; |
305 | // Fall back to * only once, not for each pair of partition strings. |
306 | bool star = false; |
307 | for (;;) { |
308 | // Look up each desired-partition string only once, |
309 | // not for each (desired, supported) pair. |
310 | UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); |
311 | if (USTRINGTRIE_HAS_NEXT(result)) { |
312 | uint64_t desState = suppLengthGt1 ? iter.getState64() : 0; |
313 | for (;;) { |
314 | result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); |
315 | int32_t d; |
316 | if (USTRINGTRIE_HAS_VALUE(result)) { |
317 | d = iter.getValue(); |
318 | } else if (star) { |
319 | d = 0; |
320 | } else { |
321 | d = getFallbackRegionDistance(iter, startState); |
322 | star = true; |
323 | } |
324 | if (d > threshold) { |
325 | return d; |
326 | } else if (regionDistance < d) { |
327 | regionDistance = d; |
328 | } |
329 | if ((supported = *supportedPartitions++) != 0) { |
330 | iter.resetToState64(desState); |
331 | } else { |
332 | break; |
333 | } |
334 | } |
335 | } else if (!star) { |
336 | int32_t d = getFallbackRegionDistance(iter, startState); |
337 | if (d > threshold) { |
338 | return d; |
339 | } else if (regionDistance < d) { |
340 | regionDistance = d; |
341 | } |
342 | star = true; |
343 | } |
344 | if ((desired = *desiredPartitions++) != 0) { |
345 | iter.resetToState64(startState); |
346 | supportedPartitions = supportedStart; |
347 | supported = *supportedPartitions++; |
348 | } else { |
349 | break; |
350 | } |
351 | } |
352 | return regionDistance; |
353 | } |
354 | |
355 | int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) { |
356 | #if U_DEBUG |
357 | UStringTrieResult result = |
358 | #endif |
359 | iter.resetToState64(startState).next(u'*'); // <*, *> |
360 | U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); |
361 | int32_t distance = iter.getValue(); |
362 | U_ASSERT(distance >= 0); |
363 | return distance; |
364 | } |
365 | |
366 | int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) { |
367 | uint8_t c; |
368 | if ((c = *s) == 0) { |
369 | return -1; // no empty subtags in the distance data |
370 | } |
371 | for (;;) { |
372 | c = uprv_invCharToAscii(c); |
373 | // EBCDIC: If *s is not an invariant character, |
374 | // then c is now 0 and will simply not match anything, which is harmless. |
375 | uint8_t next = *++s; |
376 | if (next != 0) { |
377 | if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) { |
378 | return -1; |
379 | } |
380 | } else { |
381 | // last character of this subtag |
382 | UStringTrieResult result = iter.next(c | END_OF_SUBTAG); |
383 | if (wantValue) { |
384 | if (USTRINGTRIE_HAS_VALUE(result)) { |
385 | int32_t value = iter.getValue(); |
386 | if (result == USTRINGTRIE_FINAL_VALUE) { |
387 | value |= DISTANCE_IS_FINAL; |
388 | } |
389 | return value; |
390 | } |
391 | } else { |
392 | if (USTRINGTRIE_HAS_NEXT(result)) { |
393 | return 0; |
394 | } |
395 | } |
396 | return -1; |
397 | } |
398 | c = next; |
399 | } |
400 | } |
401 | |
402 | UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const { |
403 | // Linear search for a very short list (length 6 as of 2019), |
404 | // because we look for equivalence not equality, and |
405 | // because it's easy. |
406 | // If there are many paradigm LSRs we should use a hash set |
407 | // with custom comparator and hasher. |
408 | U_ASSERT(paradigmLSRsLength <= 15); |
409 | for (int32_t i = 0; i < paradigmLSRsLength; ++i) { |
410 | if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; } |
411 | } |
412 | return false; |
413 | } |
414 | |
415 | U_NAMESPACE_END |
416 | |