1 | // © 2019 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html#License |
3 | |
4 | // locdistance.cpp |
5 | // created: 2019may08 Markus W. Scherer |
6 | |
7 | #include "unicode/utypes.h" |
8 | #include "unicode/bytestrie.h" |
9 | #include "unicode/localematcher.h" |
10 | #include "unicode/locid.h" |
11 | #include "unicode/uobject.h" |
12 | #include "unicode/ures.h" |
13 | #include "cstring.h" |
14 | #include "locdistance.h" |
15 | #include "loclikelysubtags.h" |
16 | #include "uassert.h" |
17 | #include "ucln_cmn.h" |
18 | #include "uinvchar.h" |
19 | #include "umutex.h" |
20 | |
21 | U_NAMESPACE_BEGIN |
22 | |
23 | namespace { |
24 | |
25 | /** |
26 | * Bit flag used on the last character of a subtag in the trie. |
27 | * Must be set consistently by the builder and the lookup code. |
28 | */ |
29 | constexpr int32_t END_OF_SUBTAG = 0x80; |
30 | /** Distance value bit flag, set by the builder. */ |
31 | constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80; |
32 | /** Distance value bit flag, set by trieNext(). */ |
33 | constexpr int32_t DISTANCE_IS_FINAL = 0x100; |
34 | constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT; |
35 | |
36 | constexpr int32_t ABOVE_THRESHOLD = 100; |
37 | |
38 | // Indexes into array of distances. |
39 | enum { |
40 | IX_DEF_LANG_DISTANCE, |
41 | IX_DEF_SCRIPT_DISTANCE, |
42 | IX_DEF_REGION_DISTANCE, |
43 | IX_MIN_REGION_DISTANCE, |
44 | IX_LIMIT |
45 | }; |
46 | |
47 | LocaleDistance *gLocaleDistance = nullptr; |
48 | UInitOnce gInitOnce = U_INITONCE_INITIALIZER; |
49 | |
50 | UBool U_CALLCONV cleanup() { |
51 | delete gLocaleDistance; |
52 | gLocaleDistance = nullptr; |
53 | gInitOnce.reset(); |
54 | return TRUE; |
55 | } |
56 | |
57 | } // namespace |
58 | |
59 | void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) { |
60 | // This function is invoked only via umtx_initOnce(). |
61 | U_ASSERT(gLocaleDistance == nullptr); |
62 | const XLikelySubtags &likely = *XLikelySubtags::getSingleton(errorCode); |
63 | if (U_FAILURE(errorCode)) { return; } |
64 | const LocaleDistanceData &data = likely.getDistanceData(); |
65 | if (data.distanceTrieBytes == nullptr || |
66 | data.regionToPartitions == nullptr || data.partitions == nullptr || |
67 | // ok if no paradigms |
68 | data.distances == nullptr) { |
69 | errorCode = U_MISSING_RESOURCE_ERROR; |
70 | return; |
71 | } |
72 | gLocaleDistance = new LocaleDistance(data); |
73 | if (gLocaleDistance == nullptr) { |
74 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
75 | return; |
76 | } |
77 | ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup); |
78 | } |
79 | |
80 | const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) { |
81 | if (U_FAILURE(errorCode)) { return nullptr; } |
82 | umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode); |
83 | return gLocaleDistance; |
84 | } |
85 | |
86 | LocaleDistance::LocaleDistance(const LocaleDistanceData &data) : |
87 | trie(data.distanceTrieBytes), |
88 | regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions), |
89 | paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength), |
90 | defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]), |
91 | defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]), |
92 | defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]), |
93 | minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) { |
94 | // For the default demotion value, use the |
95 | // default region distance between unrelated Englishes. |
96 | // Thus, unless demotion is turned off, |
97 | // a mere region difference for one desired locale |
98 | // is as good as a perfect match for the next following desired locale. |
99 | // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>. |
100 | LSR en("en" , "Latn" , "US" ); |
101 | LSR enGB("en" , "Latn" , "GB" ); |
102 | const LSR *p_enGB = &enGB; |
103 | defaultDemotionPerDesiredLocale = getBestIndexAndDistance(en, &p_enGB, 1, |
104 | 50, ULOCMATCH_FAVOR_LANGUAGE) & 0xff; |
105 | } |
106 | |
107 | int32_t LocaleDistance::getBestIndexAndDistance( |
108 | const LSR &desired, |
109 | const LSR **supportedLSRs, int32_t supportedLSRsLength, |
110 | int32_t threshold, ULocMatchFavorSubtag favorSubtag) const { |
111 | BytesTrie iter(trie); |
112 | // Look up the desired language only once for all supported LSRs. |
113 | // Its "distance" is either a match point value of 0, or a non-match negative value. |
114 | // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. |
115 | int32_t desLangDistance = trieNext(iter, desired.language, false); |
116 | uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0; |
117 | // Index of the supported LSR with the lowest distance. |
118 | int32_t bestIndex = -1; |
119 | for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) { |
120 | const LSR &supported = *supportedLSRs[slIndex]; |
121 | bool star = false; |
122 | int32_t distance = desLangDistance; |
123 | if (distance >= 0) { |
124 | U_ASSERT((distance & DISTANCE_IS_FINAL) == 0); |
125 | if (slIndex != 0) { |
126 | iter.resetToState64(desLangState); |
127 | } |
128 | distance = trieNext(iter, supported.language, true); |
129 | } |
130 | // Note: The data builder verifies that there are no rules with "any" (*) language and |
131 | // real (non *) script or region subtags. |
132 | // This means that if the lookup for either language fails we can use |
133 | // the default distances without further lookups. |
134 | int32_t flags; |
135 | if (distance >= 0) { |
136 | flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; |
137 | distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; |
138 | } else { // <*, *> |
139 | if (uprv_strcmp(desired.language, supported.language) == 0) { |
140 | distance = 0; |
141 | } else { |
142 | distance = defaultLanguageDistance; |
143 | } |
144 | flags = 0; |
145 | star = true; |
146 | } |
147 | U_ASSERT(0 <= distance && distance <= 100); |
148 | // We implement "favor subtag" by reducing the language subtag distance |
149 | // (unscientifically reducing it to a quarter of the normal value), |
150 | // so that the script distance is relatively more important. |
151 | // For example, given a default language distance of 80, we reduce it to 20, |
152 | // which is below the default threshold of 50, which is the default script distance. |
153 | if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) { |
154 | distance >>= 2; |
155 | } |
156 | if (distance >= threshold) { |
157 | continue; |
158 | } |
159 | |
160 | int32_t scriptDistance; |
161 | if (star || flags != 0) { |
162 | if (uprv_strcmp(desired.script, supported.script) == 0) { |
163 | scriptDistance = 0; |
164 | } else { |
165 | scriptDistance = defaultScriptDistance; |
166 | } |
167 | } else { |
168 | scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(), |
169 | desired.script, supported.script); |
170 | flags = scriptDistance & DISTANCE_IS_FINAL; |
171 | scriptDistance &= ~DISTANCE_IS_FINAL; |
172 | } |
173 | distance += scriptDistance; |
174 | if (distance >= threshold) { |
175 | continue; |
176 | } |
177 | |
178 | if (uprv_strcmp(desired.region, supported.region) == 0) { |
179 | // regionDistance = 0 |
180 | } else if (star || (flags & DISTANCE_IS_FINAL) != 0) { |
181 | distance += defaultRegionDistance; |
182 | } else { |
183 | int32_t remainingThreshold = threshold - distance; |
184 | if (minRegionDistance >= remainingThreshold) { |
185 | continue; |
186 | } |
187 | |
188 | // From here on we know the regions are not equal. |
189 | // Map each region to zero or more partitions. (zero = one non-matching string) |
190 | // (Each array of single-character partition strings is encoded as one string.) |
191 | // If either side has more than one, then we find the maximum distance. |
192 | // This could be optimized by adding some more structure, but probably not worth it. |
193 | distance += getRegionPartitionsDistance( |
194 | iter, iter.getState64(), |
195 | partitionsForRegion(desired), |
196 | partitionsForRegion(supported), |
197 | remainingThreshold); |
198 | } |
199 | if (distance < threshold) { |
200 | if (distance == 0) { |
201 | return slIndex << 8; |
202 | } |
203 | bestIndex = slIndex; |
204 | threshold = distance; |
205 | } |
206 | } |
207 | return bestIndex >= 0 ? (bestIndex << 8) | threshold : 0xffffff00 | ABOVE_THRESHOLD; |
208 | } |
209 | |
210 | int32_t LocaleDistance::getDesSuppScriptDistance( |
211 | BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) { |
212 | // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. |
213 | int32_t distance = trieNext(iter, desired, false); |
214 | if (distance >= 0) { |
215 | distance = trieNext(iter, supported, true); |
216 | } |
217 | if (distance < 0) { |
218 | UStringTrieResult result = iter.resetToState64(startState).next(u'*'); // <*, *> |
219 | U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); |
220 | if (uprv_strcmp(desired, supported) == 0) { |
221 | distance = 0; // same script |
222 | } else { |
223 | distance = iter.getValue(); |
224 | U_ASSERT(distance >= 0); |
225 | } |
226 | if (result == USTRINGTRIE_FINAL_VALUE) { |
227 | distance |= DISTANCE_IS_FINAL; |
228 | } |
229 | } |
230 | return distance; |
231 | } |
232 | |
233 | int32_t LocaleDistance::getRegionPartitionsDistance( |
234 | BytesTrie &iter, uint64_t startState, |
235 | const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) { |
236 | char desired = *desiredPartitions++; |
237 | char supported = *supportedPartitions++; |
238 | U_ASSERT(desired != 0 && supported != 0); |
239 | // See if we have single desired/supported partitions, from NUL-terminated |
240 | // partition strings without explicit length. |
241 | bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character |
242 | // equivalent to: if (desLength == 1 && suppLength == 1) |
243 | if (*desiredPartitions == 0 && !suppLengthGt1) { |
244 | // Fastpath for single desired/supported partitions. |
245 | UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); |
246 | if (USTRINGTRIE_HAS_NEXT(result)) { |
247 | result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); |
248 | if (USTRINGTRIE_HAS_VALUE(result)) { |
249 | return iter.getValue(); |
250 | } |
251 | } |
252 | return getFallbackRegionDistance(iter, startState); |
253 | } |
254 | |
255 | const char *supportedStart = supportedPartitions - 1; // for restart of inner loop |
256 | int32_t regionDistance = 0; |
257 | // Fall back to * only once, not for each pair of partition strings. |
258 | bool star = false; |
259 | for (;;) { |
260 | // Look up each desired-partition string only once, |
261 | // not for each (desired, supported) pair. |
262 | UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); |
263 | if (USTRINGTRIE_HAS_NEXT(result)) { |
264 | uint64_t desState = suppLengthGt1 ? iter.getState64() : 0; |
265 | for (;;) { |
266 | result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); |
267 | int32_t d; |
268 | if (USTRINGTRIE_HAS_VALUE(result)) { |
269 | d = iter.getValue(); |
270 | } else if (star) { |
271 | d = 0; |
272 | } else { |
273 | d = getFallbackRegionDistance(iter, startState); |
274 | star = true; |
275 | } |
276 | if (d >= threshold) { |
277 | return d; |
278 | } else if (regionDistance < d) { |
279 | regionDistance = d; |
280 | } |
281 | if ((supported = *supportedPartitions++) != 0) { |
282 | iter.resetToState64(desState); |
283 | } else { |
284 | break; |
285 | } |
286 | } |
287 | } else if (!star) { |
288 | int32_t d = getFallbackRegionDistance(iter, startState); |
289 | if (d >= threshold) { |
290 | return d; |
291 | } else if (regionDistance < d) { |
292 | regionDistance = d; |
293 | } |
294 | star = true; |
295 | } |
296 | if ((desired = *desiredPartitions++) != 0) { |
297 | iter.resetToState64(startState); |
298 | supportedPartitions = supportedStart; |
299 | supported = *supportedPartitions++; |
300 | } else { |
301 | break; |
302 | } |
303 | } |
304 | return regionDistance; |
305 | } |
306 | |
307 | int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) { |
308 | #if U_DEBUG |
309 | UStringTrieResult result = |
310 | #endif |
311 | iter.resetToState64(startState).next(u'*'); // <*, *> |
312 | U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); |
313 | int32_t distance = iter.getValue(); |
314 | U_ASSERT(distance >= 0); |
315 | return distance; |
316 | } |
317 | |
318 | int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) { |
319 | uint8_t c; |
320 | if ((c = *s) == 0) { |
321 | return -1; // no empty subtags in the distance data |
322 | } |
323 | for (;;) { |
324 | c = uprv_invCharToAscii(c); |
325 | // EBCDIC: If *s is not an invariant character, |
326 | // then c is now 0 and will simply not match anything, which is harmless. |
327 | uint8_t next = *++s; |
328 | if (next != 0) { |
329 | if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) { |
330 | return -1; |
331 | } |
332 | } else { |
333 | // last character of this subtag |
334 | UStringTrieResult result = iter.next(c | END_OF_SUBTAG); |
335 | if (wantValue) { |
336 | if (USTRINGTRIE_HAS_VALUE(result)) { |
337 | int32_t value = iter.getValue(); |
338 | if (result == USTRINGTRIE_FINAL_VALUE) { |
339 | value |= DISTANCE_IS_FINAL; |
340 | } |
341 | return value; |
342 | } |
343 | } else { |
344 | if (USTRINGTRIE_HAS_NEXT(result)) { |
345 | return 0; |
346 | } |
347 | } |
348 | return -1; |
349 | } |
350 | c = next; |
351 | } |
352 | } |
353 | |
354 | UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const { |
355 | // Linear search for a very short list (length 6 as of 2019). |
356 | // If there are many paradigm LSRs we should use a hash set. |
357 | U_ASSERT(paradigmLSRsLength <= 15); |
358 | for (int32_t i = 0; i < paradigmLSRsLength; ++i) { |
359 | if (lsr == paradigmLSRs[i]) { return true; } |
360 | } |
361 | return false; |
362 | } |
363 | |
364 | U_NAMESPACE_END |
365 | |