1 | // © 2019 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html#License |
3 | |
4 | // locdistance.h |
5 | // created: 2019may08 Markus W. Scherer |
6 | |
7 | #ifndef __LOCDISTANCE_H__ |
8 | #define __LOCDISTANCE_H__ |
9 | |
10 | #include "unicode/utypes.h" |
11 | #include "unicode/bytestrie.h" |
12 | #include "unicode/localematcher.h" |
13 | #include "unicode/locid.h" |
14 | #include "unicode/uobject.h" |
15 | #include "lsr.h" |
16 | |
17 | U_NAMESPACE_BEGIN |
18 | |
// Forward declaration only; the definition is not visible in this header.
// It is taken by const reference in the private LocaleDistance constructor.
struct LocaleDistanceData;
20 | |
21 | /** |
22 | * Offline-built data for LocaleMatcher. |
23 | * Mostly but not only the data for mapping locales to their maximized forms. |
24 | */ |
25 | class LocaleDistance final : public UMemory { |
26 | public: |
27 | static const LocaleDistance *getSingleton(UErrorCode &errorCode); |
28 | |
29 | static int32_t shiftDistance(int32_t distance) { |
30 | return distance << DISTANCE_SHIFT; |
31 | } |
32 | |
33 | static int32_t getShiftedDistance(int32_t indexAndDistance) { |
34 | return indexAndDistance & DISTANCE_MASK; |
35 | } |
36 | |
37 | static double getDistanceDouble(int32_t indexAndDistance) { |
38 | double shiftedDistance = getShiftedDistance(indexAndDistance); |
39 | return shiftedDistance / (1 << DISTANCE_SHIFT); |
40 | } |
41 | |
42 | static int32_t getIndex(int32_t indexAndDistance) { |
43 | // assert indexAndDistance >= 0; |
44 | return indexAndDistance >> INDEX_SHIFT; |
45 | } |
46 | |
47 | /** |
48 | * Finds the supported LSR with the smallest distance from the desired one. |
49 | * Equivalent LSR subtags must be normalized into a canonical form. |
50 | * |
51 | * <p>Returns the index of the lowest-distance supported LSR in the high bits |
52 | * (negative if none has a distance below the threshold), |
53 | * and its distance (0..ABOVE_THRESHOLD) in the low bits. |
54 | */ |
55 | int32_t getBestIndexAndDistance(const LSR &desired, |
56 | const LSR **supportedLSRs, int32_t supportedLSRsLength, |
57 | int32_t shiftedThreshold, |
58 | ULocMatchFavorSubtag favorSubtag, |
59 | ULocMatchDirection direction) const; |
60 | |
61 | UBool isParadigmLSR(const LSR &lsr) const; |
62 | |
63 | int32_t getDefaultScriptDistance() const { |
64 | return defaultScriptDistance; |
65 | } |
66 | |
67 | int32_t getDefaultDemotionPerDesiredLocale() const { |
68 | return defaultDemotionPerDesiredLocale; |
69 | } |
70 | |
71 | private: |
72 | // The distance is shifted left to gain some fraction bits. |
73 | static constexpr int32_t DISTANCE_SHIFT = 3; |
74 | static constexpr int32_t DISTANCE_FRACTION_MASK = 7; |
75 | // 7 bits for 0..100 |
76 | static constexpr int32_t DISTANCE_INT_SHIFT = 7; |
77 | static constexpr int32_t INDEX_SHIFT = DISTANCE_INT_SHIFT + DISTANCE_SHIFT; |
78 | static constexpr int32_t DISTANCE_MASK = 0x3ff; |
79 | // tic constexpr int32_t MAX_INDEX = 0x1fffff; // avoids sign bit |
80 | static constexpr int32_t INDEX_NEG_1 = 0xfffffc00; |
81 | |
82 | static int32_t getDistanceFloor(int32_t indexAndDistance) { |
83 | return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT; |
84 | } |
85 | |
86 | LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely); |
87 | LocaleDistance(const LocaleDistance &other) = delete; |
88 | LocaleDistance &operator=(const LocaleDistance &other) = delete; |
89 | |
90 | static void initLocaleDistance(UErrorCode &errorCode); |
91 | |
92 | UBool isMatch(const LSR &desired, const LSR &supported, |
93 | int32_t shiftedThreshold, ULocMatchFavorSubtag favorSubtag) const { |
94 | const LSR *pSupp = &supported; |
95 | return getBestIndexAndDistance( |
96 | desired, &pSupp, 1, |
97 | shiftedThreshold, favorSubtag, ULOCMATCH_DIRECTION_WITH_ONE_WAY) >= 0; |
98 | } |
99 | |
100 | static int32_t getDesSuppScriptDistance(BytesTrie &iter, uint64_t startState, |
101 | const char *desired, const char *supported); |
102 | |
103 | static int32_t getRegionPartitionsDistance( |
104 | BytesTrie &iter, uint64_t startState, |
105 | const char *desiredPartitions, const char *supportedPartitions, |
106 | int32_t threshold); |
107 | |
108 | static int32_t getFallbackRegionDistance(BytesTrie &iter, uint64_t startState); |
109 | |
110 | static int32_t trieNext(BytesTrie &iter, const char *s, bool wantValue); |
111 | |
112 | const char *partitionsForRegion(const LSR &lsr) const { |
113 | // ill-formed region -> one non-matching string |
114 | int32_t pIndex = regionToPartitionsIndex[lsr.regionIndex]; |
115 | return partitionArrays[pIndex]; |
116 | } |
117 | |
118 | int32_t getDefaultRegionDistance() const { |
119 | return defaultRegionDistance; |
120 | } |
121 | |
122 | const XLikelySubtags &likelySubtags; |
123 | |
124 | // The trie maps each dlang+slang+dscript+sscript+dregion+sregion |
125 | // (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance. |
126 | // There is also a trie value for each subsequence of whole subtags. |
127 | // One '*' is used for a (desired, supported) pair of "und", "Zzzz"/"", or "ZZ"/"". |
128 | BytesTrie trie; |
129 | |
130 | /** |
131 | * Maps each region to zero or more single-character partitions. |
132 | */ |
133 | const uint8_t *regionToPartitionsIndex; |
134 | const char **partitionArrays; |
135 | |
136 | /** |
137 | * Used to get the paradigm region for a cluster, if there is one. |
138 | */ |
139 | const LSR *paradigmLSRs; |
140 | int32_t paradigmLSRsLength; |
141 | |
142 | int32_t defaultLanguageDistance; |
143 | int32_t defaultScriptDistance; |
144 | int32_t defaultRegionDistance; |
145 | int32_t minRegionDistance; |
146 | int32_t defaultDemotionPerDesiredLocale; |
147 | }; |
148 | |
149 | U_NAMESPACE_END |
150 | |
151 | #endif // __LOCDISTANCE_H__ |
152 | |