1// © 2019 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3
4// locdistance.h
5// created: 2019may08 Markus W. Scherer
6
7#ifndef __LOCDISTANCE_H__
8#define __LOCDISTANCE_H__
9
10#include "unicode/utypes.h"
11#include "unicode/bytestrie.h"
12#include "unicode/localematcher.h"
13#include "unicode/locid.h"
14#include "unicode/uobject.h"
15#include "lsr.h"
16
17U_NAMESPACE_BEGIN
18
19struct LocaleDistanceData;
20
21/**
22 * Offline-built data for LocaleMatcher.
23 * Mostly but not only the data for mapping locales to their maximized forms.
24 */
25class LocaleDistance final : public UMemory {
26public:
27 static const LocaleDistance *getSingleton(UErrorCode &errorCode);
28
29 static int32_t shiftDistance(int32_t distance) {
30 return distance << DISTANCE_SHIFT;
31 }
32
33 static int32_t getShiftedDistance(int32_t indexAndDistance) {
34 return indexAndDistance & DISTANCE_MASK;
35 }
36
37 static double getDistanceDouble(int32_t indexAndDistance) {
38 double shiftedDistance = getShiftedDistance(indexAndDistance);
39 return shiftedDistance / (1 << DISTANCE_SHIFT);
40 }
41
42 static int32_t getIndex(int32_t indexAndDistance) {
43 // assert indexAndDistance >= 0;
44 return indexAndDistance >> INDEX_SHIFT;
45 }
46
47 /**
48 * Finds the supported LSR with the smallest distance from the desired one.
49 * Equivalent LSR subtags must be normalized into a canonical form.
50 *
51 * <p>Returns the index of the lowest-distance supported LSR in the high bits
52 * (negative if none has a distance below the threshold),
53 * and its distance (0..ABOVE_THRESHOLD) in the low bits.
54 */
55 int32_t getBestIndexAndDistance(const LSR &desired,
56 const LSR **supportedLSRs, int32_t supportedLSRsLength,
57 int32_t shiftedThreshold,
58 ULocMatchFavorSubtag favorSubtag,
59 ULocMatchDirection direction) const;
60
61 UBool isParadigmLSR(const LSR &lsr) const;
62
63 int32_t getDefaultScriptDistance() const {
64 return defaultScriptDistance;
65 }
66
67 int32_t getDefaultDemotionPerDesiredLocale() const {
68 return defaultDemotionPerDesiredLocale;
69 }
70
71private:
72 // The distance is shifted left to gain some fraction bits.
73 static constexpr int32_t DISTANCE_SHIFT = 3;
74 static constexpr int32_t DISTANCE_FRACTION_MASK = 7;
75 // 7 bits for 0..100
76 static constexpr int32_t DISTANCE_INT_SHIFT = 7;
77 static constexpr int32_t INDEX_SHIFT = DISTANCE_INT_SHIFT + DISTANCE_SHIFT;
78 static constexpr int32_t DISTANCE_MASK = 0x3ff;
79 // tic constexpr int32_t MAX_INDEX = 0x1fffff; // avoids sign bit
80 static constexpr int32_t INDEX_NEG_1 = 0xfffffc00;
81
82 static int32_t getDistanceFloor(int32_t indexAndDistance) {
83 return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT;
84 }
85
86 LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely);
87 LocaleDistance(const LocaleDistance &other) = delete;
88 LocaleDistance &operator=(const LocaleDistance &other) = delete;
89
90 static void initLocaleDistance(UErrorCode &errorCode);
91
92 UBool isMatch(const LSR &desired, const LSR &supported,
93 int32_t shiftedThreshold, ULocMatchFavorSubtag favorSubtag) const {
94 const LSR *pSupp = &supported;
95 return getBestIndexAndDistance(
96 desired, &pSupp, 1,
97 shiftedThreshold, favorSubtag, ULOCMATCH_DIRECTION_WITH_ONE_WAY) >= 0;
98 }
99
100 static int32_t getDesSuppScriptDistance(BytesTrie &iter, uint64_t startState,
101 const char *desired, const char *supported);
102
103 static int32_t getRegionPartitionsDistance(
104 BytesTrie &iter, uint64_t startState,
105 const char *desiredPartitions, const char *supportedPartitions,
106 int32_t threshold);
107
108 static int32_t getFallbackRegionDistance(BytesTrie &iter, uint64_t startState);
109
110 static int32_t trieNext(BytesTrie &iter, const char *s, bool wantValue);
111
112 const char *partitionsForRegion(const LSR &lsr) const {
113 // ill-formed region -> one non-matching string
114 int32_t pIndex = regionToPartitionsIndex[lsr.regionIndex];
115 return partitionArrays[pIndex];
116 }
117
118 int32_t getDefaultRegionDistance() const {
119 return defaultRegionDistance;
120 }
121
122 const XLikelySubtags &likelySubtags;
123
124 // The trie maps each dlang+slang+dscript+sscript+dregion+sregion
125 // (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance.
126 // There is also a trie value for each subsequence of whole subtags.
127 // One '*' is used for a (desired, supported) pair of "und", "Zzzz"/"", or "ZZ"/"".
128 BytesTrie trie;
129
130 /**
131 * Maps each region to zero or more single-character partitions.
132 */
133 const uint8_t *regionToPartitionsIndex;
134 const char **partitionArrays;
135
136 /**
137 * Used to get the paradigm region for a cluster, if there is one.
138 */
139 const LSR *paradigmLSRs;
140 int32_t paradigmLSRsLength;
141
142 int32_t defaultLanguageDistance;
143 int32_t defaultScriptDistance;
144 int32_t defaultRegionDistance;
145 int32_t minRegionDistance;
146 int32_t defaultDemotionPerDesiredLocale;
147};
148
149U_NAMESPACE_END
150
151#endif // __LOCDISTANCE_H__
152