// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationsettings.h
*
* created on: 2013feb07
* created by: Markus W. Scherer
*/
| 13 |  | 
|---|
| 14 | #ifndef __COLLATIONSETTINGS_H__ | 
|---|
| 15 | #define __COLLATIONSETTINGS_H__ | 
|---|
| 16 |  | 
|---|
| 17 | #include "unicode/utypes.h" | 
|---|
| 18 |  | 
|---|
| 19 | #if !UCONFIG_NO_COLLATION | 
|---|
| 20 |  | 
|---|
| 21 | #include "unicode/ucol.h" | 
|---|
| 22 | #include "collation.h" | 
|---|
| 23 | #include "sharedobject.h" | 
|---|
| 24 | #include "umutex.h" | 
|---|
| 25 |  | 
|---|
| 26 | U_NAMESPACE_BEGIN | 
|---|
| 27 |  | 
|---|
| 28 | struct CollationData; | 
|---|
| 29 |  | 
|---|
| 30 | /** | 
|---|
| 31 | * Collation settings/options/attributes. | 
|---|
| 32 | * These are the values that can be changed via API. | 
|---|
| 33 | */ | 
|---|
| 34 | struct U_I18N_API CollationSettings : public SharedObject { | 
|---|
| 35 | /** | 
|---|
| 36 | * Options bit 0: Perform the FCD check on the input text and deliver normalized text. | 
|---|
| 37 | */ | 
|---|
| 38 | static const int32_t CHECK_FCD = 1; | 
|---|
| 39 | /** | 
|---|
| 40 | * Options bit 1: Numeric collation. | 
|---|
| 41 | * Also known as CODAN = COllate Digits As Numbers. | 
|---|
| 42 | * | 
|---|
| 43 | * Treat digit sequences as numbers with CE sequences in numeric order, | 
|---|
| 44 | * rather than returning a normal CE for each digit. | 
|---|
| 45 | */ | 
|---|
| 46 | static const int32_t NUMERIC = 2; | 
|---|
| 47 | /** | 
|---|
| 48 | * "Shifted" alternate handling, see ALTERNATE_MASK. | 
|---|
| 49 | */ | 
|---|
| 50 | static const int32_t SHIFTED = 4; | 
|---|
| 51 | /** | 
|---|
| 52 | * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable. | 
|---|
| 53 | * Reserve values 8 and 0xc for shift-trimmed and blanked. | 
|---|
| 54 | */ | 
|---|
| 55 | static const int32_t ALTERNATE_MASK = 0xc; | 
|---|
| 56 | /** | 
|---|
| 57 | * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value. | 
|---|
| 58 | */ | 
|---|
| 59 | static const int32_t MAX_VARIABLE_SHIFT = 4; | 
|---|
| 60 | /** maxVariable options bit mask before shifting. */ | 
|---|
| 61 | static const int32_t MAX_VARIABLE_MASK = 0x70; | 
|---|
| 62 | /** Options bit 7: Reserved/unused/0. */ | 
|---|
| 63 | /** | 
|---|
| 64 | * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on. | 
|---|
| 65 | */ | 
|---|
| 66 | static const int32_t UPPER_FIRST = 0x100; | 
|---|
| 67 | /** | 
|---|
| 68 | * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values) | 
|---|
| 69 | * unless case level is on (when they are *moved* into the separate case level). | 
|---|
| 70 | * By default, the case bits are removed from the tertiary weight (ignored). | 
|---|
| 71 | * | 
|---|
| 72 | * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to | 
|---|
| 73 | * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST. | 
|---|
| 74 | */ | 
|---|
| 75 | static const int32_t CASE_FIRST = 0x200; | 
|---|
| 76 | /** | 
|---|
| 77 | * Options bit mask for caseFirst and upperFirst, before shifting. | 
|---|
| 78 | * Same value as caseFirst==upperFirst. | 
|---|
| 79 | */ | 
|---|
| 80 | static const int32_t CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST; | 
|---|
| 81 | /** | 
|---|
| 82 | * Options bit 10: Insert the case level between the secondary and tertiary levels. | 
|---|
| 83 | */ | 
|---|
| 84 | static const int32_t CASE_LEVEL = 0x400; | 
|---|
| 85 | /** | 
|---|
| 86 | * Options bit 11: Compare secondary weights backwards. ("French secondary") | 
|---|
| 87 | */ | 
|---|
| 88 | static const int32_t BACKWARD_SECONDARY = 0x800; | 
|---|
| 89 | /** | 
|---|
| 90 | * Options bits 15..12: The 4-bit strength value bit field is shifted by this value. | 
|---|
| 91 | * It is the top used bit field in the options. (No need to mask after shifting.) | 
|---|
| 92 | */ | 
|---|
| 93 | static const int32_t STRENGTH_SHIFT = 12; | 
|---|
| 94 | /** Strength options bit mask before shifting. */ | 
|---|
| 95 | static const int32_t STRENGTH_MASK = 0xf000; | 
|---|
| 96 |  | 
|---|
| 97 | /** maxVariable values */ | 
|---|
| 98 | enum MaxVariable { | 
|---|
| 99 | MAX_VAR_SPACE, | 
|---|
| 100 | MAX_VAR_PUNCT, | 
|---|
| 101 | MAX_VAR_SYMBOL, | 
|---|
| 102 | MAX_VAR_CURRENCY | 
|---|
| 103 | }; | 
|---|
| 104 |  | 
|---|
| 105 | CollationSettings() | 
|---|
| 106 | : options((UCOL_DEFAULT_STRENGTH << STRENGTH_SHIFT) | | 
|---|
| 107 | (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)), | 
|---|
| 108 | variableTop(0), | 
|---|
| 109 | reorderTable(NULL), | 
|---|
| 110 | minHighNoReorder(0), | 
|---|
| 111 | reorderRanges(NULL), reorderRangesLength(0), | 
|---|
| 112 | reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0), | 
|---|
| 113 | fastLatinOptions(-1) {} | 
|---|
| 114 |  | 
|---|
| 115 | CollationSettings(const CollationSettings &other); | 
|---|
| 116 | virtual ~CollationSettings(); | 
|---|
| 117 |  | 
|---|
| 118 | UBool operator==(const CollationSettings &other) const; | 
|---|
| 119 |  | 
|---|
| 120 | inline UBool operator!=(const CollationSettings &other) const { | 
|---|
| 121 | return !operator==(other); | 
|---|
| 122 | } | 
|---|
| 123 |  | 
|---|
| 124 | int32_t hashCode() const; | 
|---|
| 125 |  | 
|---|
| 126 | void resetReordering(); | 
|---|
| 127 | void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length, | 
|---|
| 128 | const uint32_t *ranges, int32_t rangesLength, | 
|---|
| 129 | const uint8_t *table, UErrorCode &errorCode); | 
|---|
| 130 | void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength, | 
|---|
| 131 | UErrorCode &errorCode); | 
|---|
| 132 | void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode); | 
|---|
| 133 |  | 
|---|
| 134 | inline UBool hasReordering() const { return reorderTable != NULL; } | 
|---|
| 135 | static UBool reorderTableHasSplitBytes(const uint8_t table[256]); | 
|---|
| 136 | inline uint32_t reorder(uint32_t p) const { | 
|---|
| 137 | uint8_t b = reorderTable[p >> 24]; | 
|---|
| 138 | if(b != 0 || p <= Collation::NO_CE_PRIMARY) { | 
|---|
| 139 | return ((uint32_t)b << 24) | (p & 0xffffff); | 
|---|
| 140 | } else { | 
|---|
| 141 | return reorderEx(p); | 
|---|
| 142 | } | 
|---|
| 143 | } | 
|---|
| 144 |  | 
|---|
| 145 | void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode); | 
|---|
| 146 |  | 
|---|
| 147 | static int32_t getStrength(int32_t options) { | 
|---|
| 148 | return options >> STRENGTH_SHIFT; | 
|---|
| 149 | } | 
|---|
| 150 |  | 
|---|
| 151 | int32_t getStrength() const { | 
|---|
| 152 | return getStrength(options); | 
|---|
| 153 | } | 
|---|
| 154 |  | 
|---|
| 155 | /** Sets the options bit for an on/off attribute. */ | 
|---|
| 156 | void setFlag(int32_t bit, UColAttributeValue value, | 
|---|
| 157 | int32_t defaultOptions, UErrorCode &errorCode); | 
|---|
| 158 |  | 
|---|
| 159 | UColAttributeValue getFlag(int32_t bit) const { | 
|---|
| 160 | return ((options & bit) != 0) ? UCOL_ON : UCOL_OFF; | 
|---|
| 161 | } | 
|---|
| 162 |  | 
|---|
| 163 | void setCaseFirst(UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode); | 
|---|
| 164 |  | 
|---|
| 165 | UColAttributeValue getCaseFirst() const { | 
|---|
| 166 | int32_t option = options & CASE_FIRST_AND_UPPER_MASK; | 
|---|
| 167 | return (option == 0) ? UCOL_OFF : | 
|---|
| 168 | (option == CASE_FIRST) ? UCOL_LOWER_FIRST : UCOL_UPPER_FIRST; | 
|---|
| 169 | } | 
|---|
| 170 |  | 
|---|
| 171 | void setAlternateHandling(UColAttributeValue value, | 
|---|
| 172 | int32_t defaultOptions, UErrorCode &errorCode); | 
|---|
| 173 |  | 
|---|
| 174 | UColAttributeValue getAlternateHandling() const { | 
|---|
| 175 | return ((options & ALTERNATE_MASK) == 0) ? UCOL_NON_IGNORABLE : UCOL_SHIFTED; | 
|---|
| 176 | } | 
|---|
| 177 |  | 
|---|
| 178 | void setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode); | 
|---|
| 179 |  | 
|---|
| 180 | MaxVariable getMaxVariable() const { | 
|---|
| 181 | return (MaxVariable)((options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT); | 
|---|
| 182 | } | 
|---|
| 183 |  | 
|---|
| 184 | /** | 
|---|
| 185 | * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off. | 
|---|
| 186 | */ | 
|---|
| 187 | static inline UBool isTertiaryWithCaseBits(int32_t options) { | 
|---|
| 188 | return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST; | 
|---|
| 189 | } | 
|---|
| 190 | static uint32_t getTertiaryMask(int32_t options) { | 
|---|
| 191 | // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. | 
|---|
| 192 | return isTertiaryWithCaseBits(options) ? | 
|---|
| 193 | Collation::CASE_AND_TERTIARY_MASK : Collation::ONLY_TERTIARY_MASK; | 
|---|
| 194 | } | 
|---|
| 195 |  | 
|---|
| 196 | static UBool sortsTertiaryUpperCaseFirst(int32_t options) { | 
|---|
| 197 | // On tertiary level, consider case bits and sort uppercase first | 
|---|
| 198 | // if caseLevel is off and caseFirst==upperFirst. | 
|---|
| 199 | return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK; | 
|---|
| 200 | } | 
|---|
| 201 |  | 
|---|
| 202 | inline UBool dontCheckFCD() const { | 
|---|
| 203 | return (options & CHECK_FCD) == 0; | 
|---|
| 204 | } | 
|---|
| 205 |  | 
|---|
| 206 | inline UBool hasBackwardSecondary() const { | 
|---|
| 207 | return (options & BACKWARD_SECONDARY) != 0; | 
|---|
| 208 | } | 
|---|
| 209 |  | 
|---|
| 210 | inline UBool isNumeric() const { | 
|---|
| 211 | return (options & NUMERIC) != 0; | 
|---|
| 212 | } | 
|---|
| 213 |  | 
|---|
| 214 | /** CHECK_FCD etc. */ | 
|---|
| 215 | int32_t options; | 
|---|
| 216 | /** Variable-top primary weight. */ | 
|---|
| 217 | uint32_t variableTop; | 
|---|
| 218 | /** | 
|---|
| 219 | * 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering. | 
|---|
| 220 | * A 0 entry at a non-zero index means that the primary lead byte is "split" | 
|---|
| 221 | * (there are different offsets for primaries that share that lead byte) | 
|---|
| 222 | * and the reordering offset must be determined via the reorderRanges. | 
|---|
| 223 | */ | 
|---|
| 224 | const uint8_t *reorderTable; | 
|---|
| 225 | /** Limit of last reordered range. 0 if no reordering or no split bytes. */ | 
|---|
| 226 | uint32_t minHighNoReorder; | 
|---|
| 227 | /** | 
|---|
| 228 | * Primary-weight ranges for script reordering, | 
|---|
| 229 | * to be used by reorder(p) for split-reordered primary lead bytes. | 
|---|
| 230 | * | 
|---|
| 231 | * Each entry is a (limit, offset) pair. | 
|---|
| 232 | * The upper 16 bits of the entry are the upper 16 bits of the | 
|---|
| 233 | * exclusive primary limit of a range. | 
|---|
| 234 | * Primaries between the previous limit and this one have their lead bytes | 
|---|
| 235 | * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits. | 
|---|
| 236 | * | 
|---|
| 237 | * CollationData::makeReorderRanges() writes a full list where the first range | 
|---|
| 238 | * (at least for terminators and separators) has a 0 offset. | 
|---|
| 239 | * The last range has a non-zero offset. | 
|---|
| 240 | * minHighNoReorder is set to the limit of that last range. | 
|---|
| 241 | * | 
|---|
| 242 | * In the settings object, the initial ranges before the first split lead byte | 
|---|
| 243 | * are omitted for efficiency; they are handled by reorder(p) via the reorderTable. | 
|---|
| 244 | * If there are no split-reordered lead bytes, then no ranges are needed. | 
|---|
| 245 | */ | 
|---|
| 246 | const uint32_t *reorderRanges; | 
|---|
| 247 | int32_t reorderRangesLength; | 
|---|
| 248 | /** Array of reorder codes; ignored if reorderCodesLength == 0. */ | 
|---|
| 249 | const int32_t *reorderCodes; | 
|---|
| 250 | /** Number of reorder codes; 0 if no reordering. */ | 
|---|
| 251 | int32_t reorderCodesLength; | 
|---|
| 252 | /** | 
|---|
| 253 | * Capacity of reorderCodes. | 
|---|
| 254 | * If 0, then the codes, the ranges, and the table are aliases. | 
|---|
| 255 | * Otherwise, this object owns the memory via the reorderCodes pointer; | 
|---|
| 256 | * the codes, the ranges, and the table are in the same memory block, in that order. | 
|---|
| 257 | */ | 
|---|
| 258 | int32_t reorderCodesCapacity; | 
|---|
| 259 |  | 
|---|
| 260 | /** Options for CollationFastLatin. Negative if disabled. */ | 
|---|
| 261 | int32_t fastLatinOptions; | 
|---|
| 262 | uint16_t fastLatinPrimaries[0x180]; | 
|---|
| 263 |  | 
|---|
| 264 | private: | 
|---|
| 265 | void setReorderArrays(const int32_t *codes, int32_t codesLength, | 
|---|
| 266 | const uint32_t *ranges, int32_t rangesLength, | 
|---|
| 267 | const uint8_t *table, UErrorCode &errorCode); | 
|---|
| 268 | uint32_t reorderEx(uint32_t p) const; | 
|---|
| 269 | }; | 
|---|
| 270 |  | 
|---|
| 271 | U_NAMESPACE_END | 
|---|
| 272 |  | 
|---|
| 273 | #endif  // !UCONFIG_NO_COLLATION | 
|---|
| 274 | #endif  // __COLLATIONSETTINGS_H__ | 
|---|
| 275 |  | 
|---|