| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | /* | 
|---|
| 4 | ******************************************************************************* | 
|---|
| 5 | * Copyright (C) 2012-2014, International Business Machines | 
|---|
| 6 | * Corporation and others.  All Rights Reserved. | 
|---|
| 7 | ******************************************************************************* | 
|---|
| 8 | * collationdatabuilder.h | 
|---|
| 9 | * | 
|---|
| 10 | * created on: 2012apr01 | 
|---|
| 11 | * created by: Markus W. Scherer | 
|---|
| 12 | */ | 
|---|
| 13 |  | 
|---|
| 14 | #ifndef __COLLATIONDATABUILDER_H__ | 
|---|
| 15 | #define __COLLATIONDATABUILDER_H__ | 
|---|
| 16 |  | 
|---|
| 17 | #include "unicode/utypes.h" | 
|---|
| 18 |  | 
|---|
| 19 | #if !UCONFIG_NO_COLLATION | 
|---|
| 20 |  | 
|---|
| 21 | #include "unicode/uniset.h" | 
|---|
| 22 | #include "unicode/unistr.h" | 
|---|
| 23 | #include "unicode/uversion.h" | 
|---|
| 24 | #include "collation.h" | 
|---|
| 25 | #include "collationdata.h" | 
|---|
| 26 | #include "collationsettings.h" | 
|---|
| 27 | #include "normalizer2impl.h" | 
|---|
| 28 | #include "utrie2.h" | 
|---|
| 29 | #include "uvectr32.h" | 
|---|
| 30 | #include "uvectr64.h" | 
|---|
| 31 | #include "uvector.h" | 
|---|
| 32 |  | 
|---|
| 33 | U_NAMESPACE_BEGIN | 
|---|
| 34 |  | 
|---|
// Forward declarations. This header only uses the following types through
// pointers and references (or names them as friends), so their complete
// definitions are not required here.

// Element type that getConditionalCE32() casts the conditionalCE32s entries to.
struct ConditionalCE32;

// Type of the fastLatinBuilder member pointer.
class CollationFastLatinBuilder;
// Declared as a friend of CollationDataBuilder.
class CopyHelper;
// Declared as a friend of CollationDataBuilder; type of the collIter member pointer.
class DataBuilderCollationIterator;
// Passed by reference to CollationDataBuilder::addContextTrie().
class UCharsTrieBuilder;
|---|
| 41 |  | 
|---|
| 42 | /** | 
|---|
| 43 | * Low-level CollationData builder. | 
|---|
| 44 | * Takes (character, CE) pairs and builds them into runtime data structures. | 
|---|
| 45 | * Supports characters with context prefixes and contraction suffixes. | 
|---|
| 46 | */ | 
|---|
| 47 | class U_I18N_API CollationDataBuilder : public UObject { | 
|---|
| 48 | public: | 
|---|
| 49 | /** | 
|---|
| 50 | * Collation element modifier. Interface class for a modifier | 
|---|
| 51 | * that changes a tailoring builder's temporary CEs to final CEs. | 
|---|
| 52 | * Called for every non-special CE32 and every expansion CE. | 
|---|
| 53 | */ | 
|---|
| 54 | class CEModifier : public UObject { | 
|---|
| 55 | public: | 
|---|
| 56 | virtual ~CEModifier(); | 
|---|
| 57 | /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */ | 
|---|
| 58 | virtual int64_t modifyCE32(uint32_t ce32) const = 0; | 
|---|
| 59 | /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */ | 
|---|
| 60 | virtual int64_t modifyCE(int64_t ce) const = 0; | 
|---|
| 61 | }; | 
|---|
| 62 |  | 
|---|
| 63 | CollationDataBuilder(UErrorCode &errorCode); | 
|---|
| 64 |  | 
|---|
| 65 | virtual ~CollationDataBuilder(); | 
|---|
| 66 |  | 
|---|
| 67 | void initForTailoring(const CollationData *b, UErrorCode &errorCode); | 
|---|
| 68 |  | 
|---|
| 69 | virtual UBool isCompressibleLeadByte(uint32_t b) const; | 
|---|
| 70 |  | 
|---|
| 71 | inline UBool isCompressiblePrimary(uint32_t p) const { | 
|---|
| 72 | return isCompressibleLeadByte(p >> 24); | 
|---|
| 73 | } | 
|---|
| 74 |  | 
|---|
| 75 | /** | 
|---|
| 76 | * @return TRUE if this builder has mappings (e.g., add() has been called) | 
|---|
| 77 | */ | 
|---|
| 78 | UBool hasMappings() const { return modified; } | 
|---|
| 79 |  | 
|---|
| 80 | /** | 
|---|
| 81 | * @return TRUE if c has CEs in this builder | 
|---|
| 82 | */ | 
|---|
| 83 | UBool isAssigned(UChar32 c) const; | 
|---|
| 84 |  | 
|---|
| 85 | /** | 
|---|
| 86 | * @return the three-byte primary if c maps to a single such CE and has no context data, | 
|---|
| 87 | * otherwise returns 0. | 
|---|
| 88 | */ | 
|---|
| 89 | uint32_t getLongPrimaryIfSingleCE(UChar32 c) const; | 
|---|
| 90 |  | 
|---|
| 91 | /** | 
|---|
| 92 | * @return the single CE for c. | 
|---|
| 93 | * Sets an error code if c does not have a single CE. | 
|---|
| 94 | */ | 
|---|
| 95 | int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const; | 
|---|
| 96 |  | 
|---|
| 97 | void add(const UnicodeString &prefix, const UnicodeString &s, | 
|---|
| 98 | const int64_t ces[], int32_t cesLength, | 
|---|
| 99 | UErrorCode &errorCode); | 
|---|
| 100 |  | 
|---|
| 101 | /** | 
|---|
| 102 | * Encodes the ces as either the returned ce32 by itself, | 
|---|
| 103 | * or by storing an expansion, with the returned ce32 referring to that. | 
|---|
| 104 | * | 
|---|
| 105 | * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength)) | 
|---|
| 106 | */ | 
|---|
| 107 | virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode); | 
|---|
| 108 | void addCE32(const UnicodeString &prefix, const UnicodeString &s, | 
|---|
| 109 | uint32_t ce32, UErrorCode &errorCode); | 
|---|
| 110 |  | 
|---|
| 111 | /** | 
|---|
| 112 | * Sets three-byte-primary CEs for a range of code points in code point order, | 
|---|
| 113 | * if it is worth doing; otherwise no change is made. | 
|---|
| 114 | * None of the code points in the range should have complex mappings so far | 
|---|
| 115 | * (expansions/contractions/prefixes). | 
|---|
| 116 | * @param start first code point | 
|---|
| 117 | * @param end last code point (inclusive) | 
|---|
| 118 | * @param primary primary weight for 'start' | 
|---|
| 119 | * @param step per-code point primary-weight increment | 
|---|
| 120 | * @param errorCode ICU in/out error code | 
|---|
| 121 | * @return TRUE if an OFFSET_TAG range was used for start..end | 
|---|
| 122 | */ | 
|---|
| 123 | UBool maybeSetPrimaryRange(UChar32 start, UChar32 end, | 
|---|
| 124 | uint32_t primary, int32_t step, | 
|---|
| 125 | UErrorCode &errorCode); | 
|---|
| 126 |  | 
|---|
| 127 | /** | 
|---|
| 128 | * Sets three-byte-primary CEs for a range of code points in code point order. | 
|---|
| 129 | * Sets range values if that is worth doing, or else individual values. | 
|---|
| 130 | * None of the code points in the range should have complex mappings so far | 
|---|
| 131 | * (expansions/contractions/prefixes). | 
|---|
| 132 | * @param start first code point | 
|---|
| 133 | * @param end last code point (inclusive) | 
|---|
| 134 | * @param primary primary weight for 'start' | 
|---|
| 135 | * @param step per-code point primary-weight increment | 
|---|
| 136 | * @param errorCode ICU in/out error code | 
|---|
| 137 | * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step | 
|---|
| 138 | */ | 
|---|
| 139 | uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end, | 
|---|
| 140 | uint32_t primary, int32_t step, | 
|---|
| 141 | UErrorCode &errorCode); | 
|---|
| 142 |  | 
|---|
| 143 | /** | 
|---|
| 144 | * Copies all mappings from the src builder, with modifications. | 
|---|
| 145 | * This builder here must not be built yet, and should be empty. | 
|---|
| 146 | */ | 
|---|
| 147 | void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier, | 
|---|
| 148 | UErrorCode &errorCode); | 
|---|
| 149 |  | 
|---|
| 150 | void optimize(const UnicodeSet &set, UErrorCode &errorCode); | 
|---|
| 151 | void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode); | 
|---|
| 152 |  | 
|---|
| 153 | void enableFastLatin() { fastLatinEnabled = TRUE; } | 
|---|
| 154 | virtual void build(CollationData &data, UErrorCode &errorCode); | 
|---|
| 155 |  | 
|---|
| 156 | /** | 
|---|
| 157 | * Looks up CEs for s and appends them to the ces array. | 
|---|
| 158 | * Does not handle normalization: s should be in FCD form. | 
|---|
| 159 | * | 
|---|
| 160 | * Does not write completely ignorable CEs. | 
|---|
| 161 | * Does not write beyond Collation::MAX_EXPANSION_LENGTH. | 
|---|
| 162 | * | 
|---|
| 163 | * @return incremented cesLength | 
|---|
| 164 | */ | 
|---|
| 165 | int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength); | 
|---|
| 166 | int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s, | 
|---|
| 167 | int64_t ces[], int32_t cesLength); | 
|---|
| 168 |  | 
|---|
| 169 | protected: | 
|---|
| 170 | friend class CopyHelper; | 
|---|
| 171 | friend class DataBuilderCollationIterator; | 
|---|
| 172 |  | 
|---|
| 173 | uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const; | 
|---|
| 174 |  | 
|---|
| 175 | int32_t addCE(int64_t ce, UErrorCode &errorCode); | 
|---|
| 176 | int32_t addCE32(uint32_t ce32, UErrorCode &errorCode); | 
|---|
| 177 | int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode); | 
|---|
| 178 |  | 
|---|
| 179 | inline ConditionalCE32 *getConditionalCE32(int32_t index) const { | 
|---|
| 180 | return static_cast<ConditionalCE32 *>(conditionalCE32s[index]); | 
|---|
| 181 | } | 
|---|
| 182 | inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const { | 
|---|
| 183 | return getConditionalCE32(Collation::indexFromCE32(ce32)); | 
|---|
| 184 | } | 
|---|
| 185 |  | 
|---|
| 186 | static uint32_t makeBuilderContextCE32(int32_t index) { | 
|---|
| 187 | return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index); | 
|---|
| 188 | } | 
|---|
| 189 | static inline UBool isBuilderContextCE32(uint32_t ce32) { | 
|---|
| 190 | return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG); | 
|---|
| 191 | } | 
|---|
| 192 |  | 
|---|
| 193 | static uint32_t encodeOneCEAsCE32(int64_t ce); | 
|---|
| 194 | uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode); | 
|---|
| 195 | uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode); | 
|---|
| 196 | uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode); | 
|---|
| 197 |  | 
|---|
| 198 | uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode); | 
|---|
| 199 | /** | 
|---|
| 200 | * Copies base contractions to a list of ConditionalCE32. | 
|---|
| 201 | * Sets cond->next to the index of the first new item | 
|---|
| 202 | * and returns the index of the last new item. | 
|---|
| 203 | */ | 
|---|
| 204 | int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32, | 
|---|
| 205 | ConditionalCE32 *cond, UErrorCode &errorCode); | 
|---|
| 206 |  | 
|---|
| 207 | UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode); | 
|---|
| 208 | void setDigitTags(UErrorCode &errorCode); | 
|---|
| 209 | void setLeadSurrogates(UErrorCode &errorCode); | 
|---|
| 210 |  | 
|---|
| 211 | void buildMappings(CollationData &data, UErrorCode &errorCode); | 
|---|
| 212 |  | 
|---|
| 213 | void clearContexts(); | 
|---|
| 214 | void buildContexts(UErrorCode &errorCode); | 
|---|
| 215 | uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode); | 
|---|
| 216 | int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder, | 
|---|
| 217 | UErrorCode &errorCode); | 
|---|
| 218 |  | 
|---|
| 219 | void buildFastLatinTable(CollationData &data, UErrorCode &errorCode); | 
|---|
| 220 |  | 
|---|
| 221 | int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength); | 
|---|
| 222 |  | 
|---|
| 223 | static UChar32 jamoCpFromIndex(int32_t i) { | 
|---|
| 224 | // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27 | 
|---|
| 225 | if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; } | 
|---|
| 226 | i -= Hangul::JAMO_L_COUNT; | 
|---|
| 227 | if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; } | 
|---|
| 228 | i -= Hangul::JAMO_V_COUNT; | 
|---|
| 229 | // i < 27 | 
|---|
| 230 | return Hangul::JAMO_T_BASE + 1 + i; | 
|---|
| 231 | } | 
|---|
| 232 |  | 
|---|
| 233 | /** @see Collation::BUILDER_DATA_TAG */ | 
|---|
| 234 | static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100; | 
|---|
| 235 |  | 
|---|
| 236 | const Normalizer2Impl &nfcImpl; | 
|---|
| 237 | const CollationData *base; | 
|---|
| 238 | const CollationSettings *baseSettings; | 
|---|
| 239 | UTrie2 *trie; | 
|---|
| 240 | UVector32 ce32s; | 
|---|
| 241 | UVector64 ce64s; | 
|---|
| 242 | UVector conditionalCE32s;  // vector of ConditionalCE32 | 
|---|
| 243 | // Characters that have context (prefixes or contraction suffixes). | 
|---|
| 244 | UnicodeSet contextChars; | 
|---|
| 245 | // Serialized UCharsTrie structures for finalized contexts. | 
|---|
| 246 | UnicodeString contexts; | 
|---|
| 247 | UnicodeSet unsafeBackwardSet; | 
|---|
| 248 | UBool modified; | 
|---|
| 249 |  | 
|---|
| 250 | UBool fastLatinEnabled; | 
|---|
| 251 | CollationFastLatinBuilder *fastLatinBuilder; | 
|---|
| 252 |  | 
|---|
| 253 | DataBuilderCollationIterator *collIter; | 
|---|
| 254 | }; | 
|---|
| 255 |  | 
|---|
| 256 | U_NAMESPACE_END | 
|---|
| 257 |  | 
|---|
| 258 | #endif  // !UCONFIG_NO_COLLATION | 
|---|
| 259 | #endif  // __COLLATIONDATABUILDER_H__ | 
|---|
| 260 |  | 
|---|