// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationdatabuilder.h
*
* created on: 2012apr01
* created by: Markus W. Scherer
*/

14#ifndef __COLLATIONDATABUILDER_H__
15#define __COLLATIONDATABUILDER_H__
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_COLLATION
20
21#include "unicode/uniset.h"
22#include "unicode/unistr.h"
23#include "unicode/uversion.h"
24#include "collation.h"
25#include "collationdata.h"
26#include "collationsettings.h"
27#include "normalizer2impl.h"
28#include "utrie2.h"
29#include "uvectr32.h"
30#include "uvectr64.h"
31#include "uvector.h"
32
33U_NAMESPACE_BEGIN
34
35struct ConditionalCE32;
36
37class CollationFastLatinBuilder;
38class CopyHelper;
39class DataBuilderCollationIterator;
40class UCharsTrieBuilder;
41
42/**
43 * Low-level CollationData builder.
44 * Takes (character, CE) pairs and builds them into runtime data structures.
45 * Supports characters with context prefixes and contraction suffixes.
46 */
47class U_I18N_API CollationDataBuilder : public UObject {
48public:
49 /**
50 * Collation element modifier. Interface class for a modifier
51 * that changes a tailoring builder's temporary CEs to final CEs.
52 * Called for every non-special CE32 and every expansion CE.
53 */
54 class CEModifier : public UObject {
55 public:
56 virtual ~CEModifier();
57 /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */
58 virtual int64_t modifyCE32(uint32_t ce32) const = 0;
59 /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */
60 virtual int64_t modifyCE(int64_t ce) const = 0;
61 };
62
63 CollationDataBuilder(UErrorCode &errorCode);
64
65 virtual ~CollationDataBuilder();
66
67 void initForTailoring(const CollationData *b, UErrorCode &errorCode);
68
69 virtual UBool isCompressibleLeadByte(uint32_t b) const;
70
71 inline UBool isCompressiblePrimary(uint32_t p) const {
72 return isCompressibleLeadByte(p >> 24);
73 }
74
75 /**
76 * @return TRUE if this builder has mappings (e.g., add() has been called)
77 */
78 UBool hasMappings() const { return modified; }
79
80 /**
81 * @return TRUE if c has CEs in this builder
82 */
83 UBool isAssigned(UChar32 c) const;
84
85 /**
86 * @return the three-byte primary if c maps to a single such CE and has no context data,
87 * otherwise returns 0.
88 */
89 uint32_t getLongPrimaryIfSingleCE(UChar32 c) const;
90
91 /**
92 * @return the single CE for c.
93 * Sets an error code if c does not have a single CE.
94 */
95 int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
96
97 void add(const UnicodeString &prefix, const UnicodeString &s,
98 const int64_t ces[], int32_t cesLength,
99 UErrorCode &errorCode);
100
101 /**
102 * Encodes the ces as either the returned ce32 by itself,
103 * or by storing an expansion, with the returned ce32 referring to that.
104 *
105 * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
106 */
107 virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
108 void addCE32(const UnicodeString &prefix, const UnicodeString &s,
109 uint32_t ce32, UErrorCode &errorCode);
110
111 /**
112 * Sets three-byte-primary CEs for a range of code points in code point order,
113 * if it is worth doing; otherwise no change is made.
114 * None of the code points in the range should have complex mappings so far
115 * (expansions/contractions/prefixes).
116 * @param start first code point
117 * @param end last code point (inclusive)
118 * @param primary primary weight for 'start'
119 * @param step per-code point primary-weight increment
120 * @param errorCode ICU in/out error code
121 * @return TRUE if an OFFSET_TAG range was used for start..end
122 */
123 UBool maybeSetPrimaryRange(UChar32 start, UChar32 end,
124 uint32_t primary, int32_t step,
125 UErrorCode &errorCode);
126
127 /**
128 * Sets three-byte-primary CEs for a range of code points in code point order.
129 * Sets range values if that is worth doing, or else individual values.
130 * None of the code points in the range should have complex mappings so far
131 * (expansions/contractions/prefixes).
132 * @param start first code point
133 * @param end last code point (inclusive)
134 * @param primary primary weight for 'start'
135 * @param step per-code point primary-weight increment
136 * @param errorCode ICU in/out error code
137 * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step
138 */
139 uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
140 uint32_t primary, int32_t step,
141 UErrorCode &errorCode);
142
143 /**
144 * Copies all mappings from the src builder, with modifications.
145 * This builder here must not be built yet, and should be empty.
146 */
147 void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
148 UErrorCode &errorCode);
149
150 void optimize(const UnicodeSet &set, UErrorCode &errorCode);
151 void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode);
152
153 void enableFastLatin() { fastLatinEnabled = TRUE; }
154 virtual void build(CollationData &data, UErrorCode &errorCode);
155
156 /**
157 * Looks up CEs for s and appends them to the ces array.
158 * Does not handle normalization: s should be in FCD form.
159 *
160 * Does not write completely ignorable CEs.
161 * Does not write beyond Collation::MAX_EXPANSION_LENGTH.
162 *
163 * @return incremented cesLength
164 */
165 int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength);
166 int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s,
167 int64_t ces[], int32_t cesLength);
168
169protected:
170 friend class CopyHelper;
171 friend class DataBuilderCollationIterator;
172
173 uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const;
174
175 int32_t addCE(int64_t ce, UErrorCode &errorCode);
176 int32_t addCE32(uint32_t ce32, UErrorCode &errorCode);
177 int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode);
178
179 inline ConditionalCE32 *getConditionalCE32(int32_t index) const {
180 return static_cast<ConditionalCE32 *>(conditionalCE32s[index]);
181 }
182 inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const {
183 return getConditionalCE32(Collation::indexFromCE32(ce32));
184 }
185
186 static uint32_t makeBuilderContextCE32(int32_t index) {
187 return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index);
188 }
189 static inline UBool isBuilderContextCE32(uint32_t ce32) {
190 return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG);
191 }
192
193 static uint32_t encodeOneCEAsCE32(int64_t ce);
194 uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode);
195 uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode);
196 uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode);
197
198 uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode);
199 /**
200 * Copies base contractions to a list of ConditionalCE32.
201 * Sets cond->next to the index of the first new item
202 * and returns the index of the last new item.
203 */
204 int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
205 ConditionalCE32 *cond, UErrorCode &errorCode);
206
207 UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode);
208 void setDigitTags(UErrorCode &errorCode);
209 void setLeadSurrogates(UErrorCode &errorCode);
210
211 void buildMappings(CollationData &data, UErrorCode &errorCode);
212
213 void clearContexts();
214 void buildContexts(UErrorCode &errorCode);
215 uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode);
216 int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
217 UErrorCode &errorCode);
218
219 void buildFastLatinTable(CollationData &data, UErrorCode &errorCode);
220
221 int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength);
222
223 static UChar32 jamoCpFromIndex(int32_t i) {
224 // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27
225 if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; }
226 i -= Hangul::JAMO_L_COUNT;
227 if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; }
228 i -= Hangul::JAMO_V_COUNT;
229 // i < 27
230 return Hangul::JAMO_T_BASE + 1 + i;
231 }
232
233 /** @see Collation::BUILDER_DATA_TAG */
234 static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100;
235
236 const Normalizer2Impl &nfcImpl;
237 const CollationData *base;
238 const CollationSettings *baseSettings;
239 UTrie2 *trie;
240 UVector32 ce32s;
241 UVector64 ce64s;
242 UVector conditionalCE32s; // vector of ConditionalCE32
243 // Characters that have context (prefixes or contraction suffixes).
244 UnicodeSet contextChars;
245 // Serialized UCharsTrie structures for finalized contexts.
246 UnicodeString contexts;
247 UnicodeSet unsafeBackwardSet;
248 UBool modified;
249
250 UBool fastLatinEnabled;
251 CollationFastLatinBuilder *fastLatinBuilder;
252
253 DataBuilderCollationIterator *collIter;
254};
255
256U_NAMESPACE_END
257
258#endif // !UCONFIG_NO_COLLATION
259#endif // __COLLATIONDATABUILDER_H__
260