1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * Copyright (C) 2014-2016, International Business Machines |
6 | * Corporation and others. All Rights Reserved. |
7 | ******************************************************************************* |
8 | * dictionarydata.h |
9 | * |
10 | * created on: 2012may31 |
11 | * created by: Markus W. Scherer & Maxime Serrano |
12 | */ |
13 | |
14 | #include "dictionarydata.h" |
15 | #include "unicode/ucharstrie.h" |
16 | #include "unicode/bytestrie.h" |
17 | #include "unicode/udata.h" |
18 | #include "cmemory.h" |
19 | |
20 | #if !UCONFIG_NO_BREAK_ITERATION |
21 | |
22 | U_NAMESPACE_BEGIN |
23 | |
24 | const int32_t DictionaryData::TRIE_TYPE_BYTES = 0; |
25 | const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1; |
26 | const int32_t DictionaryData::TRIE_TYPE_MASK = 7; |
27 | const int32_t DictionaryData::TRIE_HAS_VALUES = 8; |
28 | |
29 | const int32_t DictionaryData::TRANSFORM_NONE = 0; |
30 | const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000; |
31 | const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000; |
32 | const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff; |
33 | |
34 | DictionaryMatcher::~DictionaryMatcher() { |
35 | } |
36 | |
37 | UCharsDictionaryMatcher::~UCharsDictionaryMatcher() { |
38 | udata_close(file); |
39 | } |
40 | |
41 | int32_t UCharsDictionaryMatcher::getType() const { |
42 | return DictionaryData::TRIE_TYPE_UCHARS; |
43 | } |
44 | |
45 | int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, |
46 | int32_t *lengths, int32_t *cpLengths, int32_t *values, |
47 | int32_t *prefix) const { |
48 | |
49 | UCharsTrie uct(characters); |
50 | int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); |
51 | int32_t wordCount = 0; |
52 | int32_t codePointsMatched = 0; |
53 | |
54 | for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { |
55 | UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c); |
56 | int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; |
57 | codePointsMatched += 1; |
58 | if (USTRINGTRIE_HAS_VALUE(result)) { |
59 | if (wordCount < limit) { |
60 | if (values != nullptr) { |
61 | values[wordCount] = uct.getValue(); |
62 | } |
63 | if (lengths != nullptr) { |
64 | lengths[wordCount] = lengthMatched; |
65 | } |
66 | if (cpLengths != nullptr) { |
67 | cpLengths[wordCount] = codePointsMatched; |
68 | } |
69 | ++wordCount; |
70 | } |
71 | if (result == USTRINGTRIE_FINAL_VALUE) { |
72 | break; |
73 | } |
74 | } |
75 | else if (result == USTRINGTRIE_NO_MATCH) { |
76 | break; |
77 | } |
78 | if (lengthMatched >= maxLength) { |
79 | break; |
80 | } |
81 | } |
82 | |
83 | if (prefix != nullptr) { |
84 | *prefix = codePointsMatched; |
85 | } |
86 | return wordCount; |
87 | } |
88 | |
89 | BytesDictionaryMatcher::~BytesDictionaryMatcher() { |
90 | udata_close(file); |
91 | } |
92 | |
93 | UChar32 BytesDictionaryMatcher::transform(UChar32 c) const { |
94 | if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) { |
95 | if (c == 0x200D) { |
96 | return 0xFF; |
97 | } else if (c == 0x200C) { |
98 | return 0xFE; |
99 | } |
100 | int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK); |
101 | if (delta < 0 || 0xFD < delta) { |
102 | return U_SENTINEL; |
103 | } |
104 | return (UChar32)delta; |
105 | } |
106 | return c; |
107 | } |
108 | |
109 | int32_t BytesDictionaryMatcher::getType() const { |
110 | return DictionaryData::TRIE_TYPE_BYTES; |
111 | } |
112 | |
113 | int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, |
114 | int32_t *lengths, int32_t *cpLengths, int32_t *values, |
115 | int32_t *prefix) const { |
116 | BytesTrie bt(characters); |
117 | int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); |
118 | int32_t wordCount = 0; |
119 | int32_t codePointsMatched = 0; |
120 | |
121 | for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { |
122 | UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c)); |
123 | int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; |
124 | codePointsMatched += 1; |
125 | if (USTRINGTRIE_HAS_VALUE(result)) { |
126 | if (wordCount < limit) { |
127 | if (values != nullptr) { |
128 | values[wordCount] = bt.getValue(); |
129 | } |
130 | if (lengths != nullptr) { |
131 | lengths[wordCount] = lengthMatched; |
132 | } |
133 | if (cpLengths != nullptr) { |
134 | cpLengths[wordCount] = codePointsMatched; |
135 | } |
136 | ++wordCount; |
137 | } |
138 | if (result == USTRINGTRIE_FINAL_VALUE) { |
139 | break; |
140 | } |
141 | } |
142 | else if (result == USTRINGTRIE_NO_MATCH) { |
143 | break; |
144 | } |
145 | if (lengthMatched >= maxLength) { |
146 | break; |
147 | } |
148 | } |
149 | |
150 | if (prefix != nullptr) { |
151 | *prefix = codePointsMatched; |
152 | } |
153 | return wordCount; |
154 | } |
155 | |
156 | |
157 | U_NAMESPACE_END |
158 | |
159 | U_NAMESPACE_USE |
160 | |
161 | U_CAPI int32_t U_EXPORT2 |
162 | udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, |
163 | void *outData, UErrorCode *pErrorCode) { |
164 | const UDataInfo *pInfo; |
165 | int32_t ; |
166 | const uint8_t *inBytes; |
167 | uint8_t *outBytes; |
168 | const int32_t *inIndexes; |
169 | int32_t indexes[DictionaryData::IX_COUNT]; |
170 | int32_t i, offset, size; |
171 | |
172 | headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); |
173 | if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) return 0; |
174 | pInfo = (const UDataInfo *)((const char *)inData + 4); |
175 | if (!(pInfo->dataFormat[0] == 0x44 && |
176 | pInfo->dataFormat[1] == 0x69 && |
177 | pInfo->dataFormat[2] == 0x63 && |
178 | pInfo->dataFormat[3] == 0x74 && |
179 | pInfo->formatVersion[0] == 1)) { |
180 | udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n" , |
181 | pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]); |
182 | *pErrorCode = U_UNSUPPORTED_ERROR; |
183 | return 0; |
184 | } |
185 | |
186 | inBytes = (const uint8_t *)inData + headerSize; |
187 | outBytes = (outData == nullptr) ? nullptr : (uint8_t *)outData + headerSize; |
188 | |
189 | inIndexes = (const int32_t *)inBytes; |
190 | if (length >= 0) { |
191 | length -= headerSize; |
192 | if (length < (int32_t)(sizeof(indexes))) { |
193 | udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n" , length); |
194 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
195 | return 0; |
196 | } |
197 | } |
198 | |
199 | for (i = 0; i < DictionaryData::IX_COUNT; i++) { |
200 | indexes[i] = udata_readInt32(ds, inIndexes[i]); |
201 | } |
202 | |
203 | size = indexes[DictionaryData::IX_TOTAL_SIZE]; |
204 | |
205 | if (length >= 0) { |
206 | if (length < size) { |
207 | udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n" , length); |
208 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
209 | return 0; |
210 | } |
211 | |
212 | if (inBytes != outBytes) { |
213 | uprv_memcpy(outBytes, inBytes, size); |
214 | } |
215 | |
216 | offset = 0; |
217 | ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode); |
218 | offset = (int32_t)sizeof(indexes); |
219 | int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; |
220 | int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET]; |
221 | |
222 | if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { |
223 | ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode); |
224 | } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) { |
225 | // nothing to do |
226 | } else { |
227 | udata_printError(ds, "udict_swap(): unknown trie type!\n" ); |
228 | *pErrorCode = U_UNSUPPORTED_ERROR; |
229 | return 0; |
230 | } |
231 | |
232 | // these next two sections are empty in the current format, |
233 | // but may be used later. |
234 | offset = nextOffset; |
235 | nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET]; |
236 | offset = nextOffset; |
237 | nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE]; |
238 | offset = nextOffset; |
239 | } |
240 | return headerSize + size; |
241 | } |
242 | #endif |
243 | |