dictionarydata.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/dictionarydata.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	* Copyright (C) 2014-2016, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	*******************************************************************************
8	* dictionarydata.cpp
9	*
10	* created on: 2012may31
11	* created by: Markus W. Scherer & Maxime Serrano
12	*/
13
14	#include "dictionarydata.h"
15	#include "unicode/ucharstrie.h"
16	#include "unicode/bytestrie.h"
17	#include "unicode/udata.h"
18	#include "cmemory.h"
19
20	#if !UCONFIG_NO_BREAK_ITERATION
21
22	U_NAMESPACE_BEGIN
23
24	const int32_t DictionaryData::TRIE_TYPE_BYTES = `0`;
25	const int32_t DictionaryData::TRIE_TYPE_UCHARS = `1`;
26	const int32_t DictionaryData::TRIE_TYPE_MASK = `7`;
27	const int32_t DictionaryData::TRIE_HAS_VALUES = `8`;
28
29	const int32_t DictionaryData::TRANSFORM_NONE = `0`;
30	const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = `0x1000000`;
31	const int32_t DictionaryData::TRANSFORM_TYPE_MASK = `0x7f000000`;
32	const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = `0x1fffff`;
33
34	DictionaryMatcher::~DictionaryMatcher() {
35	}
36
37	UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
38	udata_close(file);
39	}
40
41	int32_t UCharsDictionaryMatcher::getType() const {
42	return DictionaryData::TRIE_TYPE_UCHARS;
43	}
44
45	int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
46	int32_t lengths, int32_t cpLengths, int32_t *values,
47	int32_t prefix) const* {
48
49	UCharsTrie uct(characters);
50	int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
51	int32_t wordCount = `0`;
52	int32_t codePointsMatched = `0`;
53
54	for (UChar32 c = utext_next32(text); c >= `0`; c=utext_next32(text)) {
55	UStringTrieResult result = (codePointsMatched == `0`) ? uct.first(c) : uct.next(c);
56	int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
57	codePointsMatched += `1`;
58	if (USTRINGTRIE_HAS_VALUE(result)) {
59	if (wordCount < limit) {
60	if (values != NULL) {
61	values[wordCount] = uct.getValue();
62	}
63	if (lengths != NULL) {
64	lengths[wordCount] = lengthMatched;
65	}
66	if (cpLengths != NULL) {
67	cpLengths[wordCount] = codePointsMatched;
68	}
69	++wordCount;
70	}
71	if (result == USTRINGTRIE_FINAL_VALUE) {
72	break;
73	}
74	}
75	else if (result == USTRINGTRIE_NO_MATCH) {
76	break;
77	}
78	if (lengthMatched >= maxLength) {
79	break;
80	}
81	}
82
83	if (prefix != NULL) {
84	*prefix = codePointsMatched;
85	}
86	return wordCount;
87	}
88
89	BytesDictionaryMatcher::~BytesDictionaryMatcher() {
90	udata_close(file);
91	}
92
93	UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
94	if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
95	if (c == `0x200D`) {
96	return `0xFF`;
97	} else if (c == `0x200C`) {
98	return `0xFE`;
99	}
100	int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
101	if (delta < `0` \|\| `0xFD` < delta) {
102	return U_SENTINEL;
103	}
104	return (UChar32)delta;
105	}
106	return c;
107	}
108
109	int32_t BytesDictionaryMatcher::getType() const {
110	return DictionaryData::TRIE_TYPE_BYTES;
111	}
112
113	int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
114	int32_t lengths, int32_t cpLengths, int32_t *values,
115	int32_t prefix) const* {
116	BytesTrie bt(characters);
117	int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
118	int32_t wordCount = `0`;
119	int32_t codePointsMatched = `0`;
120
121	for (UChar32 c = utext_next32(text); c >= `0`; c=utext_next32(text)) {
122	UStringTrieResult result = (codePointsMatched == `0`) ? bt.first(transform(c)) : bt.next(transform(c));
123	int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
124	codePointsMatched += `1`;
125	if (USTRINGTRIE_HAS_VALUE(result)) {
126	if (wordCount < limit) {
127	if (values != NULL) {
128	values[wordCount] = bt.getValue();
129	}
130	if (lengths != NULL) {
131	lengths[wordCount] = lengthMatched;
132	}
133	if (cpLengths != NULL) {
134	cpLengths[wordCount] = codePointsMatched;
135	}
136	++wordCount;
137	}
138	if (result == USTRINGTRIE_FINAL_VALUE) {
139	break;
140	}
141	}
142	else if (result == USTRINGTRIE_NO_MATCH) {
143	break;
144	}
145	if (lengthMatched >= maxLength) {
146	break;
147	}
148	}
149
150	if (prefix != NULL) {
151	*prefix = codePointsMatched;
152	}
153	return wordCount;
154	}
155
156
157	U_NAMESPACE_END
158
159	U_NAMESPACE_USE
160
161	U_CAPI int32_t U_EXPORT2
162	udict_swap(const UDataSwapper ds, const* void *inData, int32_t length,
163	void outData, UErrorCode pErrorCode) {
164	const UDataInfo *pInfo;
165	int32_t headerSize;
166	const uint8_t *inBytes;
167	uint8_t *outBytes;
168	const int32_t *inIndexes;
169	int32_t indexes[DictionaryData::IX_COUNT];
170	int32_t i, offset, size;
171
172	headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
173	if (pErrorCode == NULL \|\| U_FAILURE(pErrorCode)) return* `0`;
174	pInfo = (const UDataInfo )((const* char *)inData + `4`);
175	if (!(pInfo->dataFormat[`0`] == `0x44` &&
176	pInfo->dataFormat[`1`] == `0x69` &&
177	pInfo->dataFormat[`2`] == `0x63` &&
178	pInfo->dataFormat[`3`] == `0x74` &&
179	pInfo->formatVersion[`0`] == `1`)) {
180	udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
181	pInfo->dataFormat[`0`], pInfo->dataFormat[`1`], pInfo->dataFormat[`2`], pInfo->dataFormat[`3`], pInfo->formatVersion[`0`]);
182	*pErrorCode = U_UNSUPPORTED_ERROR;
183	return `0`;
184	}
185
186	inBytes = (const uint8_t *)inData + headerSize;
187	outBytes = (uint8_t *)outData + headerSize;
188
189	inIndexes = (const int32_t *)inBytes;
190	if (length >= `0`) {
191	length -= headerSize;
192	if (length < (int32_t)(sizeof(indexes))) {
193	udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
194	*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
195	return `0`;
196	}
197	}
198
199	for (i = `0`; i < DictionaryData::IX_COUNT; i++) {
200	indexes[i] = udata_readInt32(ds, inIndexes[i]);
201	}
202
203	size = indexes[DictionaryData::IX_TOTAL_SIZE];
204
205	if (length >= `0`) {
206	if (length < size) {
207	udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
208	*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
209	return `0`;
210	}
211
212	if (inBytes != outBytes) {
213	uprv_memcpy(outBytes, inBytes, size);
214	}
215
216	offset = `0`;
217	ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
218	offset = (int32_t)sizeof(indexes);
219	int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
220	int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
221
222	if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
223	ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
224	} else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
225	// nothing to do
226	} else {
227	udata_printError(ds, "udict_swap(): unknown trie type!\n");
228	*pErrorCode = U_UNSUPPORTED_ERROR;
229	return `0`;
230	}
231
232	// these next two sections are empty in the current format,
233	// but may be used later.
234	offset = nextOffset;
235	nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
236	offset = nextOffset;
237	nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
238	offset = nextOffset;
239	}
240	return headerSize + size;
241	}
242	#endif
243

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/dictionarydata.cpp