brkeng.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/brkeng.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	************************************************************************************
5	* Copyright (C) 2006-2016, International Business Machines Corporation
6	* and others. All Rights Reserved.
7	************************************************************************************
8	*/
9
10	#include "unicode/utypes.h"
11
12	#if !UCONFIG_NO_BREAK_ITERATION
13
14	#include "unicode/uchar.h"
15	#include "unicode/uniset.h"
16	#include "unicode/chariter.h"
17	#include "unicode/ures.h"
18	#include "unicode/udata.h"
19	#include "unicode/putil.h"
20	#include "unicode/ustring.h"
21	#include "unicode/uscript.h"
22	#include "unicode/ucharstrie.h"
23	#include "unicode/bytestrie.h"
24
25	#include "brkeng.h"
26	#include "cmemory.h"
27	#include "dictbe.h"
28	#include "charstr.h"
29	#include "dictionarydata.h"
30	#include "mutex.h"
31	#include "uvector.h"
32	#include "umutex.h"
33	#include "uresimp.h"
34	#include "ubrkimpl.h"
35
36	U_NAMESPACE_BEGIN
37
38	/*
39	******************************************************************
40	*/
41
42	LanguageBreakEngine::LanguageBreakEngine() {
43	}
44
45	LanguageBreakEngine::~LanguageBreakEngine() {
46	}
47
48	/*
49	******************************************************************
50	*/
51
52	LanguageBreakFactory::LanguageBreakFactory() {
53	}
54
55	LanguageBreakFactory::~LanguageBreakFactory() {
56	}
57
58	/*
59	******************************************************************
60	*/
61
62	UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
63	(void)status;
64	}
65
66	UnhandledEngine::~UnhandledEngine() {
67	delete fHandled;
68	fHandled = nullptr;
69	}
70
71	UBool
72	UnhandledEngine::handles(UChar32 c) const {
73	return fHandled && fHandled->contains(c);
74	}
75
76	int32_t
77	UnhandledEngine::findBreaks( UText *text,
78	int32_t / startPos /,
79	int32_t endPos,
80	UVector32 &/foundBreaks/ ) const {
81	UChar32 c = utext_current32(text);
82	while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
83	utext_next32(text); // TODO: recast loop to work with post-increment operations.
84	c = utext_current32(text);
85	}
86	return `0`;
87	}
88
89	void
90	UnhandledEngine::handleCharacter(UChar32 c) {
91	if (fHandled == nullptr) {
92	fHandled = new UnicodeSet ();
93	if (fHandled == nullptr) {
94	return;
95	}
96	}
97	if (!fHandled->contains(c)) {
98	UErrorCode status = U_ZERO_ERROR;
99	// Apply the entire script of the character.
100	int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
101	fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
102	}
103	}
104
105	/*
106	******************************************************************
107	*/
108
109	ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/status/) {
110	fEngines = `0`;
111	}
112
113	ICULanguageBreakFactory::~ICULanguageBreakFactory() {
114	if (fEngines != `0`) {
115	delete fEngines;
116	}
117	}
118
119	U_NAMESPACE_END
120	U_CDECL_BEGIN
121	static void U_CALLCONV _deleteEngine(void *obj) {
122	delete (const icu::LanguageBreakEngine *) obj;
123	}
124	U_CDECL_END
125	U_NAMESPACE_BEGIN
126
127	const LanguageBreakEngine *
128	ICULanguageBreakFactory::getEngineFor(UChar32 c) {
129	const LanguageBreakEngine *lbe = NULL;
130	UErrorCode status = U_ZERO_ERROR;
131
132	static UMutex gBreakEngineMutex;
133	Mutex m(&gBreakEngineMutex);
134
135	if (fEngines == NULL) {
136	UStack engines = new* UStack (_deleteEngine, NULL, status);
137	if (U_FAILURE(status) \|\| engines == NULL) {
138	// Note: no way to return error code to caller.
139	delete engines;
140	return NULL;
141	}
142	fEngines = engines;
143	} else {
144	int32_t i = fEngines->size();
145	while (--i >= `0`) {
146	lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
147	if (lbe != NULL && lbe->handles(c)) {
148	return lbe;
149	}
150	}
151	}
152
153	// We didn't find an engine. Create one.
154	lbe = loadEngineFor(c);
155	if (lbe != NULL) {
156	fEngines->push((void *)lbe, status);
157	}
158	return lbe;
159	}
160
161	const LanguageBreakEngine *
162	ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
163	UErrorCode status = U_ZERO_ERROR;
164	UScriptCode code = uscript_getScript(c, &status);
165	if (U_SUCCESS(status)) {
166	DictionaryMatcher *m = loadDictionaryMatcherFor(code);
167	if (m != NULL) {
168	const LanguageBreakEngine *engine = NULL;
169	switch(code) {
170	case USCRIPT_THAI:
171	engine = new ThaiBreakEngine (m, status);
172	break;
173	case USCRIPT_LAO:
174	engine = new LaoBreakEngine (m, status);
175	break;
176	case USCRIPT_MYANMAR:
177	engine = new BurmeseBreakEngine (m, status);
178	break;
179	case USCRIPT_KHMER:
180	engine = new KhmerBreakEngine (m, status);
181	break;
182
183	#if !UCONFIG_NO_NORMALIZATION
184	// CJK not available w/o normalization
185	case USCRIPT_HANGUL:
186	engine = new CjkBreakEngine (m, kKorean, status);
187	break;
188
189	// use same BreakEngine and dictionary for both Chinese and Japanese
190	case USCRIPT_HIRAGANA:
191	case USCRIPT_KATAKANA:
192	case USCRIPT_HAN:
193	engine = new CjkBreakEngine (m, kChineseJapanese, status);
194	break;
195	#if 0
196	// TODO: Have to get some characters with script=common handled
197	// by CjkBreakEngine (e.g. U+309B). Simply subjecting
198	// them to CjkBreakEngine does not work. The engine has to
199	// special-case them.
200	case USCRIPT_COMMON:
201	{
202	UBlockCode block = ublock_getCode(code);
203	if (block == UBLOCK_HIRAGANA \|\| block == UBLOCK_KATAKANA)
204	engine = new CjkBreakEngine(dict, kChineseJapanese, status);
205	break;
206	}
207	#endif
208	#endif
209
210	default:
211	break;
212	}
213	if (engine == NULL) {
214	delete m;
215	}
216	else if (U_FAILURE(status)) {
217	delete engine;
218	engine = NULL;
219	}
220	return engine;
221	}
222	}
223	return NULL;
224	}
225
226	DictionaryMatcher *
227	ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
228	UErrorCode status = U_ZERO_ERROR;
229	// open root from brkitr tree.
230	UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
231	b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
232	int32_t dictnlength = `0`;
233	const UChar *dictfname =
234	ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
235	if (U_FAILURE(status)) {
236	ures_close(b);
237	return NULL;
238	}
239	CharString dictnbuf;
240	CharString ext;
241	const UChar extStart = u_memrchr(dictfname, `0x002e`, dictnlength); // last dot*
242	if (extStart != NULL) {
243	int32_t len = (int32_t)(extStart - dictfname);
244	ext.appendInvariantChars(UnicodeString (FALSE, extStart + `1`, dictnlength - len - `1`), status);
245	dictnlength = len;
246	}
247	dictnbuf.appendInvariantChars(UnicodeString (FALSE, dictfname, dictnlength), status);
248	ures_close(b);
249
250	UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
251	if (U_SUCCESS(status)) {
252	// build trie
253	const uint8_t data = (const* uint8_t *)udata_getMemory(file);
254	const int32_t indexes = (const* int32_t *)data;
255	const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
256	const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
257	DictionaryMatcher *m = NULL;
258	if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
259	const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
260	const char characters = (const* char *)(data + offset);
261	m = new BytesDictionaryMatcher (characters, transform, file);
262	}
263	else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
264	const UChar characters = (const* UChar *)(data + offset);
265	m = new UCharsDictionaryMatcher (characters, file);
266	}
267	if (m == NULL) {
268	// no matcher exists to take ownership - either we are an invalid
269	// type or memory allocation failed
270	udata_close(file);
271	}
272	return m;
273	} else if (dictfname != NULL) {
274	// we don't have a dictionary matcher.
275	// returning NULL here will cause us to fail to find a dictionary break engine, as expected
276	status = U_ZERO_ERROR;
277	return NULL;
278	}
279	return NULL;
280	}
281
282	U_NAMESPACE_END
283
284	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
285

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/brkeng.cpp