brkeng.cpp source code [Godot/thirdparty/icu4c/common/brkeng.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	************************************************************************************
5	* Copyright (C) 2006-2016, International Business Machines Corporation
6	* and others. All Rights Reserved.
7	************************************************************************************
8	*/
9
10	#include "unicode/utypes.h"
11
12	#if !UCONFIG_NO_BREAK_ITERATION
13
14	#include "unicode/uchar.h"
15	#include "unicode/uniset.h"
16	#include "unicode/chariter.h"
17	#include "unicode/ures.h"
18	#include "unicode/udata.h"
19	#include "unicode/putil.h"
20	#include "unicode/ustring.h"
21	#include "unicode/uscript.h"
22	#include "unicode/ucharstrie.h"
23	#include "unicode/bytestrie.h"
24
25	#include "brkeng.h"
26	#include "cmemory.h"
27	#include "dictbe.h"
28	#include "lstmbe.h"
29	#include "charstr.h"
30	#include "dictionarydata.h"
31	#include "mutex.h"
32	#include "uvector.h"
33	#include "umutex.h"
34	#include "uresimp.h"
35	#include "ubrkimpl.h"
36
37	U_NAMESPACE_BEGIN
38
39	/*
40	******************************************************************
41	*/
42
43	LanguageBreakEngine::LanguageBreakEngine() {
44	}
45
46	LanguageBreakEngine::~LanguageBreakEngine() {
47	}
48
49	/*
50	******************************************************************
51	*/
52
53	LanguageBreakFactory::LanguageBreakFactory() {
54	}
55
56	LanguageBreakFactory::~LanguageBreakFactory() {
57	}
58
59	/*
60	******************************************************************
61	*/
62
63	UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
64	(void)status;
65	}
66
67	UnhandledEngine::~UnhandledEngine() {
68	delete fHandled;
69	fHandled = nullptr;
70	}
71
72	UBool
73	UnhandledEngine::handles(UChar32 c) const {
74	return fHandled && fHandled->contains(c);
75	}
76
77	int32_t
78	UnhandledEngine::findBreaks( UText *text,
79	int32_t / startPos /,
80	int32_t endPos,
81	UVector32 &/foundBreaks/,
82	UBool / isPhraseBreaking /,
83	UErrorCode &status) const {
84	if (U_FAILURE(status)) return `0`;
85	UChar32 c = utext_current32(text);
86	while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
87	utext_next32(text); // TODO: recast loop to work with post-increment operations.
88	c = utext_current32(text);
89	}
90	return `0`;
91	}
92
93	void
94	UnhandledEngine::handleCharacter(UChar32 c) {
95	if (fHandled == nullptr) {
96	fHandled = new UnicodeSet ();
97	if (fHandled == nullptr) {
98	return;
99	}
100	}
101	if (!fHandled->contains(c)) {
102	UErrorCode status = U_ZERO_ERROR;
103	// Apply the entire script of the character.
104	int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
105	fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
106	}
107	}
108
109	/*
110	******************************************************************
111	*/
112
113	ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/status/) {
114	fEngines = `0`;
115	}
116
117	ICULanguageBreakFactory::~ICULanguageBreakFactory() {
118	if (fEngines != `0`) {
119	delete fEngines;
120	}
121	}
122
123	U_NAMESPACE_END
124	U_CDECL_BEGIN
125	static void U_CALLCONV _deleteEngine(void *obj) {
126	delete (const icu::LanguageBreakEngine *) obj;
127	}
128	U_CDECL_END
129	U_NAMESPACE_BEGIN
130
131	const LanguageBreakEngine *
132	ICULanguageBreakFactory::getEngineFor(UChar32 c) {
133	const LanguageBreakEngine lbe = nullptr*;
134	UErrorCode status = U_ZERO_ERROR;
135
136	static UMutex gBreakEngineMutex;
137	Mutex m(&gBreakEngineMutex);
138
139	if (fEngines == nullptr) {
140	LocalPointer<UStack> engines(new UStack (_deleteEngine, nullptr, status), status);
141	if (U_FAILURE(status) ) {
142	// Note: no way to return error code to caller.
143	return nullptr;
144	}
145	fEngines = engines.orphan();
146	} else {
147	int32_t i = fEngines->size();
148	while (--i >= `0`) {
149	lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
150	if (lbe != nullptr && lbe->handles(c)) {
151	return lbe;
152	}
153	}
154	}
155
156	// We didn't find an engine. Create one.
157	lbe = loadEngineFor(c);
158	if (lbe != nullptr) {
159	fEngines->push((void *)lbe, status);
160	}
161	return U_SUCCESS(status) ? lbe : nullptr;
162	}
163
164	const LanguageBreakEngine *
165	ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
166	UErrorCode status = U_ZERO_ERROR;
167	UScriptCode code = uscript_getScript(c, &status);
168	if (U_SUCCESS(status)) {
169	const LanguageBreakEngine engine = nullptr*;
170	// Try to use LSTM first
171	const LSTMData *data = CreateLSTMDataForScript(code, status);
172	if (U_SUCCESS(status)) {
173	if (data != nullptr) {
174	engine = CreateLSTMBreakEngine(code, data, status);
175	if (U_SUCCESS(status) && engine != nullptr) {
176	return engine;
177	}
178	if (engine != nullptr) {
179	delete engine;
180	engine = nullptr;
181	} else {
182	DeleteLSTMData(data);
183	}
184	}
185	}
186	status = U_ZERO_ERROR; // fallback to dictionary based
187	DictionaryMatcher *m = loadDictionaryMatcherFor(code);
188	if (m != nullptr) {
189	switch(code) {
190	case USCRIPT_THAI:
191	engine = new ThaiBreakEngine (m, status);
192	break;
193	case USCRIPT_LAO:
194	engine = new LaoBreakEngine (m, status);
195	break;
196	case USCRIPT_MYANMAR:
197	engine = new BurmeseBreakEngine (m, status);
198	break;
199	case USCRIPT_KHMER:
200	engine = new KhmerBreakEngine (m, status);
201	break;
202
203	#if !UCONFIG_NO_NORMALIZATION
204	// CJK not available w/o normalization
205	case USCRIPT_HANGUL:
206	engine = new CjkBreakEngine (m, kKorean, status);
207	break;
208
209	// use same BreakEngine and dictionary for both Chinese and Japanese
210	case USCRIPT_HIRAGANA:
211	case USCRIPT_KATAKANA:
212	case USCRIPT_HAN:
213	engine = new CjkBreakEngine (m, kChineseJapanese, status);
214	break;
215	#if 0
216	// TODO: Have to get some characters with script=common handled
217	// by CjkBreakEngine (e.g. U+309B). Simply subjecting
218	// them to CjkBreakEngine does not work. The engine has to
219	// special-case them.
220	case USCRIPT_COMMON:
221	{
222	UBlockCode block = ublock_getCode(code);
223	if (block == UBLOCK_HIRAGANA \|\| block == UBLOCK_KATAKANA)
224	engine = new CjkBreakEngine(dict, kChineseJapanese, status);
225	break;
226	}
227	#endif
228	#endif
229
230	default:
231	break;
232	}
233	if (engine == nullptr) {
234	delete m;
235	}
236	else if (U_FAILURE(status)) {
237	delete engine;
238	engine = nullptr;
239	}
240	return engine;
241	}
242	}
243	return nullptr;
244	}
245
246	DictionaryMatcher *
247	ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
248	UErrorCode status = U_ZERO_ERROR;
249	// open root from brkitr tree.
250	UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
251	b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
252	int32_t dictnlength = `0`;
253	const char16_t *dictfname =
254	ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
255	if (U_FAILURE(status)) {
256	ures_close(b);
257	return nullptr;
258	}
259	CharString dictnbuf;
260	CharString ext;
261	const char16_t extStart = u_memrchr(dictfname, `0x002e`, dictnlength); // last dot*
262	if (extStart != nullptr) {
263	int32_t len = (int32_t)(extStart - dictfname);
264	ext.appendInvariantChars(UnicodeString (false, extStart + `1`, dictnlength - len - `1`), status);
265	dictnlength = len;
266	}
267	dictnbuf.appendInvariantChars(UnicodeString (false, dictfname, dictnlength), status);
268	ures_close(b);
269
270	UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
271	if (U_SUCCESS(status)) {
272	// build trie
273	const uint8_t data = (const* uint8_t *)udata_getMemory(file);
274	const int32_t indexes = (const* int32_t *)data;
275	const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
276	const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
277	DictionaryMatcher m = nullptr*;
278	if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
279	const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
280	const char characters = (const* char *)(data + offset);
281	m = new BytesDictionaryMatcher (characters, transform, file);
282	}
283	else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
284	const char16_t characters = (const* char16_t *)(data + offset);
285	m = new UCharsDictionaryMatcher (characters, file);
286	}
287	if (m == nullptr) {
288	// no matcher exists to take ownership - either we are an invalid
289	// type or memory allocation failed
290	udata_close(file);
291	}
292	return m;
293	} else if (dictfname != nullptr) {
294	// we don't have a dictionary matcher.
295	// returning nullptr here will cause us to fail to find a dictionary break engine, as expected
296	status = U_ZERO_ERROR;
297	return nullptr;
298	}
299	return nullptr;
300	}
301
302	U_NAMESPACE_END
303
304	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
305

Browse the source code of Godot/thirdparty/icu4c/common/brkeng.cpp