// rbbidata.cpp source code [Godot/thirdparty/icu4c/common/rbbidata.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	***************************************************************************
5	* Copyright (C) 1999-2014 International Business Machines Corporation *
6	* and others. All rights reserved. *
7	***************************************************************************
8	*/
9
10	#include "unicode/utypes.h"
11
12	#if !UCONFIG_NO_BREAK_ITERATION
13
14	#include "unicode/ucptrie.h"
15	#include "unicode/utypes.h"
16	#include "rbbidata.h"
17	#include "rbbirb.h"
18	#include "udatamem.h"
19	#include "cmemory.h"
20	#include "cstring.h"
21	#include "umutex.h"
22
23	#include "uassert.h"
24
25
26	U_NAMESPACE_BEGIN
27
28	//-----------------------------------------------------------------------------
29	//
30	// Constructors.
31	//
32	//-----------------------------------------------------------------------------
33	RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
34	init0();
35	init(data, status);
36	}
37
38	RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader data, enum* EDontAdopt, UErrorCode &status) {
39	init0();
40	init(data, status);
41	fDontFreeData = true;
42	}
43
44	RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
45	init0();
46	if (U_FAILURE(status)) {
47	return;
48	}
49	const DataHeader *dh = udm->pHeader;
50	int32_t headerSize = dh->dataHeader.headerSize;
51	if ( !(headerSize >= `20` &&
52	dh->info.isBigEndian == U_IS_BIG_ENDIAN &&
53	dh->info.charsetFamily == U_CHARSET_FAMILY &&
54	dh->info.dataFormat[`0`] == `0x42` && // dataFormat="Brk "
55	dh->info.dataFormat[`1`] == `0x72` &&
56	dh->info.dataFormat[`2`] == `0x6b` &&
57	dh->info.dataFormat[`3`] == `0x20` &&
58	isDataVersionAcceptable(dh->info.formatVersion))
59	) {
60	status = U_INVALID_FORMAT_ERROR;
61	return;
62	}
63	const char dataAsBytes = reinterpret_cast<const* char *>(dh);
64	const RBBIDataHeader rbbidh = reinterpret_cast<const* RBBIDataHeader *>(dataAsBytes + headerSize);
65	init(rbbidh, status);
66	fUDataMem = udm;
67	}
68
69	UBool RBBIDataWrapper::isDataVersionAcceptable(const UVersionInfo version) {
70	return RBBI_DATA_FORMAT_VERSION[`0`] == version[`0`];
71	}
72
73
74	//-----------------------------------------------------------------------------
75	//
76	// init(). Does most of the work of construction, shared between the
77	// constructors.
78	//
79	//-----------------------------------------------------------------------------
80	void RBBIDataWrapper::init0() {
81	fHeader = nullptr;
82	fForwardTable = nullptr;
83	fReverseTable = nullptr;
84	fRuleSource = nullptr;
85	fRuleStatusTable = nullptr;
86	fTrie = nullptr;
87	fUDataMem = nullptr;
88	fRefCount = `0`;
89	fDontFreeData = true;
90	}
91
92	void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
93	if (U_FAILURE(status)) {
94	return;
95	}
96	fHeader = data;
97	if (fHeader->fMagic != `0xb1a0` \|\| !isDataVersionAcceptable(fHeader->fFormatVersion)) {
98	status = U_INVALID_FORMAT_ERROR;
99	return;
100	}
101	// Note: in ICU version 3.2 and earlier, there was a formatVersion 1
102	// that is no longer supported. At that time fFormatVersion was
103	// an int32_t field, rather than an array of 4 bytes.
104
105	fDontFreeData = false;
106	if (data->fFTableLen != `0`) {
107	fForwardTable = (RBBIStateTable )((char* *)data + fHeader->fFTable);
108	}
109	if (data->fRTableLen != `0`) {
110	fReverseTable = (RBBIStateTable )((char* *)data + fHeader->fRTable);
111	}
112
113	fTrie = ucptrie_openFromBinary(UCPTRIE_TYPE_FAST,
114	UCPTRIE_VALUE_BITS_ANY,
115	(uint8_t *)data + fHeader->fTrie,
116	fHeader->fTrieLen,
117	nullptr, // actual length*
118	&status);
119	if (U_FAILURE(status)) {
120	return;
121	}
122
123	UCPTrieValueWidth width = ucptrie_getValueWidth(fTrie);
124	if (!(width == UCPTRIE_VALUE_BITS_8 \|\| width == UCPTRIE_VALUE_BITS_16)) {
125	status = U_INVALID_FORMAT_ERROR;
126	return;
127	}
128
129	fRuleSource = ((char *)data + fHeader->fRuleSource);
130	fRuleString = UnicodeString::fromUTF8(StringPiece (fRuleSource, fHeader->fRuleSourceLen));
131	U_ASSERT(data->fRuleSourceLen > `0`);
132
133	fRuleStatusTable = (int32_t )((char* *)data + fHeader->fStatusTable);
134	fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t);
135
136	fRefCount = `1`;
137
138	#ifdef RBBI_DEBUG
139	char *debugEnv = getenv("U_RBBIDEBUG");
140	if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
141	#endif
142	}
143
144
145	//-----------------------------------------------------------------------------
146	//
147	// Destructor. Don't call this - use removeReference() instead.
148	//
149	//-----------------------------------------------------------------------------
150	RBBIDataWrapper::~RBBIDataWrapper() {
151	U_ASSERT(fRefCount == `0`);
152	ucptrie_close(fTrie);
153	fTrie = nullptr;
154	if (fUDataMem) {
155	udata_close(fUDataMem);
156	} else if (!fDontFreeData) {
157	uprv_free((void *)fHeader);
158	}
159	}
160
161
162
163	//-----------------------------------------------------------------------------
164	//
165	// Operator == Consider two RBBIDataWrappers to be equal if they
166	// refer to the same underlying data. Although
167	// the data wrappers are normally shared between
168	// iterator instances, it's possible to independently
169	// open the same data twice, and get two instances, which
170	// should still be ==.
171	//
172	//-----------------------------------------------------------------------------
173	bool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
174	if (fHeader == other.fHeader) {
175	return true;
176	}
177	if (fHeader->fLength != other.fHeader->fLength) {
178	return false;
179	}
180	if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == `0`) {
181	return true;
182	}
183	return false;
184	}
185
186	int32_t RBBIDataWrapper::hashCode() {
187	return fHeader->fFTableLen;
188	}
189
190
191
192	//-----------------------------------------------------------------------------
193	//
194	// Reference Counting. A single RBBIDataWrapper object is shared among
195	// however many RulesBasedBreakIterator instances are
196	// referencing the same data.
197	//
198	//-----------------------------------------------------------------------------
199	void RBBIDataWrapper::removeReference() {
200	if (umtx_atomic_dec(&fRefCount) == `0`) {
201	delete this;
202	}
203	}
204
205
206	RBBIDataWrapper *RBBIDataWrapper::addReference() {
207	umtx_atomic_inc(&fRefCount);
208	return this;
209	}
210
211
212
213	//-----------------------------------------------------------------------------
214	//
215	// getRuleSourceString
216	//
217	//-----------------------------------------------------------------------------
218	const UnicodeString &RBBIDataWrapper::getRuleSourceString() const {
219	return fRuleString;
220	}
221
222
223	//-----------------------------------------------------------------------------
224	//
225	// print - debugging function to dump the runtime data tables.
226	//
227	//-----------------------------------------------------------------------------
228	#ifdef RBBI_DEBUG
229	void RBBIDataWrapper::printTable(const char heading, const* RBBIStateTable *table) {
230	uint32_t c;
231	uint32_t s;
232
233	RBBIDebugPrintf("%s\n", heading);
234
235	RBBIDebugPrintf(" fDictCategoriesStart: %d\n", table->fDictCategoriesStart);
236	RBBIDebugPrintf(" fLookAheadResultsSize: %d\n", table->fLookAheadResultsSize);
237	RBBIDebugPrintf(" Flags: %4x RBBI_LOOKAHEAD_HARD_BREAK=%s RBBI_BOF_REQUIRED=%s RBBI_8BITS_ROWS=%s\n",
238	table->fFlags,
239	table->fFlags & RBBI_LOOKAHEAD_HARD_BREAK ? "T" : "F",
240	table->fFlags & RBBI_BOF_REQUIRED ? "T" : "F",
241	table->fFlags & RBBI_8BITS_ROWS ? "T" : "F");
242	RBBIDebugPrintf("\nState \| Acc LA TagIx");
243	for (c=`0`; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
244	RBBIDebugPrintf("\n------\|---------------"); for (c=`0`;c<fHeader->fCatCount; c++) {
245	RBBIDebugPrintf("----");
246	}
247	RBBIDebugPrintf("\n");
248
249	if (table == nullptr) {
250	RBBIDebugPrintf(" N U L L T A B L E\n\n");
251	return;
252	}
253	UBool use8Bits = table->fFlags & RBBI_8BITS_ROWS;
254	for (s=`0`; s<table->fNumStates; s++) {
255	RBBIStateTableRow row = (RBBIStateTableRow )
256	(table->fTableData + (table->fRowLen * s));
257	if (use8Bits) {
258	RBBIDebugPrintf("%4d \| %3d %3d %3d ", s, row->r8.fAccepting, row->r8.fLookAhead, row->r8.fTagsIdx);
259	for (c=`0`; c<fHeader->fCatCount; c++) {
260	RBBIDebugPrintf("%3d ", row->r8.fNextState[c]);
261	}
262	} else {
263	RBBIDebugPrintf("%4d \| %3d %3d %3d ", s, row->r16.fAccepting, row->r16.fLookAhead, row->r16.fTagsIdx);
264	for (c=`0`; c<fHeader->fCatCount; c++) {
265	RBBIDebugPrintf("%3d ", row->r16.fNextState[c]);
266	}
267	}
268	RBBIDebugPrintf("\n");
269	}
270	RBBIDebugPrintf("\n");
271	}
272	#endif
273
274
275	void RBBIDataWrapper::printData() {
276	#ifdef RBBI_DEBUG
277	RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
278	RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[`0`], fHeader->fFormatVersion[`1`],
279	fHeader->fFormatVersion[`2`], fHeader->fFormatVersion[`3`]);
280	RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength);
281	RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount);
282
283	printTable("Forward State Transition Table", fForwardTable);
284	printTable("Reverse State Transition Table", fReverseTable);
285
286	RBBIDebugPrintf("\nOriginal Rules source:\n");
287	for (int32_t c=`0`; fRuleSource[c] != `0`; c++) {
288	RBBIDebugPrintf("%c", fRuleSource[c]);
289	}
290	RBBIDebugPrintf("\n\n");
291	#endif
292	}
293
294
295	U_NAMESPACE_END
296	U_NAMESPACE_USE
297
298	//-----------------------------------------------------------------------------
299	//
300	// ubrk_swap - byte swap and char encoding swap of RBBI data
301	//
302	//-----------------------------------------------------------------------------
303
304	U_CAPI int32_t U_EXPORT2
305	ubrk_swap(const UDataSwapper ds, const* void inData, int32_t length, void* *outData,
306	UErrorCode *status) {
307
308	if (status == nullptr \|\| U_FAILURE(*status)) {
309	return `0`;
310	}
311	if(ds==nullptr \|\| inData==nullptr \|\| length<-`1` \|\| (length>`0` && outData==nullptr)) {
312	*status=U_ILLEGAL_ARGUMENT_ERROR;
313	return `0`;
314	}
315
316	//
317	// Check that the data header is for for break data.
318	// (Header contents are defined in genbrk.cpp)
319	//
320	const UDataInfo pInfo = (const* UDataInfo )((const* char *)inData+`4`);
321	if(!( pInfo->dataFormat[`0`]==`0x42` && / dataFormat="Brk " /
322	pInfo->dataFormat[`1`]==`0x72` &&
323	pInfo->dataFormat[`2`]==`0x6b` &&
324	pInfo->dataFormat[`3`]==`0x20` &&
325	RBBIDataWrapper::isDataVersionAcceptable(pInfo->formatVersion) )) {
326	udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
327	pInfo->dataFormat[`0`], pInfo->dataFormat[`1`],
328	pInfo->dataFormat[`2`], pInfo->dataFormat[`3`],
329	pInfo->formatVersion[`0`]);
330	*status=U_UNSUPPORTED_ERROR;
331	return `0`;
332	}
333
334	//
335	// Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific
336	// RBBIDataHeader). This swap also conveniently gets us
337	// the size of the ICU d.h., which lets us locate the start
338	// of the RBBI specific data.
339	//
340	int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
341
342
343	//
344	// Get the RRBI Data Header, and check that it appears to be OK.
345	//
346	const uint8_t inBytes =(const* uint8_t *)inData+headerSize;
347	RBBIDataHeader rbbiDH = (RBBIDataHeader )inBytes;
348	if (ds->readUInt32(rbbiDH->fMagic) != `0xb1a0` \|\|
349	!RBBIDataWrapper::isDataVersionAcceptable(rbbiDH->fFormatVersion) \|\|
350	ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader)) {
351	udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
352	*status=U_UNSUPPORTED_ERROR;
353	return `0`;
354	}
355
356	//
357	// Prefight operation? Just return the size
358	//
359	int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength);
360	int32_t totalSize = headerSize + breakDataLength;
361	if (length < `0`) {
362	return totalSize;
363	}
364
365	//
366	// Check that length passed in is consistent with length from RBBI data header.
367	//
368	if (length < totalSize) {
369	udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
370	breakDataLength);
371	*status=U_INDEX_OUTOFBOUNDS_ERROR;
372	return `0`;
373	}
374
375
376	//
377	// Swap the Data. Do the data itself first, then the RBBI Data Header, because
378	// we need to reference the header to locate the data, and an
379	// inplace swap of the header leaves it unusable.
380	//
381	uint8_t outBytes = (uint8_t )outData + headerSize;
382	RBBIDataHeader outputDH = (RBBIDataHeader )outBytes;
383
384	int32_t tableStartOffset;
385	int32_t tableLength;
386
387	//
388	// If not swapping in place, zero out the output buffer before starting.
389	// Individual tables and other data items within are aligned to 8 byte boundaries
390	// when originally created. Any unused space between items needs to be zero.
391	//
392	if (inBytes != outBytes) {
393	uprv_memset(outBytes, `0`, breakDataLength);
394	}
395
396	//
397	// Each state table begins with several 32 bit fields. Calculate the size
398	// in bytes of these.
399	//
400	int32_t topSize = offsetof(RBBIStateTable, fTableData);
401
402	// Forward state table.
403	tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
404	tableLength = ds->readUInt32(rbbiDH->fFTableLen);
405
406	if (tableLength > `0`) {
407	RBBIStateTable rbbiST = (RBBIStateTable )(inBytes+tableStartOffset);
408	UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS;
409
410	ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
411	outBytes+tableStartOffset, status);
412
413	// Swap the state table if the table is in 16 bits.
414	if (use8Bits) {
415	if (outBytes != inBytes) {
416	uprv_memmove(outBytes+tableStartOffset+topSize,
417	inBytes+tableStartOffset+topSize,
418	tableLength-topSize);
419	}
420	} else {
421	ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
422	outBytes+tableStartOffset+topSize, status);
423	}
424	}
425
426	// Reverse state table. Same layout as forward table, above.
427	tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
428	tableLength = ds->readUInt32(rbbiDH->fRTableLen);
429
430	if (tableLength > `0`) {
431	RBBIStateTable rbbiST = (RBBIStateTable )(inBytes+tableStartOffset);
432	UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS;
433
434	ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
435	outBytes+tableStartOffset, status);
436
437	// Swap the state table if the table is in 16 bits.
438	if (use8Bits) {
439	if (outBytes != inBytes) {
440	uprv_memmove(outBytes+tableStartOffset+topSize,
441	inBytes+tableStartOffset+topSize,
442	tableLength-topSize);
443	}
444	} else {
445	ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
446	outBytes+tableStartOffset+topSize, status);
447	}
448	}
449
450	// Trie table for character categories
451	ucptrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
452	outBytes+ds->readUInt32(rbbiDH->fTrie), status);
453
454	// Source Rules Text. It's UTF8 data
455	if (outBytes != inBytes) {
456	uprv_memmove(outBytes+ds->readUInt32(rbbiDH->fRuleSource),
457	inBytes+ds->readUInt32(rbbiDH->fRuleSource),
458	ds->readUInt32(rbbiDH->fRuleSourceLen));
459	}
460
461	// Table of rule status values. It's all int_32 values
462	ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
463	outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
464
465	// And, last, the header.
466	// It is all int32_t values except for fFormataVersion, which is an array of four bytes.
467	// Swap the whole thing as int32_t, then re-swap the one field.
468	//
469	ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);
470	ds->swapArray32(ds, outputDH->fFormatVersion, `4`, outputDH->fFormatVersion, status);
471
472	return totalSize;
473	}
474
475
476	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
477

// Browse the source code of Godot/thirdparty/icu4c/common/rbbidata.cpp