1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | *************************************************************************** |
5 | * Copyright (C) 1999-2014 International Business Machines Corporation * |
6 | * and others. All rights reserved. * |
7 | *************************************************************************** |
8 | */ |
9 | |
10 | #include "unicode/utypes.h" |
11 | |
12 | #if !UCONFIG_NO_BREAK_ITERATION |
13 | |
14 | #include "unicode/ucptrie.h" |
15 | #include "unicode/utypes.h" |
16 | #include "rbbidata.h" |
17 | #include "rbbirb.h" |
18 | #include "udatamem.h" |
19 | #include "cmemory.h" |
20 | #include "cstring.h" |
21 | #include "umutex.h" |
22 | |
23 | #include "uassert.h" |
24 | |
25 | |
26 | U_NAMESPACE_BEGIN |
27 | |
28 | //----------------------------------------------------------------------------- |
29 | // |
30 | // Constructors. |
31 | // |
32 | //----------------------------------------------------------------------------- |
33 | RBBIDataWrapper::(const RBBIDataHeader *data, UErrorCode &status) { |
34 | init0(); |
35 | init(data, status); |
36 | } |
37 | |
38 | RBBIDataWrapper::(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) { |
39 | init0(); |
40 | init(data, status); |
41 | fDontFreeData = true; |
42 | } |
43 | |
44 | RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) { |
45 | init0(); |
46 | if (U_FAILURE(status)) { |
47 | return; |
48 | } |
49 | const DataHeader *dh = udm->pHeader; |
50 | int32_t = dh->dataHeader.headerSize; |
51 | if ( !(headerSize >= 20 && |
52 | dh->info.isBigEndian == U_IS_BIG_ENDIAN && |
53 | dh->info.charsetFamily == U_CHARSET_FAMILY && |
54 | dh->info.dataFormat[0] == 0x42 && // dataFormat="Brk " |
55 | dh->info.dataFormat[1] == 0x72 && |
56 | dh->info.dataFormat[2] == 0x6b && |
57 | dh->info.dataFormat[3] == 0x20 && |
58 | isDataVersionAcceptable(dh->info.formatVersion)) |
59 | ) { |
60 | status = U_INVALID_FORMAT_ERROR; |
61 | return; |
62 | } |
63 | const char *dataAsBytes = reinterpret_cast<const char *>(dh); |
64 | const RBBIDataHeader *rbbidh = reinterpret_cast<const RBBIDataHeader *>(dataAsBytes + headerSize); |
65 | init(rbbidh, status); |
66 | fUDataMem = udm; |
67 | } |
68 | |
69 | UBool RBBIDataWrapper::isDataVersionAcceptable(const UVersionInfo version) { |
70 | return RBBI_DATA_FORMAT_VERSION[0] == version[0]; |
71 | } |
72 | |
73 | |
74 | //----------------------------------------------------------------------------- |
75 | // |
76 | // init(). Does most of the work of construction, shared between the |
77 | // constructors. |
78 | // |
79 | //----------------------------------------------------------------------------- |
80 | void RBBIDataWrapper::init0() { |
81 | fHeader = nullptr; |
82 | fForwardTable = nullptr; |
83 | fReverseTable = nullptr; |
84 | fRuleSource = nullptr; |
85 | fRuleStatusTable = nullptr; |
86 | fTrie = nullptr; |
87 | fUDataMem = nullptr; |
88 | fRefCount = 0; |
89 | fDontFreeData = true; |
90 | } |
91 | |
92 | void RBBIDataWrapper::(const RBBIDataHeader *data, UErrorCode &status) { |
93 | if (U_FAILURE(status)) { |
94 | return; |
95 | } |
96 | fHeader = data; |
97 | if (fHeader->fMagic != 0xb1a0 || !isDataVersionAcceptable(fHeader->fFormatVersion)) { |
98 | status = U_INVALID_FORMAT_ERROR; |
99 | return; |
100 | } |
101 | // Note: in ICU version 3.2 and earlier, there was a formatVersion 1 |
102 | // that is no longer supported. At that time fFormatVersion was |
103 | // an int32_t field, rather than an array of 4 bytes. |
104 | |
105 | fDontFreeData = false; |
106 | if (data->fFTableLen != 0) { |
107 | fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable); |
108 | } |
109 | if (data->fRTableLen != 0) { |
110 | fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable); |
111 | } |
112 | |
113 | fTrie = ucptrie_openFromBinary(UCPTRIE_TYPE_FAST, |
114 | UCPTRIE_VALUE_BITS_ANY, |
115 | (uint8_t *)data + fHeader->fTrie, |
116 | fHeader->fTrieLen, |
117 | nullptr, // *actual length |
118 | &status); |
119 | if (U_FAILURE(status)) { |
120 | return; |
121 | } |
122 | |
123 | UCPTrieValueWidth width = ucptrie_getValueWidth(fTrie); |
124 | if (!(width == UCPTRIE_VALUE_BITS_8 || width == UCPTRIE_VALUE_BITS_16)) { |
125 | status = U_INVALID_FORMAT_ERROR; |
126 | return; |
127 | } |
128 | |
129 | fRuleSource = ((char *)data + fHeader->fRuleSource); |
130 | fRuleString = UnicodeString::fromUTF8(StringPiece(fRuleSource, fHeader->fRuleSourceLen)); |
131 | U_ASSERT(data->fRuleSourceLen > 0); |
132 | |
133 | fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable); |
134 | fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t); |
135 | |
136 | fRefCount = 1; |
137 | |
138 | #ifdef RBBI_DEBUG |
139 | char *debugEnv = getenv("U_RBBIDEBUG" ); |
140 | if (debugEnv && uprv_strstr(debugEnv, "data" )) {this->printData();} |
141 | #endif |
142 | } |
143 | |
144 | |
145 | //----------------------------------------------------------------------------- |
146 | // |
147 | // Destructor. Don't call this - use removeReference() instead. |
148 | // |
149 | //----------------------------------------------------------------------------- |
150 | RBBIDataWrapper::~RBBIDataWrapper() { |
151 | U_ASSERT(fRefCount == 0); |
152 | ucptrie_close(fTrie); |
153 | fTrie = nullptr; |
154 | if (fUDataMem) { |
155 | udata_close(fUDataMem); |
156 | } else if (!fDontFreeData) { |
157 | uprv_free((void *)fHeader); |
158 | } |
159 | } |
160 | |
161 | |
162 | |
163 | //----------------------------------------------------------------------------- |
164 | // |
165 | // Operator == Consider two RBBIDataWrappers to be equal if they |
166 | // refer to the same underlying data. Although |
167 | // the data wrappers are normally shared between |
168 | // iterator instances, it's possible to independently |
169 | // open the same data twice, and get two instances, which |
170 | // should still be ==. |
171 | // |
172 | //----------------------------------------------------------------------------- |
173 | bool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const { |
174 | if (fHeader == other.fHeader) { |
175 | return true; |
176 | } |
177 | if (fHeader->fLength != other.fHeader->fLength) { |
178 | return false; |
179 | } |
180 | if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) { |
181 | return true; |
182 | } |
183 | return false; |
184 | } |
185 | |
186 | int32_t RBBIDataWrapper::hashCode() { |
187 | return fHeader->fFTableLen; |
188 | } |
189 | |
190 | |
191 | |
192 | //----------------------------------------------------------------------------- |
193 | // |
194 | // Reference Counting. A single RBBIDataWrapper object is shared among |
195 | // however many RulesBasedBreakIterator instances are |
196 | // referencing the same data. |
197 | // |
198 | //----------------------------------------------------------------------------- |
199 | void RBBIDataWrapper::removeReference() { |
200 | if (umtx_atomic_dec(&fRefCount) == 0) { |
201 | delete this; |
202 | } |
203 | } |
204 | |
205 | |
206 | RBBIDataWrapper *RBBIDataWrapper::addReference() { |
207 | umtx_atomic_inc(&fRefCount); |
208 | return this; |
209 | } |
210 | |
211 | |
212 | |
213 | //----------------------------------------------------------------------------- |
214 | // |
215 | // getRuleSourceString |
216 | // |
217 | //----------------------------------------------------------------------------- |
218 | const UnicodeString &RBBIDataWrapper::getRuleSourceString() const { |
219 | return fRuleString; |
220 | } |
221 | |
222 | |
223 | //----------------------------------------------------------------------------- |
224 | // |
225 | // print - debugging function to dump the runtime data tables. |
226 | // |
227 | //----------------------------------------------------------------------------- |
#ifdef RBBI_DEBUG
// Debug-only dump of one state transition table.
//   heading  title line printed before the table.
//   table    the state table to print; may be null, in which case only the
//            heading and a "null table" marker are printed.
// One row is printed per state, with one next-state column per character
// category (fHeader->fCatCount columns).
void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
    uint32_t   c;
    uint32_t   s;

    RBBIDebugPrintf("%s\n", heading);

    // The null check must come before any use of table->...; a table that is
    // absent from the data is represented by a null pointer.
    if (table == nullptr) {
        RBBIDebugPrintf(" N U L L T A B L E\n\n");
        return;
    }

    RBBIDebugPrintf(" fDictCategoriesStart: %d\n", table->fDictCategoriesStart);
    RBBIDebugPrintf(" fLookAheadResultsSize: %d\n", table->fLookAheadResultsSize);
    RBBIDebugPrintf(" Flags: %4x RBBI_LOOKAHEAD_HARD_BREAK=%s RBBI_BOF_REQUIRED=%s RBBI_8BITS_ROWS=%s\n",
                    table->fFlags,
                    table->fFlags & RBBI_LOOKAHEAD_HARD_BREAK ? "T" : "F",
                    table->fFlags & RBBI_BOF_REQUIRED ? "T" : "F",
                    table->fFlags & RBBI_8BITS_ROWS ? "T" : "F");

    // Column headings: fixed columns, then one column per character category.
    RBBIDebugPrintf("\nState | Acc LA TagIx");
    for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
    RBBIDebugPrintf("\n------|---------------");
    for (c=0; c<fHeader->fCatCount; c++) {
        RBBIDebugPrintf("----");
    }
    RBBIDebugPrintf("\n");

    // Rows are fRowLen bytes apart; the flag selects the 8 bit or 16 bit row layout.
    UBool use8Bits = table->fFlags & RBBI_8BITS_ROWS;
    for (s=0; s<table->fNumStates; s++) {
        RBBIStateTableRow *row = (RBBIStateTableRow *)
                                  (table->fTableData + (table->fRowLen * s));
        if (use8Bits) {
            RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r8.fAccepting, row->r8.fLookAhead, row->r8.fTagsIdx);
            for (c=0; c<fHeader->fCatCount; c++)  {
                RBBIDebugPrintf("%3d ", row->r8.fNextState[c]);
            }
        } else {
            RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r16.fAccepting, row->r16.fLookAhead, row->r16.fTagsIdx);
            for (c=0; c<fHeader->fCatCount; c++)  {
                RBBIDebugPrintf("%3d ", row->r16.fNextState[c]);
            }
        }
        RBBIDebugPrintf("\n");
    }
    RBBIDebugPrintf("\n");
}
#endif
273 | |
274 | |
275 | void RBBIDataWrapper::printData() { |
276 | #ifdef RBBI_DEBUG |
277 | RBBIDebugPrintf("RBBI Data at %p\n" , (void *)fHeader); |
278 | RBBIDebugPrintf(" Version = {%d %d %d %d}\n" , fHeader->fFormatVersion[0], fHeader->fFormatVersion[1], |
279 | fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]); |
280 | RBBIDebugPrintf(" total length of data = %d\n" , fHeader->fLength); |
281 | RBBIDebugPrintf(" number of character categories = %d\n\n" , fHeader->fCatCount); |
282 | |
283 | printTable("Forward State Transition Table" , fForwardTable); |
284 | printTable("Reverse State Transition Table" , fReverseTable); |
285 | |
286 | RBBIDebugPrintf("\nOriginal Rules source:\n" ); |
287 | for (int32_t c=0; fRuleSource[c] != 0; c++) { |
288 | RBBIDebugPrintf("%c" , fRuleSource[c]); |
289 | } |
290 | RBBIDebugPrintf("\n\n" ); |
291 | #endif |
292 | } |
293 | |
294 | |
295 | U_NAMESPACE_END |
296 | U_NAMESPACE_USE |
297 | |
298 | //----------------------------------------------------------------------------- |
299 | // |
300 | // ubrk_swap - byte swap and char encoding swap of RBBI data |
301 | // |
302 | //----------------------------------------------------------------------------- |
303 | |
304 | U_CAPI int32_t U_EXPORT2 |
305 | ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, |
306 | UErrorCode *status) { |
307 | |
308 | if (status == nullptr || U_FAILURE(*status)) { |
309 | return 0; |
310 | } |
311 | if(ds==nullptr || inData==nullptr || length<-1 || (length>0 && outData==nullptr)) { |
312 | *status=U_ILLEGAL_ARGUMENT_ERROR; |
313 | return 0; |
314 | } |
315 | |
316 | // |
317 | // Check that the data header is for for break data. |
318 | // (Header contents are defined in genbrk.cpp) |
319 | // |
320 | const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); |
321 | if(!( pInfo->dataFormat[0]==0x42 && /* dataFormat="Brk " */ |
322 | pInfo->dataFormat[1]==0x72 && |
323 | pInfo->dataFormat[2]==0x6b && |
324 | pInfo->dataFormat[3]==0x20 && |
325 | RBBIDataWrapper::isDataVersionAcceptable(pInfo->formatVersion) )) { |
326 | udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n" , |
327 | pInfo->dataFormat[0], pInfo->dataFormat[1], |
328 | pInfo->dataFormat[2], pInfo->dataFormat[3], |
329 | pInfo->formatVersion[0]); |
330 | *status=U_UNSUPPORTED_ERROR; |
331 | return 0; |
332 | } |
333 | |
334 | // |
335 | // Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific |
336 | // RBBIDataHeader). This swap also conveniently gets us |
337 | // the size of the ICU d.h., which lets us locate the start |
338 | // of the RBBI specific data. |
339 | // |
340 | int32_t =udata_swapDataHeader(ds, inData, length, outData, status); |
341 | |
342 | |
343 | // |
344 | // Get the RRBI Data Header, and check that it appears to be OK. |
345 | // |
346 | const uint8_t *inBytes =(const uint8_t *)inData+headerSize; |
347 | RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes; |
348 | if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 || |
349 | !RBBIDataWrapper::isDataVersionAcceptable(rbbiDH->fFormatVersion) || |
350 | ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader)) { |
351 | udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n" ); |
352 | *status=U_UNSUPPORTED_ERROR; |
353 | return 0; |
354 | } |
355 | |
356 | // |
357 | // Prefight operation? Just return the size |
358 | // |
359 | int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength); |
360 | int32_t totalSize = headerSize + breakDataLength; |
361 | if (length < 0) { |
362 | return totalSize; |
363 | } |
364 | |
365 | // |
366 | // Check that length passed in is consistent with length from RBBI data header. |
367 | // |
368 | if (length < totalSize) { |
369 | udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n" , |
370 | breakDataLength); |
371 | *status=U_INDEX_OUTOFBOUNDS_ERROR; |
372 | return 0; |
373 | } |
374 | |
375 | |
376 | // |
377 | // Swap the Data. Do the data itself first, then the RBBI Data Header, because |
378 | // we need to reference the header to locate the data, and an |
379 | // inplace swap of the header leaves it unusable. |
380 | // |
381 | uint8_t *outBytes = (uint8_t *)outData + headerSize; |
382 | RBBIDataHeader *outputDH = (RBBIDataHeader *)outBytes; |
383 | |
384 | int32_t tableStartOffset; |
385 | int32_t tableLength; |
386 | |
387 | // |
388 | // If not swapping in place, zero out the output buffer before starting. |
389 | // Individual tables and other data items within are aligned to 8 byte boundaries |
390 | // when originally created. Any unused space between items needs to be zero. |
391 | // |
392 | if (inBytes != outBytes) { |
393 | uprv_memset(outBytes, 0, breakDataLength); |
394 | } |
395 | |
396 | // |
397 | // Each state table begins with several 32 bit fields. Calculate the size |
398 | // in bytes of these. |
399 | // |
400 | int32_t topSize = offsetof(RBBIStateTable, fTableData); |
401 | |
402 | // Forward state table. |
403 | tableStartOffset = ds->readUInt32(rbbiDH->fFTable); |
404 | tableLength = ds->readUInt32(rbbiDH->fFTableLen); |
405 | |
406 | if (tableLength > 0) { |
407 | RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset); |
408 | UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS; |
409 | |
410 | ds->swapArray32(ds, inBytes+tableStartOffset, topSize, |
411 | outBytes+tableStartOffset, status); |
412 | |
413 | // Swap the state table if the table is in 16 bits. |
414 | if (use8Bits) { |
415 | if (outBytes != inBytes) { |
416 | uprv_memmove(outBytes+tableStartOffset+topSize, |
417 | inBytes+tableStartOffset+topSize, |
418 | tableLength-topSize); |
419 | } |
420 | } else { |
421 | ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, |
422 | outBytes+tableStartOffset+topSize, status); |
423 | } |
424 | } |
425 | |
426 | // Reverse state table. Same layout as forward table, above. |
427 | tableStartOffset = ds->readUInt32(rbbiDH->fRTable); |
428 | tableLength = ds->readUInt32(rbbiDH->fRTableLen); |
429 | |
430 | if (tableLength > 0) { |
431 | RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset); |
432 | UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS; |
433 | |
434 | ds->swapArray32(ds, inBytes+tableStartOffset, topSize, |
435 | outBytes+tableStartOffset, status); |
436 | |
437 | // Swap the state table if the table is in 16 bits. |
438 | if (use8Bits) { |
439 | if (outBytes != inBytes) { |
440 | uprv_memmove(outBytes+tableStartOffset+topSize, |
441 | inBytes+tableStartOffset+topSize, |
442 | tableLength-topSize); |
443 | } |
444 | } else { |
445 | ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, |
446 | outBytes+tableStartOffset+topSize, status); |
447 | } |
448 | } |
449 | |
450 | // Trie table for character categories |
451 | ucptrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen), |
452 | outBytes+ds->readUInt32(rbbiDH->fTrie), status); |
453 | |
454 | // Source Rules Text. It's UTF8 data |
455 | if (outBytes != inBytes) { |
456 | uprv_memmove(outBytes+ds->readUInt32(rbbiDH->fRuleSource), |
457 | inBytes+ds->readUInt32(rbbiDH->fRuleSource), |
458 | ds->readUInt32(rbbiDH->fRuleSourceLen)); |
459 | } |
460 | |
461 | // Table of rule status values. It's all int_32 values |
462 | ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen), |
463 | outBytes+ds->readUInt32(rbbiDH->fStatusTable), status); |
464 | |
465 | // And, last, the header. |
466 | // It is all int32_t values except for fFormataVersion, which is an array of four bytes. |
467 | // Swap the whole thing as int32_t, then re-swap the one field. |
468 | // |
469 | ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status); |
470 | ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status); |
471 | |
472 | return totalSize; |
473 | } |
474 | |
475 | |
476 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
477 | |