1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | *************************************************************************** |
5 | * Copyright (C) 1999-2014 International Business Machines Corporation * |
6 | * and others. All rights reserved. * |
7 | *************************************************************************** |
8 | */ |
9 | |
10 | #include "unicode/utypes.h" |
11 | |
12 | #if !UCONFIG_NO_BREAK_ITERATION |
13 | |
14 | #include "unicode/utypes.h" |
15 | #include "rbbidata.h" |
16 | #include "rbbirb.h" |
17 | #include "utrie2.h" |
18 | #include "udatamem.h" |
19 | #include "cmemory.h" |
20 | #include "cstring.h" |
21 | #include "umutex.h" |
22 | |
23 | #include "uassert.h" |
24 | |
25 | |
26 | U_NAMESPACE_BEGIN |
27 | |
28 | //----------------------------------------------------------------------------- |
29 | // |
30 | // Constructors. |
31 | // |
32 | //----------------------------------------------------------------------------- |
33 | RBBIDataWrapper::(const RBBIDataHeader *data, UErrorCode &status) { |
34 | init0(); |
35 | init(data, status); |
36 | } |
37 | |
38 | RBBIDataWrapper::(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) { |
39 | init0(); |
40 | init(data, status); |
41 | fDontFreeData = TRUE; |
42 | } |
43 | |
44 | RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) { |
45 | init0(); |
46 | if (U_FAILURE(status)) { |
47 | return; |
48 | } |
49 | const DataHeader *dh = udm->pHeader; |
50 | int32_t = dh->dataHeader.headerSize; |
51 | if ( !(headerSize >= 20 && |
52 | dh->info.isBigEndian == U_IS_BIG_ENDIAN && |
53 | dh->info.charsetFamily == U_CHARSET_FAMILY && |
54 | dh->info.dataFormat[0] == 0x42 && // dataFormat="Brk " |
55 | dh->info.dataFormat[1] == 0x72 && |
56 | dh->info.dataFormat[2] == 0x6b && |
57 | dh->info.dataFormat[3] == 0x20 && |
58 | isDataVersionAcceptable(dh->info.formatVersion)) |
59 | ) { |
60 | status = U_INVALID_FORMAT_ERROR; |
61 | return; |
62 | } |
63 | const char *dataAsBytes = reinterpret_cast<const char *>(dh); |
64 | const RBBIDataHeader *rbbidh = reinterpret_cast<const RBBIDataHeader *>(dataAsBytes + headerSize); |
65 | init(rbbidh, status); |
66 | fUDataMem = udm; |
67 | } |
68 | |
69 | UBool RBBIDataWrapper::isDataVersionAcceptable(const UVersionInfo version) { |
70 | return RBBI_DATA_FORMAT_VERSION[0] == version[0]; |
71 | } |
72 | |
73 | |
74 | //----------------------------------------------------------------------------- |
75 | // |
76 | // init(). Does most of the work of construction, shared between the |
77 | // constructors. |
78 | // |
79 | //----------------------------------------------------------------------------- |
80 | void RBBIDataWrapper::init0() { |
81 | fHeader = NULL; |
82 | fForwardTable = NULL; |
83 | fReverseTable = NULL; |
84 | fRuleSource = NULL; |
85 | fRuleStatusTable = NULL; |
86 | fTrie = NULL; |
87 | fUDataMem = NULL; |
88 | fRefCount = 0; |
89 | fDontFreeData = TRUE; |
90 | } |
91 | |
92 | void RBBIDataWrapper::(const RBBIDataHeader *data, UErrorCode &status) { |
93 | if (U_FAILURE(status)) { |
94 | return; |
95 | } |
96 | fHeader = data; |
97 | if (fHeader->fMagic != 0xb1a0 || !isDataVersionAcceptable(fHeader->fFormatVersion)) { |
98 | status = U_INVALID_FORMAT_ERROR; |
99 | return; |
100 | } |
101 | // Note: in ICU version 3.2 and earlier, there was a formatVersion 1 |
102 | // that is no longer supported. At that time fFormatVersion was |
103 | // an int32_t field, rather than an array of 4 bytes. |
104 | |
105 | fDontFreeData = FALSE; |
106 | if (data->fFTableLen != 0) { |
107 | fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable); |
108 | } |
109 | if (data->fRTableLen != 0) { |
110 | fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable); |
111 | } |
112 | |
113 | fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, |
114 | (uint8_t *)data + fHeader->fTrie, |
115 | fHeader->fTrieLen, |
116 | NULL, // *actual length |
117 | &status); |
118 | if (U_FAILURE(status)) { |
119 | return; |
120 | } |
121 | |
122 | fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource); |
123 | fRuleString.setTo(TRUE, fRuleSource, -1); |
124 | U_ASSERT(data->fRuleSourceLen > 0); |
125 | |
126 | fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable); |
127 | fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t); |
128 | |
129 | fRefCount = 1; |
130 | |
131 | #ifdef RBBI_DEBUG |
132 | char *debugEnv = getenv("U_RBBIDEBUG" ); |
133 | if (debugEnv && uprv_strstr(debugEnv, "data" )) {this->printData();} |
134 | #endif |
135 | } |
136 | |
137 | |
138 | //----------------------------------------------------------------------------- |
139 | // |
140 | // Destructor. Don't call this - use removeReference() instead. |
141 | // |
142 | //----------------------------------------------------------------------------- |
143 | RBBIDataWrapper::~RBBIDataWrapper() { |
144 | U_ASSERT(fRefCount == 0); |
145 | utrie2_close(fTrie); |
146 | fTrie = NULL; |
147 | if (fUDataMem) { |
148 | udata_close(fUDataMem); |
149 | } else if (!fDontFreeData) { |
150 | uprv_free((void *)fHeader); |
151 | } |
152 | } |
153 | |
154 | |
155 | |
156 | //----------------------------------------------------------------------------- |
157 | // |
158 | // Operator == Consider two RBBIDataWrappers to be equal if they |
159 | // refer to the same underlying data. Although |
160 | // the data wrappers are normally shared between |
161 | // iterator instances, it's possible to independently |
162 | // open the same data twice, and get two instances, which |
163 | // should still be ==. |
164 | // |
165 | //----------------------------------------------------------------------------- |
166 | UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const { |
167 | if (fHeader == other.fHeader) { |
168 | return TRUE; |
169 | } |
170 | if (fHeader->fLength != other.fHeader->fLength) { |
171 | return FALSE; |
172 | } |
173 | if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) { |
174 | return TRUE; |
175 | } |
176 | return FALSE; |
177 | } |
178 | |
179 | int32_t RBBIDataWrapper::hashCode() { |
180 | return fHeader->fFTableLen; |
181 | } |
182 | |
183 | |
184 | |
185 | //----------------------------------------------------------------------------- |
186 | // |
187 | // Reference Counting. A single RBBIDataWrapper object is shared among |
188 | // however many RulesBasedBreakIterator instances are |
189 | // referencing the same data. |
190 | // |
191 | //----------------------------------------------------------------------------- |
192 | void RBBIDataWrapper::removeReference() { |
193 | if (umtx_atomic_dec(&fRefCount) == 0) { |
194 | delete this; |
195 | } |
196 | } |
197 | |
198 | |
199 | RBBIDataWrapper *RBBIDataWrapper::addReference() { |
200 | umtx_atomic_inc(&fRefCount); |
201 | return this; |
202 | } |
203 | |
204 | |
205 | |
206 | //----------------------------------------------------------------------------- |
207 | // |
208 | // getRuleSourceString |
209 | // |
210 | //----------------------------------------------------------------------------- |
211 | const UnicodeString &RBBIDataWrapper::getRuleSourceString() const { |
212 | return fRuleString; |
213 | } |
214 | |
215 | |
216 | //----------------------------------------------------------------------------- |
217 | // |
218 | // print - debugging function to dump the runtime data tables. |
219 | // |
220 | //----------------------------------------------------------------------------- |
221 | #ifdef RBBI_DEBUG |
222 | void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) { |
223 | uint32_t c; |
224 | uint32_t s; |
225 | |
226 | RBBIDebugPrintf(" %s\n" , heading); |
227 | |
228 | RBBIDebugPrintf("State | Acc LA TagIx" ); |
229 | for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d " , c);} |
230 | RBBIDebugPrintf("\n------|---------------" ); for (c=0;c<fHeader->fCatCount; c++) { |
231 | RBBIDebugPrintf("----" ); |
232 | } |
233 | RBBIDebugPrintf("\n" ); |
234 | |
235 | if (table == NULL) { |
236 | RBBIDebugPrintf(" N U L L T A B L E\n\n" ); |
237 | return; |
238 | } |
239 | for (s=0; s<table->fNumStates; s++) { |
240 | RBBIStateTableRow *row = (RBBIStateTableRow *) |
241 | (table->fTableData + (table->fRowLen * s)); |
242 | RBBIDebugPrintf("%4d | %3d %3d %3d " , s, row->fAccepting, row->fLookAhead, row->fTagIdx); |
243 | for (c=0; c<fHeader->fCatCount; c++) { |
244 | RBBIDebugPrintf("%3d " , row->fNextState[c]); |
245 | } |
246 | RBBIDebugPrintf("\n" ); |
247 | } |
248 | RBBIDebugPrintf("\n" ); |
249 | } |
250 | #endif |
251 | |
252 | |
253 | void RBBIDataWrapper::printData() { |
254 | #ifdef RBBI_DEBUG |
255 | RBBIDebugPrintf("RBBI Data at %p\n" , (void *)fHeader); |
256 | RBBIDebugPrintf(" Version = {%d %d %d %d}\n" , fHeader->fFormatVersion[0], fHeader->fFormatVersion[1], |
257 | fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]); |
258 | RBBIDebugPrintf(" total length of data = %d\n" , fHeader->fLength); |
259 | RBBIDebugPrintf(" number of character categories = %d\n\n" , fHeader->fCatCount); |
260 | |
261 | printTable("Forward State Transition Table" , fForwardTable); |
262 | printTable("Reverse State Transition Table" , fReverseTable); |
263 | |
264 | RBBIDebugPrintf("\nOrignal Rules source:\n" ); |
265 | for (int32_t c=0; fRuleSource[c] != 0; c++) { |
266 | RBBIDebugPrintf("%c" , fRuleSource[c]); |
267 | } |
268 | RBBIDebugPrintf("\n\n" ); |
269 | #endif |
270 | } |
271 | |
272 | |
273 | U_NAMESPACE_END |
274 | U_NAMESPACE_USE |
275 | |
276 | //----------------------------------------------------------------------------- |
277 | // |
278 | // ubrk_swap - byte swap and char encoding swap of RBBI data |
279 | // |
280 | //----------------------------------------------------------------------------- |
281 | |
282 | U_CAPI int32_t U_EXPORT2 |
283 | ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, |
284 | UErrorCode *status) { |
285 | |
286 | if (status == NULL || U_FAILURE(*status)) { |
287 | return 0; |
288 | } |
289 | if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { |
290 | *status=U_ILLEGAL_ARGUMENT_ERROR; |
291 | return 0; |
292 | } |
293 | |
294 | // |
295 | // Check that the data header is for for break data. |
296 | // (Header contents are defined in genbrk.cpp) |
297 | // |
298 | const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); |
299 | if(!( pInfo->dataFormat[0]==0x42 && /* dataFormat="Brk " */ |
300 | pInfo->dataFormat[1]==0x72 && |
301 | pInfo->dataFormat[2]==0x6b && |
302 | pInfo->dataFormat[3]==0x20 && |
303 | RBBIDataWrapper::isDataVersionAcceptable(pInfo->formatVersion) )) { |
304 | udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n" , |
305 | pInfo->dataFormat[0], pInfo->dataFormat[1], |
306 | pInfo->dataFormat[2], pInfo->dataFormat[3], |
307 | pInfo->formatVersion[0]); |
308 | *status=U_UNSUPPORTED_ERROR; |
309 | return 0; |
310 | } |
311 | |
312 | // |
313 | // Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific |
314 | // RBBIDataHeader). This swap also conveniently gets us |
315 | // the size of the ICU d.h., which lets us locate the start |
316 | // of the RBBI specific data. |
317 | // |
318 | int32_t =udata_swapDataHeader(ds, inData, length, outData, status); |
319 | |
320 | |
321 | // |
322 | // Get the RRBI Data Header, and check that it appears to be OK. |
323 | // |
324 | const uint8_t *inBytes =(const uint8_t *)inData+headerSize; |
325 | RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes; |
326 | if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 || |
327 | !RBBIDataWrapper::isDataVersionAcceptable(rbbiDH->fFormatVersion) || |
328 | ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader)) { |
329 | udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n" ); |
330 | *status=U_UNSUPPORTED_ERROR; |
331 | return 0; |
332 | } |
333 | |
334 | // |
335 | // Prefight operation? Just return the size |
336 | // |
337 | int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength); |
338 | int32_t totalSize = headerSize + breakDataLength; |
339 | if (length < 0) { |
340 | return totalSize; |
341 | } |
342 | |
343 | // |
344 | // Check that length passed in is consistent with length from RBBI data header. |
345 | // |
346 | if (length < totalSize) { |
347 | udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n" , |
348 | breakDataLength); |
349 | *status=U_INDEX_OUTOFBOUNDS_ERROR; |
350 | return 0; |
351 | } |
352 | |
353 | |
354 | // |
355 | // Swap the Data. Do the data itself first, then the RBBI Data Header, because |
356 | // we need to reference the header to locate the data, and an |
357 | // inplace swap of the header leaves it unusable. |
358 | // |
359 | uint8_t *outBytes = (uint8_t *)outData + headerSize; |
360 | RBBIDataHeader *outputDH = (RBBIDataHeader *)outBytes; |
361 | |
362 | int32_t tableStartOffset; |
363 | int32_t tableLength; |
364 | |
365 | // |
366 | // If not swapping in place, zero out the output buffer before starting. |
367 | // Individual tables and other data items within are aligned to 8 byte boundaries |
368 | // when originally created. Any unused space between items needs to be zero. |
369 | // |
370 | if (inBytes != outBytes) { |
371 | uprv_memset(outBytes, 0, breakDataLength); |
372 | } |
373 | |
374 | // |
375 | // Each state table begins with several 32 bit fields. Calculate the size |
376 | // in bytes of these. |
377 | // |
378 | int32_t topSize = offsetof(RBBIStateTable, fTableData); |
379 | |
380 | // Forward state table. |
381 | tableStartOffset = ds->readUInt32(rbbiDH->fFTable); |
382 | tableLength = ds->readUInt32(rbbiDH->fFTableLen); |
383 | |
384 | if (tableLength > 0) { |
385 | ds->swapArray32(ds, inBytes+tableStartOffset, topSize, |
386 | outBytes+tableStartOffset, status); |
387 | ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, |
388 | outBytes+tableStartOffset+topSize, status); |
389 | } |
390 | |
391 | // Reverse state table. Same layout as forward table, above. |
392 | tableStartOffset = ds->readUInt32(rbbiDH->fRTable); |
393 | tableLength = ds->readUInt32(rbbiDH->fRTableLen); |
394 | |
395 | if (tableLength > 0) { |
396 | ds->swapArray32(ds, inBytes+tableStartOffset, topSize, |
397 | outBytes+tableStartOffset, status); |
398 | ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, |
399 | outBytes+tableStartOffset+topSize, status); |
400 | } |
401 | |
402 | // Trie table for character categories |
403 | utrie2_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen), |
404 | outBytes+ds->readUInt32(rbbiDH->fTrie), status); |
405 | |
406 | // Source Rules Text. It's UChar data |
407 | ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen), |
408 | outBytes+ds->readUInt32(rbbiDH->fRuleSource), status); |
409 | |
410 | // Table of rule status values. It's all int_32 values |
411 | ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen), |
412 | outBytes+ds->readUInt32(rbbiDH->fStatusTable), status); |
413 | |
414 | // And, last, the header. |
415 | // It is all int32_t values except for fFormataVersion, which is an array of four bytes. |
416 | // Swap the whole thing as int32_t, then re-swap the one field. |
417 | // |
418 | ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status); |
419 | ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status); |
420 | |
421 | return totalSize; |
422 | } |
423 | |
424 | |
425 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
426 | |