1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3//
4// file: rbbirb.cpp
5//
6// Copyright (C) 2002-2011, International Business Machines Corporation and others.
7// All Rights Reserved.
8//
9// This file contains the RBBIRuleBuilder class implementation. This is the main class for
10// building (compiling) break rules into the tables required by the runtime
11// RBBI engine.
12//
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/brkiter.h"
19#include "unicode/rbbi.h"
20#include "unicode/ubrk.h"
21#include "unicode/unistr.h"
22#include "unicode/uniset.h"
23#include "unicode/uchar.h"
24#include "unicode/uchriter.h"
25#include "unicode/ustring.h"
26#include "unicode/parsepos.h"
27#include "unicode/parseerr.h"
28
29#include "cmemory.h"
30#include "cstring.h"
31#include "rbbirb.h"
32#include "rbbinode.h"
33#include "rbbiscan.h"
34#include "rbbisetb.h"
35#include "rbbitblb.h"
36#include "rbbidata.h"
37#include "uassert.h"
38
39
40U_NAMESPACE_BEGIN
41
42
43//----------------------------------------------------------------------------------------
44//
45// Constructor.
46//
47//----------------------------------------------------------------------------------------
48RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
49 UParseError *parseErr,
50 UErrorCode &status)
51 : fRules(rules), fStrippedRules(rules)
52{
53 fStatus = &status; // status is checked below
54 fParseError = parseErr;
55 fDebugEnv = nullptr;
56#ifdef RBBI_DEBUG
57 fDebugEnv = getenv("U_RBBIDEBUG");
58#endif
59
60
61 fForwardTree = nullptr;
62 fReverseTree = nullptr;
63 fSafeFwdTree = nullptr;
64 fSafeRevTree = nullptr;
65 fDefaultTree = &fForwardTree;
66 fForwardTable = nullptr;
67 fRuleStatusVals = nullptr;
68 fChainRules = false;
69 fLBCMNoChain = false;
70 fLookAheadHardBreak = false;
71 fUSetNodes = nullptr;
72 fRuleStatusVals = nullptr;
73 fScanner = nullptr;
74 fSetBuilder = nullptr;
75 if (parseErr) {
76 uprv_memset(parseErr, 0, sizeof(UParseError));
77 }
78
79 if (U_FAILURE(status)) {
80 return;
81 }
82
83 fUSetNodes = new UVector(status); // bcos status gets overwritten here
84 fRuleStatusVals = new UVector(status);
85 fScanner = new RBBIRuleScanner(this);
86 fSetBuilder = new RBBISetBuilder(this);
87 if (U_FAILURE(status)) {
88 return;
89 }
90 if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
91 status = U_MEMORY_ALLOCATION_ERROR;
92 }
93}
94
95
96
97//----------------------------------------------------------------------------------------
98//
99// Destructor
100//
101//----------------------------------------------------------------------------------------
102RBBIRuleBuilder::~RBBIRuleBuilder() {
103
104 int i;
105 for (i=0; ; i++) {
106 RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
107 if (n==nullptr) {
108 break;
109 }
110 delete n;
111 }
112
113 delete fUSetNodes;
114 delete fSetBuilder;
115 delete fForwardTable;
116 delete fForwardTree;
117 delete fReverseTree;
118 delete fSafeFwdTree;
119 delete fSafeRevTree;
120 delete fScanner;
121 delete fRuleStatusVals;
122}
123
124
125
126
127
128//----------------------------------------------------------------------------------------
129//
130// flattenData() - Collect up the compiled RBBI rule data and put it into
131// the format for saving in ICU data files,
132// which is also the format needed by the RBBI runtime engine.
133//
134//----------------------------------------------------------------------------------------
135static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
136
137RBBIDataHeader *RBBIRuleBuilder::flattenData() {
138 int32_t i;
139
140 if (U_FAILURE(*fStatus)) {
141 return nullptr;
142 }
143
144 // Remove whitespace from the rules to make it smaller.
145 // The rule parser has already removed comments.
146 fStrippedRules = fScanner->stripRules(fStrippedRules);
147
148 // Calculate the size of each section in the data.
149 // Sizes here are padded up to a multiple of 8 for better memory alignment.
150 // Sections sizes actually stored in the header are for the actual data
151 // without the padding.
152 //
153 int32_t headerSize = align8(sizeof(RBBIDataHeader));
154 int32_t forwardTableSize = align8(fForwardTable->getTableSize());
155 int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize());
156 int32_t trieSize = align8(fSetBuilder->getTrieSize());
157 int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
158
159 int32_t rulesLengthInUTF8 = 0;
160 u_strToUTF8WithSub(0, 0, &rulesLengthInUTF8,
161 fStrippedRules.getBuffer(), fStrippedRules.length(),
162 0xfffd, nullptr, fStatus);
163 *fStatus = U_ZERO_ERROR;
164
165 int32_t rulesSize = align8((rulesLengthInUTF8+1));
166
167 int32_t totalSize = headerSize
168 + forwardTableSize
169 + reverseTableSize
170 + statusTableSize + trieSize + rulesSize;
171
172#ifdef RBBI_DEBUG
173 if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) {
174 RBBIDebugPrintf("Header Size: %8d\n", headerSize);
175 RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize);
176 RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize);
177 RBBIDebugPrintf("Trie Size: %8d\n", trieSize);
178 RBBIDebugPrintf("Status Table Size: %8d\n", statusTableSize);
179 RBBIDebugPrintf("Rules Size: %8d\n", rulesSize);
180 RBBIDebugPrintf("-----------------------------\n");
181 RBBIDebugPrintf("Total Size: %8d\n", totalSize);
182 }
183#endif
184
185 RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
186 if (data == nullptr) {
187 *fStatus = U_MEMORY_ALLOCATION_ERROR;
188 return nullptr;
189 }
190 uprv_memset(data, 0, totalSize);
191
192
193 data->fMagic = 0xb1a0;
194 data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
195 data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
196 data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
197 data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
198 data->fLength = totalSize;
199 data->fCatCount = fSetBuilder->getNumCharCategories();
200
201 data->fFTable = headerSize;
202 data->fFTableLen = forwardTableSize;
203
204 data->fRTable = data->fFTable + data->fFTableLen;
205 data->fRTableLen = reverseTableSize;
206
207 data->fTrie = data->fRTable + data->fRTableLen;
208 data->fTrieLen = trieSize;
209 data->fStatusTable = data->fTrie + data->fTrieLen;
210 data->fStatusTableLen= statusTableSize;
211 data->fRuleSource = data->fStatusTable + statusTableSize;
212 data->fRuleSourceLen = rulesLengthInUTF8;
213
214 uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
215
216 fForwardTable->exportTable((uint8_t *)data + data->fFTable);
217 fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
218 fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
219
220 int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
221 for (i=0; i<fRuleStatusVals->size(); i++) {
222 ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
223 }
224
225 u_strToUTF8WithSub((char *)data+data->fRuleSource, rulesSize, &rulesLengthInUTF8,
226 fStrippedRules.getBuffer(), fStrippedRules.length(),
227 0xfffd, nullptr, fStatus);
228 if (U_FAILURE(*fStatus)) {
229 return nullptr;
230 }
231
232 return data;
233}
234
235
236//----------------------------------------------------------------------------------------
237//
238// createRuleBasedBreakIterator construct from source rules that are passed in
239// in a UnicodeString
240//
241//----------------------------------------------------------------------------------------
242BreakIterator *
243RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
244 UParseError *parseError,
245 UErrorCode &status)
246{
247 //
248 // Read the input rules, generate a parse tree, symbol table,
249 // and list of all Unicode Sets referenced by the rules.
250 //
251 RBBIRuleBuilder builder(rules, parseError, status);
252 if (U_FAILURE(status)) { // status checked here bcos build below doesn't
253 return nullptr;
254 }
255
256 RBBIDataHeader *data = builder.build(status);
257
258 if (U_FAILURE(status)) {
259 return nullptr;
260 }
261
262 //
263 // Create a break iterator from the compiled rules.
264 // (Identical to creation from stored pre-compiled rules)
265 //
266 // status is checked after init in construction.
267 RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
268 if (U_FAILURE(status)) {
269 delete This;
270 This = nullptr;
271 }
272 else if(This == nullptr) { // test for nullptr
273 status = U_MEMORY_ALLOCATION_ERROR;
274 }
275 return This;
276}
277
278RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
279 if (U_FAILURE(status)) {
280 return nullptr;
281 }
282
283 fScanner->parse();
284 if (U_FAILURE(status)) {
285 return nullptr;
286 }
287
288 //
289 // UnicodeSet processing.
290 // Munge the Unicode Sets to create an initial set of character categories.
291 //
292 fSetBuilder->buildRanges();
293
294 //
295 // Generate the DFA state transition table.
296 //
297 fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
298 if (fForwardTable == nullptr) {
299 status = U_MEMORY_ALLOCATION_ERROR;
300 return nullptr;
301 }
302
303 fForwardTable->buildForwardTable();
304
305 // State table and character category optimization.
306 // Merge equivalent rows and columns.
307 // Note that this process alters the initial set of character categories,
308 // causing the representation of UnicodeSets in the parse tree to become invalid.
309
310 optimizeTables();
311 fForwardTable->buildSafeReverseTable(status);
312
313
314#ifdef RBBI_DEBUG
315 if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
316 fForwardTable->printStates();
317 fForwardTable->printRuleStatusTable();
318 fForwardTable->printReverseTable();
319 }
320#endif
321
322 // Generate the mapping tables (TRIE) from input code points to
323 // the character categories.
324 //
325 fSetBuilder->buildTrie();
326
327 //
328 // Package up the compiled data into a memory image
329 // in the run-time format.
330 //
331 RBBIDataHeader *data = flattenData(); // returns nullptr if error
332 if (U_FAILURE(status)) {
333 return nullptr;
334 }
335 return data;
336}
337
338void RBBIRuleBuilder::optimizeTables() {
339 bool didSomething;
340 do {
341 didSomething = false;
342
343 // Begin looking for duplicates with char class 3.
344 // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
345 // and should not have other categories merged into them.
346 IntPair duplPair = {3, 0};
347 while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
348 fSetBuilder->mergeCategories(duplPair);
349 fForwardTable->removeColumn(duplPair.second);
350 didSomething = true;
351 }
352
353 while (fForwardTable->removeDuplicateStates() > 0) {
354 didSomething = true;
355 }
356 } while (didSomething);
357}
358
359U_NAMESPACE_END
360
361#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
362