1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3//
4// file: rbbirb.cpp
5//
6// Copyright (C) 2002-2011, International Business Machines Corporation and others.
7// All Rights Reserved.
8//
9// This file contains the RBBIRuleBuilder class implementation. This is the main class for
10// building (compiling) break rules into the tables required by the runtime
11// RBBI engine.
12//
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/brkiter.h"
19#include "unicode/rbbi.h"
20#include "unicode/ubrk.h"
21#include "unicode/unistr.h"
22#include "unicode/uniset.h"
23#include "unicode/uchar.h"
24#include "unicode/uchriter.h"
25#include "unicode/parsepos.h"
26#include "unicode/parseerr.h"
27
28#include "cmemory.h"
29#include "cstring.h"
30#include "rbbirb.h"
31#include "rbbinode.h"
32#include "rbbiscan.h"
33#include "rbbisetb.h"
34#include "rbbitblb.h"
35#include "rbbidata.h"
36#include "uassert.h"
37
38
39U_NAMESPACE_BEGIN
40
41
42//----------------------------------------------------------------------------------------
43//
44// Constructor.
45//
46//----------------------------------------------------------------------------------------
47RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
48 UParseError *parseErr,
49 UErrorCode &status)
50 : fRules(rules), fStrippedRules(rules)
51{
52 fStatus = &status; // status is checked below
53 fParseError = parseErr;
54 fDebugEnv = NULL;
55#ifdef RBBI_DEBUG
56 fDebugEnv = getenv("U_RBBIDEBUG");
57#endif
58
59
60 fForwardTree = NULL;
61 fReverseTree = NULL;
62 fSafeFwdTree = NULL;
63 fSafeRevTree = NULL;
64 fDefaultTree = &fForwardTree;
65 fForwardTable = NULL;
66 fRuleStatusVals = NULL;
67 fChainRules = FALSE;
68 fLBCMNoChain = FALSE;
69 fLookAheadHardBreak = FALSE;
70 fUSetNodes = NULL;
71 fRuleStatusVals = NULL;
72 fScanner = NULL;
73 fSetBuilder = NULL;
74 if (parseErr) {
75 uprv_memset(parseErr, 0, sizeof(UParseError));
76 }
77
78 if (U_FAILURE(status)) {
79 return;
80 }
81
82 fUSetNodes = new UVector(status); // bcos status gets overwritten here
83 fRuleStatusVals = new UVector(status);
84 fScanner = new RBBIRuleScanner(this);
85 fSetBuilder = new RBBISetBuilder(this);
86 if (U_FAILURE(status)) {
87 return;
88 }
89 if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
90 status = U_MEMORY_ALLOCATION_ERROR;
91 }
92}
93
94
95
96//----------------------------------------------------------------------------------------
97//
98// Destructor
99//
100//----------------------------------------------------------------------------------------
101RBBIRuleBuilder::~RBBIRuleBuilder() {
102
103 int i;
104 for (i=0; ; i++) {
105 RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
106 if (n==NULL) {
107 break;
108 }
109 delete n;
110 }
111
112 delete fUSetNodes;
113 delete fSetBuilder;
114 delete fForwardTable;
115 delete fForwardTree;
116 delete fReverseTree;
117 delete fSafeFwdTree;
118 delete fSafeRevTree;
119 delete fScanner;
120 delete fRuleStatusVals;
121}
122
123
124
125
126
127//----------------------------------------------------------------------------------------
128//
129// flattenData() - Collect up the compiled RBBI rule data and put it into
130// the format for saving in ICU data files,
131// which is also the format needed by the RBBI runtime engine.
132//
133//----------------------------------------------------------------------------------------
134static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
135
136RBBIDataHeader *RBBIRuleBuilder::flattenData() {
137 int32_t i;
138
139 if (U_FAILURE(*fStatus)) {
140 return NULL;
141 }
142
143 // Remove whitespace from the rules to make it smaller.
144 // The rule parser has already removed comments.
145 fStrippedRules = fScanner->stripRules(fStrippedRules);
146
147 // Calculate the size of each section in the data.
148 // Sizes here are padded up to a multiple of 8 for better memory alignment.
149 // Sections sizes actually stored in the header are for the actual data
150 // without the padding.
151 //
152 int32_t headerSize = align8(sizeof(RBBIDataHeader));
153 int32_t forwardTableSize = align8(fForwardTable->getTableSize());
154 int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize());
155 int32_t trieSize = align8(fSetBuilder->getTrieSize());
156 int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
157 int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar));
158
159 int32_t totalSize = headerSize
160 + forwardTableSize
161 + reverseTableSize
162 + statusTableSize + trieSize + rulesSize;
163
164#ifdef RBBI_DEBUG
165 if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) {
166 RBBIDebugPrintf("Header Size: %8d\n", headerSize);
167 RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize);
168 RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize);
169 RBBIDebugPrintf("Trie Size: %8d\n", trieSize);
170 RBBIDebugPrintf("Status Table Size: %8d\n", statusTableSize);
171 RBBIDebugPrintf("Rules Size: %8d\n", rulesSize);
172 RBBIDebugPrintf("-----------------------------\n");
173 RBBIDebugPrintf("Total Size: %8d\n", totalSize);
174 }
175#endif
176
177 RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
178 if (data == NULL) {
179 *fStatus = U_MEMORY_ALLOCATION_ERROR;
180 return NULL;
181 }
182 uprv_memset(data, 0, totalSize);
183
184
185 data->fMagic = 0xb1a0;
186 data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
187 data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
188 data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
189 data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
190 data->fLength = totalSize;
191 data->fCatCount = fSetBuilder->getNumCharCategories();
192
193 data->fFTable = headerSize;
194 data->fFTableLen = forwardTableSize;
195
196 data->fRTable = data->fFTable + data->fFTableLen;
197 data->fRTableLen = reverseTableSize;
198
199 data->fTrie = data->fRTable + data->fRTableLen;
200 data->fTrieLen = fSetBuilder->getTrieSize();
201 data->fStatusTable = data->fTrie + trieSize;
202 data->fStatusTableLen= statusTableSize;
203 data->fRuleSource = data->fStatusTable + statusTableSize;
204 data->fRuleSourceLen = fStrippedRules.length() * sizeof(UChar);
205
206 uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
207
208 fForwardTable->exportTable((uint8_t *)data + data->fFTable);
209 fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
210 fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
211
212 int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
213 for (i=0; i<fRuleStatusVals->size(); i++) {
214 ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
215 }
216
217 fStrippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
218
219 return data;
220}
221
222
223//----------------------------------------------------------------------------------------
224//
225// createRuleBasedBreakIterator construct from source rules that are passed in
226// in a UnicodeString
227//
228//----------------------------------------------------------------------------------------
229BreakIterator *
230RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
231 UParseError *parseError,
232 UErrorCode &status)
233{
234 //
235 // Read the input rules, generate a parse tree, symbol table,
236 // and list of all Unicode Sets referenced by the rules.
237 //
238 RBBIRuleBuilder builder(rules, parseError, status);
239 if (U_FAILURE(status)) { // status checked here bcos build below doesn't
240 return NULL;
241 }
242
243 RBBIDataHeader *data = builder.build(status);
244
245 if (U_FAILURE(status)) {
246 return nullptr;
247 }
248
249 //
250 // Create a break iterator from the compiled rules.
251 // (Identical to creation from stored pre-compiled rules)
252 //
253 // status is checked after init in construction.
254 RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
255 if (U_FAILURE(status)) {
256 delete This;
257 This = NULL;
258 }
259 else if(This == NULL) { // test for NULL
260 status = U_MEMORY_ALLOCATION_ERROR;
261 }
262 return This;
263}
264
265RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
266 if (U_FAILURE(status)) {
267 return nullptr;
268 }
269
270 fScanner->parse();
271 if (U_FAILURE(status)) {
272 return nullptr;
273 }
274
275 //
276 // UnicodeSet processing.
277 // Munge the Unicode Sets to create a set of character categories.
278 // Generate the mapping tables (TRIE) from input code points to
279 // the character categories.
280 //
281 fSetBuilder->buildRanges();
282
283 //
284 // Generate the DFA state transition table.
285 //
286 fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
287 if (fForwardTable == nullptr) {
288 status = U_MEMORY_ALLOCATION_ERROR;
289 return nullptr;
290 }
291
292 fForwardTable->buildForwardTable();
293 optimizeTables();
294 fForwardTable->buildSafeReverseTable(status);
295
296
297#ifdef RBBI_DEBUG
298 if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
299 fForwardTable->printStates();
300 fForwardTable->printRuleStatusTable();
301 fForwardTable->printReverseTable();
302 }
303#endif
304
305 fSetBuilder->buildTrie();
306
307 //
308 // Package up the compiled data into a memory image
309 // in the run-time format.
310 //
311 RBBIDataHeader *data = flattenData(); // returns NULL if error
312 if (U_FAILURE(status)) {
313 return nullptr;
314 }
315 return data;
316}
317
318void RBBIRuleBuilder::optimizeTables() {
319 bool didSomething;
320 do {
321 didSomething = false;
322
323 // Begin looking for duplicates with char class 3.
324 // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
325 // and should not have other categories merged into them.
326 IntPair duplPair = {3, 0};
327 while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
328 fSetBuilder->mergeCategories(duplPair);
329 fForwardTable->removeColumn(duplPair.second);
330 didSomething = true;
331 }
332
333 while (fForwardTable->removeDuplicateStates() > 0) {
334 didSomething = true;
335 }
336 } while (didSomething);
337}
338
339U_NAMESPACE_END
340
341#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
342