rbbiscan.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/rbbiscan.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	//
4	// file: rbbiscan.cpp
5	//
6	// Copyright (C) 2002-2016, International Business Machines Corporation and others.
7	// All Rights Reserved.
8	//
9	// This file contains the Rule Based Break Iterator Rule Builder functions for
10	// scanning the rules and assembling a parse tree. This is the first phase
11	// of compiling the rules.
12	//
13	// The overall parsing of the rules is managed by class RBBIRuleBuilder, which will
14	// create and use an instance of this class as part of the process.
15	//
16
17	#include "unicode/utypes.h"
18
19	#if !UCONFIG_NO_BREAK_ITERATION
20
21	#include "unicode/unistr.h"
22	#include "unicode/uniset.h"
23	#include "unicode/uchar.h"
24	#include "unicode/uchriter.h"
25	#include "unicode/parsepos.h"
26	#include "unicode/parseerr.h"
27	#include "cmemory.h"
28	#include "cstring.h"
29
30	#include "rbbirpt.h" // Contains state table for the rbbi rules parser.
31	// generated by a Perl script.
32	#include "rbbirb.h"
33	#include "rbbinode.h"
34	#include "rbbiscan.h"
35	#include "rbbitblb.h"
36
37	#include "uassert.h"
38
39	//------------------------------------------------------------------------------
40	//
41	// Unicode Set init strings for each of the character classes needed for parsing a rule file.
42	// (Initialized with hex values for portability to EBCDIC based machines.
43	// Really ugly, but there's no good way to avoid it.)
44	//
45	// The sets are referred to by name in the rbbirpt.txt, which is the
46	// source form of the state transition table for the RBBI rule parser.
47	//
48	//------------------------------------------------------------------------------
49	static const UChar gRuleSet_rule_char_pattern[] = {
50	// Characters that may appear as literals in patterns without escaping or quoting.
51	// [ ^ [ \ p { Z } \ u 0 0 2 0
52	`0x5b`, `0x5e`, `0x5b`, `0x5c`, `0x70`, `0x7b`, `0x5a`, `0x7d`, `0x5c`, `0x75`, `0x30`, `0x30`, `0x32`, `0x30`,
53	// - \ u 0 0 7 f ] - [ \ p
54	`0x2d`, `0x5c`, `0x75`, `0x30`, `0x30`, `0x37`, `0x66`, `0x5d`, `0x2d`, `0x5b`, `0x5c`, `0x70`,
55	// { L } ] - [ \ p { N } ] ]
56	`0x7b`, `0x4c`, `0x7d`, `0x5d`, `0x2d`, `0x5b`, `0x5c`, `0x70`, `0x7b`, `0x4e`, `0x7d`, `0x5d`, `0x5d`, `0`};
57
58	static const UChar gRuleSet_name_char_pattern[] = {
59	// [ _ \ p { L } \ p { N } ]
60	`0x5b`, `0x5f`, `0x5c`, `0x70`, `0x7b`, `0x4c`, `0x7d`, `0x5c`, `0x70`, `0x7b`, `0x4e`, `0x7d`, `0x5d`, `0`};
61
62	static const UChar gRuleSet_digit_char_pattern[] = {
63	// [ 0 - 9 ]
64	`0x5b`, `0x30`, `0x2d`, `0x39`, `0x5d`, `0`};
65
66	static const UChar gRuleSet_name_start_char_pattern[] = {
67	// [ _ \ p { L } ]
68	`0x5b`, `0x5f`, `0x5c`, `0x70`, `0x7b`, `0x4c`, `0x7d`, `0x5d`, `0` };
69
70	static const UChar kAny[] = {`0x61`, `0x6e`, `0x79`, `0x00`}; // "any"
71
72
73	U_CDECL_BEGIN
74	static void U_CALLCONV RBBISetTable_deleter(void *p) {
75	icu::RBBISetTableEl px = (icu::RBBISetTableEl )p;
76	delete px->key;
77	// Note: px->val is owned by the linked list "fSetsListHead" in scanner.
78	// Don't delete the value nodes here.
79	uprv_free(px);
80	}
81	U_CDECL_END
82
83	U_NAMESPACE_BEGIN
84
85	//------------------------------------------------------------------------------
86	//
87	// Constructor.
88	//
89	//------------------------------------------------------------------------------
90	RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
91	{
92	fRB = rb;
93	fScanIndex = `0`;
94	fNextIndex = `0`;
95	fQuoteMode = FALSE;
96	fLineNum = `1`;
97	fCharNum = `0`;
98	fLastChar = `0`;
99
100	fStateTable = NULL;
101	fStack[`0`] = `0`;
102	fStackPtr = `0`;
103	fNodeStack[`0`] = NULL;
104	fNodeStackPtr = `0`;
105
106	fReverseRule = FALSE;
107	fLookAheadRule = FALSE;
108	fNoChainInRule = FALSE;
109
110	fSymbolTable = NULL;
111	fSetTable = NULL;
112	fRuleNum = `0`;
113	fOptionStart = `0`;
114
115	// Do not check status until after all critical fields are sufficiently initialized
116	// that the destructor can run cleanly.
117	if (U_FAILURE(*rb->fStatus)) {
118	return;
119	}
120
121	//
122	// Set up the constant Unicode Sets.
123	// Note: These could be made static, lazily initialized, and shared among
124	// all instances of RBBIRuleScanners. BUT this is quite a bit simpler,
125	// and the time to build these few sets should be small compared to a
126	// full break iterator build.
127	fRuleSets[kRuleSet_rule_char-`128`]
128	= UnicodeSet (UnicodeString (gRuleSet_rule_char_pattern), *rb->fStatus);
129	// fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:]
130	fRuleSets[kRuleSet_white_space-`128`].
131	add(`9`, `0xd`).add(`0x20`).add(`0x85`).add(`0x200e`, `0x200f`).add(`0x2028`, `0x2029`);
132	fRuleSets[kRuleSet_name_char-`128`]
133	= UnicodeSet (UnicodeString (gRuleSet_name_char_pattern), *rb->fStatus);
134	fRuleSets[kRuleSet_name_start_char-`128`]
135	= UnicodeSet (UnicodeString (gRuleSet_name_start_char_pattern), *rb->fStatus);
136	fRuleSets[kRuleSet_digit_char-`128`]
137	= UnicodeSet (UnicodeString (gRuleSet_digit_char_pattern), *rb->fStatus);
138	if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) {
139	// This case happens if ICU's data is missing. UnicodeSet tries to look up property
140	// names from the init string, can't find them, and claims an illegal argument.
141	// Change the error so that the actual problem will be clearer to users.
142	*rb->fStatus = U_BRK_INIT_ERROR;
143	}
144	if (U_FAILURE(*rb->fStatus)) {
145	return;
146	}
147
148	fSymbolTable = new RBBISymbolTable (this, rb->fRules, *rb->fStatus);
149	if (fSymbolTable == NULL) {
150	*rb->fStatus = U_MEMORY_ALLOCATION_ERROR;
151	return;
152	}
153	fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus);
154	if (U_FAILURE(*rb->fStatus)) {
155	return;
156	}
157	uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);
158	}
159
160
161
162	//------------------------------------------------------------------------------
163	//
164	// Destructor
165	//
166	//------------------------------------------------------------------------------
167	RBBIRuleScanner::~RBBIRuleScanner() {
168	delete fSymbolTable;
169	if (fSetTable != NULL) {
170	uhash_close(fSetTable);
171	fSetTable = NULL;
172
173	}
174
175
176	// Node Stack.
177	// Normally has one entry, which is the entire parse tree for the rules.
178	// If errors occured, there may be additional subtrees left on the stack.
179	while (fNodeStackPtr > `0`) {
180	delete fNodeStack[fNodeStackPtr];
181	fNodeStackPtr--;
182	}
183
184	}
185
186	//------------------------------------------------------------------------------
187	//
188	// doParseAction Do some action during rule parsing.
189	// Called by the parse state machine.
190	// Actions build the parse tree and Unicode Sets,
191	// and maintain the parse stack for nested expressions.
192	//
193	// TODO: unify EParseAction and RBBI_RuleParseAction enum types.
194	// They represent exactly the same thing. They're separate
195	// only to work around enum forward declaration restrictions
196	// in some compilers, while at the same time avoiding multiple
197	// definitions problems. I'm sure that there's a better way.
198	//
199	//------------------------------------------------------------------------------
200	UBool RBBIRuleScanner::doParseActions(int32_t action)
201	{
202	RBBINode *n = NULL;
203
204	UBool returnVal = TRUE;
205
206	switch (action) {
207
208	case doExprStart:
209	pushNewNode(RBBINode::opStart);
210	fRuleNum++;
211	break;
212
213
214	case doNoChain:
215	// Scanned a '^' while on the rule start state.
216	fNoChainInRule = TRUE;
217	break;
218
219
220	case doExprOrOperator:
221	{
222	fixOpStack(RBBINode::precOpCat);
223	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
224	RBBINode *orNode = pushNewNode(RBBINode::opOr);
225	if (U_FAILURE(*fRB->fStatus)) {
226	break;
227	}
228	orNode->fLeftChild = operandNode;
229	operandNode->fParent = orNode;
230	}
231	break;
232
233	case doExprCatOperator:
234	// concatenation operator.
235	// For the implicit concatenation of adjacent terms in an expression that are
236	// not separated by any other operator. Action is invoked between the
237	// actions for the two terms.
238	{
239	fixOpStack(RBBINode::precOpCat);
240	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
241	RBBINode *catNode = pushNewNode(RBBINode::opCat);
242	if (U_FAILURE(*fRB->fStatus)) {
243	break;
244	}
245	catNode->fLeftChild = operandNode;
246	operandNode->fParent = catNode;
247	}
248	break;
249
250	case doLParen:
251	// Open Paren.
252	// The openParen node is a dummy operation type with a low precedence,
253	// which has the affect of ensuring that any real binary op that
254	// follows within the parens binds more tightly to the operands than
255	// stuff outside of the parens.
256	pushNewNode(RBBINode::opLParen);
257	break;
258
259	case doExprRParen:
260	fixOpStack(RBBINode::precLParen);
261	break;
262
263	case doNOP:
264	break;
265
266	case doStartAssign:
267	// We've just scanned "$variable = "
268	// The top of the node stack has the $variable ref node.
269
270	// Save the start position of the RHS text in the StartExpression node
271	// that precedes the $variableReference node on the stack.
272	// This will eventually be used when saving the full $variable replacement
273	// text as a string.
274	n = fNodeStack[fNodeStackPtr-`1`];
275	n->fFirstPos = fNextIndex; // move past the '='
276
277	// Push a new start-of-expression node; needed to keep parse of the
278	// RHS expression happy.
279	pushNewNode(RBBINode::opStart);
280	break;
281
282
283
284
285	case doEndAssign:
286	{
287	// We have reached the end of an assignement statement.
288	// Current scan char is the ';' that terminates the assignment.
289
290	// Terminate expression, leaves expression parse tree rooted in TOS node.
291	fixOpStack(RBBINode::precStart);
292
293	RBBINode *startExprNode = fNodeStack[fNodeStackPtr-`2`];
294	RBBINode *varRefNode = fNodeStack[fNodeStackPtr-`1`];
295	RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr];
296
297	// Save original text of right side of assignment, excluding the terminating ';'
298	// in the root of the node for the right-hand-side expression.
299	RHSExprNode->fFirstPos = startExprNode->fFirstPos;
300	RHSExprNode->fLastPos = fScanIndex;
301	fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);
302
303	// Expression parse tree becomes l. child of the $variable reference node.
304	varRefNode->fLeftChild = RHSExprNode;
305	RHSExprNode->fParent = varRefNode;
306
307	// Make a symbol table entry for the $variableRef node.
308	fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);
309	if (U_FAILURE(*fRB->fStatus)) {
310	// This is a round-about way to get the parse position set
311	// so that duplicate symbols error messages include a line number.
312	UErrorCode t = *fRB->fStatus;
313	*fRB->fStatus = U_ZERO_ERROR;
314	error(t);
315	}
316
317	// Clean up the stack.
318	delete startExprNode;
319	fNodeStackPtr-=`3`;
320	break;
321	}
322
323	case doEndOfRule:
324	{
325	fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression
326	if (U_FAILURE(fRB->fStatus)) { // parse tree rooted in TOS node.*
327	break;
328	}
329	#ifdef RBBI_DEBUG
330	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
331	#endif
332	U_ASSERT(fNodeStackPtr == `1`);
333	RBBINode *thisRule = fNodeStack[fNodeStackPtr];
334
335	// If this rule includes a look-ahead '/', add a endMark node to the
336	// expression tree.
337	if (fLookAheadRule) {
338	RBBINode *endNode = pushNewNode(RBBINode::endMark);
339	RBBINode *catNode = pushNewNode(RBBINode::opCat);
340	if (U_FAILURE(*fRB->fStatus)) {
341	break;
342	}
343	fNodeStackPtr -= `2`;
344	catNode->fLeftChild = thisRule;
345	catNode->fRightChild = endNode;
346	fNodeStack[fNodeStackPtr] = catNode;
347	endNode->fVal = fRuleNum;
348	endNode->fLookAheadEnd = TRUE;
349	thisRule = catNode;
350
351	// TODO: Disable chaining out of look-ahead (hard break) rules.
352	// The break on rule match is forced, so there is no point in building up
353	// the state table to chain into another rule for a longer match.
354	}
355
356	// Mark this node as being the root of a rule.
357	thisRule->fRuleRoot = TRUE;
358
359	// Flag if chaining into this rule is wanted.
360	//
361	if (fRB->fChainRules && // If rule chaining is enabled globally via !!chain
362	!fNoChainInRule) { // and no '^' chain-in inhibit was on this rule
363	thisRule->fChainIn = TRUE;
364	}
365
366
367	// All rule expressions are ORed together.
368	// The ';' that terminates an expression really just functions as a '\|' with
369	// a low operator prededence.
370	//
371	// Each of the four sets of rules are collected separately.
372	// (forward, reverse, safe_forward, safe_reverse)
373	// OR this rule into the appropriate group of them.
374	//
375	RBBINode **destRules = (fReverseRule? &fRB->fSafeRevTree : fRB->fDefaultTree);
376
377	if (*destRules != NULL) {
378	// This is not the first rule encounted.
379	// OR previous stuff (from destRules)*
380	// with the current rule expression (on the Node Stack)
381	// with the resulting OR expression going to destRules*
382	//
383	thisRule = fNodeStack[fNodeStackPtr];
384	RBBINode prevRules = destRules;
385	RBBINode *orNode = pushNewNode(RBBINode::opOr);
386	if (U_FAILURE(*fRB->fStatus)) {
387	break;
388	}
389	orNode->fLeftChild = prevRules;
390	prevRules->fParent = orNode;
391	orNode->fRightChild = thisRule;
392	thisRule->fParent = orNode;
393	*destRules = orNode;
394	}
395	else
396	{
397	// This is the first rule encountered (for this direction).
398	// Just move its parse tree from the stack to destRules.*
399	*destRules = fNodeStack[fNodeStackPtr];
400	}
401	fReverseRule = FALSE; // in preparation for the next rule.
402	fLookAheadRule = FALSE;
403	fNoChainInRule = FALSE;
404	fNodeStackPtr = `0`;
405	}
406	break;
407
408
409	case doRuleError:
410	error(U_BRK_RULE_SYNTAX);
411	returnVal = FALSE;
412	break;
413
414
415	case doVariableNameExpectedErr:
416	error(U_BRK_RULE_SYNTAX);
417	break;
418
419
420	//
421	// Unary operands + ? *
422	// These all appear after the operand to which they apply.
423	// When we hit one, the operand (may be a whole sub expression)
424	// will be on the top of the stack.
425	// Unary Operator becomes TOS, with the old TOS as its one child.
426	case doUnaryOpPlus:
427	{
428	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
429	RBBINode *plusNode = pushNewNode(RBBINode::opPlus);
430	if (U_FAILURE(*fRB->fStatus)) {
431	break;
432	}
433	plusNode->fLeftChild = operandNode;
434	operandNode->fParent = plusNode;
435	}
436	break;
437
438	case doUnaryOpQuestion:
439	{
440	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
441	RBBINode *qNode = pushNewNode(RBBINode::opQuestion);
442	if (U_FAILURE(*fRB->fStatus)) {
443	break;
444	}
445	qNode->fLeftChild = operandNode;
446	operandNode->fParent = qNode;
447	}
448	break;
449
450	case doUnaryOpStar:
451	{
452	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
453	RBBINode *starNode = pushNewNode(RBBINode::opStar);
454	if (U_FAILURE(*fRB->fStatus)) {
455	break;
456	}
457	starNode->fLeftChild = operandNode;
458	operandNode->fParent = starNode;
459	}
460	break;
461
462	case doRuleChar:
463	// A "Rule Character" is any single character that is a literal part
464	// of the regular expression. Like a, b and c in the expression "(abc) \| [:L:]"*
465	// These are pretty uncommon in break rules; the terms are more commonly
466	// sets. To keep things uniform, treat these characters like as
467	// sets that just happen to contain only one character.
468	{
469	n = pushNewNode(RBBINode::setRef);
470	if (U_FAILURE(*fRB->fStatus)) {
471	break;
472	}
473	findSetFor(UnicodeString (fC.fChar), n);
474	n->fFirstPos = fScanIndex;
475	n->fLastPos = fNextIndex;
476	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
477	break;
478	}
479
480	case doDotAny:
481	// scanned a ".", meaning match any single character.
482	{
483	n = pushNewNode(RBBINode::setRef);
484	if (U_FAILURE(*fRB->fStatus)) {
485	break;
486	}
487	findSetFor(UnicodeString (TRUE, kAny, `3`), n);
488	n->fFirstPos = fScanIndex;
489	n->fLastPos = fNextIndex;
490	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
491	break;
492	}
493
494	case doSlash:
495	// Scanned a '/', which identifies a look-ahead break position in a rule.
496	n = pushNewNode(RBBINode::lookAhead);
497	if (U_FAILURE(*fRB->fStatus)) {
498	break;
499	}
500	n->fVal = fRuleNum;
501	n->fFirstPos = fScanIndex;
502	n->fLastPos = fNextIndex;
503	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
504	fLookAheadRule = TRUE;
505	break;
506
507
508	case doStartTagValue:
509	// Scanned a '{', the opening delimiter for a tag value within a rule.
510	n = pushNewNode(RBBINode::tag);
511	if (U_FAILURE(*fRB->fStatus)) {
512	break;
513	}
514	n->fVal = `0`;
515	n->fFirstPos = fScanIndex;
516	n->fLastPos = fNextIndex;
517	break;
518
519	case doTagDigit:
520	// Just scanned a decimal digit that's part of a tag value
521	{
522	n = fNodeStack[fNodeStackPtr];
523	uint32_t v = u_charDigitValue(fC.fChar);
524	U_ASSERT(v < `10`);
525	n->fVal = n->fVal*`10` + v;
526	break;
527	}
528
529	case doTagValue:
530	n = fNodeStack[fNodeStackPtr];
531	n->fLastPos = fNextIndex;
532	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
533	break;
534
535	case doTagExpectedError:
536	error(U_BRK_MALFORMED_RULE_TAG);
537	returnVal = FALSE;
538	break;
539
540	case doOptionStart:
541	// Scanning a !!option. At the start of string.
542	fOptionStart = fScanIndex;
543	break;
544
545	case doOptionEnd:
546	{
547	UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
548	if (opt == UNICODE_STRING("chain", `5`)) {
549	fRB->fChainRules = TRUE;
550	} else if (opt == UNICODE_STRING("LBCMNoChain", `11`)) {
551	fRB->fLBCMNoChain = TRUE;
552	} else if (opt == UNICODE_STRING("forward", `7`)) {
553	fRB->fDefaultTree = &fRB->fForwardTree;
554	} else if (opt == UNICODE_STRING("reverse", `7`)) {
555	fRB->fDefaultTree = &fRB->fReverseTree;
556	} else if (opt == UNICODE_STRING("safe_forward", `12`)) {
557	fRB->fDefaultTree = &fRB->fSafeFwdTree;
558	} else if (opt == UNICODE_STRING("safe_reverse", `12`)) {
559	fRB->fDefaultTree = &fRB->fSafeRevTree;
560	} else if (opt == UNICODE_STRING("lookAheadHardBreak", `18`)) {
561	fRB->fLookAheadHardBreak = TRUE;
562	} else if (opt == UNICODE_STRING("quoted_literals_only", `20`)) {
563	fRuleSets[kRuleSet_rule_char-`128`].clear();
564	} else if (opt == UNICODE_STRING("unquoted_literals", `17`)) {
565	fRuleSets[kRuleSet_rule_char-`128`].applyPattern(UnicodeString (gRuleSet_rule_char_pattern), *fRB->fStatus);
566	} else {
567	error(U_BRK_UNRECOGNIZED_OPTION);
568	}
569	}
570	break;
571
572	case doReverseDir:
573	fReverseRule = TRUE;
574	break;
575
576	case doStartVariableName:
577	n = pushNewNode(RBBINode::varRef);
578	if (U_FAILURE(*fRB->fStatus)) {
579	break;
580	}
581	n->fFirstPos = fScanIndex;
582	break;
583
584	case doEndVariableName:
585	n = fNodeStack[fNodeStackPtr];
586	if (n==NULL \|\| n->fType != RBBINode::varRef) {
587	error(U_BRK_INTERNAL_ERROR);
588	break;
589	}
590	n->fLastPos = fScanIndex;
591	fRB->fRules.extractBetween(n->fFirstPos+`1`, n->fLastPos, n->fText);
592	// Look the newly scanned name up in the symbol table
593	// If there's an entry, set the l. child of the var ref to the replacement expression.
594	// (We also pass through here when scanning assignments, but no harm is done, other
595	// than a slight wasted effort that seems hard to avoid. Lookup will be null)
596	n->fLeftChild = fSymbolTable->lookupNode(n->fText);
597	break;
598
599	case doCheckVarDef:
600	n = fNodeStack[fNodeStackPtr];
601	if (n->fLeftChild == NULL) {
602	error(U_BRK_UNDEFINED_VARIABLE);
603	returnVal = FALSE;
604	}
605	break;
606
607	case doExprFinished:
608	break;
609
610	case doRuleErrorAssignExpr:
611	error(U_BRK_ASSIGN_ERROR);
612	returnVal = FALSE;
613	break;
614
615	case doExit:
616	returnVal = FALSE;
617	break;
618
619	case doScanUnicodeSet:
620	scanSet();
621	break;
622
623	default:
624	error(U_BRK_INTERNAL_ERROR);
625	returnVal = FALSE;
626	break;
627	}
628	return returnVal && U_SUCCESS(*fRB->fStatus);
629	}
630
631
632
633
634	//------------------------------------------------------------------------------
635	//
636	// Error Report a rule parse error.
637	// Only report it if no previous error has been recorded.
638	//
639	//------------------------------------------------------------------------------
640	void RBBIRuleScanner::error(UErrorCode e) {
641	if (U_SUCCESS(*fRB->fStatus)) {
642	*fRB->fStatus = e;
643	if (fRB->fParseError) {
644	fRB->fParseError->line = fLineNum;
645	fRB->fParseError->offset = fCharNum;
646	fRB->fParseError->preContext[`0`] = `0`;
647	fRB->fParseError->postContext[`0`] = `0`;
648	}
649	}
650	}
651
652
653
654
655	//------------------------------------------------------------------------------
656	//
657	// fixOpStack The parse stack holds partially assembled chunks of the parse tree.
658	// An entry on the stack may be as small as a single setRef node,
659	// or as large as the parse tree
660	// for an entire expression (this will be the one item left on the stack
661	// when the parsing of an RBBI rule completes).
662	//
663	// This function is called when a binary operator is encountered.
664	// It looks back up the stack for operators that are not yet associated
665	// with a right operand, and if the precedence of the stacked operator >=
666	// the precedence of the current operator, binds the operand left,
667	// to the previously encountered operator.
668	//
669	//------------------------------------------------------------------------------
670	void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
671	RBBINode *n;
672	// printNodeStack("entering fixOpStack()");
673	for (;;) {
674	n = fNodeStack[fNodeStackPtr-`1`]; // an operator node
675	if (n->fPrecedence == `0`) {
676	RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");
677	error(U_BRK_INTERNAL_ERROR);
678	return;
679	}
680
681	if (n->fPrecedence < p \|\| n->fPrecedence <= RBBINode::precLParen) {
682	// The most recent operand goes with the current operator,
683	// not with the previously stacked one.
684	break;
685	}
686	// Stack operator is a binary op ( '\|' or concatenation)
687	// TOS operand becomes right child of this operator.
688	// Resulting subexpression becomes the TOS operand.
689	n->fRightChild = fNodeStack[fNodeStackPtr];
690	fNodeStack[fNodeStackPtr]->fParent = n;
691	fNodeStackPtr--;
692	// printNodeStack("looping in fixOpStack() ");
693	}
694
695	if (p <= RBBINode::precLParen) {
696	// Scan is at a right paren or end of expression.
697	// The scanned item must match the stack, or else there was an error.
698	// Discard the left paren (or start expr) node from the stack,
699	// leaving the completed (sub)expression as TOS.
700	if (n->fPrecedence != p) {
701	// Right paren encountered matched start of expression node, or
702	// end of expression matched with a left paren node.
703	error(U_BRK_MISMATCHED_PAREN);
704	}
705	fNodeStack[fNodeStackPtr-`1`] = fNodeStack[fNodeStackPtr];
706	fNodeStackPtr--;
707	// Delete the now-discarded LParen or Start node.
708	delete n;
709	}
710	// printNodeStack("leaving fixOpStack()");
711	}
712
713
714
715
716	//------------------------------------------------------------------------------
717	//
718	// findSetFor given a UnicodeString,
719	// - find the corresponding Unicode Set (uset node)
720	// (create one if necessary)
721	// - Set fLeftChild of the caller's node (should be a setRef node)
722	// to the uset node
723	// Maintain a hash table of uset nodes, so the same one is always used
724	// for the same string.
725	// If a "to adopt" set is provided and we haven't seen this key before,
726	// add the provided set to the hash table.
727	// If the string is one (32 bit) char in length, the set contains
728	// just one element which is the char in question.
729	// If the string is "any", return a set containing all chars.
730	//
731	//------------------------------------------------------------------------------
732	void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode node, UnicodeSet setToAdopt) {
733
734	RBBISetTableEl *el;
735
736	// First check whether we've already cached a set for this string.
737	// If so, just use the cached set in the new node.
738	// delete any set provided by the caller, since we own it.
739	el = (RBBISetTableEl *)uhash_get(fSetTable, &s);
740	if (el != NULL) {
741	delete setToAdopt;
742	node->fLeftChild = el->val;
743	U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
744	return;
745	}
746
747	// Haven't seen this set before.
748	// If the caller didn't provide us with a prebuilt set,
749	// create a new UnicodeSet now.
750	if (setToAdopt == NULL) {
751	if (s.compare(kAny, -`1`) == `0`) {
752	setToAdopt = new UnicodeSet (`0x000000`, `0x10ffff`);
753	} else {
754	UChar32 c;
755	c = s.char32At(`0`);
756	setToAdopt = new UnicodeSet (c, c);
757	}
758	}
759
760	//
761	// Make a new uset node to refer to this UnicodeSet
762	// This new uset node becomes the child of the caller's setReference node.
763	//
764	RBBINode usetNode = new* RBBINode (RBBINode::uset);
765	if (usetNode == NULL) {
766	error(U_MEMORY_ALLOCATION_ERROR);
767	return;
768	}
769	usetNode->fInputSet = setToAdopt;
770	usetNode->fParent = node;
771	node->fLeftChild = usetNode;
772	usetNode->fText = s;
773
774
775	//
776	// Add the new uset node to the list of all uset nodes.
777	//
778	fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);
779
780
781	//
782	// Add the new set to the set hash table.
783	//
784	el = (RBBISetTableEl )uprv_malloc(sizeof*(RBBISetTableEl));
785	UnicodeString tkey = new* UnicodeString (s);
786	if (tkey == NULL \|\| el == NULL \|\| setToAdopt == NULL) {
787	// Delete to avoid memory leak
788	delete tkey;
789	tkey = NULL;
790	uprv_free(el);
791	el = NULL;
792	delete setToAdopt;
793	setToAdopt = NULL;
794
795	error(U_MEMORY_ALLOCATION_ERROR);
796	return;
797	}
798	el->key = tkey;
799	el->val = usetNode;
800	uhash_put(fSetTable, el->key, el, fRB->fStatus);
801
802	return;
803	}
804
805
806
807	//
808	// Assorted Unicode character constants.
809	// Numeric because there is no portable way to enter them as literals.
810	// (Think EBCDIC).
811	//
812	static const UChar chCR = `0x0d`; // New lines, for terminating comments.
813	static const UChar chLF = `0x0a`;
814	static const UChar chNEL = `0x85`; // NEL newline variant
815	static const UChar chLS = `0x2028`; // Unicode Line Separator
816	static const UChar chApos = `0x27`; // single quote, for quoted chars.
817	static const UChar chPound = `0x23`; // '#', introduces a comment.
818	static const UChar chBackSlash = `0x5c`; // '\' introduces a char escape
819	static const UChar chLParen = `0x28`;
820	static const UChar chRParen = `0x29`;
821
822
823	//------------------------------------------------------------------------------
824	//
825	// stripRules Return a rules string without extra spaces.
826	// (Comments are removed separately, during rule parsing.)
827	//
828	//------------------------------------------------------------------------------
829	UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
830	UnicodeString strippedRules;
831	int32_t rulesLength = rules.length();
832	bool skippingSpaces = false;
833
834	for (int32_t idx=`0`; idx<rulesLength; idx = rules.moveIndex32(idx, `1`)) {
835	UChar32 cp = rules.char32At(idx);
836	bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE);
837	if (skippingSpaces && whiteSpace) {
838	continue;
839	}
840	strippedRules.append(cp);
841	skippingSpaces = whiteSpace;
842	}
843	return strippedRules;
844	}
845
846
847	//------------------------------------------------------------------------------
848	//
849	// nextCharLL Low Level Next Char from rule input source.
850	// Get a char from the input character iterator,
851	// keep track of input position for error reporting.
852	//
853	//------------------------------------------------------------------------------
854	UChar32 RBBIRuleScanner::nextCharLL() {
855	UChar32 ch;
856
857	if (fNextIndex >= fRB->fRules.length()) {
858	return (UChar32)-`1`;
859	}
860	ch = fRB->fRules.char32At(fNextIndex);
861	fNextIndex = fRB->fRules.moveIndex32(fNextIndex, `1`);
862
863	if (ch == chCR \|\|
864	ch == chNEL \|\|
865	ch == chLS \|\|
866	(ch == chLF && fLastChar != chCR)) {
867	// Character is starting a new line. Bump up the line number, and
868	// reset the column to 0.
869	fLineNum++;
870	fCharNum=`0`;
871	if (fQuoteMode) {
872	error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
873	fQuoteMode = FALSE;
874	}
875	}
876	else {
877	// Character is not starting a new line. Except in the case of a
878	// LF following a CR, increment the column position.
879	if (ch != chLF) {
880	fCharNum++;
881	}
882	}
883	fLastChar = ch;
884	return ch;
885	}
886
887
888	//------------------------------------------------------------------------------
889	//
890	// nextChar for rules scanning. At this level, we handle stripping
891	// out comments and processing backslash character escapes.
892	// The rest of the rules grammar is handled at the next level up.
893	//
894	//------------------------------------------------------------------------------
895	void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
896
897	// Unicode Character constants needed for the processing done by nextChar(),
898	// in hex because literals wont work on EBCDIC machines.
899
900	fScanIndex = fNextIndex;
901	c.fChar = nextCharLL();
902	c.fEscaped = FALSE;
903
904	//
905	// check for '' sequence.
906	// These are recognized in all contexts, whether in quoted text or not.
907	//
908	if (c.fChar == chApos) {
909	if (fRB->fRules.char32At(fNextIndex) == chApos) {
910	c.fChar = nextCharLL(); // get nextChar officially so character counts
911	c.fEscaped = TRUE; // stay correct.
912	}
913	else
914	{
915	// Single quote, by itself.
916	// Toggle quoting mode.
917	// Return either '(' or ')', because quotes cause a grouping of the quoted text.
918	fQuoteMode = !fQuoteMode;
919	if (fQuoteMode == TRUE) {
920	c.fChar = chLParen;
921	} else {
922	c.fChar = chRParen;
923	}
924	c.fEscaped = FALSE; // The paren that we return is not escaped.
925	return;
926	}
927	}
928
929	if (fQuoteMode) {
930	c.fEscaped = TRUE;
931	}
932	else
933	{
934	// We are not in a 'quoted region' of the source.
935	//
936	if (c.fChar == chPound) {
937	// Start of a comment. Consume the rest of it.
938	// The new-line char that terminates the comment is always returned.
939	// It will be treated as white-space, and serves to break up anything
940	// that might otherwise incorrectly clump together with a comment in
941	// the middle (a variable name, for example.)
942	int32_t commentStart = fScanIndex;
943	for (;;) {
944	c.fChar = nextCharLL();
945	if (c.fChar == (UChar32)-`1` \|\| // EOF
946	c.fChar == chCR \|\|
947	c.fChar == chLF \|\|
948	c.fChar == chNEL \|\|
949	c.fChar == chLS) {break;}
950	}
951	for (int32_t i=commentStart; i<fNextIndex-`1`; ++i) {
952	fRB->fStrippedRules.setCharAt(i, u`' '`);
953	}
954	}
955	if (c.fChar == (UChar32)-`1`) {
956	return;
957	}
958
959	//
960	// check for backslash escaped characters.
961	// Use UnicodeString::unescapeAt() to handle them.
962	//
963	if (c.fChar == chBackSlash) {
964	c.fEscaped = TRUE;
965	int32_t startX = fNextIndex;
966	c.fChar = fRB->fRules.unescapeAt(fNextIndex);
967	if (fNextIndex == startX) {
968	error(U_BRK_HEX_DIGITS_EXPECTED);
969	}
970	fCharNum += fNextIndex-startX;
971	}
972	}
973	// putc(c.fChar, stdout);
974	}
975
976	//------------------------------------------------------------------------------
977	//
978	// Parse RBBI rules. The state machine for rules parsing is here.
979	// The state tables are hand-written in the file rbbirpt.txt,
980	// and converted to the form used here by a perl
981	// script rbbicst.pl
982	//
983	//------------------------------------------------------------------------------
984	void RBBIRuleScanner::parse() {
985	uint16_t state;
986	const RBBIRuleTableEl *tableEl;
987
988	if (U_FAILURE(*fRB->fStatus)) {
989	return;
990	}
991
992	state = `1`;
993	nextChar(fC);
994	//
995	// Main loop for the rule parsing state machine.
996	// Runs once per state transition.
997	// Each time through optionally performs, depending on the state table,
998	// - an advance to the the next input char
999	// - an action to be performed.
1000	// - pushing or popping a state to/from the local state return stack.
1001	//
1002	for (;;) {
1003	// Bail out if anything has gone wrong.
1004	// RBBI rule file parsing stops on the first error encountered.
1005	if (U_FAILURE(*fRB->fStatus)) {
1006	break;
1007	}
1008
1009	// Quit if state == 0. This is the normal way to exit the state machine.
1010	//
1011	if (state == `0`) {
1012	break;
1013	}
1014
1015	// Find the state table element that matches the input char from the rule, or the
1016	// class of the input character. Start with the first table row for this
1017	// state, then linearly scan forward until we find a row that matches the
1018	// character. The last row for each state always matches all characters, so
1019	// the search will stop there, if not before.
1020	//
1021	tableEl = &gRuleParseStateTable[state];
1022	#ifdef RBBI_DEBUG
1023	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
1024	RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ",
1025	fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
1026	}
1027	#endif
1028
1029	for (;;) {
1030	#ifdef RBBI_DEBUG
1031	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);}
1032	#endif
1033	if (tableEl->fCharClass < `127` && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) {
1034	// Table row specified an individual character, not a set, and
1035	// the input character is not escaped, and
1036	// the input character matched it.
1037	break;
1038	}
1039	if (tableEl->fCharClass == `255`) {
1040	// Table row specified default, match anything character class.
1041	break;
1042	}
1043	if (tableEl->fCharClass == `254` && fC.fEscaped) {
1044	// Table row specified "escaped" and the char was escaped.
1045	break;
1046	}
1047	if (tableEl->fCharClass == `253` && fC.fEscaped &&
1048	(fC.fChar == `0x50` \|\| fC.fChar == `0x70` )) {
1049	// Table row specified "escaped P" and the char is either 'p' or 'P'.
1050	break;
1051	}
1052	if (tableEl->fCharClass == `252` && fC.fChar == (UChar32)-`1`) {
1053	// Table row specified eof and we hit eof on the input.
1054	break;
1055	}
1056
1057	if (tableEl->fCharClass >= `128` && tableEl->fCharClass < `240` && // Table specs a char class &&
1058	fC.fEscaped == FALSE && // char is not escaped &&
1059	fC.fChar != (UChar32)-`1`) { // char is not EOF
1060	U_ASSERT((tableEl->fCharClass-`128`) < UPRV_LENGTHOF(fRuleSets));
1061	if (fRuleSets[tableEl->fCharClass-`128`].contains(fC.fChar)) {
1062	// Table row specified a character class, or set of characters,
1063	// and the current char matches it.
1064	break;
1065	}
1066	}
1067
1068	// No match on this row, advance to the next row for this state,
1069	tableEl++;
1070	}
1071	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}
1072
1073	//
1074	// We've found the row of the state table that matches the current input
1075	// character from the rules string.
1076	// Perform any action specified by this row in the state table.
1077	if (doParseActions((int32_t)tableEl->fAction) == FALSE) {
1078	// Break out of the state machine loop if the
1079	// the action signalled some kind of error, or
1080	// the action was to exit, occurs on normal end-of-rules-input.
1081	break;
1082	}
1083
1084	if (tableEl->fPushState != `0`) {
1085	fStackPtr++;
1086	if (fStackPtr >= kStackSize) {
1087	error(U_BRK_INTERNAL_ERROR);
1088	RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
1089	fStackPtr--;
1090	}
1091	fStack[fStackPtr] = tableEl->fPushState;
1092	}
1093
1094	if (tableEl->fNextChar) {
1095	nextChar(fC);
1096	}
1097
1098	// Get the next state from the table entry, or from the
1099	// state stack if the next state was specified as "pop".
1100	if (tableEl->fNextState != `255`) {
1101	state = tableEl->fNextState;
1102	} else {
1103	state = fStack[fStackPtr];
1104	fStackPtr--;
1105	if (fStackPtr < `0`) {
1106	error(U_BRK_INTERNAL_ERROR);
1107	RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
1108	fStackPtr++;
1109	}
1110	}
1111
1112	}
1113
1114	if (U_FAILURE(*fRB->fStatus)) {
1115	return;
1116	}
1117
1118	// If there are no forward rules set an error.
1119	//
1120	if (fRB->fForwardTree == NULL) {
1121	error(U_BRK_RULE_SYNTAX);
1122	return;
1123	}
1124
1125	//
1126	// Parsing of the input RBBI rules is complete.
1127	// We now have a parse tree for the rule expressions
1128	// and a list of all UnicodeSets that are referenced.
1129	//
1130	#ifdef RBBI_DEBUG
1131	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}
1132	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) {
1133	RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
1134	RBBINode::printTree(fRB->fForwardTree, TRUE);
1135	RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
1136	RBBINode::printTree(fRB->fReverseTree, TRUE);
1137	RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
1138	RBBINode::printTree(fRB->fSafeFwdTree, TRUE);
1139	RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
1140	RBBINode::printTree(fRB->fSafeRevTree, TRUE);
1141	}
1142	#endif
1143	}
1144
1145
//------------------------------------------------------------------------------
//
//  printNodeStack    Debugging aid, compiled only when RBBI_DEBUG is defined.
//                    Prints the title, then the tree of every node stack
//                    entry from the top of the stack down to index 1.
//
//------------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBIRuleScanner::printNodeStack(const char *title) {
    RBBIDebugPrintf("%s. Dumping node stack...\n", title);
    int slot = fNodeStackPtr;
    while (slot > 0) {
        RBBINode::printTree(fNodeStack[slot], TRUE);
        --slot;
    }
}
#endif
1158
1159
1160
1161
1162	//------------------------------------------------------------------------------
1163	//
1164	// pushNewNode create a new RBBINode of the specified type and push it
1165	// onto the stack of nodes.
1166	//
1167	//------------------------------------------------------------------------------
1168	RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) {
1169	if (U_FAILURE(*fRB->fStatus)) {
1170	return NULL;
1171	}
1172	if (fNodeStackPtr >= kStackSize - `1`) {
1173	error(U_BRK_RULE_SYNTAX);
1174	RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");
1175	return NULL;
1176	}
1177	fNodeStackPtr++;
1178	fNodeStack[fNodeStackPtr] = new RBBINode (t);
1179	if (fNodeStack[fNodeStackPtr] == NULL) {
1180	*fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
1181	}
1182	return fNodeStack[fNodeStackPtr];
1183	}
1184
1185
1186
1187	//------------------------------------------------------------------------------
1188	//
1189	// scanSet Construct a UnicodeSet from the text at the current scan
1190	// position. Advance the scan position to the first character
1191	// after the set.
1192	//
1193	// A new RBBI setref node referring to the set is pushed onto the node
1194	// stack.
1195	//
1196	// The scan position is normally under the control of the state machine
1197	// that controls rule parsing. UnicodeSets, however, are parsed by
1198	// the UnicodeSet constructor, not by the RBBI rule parser.
1199	//
1200	//------------------------------------------------------------------------------
1201	void RBBIRuleScanner::scanSet() {
1202	UnicodeSet *uset;
1203	ParsePosition pos;
1204	int startPos;
1205	int i;
1206
1207	if (U_FAILURE(*fRB->fStatus)) {
1208	return;
1209	}
1210
1211	pos.setIndex(fScanIndex);
1212	startPos = fScanIndex;
1213	UErrorCode localStatus = U_ZERO_ERROR;
1214	uset = new UnicodeSet ();
1215	if (uset == NULL) {
1216	localStatus = U_MEMORY_ALLOCATION_ERROR;
1217	} else {
1218	uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
1219	}
1220	if (U_FAILURE(localStatus)) {
1221	// TODO: Get more accurate position of the error from UnicodeSet's return info.
1222	// UnicodeSet appears to not be reporting correctly at this time.
1223	#ifdef RBBI_DEBUG
1224	RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
1225	#endif
1226	error(localStatus);
1227	delete uset;
1228	return;
1229	}
1230
1231	// Verify that the set contains at least one code point.
1232	//
1233	U_ASSERT(uset!=NULL);
1234	if (uset->isEmpty()) {
1235	// This set is empty.
1236	// Make it an error, because it almost certainly is not what the user wanted.
1237	// Also, avoids having to think about corner cases in the tree manipulation code
1238	// that occurs later on.
1239	error(U_BRK_RULE_EMPTY_SET);
1240	delete uset;
1241	return;
1242	}
1243
1244
1245	// Advance the RBBI parse postion over the UnicodeSet pattern.
1246	// Don't just set fScanIndex because the line/char positions maintained
1247	// for error reporting would be thrown off.
1248	i = pos.getIndex();
1249	for (;;) {
1250	if (fNextIndex >= i) {
1251	break;
1252	}
1253	nextCharLL();
1254	}
1255
1256	if (U_SUCCESS(*fRB->fStatus)) {
1257	RBBINode *n;
1258
1259	n = pushNewNode(RBBINode::setRef);
1260	if (U_FAILURE(*fRB->fStatus)) {
1261	return;
1262	}
1263	n->fFirstPos = startPos;
1264	n->fLastPos = fNextIndex;
1265	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
1266	// findSetFor() serves several purposes here:
1267	// - Adopts storage for the UnicodeSet, will be responsible for deleting.
1268	// - Mantains collection of all sets in use, needed later for establishing
1269	// character categories for run time engine.
1270	// - Eliminates mulitiple instances of the same set.
1271	// - Creates a new uset node if necessary (if this isn't a duplicate.)
1272	findSetFor(n->fText, n, uset);
1273	}
1274
1275	}
1276
1277	int32_t RBBIRuleScanner::numRules() {
1278	return fRuleNum;
1279	}
1280
1281	U_NAMESPACE_END
1282
1283	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1284

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/rbbiscan.cpp