rbbiscan.cpp source code [Godot/thirdparty/icu4c/common/rbbiscan.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	//
4	// file: rbbiscan.cpp
5	//
6	// Copyright (C) 2002-2016, International Business Machines Corporation and others.
7	// All Rights Reserved.
8	//
9	// This file contains the Rule Based Break Iterator Rule Builder functions for
10	// scanning the rules and assembling a parse tree. This is the first phase
11	// of compiling the rules.
12	//
13	// The overall compilation of the rules is managed by class RBBIRuleBuilder, which will
14	// create and use an instance of this class as part of the process.
15	//
16
17	#include "unicode/utypes.h"
18
19	#if !UCONFIG_NO_BREAK_ITERATION
20
21	#include "unicode/unistr.h"
22	#include "unicode/uniset.h"
23	#include "unicode/uchar.h"
24	#include "unicode/uchriter.h"
25	#include "unicode/parsepos.h"
26	#include "unicode/parseerr.h"
27	#include "cmemory.h"
28	#include "cstring.h"
29
30	#include "rbbirpt.h" // Contains state table for the rbbi rules parser.
31	// generated by a Perl script.
32	#include "rbbirb.h"
33	#include "rbbinode.h"
34	#include "rbbiscan.h"
35	#include "rbbitblb.h"
36
37	#include "uassert.h"
38
//------------------------------------------------------------------------------
//
// Unicode Set init strings for each of the character classes needed for parsing a rule file.
//               (Initialized with hex values for portability to EBCDIC based machines.
//                Really ugly, but there's no good way to avoid it.)
//
//              The sets are referred to by name in the rbbirpt.txt, which is the
//              source form of the state transition table for the RBBI rule parser.
//
//              Each array is a NUL-terminated UTF-16 string; the comment row above
//              each row of values spells out the characters those values encode.
//
//------------------------------------------------------------------------------
static const char16_t gRuleSet_rule_char_pattern[]       = {
 // Characters that may appear as literals in patterns without escaping or quoting.
 //   [     ^     [     \     p     {     Z     }     \     u     0     0     2     0
    0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,
 //   -     \     u     0     0     7     f     ]     -     [     \     p
    0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70,
 //   {     L     }     ]     -     [     \     p     {     N     }     ]     ]
    0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0};

static const char16_t gRuleSet_name_char_pattern[]       = {
 //   [     _     \     p     {     L     }     \     p     {     N     }     ]
    0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};

static const char16_t gRuleSet_digit_char_pattern[] = {
 //   [     0     -     9     ]
    0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};

static const char16_t gRuleSet_name_start_char_pattern[] = {
 //   [     _     \     p     {     L     }     ]
    0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };

static const char16_t kAny[] = {0x61, 0x6e, 0x79, 0x00};  // "any"
71
72
73	U_CDECL_BEGIN
74	static void U_CALLCONV RBBISetTable_deleter(void *p) {
75	icu::RBBISetTableEl px = (icu::RBBISetTableEl )p;
76	delete px->key;
77	// Note: px->val is owned by the linked list "fSetsListHead" in scanner.
78	// Don't delete the value nodes here.
79	uprv_free(px);
80	}
81	U_CDECL_END
82
83	U_NAMESPACE_BEGIN
84
85	//------------------------------------------------------------------------------
86	//
87	// Constructor.
88	//
89	//------------------------------------------------------------------------------
90	RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
91	{
92	fRB = rb;
93	fScanIndex = `0`;
94	fNextIndex = `0`;
95	fQuoteMode = false;
96	fLineNum = `1`;
97	fCharNum = `0`;
98	fLastChar = `0`;
99
100	fStateTable = nullptr;
101	fStack[`0`] = `0`;
102	fStackPtr = `0`;
103	fNodeStack[`0`] = nullptr;
104	fNodeStackPtr = `0`;
105
106	fReverseRule = false;
107	fLookAheadRule = false;
108	fNoChainInRule = false;
109
110	fSymbolTable = nullptr;
111	fSetTable = nullptr;
112	fRuleNum = `0`;
113	fOptionStart = `0`;
114
115	// Do not check status until after all critical fields are sufficiently initialized
116	// that the destructor can run cleanly.
117	if (U_FAILURE(*rb->fStatus)) {
118	return;
119	}
120
121	//
122	// Set up the constant Unicode Sets.
123	// Note: These could be made static, lazily initialized, and shared among
124	// all instances of RBBIRuleScanners. BUT this is quite a bit simpler,
125	// and the time to build these few sets should be small compared to a
126	// full break iterator build.
127	fRuleSets[kRuleSet_rule_char-`128`]
128	= UnicodeSet (UnicodeString (gRuleSet_rule_char_pattern), *rb->fStatus);
129	// fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:]
130	fRuleSets[kRuleSet_white_space-`128`].
131	add(`9`, `0xd`).add(`0x20`).add(`0x85`).add(`0x200e`, `0x200f`).add(`0x2028`, `0x2029`);
132	fRuleSets[kRuleSet_name_char-`128`]
133	= UnicodeSet (UnicodeString (gRuleSet_name_char_pattern), *rb->fStatus);
134	fRuleSets[kRuleSet_name_start_char-`128`]
135	= UnicodeSet (UnicodeString (gRuleSet_name_start_char_pattern), *rb->fStatus);
136	fRuleSets[kRuleSet_digit_char-`128`]
137	= UnicodeSet (UnicodeString (gRuleSet_digit_char_pattern), *rb->fStatus);
138	if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) {
139	// This case happens if ICU's data is missing. UnicodeSet tries to look up property
140	// names from the init string, can't find them, and claims an illegal argument.
141	// Change the error so that the actual problem will be clearer to users.
142	*rb->fStatus = U_BRK_INIT_ERROR;
143	}
144	if (U_FAILURE(*rb->fStatus)) {
145	return;
146	}
147
148	fSymbolTable = new RBBISymbolTable (this, rb->fRules, *rb->fStatus);
149	if (fSymbolTable == nullptr) {
150	*rb->fStatus = U_MEMORY_ALLOCATION_ERROR;
151	return;
152	}
153	fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, nullptr, rb->fStatus);
154	if (U_FAILURE(*rb->fStatus)) {
155	return;
156	}
157	uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);
158	}
159
160
161
162	//------------------------------------------------------------------------------
163	//
164	// Destructor
165	//
166	//------------------------------------------------------------------------------
167	RBBIRuleScanner::~RBBIRuleScanner() {
168	delete fSymbolTable;
169	if (fSetTable != nullptr) {
170	uhash_close(fSetTable);
171	fSetTable = nullptr;
172
173	}
174
175
176	// Node Stack.
177	// Normally has one entry, which is the entire parse tree for the rules.
178	// If errors occurred, there may be additional subtrees left on the stack.
179	while (fNodeStackPtr > `0`) {
180	delete fNodeStack[fNodeStackPtr];
181	fNodeStackPtr--;
182	}
183
184	}
185
186	//------------------------------------------------------------------------------
187	//
188	// doParseAction Do some action during rule parsing.
189	// Called by the parse state machine.
190	// Actions build the parse tree and Unicode Sets,
191	// and maintain the parse stack for nested expressions.
192	//
193	// TODO: unify EParseAction and RBBI_RuleParseAction enum types.
194	// They represent exactly the same thing. They're separate
195	// only to work around enum forward declaration restrictions
196	// in some compilers, while at the same time avoiding multiple
197	// definitions problems. I'm sure that there's a better way.
198	//
199	//------------------------------------------------------------------------------
200	UBool RBBIRuleScanner::doParseActions(int32_t action)
201	{
202	RBBINode n = nullptr*;
203
204	UBool returnVal = true;
205
206	switch (action) {
207
208	case doExprStart:
209	pushNewNode(RBBINode::opStart);
210	fRuleNum++;
211	break;
212
213
214	case doNoChain:
215	// Scanned a '^' while on the rule start state.
216	fNoChainInRule = true;
217	break;
218
219
220	case doExprOrOperator:
221	{
222	fixOpStack(RBBINode::precOpCat);
223	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
224	RBBINode *orNode = pushNewNode(RBBINode::opOr);
225	if (U_FAILURE(*fRB->fStatus)) {
226	break;
227	}
228	orNode->fLeftChild = operandNode;
229	operandNode->fParent = orNode;
230	}
231	break;
232
233	case doExprCatOperator:
234	// concatenation operator.
235	// For the implicit concatenation of adjacent terms in an expression that are
236	// not separated by any other operator. Action is invoked between the
237	// actions for the two terms.
238	{
239	fixOpStack(RBBINode::precOpCat);
240	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
241	RBBINode *catNode = pushNewNode(RBBINode::opCat);
242	if (U_FAILURE(*fRB->fStatus)) {
243	break;
244	}
245	catNode->fLeftChild = operandNode;
246	operandNode->fParent = catNode;
247	}
248	break;
249
250	case doLParen:
251	// Open Paren.
252	// The openParen node is a dummy operation type with a low precedence,
253	// which has the affect of ensuring that any real binary op that
254	// follows within the parens binds more tightly to the operands than
255	// stuff outside of the parens.
256	pushNewNode(RBBINode::opLParen);
257	break;
258
259	case doExprRParen:
260	fixOpStack(RBBINode::precLParen);
261	break;
262
263	case doNOP:
264	break;
265
266	case doStartAssign:
267	// We've just scanned "$variable = "
268	// The top of the node stack has the $variable ref node.
269
270	// Save the start position of the RHS text in the StartExpression node
271	// that precedes the $variableReference node on the stack.
272	// This will eventually be used when saving the full $variable replacement
273	// text as a string.
274	n = fNodeStack[fNodeStackPtr-`1`];
275	n->fFirstPos = fNextIndex; // move past the '='
276
277	// Push a new start-of-expression node; needed to keep parse of the
278	// RHS expression happy.
279	pushNewNode(RBBINode::opStart);
280	break;
281
282
283
284
285	case doEndAssign:
286	{
287	// We have reached the end of an assignment statement.
288	// Current scan char is the ';' that terminates the assignment.
289
290	// Terminate expression, leaves expression parse tree rooted in TOS node.
291	fixOpStack(RBBINode::precStart);
292
293	RBBINode *startExprNode = fNodeStack[fNodeStackPtr-`2`];
294	RBBINode *varRefNode = fNodeStack[fNodeStackPtr-`1`];
295	RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr];
296
297	// Save original text of right side of assignment, excluding the terminating ';'
298	// in the root of the node for the right-hand-side expression.
299	RHSExprNode->fFirstPos = startExprNode->fFirstPos;
300	RHSExprNode->fLastPos = fScanIndex;
301	fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);
302
303	// Expression parse tree becomes l. child of the $variable reference node.
304	varRefNode->fLeftChild = RHSExprNode;
305	RHSExprNode->fParent = varRefNode;
306
307	// Make a symbol table entry for the $variableRef node.
308	fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);
309	if (U_FAILURE(*fRB->fStatus)) {
310	// This is a round-about way to get the parse position set
311	// so that duplicate symbols error messages include a line number.
312	UErrorCode t = *fRB->fStatus;
313	*fRB->fStatus = U_ZERO_ERROR;
314	error(t);
315	}
316
317	// Clean up the stack.
318	delete startExprNode;
319	fNodeStackPtr-=`3`;
320	break;
321	}
322
323	case doEndOfRule:
324	{
325	fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression
326	if (U_FAILURE(fRB->fStatus)) { // parse tree rooted in TOS node.*
327	break;
328	}
329	#ifdef RBBI_DEBUG
330	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
331	#endif
332	U_ASSERT(fNodeStackPtr == `1`);
333	RBBINode *thisRule = fNodeStack[fNodeStackPtr];
334
335	// If this rule includes a look-ahead '/', add a endMark node to the
336	// expression tree.
337	if (fLookAheadRule) {
338	RBBINode *endNode = pushNewNode(RBBINode::endMark);
339	RBBINode *catNode = pushNewNode(RBBINode::opCat);
340	if (U_FAILURE(*fRB->fStatus)) {
341	break;
342	}
343	fNodeStackPtr -= `2`;
344	catNode->fLeftChild = thisRule;
345	catNode->fRightChild = endNode;
346	fNodeStack[fNodeStackPtr] = catNode;
347	endNode->fVal = fRuleNum;
348	endNode->fLookAheadEnd = true;
349	thisRule = catNode;
350
351	// TODO: Disable chaining out of look-ahead (hard break) rules.
352	// The break on rule match is forced, so there is no point in building up
353	// the state table to chain into another rule for a longer match.
354	}
355
356	// Mark this node as being the root of a rule.
357	thisRule->fRuleRoot = true;
358
359	// Flag if chaining into this rule is wanted.
360	//
361	if (fRB->fChainRules && // If rule chaining is enabled globally via !!chain
362	!fNoChainInRule) { // and no '^' chain-in inhibit was on this rule
363	thisRule->fChainIn = true;
364	}
365
366
367	// All rule expressions are ORed together.
368	// The ';' that terminates an expression really just functions as a '\|' with
369	// a low operator prededence.
370	//
371	// Each of the four sets of rules are collected separately.
372	// (forward, reverse, safe_forward, safe_reverse)
373	// OR this rule into the appropriate group of them.
374	//
375	RBBINode **destRules = (fReverseRule? &fRB->fSafeRevTree : fRB->fDefaultTree);
376
377	if (destRules != nullptr*) {
378	// This is not the first rule encountered.
379	// OR previous stuff (from destRules)*
380	// with the current rule expression (on the Node Stack)
381	// with the resulting OR expression going to destRules*
382	//
383	thisRule = fNodeStack[fNodeStackPtr];
384	RBBINode prevRules = destRules;
385	RBBINode *orNode = pushNewNode(RBBINode::opOr);
386	if (U_FAILURE(*fRB->fStatus)) {
387	break;
388	}
389	orNode->fLeftChild = prevRules;
390	prevRules->fParent = orNode;
391	orNode->fRightChild = thisRule;
392	thisRule->fParent = orNode;
393	*destRules = orNode;
394	}
395	else
396	{
397	// This is the first rule encountered (for this direction).
398	// Just move its parse tree from the stack to destRules.*
399	*destRules = fNodeStack[fNodeStackPtr];
400	}
401	fReverseRule = false; // in preparation for the next rule.
402	fLookAheadRule = false;
403	fNoChainInRule = false;
404	fNodeStackPtr = `0`;
405	}
406	break;
407
408
409	case doRuleError:
410	error(U_BRK_RULE_SYNTAX);
411	returnVal = false;
412	break;
413
414
415	case doVariableNameExpectedErr:
416	error(U_BRK_RULE_SYNTAX);
417	break;
418
419
420	//
421	// Unary operands + ? *
422	// These all appear after the operand to which they apply.
423	// When we hit one, the operand (may be a whole sub expression)
424	// will be on the top of the stack.
425	// Unary Operator becomes TOS, with the old TOS as its one child.
426	case doUnaryOpPlus:
427	{
428	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
429	RBBINode *plusNode = pushNewNode(RBBINode::opPlus);
430	if (U_FAILURE(*fRB->fStatus)) {
431	break;
432	}
433	plusNode->fLeftChild = operandNode;
434	operandNode->fParent = plusNode;
435	}
436	break;
437
438	case doUnaryOpQuestion:
439	{
440	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
441	RBBINode *qNode = pushNewNode(RBBINode::opQuestion);
442	if (U_FAILURE(*fRB->fStatus)) {
443	break;
444	}
445	qNode->fLeftChild = operandNode;
446	operandNode->fParent = qNode;
447	}
448	break;
449
450	case doUnaryOpStar:
451	{
452	RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
453	RBBINode *starNode = pushNewNode(RBBINode::opStar);
454	if (U_FAILURE(*fRB->fStatus)) {
455	break;
456	}
457	starNode->fLeftChild = operandNode;
458	operandNode->fParent = starNode;
459	}
460	break;
461
462	case doRuleChar:
463	// A "Rule Character" is any single character that is a literal part
464	// of the regular expression. Like a, b and c in the expression "(abc) \| [:L:]"*
465	// These are pretty uncommon in break rules; the terms are more commonly
466	// sets. To keep things uniform, treat these characters like as
467	// sets that just happen to contain only one character.
468	{
469	n = pushNewNode(RBBINode::setRef);
470	if (U_FAILURE(*fRB->fStatus)) {
471	break;
472	}
473	findSetFor(UnicodeString (fC.fChar), n);
474	n->fFirstPos = fScanIndex;
475	n->fLastPos = fNextIndex;
476	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
477	break;
478	}
479
480	case doDotAny:
481	// scanned a ".", meaning match any single character.
482	{
483	n = pushNewNode(RBBINode::setRef);
484	if (U_FAILURE(*fRB->fStatus)) {
485	break;
486	}
487	findSetFor(UnicodeString (true, kAny, `3`), n);
488	n->fFirstPos = fScanIndex;
489	n->fLastPos = fNextIndex;
490	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
491	break;
492	}
493
494	case doSlash:
495	// Scanned a '/', which identifies a look-ahead break position in a rule.
496	n = pushNewNode(RBBINode::lookAhead);
497	if (U_FAILURE(*fRB->fStatus)) {
498	break;
499	}
500	n->fVal = fRuleNum;
501	n->fFirstPos = fScanIndex;
502	n->fLastPos = fNextIndex;
503	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
504	fLookAheadRule = true;
505	break;
506
507
508	case doStartTagValue:
509	// Scanned a '{', the opening delimiter for a tag value within a rule.
510	n = pushNewNode(RBBINode::tag);
511	if (U_FAILURE(*fRB->fStatus)) {
512	break;
513	}
514	n->fVal = `0`;
515	n->fFirstPos = fScanIndex;
516	n->fLastPos = fNextIndex;
517	break;
518
519	case doTagDigit:
520	// Just scanned a decimal digit that's part of a tag value
521	{
522	n = fNodeStack[fNodeStackPtr];
523	uint32_t v = u_charDigitValue(fC.fChar);
524	U_ASSERT(v < `10`);
525	n->fVal = n->fVal*`10` + v;
526	break;
527	}
528
529	case doTagValue:
530	n = fNodeStack[fNodeStackPtr];
531	n->fLastPos = fNextIndex;
532	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
533	break;
534
535	case doTagExpectedError:
536	error(U_BRK_MALFORMED_RULE_TAG);
537	returnVal = false;
538	break;
539
540	case doOptionStart:
541	// Scanning a !!option. At the start of string.
542	fOptionStart = fScanIndex;
543	break;
544
545	case doOptionEnd:
546	{
547	UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
548	if (opt == UNICODE_STRING("chain", `5`)) {
549	fRB->fChainRules = true;
550	} else if (opt == UNICODE_STRING("LBCMNoChain", `11`)) {
551	fRB->fLBCMNoChain = true;
552	} else if (opt == UNICODE_STRING("forward", `7`)) {
553	fRB->fDefaultTree = &fRB->fForwardTree;
554	} else if (opt == UNICODE_STRING("reverse", `7`)) {
555	fRB->fDefaultTree = &fRB->fReverseTree;
556	} else if (opt == UNICODE_STRING("safe_forward", `12`)) {
557	fRB->fDefaultTree = &fRB->fSafeFwdTree;
558	} else if (opt == UNICODE_STRING("safe_reverse", `12`)) {
559	fRB->fDefaultTree = &fRB->fSafeRevTree;
560	} else if (opt == UNICODE_STRING("lookAheadHardBreak", `18`)) {
561	fRB->fLookAheadHardBreak = true;
562	} else if (opt == UNICODE_STRING("quoted_literals_only", `20`)) {
563	fRuleSets[kRuleSet_rule_char-`128`].clear();
564	} else if (opt == UNICODE_STRING("unquoted_literals", `17`)) {
565	fRuleSets[kRuleSet_rule_char-`128`].applyPattern(UnicodeString (gRuleSet_rule_char_pattern), *fRB->fStatus);
566	} else {
567	error(U_BRK_UNRECOGNIZED_OPTION);
568	}
569	}
570	break;
571
572	case doReverseDir:
573	fReverseRule = true;
574	break;
575
576	case doStartVariableName:
577	n = pushNewNode(RBBINode::varRef);
578	if (U_FAILURE(*fRB->fStatus)) {
579	break;
580	}
581	n->fFirstPos = fScanIndex;
582	break;
583
584	case doEndVariableName:
585	n = fNodeStack[fNodeStackPtr];
586	if (n==nullptr \|\| n->fType != RBBINode::varRef) {
587	error(U_BRK_INTERNAL_ERROR);
588	break;
589	}
590	n->fLastPos = fScanIndex;
591	fRB->fRules.extractBetween(n->fFirstPos+`1`, n->fLastPos, n->fText);
592	// Look the newly scanned name up in the symbol table
593	// If there's an entry, set the l. child of the var ref to the replacement expression.
594	// (We also pass through here when scanning assignments, but no harm is done, other
595	// than a slight wasted effort that seems hard to avoid. Lookup will be null)
596	n->fLeftChild = fSymbolTable->lookupNode(n->fText);
597	break;
598
599	case doCheckVarDef:
600	n = fNodeStack[fNodeStackPtr];
601	if (n->fLeftChild == nullptr) {
602	error(U_BRK_UNDEFINED_VARIABLE);
603	returnVal = false;
604	}
605	break;
606
607	case doExprFinished:
608	break;
609
610	case doRuleErrorAssignExpr:
611	error(U_BRK_ASSIGN_ERROR);
612	returnVal = false;
613	break;
614
615	case doExit:
616	returnVal = false;
617	break;
618
619	case doScanUnicodeSet:
620	scanSet();
621	break;
622
623	default:
624	error(U_BRK_INTERNAL_ERROR);
625	returnVal = false;
626	break;
627	}
628	return returnVal && U_SUCCESS(*fRB->fStatus);
629	}
630
631
632
633
634	//------------------------------------------------------------------------------
635	//
636	// Error Report a rule parse error.
637	// Only report it if no previous error has been recorded.
638	//
639	//------------------------------------------------------------------------------
640	void RBBIRuleScanner::error(UErrorCode e) {
641	if (U_SUCCESS(*fRB->fStatus)) {
642	*fRB->fStatus = e;
643	if (fRB->fParseError) {
644	fRB->fParseError->line = fLineNum;
645	fRB->fParseError->offset = fCharNum;
646	fRB->fParseError->preContext[`0`] = `0`;
647	fRB->fParseError->postContext[`0`] = `0`;
648	}
649	}
650	}
651
652
653
654
655	//------------------------------------------------------------------------------
656	//
657	// fixOpStack The parse stack holds partially assembled chunks of the parse tree.
658	// An entry on the stack may be as small as a single setRef node,
659	// or as large as the parse tree
660	// for an entire expression (this will be the one item left on the stack
661	// when the parsing of an RBBI rule completes.
662	//
663	// This function is called when a binary operator is encountered.
664	// It looks back up the stack for operators that are not yet associated
665	// with a right operand, and if the precedence of the stacked operator >=
666	// the precedence of the current operator, binds the operand left,
667	// to the previously encountered operator.
668	//
669	//------------------------------------------------------------------------------
670	void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
671	RBBINode *n;
672	// printNodeStack("entering fixOpStack()");
673	for (;;) {
674	n = fNodeStack[fNodeStackPtr-`1`]; // an operator node
675	if (n->fPrecedence == `0`) {
676	RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");
677	error(U_BRK_INTERNAL_ERROR);
678	return;
679	}
680
681	if (n->fPrecedence < p \|\| n->fPrecedence <= RBBINode::precLParen) {
682	// The most recent operand goes with the current operator,
683	// not with the previously stacked one.
684	break;
685	}
686	// Stack operator is a binary op ( '\|' or concatenation)
687	// TOS operand becomes right child of this operator.
688	// Resulting subexpression becomes the TOS operand.
689	n->fRightChild = fNodeStack[fNodeStackPtr];
690	fNodeStack[fNodeStackPtr]->fParent = n;
691	fNodeStackPtr--;
692	// printNodeStack("looping in fixOpStack() ");
693	}
694
695	if (p <= RBBINode::precLParen) {
696	// Scan is at a right paren or end of expression.
697	// The scanned item must match the stack, or else there was an error.
698	// Discard the left paren (or start expr) node from the stack,
699	// leaving the completed (sub)expression as TOS.
700	if (n->fPrecedence != p) {
701	// Right paren encountered matched start of expression node, or
702	// end of expression matched with a left paren node.
703	error(U_BRK_MISMATCHED_PAREN);
704	}
705	fNodeStack[fNodeStackPtr-`1`] = fNodeStack[fNodeStackPtr];
706	fNodeStackPtr--;
707	// Delete the now-discarded LParen or Start node.
708	delete n;
709	}
710	// printNodeStack("leaving fixOpStack()");
711	}
712
713
714
715
716	//------------------------------------------------------------------------------
717	//
718	// findSetFor given a UnicodeString,
719	// - find the corresponding Unicode Set (uset node)
720	// (create one if necessary)
721	// - Set fLeftChild of the caller's node (should be a setRef node)
722	// to the uset node
723	// Maintain a hash table of uset nodes, so the same one is always used
724	// for the same string.
725	// If a "to adopt" set is provided and we haven't seen this key before,
726	// add the provided set to the hash table.
727	// If the string is one (32 bit) char in length, the set contains
728	// just one element which is the char in question.
729	// If the string is "any", return a set containing all chars.
730	//
731	//------------------------------------------------------------------------------
732	void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode node, UnicodeSet setToAdopt) {
733
734	RBBISetTableEl *el;
735
736	// First check whether we've already cached a set for this string.
737	// If so, just use the cached set in the new node.
738	// delete any set provided by the caller, since we own it.
739	el = (RBBISetTableEl *)uhash_get(fSetTable, &s);
740	if (el != nullptr) {
741	delete setToAdopt;
742	node->fLeftChild = el->val;
743	U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
744	return;
745	}
746
747	// Haven't seen this set before.
748	// If the caller didn't provide us with a prebuilt set,
749	// create a new UnicodeSet now.
750	if (setToAdopt == nullptr) {
751	if (s.compare(kAny, -`1`) == `0`) {
752	setToAdopt = new UnicodeSet (`0x000000`, `0x10ffff`);
753	} else {
754	UChar32 c;
755	c = s.char32At(`0`);
756	setToAdopt = new UnicodeSet (c, c);
757	}
758	}
759
760	//
761	// Make a new uset node to refer to this UnicodeSet
762	// This new uset node becomes the child of the caller's setReference node.
763	//
764	RBBINode usetNode = new* RBBINode (RBBINode::uset);
765	if (usetNode == nullptr) {
766	error(U_MEMORY_ALLOCATION_ERROR);
767	return;
768	}
769	usetNode->fInputSet = setToAdopt;
770	usetNode->fParent = node;
771	node->fLeftChild = usetNode;
772	usetNode->fText = s;
773
774
775	//
776	// Add the new uset node to the list of all uset nodes.
777	//
778	fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);
779
780
781	//
782	// Add the new set to the set hash table.
783	//
784	el = (RBBISetTableEl )uprv_malloc(sizeof*(RBBISetTableEl));
785	UnicodeString tkey = new* UnicodeString (s);
786	if (tkey == nullptr \|\| el == nullptr \|\| setToAdopt == nullptr) {
787	// Delete to avoid memory leak
788	delete tkey;
789	tkey = nullptr;
790	uprv_free(el);
791	el = nullptr;
792	delete setToAdopt;
793	setToAdopt = nullptr;
794
795	error(U_MEMORY_ALLOCATION_ERROR);
796	return;
797	}
798	el->key = tkey;
799	el->val = usetNode;
800	uhash_put(fSetTable, el->key, el, fRB->fStatus);
801
802	return;
803	}
804
805
806
//
//  Assorted Unicode character constants.
//     Numeric because there is no portable way to enter them as literals.
//     (Think EBCDIC).
//
static const char16_t   chCR        = 0x0d;      // New lines, for terminating comments.
static const char16_t   chLF        = 0x0a;
static const char16_t   chNEL       = 0x85;      //    NEL newline variant
static const char16_t   chLS        = 0x2028;    //    Unicode Line Separator
static const char16_t   chApos      = 0x27;      //  single quote, for quoted chars.
static const char16_t   chPound     = 0x23;      // '#', introduces a comment.
static const char16_t   chBackSlash = 0x5c;      // '\'  introduces a char escape
static const char16_t   chLParen    = 0x28;
static const char16_t   chRParen    = 0x29;
821
822
823	//------------------------------------------------------------------------------
824	//
825	// stripRules Return a rules string without extra spaces.
826	// (Comments are removed separately, during rule parsing.)
827	//
828	//------------------------------------------------------------------------------
829	UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
830	UnicodeString strippedRules;
831	int32_t rulesLength = rules.length();
832
833	for (int32_t idx=`0`; idx<rulesLength; idx = rules.moveIndex32(idx, `1`)) {
834	UChar32 cp = rules.char32At(idx);
835	bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE);
836	if (whiteSpace) {
837	continue;
838	}
839	strippedRules.append(cp);
840	}
841	return strippedRules;
842	}
843
844
845	//------------------------------------------------------------------------------
846	//
847	// nextCharLL Low Level Next Char from rule input source.
848	// Get a char from the input character iterator,
849	// keep track of input position for error reporting.
850	//
851	//------------------------------------------------------------------------------
852	UChar32 RBBIRuleScanner::nextCharLL() {
853	UChar32 ch;
854
855	if (fNextIndex >= fRB->fRules.length()) {
856	return (UChar32)-`1`;
857	}
858	ch = fRB->fRules.char32At(fNextIndex);
859	if (U_IS_SURROGATE(ch)) {
860	error(U_ILLEGAL_CHAR_FOUND);
861	return U_SENTINEL;
862	}
863	fNextIndex = fRB->fRules.moveIndex32(fNextIndex, `1`);
864
865	if (ch == chCR \|\|
866	ch == chNEL \|\|
867	ch == chLS \|\|
868	(ch == chLF && fLastChar != chCR)) {
869	// Character is starting a new line. Bump up the line number, and
870	// reset the column to 0.
871	fLineNum++;
872	fCharNum=`0`;
873	if (fQuoteMode) {
874	error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
875	fQuoteMode = false;
876	}
877	}
878	else {
879	// Character is not starting a new line. Except in the case of a
880	// LF following a CR, increment the column position.
881	if (ch != chLF) {
882	fCharNum++;
883	}
884	}
885	fLastChar = ch;
886	return ch;
887	}
888
889
890	//------------------------------------------------------------------------------
891	//
892	// nextChar for rules scanning. At this level, we handle stripping
893	// out comments and processing backslash character escapes.
894	// The rest of the rules grammar is handled at the next level up.
895	//
896	//------------------------------------------------------------------------------
897	void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
898
899	// Unicode Character constants needed for the processing done by nextChar(),
900	// in hex because literals wont work on EBCDIC machines.
901
902	fScanIndex = fNextIndex;
903	c.fChar = nextCharLL();
904	c.fEscaped = false;
905
906	//
907	// check for '' sequence.
908	// These are recognized in all contexts, whether in quoted text or not.
909	//
910	if (c.fChar == chApos) {
911	if (fRB->fRules.char32At(fNextIndex) == chApos) {
912	c.fChar = nextCharLL(); // get nextChar officially so character counts
913	c.fEscaped = true; // stay correct.
914	}
915	else
916	{
917	// Single quote, by itself.
918	// Toggle quoting mode.
919	// Return either '(' or ')', because quotes cause a grouping of the quoted text.
920	fQuoteMode = !fQuoteMode;
921	if (fQuoteMode) {
922	c.fChar = chLParen;
923	} else {
924	c.fChar = chRParen;
925	}
926	c.fEscaped = false; // The paren that we return is not escaped.
927	return;
928	}
929	}
930
931	if (fQuoteMode) {
932	c.fEscaped = true;
933	}
934	else
935	{
936	// We are not in a 'quoted region' of the source.
937	//
938	if (c.fChar == chPound) {
939	// Start of a comment. Consume the rest of it.
940	// The new-line char that terminates the comment is always returned.
941	// It will be treated as white-space, and serves to break up anything
942	// that might otherwise incorrectly clump together with a comment in
943	// the middle (a variable name, for example.)
944	int32_t commentStart = fScanIndex;
945	for (;;) {
946	c.fChar = nextCharLL();
947	if (c.fChar == (UChar32)-`1` \|\| // EOF
948	c.fChar == chCR \|\|
949	c.fChar == chLF \|\|
950	c.fChar == chNEL \|\|
951	c.fChar == chLS) {break;}
952	}
953	for (int32_t i=commentStart; i<fNextIndex-`1`; ++i) {
954	fRB->fStrippedRules.setCharAt(i, u`' '`);
955	}
956	}
957	if (c.fChar == (UChar32)-`1`) {
958	return;
959	}
960
961	//
962	// check for backslash escaped characters.
963	// Use UnicodeString::unescapeAt() to handle them.
964	//
965	if (c.fChar == chBackSlash) {
966	c.fEscaped = true;
967	int32_t startX = fNextIndex;
968	c.fChar = fRB->fRules.unescapeAt(fNextIndex);
969	if (fNextIndex == startX) {
970	error(U_BRK_HEX_DIGITS_EXPECTED);
971	}
972	fCharNum += fNextIndex-startX;
973	}
974	}
975	// putc(c.fChar, stdout);
976	}
977
978	//------------------------------------------------------------------------------
979	//
980	// Parse RBBI rules. The state machine for rules parsing is here.
981	// The state tables are hand-written in the file rbbirpt.txt,
982	// and converted to the form used here by a perl
983	// script rbbicst.pl
984	//
985	//------------------------------------------------------------------------------
986	void RBBIRuleScanner::parse() {
987	uint16_t state;
988	const RBBIRuleTableEl *tableEl;
989
990	if (U_FAILURE(*fRB->fStatus)) {
991	return;
992	}
993
994	state = `1`;
995	nextChar(fC);
996	//
997	// Main loop for the rule parsing state machine.
998	// Runs once per state transition.
999	// Each time through optionally performs, depending on the state table,
1000	// - an advance to the the next input char
1001	// - an action to be performed.
1002	// - pushing or popping a state to/from the local state return stack.
1003	//
1004	for (;;) {
1005	// Bail out if anything has gone wrong.
1006	// RBBI rule file parsing stops on the first error encountered.
1007	if (U_FAILURE(*fRB->fStatus)) {
1008	break;
1009	}
1010
1011	// Quit if state == 0. This is the normal way to exit the state machine.
1012	//
1013	if (state == `0`) {
1014	break;
1015	}
1016
1017	// Find the state table element that matches the input char from the rule, or the
1018	// class of the input character. Start with the first table row for this
1019	// state, then linearly scan forward until we find a row that matches the
1020	// character. The last row for each state always matches all characters, so
1021	// the search will stop there, if not before.
1022	//
1023	tableEl = &gRuleParseStateTable[state];
1024	#ifdef RBBI_DEBUG
1025	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
1026	RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ",
1027	fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
1028	}
1029	#endif
1030
1031	for (;;) {
1032	#ifdef RBBI_DEBUG
1033	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);}
1034	#endif
1035	if (tableEl->fCharClass < `127` && fC.fEscaped == false && tableEl->fCharClass == fC.fChar) {
1036	// Table row specified an individual character, not a set, and
1037	// the input character is not escaped, and
1038	// the input character matched it.
1039	break;
1040	}
1041	if (tableEl->fCharClass == `255`) {
1042	// Table row specified default, match anything character class.
1043	break;
1044	}
1045	if (tableEl->fCharClass == `254` && fC.fEscaped) {
1046	// Table row specified "escaped" and the char was escaped.
1047	break;
1048	}
1049	if (tableEl->fCharClass == `253` && fC.fEscaped &&
1050	(fC.fChar == `0x50` \|\| fC.fChar == `0x70` )) {
1051	// Table row specified "escaped P" and the char is either 'p' or 'P'.
1052	break;
1053	}
1054	if (tableEl->fCharClass == `252` && fC.fChar == (UChar32)-`1`) {
1055	// Table row specified eof and we hit eof on the input.
1056	break;
1057	}
1058
1059	if (tableEl->fCharClass >= `128` && tableEl->fCharClass < `240` && // Table specs a char class &&
1060	fC.fEscaped == false && // char is not escaped &&
1061	fC.fChar != (UChar32)-`1`) { // char is not EOF
1062	U_ASSERT((tableEl->fCharClass-`128`) < UPRV_LENGTHOF(fRuleSets));
1063	if (fRuleSets[tableEl->fCharClass-`128`].contains(fC.fChar)) {
1064	// Table row specified a character class, or set of characters,
1065	// and the current char matches it.
1066	break;
1067	}
1068	}
1069
1070	// No match on this row, advance to the next row for this state,
1071	tableEl++;
1072	}
1073	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}
1074
1075	//
1076	// We've found the row of the state table that matches the current input
1077	// character from the rules string.
1078	// Perform any action specified by this row in the state table.
1079	if (doParseActions((int32_t)tableEl->fAction) == false) {
1080	// Break out of the state machine loop if the
1081	// the action signalled some kind of error, or
1082	// the action was to exit, occurs on normal end-of-rules-input.
1083	break;
1084	}
1085
1086	if (tableEl->fPushState != `0`) {
1087	fStackPtr++;
1088	if (fStackPtr >= kStackSize) {
1089	error(U_BRK_INTERNAL_ERROR);
1090	RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
1091	fStackPtr--;
1092	}
1093	fStack[fStackPtr] = tableEl->fPushState;
1094	}
1095
1096	if (tableEl->fNextChar) {
1097	nextChar(fC);
1098	}
1099
1100	// Get the next state from the table entry, or from the
1101	// state stack if the next state was specified as "pop".
1102	if (tableEl->fNextState != `255`) {
1103	state = tableEl->fNextState;
1104	} else {
1105	state = fStack[fStackPtr];
1106	fStackPtr--;
1107	if (fStackPtr < `0`) {
1108	error(U_BRK_INTERNAL_ERROR);
1109	RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
1110	fStackPtr++;
1111	}
1112	}
1113
1114	}
1115
1116	if (U_FAILURE(*fRB->fStatus)) {
1117	return;
1118	}
1119
1120	// If there are no forward rules set an error.
1121	//
1122	if (fRB->fForwardTree == nullptr) {
1123	error(U_BRK_RULE_SYNTAX);
1124	return;
1125	}
1126
1127	//
1128	// Parsing of the input RBBI rules is complete.
1129	// We now have a parse tree for the rule expressions
1130	// and a list of all UnicodeSets that are referenced.
1131	//
1132	#ifdef RBBI_DEBUG
1133	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}
1134	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) {
1135	RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
1136	RBBINode::printTree(fRB->fForwardTree, true);
1137	RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
1138	RBBINode::printTree(fRB->fReverseTree, true);
1139	RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
1140	RBBINode::printTree(fRB->fSafeFwdTree, true);
1141	RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
1142	RBBINode::printTree(fRB->fSafeRevTree, true);
1143	}
1144	#endif
1145	}
1146
1147
1148	//------------------------------------------------------------------------------
1149	//
1150	// printNodeStack for debugging...
1151	//
1152	//------------------------------------------------------------------------------
#ifdef RBBI_DEBUG
// Debug aid: print the given title, then dump the tree rooted at each node stack
// entry, walking from the current top of the stack down to index 1.
void RBBIRuleScanner::printNodeStack(const char *title) {
    RBBIDebugPrintf("%s. Dumping node stack...\n", title);
    int slot = fNodeStackPtr;
    while (slot > 0) {
        RBBINode::printTree(fNodeStack[slot], true);
        --slot;
    }
}
#endif
1160
1161
1162
1163
1164	//------------------------------------------------------------------------------
1165	//
1166	// pushNewNode create a new RBBINode of the specified type and push it
1167	// onto the stack of nodes.
1168	//
1169	//------------------------------------------------------------------------------
1170	RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) {
1171	if (U_FAILURE(*fRB->fStatus)) {
1172	return nullptr;
1173	}
1174	if (fNodeStackPtr >= kStackSize - `1`) {
1175	error(U_BRK_RULE_SYNTAX);
1176	RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");
1177	return nullptr;
1178	}
1179	fNodeStackPtr++;
1180	fNodeStack[fNodeStackPtr] = new RBBINode (t);
1181	if (fNodeStack[fNodeStackPtr] == nullptr) {
1182	*fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
1183	}
1184	return fNodeStack[fNodeStackPtr];
1185	}
1186
1187
1188
1189	//------------------------------------------------------------------------------
1190	//
1191	// scanSet Construct a UnicodeSet from the text at the current scan
1192	// position. Advance the scan position to the first character
1193	// after the set.
1194	//
1195	// A new RBBI setref node referring to the set is pushed onto the node
1196	// stack.
1197	//
1198	// The scan position is normally under the control of the state machine
1199	// that controls rule parsing. UnicodeSets, however, are parsed by
1200	// the UnicodeSet constructor, not by the RBBI rule parser.
1201	//
1202	//------------------------------------------------------------------------------
1203	void RBBIRuleScanner::scanSet() {
1204	UnicodeSet *uset;
1205	ParsePosition pos;
1206	int startPos;
1207	int i;
1208
1209	if (U_FAILURE(*fRB->fStatus)) {
1210	return;
1211	}
1212
1213	pos.setIndex(fScanIndex);
1214	startPos = fScanIndex;
1215	UErrorCode localStatus = U_ZERO_ERROR;
1216	uset = new UnicodeSet ();
1217	if (uset == nullptr) {
1218	localStatus = U_MEMORY_ALLOCATION_ERROR;
1219	} else {
1220	uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
1221	}
1222	if (U_FAILURE(localStatus)) {
1223	// TODO: Get more accurate position of the error from UnicodeSet's return info.
1224	// UnicodeSet appears to not be reporting correctly at this time.
1225	#ifdef RBBI_DEBUG
1226	RBBIDebugPrintf("UnicodeSet parse position.ErrorIndex = %d\n", pos.getIndex());
1227	#endif
1228	error(localStatus);
1229	delete uset;
1230	return;
1231	}
1232
1233	// Verify that the set contains at least one code point.
1234	//
1235	U_ASSERT(uset!=nullptr);
1236	if (uset->isEmpty()) {
1237	// This set is empty.
1238	// Make it an error, because it almost certainly is not what the user wanted.
1239	// Also, avoids having to think about corner cases in the tree manipulation code
1240	// that occurs later on.
1241	error(U_BRK_RULE_EMPTY_SET);
1242	delete uset;
1243	return;
1244	}
1245
1246
1247	// Advance the RBBI parse position over the UnicodeSet pattern.
1248	// Don't just set fScanIndex because the line/char positions maintained
1249	// for error reporting would be thrown off.
1250	i = pos.getIndex();
1251	for (;;) {
1252	if (fNextIndex >= i) {
1253	break;
1254	}
1255	nextCharLL();
1256	}
1257
1258	if (U_SUCCESS(*fRB->fStatus)) {
1259	RBBINode *n;
1260
1261	n = pushNewNode(RBBINode::setRef);
1262	if (U_FAILURE(*fRB->fStatus)) {
1263	return;
1264	}
1265	n->fFirstPos = startPos;
1266	n->fLastPos = fNextIndex;
1267	fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
1268	// findSetFor() serves several purposes here:
1269	// - Adopts storage for the UnicodeSet, will be responsible for deleting.
1270	// - Maintains collection of all sets in use, needed later for establishing
1271	// character categories for run time engine.
1272	// - Eliminates mulitiple instances of the same set.
1273	// - Creates a new uset node if necessary (if this isn't a duplicate.)
1274	findSetFor(n->fText, n, uset);
1275	}
1276
1277	}
1278
1279	int32_t RBBIRuleScanner::numRules() {
1280	return fRuleNum;
1281	}
1282
1283	U_NAMESPACE_END
1284
1285	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1286

Browse the source code of Godot/thirdparty/icu4c/common/rbbiscan.cpp