rbbiscan.h source code [ClickHouse/contrib/icu/icu4c/source/common/rbbiscan.h]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	//
4	// rbbiscan.h
5	//
6	// Copyright (C) 2002-2016, International Business Machines Corporation and others.
7	// All Rights Reserved.
8	//
9	// This file contains declarations for class RBBIRuleScanner
10	//
11
12
13	#ifndef RBBISCAN_H
14	#define RBBISCAN_H
15
16	#include "unicode/utypes.h"
17	#include "unicode/uobject.h"
18	#include "unicode/rbbi.h"
19	#include "unicode/uniset.h"
20	#include "unicode/parseerr.h"
21	#include "uhash.h"
22	#include "uvector.h"
23	#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
24	// looks up references to $variables within a set.
25	#include "rbbinode.h"
26	#include "rbbirpt.h"
27
28	U_NAMESPACE_BEGIN
29
30	class RBBIRuleBuilder;
31	class RBBISymbolTable;
32
33
34	//--------------------------------------------------------------------------------
35	//
36	// class RBBIRuleScanner does the lowest level, character-at-a-time
37	// scanning of break iterator rules.
38	//
39	// The output of the scanner is parse trees for
40	// the rule expressions and a list of all Unicode Sets
41	// encountered.
42	//
43	//--------------------------------------------------------------------------------
44
45	class RBBIRuleScanner : public UMemory {
46	public:
47
48	enum {
49	kStackSize = `100` // The size of the state stack for
50	}; // rules parsing. Corresponds roughly
51	// to the depth of parentheses nesting
52	// that is allowed in the rules.
53
54	struct RBBIRuleChar {
55	UChar32 fChar;
56	UBool fEscaped;
57	RBBIRuleChar() : fChar(`0`), fEscaped(FALSE) {}
58	};
59
60	RBBIRuleScanner(RBBIRuleBuilder *rb);
61
62
63	virtual ~RBBIRuleScanner();
64
65	void nextChar(RBBIRuleChar &c); // Get the next char from the input stream.
66	// Return false if at end.
67
68	UBool push(const RBBIRuleChar &c); // Push (unget) one character.
69	// Only a single character may be pushed.
70
71	void parse(); // Parse the rules, generating two parse
72	// trees, one each for the forward and
73	// reverse rules,
74	// and a list of UnicodeSets encountered.
75
76	int32_t numRules(); // Return the number of rules that have been seen.
77
78	/**
79	* Return a rules string without unnecessary
80	* characters.
81	*/
82	static UnicodeString stripRules(const UnicodeString &rules);
83	private:
84
85	UBool doParseActions(int32_t a);
86	void error(UErrorCode e); // error reporting convenience function.
87	void fixOpStack(RBBINode::OpPrecedence p);
88	// a character.
89	void findSetFor(const UnicodeString &s, RBBINode node, UnicodeSet setToAdopt = NULL);
90
91	UChar32 nextCharLL();
92	#ifdef RBBI_DEBUG
93	void printNodeStack(const char *title);
94	#endif
95	RBBINode *pushNewNode(RBBINode::NodeType t);
96	void scanSet();
97
98
99	RBBIRuleBuilder fRB; // The rule builder that we are part of.*
100
101	int32_t fScanIndex; // Index of current character being processed
102	// in the rule input string.
103	int32_t fNextIndex; // Index of the next character, which
104	// is the first character not yet scanned.
105	UBool fQuoteMode; // Scan is in a 'quoted region'
106	int32_t fLineNum; // Line number in input file.
107	int32_t fCharNum; // Char position within the line.
108	UChar32 fLastChar; // Previous char, needed to count CR-LF
109	// as a single line, not two.
110
111	RBBIRuleChar fC; // Current char for parse state machine
112	// processing.
113	UnicodeString fVarName; // $variableName, valid when we've just
114	// scanned one.
115
116	RBBIRuleTableEl *fStateTable; // State Transition Table for RBBI Rule*
117	// parsing. index by p[state][char-class]
118
119	uint16_t fStack[kStackSize]; // State stack, holds state pushes
120	int32_t fStackPtr; // and pops as specified in the state
121	// transition rules.
122
123	RBBINode fNodeStack[kStackSize]; // Node stack, holds nodes created*
124	// during the parse of a rule
125	int32_t fNodeStackPtr;
126
127
128	UBool fReverseRule; // True if the rule currently being scanned
129	// is a reverse direction rule (if it
130	// starts with a '!')
131
132	UBool fLookAheadRule; // True if the rule includes a '/'
133	// somewhere within it.
134
135	UBool fNoChainInRule; // True if the current rule starts with a '^'.
136
137	RBBISymbolTable fSymbolTable; // symbol table, holds definitions of*
138	// $variable symbols.
139
140	UHashtable fSetTable; // UnicocodeSet hash table, holds indexes to*
141	// the sets created while parsing rules.
142	// The key is the string used for creating
143	// the set.
144
145	UnicodeSet fRuleSets[`10`]; // Unicode Sets that are needed during
146	// the scanning of RBBI rules. The
147	// indicies for these are assigned by the
148	// perl script that builds the state tables.
149	// See rbbirpt.h.
150
151	int32_t fRuleNum; // Counts each rule as it is scanned.
152
153	int32_t fOptionStart; // Input index of start of a !!option
154	// keyword, while being scanned.
155
156	UnicodeSet *gRuleSet_rule_char;
157	UnicodeSet *gRuleSet_white_space;
158	UnicodeSet *gRuleSet_name_char;
159	UnicodeSet *gRuleSet_name_start_char;
160
161	RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
162	RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
163	};
164
165	U_NAMESPACE_END
166
167	#endif
168

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/rbbiscan.h