1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3//
4// rbbiscan.h
5//
6// Copyright (C) 2002-2016, International Business Machines Corporation and others.
7// All Rights Reserved.
8//
9// This file contains declarations for class RBBIRuleScanner
10//
11
12
13#ifndef RBBISCAN_H
14#define RBBISCAN_H
15
16#include "unicode/utypes.h"
17#include "unicode/uobject.h"
18#include "unicode/rbbi.h"
19#include "unicode/uniset.h"
20#include "unicode/parseerr.h"
21#include "uhash.h"
22#include "uvector.h"
23#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
24 // looks up references to $variables within a set.
25#include "rbbinode.h"
26#include "rbbirpt.h"
27
28U_NAMESPACE_BEGIN
29
30class RBBIRuleBuilder;
31class RBBISymbolTable;
32
33
34//--------------------------------------------------------------------------------
35//
36// class RBBIRuleScanner does the lowest level, character-at-a-time
37// scanning of break iterator rules.
38//
39// The output of the scanner is parse trees for
40// the rule expressions and a list of all Unicode Sets
41// encountered.
42//
43//--------------------------------------------------------------------------------
44
45class RBBIRuleScanner : public UMemory {
46public:
47
48 enum {
49 kStackSize = 100 // The size of the state stack for
50 }; // rules parsing. Corresponds roughly
51 // to the depth of parentheses nesting
52 // that is allowed in the rules.
53
54 struct RBBIRuleChar {
55 UChar32 fChar;
56 UBool fEscaped;
57 RBBIRuleChar() : fChar(0), fEscaped(false) {}
58 };
59
60 RBBIRuleScanner(RBBIRuleBuilder *rb);
61
62
63 virtual ~RBBIRuleScanner();
64
65 void nextChar(RBBIRuleChar &c); // Get the next char from the input stream.
66 // Return false if at end.
67
68 UBool push(const RBBIRuleChar &c); // Push (unget) one character.
69 // Only a single character may be pushed.
70
71 void parse(); // Parse the rules, generating two parse
72 // trees, one each for the forward and
73 // reverse rules,
74 // and a list of UnicodeSets encountered.
75
76 int32_t numRules(); // Return the number of rules that have been seen.
77
78 /**
79 * Return a rules string without unnecessary
80 * characters.
81 */
82 static UnicodeString stripRules(const UnicodeString &rules);
83private:
84
85 UBool doParseActions(int32_t a);
86 void error(UErrorCode e); // error reporting convenience function.
87 void fixOpStack(RBBINode::OpPrecedence p);
88 // a character.
89 void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = nullptr);
90
91 UChar32 nextCharLL();
92#ifdef RBBI_DEBUG
93 void printNodeStack(const char *title);
94#endif
95 RBBINode *pushNewNode(RBBINode::NodeType t);
96 void scanSet();
97
98
99 RBBIRuleBuilder *fRB; // The rule builder that we are part of.
100
101 int32_t fScanIndex; // Index of current character being processed
102 // in the rule input string.
103 int32_t fNextIndex; // Index of the next character, which
104 // is the first character not yet scanned.
105 UBool fQuoteMode; // Scan is in a 'quoted region'
106 int32_t fLineNum; // Line number in input file.
107 int32_t fCharNum; // Char position within the line.
108 UChar32 fLastChar; // Previous char, needed to count CR-LF
109 // as a single line, not two.
110
111 RBBIRuleChar fC; // Current char for parse state machine
112 // processing.
113 UnicodeString fVarName; // $variableName, valid when we've just
114 // scanned one.
115
116 RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule
117 // parsing. index by p[state][char-class]
118
119 uint16_t fStack[kStackSize]; // State stack, holds state pushes
120 int32_t fStackPtr; // and pops as specified in the state
121 // transition rules.
122
123 RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created
124 // during the parse of a rule
125 int32_t fNodeStackPtr;
126
127
128 UBool fReverseRule; // True if the rule currently being scanned
129 // is a reverse direction rule (if it
130 // starts with a '!')
131
132 UBool fLookAheadRule; // True if the rule includes a '/'
133 // somewhere within it.
134
135 UBool fNoChainInRule; // True if the current rule starts with a '^'.
136
137 RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of
138 // $variable symbols.
139
140 UHashtable *fSetTable; // UnicocodeSet hash table, holds indexes to
141 // the sets created while parsing rules.
142 // The key is the string used for creating
143 // the set.
144
145 UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during
146 // the scanning of RBBI rules. The
147 // indices for these are assigned by the
148 // perl script that builds the state tables.
149 // See rbbirpt.h.
150
151 int32_t fRuleNum; // Counts each rule as it is scanned.
152
153 int32_t fOptionStart; // Input index of start of a !!option
154 // keyword, while being scanned.
155
156 UnicodeSet *gRuleSet_rule_char;
157 UnicodeSet *gRuleSet_white_space;
158 UnicodeSet *gRuleSet_name_char;
159 UnicodeSet *gRuleSet_name_start_char;
160
161 RBBIRuleScanner(const RBBIRuleScanner &other) = delete; // forbid copying of this class
162 RBBIRuleScanner &operator=(const RBBIRuleScanner &other) = delete; // forbid copying of this class
163};
164
165U_NAMESPACE_END
166
167#endif
168