1 | /* |
2 | * Copyright (c) 2015, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Character classes and their mnemonics. |
31 | */ |
32 | |
33 | #ifndef COMPONENTCLASS_H |
34 | #define COMPONENTCLASS_H |
35 | |
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "Component.h"
#include "Parser.h"
#include "util/charreach.h"
#include "util/unicode_def.h"
#include "ue2common.h"
45 | |
46 | namespace ue2 { |
47 | |
/**
 * \brief Identifiers for the predefined character classes known to the parser.
 *
 * The enumerators carry no explicit values other than the first; their
 * numbering is purely positional, so new entries must not be inserted in a
 * way that callers relying on the existing order would not expect.
 */
enum PredefinedClass {
    // ---- Named classes ----------------------------------------------------
    CLASS_ALNUM = 0,
    CLASS_ALPHA,
    CLASS_ANY, // the dot; does not quite match everything outside dotall mode
    CLASS_ASCII,
    CLASS_BLANK,
    CLASS_CNTRL,
    CLASS_DIGIT,
    CLASS_GRAPH,
    CLASS_HORZ,
    CLASS_LOWER,
    CLASS_PRINT,
    CLASS_PUNCT,
    CLASS_SPACE, // includes vertical tab
    CLASS_UPPER,
    CLASS_VERT,
    CLASS_WORD,
    CLASS_XDIGIT,
    CLASS_XGRAPH, // [:graph:] when in UCP mode
    CLASS_XPRINT, // [:print:] when in UCP mode
    CLASS_XPUNCT, // [:punct:] when in UCP mode

    // ---- CLASS_UCP_* (Unicode property classes) ---------------------------
    CLASS_UCP_C,
    CLASS_UCP_CC,
    CLASS_UCP_CF,
    CLASS_UCP_CN, // code points that are unallocated
    CLASS_UCP_CO,
    CLASS_UCP_CS, // contains no valid unicode codepoints
    CLASS_UCP_L,
    CLASS_UCP_LL,
    CLASS_UCP_LM,
    CLASS_UCP_LO,
    CLASS_UCP_LT,
    CLASS_UCP_LU,
    CLASS_UCP_L_AND, // "L&": the union LL + LU + LT
    CLASS_UCP_M,
    CLASS_UCP_MC,
    CLASS_UCP_ME,
    CLASS_UCP_MN,
    CLASS_UCP_N,
    CLASS_UCP_ND,
    CLASS_UCP_NL,
    CLASS_UCP_NO,
    CLASS_UCP_P,
    CLASS_UCP_PC,
    CLASS_UCP_PD,
    CLASS_UCP_PE,
    CLASS_UCP_PF,
    CLASS_UCP_PI,
    CLASS_UCP_PO,
    CLASS_UCP_PS,
    CLASS_UCP_S,
    CLASS_UCP_SC,
    CLASS_UCP_SK,
    CLASS_UCP_SM,
    CLASS_UCP_SO,
    CLASS_UCP_Z,
    CLASS_UCP_ZL,
    CLASS_UCP_ZP,
    CLASS_UCP_ZS,
    CLASS_UCP_XAN,
    CLASS_UCP_XPS, // see CLASS_SPACE
    CLASS_UCP_XSP,
    CLASS_UCP_XWD,

    // ---- CLASS_SCRIPT_* (one entry per script, alphabetical) --------------
    CLASS_SCRIPT_ARABIC,
    CLASS_SCRIPT_ARMENIAN,
    CLASS_SCRIPT_AVESTAN,
    CLASS_SCRIPT_BALINESE,
    CLASS_SCRIPT_BAMUM,
    CLASS_SCRIPT_BATAK,
    CLASS_SCRIPT_BENGALI,
    CLASS_SCRIPT_BOPOMOFO,
    CLASS_SCRIPT_BRAHMI,
    CLASS_SCRIPT_BRAILLE,
    CLASS_SCRIPT_BUGINESE,
    CLASS_SCRIPT_BUHID,
    CLASS_SCRIPT_CANADIAN_ABORIGINAL,
    CLASS_SCRIPT_CARIAN,
    CLASS_SCRIPT_CHAM,
    CLASS_SCRIPT_CHEROKEE,
    CLASS_SCRIPT_COMMON,
    CLASS_SCRIPT_COPTIC,
    CLASS_SCRIPT_CUNEIFORM,
    CLASS_SCRIPT_CYPRIOT,
    CLASS_SCRIPT_CYRILLIC,
    CLASS_SCRIPT_DESERET,
    CLASS_SCRIPT_DEVANAGARI,
    CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS,
    CLASS_SCRIPT_ETHIOPIC,
    CLASS_SCRIPT_GEORGIAN,
    CLASS_SCRIPT_GLAGOLITIC,
    CLASS_SCRIPT_GOTHIC,
    CLASS_SCRIPT_GREEK,
    CLASS_SCRIPT_GUJARATI,
    CLASS_SCRIPT_GURMUKHI,
    CLASS_SCRIPT_HAN,
    CLASS_SCRIPT_HANGUL,
    CLASS_SCRIPT_HANUNOO,
    CLASS_SCRIPT_HEBREW,
    CLASS_SCRIPT_HIRAGANA,
    CLASS_SCRIPT_IMPERIAL_ARAMAIC,
    CLASS_SCRIPT_INHERITED,
    CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI,
    CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN,
    CLASS_SCRIPT_JAVANESE,
    CLASS_SCRIPT_KAITHI,
    CLASS_SCRIPT_KANNADA,
    CLASS_SCRIPT_KATAKANA,
    CLASS_SCRIPT_KAYAH_LI,
    CLASS_SCRIPT_KHAROSHTHI,
    CLASS_SCRIPT_KHMER,
    CLASS_SCRIPT_LAO,
    CLASS_SCRIPT_LATIN,
    CLASS_SCRIPT_LEPCHA,
    CLASS_SCRIPT_LIMBU,
    CLASS_SCRIPT_LINEAR_B,
    CLASS_SCRIPT_LISU,
    CLASS_SCRIPT_LYCIAN,
    CLASS_SCRIPT_LYDIAN,
    CLASS_SCRIPT_MALAYALAM,
    CLASS_SCRIPT_MANDAIC,
    CLASS_SCRIPT_MEETEI_MAYEK,
    CLASS_SCRIPT_MONGOLIAN,
    CLASS_SCRIPT_MYANMAR,
    CLASS_SCRIPT_NEW_TAI_LUE,
    CLASS_SCRIPT_NKO,
    CLASS_SCRIPT_OGHAM,
    CLASS_SCRIPT_OL_CHIKI,
    CLASS_SCRIPT_OLD_ITALIC,
    CLASS_SCRIPT_OLD_PERSIAN,
    CLASS_SCRIPT_OLD_SOUTH_ARABIAN,
    CLASS_SCRIPT_OLD_TURKIC,
    CLASS_SCRIPT_ORIYA,
    CLASS_SCRIPT_OSMANYA,
    CLASS_SCRIPT_PHAGS_PA,
    CLASS_SCRIPT_PHOENICIAN,
    CLASS_SCRIPT_REJANG,
    CLASS_SCRIPT_RUNIC,
    CLASS_SCRIPT_SAMARITAN,
    CLASS_SCRIPT_SAURASHTRA,
    CLASS_SCRIPT_SHAVIAN,
    CLASS_SCRIPT_SINHALA,
    CLASS_SCRIPT_SUNDANESE,
    CLASS_SCRIPT_SYLOTI_NAGRI,
    CLASS_SCRIPT_SYRIAC,
    CLASS_SCRIPT_TAGALOG,
    CLASS_SCRIPT_TAGBANWA,
    CLASS_SCRIPT_TAI_LE,
    CLASS_SCRIPT_TAI_THAM,
    CLASS_SCRIPT_TAI_VIET,
    CLASS_SCRIPT_TAMIL,
    CLASS_SCRIPT_TELUGU,
    CLASS_SCRIPT_THAANA,
    CLASS_SCRIPT_THAI,
    CLASS_SCRIPT_TIBETAN,
    CLASS_SCRIPT_TIFINAGH,
    CLASS_SCRIPT_UGARITIC,
    CLASS_SCRIPT_VAI,
    CLASS_SCRIPT_YI,

    // ---- Trailing entry ---------------------------------------------------
    CLASS_UCP_ANY
};
208 | |
209 | CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode); |
210 | |
// Forward declarations: ComponentClass is named by the factory functions
// declared before the class definition in this header.
class ComponentClass;
class NFABuilder;
213 | |
214 | /* Caller is responsible for lifecycle management, class finalized */ |
215 | std::unique_ptr<ComponentClass> |
216 | generateComponent(PredefinedClass c, bool negated, const ParseMode &mode); |
217 | |
218 | /* Caller is responsible for lifecycle management, class open */ |
219 | std::unique_ptr<ComponentClass> getComponentClass(const ParseMode &mode); |
220 | |
221 | /** Common case: generate a component for a single literal character, possibly |
222 | * in caseless mode. Caller is responsible for lifecycle management. */ |
223 | std::unique_ptr<ComponentClass> getLiteralComponentClass(unsigned char c, |
224 | bool nocase); |
225 | |
226 | class ComponentClass : public Component { |
227 | friend class DumpVisitor; |
228 | protected: |
229 | explicit ComponentClass(const ParseMode &mode_in); |
230 | public: |
231 | ~ComponentClass() override; |
232 | ComponentClass *clone() const override = 0; |
233 | |
234 | Component *accept(ComponentVisitor &v) override = 0; |
235 | void accept(ConstComponentVisitor &v) const override = 0; |
236 | |
237 | /** \brief True if the class contains no members (i.e. it will not match |
238 | * against anything). This function can only be called on a finalized |
239 | * class. |
240 | * |
241 | * Note: This is a different concept to Component::empty. |
242 | */ |
243 | virtual bool class_empty(void) const = 0; |
244 | |
245 | virtual void add(PredefinedClass c, bool negated) = 0; |
246 | virtual void add(unichar c) = 0; /* may throw LocatedParseError */ |
247 | void addDash(void); |
248 | |
249 | void negate(void); |
250 | virtual void finalize(void) = 0; |
251 | |
252 | bool isNegated() const { return m_negate; } |
253 | |
254 | std::vector<PositionInfo> first() const override = 0; |
255 | std::vector<PositionInfo> last() const override = 0; |
256 | bool empty() const override { return false; } /* always 1 codepoint wide */ |
257 | |
258 | void notePositions(GlushkovBuildState &bs) override = 0; |
259 | void buildFollowSet(GlushkovBuildState &bs, |
260 | const std::vector<PositionInfo> &) override = 0; |
261 | |
262 | protected: |
263 | bool m_negate; |
264 | const ParseMode mode; |
265 | bool in_cand_range; |
266 | unichar range_start; |
267 | bool finalized; |
268 | |
269 | virtual void createRange(unichar) = 0; |
270 | |
271 | // Protected copy ctor. Use clone instead. |
272 | ComponentClass(const ComponentClass &other) |
273 | : Component(other), m_negate(other.m_negate), mode(other.mode), |
274 | in_cand_range(other.in_cand_range), range_start(other.range_start), |
275 | finalized(other.finalized) {} |
276 | }; |
277 | |
278 | } // namespace ue2 |
279 | |
280 | #endif // COMPONENTCLASS_H |
281 | |