1/*
2 * Copyright (c) 2015, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Character classes and their mnemonics.
31 */
32
33#ifndef COMPONENTCLASS_H
34#define COMPONENTCLASS_H
35
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "Component.h"
#include "Parser.h"
#include "util/charreach.h"
#include "util/unicode_def.h"
#include "ue2common.h"
45
46namespace ue2 {
47
/**
 * \brief Identifiers for the predefined character classes known to the
 * parser.
 *
 * Enumerators take consecutive values starting from zero, so their order
 * in this list determines their numeric value; insert new entries with care.
 */
enum PredefinedClass {
    /* General (non-UCP-prefixed) classes. */
    CLASS_ALNUM = 0,
    CLASS_ALPHA,
    CLASS_ANY,    /* dot, not quite any when not in dotall mode */
    CLASS_ASCII,
    CLASS_BLANK,
    CLASS_CNTRL,
    CLASS_DIGIT,
    CLASS_GRAPH,
    CLASS_HORZ,
    CLASS_LOWER,
    CLASS_PRINT,
    CLASS_PUNCT,
    CLASS_SPACE,  /* has vertical tab */
    CLASS_UPPER,
    CLASS_VERT,
    CLASS_WORD,
    CLASS_XDIGIT,
    CLASS_XGRAPH, /* [:graph:] in UCP mode */
    CLASS_XPRINT, /* [:print:] in UCP mode */
    CLASS_XPUNCT, /* [:punct:] in UCP mode */

    /* CLASS_UCP_* group. */
    CLASS_UCP_C,
    CLASS_UCP_CC,
    CLASS_UCP_CF,
    CLASS_UCP_CN, /* unallocated code points */
    CLASS_UCP_CO,
    CLASS_UCP_CS, /* does not contain valid unicode codepoints */
    CLASS_UCP_L,
    CLASS_UCP_LL,
    CLASS_UCP_LM,
    CLASS_UCP_LO,
    CLASS_UCP_LT,
    CLASS_UCP_LU,
    CLASS_UCP_L_AND, /* L& = LL+LU+LT */
    CLASS_UCP_M,
    CLASS_UCP_MC,
    CLASS_UCP_ME,
    CLASS_UCP_MN,
    CLASS_UCP_N,
    CLASS_UCP_ND,
    CLASS_UCP_NL,
    CLASS_UCP_NO,
    CLASS_UCP_P,
    CLASS_UCP_PC,
    CLASS_UCP_PD,
    CLASS_UCP_PE,
    CLASS_UCP_PF,
    CLASS_UCP_PI,
    CLASS_UCP_PO,
    CLASS_UCP_PS,
    CLASS_UCP_S,
    CLASS_UCP_SC,
    CLASS_UCP_SK,
    CLASS_UCP_SM,
    CLASS_UCP_SO,
    CLASS_UCP_Z,
    CLASS_UCP_ZL,
    CLASS_UCP_ZP,
    CLASS_UCP_ZS,
    CLASS_UCP_XAN,
    CLASS_UCP_XPS, /* CLASS_SPACE */
    CLASS_UCP_XSP,
    CLASS_UCP_XWD,

    /* CLASS_SCRIPT_* group: one enumerator per named script. */
    CLASS_SCRIPT_ARABIC,
    CLASS_SCRIPT_ARMENIAN,
    CLASS_SCRIPT_AVESTAN,
    CLASS_SCRIPT_BALINESE,
    CLASS_SCRIPT_BAMUM,
    CLASS_SCRIPT_BATAK,
    CLASS_SCRIPT_BENGALI,
    CLASS_SCRIPT_BOPOMOFO,
    CLASS_SCRIPT_BRAHMI,
    CLASS_SCRIPT_BRAILLE,
    CLASS_SCRIPT_BUGINESE,
    CLASS_SCRIPT_BUHID,
    CLASS_SCRIPT_CANADIAN_ABORIGINAL,
    CLASS_SCRIPT_CARIAN,
    CLASS_SCRIPT_CHAM,
    CLASS_SCRIPT_CHEROKEE,
    CLASS_SCRIPT_COMMON,
    CLASS_SCRIPT_COPTIC,
    CLASS_SCRIPT_CUNEIFORM,
    CLASS_SCRIPT_CYPRIOT,
    CLASS_SCRIPT_CYRILLIC,
    CLASS_SCRIPT_DESERET,
    CLASS_SCRIPT_DEVANAGARI,
    CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS,
    CLASS_SCRIPT_ETHIOPIC,
    CLASS_SCRIPT_GEORGIAN,
    CLASS_SCRIPT_GLAGOLITIC,
    CLASS_SCRIPT_GOTHIC,
    CLASS_SCRIPT_GREEK,
    CLASS_SCRIPT_GUJARATI,
    CLASS_SCRIPT_GURMUKHI,
    CLASS_SCRIPT_HAN,
    CLASS_SCRIPT_HANGUL,
    CLASS_SCRIPT_HANUNOO,
    CLASS_SCRIPT_HEBREW,
    CLASS_SCRIPT_HIRAGANA,
    CLASS_SCRIPT_IMPERIAL_ARAMAIC,
    CLASS_SCRIPT_INHERITED,
    CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI,
    CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN,
    CLASS_SCRIPT_JAVANESE,
    CLASS_SCRIPT_KAITHI,
    CLASS_SCRIPT_KANNADA,
    CLASS_SCRIPT_KATAKANA,
    CLASS_SCRIPT_KAYAH_LI,
    CLASS_SCRIPT_KHAROSHTHI,
    CLASS_SCRIPT_KHMER,
    CLASS_SCRIPT_LAO,
    CLASS_SCRIPT_LATIN,
    CLASS_SCRIPT_LEPCHA,
    CLASS_SCRIPT_LIMBU,
    CLASS_SCRIPT_LINEAR_B,
    CLASS_SCRIPT_LISU,
    CLASS_SCRIPT_LYCIAN,
    CLASS_SCRIPT_LYDIAN,
    CLASS_SCRIPT_MALAYALAM,
    CLASS_SCRIPT_MANDAIC,
    CLASS_SCRIPT_MEETEI_MAYEK,
    CLASS_SCRIPT_MONGOLIAN,
    CLASS_SCRIPT_MYANMAR,
    CLASS_SCRIPT_NEW_TAI_LUE,
    CLASS_SCRIPT_NKO,
    CLASS_SCRIPT_OGHAM,
    CLASS_SCRIPT_OL_CHIKI,
    CLASS_SCRIPT_OLD_ITALIC,
    CLASS_SCRIPT_OLD_PERSIAN,
    CLASS_SCRIPT_OLD_SOUTH_ARABIAN,
    CLASS_SCRIPT_OLD_TURKIC,
    CLASS_SCRIPT_ORIYA,
    CLASS_SCRIPT_OSMANYA,
    CLASS_SCRIPT_PHAGS_PA,
    CLASS_SCRIPT_PHOENICIAN,
    CLASS_SCRIPT_REJANG,
    CLASS_SCRIPT_RUNIC,
    CLASS_SCRIPT_SAMARITAN,
    CLASS_SCRIPT_SAURASHTRA,
    CLASS_SCRIPT_SHAVIAN,
    CLASS_SCRIPT_SINHALA,
    CLASS_SCRIPT_SUNDANESE,
    CLASS_SCRIPT_SYLOTI_NAGRI,
    CLASS_SCRIPT_SYRIAC,
    CLASS_SCRIPT_TAGALOG,
    CLASS_SCRIPT_TAGBANWA,
    CLASS_SCRIPT_TAI_LE,
    CLASS_SCRIPT_TAI_THAM,
    CLASS_SCRIPT_TAI_VIET,
    CLASS_SCRIPT_TAMIL,
    CLASS_SCRIPT_TELUGU,
    CLASS_SCRIPT_THAANA,
    CLASS_SCRIPT_THAI,
    CLASS_SCRIPT_TIBETAN,
    CLASS_SCRIPT_TIFINAGH,
    CLASS_SCRIPT_UGARITIC,
    CLASS_SCRIPT_VAI,
    CLASS_SCRIPT_YI,

    /* Last enumerator in the list. */
    CLASS_UCP_ANY
};
208
209CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode);
210
/* Forward declarations. ComponentClass is named by the factory functions
 * declared in this header. */
class ComponentClass;
class NFABuilder;
213
214/* Caller is responsible for lifecycle management, class finalized */
215std::unique_ptr<ComponentClass>
216generateComponent(PredefinedClass c, bool negated, const ParseMode &mode);
217
218/* Caller is responsible for lifecycle management, class open */
219std::unique_ptr<ComponentClass> getComponentClass(const ParseMode &mode);
220
221/** Common case: generate a component for a single literal character, possibly
222 * in caseless mode. Caller is responsible for lifecycle management. */
223std::unique_ptr<ComponentClass> getLiteralComponentClass(unsigned char c,
224 bool nocase);
225
226class ComponentClass : public Component {
227 friend class DumpVisitor;
228protected:
229 explicit ComponentClass(const ParseMode &mode_in);
230public:
231 ~ComponentClass() override;
232 ComponentClass *clone() const override = 0;
233
234 Component *accept(ComponentVisitor &v) override = 0;
235 void accept(ConstComponentVisitor &v) const override = 0;
236
237 /** \brief True if the class contains no members (i.e. it will not match
238 * against anything). This function can only be called on a finalized
239 * class.
240 *
241 * Note: This is a different concept to Component::empty.
242 */
243 virtual bool class_empty(void) const = 0;
244
245 virtual void add(PredefinedClass c, bool negated) = 0;
246 virtual void add(unichar c) = 0; /* may throw LocatedParseError */
247 void addDash(void);
248
249 void negate(void);
250 virtual void finalize(void) = 0;
251
252 bool isNegated() const { return m_negate; }
253
254 std::vector<PositionInfo> first() const override = 0;
255 std::vector<PositionInfo> last() const override = 0;
256 bool empty() const override { return false; } /* always 1 codepoint wide */
257
258 void notePositions(GlushkovBuildState &bs) override = 0;
259 void buildFollowSet(GlushkovBuildState &bs,
260 const std::vector<PositionInfo> &) override = 0;
261
262protected:
263 bool m_negate;
264 const ParseMode mode;
265 bool in_cand_range;
266 unichar range_start;
267 bool finalized;
268
269 virtual void createRange(unichar) = 0;
270
271 // Protected copy ctor. Use clone instead.
272 ComponentClass(const ComponentClass &other)
273 : Component(other), m_negate(other.m_negate), mode(other.mode),
274 in_cand_range(other.in_cand_range), range_start(other.range_start),
275 finalized(other.finalized) {}
276};
277
278} // namespace ue2
279
280#endif // COMPONENTCLASS_H
281