| 1 | /* |
| 2 | * Copyright (c) 2015, Intel Corporation |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions are met: |
| 6 | * |
| 7 | * * Redistributions of source code must retain the above copyright notice, |
| 8 | * this list of conditions and the following disclaimer. |
| 9 | * * Redistributions in binary form must reproduce the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer in the |
| 11 | * documentation and/or other materials provided with the distribution. |
| 12 | * * Neither the name of Intel Corporation nor the names of its contributors |
| 13 | * may be used to endorse or promote products derived from this software |
| 14 | * without specific prior written permission. |
| 15 | * |
| 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 26 | * POSSIBILITY OF SUCH DAMAGE. |
| 27 | */ |
| 28 | |
| 29 | /** \file |
| 30 | * \brief Character classes and their mnemonics. |
| 31 | */ |
| 32 | |
| 33 | #ifndef COMPONENTCLASS_H |
| 34 | #define COMPONENTCLASS_H |
| 35 | |
#include <memory>
#include <string>
#include <utility>
#include <vector>
| 39 | |
| 40 | #include "Component.h" |
| 41 | #include "Parser.h" |
| 42 | #include "util/charreach.h" |
| 43 | #include "util/unicode_def.h" |
| 44 | #include "ue2common.h" |
| 45 | |
| 46 | namespace ue2 { |
| 47 | |
/** \brief Identifiers for the predefined character classes.
 *
 * Values of this type are accepted by getPredefinedCharReach(),
 * generateComponent() and ComponentClass::add().
 *
 * No enumerator has an explicit value, so each one's numeric value is simply
 * its position in this list (starting from zero). Keep the order stable if
 * anything depends on those values.
 */
enum PredefinedClass {
    /* --- CLASS_<name> entries --- */
    CLASS_ALNUM,
    CLASS_ALPHA,
    CLASS_ANY,    /* the dot; not quite "any" unless dotall mode is on */
    CLASS_ASCII,
    CLASS_BLANK,
    CLASS_CNTRL,
    CLASS_DIGIT,
    CLASS_GRAPH,
    CLASS_HORZ,
    CLASS_LOWER,
    CLASS_PRINT,
    CLASS_PUNCT,
    CLASS_SPACE,  /* includes vertical tab */
    CLASS_UPPER,
    CLASS_VERT,
    CLASS_WORD,
    CLASS_XDIGIT,
    CLASS_XGRAPH, /* meaning of [:graph:] in UCP mode */
    CLASS_XPRINT, /* meaning of [:print:] in UCP mode */
    CLASS_XPUNCT, /* meaning of [:punct:] in UCP mode */

    /* --- CLASS_UCP_<name> entries --- */
    CLASS_UCP_C,
    CLASS_UCP_CC,
    CLASS_UCP_CF,
    CLASS_UCP_CN, /* code points that are unallocated */
    CLASS_UCP_CO,
    CLASS_UCP_CS, /* contains no valid unicode codepoints */
    CLASS_UCP_L,
    CLASS_UCP_LL,
    CLASS_UCP_LM,
    CLASS_UCP_LO,
    CLASS_UCP_LT,
    CLASS_UCP_LU,
    CLASS_UCP_L_AND, /* "L&": union of LL, LU and LT */
    CLASS_UCP_M,
    CLASS_UCP_MC,
    CLASS_UCP_ME,
    CLASS_UCP_MN,
    CLASS_UCP_N,
    CLASS_UCP_ND,
    CLASS_UCP_NL,
    CLASS_UCP_NO,
    CLASS_UCP_P,
    CLASS_UCP_PC,
    CLASS_UCP_PD,
    CLASS_UCP_PE,
    CLASS_UCP_PF,
    CLASS_UCP_PI,
    CLASS_UCP_PO,
    CLASS_UCP_PS,
    CLASS_UCP_S,
    CLASS_UCP_SC,
    CLASS_UCP_SK,
    CLASS_UCP_SM,
    CLASS_UCP_SO,
    CLASS_UCP_Z,
    CLASS_UCP_ZL,
    CLASS_UCP_ZP,
    CLASS_UCP_ZS,
    CLASS_UCP_XAN,
    CLASS_UCP_XPS, /* cf. CLASS_SPACE */
    CLASS_UCP_XSP,
    CLASS_UCP_XWD,

    /* --- CLASS_SCRIPT_<name> entries --- */
    CLASS_SCRIPT_ARABIC,
    CLASS_SCRIPT_ARMENIAN,
    CLASS_SCRIPT_AVESTAN,
    CLASS_SCRIPT_BALINESE,
    CLASS_SCRIPT_BAMUM,
    CLASS_SCRIPT_BATAK,
    CLASS_SCRIPT_BENGALI,
    CLASS_SCRIPT_BOPOMOFO,
    CLASS_SCRIPT_BRAHMI,
    CLASS_SCRIPT_BRAILLE,
    CLASS_SCRIPT_BUGINESE,
    CLASS_SCRIPT_BUHID,
    CLASS_SCRIPT_CANADIAN_ABORIGINAL,
    CLASS_SCRIPT_CARIAN,
    CLASS_SCRIPT_CHAM,
    CLASS_SCRIPT_CHEROKEE,
    CLASS_SCRIPT_COMMON,
    CLASS_SCRIPT_COPTIC,
    CLASS_SCRIPT_CUNEIFORM,
    CLASS_SCRIPT_CYPRIOT,
    CLASS_SCRIPT_CYRILLIC,
    CLASS_SCRIPT_DESERET,
    CLASS_SCRIPT_DEVANAGARI,
    CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS,
    CLASS_SCRIPT_ETHIOPIC,
    CLASS_SCRIPT_GEORGIAN,
    CLASS_SCRIPT_GLAGOLITIC,
    CLASS_SCRIPT_GOTHIC,
    CLASS_SCRIPT_GREEK,
    CLASS_SCRIPT_GUJARATI,
    CLASS_SCRIPT_GURMUKHI,
    CLASS_SCRIPT_HAN,
    CLASS_SCRIPT_HANGUL,
    CLASS_SCRIPT_HANUNOO,
    CLASS_SCRIPT_HEBREW,
    CLASS_SCRIPT_HIRAGANA,
    CLASS_SCRIPT_IMPERIAL_ARAMAIC,
    CLASS_SCRIPT_INHERITED,
    CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI,
    CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN,
    CLASS_SCRIPT_JAVANESE,
    CLASS_SCRIPT_KAITHI,
    CLASS_SCRIPT_KANNADA,
    CLASS_SCRIPT_KATAKANA,
    CLASS_SCRIPT_KAYAH_LI,
    CLASS_SCRIPT_KHAROSHTHI,
    CLASS_SCRIPT_KHMER,
    CLASS_SCRIPT_LAO,
    CLASS_SCRIPT_LATIN,
    CLASS_SCRIPT_LEPCHA,
    CLASS_SCRIPT_LIMBU,
    CLASS_SCRIPT_LINEAR_B,
    CLASS_SCRIPT_LISU,
    CLASS_SCRIPT_LYCIAN,
    CLASS_SCRIPT_LYDIAN,
    CLASS_SCRIPT_MALAYALAM,
    CLASS_SCRIPT_MANDAIC,
    CLASS_SCRIPT_MEETEI_MAYEK,
    CLASS_SCRIPT_MONGOLIAN,
    CLASS_SCRIPT_MYANMAR,
    CLASS_SCRIPT_NEW_TAI_LUE,
    CLASS_SCRIPT_NKO,
    CLASS_SCRIPT_OGHAM,
    CLASS_SCRIPT_OL_CHIKI,
    CLASS_SCRIPT_OLD_ITALIC,
    CLASS_SCRIPT_OLD_PERSIAN,
    CLASS_SCRIPT_OLD_SOUTH_ARABIAN,
    CLASS_SCRIPT_OLD_TURKIC,
    CLASS_SCRIPT_ORIYA,
    CLASS_SCRIPT_OSMANYA,
    CLASS_SCRIPT_PHAGS_PA,
    CLASS_SCRIPT_PHOENICIAN,
    CLASS_SCRIPT_REJANG,
    CLASS_SCRIPT_RUNIC,
    CLASS_SCRIPT_SAMARITAN,
    CLASS_SCRIPT_SAURASHTRA,
    CLASS_SCRIPT_SHAVIAN,
    CLASS_SCRIPT_SINHALA,
    CLASS_SCRIPT_SUNDANESE,
    CLASS_SCRIPT_SYLOTI_NAGRI,
    CLASS_SCRIPT_SYRIAC,
    CLASS_SCRIPT_TAGALOG,
    CLASS_SCRIPT_TAGBANWA,
    CLASS_SCRIPT_TAI_LE,
    CLASS_SCRIPT_TAI_THAM,
    CLASS_SCRIPT_TAI_VIET,
    CLASS_SCRIPT_TAMIL,
    CLASS_SCRIPT_TELUGU,
    CLASS_SCRIPT_THAANA,
    CLASS_SCRIPT_THAI,
    CLASS_SCRIPT_TIBETAN,
    CLASS_SCRIPT_TIFINAGH,
    CLASS_SCRIPT_UGARITIC,
    CLASS_SCRIPT_VAI,
    CLASS_SCRIPT_YI,

    /* --- final entry --- */
    CLASS_UCP_ANY
};
| 208 | |
| 209 | CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode); |
| 210 | |
/* Forward declarations. */
class NFABuilder;     /* not used by the declarations visible in this header */
class ComponentClass; /* needed by the factory prototypes ahead of the class */
| 213 | |
| 214 | /* Caller is responsible for lifecycle management, class finalized */ |
| 215 | std::unique_ptr<ComponentClass> |
| 216 | generateComponent(PredefinedClass c, bool negated, const ParseMode &mode); |
| 217 | |
| 218 | /* Caller is responsible for lifecycle management, class open */ |
| 219 | std::unique_ptr<ComponentClass> getComponentClass(const ParseMode &mode); |
| 220 | |
| 221 | /** Common case: generate a component for a single literal character, possibly |
| 222 | * in caseless mode. Caller is responsible for lifecycle management. */ |
| 223 | std::unique_ptr<ComponentClass> getLiteralComponentClass(unsigned char c, |
| 224 | bool nocase); |
| 225 | |
| 226 | class ComponentClass : public Component { |
| 227 | friend class DumpVisitor; |
| 228 | protected: |
| 229 | explicit ComponentClass(const ParseMode &mode_in); |
| 230 | public: |
| 231 | ~ComponentClass() override; |
| 232 | ComponentClass *clone() const override = 0; |
| 233 | |
| 234 | Component *accept(ComponentVisitor &v) override = 0; |
| 235 | void accept(ConstComponentVisitor &v) const override = 0; |
| 236 | |
| 237 | /** \brief True if the class contains no members (i.e. it will not match |
| 238 | * against anything). This function can only be called on a finalized |
| 239 | * class. |
| 240 | * |
| 241 | * Note: This is a different concept to Component::empty. |
| 242 | */ |
| 243 | virtual bool class_empty(void) const = 0; |
| 244 | |
| 245 | virtual void add(PredefinedClass c, bool negated) = 0; |
| 246 | virtual void add(unichar c) = 0; /* may throw LocatedParseError */ |
| 247 | void addDash(void); |
| 248 | |
| 249 | void negate(void); |
| 250 | virtual void finalize(void) = 0; |
| 251 | |
| 252 | bool isNegated() const { return m_negate; } |
| 253 | |
| 254 | std::vector<PositionInfo> first() const override = 0; |
| 255 | std::vector<PositionInfo> last() const override = 0; |
| 256 | bool empty() const override { return false; } /* always 1 codepoint wide */ |
| 257 | |
| 258 | void notePositions(GlushkovBuildState &bs) override = 0; |
| 259 | void buildFollowSet(GlushkovBuildState &bs, |
| 260 | const std::vector<PositionInfo> &) override = 0; |
| 261 | |
| 262 | protected: |
| 263 | bool m_negate; |
| 264 | const ParseMode mode; |
| 265 | bool in_cand_range; |
| 266 | unichar range_start; |
| 267 | bool finalized; |
| 268 | |
| 269 | virtual void createRange(unichar) = 0; |
| 270 | |
| 271 | // Protected copy ctor. Use clone instead. |
| 272 | ComponentClass(const ComponentClass &other) |
| 273 | : Component(other), m_negate(other.m_negate), mode(other.mode), |
| 274 | in_cand_range(other.in_cand_range), range_start(other.range_start), |
| 275 | finalized(other.finalized) {} |
| 276 | }; |
| 277 | |
| 278 | } // namespace ue2 |
| 279 | |
| 280 | #endif // COMPONENTCLASS_H |
| 281 | |