1 | /* |
2 | * Copyright (c) 2015, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Character classes and their mnemonics. |
31 | */ |
32 | #include "AsciiComponentClass.h" |
33 | #include "Utf8ComponentClass.h" |
34 | #include "buildstate.h" |
35 | #include "parse_error.h" |
36 | #include "position.h" |
37 | #include "position_info.h" |
38 | #include "nfagraph/ng_builder.h" |
39 | #include "util/charreach_util.h" |
40 | |
41 | using namespace std; |
42 | |
43 | namespace ue2 { |
44 | |
45 | AsciiComponentClass::AsciiComponentClass(const ParseMode &mode_in) |
46 | : ComponentClass(mode_in), position(GlushkovBuildState::POS_UNINITIALIZED) { |
47 | assert(!mode.utf8); |
48 | } |
49 | |
50 | AsciiComponentClass *AsciiComponentClass::clone() const { |
51 | return new AsciiComponentClass(*this); |
52 | } |
53 | |
54 | bool AsciiComponentClass::class_empty(void) const { |
55 | assert(finalized); |
56 | return cr.none(); |
57 | } |
58 | |
59 | void AsciiComponentClass::createRange(unichar to) { |
60 | assert(range_start <= 0xff); |
61 | unsigned char from = (u8)range_start; |
62 | if (from > to) { |
63 | throw LocatedParseError("Range out of order in character class" ); |
64 | } |
65 | |
66 | in_cand_range = false; |
67 | CharReach ncr(from, to); |
68 | if (mode.caseless) { |
69 | make_caseless(&ncr); |
70 | } |
71 | cr |= ncr; |
72 | range_start = INVALID_UNICODE; |
73 | } |
74 | |
75 | void AsciiComponentClass::notePositions(GlushkovBuildState &bs) { |
76 | // We should always be finalized by now. |
77 | assert(finalized); |
78 | |
79 | NFABuilder &builder = bs.getBuilder(); |
80 | position = builder.makePositions(1); |
81 | |
82 | builder.addCharReach(position, cr); |
83 | builder.setNodeReportID(position, 0 /* offset adj */); |
84 | recordPosBounds(position, position + 1); |
85 | } |
86 | |
87 | void AsciiComponentClass::buildFollowSet(GlushkovBuildState &, |
88 | const vector<PositionInfo> &) { |
89 | // all follow set construction is handled by firsts/lasts |
90 | } |
91 | |
92 | void AsciiComponentClass::add(PredefinedClass c, bool negative) { |
93 | if (in_cand_range) { // can't form a range here |
94 | throw LocatedParseError("Invalid range in character class" ); |
95 | } |
96 | DEBUG_PRINTF("getting %u %s\n" , (u32)c, negative ? "^" : "" ); |
97 | |
98 | if (mode.ucp) { |
99 | c = translateForUcpMode(c, mode); |
100 | } |
101 | |
102 | // Note: caselessness is handled by getPredefinedCharReach. |
103 | CharReach pcr = getPredefinedCharReach(c, mode); |
104 | if (negative) { |
105 | pcr.flip(); |
106 | } |
107 | |
108 | cr |= pcr; |
109 | range_start = INVALID_UNICODE; |
110 | in_cand_range = false; |
111 | } |
112 | |
113 | void AsciiComponentClass::add(unichar c) { |
114 | DEBUG_PRINTF("adding \\x%02x\n" , c); |
115 | if (c > 0xff) { // too big! |
116 | throw LocatedParseError("Hexadecimal value is greater than \\xFF" ); |
117 | } |
118 | |
119 | if (in_cand_range) { |
120 | createRange(c); |
121 | return; |
122 | } |
123 | |
124 | CharReach ncr(c, c); |
125 | if (mode.caseless) { |
126 | make_caseless(&ncr); |
127 | } |
128 | |
129 | cr |= ncr; |
130 | range_start = c; |
131 | } |
132 | |
133 | void AsciiComponentClass::finalize() { |
134 | if (finalized) { |
135 | return; |
136 | } |
137 | |
138 | // Handle unclosed ranges, like '[a-]' and '[a-\Q\E]' -- in these cases the |
139 | // dash is a literal dash. |
140 | if (in_cand_range) { |
141 | cr.set('-'); |
142 | in_cand_range = false; |
143 | } |
144 | |
145 | if (m_negate) { |
146 | cr.flip(); |
147 | } |
148 | |
149 | finalized = true; |
150 | } |
151 | |
152 | vector<PositionInfo> AsciiComponentClass::first(void) const { |
153 | return vector<PositionInfo>(1, PositionInfo(position)); |
154 | } |
155 | |
156 | vector<PositionInfo> AsciiComponentClass::last(void) const { |
157 | return vector<PositionInfo>(1, PositionInfo(position)); |
158 | } |
159 | |
160 | } // namespace ue2 |
161 | |