1 | /* |
2 | * Copyright (c) 2015, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Character class in UTF-8 mode. |
31 | */ |
32 | |
33 | #ifndef UTF8_COMPONENT_CLASS_H |
34 | #define UTF8_COMPONENT_CLASS_H |
35 | |
36 | #include "ComponentClass.h" |
37 | #include "ue2common.h" |
38 | #include "util/unicode_set.h" |
39 | |
40 | #include <map> |
41 | #include <set> |
42 | #include <string> |
43 | #include <vector> |
44 | |
45 | namespace ue2 { |
46 | |
47 | class UTF8ComponentClass : public ComponentClass { |
48 | friend class DumpVisitor; |
49 | friend class PrintVisitor; |
50 | friend class CaselessVisitor; |
51 | friend class SimplifyVisitor; |
52 | friend class SimplifyCandidatesVisitor; |
53 | public: |
54 | explicit UTF8ComponentClass(const ParseMode &mode); |
55 | ~UTF8ComponentClass() override {} |
56 | UTF8ComponentClass *clone() const override; |
57 | |
58 | Component *accept(ComponentVisitor &v) override { |
59 | Component *c = v.visit(this); |
60 | v.post(this); |
61 | return c; |
62 | } |
63 | |
64 | void accept(ConstComponentVisitor &v) const override { |
65 | v.pre(*this); |
66 | v.during(*this); |
67 | v.post(*this); |
68 | } |
69 | |
70 | bool class_empty(void) const override; |
71 | void add(PredefinedClass c, bool negative) override; |
72 | void add(unichar c) override; |
73 | void finalize(void) override; |
74 | void notePositions(GlushkovBuildState &bs) override; |
75 | void buildFollowSet(GlushkovBuildState &bs, |
76 | const std::vector<PositionInfo> &) override; |
77 | std::vector<PositionInfo> first(void) const override; |
78 | std::vector<PositionInfo> last(void) const override; |
79 | |
80 | protected: |
81 | void createRange(unichar to) override; |
82 | |
83 | private: |
84 | Position (NFABuilder &builder, u8 first_byte); |
85 | void addToTail(GlushkovBuildState &bs, std::map<Position, Position> &finals, |
86 | Position prev, unichar b, unichar e); |
87 | void ensureDotTrailer(GlushkovBuildState &bs); |
88 | void ensureTwoDotTrailer(GlushkovBuildState &bs); |
89 | void ensureThreeDotTrailer(GlushkovBuildState &bs); |
90 | void buildOneByte(GlushkovBuildState &bs); |
91 | void buildTwoByte(GlushkovBuildState &bs); |
92 | void buildThreeByte(GlushkovBuildState &bs); |
93 | void buildFourByte(GlushkovBuildState &bs); |
94 | |
95 | CodePointSet cps; |
96 | |
97 | std::map<u8, Position> heads; |
98 | Position single_pos; |
99 | Position one_dot_trailer; |
100 | Position two_dot_trailer; |
101 | Position three_dot_trailer; |
102 | |
103 | Position two_char_dot_head; |
104 | Position three_char_dot_head; |
105 | Position four_char_dot_head; |
106 | std::set<Position> tails; |
107 | }; |
108 | |
109 | PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode); |
110 | |
111 | CodePointSet getPredefinedCodePointSet(PredefinedClass c, |
112 | const ParseMode &mode); |
113 | |
} // namespace ue2
115 | |
116 | #endif // UTF8_COMPONENT_CLASS_H |
117 | |