1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Literal Component Splitting. Identifies literals that span the
31 * graph and moves them into Rose.
32 */
33
34#include "ng_literal_component.h"
35
36#include "grey.h"
37#include "ng.h"
38#include "ng_prune.h"
39#include "ng_util.h"
40#include "ue2common.h"
41#include "compiler/compiler.h"
42#include "rose/rose_build.h"
43#include "util/container.h"
44#include "util/graph.h"
45#include "util/graph_range.h"
46#include "util/ue2string.h"
47
48#include <unordered_set>
49
50using namespace std;
51
52namespace ue2 {
53
54static
55bool isLiteralChar(const NGHolder &g, NFAVertex v, bool &nocase,
56 bool &casefixed) {
57 const CharReach &cr = g[v].char_reach;
58 const size_t num = cr.count();
59 if (num > 2) {
60 return false; // char class
61 }
62
63 if (!casefixed) {
64 if (num == 2 && cr.isCaselessChar()) {
65 nocase = true;
66 casefixed = true;
67 return true;
68 } else if (num == 1) {
69 if (cr.isAlpha()) {
70 nocase = false;
71 casefixed = true;
72 }
73 // otherwise, still acceptable but we can't fix caselessness yet
74 return true;
75 }
76 } else {
77 // nocase property is fixed
78 if (nocase) {
79 if ((num == 2 && cr.isCaselessChar()) ||
80 (num == 1 && !cr.isAlpha())) {
81 return true;
82 }
83 } else {
84 return (num == 1);
85 }
86 }
87
88 return false;
89}
90
91static
92void addToString(string &s, const NGHolder &g, NFAVertex v) {
93 const CharReach &cr = g[v].char_reach;
94 assert(cr.count() == 1 || cr.isCaselessChar());
95
96 char c = (char)cr.find_first();
97 s.push_back(c);
98}
99
100static
101bool splitOffLiteral(NG &ng, NGHolder &g, NFAVertex v, const bool anchored,
102 set<NFAVertex> &dead) {
103 DEBUG_PRINTF("examine vertex %zu\n", g[v].index);
104 bool nocase = false, casefixed = false;
105
106 assert(!is_special(v, g));
107
108 size_t reqInDegree;
109 if (anchored) {
110 reqInDegree = 1;
111 assert(edge(g.start, v, g).second);
112 } else {
113 reqInDegree = 2;
114 assert(edge(g.start, v, g).second);
115 assert(edge(g.startDs, v, g).second);
116 }
117 if (in_degree(v, g) > reqInDegree) {
118 DEBUG_PRINTF("extra in-edges\n");
119 return false;
120 }
121
122 if (!isLiteralChar(g, v, nocase, casefixed)) {
123 DEBUG_PRINTF("not literal\n");
124 return false;
125 }
126
127 string literal;
128 addToString(literal, g, v);
129
130 // Remaining vertices must come in a chain, each with one in-edge and one
131 // out-edge only.
132 NFAVertex u;
133 while (1) {
134 if (out_degree(v, g) != 1) {
135 DEBUG_PRINTF("branches, not literal\n");
136 return false;
137 }
138
139 u = v; // previous vertex
140 v = *(adjacent_vertices(v, g).first);
141
142 DEBUG_PRINTF("loop, v=%zu\n", g[v].index);
143
144 if (is_special(v, g)) {
145 if (v == g.accept || v == g.acceptEod) {
146 break; // OK
147 } else {
148 assert(0); // start?
149 return false;
150 }
151 } else {
152 // Ordinary, must be literal
153 if (!isLiteralChar(g, v, nocase, casefixed)) {
154 DEBUG_PRINTF("not literal\n");
155 return false;
156 }
157 if (in_degree(v, g) != 1) {
158 DEBUG_PRINTF("branches, not literal\n");
159 return false;
160 }
161 }
162
163 addToString(literal, g, v);
164 }
165
166 // Successfully found a literal; there might be multiple report IDs, in
167 // which case we add all the reports.
168 assert(!is_special(u, g));
169 bool eod = v == g.acceptEod;
170 assert(eod || v == g.accept);
171
172 DEBUG_PRINTF("success: found %s literal '%s'\n",
173 anchored ? "anchored" : "unanchored",
174 escapeString(literal).c_str());
175
176 // Literals of length 1 are better served going through later optimisation
177 // passes, where they might be combined together into a character class.
178 if (literal.length() == 1) {
179 DEBUG_PRINTF("skipping literal of length 1\n");
180 return false;
181 }
182
183 ng.rose->add(anchored, eod, ue2_literal(literal, nocase), g[u].reports);
184
185 // Remove the terminal vertex. Later, we rely on pruneUseless to remove the
186 // other vertices in this chain, since they'll no longer lead to an accept.
187 dead.insert(u);
188
189 return true;
190}
191
192/** \brief Split off literals. True if any changes were made to the graph. */
193bool splitOffLiterals(NG &ng, NGHolder &g) {
194 if (!ng.cc.grey.allowLiteral) {
195 return false;
196 }
197
198 bool changed = false;
199 set<NFAVertex> dead;
200
201 unordered_set<NFAVertex> unanchored; // for faster lookup.
202 insert(&unanchored, adjacent_vertices(g.startDs, g));
203
204 // Anchored literals.
205 for (auto v : adjacent_vertices_range(g.start, g)) {
206 if (!is_special(v, g) && !contains(unanchored, v)) {
207 changed |= splitOffLiteral(ng, g, v, true, dead);
208 }
209 }
210
211 // Unanchored literals.
212 for (auto v : adjacent_vertices_range(g.startDs, g)) {
213 if (!is_special(v, g)) {
214 changed |= splitOffLiteral(ng, g, v, false, dead);
215 }
216 }
217
218 if (changed) {
219 remove_vertices(dead, g);
220 pruneUseless(g);
221 return true;
222 }
223
224 return false;
225}
226
227} // namespace ue2
228