1 | // Copyright 2008 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Tested by search_test.cc, exhaustive_test.cc, tester.cc |
6 | // |
7 | // Prog::UnsafeSearchBacktrack is a backtracking regular expression search, |
8 | // except that it remembers where it has been, trading a lot of |
9 | // memory for a lot of time. It exists only for testing purposes. |
10 | // |
11 | // Let me repeat that. |
12 | // |
13 | // THIS CODE SHOULD NEVER BE USED IN PRODUCTION: |
14 | // - It uses a ton of memory. |
15 | // - It uses a ton of stack. |
16 | // - It uses CHECK and LOG(FATAL). |
17 | // - It implements unanchored search by repeated anchored search. |
18 | // |
19 | // On the other hand, it is very simple and a good reference |
20 | // implementation for the more complicated regexp packages. |
21 | // |
22 | // In BUILD, this file is linked into the ":testing" library, |
23 | // not the main library, in order to make it harder to pick up |
24 | // accidentally. |
25 | |
26 | #include <stddef.h> |
27 | #include <stdint.h> |
28 | #include <string.h> |
29 | |
30 | #include "util/util.h" |
31 | #include "util/logging.h" |
32 | #include "re2/prog.h" |
33 | #include "re2/regexp.h" |
34 | |
35 | namespace re2 { |
36 | |
37 | // Backtracker holds the state for a backtracking search. |
38 | // |
39 | // Excluding the search parameters, the main search state |
40 | // is just the "capture registers", which record, for the |
41 | // current execution, the string position at which each |
42 | // parenthesis was passed. cap_[0] and cap_[1] are the |
43 | // left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc. |
44 | // |
45 | // To avoid infinite loops during backtracking on expressions |
46 | // like (a*)*, the visited_[] bitmap marks the (state, string-position) |
47 | // pairs that have already been explored and are thus not worth |
48 | // re-exploring if we get there via another path. Modern backtracking |
49 | // libraries engineer their program representation differently, to make |
50 | // such infinite loops possible to avoid without keeping a giant visited_ |
51 | // bitmap, but visited_ works fine for a reference implementation |
52 | // and it has the nice benefit of making the search run in linear time. |
53 | class Backtracker { |
54 | public: |
55 | explicit Backtracker(Prog* prog); |
56 | ~Backtracker(); |
57 | |
58 | bool Search(const StringPiece& text, const StringPiece& context, |
59 | bool anchored, bool longest, |
60 | StringPiece* submatch, int nsubmatch); |
61 | |
62 | private: |
63 | // Explores from instruction id at string position p looking for a match. |
64 | // Returns true if found (so that caller can stop trying other possibilities). |
65 | bool Visit(int id, const char* p); |
66 | |
67 | // Tries instruction id at string position p. |
68 | // Returns true if a match is found. |
69 | bool Try(int id, const char* p); |
70 | |
71 | // Search parameters |
72 | Prog* prog_; // program being run |
73 | StringPiece text_; // text being searched |
74 | StringPiece context_; // greater context of text being searched |
75 | bool anchored_; // whether search is anchored at text.begin() |
76 | bool longest_; // whether search wants leftmost-longest match |
77 | bool endmatch_; // whether search must end at text.end() |
78 | StringPiece *submatch_; // submatches to fill in |
79 | int nsubmatch_; // # of submatches to fill in |
80 | |
81 | // Search state |
82 | const char* cap_[64]; // capture registers |
83 | uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked |
84 | size_t nvisited_; // # of words in bitmap |
85 | }; |
86 | |
87 | Backtracker::Backtracker(Prog* prog) |
88 | : prog_(prog), |
89 | anchored_(false), |
90 | longest_(false), |
91 | endmatch_(false), |
92 | submatch_(NULL), |
93 | nsubmatch_(0), |
94 | visited_(NULL), |
95 | nvisited_(0) { |
96 | } |
97 | |
98 | Backtracker::~Backtracker() { |
99 | delete[] visited_; |
100 | } |
101 | |
102 | // Runs a backtracking search. |
103 | bool Backtracker::Search(const StringPiece& text, const StringPiece& context, |
104 | bool anchored, bool longest, |
105 | StringPiece* submatch, int nsubmatch) { |
106 | text_ = text; |
107 | context_ = context; |
108 | if (context_.data() == NULL) |
109 | context_ = text; |
110 | if (prog_->anchor_start() && text.begin() > context_.begin()) |
111 | return false; |
112 | if (prog_->anchor_end() && text.end() < context_.end()) |
113 | return false; |
114 | anchored_ = anchored | prog_->anchor_start(); |
115 | longest_ = longest | prog_->anchor_end(); |
116 | endmatch_ = prog_->anchor_end(); |
117 | submatch_ = submatch; |
118 | nsubmatch_ = nsubmatch; |
119 | CHECK_LT(2*nsubmatch_, static_cast<int>(arraysize(cap_))); |
120 | memset(cap_, 0, sizeof cap_); |
121 | |
122 | // We use submatch_[0] for our own bookkeeping, |
123 | // so it had better exist. |
124 | StringPiece sp0; |
125 | if (nsubmatch < 1) { |
126 | submatch_ = &sp0; |
127 | nsubmatch_ = 1; |
128 | } |
129 | submatch_[0] = StringPiece(); |
130 | |
131 | // Allocate new visited_ bitmap -- size is proportional |
132 | // to text, so have to reallocate on each call to Search. |
133 | delete[] visited_; |
134 | nvisited_ = (prog_->size()*(text.size()+1) + 31)/32; |
135 | visited_ = new uint32_t[nvisited_]; |
136 | memset(visited_, 0, nvisited_*sizeof visited_[0]); |
137 | |
138 | // Anchored search must start at text.begin(). |
139 | if (anchored_) { |
140 | cap_[0] = text.data(); |
141 | return Visit(prog_->start(), text.data()); |
142 | } |
143 | |
144 | // Unanchored search, starting from each possible text position. |
145 | // Notice that we have to try the empty string at the end of |
146 | // the text, so the loop condition is p <= text.end(), not p < text.end(). |
147 | for (const char* p = text.data(); p <= text.data() + text.size(); p++) { |
148 | cap_[0] = p; |
149 | if (Visit(prog_->start(), p)) // Match must be leftmost; done. |
150 | return true; |
151 | // Avoid invoking undefined behavior (arithmetic on a null pointer) |
152 | // by simply not continuing the loop. |
153 | if (p == NULL) |
154 | break; |
155 | } |
156 | return false; |
157 | } |
158 | |
159 | // Explores from instruction id at string position p looking for a match. |
160 | // Return true if found (so that caller can stop trying other possibilities). |
161 | bool Backtracker::Visit(int id, const char* p) { |
162 | // Check bitmap. If we've already explored from here, |
163 | // either it didn't match or it did but we're hoping for a better match. |
164 | // Either way, don't go down that road again. |
165 | CHECK(p <= text_.data() + text_.size()); |
166 | size_t n = id*(text_.size()+1) + (p - text_.data()); |
167 | CHECK_LT(n/32, nvisited_); |
168 | if (visited_[n/32] & (1 << (n&31))) |
169 | return false; |
170 | visited_[n/32] |= 1 << (n&31); |
171 | |
172 | Prog::Inst* ip = prog_->inst(id); |
173 | if (Try(id, p)) { |
174 | if (longest_ && !ip->last()) |
175 | Visit(id+1, p); |
176 | return true; |
177 | } |
178 | if (!ip->last()) |
179 | return Visit(id+1, p); |
180 | return false; |
181 | } |
182 | |
183 | // Tries instruction id at string position p. |
184 | // Returns true if a match is found. |
185 | bool Backtracker::Try(int id, const char* p) { |
186 | // Pick out byte at current position. If at end of string, |
187 | // have to explore in hope of finishing a match. Use impossible byte -1. |
188 | int c = -1; |
189 | if (p < text_.data() + text_.size()) |
190 | c = *p & 0xFF; |
191 | |
192 | Prog::Inst* ip = prog_->inst(id); |
193 | switch (ip->opcode()) { |
194 | default: |
195 | LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode(); |
196 | return false; // not reached |
197 | |
198 | case kInstAltMatch: |
199 | // Ignored. |
200 | return false; |
201 | |
202 | case kInstByteRange: |
203 | if (ip->Matches(c)) |
204 | return Visit(ip->out(), p+1); |
205 | return false; |
206 | |
207 | case kInstCapture: |
208 | if (0 <= ip->cap() && |
209 | ip->cap() < static_cast<int>(arraysize(cap_))) { |
210 | // Capture p to register, but save old value. |
211 | const char* q = cap_[ip->cap()]; |
212 | cap_[ip->cap()] = p; |
213 | bool ret = Visit(ip->out(), p); |
214 | // Restore old value as we backtrack. |
215 | cap_[ip->cap()] = q; |
216 | return ret; |
217 | } |
218 | return Visit(ip->out(), p); |
219 | |
220 | case kInstEmptyWidth: |
221 | if (ip->empty() & ~Prog::EmptyFlags(context_, p)) |
222 | return false; |
223 | return Visit(ip->out(), p); |
224 | |
225 | case kInstNop: |
226 | return Visit(ip->out(), p); |
227 | |
228 | case kInstMatch: |
229 | // We found a match. If it's the best so far, record the |
230 | // parameters in the caller's submatch_ array. |
231 | if (endmatch_ && p != context_.data() + context_.size()) |
232 | return false; |
233 | cap_[1] = p; |
234 | if (submatch_[0].data() == NULL || |
235 | (longest_ && p > submatch_[0].data() + submatch_[0].size())) { |
236 | // First match so far - or better match. |
237 | for (int i = 0; i < nsubmatch_; i++) |
238 | submatch_[i] = StringPiece( |
239 | cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i])); |
240 | } |
241 | return true; |
242 | |
243 | case kInstFail: |
244 | return false; |
245 | } |
246 | } |
247 | |
248 | // Runs a backtracking search. |
249 | bool Prog::UnsafeSearchBacktrack(const StringPiece& text, |
250 | const StringPiece& context, |
251 | Anchor anchor, |
252 | MatchKind kind, |
253 | StringPiece* match, |
254 | int nmatch) { |
255 | // If full match, we ask for an anchored longest match |
256 | // and then check that match[0] == text. |
257 | // So make sure match[0] exists. |
258 | StringPiece sp0; |
259 | if (kind == kFullMatch) { |
260 | anchor = kAnchored; |
261 | if (nmatch < 1) { |
262 | match = &sp0; |
263 | nmatch = 1; |
264 | } |
265 | } |
266 | |
267 | // Run the search. |
268 | Backtracker b(this); |
269 | bool anchored = anchor == kAnchored; |
270 | bool longest = kind != kFirstMatch; |
271 | if (!b.Search(text, context, anchored, longest, match, nmatch)) |
272 | return false; |
273 | if (kind == kFullMatch && match[0].end() != text.end()) |
274 | return false; |
275 | return true; |
276 | } |
277 | |
278 | } // namespace re2 |
279 | |