1// Copyright 2008 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Tested by search_test.cc, exhaustive_test.cc, tester.cc
6//
7// Prog::UnsafeSearchBacktrack is a backtracking regular expression search,
8// except that it remembers where it has been, trading a lot of
9// memory for a lot of time. It exists only for testing purposes.
10//
11// Let me repeat that.
12//
13// THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
14// - It uses a ton of memory.
15// - It uses a ton of stack.
16// - It uses CHECK and LOG(FATAL).
17// - It implements unanchored search by repeated anchored search.
18//
19// On the other hand, it is very simple and a good reference
20// implementation for the more complicated regexp packages.
21//
22// In BUILD, this file is linked into the ":testing" library,
23// not the main library, in order to make it harder to pick up
24// accidentally.
25
26#include <stddef.h>
27#include <stdint.h>
28#include <string.h>
29
30#include "util/util.h"
31#include "util/logging.h"
32#include "re2/prog.h"
33#include "re2/regexp.h"
34
35namespace re2 {
36
37// Backtracker holds the state for a backtracking search.
38//
39// Excluding the search parameters, the main search state
40// is just the "capture registers", which record, for the
41// current execution, the string position at which each
42// parenthesis was passed. cap_[0] and cap_[1] are the
43// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
44//
45// To avoid infinite loops during backtracking on expressions
46// like (a*)*, the visited_[] bitmap marks the (state, string-position)
47// pairs that have already been explored and are thus not worth
48// re-exploring if we get there via another path. Modern backtracking
49// libraries engineer their program representation differently, to make
50// such infinite loops possible to avoid without keeping a giant visited_
51// bitmap, but visited_ works fine for a reference implementation
52// and it has the nice benefit of making the search run in linear time.
53class Backtracker {
54 public:
55 explicit Backtracker(Prog* prog);
56 ~Backtracker();
57
58 bool Search(const StringPiece& text, const StringPiece& context,
59 bool anchored, bool longest,
60 StringPiece* submatch, int nsubmatch);
61
62 private:
63 // Explores from instruction id at string position p looking for a match.
64 // Returns true if found (so that caller can stop trying other possibilities).
65 bool Visit(int id, const char* p);
66
67 // Tries instruction id at string position p.
68 // Returns true if a match is found.
69 bool Try(int id, const char* p);
70
71 // Search parameters
72 Prog* prog_; // program being run
73 StringPiece text_; // text being searched
74 StringPiece context_; // greater context of text being searched
75 bool anchored_; // whether search is anchored at text.begin()
76 bool longest_; // whether search wants leftmost-longest match
77 bool endmatch_; // whether search must end at text.end()
78 StringPiece *submatch_; // submatches to fill in
79 int nsubmatch_; // # of submatches to fill in
80
81 // Search state
82 const char* cap_[64]; // capture registers
83 uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked
84 size_t nvisited_; // # of words in bitmap
85};
86
87Backtracker::Backtracker(Prog* prog)
88 : prog_(prog),
89 anchored_(false),
90 longest_(false),
91 endmatch_(false),
92 submatch_(NULL),
93 nsubmatch_(0),
94 visited_(NULL),
95 nvisited_(0) {
96}
97
98Backtracker::~Backtracker() {
99 delete[] visited_;
100}
101
102// Runs a backtracking search.
103bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
104 bool anchored, bool longest,
105 StringPiece* submatch, int nsubmatch) {
106 text_ = text;
107 context_ = context;
108 if (context_.data() == NULL)
109 context_ = text;
110 if (prog_->anchor_start() && text.begin() > context_.begin())
111 return false;
112 if (prog_->anchor_end() && text.end() < context_.end())
113 return false;
114 anchored_ = anchored | prog_->anchor_start();
115 longest_ = longest | prog_->anchor_end();
116 endmatch_ = prog_->anchor_end();
117 submatch_ = submatch;
118 nsubmatch_ = nsubmatch;
119 CHECK_LT(2*nsubmatch_, static_cast<int>(arraysize(cap_)));
120 memset(cap_, 0, sizeof cap_);
121
122 // We use submatch_[0] for our own bookkeeping,
123 // so it had better exist.
124 StringPiece sp0;
125 if (nsubmatch < 1) {
126 submatch_ = &sp0;
127 nsubmatch_ = 1;
128 }
129 submatch_[0] = StringPiece();
130
131 // Allocate new visited_ bitmap -- size is proportional
132 // to text, so have to reallocate on each call to Search.
133 delete[] visited_;
134 nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
135 visited_ = new uint32_t[nvisited_];
136 memset(visited_, 0, nvisited_*sizeof visited_[0]);
137
138 // Anchored search must start at text.begin().
139 if (anchored_) {
140 cap_[0] = text.data();
141 return Visit(prog_->start(), text.data());
142 }
143
144 // Unanchored search, starting from each possible text position.
145 // Notice that we have to try the empty string at the end of
146 // the text, so the loop condition is p <= text.end(), not p < text.end().
147 for (const char* p = text.data(); p <= text.data() + text.size(); p++) {
148 cap_[0] = p;
149 if (Visit(prog_->start(), p)) // Match must be leftmost; done.
150 return true;
151 // Avoid invoking undefined behavior (arithmetic on a null pointer)
152 // by simply not continuing the loop.
153 if (p == NULL)
154 break;
155 }
156 return false;
157}
158
159// Explores from instruction id at string position p looking for a match.
160// Return true if found (so that caller can stop trying other possibilities).
161bool Backtracker::Visit(int id, const char* p) {
162 // Check bitmap. If we've already explored from here,
163 // either it didn't match or it did but we're hoping for a better match.
164 // Either way, don't go down that road again.
165 CHECK(p <= text_.data() + text_.size());
166 size_t n = id*(text_.size()+1) + (p - text_.data());
167 CHECK_LT(n/32, nvisited_);
168 if (visited_[n/32] & (1 << (n&31)))
169 return false;
170 visited_[n/32] |= 1 << (n&31);
171
172 Prog::Inst* ip = prog_->inst(id);
173 if (Try(id, p)) {
174 if (longest_ && !ip->last())
175 Visit(id+1, p);
176 return true;
177 }
178 if (!ip->last())
179 return Visit(id+1, p);
180 return false;
181}
182
183// Tries instruction id at string position p.
184// Returns true if a match is found.
185bool Backtracker::Try(int id, const char* p) {
186 // Pick out byte at current position. If at end of string,
187 // have to explore in hope of finishing a match. Use impossible byte -1.
188 int c = -1;
189 if (p < text_.data() + text_.size())
190 c = *p & 0xFF;
191
192 Prog::Inst* ip = prog_->inst(id);
193 switch (ip->opcode()) {
194 default:
195 LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
196 return false; // not reached
197
198 case kInstAltMatch:
199 // Ignored.
200 return false;
201
202 case kInstByteRange:
203 if (ip->Matches(c))
204 return Visit(ip->out(), p+1);
205 return false;
206
207 case kInstCapture:
208 if (0 <= ip->cap() &&
209 ip->cap() < static_cast<int>(arraysize(cap_))) {
210 // Capture p to register, but save old value.
211 const char* q = cap_[ip->cap()];
212 cap_[ip->cap()] = p;
213 bool ret = Visit(ip->out(), p);
214 // Restore old value as we backtrack.
215 cap_[ip->cap()] = q;
216 return ret;
217 }
218 return Visit(ip->out(), p);
219
220 case kInstEmptyWidth:
221 if (ip->empty() & ~Prog::EmptyFlags(context_, p))
222 return false;
223 return Visit(ip->out(), p);
224
225 case kInstNop:
226 return Visit(ip->out(), p);
227
228 case kInstMatch:
229 // We found a match. If it's the best so far, record the
230 // parameters in the caller's submatch_ array.
231 if (endmatch_ && p != context_.data() + context_.size())
232 return false;
233 cap_[1] = p;
234 if (submatch_[0].data() == NULL ||
235 (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
236 // First match so far - or better match.
237 for (int i = 0; i < nsubmatch_; i++)
238 submatch_[i] = StringPiece(
239 cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
240 }
241 return true;
242
243 case kInstFail:
244 return false;
245 }
246}
247
248// Runs a backtracking search.
249bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
250 const StringPiece& context,
251 Anchor anchor,
252 MatchKind kind,
253 StringPiece* match,
254 int nmatch) {
255 // If full match, we ask for an anchored longest match
256 // and then check that match[0] == text.
257 // So make sure match[0] exists.
258 StringPiece sp0;
259 if (kind == kFullMatch) {
260 anchor = kAnchored;
261 if (nmatch < 1) {
262 match = &sp0;
263 nmatch = 1;
264 }
265 }
266
267 // Run the search.
268 Backtracker b(this);
269 bool anchored = anchor == kAnchored;
270 bool longest = kind != kFirstMatch;
271 if (!b.Search(text, context, anchored, longest, match, nmatch))
272 return false;
273 if (kind == kFullMatch && match[0].end() != text.end())
274 return false;
275 return true;
276}
277
278} // namespace re2
279