1 | // Copyright 2007 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Compiled regular expression representation. |
6 | // Tested by compile_test.cc |
7 | |
8 | #include "re2/prog.h" |
9 | |
10 | #include <stdint.h> |
11 | #include <string.h> |
12 | #include <algorithm> |
13 | #include <memory> |
14 | #include <utility> |
15 | |
16 | #include "util/util.h" |
17 | #include "util/logging.h" |
18 | #include "util/strutil.h" |
19 | #include "re2/bitmap256.h" |
20 | #include "re2/stringpiece.h" |
21 | |
22 | namespace re2 { |
23 | |
24 | // Constructors per Inst opcode |
25 | |
26 | void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) { |
27 | DCHECK_EQ(out_opcode_, 0); |
28 | set_out_opcode(out, kInstAlt); |
29 | out1_ = out1; |
30 | } |
31 | |
32 | void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) { |
33 | DCHECK_EQ(out_opcode_, 0); |
34 | set_out_opcode(out, kInstByteRange); |
35 | lo_ = lo & 0xFF; |
36 | hi_ = hi & 0xFF; |
37 | hint_foldcase_ = foldcase&1; |
38 | } |
39 | |
40 | void Prog::Inst::InitCapture(int cap, uint32_t out) { |
41 | DCHECK_EQ(out_opcode_, 0); |
42 | set_out_opcode(out, kInstCapture); |
43 | cap_ = cap; |
44 | } |
45 | |
46 | void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) { |
47 | DCHECK_EQ(out_opcode_, 0); |
48 | set_out_opcode(out, kInstEmptyWidth); |
49 | empty_ = empty; |
50 | } |
51 | |
52 | void Prog::Inst::InitMatch(int32_t id) { |
53 | DCHECK_EQ(out_opcode_, 0); |
54 | set_opcode(kInstMatch); |
55 | match_id_ = id; |
56 | } |
57 | |
58 | void Prog::Inst::InitNop(uint32_t out) { |
59 | DCHECK_EQ(out_opcode_, 0); |
60 | set_opcode(kInstNop); |
61 | } |
62 | |
63 | void Prog::Inst::InitFail() { |
64 | DCHECK_EQ(out_opcode_, 0); |
65 | set_opcode(kInstFail); |
66 | } |
67 | |
68 | std::string Prog::Inst::Dump() { |
69 | switch (opcode()) { |
70 | default: |
71 | return StringPrintf("opcode %d" , static_cast<int>(opcode())); |
72 | |
73 | case kInstAlt: |
74 | return StringPrintf("alt -> %d | %d" , out(), out1_); |
75 | |
76 | case kInstAltMatch: |
77 | return StringPrintf("altmatch -> %d | %d" , out(), out1_); |
78 | |
79 | case kInstByteRange: |
80 | return StringPrintf("byte%s [%02x-%02x] %d -> %d" , |
81 | foldcase() ? "/i" : "" , |
82 | lo_, hi_, hint(), out()); |
83 | |
84 | case kInstCapture: |
85 | return StringPrintf("capture %d -> %d" , cap_, out()); |
86 | |
87 | case kInstEmptyWidth: |
88 | return StringPrintf("emptywidth %#x -> %d" , |
89 | static_cast<int>(empty_), out()); |
90 | |
91 | case kInstMatch: |
92 | return StringPrintf("match! %d" , match_id()); |
93 | |
94 | case kInstNop: |
95 | return StringPrintf("nop -> %d" , out()); |
96 | |
97 | case kInstFail: |
98 | return StringPrintf("fail" ); |
99 | } |
100 | } |
101 | |
102 | Prog::Prog() |
103 | : anchor_start_(false), |
104 | anchor_end_(false), |
105 | reversed_(false), |
106 | did_flatten_(false), |
107 | did_onepass_(false), |
108 | start_(0), |
109 | start_unanchored_(0), |
110 | size_(0), |
111 | bytemap_range_(0), |
112 | first_byte_(-1), |
113 | flags_(0), |
114 | list_count_(0), |
115 | dfa_mem_(0), |
116 | dfa_first_(NULL), |
117 | dfa_longest_(NULL) { |
118 | } |
119 | |
120 | Prog::~Prog() { |
121 | DeleteDFA(dfa_longest_); |
122 | DeleteDFA(dfa_first_); |
123 | } |
124 | |
125 | typedef SparseSet Workq; |
126 | |
127 | static inline void AddToQueue(Workq* q, int id) { |
128 | if (id != 0) |
129 | q->insert(id); |
130 | } |
131 | |
132 | static std::string ProgToString(Prog* prog, Workq* q) { |
133 | std::string s; |
134 | for (Workq::iterator i = q->begin(); i != q->end(); ++i) { |
135 | int id = *i; |
136 | Prog::Inst* ip = prog->inst(id); |
137 | s += StringPrintf("%d. %s\n" , id, ip->Dump().c_str()); |
138 | AddToQueue(q, ip->out()); |
139 | if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch) |
140 | AddToQueue(q, ip->out1()); |
141 | } |
142 | return s; |
143 | } |
144 | |
145 | static std::string FlattenedProgToString(Prog* prog, int start) { |
146 | std::string s; |
147 | for (int id = start; id < prog->size(); id++) { |
148 | Prog::Inst* ip = prog->inst(id); |
149 | if (ip->last()) |
150 | s += StringPrintf("%d. %s\n" , id, ip->Dump().c_str()); |
151 | else |
152 | s += StringPrintf("%d+ %s\n" , id, ip->Dump().c_str()); |
153 | } |
154 | return s; |
155 | } |
156 | |
157 | std::string Prog::Dump() { |
158 | if (did_flatten_) |
159 | return FlattenedProgToString(this, start_); |
160 | |
161 | Workq q(size_); |
162 | AddToQueue(&q, start_); |
163 | return ProgToString(this, &q); |
164 | } |
165 | |
166 | std::string Prog::DumpUnanchored() { |
167 | if (did_flatten_) |
168 | return FlattenedProgToString(this, start_unanchored_); |
169 | |
170 | Workq q(size_); |
171 | AddToQueue(&q, start_unanchored_); |
172 | return ProgToString(this, &q); |
173 | } |
174 | |
175 | std::string Prog::DumpByteMap() { |
176 | std::string map; |
177 | for (int c = 0; c < 256; c++) { |
178 | int b = bytemap_[c]; |
179 | int lo = c; |
180 | while (c < 256-1 && bytemap_[c+1] == b) |
181 | c++; |
182 | int hi = c; |
183 | map += StringPrintf("[%02x-%02x] -> %d\n" , lo, hi, b); |
184 | } |
185 | return map; |
186 | } |
187 | |
188 | int Prog::first_byte() { |
189 | std::call_once(first_byte_once_, [](Prog* prog) { |
190 | prog->first_byte_ = prog->ComputeFirstByte(); |
191 | }, this); |
192 | return first_byte_; |
193 | } |
194 | |
195 | static bool IsMatch(Prog*, Prog::Inst*); |
196 | |
197 | // Peep-hole optimizer. |
198 | void Prog::Optimize() { |
199 | Workq q(size_); |
200 | |
201 | // Eliminate nops. Most are taken out during compilation |
202 | // but a few are hard to avoid. |
203 | q.clear(); |
204 | AddToQueue(&q, start_); |
205 | for (Workq::iterator i = q.begin(); i != q.end(); ++i) { |
206 | int id = *i; |
207 | |
208 | Inst* ip = inst(id); |
209 | int j = ip->out(); |
210 | Inst* jp; |
211 | while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { |
212 | j = jp->out(); |
213 | } |
214 | ip->set_out(j); |
215 | AddToQueue(&q, ip->out()); |
216 | |
217 | if (ip->opcode() == kInstAlt) { |
218 | j = ip->out1(); |
219 | while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { |
220 | j = jp->out(); |
221 | } |
222 | ip->out1_ = j; |
223 | AddToQueue(&q, ip->out1()); |
224 | } |
225 | } |
226 | |
227 | // Insert kInstAltMatch instructions |
228 | // Look for |
229 | // ip: Alt -> j | k |
230 | // j: ByteRange [00-FF] -> ip |
231 | // k: Match |
232 | // or the reverse (the above is the greedy one). |
233 | // Rewrite Alt to AltMatch. |
234 | q.clear(); |
235 | AddToQueue(&q, start_); |
236 | for (Workq::iterator i = q.begin(); i != q.end(); ++i) { |
237 | int id = *i; |
238 | Inst* ip = inst(id); |
239 | AddToQueue(&q, ip->out()); |
240 | if (ip->opcode() == kInstAlt) |
241 | AddToQueue(&q, ip->out1()); |
242 | |
243 | if (ip->opcode() == kInstAlt) { |
244 | Inst* j = inst(ip->out()); |
245 | Inst* k = inst(ip->out1()); |
246 | if (j->opcode() == kInstByteRange && j->out() == id && |
247 | j->lo() == 0x00 && j->hi() == 0xFF && |
248 | IsMatch(this, k)) { |
249 | ip->set_opcode(kInstAltMatch); |
250 | continue; |
251 | } |
252 | if (IsMatch(this, j) && |
253 | k->opcode() == kInstByteRange && k->out() == id && |
254 | k->lo() == 0x00 && k->hi() == 0xFF) { |
255 | ip->set_opcode(kInstAltMatch); |
256 | } |
257 | } |
258 | } |
259 | } |
260 | |
261 | // Is ip a guaranteed match at end of text, perhaps after some capturing? |
262 | static bool IsMatch(Prog* prog, Prog::Inst* ip) { |
263 | for (;;) { |
264 | switch (ip->opcode()) { |
265 | default: |
266 | LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode(); |
267 | return false; |
268 | |
269 | case kInstAlt: |
270 | case kInstAltMatch: |
271 | case kInstByteRange: |
272 | case kInstFail: |
273 | case kInstEmptyWidth: |
274 | return false; |
275 | |
276 | case kInstCapture: |
277 | case kInstNop: |
278 | ip = prog->inst(ip->out()); |
279 | break; |
280 | |
281 | case kInstMatch: |
282 | return true; |
283 | } |
284 | } |
285 | } |
286 | |
287 | uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) { |
288 | int flags = 0; |
289 | |
290 | // ^ and \A |
291 | if (p == text.data()) |
292 | flags |= kEmptyBeginText | kEmptyBeginLine; |
293 | else if (p[-1] == '\n') |
294 | flags |= kEmptyBeginLine; |
295 | |
296 | // $ and \z |
297 | if (p == text.data() + text.size()) |
298 | flags |= kEmptyEndText | kEmptyEndLine; |
299 | else if (p < text.data() + text.size() && p[0] == '\n') |
300 | flags |= kEmptyEndLine; |
301 | |
302 | // \b and \B |
303 | if (p == text.data() && p == text.data() + text.size()) { |
304 | // no word boundary here |
305 | } else if (p == text.data()) { |
306 | if (IsWordChar(p[0])) |
307 | flags |= kEmptyWordBoundary; |
308 | } else if (p == text.data() + text.size()) { |
309 | if (IsWordChar(p[-1])) |
310 | flags |= kEmptyWordBoundary; |
311 | } else { |
312 | if (IsWordChar(p[-1]) != IsWordChar(p[0])) |
313 | flags |= kEmptyWordBoundary; |
314 | } |
315 | if (!(flags & kEmptyWordBoundary)) |
316 | flags |= kEmptyNonWordBoundary; |
317 | |
318 | return flags; |
319 | } |
320 | |
321 | // ByteMapBuilder implements a coloring algorithm. |
322 | // |
323 | // The first phase is a series of "mark and merge" batches: we mark one or more |
324 | // [lo-hi] ranges, then merge them into our internal state. Batching is not for |
325 | // performance; rather, it means that the ranges are treated indistinguishably. |
326 | // |
327 | // Internally, the ranges are represented using a bitmap that stores the splits |
328 | // and a vector that stores the colors; both of them are indexed by the ranges' |
329 | // last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at |
330 | // hi (if not already split), then recolor each range in between. The color map |
331 | // (i.e. from the old color to the new color) is maintained for the lifetime of |
332 | // the batch and so underpins this somewhat obscure approach to set operations. |
333 | // |
334 | // The second phase builds the bytemap from our internal state: we recolor each |
335 | // range, then store the new color (which is now the byte class) in each of the |
336 | // corresponding array elements. Finally, we output the number of byte classes. |
337 | class ByteMapBuilder { |
338 | public: |
339 | ByteMapBuilder() { |
340 | // Initial state: the [0-255] range has color 256. |
341 | // This will avoid problems during the second phase, |
342 | // in which we assign byte classes numbered from 0. |
343 | splits_.Set(255); |
344 | colors_[255] = 256; |
345 | nextcolor_ = 257; |
346 | } |
347 | |
348 | void Mark(int lo, int hi); |
349 | void Merge(); |
350 | void Build(uint8_t* bytemap, int* bytemap_range); |
351 | |
352 | private: |
353 | int Recolor(int oldcolor); |
354 | |
355 | Bitmap256 splits_; |
356 | int colors_[256]; |
357 | int nextcolor_; |
358 | std::vector<std::pair<int, int>> colormap_; |
359 | std::vector<std::pair<int, int>> ranges_; |
360 | |
361 | ByteMapBuilder(const ByteMapBuilder&) = delete; |
362 | ByteMapBuilder& operator=(const ByteMapBuilder&) = delete; |
363 | }; |
364 | |
365 | void ByteMapBuilder::Mark(int lo, int hi) { |
366 | DCHECK_GE(lo, 0); |
367 | DCHECK_GE(hi, 0); |
368 | DCHECK_LE(lo, 255); |
369 | DCHECK_LE(hi, 255); |
370 | DCHECK_LE(lo, hi); |
371 | |
372 | // Ignore any [0-255] ranges. They cause us to recolor every range, which |
373 | // has no effect on the eventual result and is therefore a waste of time. |
374 | if (lo == 0 && hi == 255) |
375 | return; |
376 | |
377 | ranges_.emplace_back(lo, hi); |
378 | } |
379 | |
380 | void ByteMapBuilder::Merge() { |
381 | for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin(); |
382 | it != ranges_.end(); |
383 | ++it) { |
384 | int lo = it->first-1; |
385 | int hi = it->second; |
386 | |
387 | if (0 <= lo && !splits_.Test(lo)) { |
388 | splits_.Set(lo); |
389 | int next = splits_.FindNextSetBit(lo+1); |
390 | colors_[lo] = colors_[next]; |
391 | } |
392 | if (!splits_.Test(hi)) { |
393 | splits_.Set(hi); |
394 | int next = splits_.FindNextSetBit(hi+1); |
395 | colors_[hi] = colors_[next]; |
396 | } |
397 | |
398 | int c = lo+1; |
399 | while (c < 256) { |
400 | int next = splits_.FindNextSetBit(c); |
401 | colors_[next] = Recolor(colors_[next]); |
402 | if (next == hi) |
403 | break; |
404 | c = next+1; |
405 | } |
406 | } |
407 | colormap_.clear(); |
408 | ranges_.clear(); |
409 | } |
410 | |
411 | void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) { |
412 | // Assign byte classes numbered from 0. |
413 | nextcolor_ = 0; |
414 | |
415 | int c = 0; |
416 | while (c < 256) { |
417 | int next = splits_.FindNextSetBit(c); |
418 | uint8_t b = static_cast<uint8_t>(Recolor(colors_[next])); |
419 | while (c <= next) { |
420 | bytemap[c] = b; |
421 | c++; |
422 | } |
423 | } |
424 | |
425 | *bytemap_range = nextcolor_; |
426 | } |
427 | |
428 | int ByteMapBuilder::Recolor(int oldcolor) { |
429 | // Yes, this is a linear search. There can be at most 256 |
430 | // colors and there will typically be far fewer than that. |
431 | // Also, we need to consider keys *and* values in order to |
432 | // avoid recoloring a given range more than once per batch. |
433 | std::vector<std::pair<int, int>>::const_iterator it = |
434 | std::find_if(colormap_.begin(), colormap_.end(), |
435 | [=](const std::pair<int, int>& kv) -> bool { |
436 | return kv.first == oldcolor || kv.second == oldcolor; |
437 | }); |
438 | if (it != colormap_.end()) |
439 | return it->second; |
440 | int newcolor = nextcolor_; |
441 | nextcolor_++; |
442 | colormap_.emplace_back(oldcolor, newcolor); |
443 | return newcolor; |
444 | } |
445 | |
446 | void Prog::ComputeByteMap() { |
447 | // Fill in bytemap with byte classes for the program. |
448 | // Ranges of bytes that are treated indistinguishably |
449 | // will be mapped to a single byte class. |
450 | ByteMapBuilder builder; |
451 | |
452 | // Don't repeat the work for ^ and $. |
453 | bool marked_line_boundaries = false; |
454 | // Don't repeat the work for \b and \B. |
455 | bool marked_word_boundaries = false; |
456 | |
457 | for (int id = 0; id < size(); id++) { |
458 | Inst* ip = inst(id); |
459 | if (ip->opcode() == kInstByteRange) { |
460 | int lo = ip->lo(); |
461 | int hi = ip->hi(); |
462 | builder.Mark(lo, hi); |
463 | if (ip->foldcase() && lo <= 'z' && hi >= 'a') { |
464 | int foldlo = lo; |
465 | int foldhi = hi; |
466 | if (foldlo < 'a') |
467 | foldlo = 'a'; |
468 | if (foldhi > 'z') |
469 | foldhi = 'z'; |
470 | if (foldlo <= foldhi) { |
471 | foldlo += 'A' - 'a'; |
472 | foldhi += 'A' - 'a'; |
473 | builder.Mark(foldlo, foldhi); |
474 | } |
475 | } |
476 | // If this Inst is not the last Inst in its list AND the next Inst is |
477 | // also a ByteRange AND the Insts have the same out, defer the merge. |
478 | if (!ip->last() && |
479 | inst(id+1)->opcode() == kInstByteRange && |
480 | ip->out() == inst(id+1)->out()) |
481 | continue; |
482 | builder.Merge(); |
483 | } else if (ip->opcode() == kInstEmptyWidth) { |
484 | if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) && |
485 | !marked_line_boundaries) { |
486 | builder.Mark('\n', '\n'); |
487 | builder.Merge(); |
488 | marked_line_boundaries = true; |
489 | } |
490 | if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) && |
491 | !marked_word_boundaries) { |
492 | // We require two batches here: the first for ranges that are word |
493 | // characters, the second for ranges that are not word characters. |
494 | for (bool isword : {true, false}) { |
495 | int j; |
496 | for (int i = 0; i < 256; i = j) { |
497 | for (j = i + 1; j < 256 && |
498 | Prog::IsWordChar(static_cast<uint8_t>(i)) == |
499 | Prog::IsWordChar(static_cast<uint8_t>(j)); |
500 | j++) |
501 | ; |
502 | if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword) |
503 | builder.Mark(i, j - 1); |
504 | } |
505 | builder.Merge(); |
506 | } |
507 | marked_word_boundaries = true; |
508 | } |
509 | } |
510 | } |
511 | |
512 | builder.Build(bytemap_, &bytemap_range_); |
513 | |
514 | if (0) { // For debugging, use trivial bytemap. |
515 | LOG(ERROR) << "Using trivial bytemap." ; |
516 | for (int i = 0; i < 256; i++) |
517 | bytemap_[i] = static_cast<uint8_t>(i); |
518 | bytemap_range_ = 256; |
519 | } |
520 | } |
521 | |
522 | // Prog::Flatten() implements a graph rewriting algorithm. |
523 | // |
524 | // The overall process is similar to epsilon removal, but retains some epsilon |
525 | // transitions: those from Capture and EmptyWidth instructions; and those from |
526 | // nullable subexpressions. (The latter avoids quadratic blowup in transitions |
527 | // in the worst case.) It might be best thought of as Alt instruction elision. |
528 | // |
529 | // In conceptual terms, it divides the Prog into "trees" of instructions, then |
530 | // traverses the "trees" in order to produce "lists" of instructions. A "tree" |
531 | // is one or more instructions that grow from one "root" instruction to one or |
532 | // more "leaf" instructions; if a "tree" has exactly one instruction, then the |
533 | // "root" is also the "leaf". In most cases, a "root" is the successor of some |
534 | // "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction) |
535 | // and is considered a "successor root". A "leaf" can be a ByteRange, Capture, |
536 | // EmptyWidth or Match instruction. However, this is insufficient for handling |
537 | // nested nullable subexpressions correctly, so in some cases, a "root" is the |
538 | // dominator of the instructions reachable from some "successor root" (i.e. it |
539 | // has an unreachable predecessor) and is considered a "dominator root". Since |
540 | // only Alt instructions can be "dominator roots" (other instructions would be |
541 | // "leaves"), only Alt instructions are required to be marked as predecessors. |
542 | // |
543 | // Dividing the Prog into "trees" comprises two passes: marking the "successor |
544 | // roots" and the predecessors; and marking the "dominator roots". Sorting the |
545 | // "successor roots" by their bytecode offsets enables iteration in order from |
546 | // greatest to least during the second pass; by working backwards in this case |
547 | // and flooding the graph no further than "leaves" and already marked "roots", |
548 | // it becomes possible to mark "dominator roots" without doing excessive work. |
549 | // |
550 | // Traversing the "trees" is just iterating over the "roots" in order of their |
551 | // marking and flooding the graph no further than "leaves" and "roots". When a |
552 | // "leaf" is reached, the instruction is copied with its successor remapped to |
553 | // its "root" number. When a "root" is reached, a Nop instruction is generated |
554 | // with its successor remapped similarly. As each "list" is produced, its last |
555 | // instruction is marked as such. After all of the "lists" have been produced, |
556 | // a pass over their instructions remaps their successors to bytecode offsets. |
557 | void Prog::Flatten() { |
558 | if (did_flatten_) |
559 | return; |
560 | did_flatten_ = true; |
561 | |
562 | // Scratch structures. It's important that these are reused by functions |
563 | // that we call in loops because they would thrash the heap otherwise. |
564 | SparseSet reachable(size()); |
565 | std::vector<int> stk; |
566 | stk.reserve(size()); |
567 | |
568 | // First pass: Marks "successor roots" and predecessors. |
569 | // Builds the mapping from inst-ids to root-ids. |
570 | SparseArray<int> rootmap(size()); |
571 | SparseArray<int> predmap(size()); |
572 | std::vector<std::vector<int>> predvec; |
573 | MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk); |
574 | |
575 | // Second pass: Marks "dominator roots". |
576 | SparseArray<int> sorted(rootmap); |
577 | std::sort(sorted.begin(), sorted.end(), sorted.less); |
578 | for (SparseArray<int>::const_iterator i = sorted.end() - 1; |
579 | i != sorted.begin(); |
580 | --i) { |
581 | if (i->index() != start_unanchored() && i->index() != start()) |
582 | MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk); |
583 | } |
584 | |
585 | // Third pass: Emits "lists". Remaps outs to root-ids. |
586 | // Builds the mapping from root-ids to flat-ids. |
587 | std::vector<int> flatmap(rootmap.size()); |
588 | std::vector<Inst> flat; |
589 | flat.reserve(size()); |
590 | for (SparseArray<int>::const_iterator i = rootmap.begin(); |
591 | i != rootmap.end(); |
592 | ++i) { |
593 | flatmap[i->value()] = static_cast<int>(flat.size()); |
594 | EmitList(i->index(), &rootmap, &flat, &reachable, &stk); |
595 | flat.back().set_last(); |
596 | // We have the bounds of the "list", so this is the |
597 | // most convenient point at which to compute hints. |
598 | ComputeHints(&flat, flatmap[i->value()], static_cast<int>(flat.size())); |
599 | } |
600 | |
601 | list_count_ = static_cast<int>(flatmap.size()); |
602 | for (int i = 0; i < kNumInst; i++) |
603 | inst_count_[i] = 0; |
604 | |
605 | // Fourth pass: Remaps outs to flat-ids. |
606 | // Counts instructions by opcode. |
607 | for (int id = 0; id < static_cast<int>(flat.size()); id++) { |
608 | Inst* ip = &flat[id]; |
609 | if (ip->opcode() != kInstAltMatch) // handled in EmitList() |
610 | ip->set_out(flatmap[ip->out()]); |
611 | inst_count_[ip->opcode()]++; |
612 | } |
613 | |
614 | int total = 0; |
615 | for (int i = 0; i < kNumInst; i++) |
616 | total += inst_count_[i]; |
617 | DCHECK_EQ(total, static_cast<int>(flat.size())); |
618 | |
619 | // Remap start_unanchored and start. |
620 | if (start_unanchored() == 0) { |
621 | DCHECK_EQ(start(), 0); |
622 | } else if (start_unanchored() == start()) { |
623 | set_start_unanchored(flatmap[1]); |
624 | set_start(flatmap[1]); |
625 | } else { |
626 | set_start_unanchored(flatmap[1]); |
627 | set_start(flatmap[2]); |
628 | } |
629 | |
630 | // Finally, replace the old instructions with the new instructions. |
631 | size_ = static_cast<int>(flat.size()); |
632 | inst_ = PODArray<Inst>(size_); |
633 | memmove(inst_.data(), flat.data(), size_*sizeof inst_[0]); |
634 | |
635 | // Populate the list heads for BitState. |
636 | // 512 instructions limits the memory footprint to 1KiB. |
637 | if (size_ <= 512) { |
638 | list_heads_ = PODArray<uint16_t>(size_); |
639 | // 0xFF makes it more obvious if we try to look up a non-head. |
640 | memset(list_heads_.data(), 0xFF, size_*sizeof list_heads_[0]); |
641 | for (int i = 0; i < list_count_; ++i) |
642 | list_heads_[flatmap[i]] = i; |
643 | } |
644 | } |
645 | |
646 | void Prog::MarkSuccessors(SparseArray<int>* rootmap, |
647 | SparseArray<int>* predmap, |
648 | std::vector<std::vector<int>>* predvec, |
649 | SparseSet* reachable, std::vector<int>* stk) { |
650 | // Mark the kInstFail instruction. |
651 | rootmap->set_new(0, rootmap->size()); |
652 | |
653 | // Mark the start_unanchored and start instructions. |
654 | if (!rootmap->has_index(start_unanchored())) |
655 | rootmap->set_new(start_unanchored(), rootmap->size()); |
656 | if (!rootmap->has_index(start())) |
657 | rootmap->set_new(start(), rootmap->size()); |
658 | |
659 | reachable->clear(); |
660 | stk->clear(); |
661 | stk->push_back(start_unanchored()); |
662 | while (!stk->empty()) { |
663 | int id = stk->back(); |
664 | stk->pop_back(); |
665 | Loop: |
666 | if (reachable->contains(id)) |
667 | continue; |
668 | reachable->insert_new(id); |
669 | |
670 | Inst* ip = inst(id); |
671 | switch (ip->opcode()) { |
672 | default: |
673 | LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); |
674 | break; |
675 | |
676 | case kInstAltMatch: |
677 | case kInstAlt: |
678 | // Mark this instruction as a predecessor of each out. |
679 | for (int out : {ip->out(), ip->out1()}) { |
680 | if (!predmap->has_index(out)) { |
681 | predmap->set_new(out, static_cast<int>(predvec->size())); |
682 | predvec->emplace_back(); |
683 | } |
684 | (*predvec)[predmap->get_existing(out)].emplace_back(id); |
685 | } |
686 | stk->push_back(ip->out1()); |
687 | id = ip->out(); |
688 | goto Loop; |
689 | |
690 | case kInstByteRange: |
691 | case kInstCapture: |
692 | case kInstEmptyWidth: |
693 | // Mark the out of this instruction as a "root". |
694 | if (!rootmap->has_index(ip->out())) |
695 | rootmap->set_new(ip->out(), rootmap->size()); |
696 | id = ip->out(); |
697 | goto Loop; |
698 | |
699 | case kInstNop: |
700 | id = ip->out(); |
701 | goto Loop; |
702 | |
703 | case kInstMatch: |
704 | case kInstFail: |
705 | break; |
706 | } |
707 | } |
708 | } |
709 | |
710 | void Prog::MarkDominator(int root, SparseArray<int>* rootmap, |
711 | SparseArray<int>* predmap, |
712 | std::vector<std::vector<int>>* predvec, |
713 | SparseSet* reachable, std::vector<int>* stk) { |
714 | reachable->clear(); |
715 | stk->clear(); |
716 | stk->push_back(root); |
717 | while (!stk->empty()) { |
718 | int id = stk->back(); |
719 | stk->pop_back(); |
720 | Loop: |
721 | if (reachable->contains(id)) |
722 | continue; |
723 | reachable->insert_new(id); |
724 | |
725 | if (id != root && rootmap->has_index(id)) { |
726 | // We reached another "tree" via epsilon transition. |
727 | continue; |
728 | } |
729 | |
730 | Inst* ip = inst(id); |
731 | switch (ip->opcode()) { |
732 | default: |
733 | LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); |
734 | break; |
735 | |
736 | case kInstAltMatch: |
737 | case kInstAlt: |
738 | stk->push_back(ip->out1()); |
739 | id = ip->out(); |
740 | goto Loop; |
741 | |
742 | case kInstByteRange: |
743 | case kInstCapture: |
744 | case kInstEmptyWidth: |
745 | break; |
746 | |
747 | case kInstNop: |
748 | id = ip->out(); |
749 | goto Loop; |
750 | |
751 | case kInstMatch: |
752 | case kInstFail: |
753 | break; |
754 | } |
755 | } |
756 | |
757 | for (SparseSet::const_iterator i = reachable->begin(); |
758 | i != reachable->end(); |
759 | ++i) { |
760 | int id = *i; |
761 | if (predmap->has_index(id)) { |
762 | for (int pred : (*predvec)[predmap->get_existing(id)]) { |
763 | if (!reachable->contains(pred)) { |
764 | // id has a predecessor that cannot be reached from root! |
765 | // Therefore, id must be a "root" too - mark it as such. |
766 | if (!rootmap->has_index(id)) |
767 | rootmap->set_new(id, rootmap->size()); |
768 | } |
769 | } |
770 | } |
771 | } |
772 | } |
773 | |
774 | void Prog::EmitList(int root, SparseArray<int>* rootmap, |
775 | std::vector<Inst>* flat, |
776 | SparseSet* reachable, std::vector<int>* stk) { |
777 | reachable->clear(); |
778 | stk->clear(); |
779 | stk->push_back(root); |
780 | while (!stk->empty()) { |
781 | int id = stk->back(); |
782 | stk->pop_back(); |
783 | Loop: |
784 | if (reachable->contains(id)) |
785 | continue; |
786 | reachable->insert_new(id); |
787 | |
788 | if (id != root && rootmap->has_index(id)) { |
789 | // We reached another "tree" via epsilon transition. Emit a kInstNop |
790 | // instruction so that the Prog does not become quadratically larger. |
791 | flat->emplace_back(); |
792 | flat->back().set_opcode(kInstNop); |
793 | flat->back().set_out(rootmap->get_existing(id)); |
794 | continue; |
795 | } |
796 | |
797 | Inst* ip = inst(id); |
798 | switch (ip->opcode()) { |
799 | default: |
800 | LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); |
801 | break; |
802 | |
803 | case kInstAltMatch: |
804 | flat->emplace_back(); |
805 | flat->back().set_opcode(kInstAltMatch); |
806 | flat->back().set_out(static_cast<int>(flat->size())); |
807 | flat->back().out1_ = static_cast<uint32_t>(flat->size())+1; |
808 | FALLTHROUGH_INTENDED; |
809 | |
810 | case kInstAlt: |
811 | stk->push_back(ip->out1()); |
812 | id = ip->out(); |
813 | goto Loop; |
814 | |
815 | case kInstByteRange: |
816 | case kInstCapture: |
817 | case kInstEmptyWidth: |
818 | flat->emplace_back(); |
819 | memmove(&flat->back(), ip, sizeof *ip); |
820 | flat->back().set_out(rootmap->get_existing(ip->out())); |
821 | break; |
822 | |
823 | case kInstNop: |
824 | id = ip->out(); |
825 | goto Loop; |
826 | |
827 | case kInstMatch: |
828 | case kInstFail: |
829 | flat->emplace_back(); |
830 | memmove(&flat->back(), ip, sizeof *ip); |
831 | break; |
832 | } |
833 | } |
834 | } |
835 | |
836 | // For each ByteRange instruction in [begin, end), computes a hint to execution |
837 | // engines: the delta to the next instruction (in flat) worth exploring iff the |
838 | // current instruction matched. |
839 | // |
840 | // Implements a coloring algorithm related to ByteMapBuilder, but in this case, |
841 | // colors are instructions and recoloring ranges precisely identifies conflicts |
842 | // between instructions. Iterating backwards over [begin, end) is guaranteed to |
843 | // identify the nearest conflict (if any) with only linear complexity. |
844 | void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) { |
845 | Bitmap256 splits; |
846 | int colors[256]; |
847 | |
848 | bool dirty = false; |
849 | for (int id = end; id >= begin; --id) { |
850 | if (id == end || |
851 | (*flat)[id].opcode() != kInstByteRange) { |
852 | if (dirty) { |
853 | dirty = false; |
854 | splits.Clear(); |
855 | } |
856 | splits.Set(255); |
857 | colors[255] = id; |
858 | // At this point, the [0-255] range is colored with id. |
859 | // Thus, hints cannot point beyond id; and if id == end, |
860 | // hints that would have pointed to id will be 0 instead. |
861 | continue; |
862 | } |
863 | dirty = true; |
864 | |
865 | // We recolor the [lo-hi] range with id. Note that first ratchets backwards |
866 | // from end to the nearest conflict (if any) during recoloring. |
867 | int first = end; |
868 | auto Recolor = [&](int lo, int hi) { |
869 | // Like ByteMapBuilder, we split at lo-1 and at hi. |
870 | --lo; |
871 | |
872 | if (0 <= lo && !splits.Test(lo)) { |
873 | splits.Set(lo); |
874 | int next = splits.FindNextSetBit(lo+1); |
875 | colors[lo] = colors[next]; |
876 | } |
877 | if (!splits.Test(hi)) { |
878 | splits.Set(hi); |
879 | int next = splits.FindNextSetBit(hi+1); |
880 | colors[hi] = colors[next]; |
881 | } |
882 | |
883 | int c = lo+1; |
884 | while (c < 256) { |
885 | int next = splits.FindNextSetBit(c); |
886 | // Ratchet backwards... |
887 | first = std::min(first, colors[next]); |
888 | // Recolor with id - because it's the new nearest conflict! |
889 | colors[next] = id; |
890 | if (next == hi) |
891 | break; |
892 | c = next+1; |
893 | } |
894 | }; |
895 | |
896 | Inst* ip = &(*flat)[id]; |
897 | int lo = ip->lo(); |
898 | int hi = ip->hi(); |
899 | Recolor(lo, hi); |
900 | if (ip->foldcase() && lo <= 'z' && hi >= 'a') { |
901 | int foldlo = lo; |
902 | int foldhi = hi; |
903 | if (foldlo < 'a') |
904 | foldlo = 'a'; |
905 | if (foldhi > 'z') |
906 | foldhi = 'z'; |
907 | if (foldlo <= foldhi) { |
908 | foldlo += 'A' - 'a'; |
909 | foldhi += 'A' - 'a'; |
910 | Recolor(foldlo, foldhi); |
911 | } |
912 | } |
913 | |
914 | if (first != end) { |
915 | uint16_t hint = static_cast<uint16_t>(std::min(first - id, 32767)); |
916 | ip->hint_foldcase_ |= hint<<1; |
917 | } |
918 | } |
919 | } |
920 | |
921 | } // namespace re2 |
922 | |