prog.cc source code [RE2/re2/prog.cc]

1	// Copyright 2007 The RE2 Authors. All Rights Reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Compiled regular expression representation.
6	// Tested by compile_test.cc
7
8	#include "re2/prog.h"
9
10	#include <stdint.h>
11	#include <string.h>
12	#include <algorithm>
13	#include <memory>
14	#include <utility>
15
16	#include "util/util.h"
17	#include "util/logging.h"
18	#include "util/strutil.h"
19	#include "re2/bitmap256.h"
20	#include "re2/stringpiece.h"
21
22	namespace re2 {
23
24	// Constructors per Inst opcode
25
26	void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
27	DCHECK_EQ(out_opcode_, `0`);
28	set_out_opcode(out, kInstAlt);
29	out1_ = out1;
30	}
31
32	void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
33	DCHECK_EQ(out_opcode_, `0`);
34	set_out_opcode(out, kInstByteRange);
35	lo_ = lo & `0xFF`;
36	hi_ = hi & `0xFF`;
37	hint_foldcase_ = foldcase&`1`;
38	}
39
40	void Prog::Inst::InitCapture(int cap, uint32_t out) {
41	DCHECK_EQ(out_opcode_, `0`);
42	set_out_opcode(out, kInstCapture);
43	cap_ = cap;
44	}
45
46	void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
47	DCHECK_EQ(out_opcode_, `0`);
48	set_out_opcode(out, kInstEmptyWidth);
49	empty_ = empty;
50	}
51
52	void Prog::Inst::InitMatch(int32_t id) {
53	DCHECK_EQ(out_opcode_, `0`);
54	set_opcode(kInstMatch);
55	match_id_ = id;
56	}
57
58	void Prog::Inst::InitNop(uint32_t out) {
59	DCHECK_EQ(out_opcode_, `0`);
60	set_opcode(kInstNop);
61	}
62
63	void Prog::Inst::InitFail() {
64	DCHECK_EQ(out_opcode_, `0`);
65	set_opcode(kInstFail);
66	}
67
68	std::string Prog::Inst::Dump() {
69	switch (opcode()) {
70	default:
71	return StringPrintf("opcode %d", static_cast<int>(opcode()));
72
73	case kInstAlt:
74	return StringPrintf("alt -> %d \| %d", out(), out1_);
75
76	case kInstAltMatch:
77	return StringPrintf("altmatch -> %d \| %d", out(), out1_);
78
79	case kInstByteRange:
80	return StringPrintf("byte%s [%02x-%02x] %d -> %d",
81	foldcase() ? "/i" : "",
82	lo_, hi_, hint(), out());
83
84	case kInstCapture:
85	return StringPrintf("capture %d -> %d", cap_, out());
86
87	case kInstEmptyWidth:
88	return StringPrintf("emptywidth %#x -> %d",
89	static_cast<int>(empty_), out());
90
91	case kInstMatch:
92	return StringPrintf("match! %d", match_id());
93
94	case kInstNop:
95	return StringPrintf("nop -> %d", out());
96
97	case kInstFail:
98	return StringPrintf("fail");
99	}
100	}
101
102	Prog::Prog()
103	: anchor_start_(false),
104	anchor_end_(false),
105	reversed_(false),
106	did_flatten_(false),
107	did_onepass_(false),
108	start_(`0`),
109	start_unanchored_(`0`),
110	size_(`0`),
111	bytemap_range_(`0`),
112	first_byte_(-`1`),
113	flags_(`0`),
114	list_count_(`0`),
115	dfa_mem_(`0`),
116	dfa_first_(NULL),
117	dfa_longest_(NULL) {
118	}
119
120	Prog::~Prog() {
121	DeleteDFA(dfa_longest_);
122	DeleteDFA(dfa_first_);
123	}
124
125	typedef SparseSet Workq;
126
127	static inline void AddToQueue(Workq* q, int id) {
128	if (id != `0`)
129	q->insert(id);
130	}
131
132	static std::string ProgToString(Prog* prog, Workq* q) {
133	std::string s;
134	for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
135	int id = *i;
136	Prog::Inst* ip = prog->inst(id);
137	s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
138	AddToQueue(q, ip->out());
139	if (ip->opcode() == kInstAlt \|\| ip->opcode() == kInstAltMatch)
140	AddToQueue(q, ip->out1());
141	}
142	return s;
143	}
144
145	static std::string FlattenedProgToString(Prog* prog, int start) {
146	std::string s;
147	for (int id = start; id < prog->size(); id++) {
148	Prog::Inst* ip = prog->inst(id);
149	if (ip->last())
150	s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
151	else
152	s += StringPrintf("%d+ %s\n", id, ip->Dump().c_str());
153	}
154	return s;
155	}
156
157	std::string Prog::Dump() {
158	if (did_flatten_)
159	return FlattenedProgToString(this, start_);
160
161	Workq q(size_);
162	AddToQueue(&q, start_);
163	return ProgToString(this, &q);
164	}
165
166	std::string Prog::DumpUnanchored() {
167	if (did_flatten_)
168	return FlattenedProgToString(this, start_unanchored_);
169
170	Workq q(size_);
171	AddToQueue(&q, start_unanchored_);
172	return ProgToString(this, &q);
173	}
174
175	std::string Prog::DumpByteMap() {
176	std::string map;
177	for (int c = `0`; c < `256`; c++) {
178	int b = bytemap_[c];
179	int lo = c;
180	while (c < `256`-`1` && bytemap_[c+`1`] == b)
181	c++;
182	int hi = c;
183	map += StringPrintf("[%02x-%02x] -> %d\n", lo, hi, b);
184	}
185	return map;
186	}
187
188	int Prog::first_byte() {
189	std::call_once(first_byte_once_, [](Prog* prog) {
190	prog->first_byte_ = prog->ComputeFirstByte();
191	}, this);
192	return first_byte_;
193	}
194
195	static bool IsMatch(Prog, Prog::Inst);
196
197	// Peep-hole optimizer.
198	void Prog::Optimize() {
199	Workq q(size_);
200
201	// Eliminate nops. Most are taken out during compilation
202	// but a few are hard to avoid.
203	q.clear();
204	AddToQueue(&q, start_);
205	for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
206	int id = *i;
207
208	Inst* ip = inst(id);
209	int j = ip->out();
210	Inst* jp;
211	while (j != `0` && (jp=inst(j))->opcode() == kInstNop) {
212	j = jp->out();
213	}
214	ip->set_out(j);
215	AddToQueue(&q, ip->out());
216
217	if (ip->opcode() == kInstAlt) {
218	j = ip->out1();
219	while (j != `0` && (jp=inst(j))->opcode() == kInstNop) {
220	j = jp->out();
221	}
222	ip->out1_ = j;
223	AddToQueue(&q, ip->out1());
224	}
225	}
226
227	// Insert kInstAltMatch instructions
228	// Look for
229	// ip: Alt -> j \| k
230	// j: ByteRange [00-FF] -> ip
231	// k: Match
232	// or the reverse (the above is the greedy one).
233	// Rewrite Alt to AltMatch.
234	q.clear();
235	AddToQueue(&q, start_);
236	for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
237	int id = *i;
238	Inst* ip = inst(id);
239	AddToQueue(&q, ip->out());
240	if (ip->opcode() == kInstAlt)
241	AddToQueue(&q, ip->out1());
242
243	if (ip->opcode() == kInstAlt) {
244	Inst* j = inst(ip->out());
245	Inst* k = inst(ip->out1());
246	if (j->opcode() == kInstByteRange && j->out() == id &&
247	j->lo() == `0x00` && j->hi() == `0xFF` &&
248	IsMatch(this, k)) {
249	ip->set_opcode(kInstAltMatch);
250	continue;
251	}
252	if (IsMatch(this, j) &&
253	k->opcode() == kInstByteRange && k->out() == id &&
254	k->lo() == `0x00` && k->hi() == `0xFF`) {
255	ip->set_opcode(kInstAltMatch);
256	}
257	}
258	}
259	}
260
261	// Is ip a guaranteed match at end of text, perhaps after some capturing?
262	static bool IsMatch(Prog* prog, Prog::Inst* ip) {
263	for (;;) {
264	switch (ip->opcode()) {
265	default:
266	LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
267	return false;
268
269	case kInstAlt:
270	case kInstAltMatch:
271	case kInstByteRange:
272	case kInstFail:
273	case kInstEmptyWidth:
274	return false;
275
276	case kInstCapture:
277	case kInstNop:
278	ip = prog->inst(ip->out());
279	break;
280
281	case kInstMatch:
282	return true;
283	}
284	}
285	}
286
287	uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
288	int flags = `0`;
289
290	// ^ and \A
291	if (p == text.data())
292	flags \|= kEmptyBeginText \| kEmptyBeginLine;
293	else if (p[-`1`] == `'\n'`)
294	flags \|= kEmptyBeginLine;
295
296	// $ and \z
297	if (p == text.data() + text.size())
298	flags \|= kEmptyEndText \| kEmptyEndLine;
299	else if (p < text.data() + text.size() && p[`0`] == `'\n'`)
300	flags \|= kEmptyEndLine;
301
302	// \b and \B
303	if (p == text.data() && p == text.data() + text.size()) {
304	// no word boundary here
305	} else if (p == text.data()) {
306	if (IsWordChar(p[`0`]))
307	flags \|= kEmptyWordBoundary;
308	} else if (p == text.data() + text.size()) {
309	if (IsWordChar(p[-`1`]))
310	flags \|= kEmptyWordBoundary;
311	} else {
312	if (IsWordChar(p[-`1`]) != IsWordChar(p[`0`]))
313	flags \|= kEmptyWordBoundary;
314	}
315	if (!(flags & kEmptyWordBoundary))
316	flags \|= kEmptyNonWordBoundary;
317
318	return flags;
319	}
320
321	// ByteMapBuilder implements a coloring algorithm.
322	//
323	// The first phase is a series of "mark and merge" batches: we mark one or more
324	// [lo-hi] ranges, then merge them into our internal state. Batching is not for
325	// performance; rather, it means that the ranges are treated indistinguishably.
326	//
327	// Internally, the ranges are represented using a bitmap that stores the splits
328	// and a vector that stores the colors; both of them are indexed by the ranges'
329	// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
330	// hi (if not already split), then recolor each range in between. The color map
331	// (i.e. from the old color to the new color) is maintained for the lifetime of
332	// the batch and so underpins this somewhat obscure approach to set operations.
333	//
334	// The second phase builds the bytemap from our internal state: we recolor each
335	// range, then store the new color (which is now the byte class) in each of the
336	// corresponding array elements. Finally, we output the number of byte classes.
337	class ByteMapBuilder {
338	public:
339	ByteMapBuilder() {
340	// Initial state: the [0-255] range has color 256.
341	// This will avoid problems during the second phase,
342	// in which we assign byte classes numbered from 0.
343	splits_.Set(`255`);
344	colors_[`255`] = `256`;
345	nextcolor_ = `257`;
346	}
347
348	void Mark(int lo, int hi);
349	void Merge();
350	void Build(uint8_t* bytemap, int* bytemap_range);
351
352	private:
353	int Recolor(int oldcolor);
354
355	Bitmap256 splits_;
356	int colors_[`256`];
357	int nextcolor_;
358	std::vector<std::pair<int, int>> colormap_;
359	std::vector<std::pair<int, int>> ranges_;
360
361	ByteMapBuilder(const ByteMapBuilder&) = delete;
362	ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
363	};
364
365	void ByteMapBuilder::Mark(int lo, int hi) {
366	DCHECK_GE(lo, `0`);
367	DCHECK_GE(hi, `0`);
368	DCHECK_LE(lo, `255`);
369	DCHECK_LE(hi, `255`);
370	DCHECK_LE(lo, hi);
371
372	// Ignore any [0-255] ranges. They cause us to recolor every range, which
373	// has no effect on the eventual result and is therefore a waste of time.
374	if (lo == `0` && hi == `255`)
375	return;
376
377	ranges_.emplace_back(lo, hi);
378	}
379
380	void ByteMapBuilder::Merge() {
381	for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
382	it != ranges_.end();
383	++it) {
384	int lo = it ->first-`1`;
385	int hi = it ->second;
386
387	if (`0` <= lo && !splits_.Test(lo)) {
388	splits_.Set(lo);
389	int next = splits_.FindNextSetBit(lo+`1`);
390	colors_[lo] = colors_[next];
391	}
392	if (!splits_.Test(hi)) {
393	splits_.Set(hi);
394	int next = splits_.FindNextSetBit(hi+`1`);
395	colors_[hi] = colors_[next];
396	}
397
398	int c = lo+`1`;
399	while (c < `256`) {
400	int next = splits_.FindNextSetBit(c);
401	colors_[next] = Recolor(colors_[next]);
402	if (next == hi)
403	break;
404	c = next+`1`;
405	}
406	}
407	colormap_.clear();
408	ranges_.clear();
409	}
410
411	void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
412	// Assign byte classes numbered from 0.
413	nextcolor_ = `0`;
414
415	int c = `0`;
416	while (c < `256`) {
417	int next = splits_.FindNextSetBit(c);
418	uint8_t b = static_cast<uint8_t>(Recolor(colors_[next]));
419	while (c <= next) {
420	bytemap[c] = b;
421	c++;
422	}
423	}
424
425	*bytemap_range = nextcolor_;
426	}
427
428	int ByteMapBuilder::Recolor(int oldcolor) {
429	// Yes, this is a linear search. There can be at most 256
430	// colors and there will typically be far fewer than that.
431	// Also, we need to consider keys and* values in order to*
432	// avoid recoloring a given range more than once per batch.
433	std::vector<std::pair<int, int>>::const_iterator it =
434	std::find_if(colormap_.begin(), colormap_.end(),
435	[=](const std::pair<int, int>& kv) -> bool {
436	return kv.first == oldcolor \|\| kv.second == oldcolor;
437	});
438	if (it != colormap_.end())
439	return it ->second;
440	int newcolor = nextcolor_;
441	nextcolor_++;
442	colormap_.emplace_back(oldcolor, newcolor);
443	return newcolor;
444	}
445
446	void Prog::ComputeByteMap() {
447	// Fill in bytemap with byte classes for the program.
448	// Ranges of bytes that are treated indistinguishably
449	// will be mapped to a single byte class.
450	ByteMapBuilder builder;
451
452	// Don't repeat the work for ^ and $.
453	bool marked_line_boundaries = false;
454	// Don't repeat the work for \b and \B.
455	bool marked_word_boundaries = false;
456
457	for (int id = `0`; id < size(); id++) {
458	Inst* ip = inst(id);
459	if (ip->opcode() == kInstByteRange) {
460	int lo = ip->lo();
461	int hi = ip->hi();
462	builder.Mark(lo, hi);
463	if (ip->foldcase() && lo <= `'z'` && hi >= `'a'`) {
464	int foldlo = lo;
465	int foldhi = hi;
466	if (foldlo < `'a'`)
467	foldlo = `'a'`;
468	if (foldhi > `'z'`)
469	foldhi = `'z'`;
470	if (foldlo <= foldhi) {
471	foldlo += `'A'` - `'a'`;
472	foldhi += `'A'` - `'a'`;
473	builder.Mark(foldlo, foldhi);
474	}
475	}
476	// If this Inst is not the last Inst in its list AND the next Inst is
477	// also a ByteRange AND the Insts have the same out, defer the merge.
478	if (!ip->last() &&
479	inst(id+`1`)->opcode() == kInstByteRange &&
480	ip->out() == inst(id+`1`)->out())
481	continue;
482	builder.Merge();
483	} else if (ip->opcode() == kInstEmptyWidth) {
484	if (ip->empty() & (kEmptyBeginLine\|kEmptyEndLine) &&
485	!marked_line_boundaries) {
486	builder.Mark(`'\n'`, `'\n'`);
487	builder.Merge();
488	marked_line_boundaries = true;
489	}
490	if (ip->empty() & (kEmptyWordBoundary\|kEmptyNonWordBoundary) &&
491	!marked_word_boundaries) {
492	// We require two batches here: the first for ranges that are word
493	// characters, the second for ranges that are not word characters.
494	for (bool isword : {true, false}) {
495	int j;
496	for (int i = `0`; i < `256`; i = j) {
497	for (j = i + `1`; j < `256` &&
498	Prog::IsWordChar(static_cast<uint8_t>(i)) ==
499	Prog::IsWordChar(static_cast<uint8_t>(j));
500	j++)
501	;
502	if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
503	builder.Mark(i, j - `1`);
504	}
505	builder.Merge();
506	}
507	marked_word_boundaries = true;
508	}
509	}
510	}
511
512	builder.Build(bytemap_, &bytemap_range_);
513
514	if (`0`) { // For debugging, use trivial bytemap.
515	LOG(ERROR) << "Using trivial bytemap.";
516	for (int i = `0`; i < `256`; i++)
517	bytemap_[i] = static_cast<uint8_t>(i);
518	bytemap_range_ = `256`;
519	}
520	}
521
522	// Prog::Flatten() implements a graph rewriting algorithm.
523	//
524	// The overall process is similar to epsilon removal, but retains some epsilon
525	// transitions: those from Capture and EmptyWidth instructions; and those from
526	// nullable subexpressions. (The latter avoids quadratic blowup in transitions
527	// in the worst case.) It might be best thought of as Alt instruction elision.
528	//
529	// In conceptual terms, it divides the Prog into "trees" of instructions, then
530	// traverses the "trees" in order to produce "lists" of instructions. A "tree"
531	// is one or more instructions that grow from one "root" instruction to one or
532	// more "leaf" instructions; if a "tree" has exactly one instruction, then the
533	// "root" is also the "leaf". In most cases, a "root" is the successor of some
534	// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
535	// and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
536	// EmptyWidth or Match instruction. However, this is insufficient for handling
537	// nested nullable subexpressions correctly, so in some cases, a "root" is the
538	// dominator of the instructions reachable from some "successor root" (i.e. it
539	// has an unreachable predecessor) and is considered a "dominator root". Since
540	// only Alt instructions can be "dominator roots" (other instructions would be
541	// "leaves"), only Alt instructions are required to be marked as predecessors.
542	//
543	// Dividing the Prog into "trees" comprises two passes: marking the "successor
544	// roots" and the predecessors; and marking the "dominator roots". Sorting the
545	// "successor roots" by their bytecode offsets enables iteration in order from
546	// greatest to least during the second pass; by working backwards in this case
547	// and flooding the graph no further than "leaves" and already marked "roots",
548	// it becomes possible to mark "dominator roots" without doing excessive work.
549	//
550	// Traversing the "trees" is just iterating over the "roots" in order of their
551	// marking and flooding the graph no further than "leaves" and "roots". When a
552	// "leaf" is reached, the instruction is copied with its successor remapped to
553	// its "root" number. When a "root" is reached, a Nop instruction is generated
554	// with its successor remapped similarly. As each "list" is produced, its last
555	// instruction is marked as such. After all of the "lists" have been produced,
556	// a pass over their instructions remaps their successors to bytecode offsets.
557	void Prog::Flatten() {
558	if (did_flatten_)
559	return;
560	did_flatten_ = true;
561
562	// Scratch structures. It's important that these are reused by functions
563	// that we call in loops because they would thrash the heap otherwise.
564	SparseSet reachable(size());
565	std::vector<int> stk;
566	stk.reserve(size());
567
568	// First pass: Marks "successor roots" and predecessors.
569	// Builds the mapping from inst-ids to root-ids.
570	SparseArray<int> rootmap(size());
571	SparseArray<int> predmap(size());
572	std::vector<std::vector<int>> predvec;
573	MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
574
575	// Second pass: Marks "dominator roots".
576	SparseArray<int> sorted(rootmap);
577	std::sort(sorted.begin(), sorted.end(), sorted.less);
578	for (SparseArray<int>::const_iterator i = sorted.end() - `1`;
579	i != sorted.begin();
580	--i) {
581	if (i->index() != start_unanchored() && i->index() != start())
582	MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
583	}
584
585	// Third pass: Emits "lists". Remaps outs to root-ids.
586	// Builds the mapping from root-ids to flat-ids.
587	std::vector<int> flatmap(rootmap.size());
588	std::vector<Inst> flat;
589	flat.reserve(size());
590	for (SparseArray<int>::const_iterator i = rootmap.begin();
591	i != rootmap.end();
592	++i) {
593	flatmap [i->value()] = static_cast<int>(flat.size());
594	EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
595	flat.back().set_last();
596	// We have the bounds of the "list", so this is the
597	// most convenient point at which to compute hints.
598	ComputeHints(&flat, flatmap [i->value()], static_cast<int>(flat.size()));
599	}
600
601	list_count_ = static_cast<int>(flatmap.size());
602	for (int i = `0`; i < kNumInst; i++)
603	inst_count_[i] = `0`;
604
605	// Fourth pass: Remaps outs to flat-ids.
606	// Counts instructions by opcode.
607	for (int id = `0`; id < static_cast<int>(flat.size()); id++) {
608	Inst* ip = &flat [id];
609	if (ip->opcode() != kInstAltMatch) // handled in EmitList()
610	ip->set_out(flatmap [ip->out()]);
611	inst_count_[ip->opcode()]++;
612	}
613
614	int total = `0`;
615	for (int i = `0`; i < kNumInst; i++)
616	total += inst_count_[i];
617	DCHECK_EQ(total, static_cast<int>(flat.size()));
618
619	// Remap start_unanchored and start.
620	if (start_unanchored() == `0`) {
621	DCHECK_EQ(start(), `0`);
622	} else if (start_unanchored() == start()) {
623	set_start_unanchored(flatmap [`1`]);
624	set_start(flatmap [`1`]);
625	} else {
626	set_start_unanchored(flatmap [`1`]);
627	set_start(flatmap [`2`]);
628	}
629
630	// Finally, replace the old instructions with the new instructions.
631	size_ = static_cast<int>(flat.size());
632	inst_ = PODArray<Inst>(size_);
633	memmove(inst_.data(), flat.data(), size_*sizeof inst_[`0`]);
634
635	// Populate the list heads for BitState.
636	// 512 instructions limits the memory footprint to 1KiB.
637	if (size_ <= `512`) {
638	list_heads_ = PODArray<uint16_t>(size_);
639	// 0xFF makes it more obvious if we try to look up a non-head.
640	memset(list_heads_.data(), `0xFF`, size_*sizeof list_heads_[`0`]);
641	for (int i = `0`; i < list_count_; ++i)
642	list_heads_[flatmap [i]] = i;
643	}
644	}
645
646	void Prog::MarkSuccessors(SparseArray<int>* rootmap,
647	SparseArray<int>* predmap,
648	std::vector<std::vector<int>>* predvec,
649	SparseSet* reachable, std::vector<int>* stk) {
650	// Mark the kInstFail instruction.
651	rootmap->set_new(`0`, rootmap->size());
652
653	// Mark the start_unanchored and start instructions.
654	if (!rootmap->has_index(start_unanchored()))
655	rootmap->set_new(start_unanchored(), rootmap->size());
656	if (!rootmap->has_index(start()))
657	rootmap->set_new(start(), rootmap->size());
658
659	reachable->clear();
660	stk->clear();
661	stk->push_back(start_unanchored());
662	while (!stk->empty()) {
663	int id = stk->back();
664	stk->pop_back();
665	Loop:
666	if (reachable->contains(id))
667	continue;
668	reachable->insert_new(id);
669
670	Inst* ip = inst(id);
671	switch (ip->opcode()) {
672	default:
673	LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
674	break;
675
676	case kInstAltMatch:
677	case kInstAlt:
678	// Mark this instruction as a predecessor of each out.
679	for (int out : {ip->out(), ip->out1()}) {
680	if (!predmap->has_index(out)) {
681	predmap->set_new(out, static_cast<int>(predvec->size()));
682	predvec->emplace_back();
683	}
684	(*predvec)[predmap->get_existing(out)].emplace_back(id);
685	}
686	stk->push_back(ip->out1());
687	id = ip->out();
688	goto Loop;
689
690	case kInstByteRange:
691	case kInstCapture:
692	case kInstEmptyWidth:
693	// Mark the out of this instruction as a "root".
694	if (!rootmap->has_index(ip->out()))
695	rootmap->set_new(ip->out(), rootmap->size());
696	id = ip->out();
697	goto Loop;
698
699	case kInstNop:
700	id = ip->out();
701	goto Loop;
702
703	case kInstMatch:
704	case kInstFail:
705	break;
706	}
707	}
708	}
709
710	void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
711	SparseArray<int>* predmap,
712	std::vector<std::vector<int>>* predvec,
713	SparseSet* reachable, std::vector<int>* stk) {
714	reachable->clear();
715	stk->clear();
716	stk->push_back(root);
717	while (!stk->empty()) {
718	int id = stk->back();
719	stk->pop_back();
720	Loop:
721	if (reachable->contains(id))
722	continue;
723	reachable->insert_new(id);
724
725	if (id != root && rootmap->has_index(id)) {
726	// We reached another "tree" via epsilon transition.
727	continue;
728	}
729
730	Inst* ip = inst(id);
731	switch (ip->opcode()) {
732	default:
733	LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
734	break;
735
736	case kInstAltMatch:
737	case kInstAlt:
738	stk->push_back(ip->out1());
739	id = ip->out();
740	goto Loop;
741
742	case kInstByteRange:
743	case kInstCapture:
744	case kInstEmptyWidth:
745	break;
746
747	case kInstNop:
748	id = ip->out();
749	goto Loop;
750
751	case kInstMatch:
752	case kInstFail:
753	break;
754	}
755	}
756
757	for (SparseSet::const_iterator i = reachable->begin();
758	i != reachable->end();
759	++i) {
760	int id = *i;
761	if (predmap->has_index(id)) {
762	for (int pred : (*predvec)[predmap->get_existing(id)]) {
763	if (!reachable->contains(pred)) {
764	// id has a predecessor that cannot be reached from root!
765	// Therefore, id must be a "root" too - mark it as such.
766	if (!rootmap->has_index(id))
767	rootmap->set_new(id, rootmap->size());
768	}
769	}
770	}
771	}
772	}
773
774	void Prog::EmitList(int root, SparseArray<int>* rootmap,
775	std::vector<Inst>* flat,
776	SparseSet* reachable, std::vector<int>* stk) {
777	reachable->clear();
778	stk->clear();
779	stk->push_back(root);
780	while (!stk->empty()) {
781	int id = stk->back();
782	stk->pop_back();
783	Loop:
784	if (reachable->contains(id))
785	continue;
786	reachable->insert_new(id);
787
788	if (id != root && rootmap->has_index(id)) {
789	// We reached another "tree" via epsilon transition. Emit a kInstNop
790	// instruction so that the Prog does not become quadratically larger.
791	flat->emplace_back();
792	flat->back().set_opcode(kInstNop);
793	flat->back().set_out(rootmap->get_existing(id));
794	continue;
795	}
796
797	Inst* ip = inst(id);
798	switch (ip->opcode()) {
799	default:
800	LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
801	break;
802
803	case kInstAltMatch:
804	flat->emplace_back();
805	flat->back().set_opcode(kInstAltMatch);
806	flat->back().set_out(static_cast<int>(flat->size()));
807	flat->back().out1_ = static_cast<uint32_t>(flat->size())+`1`;
808	FALLTHROUGH_INTENDED;
809
810	case kInstAlt:
811	stk->push_back(ip->out1());
812	id = ip->out();
813	goto Loop;
814
815	case kInstByteRange:
816	case kInstCapture:
817	case kInstEmptyWidth:
818	flat->emplace_back();
819	memmove(&flat->back(), ip, sizeof *ip);
820	flat->back().set_out(rootmap->get_existing(ip->out()));
821	break;
822
823	case kInstNop:
824	id = ip->out();
825	goto Loop;
826
827	case kInstMatch:
828	case kInstFail:
829	flat->emplace_back();
830	memmove(&flat->back(), ip, sizeof *ip);
831	break;
832	}
833	}
834	}
835
836	// For each ByteRange instruction in [begin, end), computes a hint to execution
837	// engines: the delta to the next instruction (in flat) worth exploring iff the
838	// current instruction matched.
839	//
840	// Implements a coloring algorithm related to ByteMapBuilder, but in this case,
841	// colors are instructions and recoloring ranges precisely identifies conflicts
842	// between instructions. Iterating backwards over [begin, end) is guaranteed to
843	// identify the nearest conflict (if any) with only linear complexity.
844	void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) {
845	Bitmap256 splits;
846	int colors[`256`];
847
848	bool dirty = false;
849	for (int id = end; id >= begin; --id) {
850	if (id == end \|\|
851	(*flat)[id].opcode() != kInstByteRange) {
852	if (dirty) {
853	dirty = false;
854	splits.Clear();
855	}
856	splits.Set(`255`);
857	colors[`255`] = id;
858	// At this point, the [0-255] range is colored with id.
859	// Thus, hints cannot point beyond id; and if id == end,
860	// hints that would have pointed to id will be 0 instead.
861	continue;
862	}
863	dirty = true;
864
865	// We recolor the [lo-hi] range with id. Note that first ratchets backwards
866	// from end to the nearest conflict (if any) during recoloring.
867	int first = end;
868	auto Recolor = [&](int lo, int hi) {
869	// Like ByteMapBuilder, we split at lo-1 and at hi.
870	--lo;
871
872	if (`0` <= lo && !splits.Test(lo)) {
873	splits.Set(lo);
874	int next = splits.FindNextSetBit(lo+`1`);
875	colors[lo] = colors[next];
876	}
877	if (!splits.Test(hi)) {
878	splits.Set(hi);
879	int next = splits.FindNextSetBit(hi+`1`);
880	colors[hi] = colors[next];
881	}
882
883	int c = lo+`1`;
884	while (c < `256`) {
885	int next = splits.FindNextSetBit(c);
886	// Ratchet backwards...
887	first = std::min(first, colors[next]);
888	// Recolor with id - because it's the new nearest conflict!
889	colors[next] = id;
890	if (next == hi)
891	break;
892	c = next+`1`;
893	}
894	};
895
896	Inst* ip = &(*flat)[id];
897	int lo = ip->lo();
898	int hi = ip->hi();
899	Recolor (lo, hi);
900	if (ip->foldcase() && lo <= `'z'` && hi >= `'a'`) {
901	int foldlo = lo;
902	int foldhi = hi;
903	if (foldlo < `'a'`)
904	foldlo = `'a'`;
905	if (foldhi > `'z'`)
906	foldhi = `'z'`;
907	if (foldlo <= foldhi) {
908	foldlo += `'A'` - `'a'`;
909	foldhi += `'A'` - `'a'`;
910	Recolor (foldlo, foldhi);
911	}
912	}
913
914	if (first != end) {
915	uint16_t hint = static_cast<uint16_t>(std::min(first - id, `32767`));
916	ip->hint_foldcase_ \|= hint<<`1`;
917	}
918	}
919	}
920
921	} // namespace re2
922

Browse the source code of RE2/re2/prog.cc