1// Copyright 2007 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Compile regular expression to Prog.
6//
7// Prog and Inst are defined in prog.h.
8// This file's external interface is just Regexp::CompileToProg.
9// The Compiler class defined in this file is private.
10
11#include <stdint.h>
12#include <string.h>
13#include <unordered_map>
14#include <utility>
15
16#include "util/logging.h"
17#include "util/utf.h"
18#include "re2/pod_array.h"
19#include "re2/prog.h"
20#include "re2/re2.h"
21#include "re2/regexp.h"
22#include "re2/walker-inl.h"
23
24namespace re2 {
25
26// List of pointers to Inst* that need to be filled in (patched).
27// Because the Inst* haven't been filled in yet,
28// we can use the Inst* word to hold the list's "next" pointer.
29// It's kind of sleazy, but it works well in practice.
30// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
31//
32// Because the out and out1 fields in Inst are no longer pointers,
33// we can't use pointers directly here either. Instead, p refers
34// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).
35// p == 0 represents the NULL list. This is okay because instruction #0
36// is always the fail instruction, which never appears on a list.
37
struct PatchList {
  uint32_t p;

  // Returns patch list containing just p.
  static PatchList Mk(uint32_t p);

  // Patches all the entries on l to have value v.
  // Caller must not ever use patch list again.
  static void Patch(Prog::Inst *inst0, PatchList l, uint32_t v);

  // Deref returns the next pointer pointed at by p.
  static PatchList Deref(Prog::Inst *inst0, PatchList l);

  // Appends two patch lists and returns result.
  static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);
};

// The empty patch list: p == 0 is the end-of-list sentinel (see above).
static PatchList nullPatchList = { 0 };
56
57// Returns patch list containing just p.
58PatchList PatchList::Mk(uint32_t p) {
59 PatchList l;
60 l.p = p;
61 return l;
62}
63
64// Returns the next pointer pointed at by l.
65PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
66 Prog::Inst* ip = &inst0[l.p>>1];
67 if (l.p&1)
68 l.p = ip->out1();
69 else
70 l.p = ip->out();
71 return l;
72}
73
74// Patches all the entries on l to have value v.
75void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32_t val) {
76 while (l.p != 0) {
77 Prog::Inst* ip = &inst0[l.p>>1];
78 if (l.p&1) {
79 l.p = ip->out1();
80 ip->out1_ = val;
81 } else {
82 l.p = ip->out();
83 ip->set_out(val);
84 }
85 }
86}
87
88// Appends two patch lists and returns result.
// Appends two patch lists and returns result.
PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
  if (l1.p == 0)
    return l2;
  if (l2.p == 0)
    return l1;

  // Walk to the last entry of l1.
  PatchList l = l1;
  for (;;) {
    PatchList next = PatchList::Deref(inst0, l);
    if (next.p == 0)
      break;
    l = next;
  }

  // Splice l2 onto the end of l1 by storing l2's head in the last
  // entry's next-link field.
  Prog::Inst* ip = &inst0[l.p>>1];
  if (l.p&1)
    ip->out1_ = l2.p;
  else
    ip->set_out(l2.p);

  return l1;
}
111
112// Compiled program fragment.
struct Frag {
  uint32_t begin;  // index of the fragment's entry instruction
  PatchList end;   // dangling exits still to be patched

  Frag() : begin(0) { end.p = 0; }  // needed so Frag can go in vector
  Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {}
};
120
121// Input encodings.
enum Encoding {
  kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
  kEncodingLatin1,    // Latin-1 (0-FF); one byte per rune
};
126
class Compiler : public Regexp::Walker<Frag> {
 public:
  explicit Compiler();
  ~Compiler();

  // Compiles Regexp to a new Prog.
  // Caller is responsible for deleting Prog when finished with it.
  // If reversed is true, compiles for walking over the input
  // string backward (reverses all concatenations).
  static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem);

  // Compiles alternation of all the re to a new Prog.
  // Each re has a match with an id equal to its index in the vector.
  static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);

  // Interface for Regexp::Walker, which helps traverse the Regexp.
  // The walk is purely post-recursive: given the machines for the
  // children, PostVisit combines them to create the machine for
  // the current node. The child_args are Frags.
  // The Compiler traverses the Regexp parse tree, visiting
  // each node in depth-first order. It invokes PreVisit before
  // visiting the node's children and PostVisit after visiting
  // the children.
  Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
  Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
                 int nchild_args);
  Frag ShortVisit(Regexp* re, Frag parent_arg);
  Frag Copy(Frag arg);

  // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
  Frag Plus(Frag a, bool nongreedy);
  Frag Star(Frag a, bool nongreedy);
  Frag Quest(Frag a, bool nongreedy);

  // Given fragment a, returns (a) capturing as \n.
  Frag Capture(Frag a, int n);

  // Given fragments a and b, returns ab; a|b
  Frag Cat(Frag a, Frag b);
  Frag Alt(Frag a, Frag b);

  // Returns a fragment that can't match anything.
  Frag NoMatch();

  // Returns a fragment that matches the empty string.
  Frag Match(int32_t id);

  // Returns a no-op fragment.
  Frag Nop();

  // Returns a fragment matching the byte range lo-hi.
  Frag ByteRange(int lo, int hi, bool foldcase);

  // Returns a fragment matching an empty-width special op.
  Frag EmptyWidth(EmptyOp op);

  // Adds n instructions to the program.
  // Returns the index of the first one.
  // Returns -1 if no more instructions are available.
  int AllocInst(int n);

  // Rune range compiler.

  // Begins a new alternation.
  void BeginRange();

  // Adds a fragment matching the rune range lo-hi.
  void AddRuneRange(Rune lo, Rune hi, bool foldcase);
  void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
  void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
  void Add_80_10ffff();

  // New suffix that matches the byte range lo-hi, then goes to next.
  int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
  int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);

  // Returns true iff the suffix is cached.
  bool IsCachedRuneByteSuffix(int id);

  // Adds a suffix to alternation.
  void AddSuffix(int id);

  // Adds a suffix to the trie starting from the given root node.
  // Returns zero iff allocating an instruction fails. Otherwise, returns
  // the current root node, which might be different from what was given.
  int AddSuffixRecursive(int root, int id);

  // Finds the trie node for the given suffix. Returns a Frag in order to
  // distinguish between pointing at the root node directly (end.p == 0)
  // and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively).
  Frag FindByteRange(int root, int id);

  // Compares two ByteRanges and returns true iff they are equal.
  bool ByteRangeEqual(int id1, int id2);

  // Returns the alternation of all the added suffixes.
  Frag EndRange();

  // Single rune.
  Frag Literal(Rune r, bool foldcase);

  void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor);
  Prog* Finish();

  // Returns .* where dot = any byte
  Frag DotStar();

 private:
  Prog* prog_;         // Program being built.
  bool failed_;        // Did we give up compiling?
  Encoding encoding_;  // Input encoding
  bool reversed_;      // Should program run backward over text?

  PODArray<Prog::Inst> inst_;  // Instruction buffer, grown on demand.
  int ninst_;                  // Number of instructions used.
  int max_ninst_;              // Maximum number of instructions.

  int64_t max_mem_;    // Total memory budget.

  // Memoizes common byte-range suffix fragments, keyed by
  // (lo, hi, foldcase, next) packed into a uint64_t.
  std::unordered_map<uint64_t, int> rune_cache_;
  Frag rune_range_;    // Alternation accumulated by BeginRange/AddSuffix.

  RE2::Anchor anchor_;  // anchor mode for RE2::Set

  Compiler(const Compiler&) = delete;
  Compiler& operator=(const Compiler&) = delete;
};
254
Compiler::Compiler() {
  prog_ = new Prog();
  failed_ = false;
  encoding_ = kEncodingUTF8;
  reversed_ = false;
  ninst_ = 0;
  // Temporarily allow exactly one instruction so AllocInst can emit the
  // mandatory fail instruction at index 0 (patch lists rely on index 0
  // never being a real instruction -- see the PatchList comment above).
  max_ninst_ = 1;  // make AllocInst for fail instruction okay
  max_mem_ = 0;
  int fail = AllocInst(1);
  inst_[fail].InitFail();
  max_ninst_ = 0;  // Caller must change
  // NOTE(review): anchor_ is not initialized here; presumably Setup()
  // assigns it before compilation -- confirm against Setup's body.
}

Compiler::~Compiler() {
  // prog_ is owned by the compiler until handed off (hand-off not
  // visible in this excerpt).
  delete prog_;
}
271
int Compiler::AllocInst(int n) {
  // Enforce the instruction budget: once exceeded, compilation has
  // failed and every later allocation fails too.
  if (failed_ || ninst_ + n > max_ninst_) {
    failed_ = true;
    return -1;
  }

  // Grow the buffer geometrically (doubling) when needed.
  if (ninst_ + n > inst_.size()) {
    int cap = inst_.size();
    if (cap == 0)
      cap = 8;
    while (ninst_ + n > cap)
      cap *= 2;
    PODArray<Prog::Inst> inst(cap);
    if (inst_.data() != NULL)
      memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]);
    // Zero the tail so freshly allocated instructions start out empty.
    memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]);
    inst_ = std::move(inst);
  }
  int id = ninst_;
  ninst_ += n;
  return id;
}
294
295// These routines are somewhat hard to visualize in text --
296// see http://swtch.com/~rsc/regexp/regexp1.html for
297// pictures explaining what is going on here.
298
299// Returns an unmatchable fragment.
Frag Compiler::NoMatch() {
  // begin == 0 (the fail instruction) marks an unmatchable fragment.
  return Frag(0, nullPatchList);
}

// Is a an unmatchable fragment?
static bool IsNoMatch(Frag a) {
  return a.begin == 0;
}
308
309// Given fragments a and b, returns fragment for ab.
Frag Compiler::Cat(Frag a, Frag b) {
  if (IsNoMatch(a) || IsNoMatch(b))
    return NoMatch();

  // Elide no-op: if a is a single still-unpatched Nop, route callers
  // straight to b rather than keeping the Nop in the program.
  Prog::Inst* begin = &inst_[a.begin];
  if (begin->opcode() == kInstNop &&
      a.end.p == (a.begin << 1) &&
      begin->out() == 0) {
    // in case refs to a somewhere
    PatchList::Patch(inst_.data(), a.end, b.begin);
    return b;
  }

  // To run backward over string, reverse all concatenations.
  if (reversed_) {
    PatchList::Patch(inst_.data(), b.end, a.begin);
    return Frag(b.begin, a.end);
  }

  // Forward mode: a's dangling exits now lead into b.
  PatchList::Patch(inst_.data(), a.end, b.begin);
  return Frag(a.begin, b.end);
}
333
334// Given fragments for a and b, returns fragment for a|b.
Frag Compiler::Alt(Frag a, Frag b) {
  // Special case for convenience in loops.
  if (IsNoMatch(a))
    return b;
  if (IsNoMatch(b))
    return a;

  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();

  // The new Alt branches to both fragments; the combined fragment's
  // dangling exits are the union of both fragments' exits.
  inst_[id].InitAlt(a.begin, b.begin);
  return Frag(id, PatchList::Append(inst_.data(), a.end, b.end));
}
349
350// When capturing submatches in like-Perl mode, a kOpAlt Inst
351// treats out_ as the first choice, out1_ as the second.
352//
353// For *, +, and ?, if out_ causes another repetition,
354// then the operator is greedy. If out1_ is the repetition
355// (and out_ moves forward), then the operator is non-greedy.
356
357// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
Frag Compiler::Star(Frag a, bool nongreedy) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitAlt(0, 0);
  // Loop a's exits back to the Alt.
  PatchList::Patch(inst_.data(), a.end, id);
  if (nongreedy) {
    // out1 repeats; out (preferred) exits and is left dangling.
    inst_[id].out1_ = a.begin;
    return Frag(id, PatchList::Mk(id << 1));
  } else {
    // out (preferred) repeats; out1 exits and is left dangling.
    inst_[id].set_out(a.begin);
    return Frag(id, PatchList::Mk((id << 1) | 1));
  }
}
372
373// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
374Frag Compiler::Plus(Frag a, bool nongreedy) {
375 // a+ is just a* with a different entry point.
376 Frag f = Star(a, nongreedy);
377 return Frag(a.begin, f.end);
378}
379
380// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
Frag Compiler::Quest(Frag a, bool nongreedy) {
  if (IsNoMatch(a))
    return Nop();
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  PatchList pl;
  if (nongreedy) {
    // Prefer the empty path: out skips a and stays dangling.
    inst_[id].InitAlt(0, a.begin);
    pl = PatchList::Mk(id << 1);
  } else {
    // Prefer entering a: out1 skips a and stays dangling.
    inst_[id].InitAlt(a.begin, 0);
    pl = PatchList::Mk((id << 1) | 1);
  }
  // Exits are the skip edge plus whatever a leaves dangling.
  return Frag(id, PatchList::Append(inst_.data(), pl, a.end));
}
397
398// Returns a fragment for the byte range lo-hi.
Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitByteRange(lo, hi, foldcase, 0);
  // The instruction's out field (encoded as id << 1) is left dangling.
  return Frag(id, PatchList::Mk(id << 1));
}
406
407// Returns a no-op fragment. Sometimes unavoidable.
Frag Compiler::Nop() {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitNop(0);
  return Frag(id, PatchList::Mk(id << 1));
}

// Returns a fragment that signals a match.
Frag Compiler::Match(int32_t match_id) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitMatch(match_id);
  // A Match instruction has no dangling exits, so its patch list is empty.
  return Frag(id, nullPatchList);
}

// Returns a fragment matching a particular empty-width op (like ^ or $)
Frag Compiler::EmptyWidth(EmptyOp empty) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitEmptyWidth(empty, 0);
  return Frag(id, PatchList::Mk(id << 1));
}
433
434// Given a fragment a, returns a fragment with capturing parens around a.
Frag Compiler::Capture(Frag a, int n) {
  if (IsNoMatch(a))
    return NoMatch();
  // Two instructions: slot 2*n records the opening paren position,
  // slot 2*n+1 the closing one.
  int id = AllocInst(2);
  if (id < 0)
    return NoMatch();
  inst_[id].InitCapture(2*n, a.begin);
  inst_[id+1].InitCapture(2*n+1, 0);
  // a's exits lead into the closing capture; its out stays dangling.
  PatchList::Patch(inst_.data(), a.end, id+1);

  return Frag(id, PatchList::Mk((id+1) << 1));
}
447
448// A Rune is a name for a Unicode code point.
449// Returns maximum rune encoded by UTF-8 sequence of length len.
static int MaxRune(int len) {
  // A 1-byte sequence carries 7 payload bits; an n-byte sequence
  // carries (7 - n) bits in the lead byte plus 6 bits per
  // continuation byte (len < UTFmax).
  int bits;
  if (len == 1)
    bits = 7;
  else
    bits = (7 - len) + 6*(len - 1);
  return (1 << bits) - 1;  // maximum Rune for `bits` bits
}
458
459// The rune range compiler caches common suffix fragments,
460// which are very common in UTF-8 (e.g., [80-bf]).
461// The fragment suffixes are identified by their start
462// instructions. NULL denotes the eventual end match.
463// The Frag accumulates in rune_range_. Caching common
464// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
465// and it reduces the corresponding one-pass NFA from 16 nodes to 8.
466
void Compiler::BeginRange() {
  // Reset the suffix cache and the accumulated alternation.
  rune_cache_.clear();
  rune_range_.begin = 0;
  rune_range_.end = nullPatchList;
}
472
int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
                                     int next) {
  Frag f = ByteRange(lo, hi, foldcase);
  if (next != 0) {
    // Wire this byte range into the already-built rest of the suffix.
    PatchList::Patch(inst_.data(), f.end, next);
  } else {
    // Final byte of the suffix: accumulate its dangling exit so that
    // EndRange() hands all exits back for patching.
    rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end);
  }
  return f.begin;
}
483
static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
                                 int next) {
  // Pack the fields into disjoint bit ranges:
  // bit 0 = foldcase, bits 1-8 = hi, bits 9-16 = lo, bits 17+ = next.
  uint64_t key = foldcase ? 1 : 0;
  key |= static_cast<uint64_t>(hi) << 1;
  key |= static_cast<uint64_t>(lo) << 9;
  key |= static_cast<uint64_t>(next) << 17;
  return key;
}
491
492int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
493 int next) {
494 uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
495 std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
496 if (it != rune_cache_.end())
497 return it->second;
498 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
499 rune_cache_[key] = id;
500 return id;
501}
502
bool Compiler::IsCachedRuneByteSuffix(int id) {
  uint8_t lo = inst_[id].lo_;
  uint8_t hi = inst_[id].hi_;
  bool foldcase = inst_[id].foldcase() != 0;
  int next = inst_[id].out();

  // Reconstruct the cache key from the instruction itself and probe
  // for it; a hit means this suffix head must not be mutated.
  uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
  return rune_cache_.find(key) != rune_cache_.end();
}
512
void Compiler::AddSuffix(int id) {
  if (failed_)
    return;

  // First suffix: it becomes the whole alternation so far.
  if (rune_range_.begin == 0) {
    rune_range_.begin = id;
    return;
  }

  if (encoding_ == kEncodingUTF8) {
    // Build a trie in order to reduce fanout.
    rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
    return;
  }

  // Latin-1: chain suffixes together with a plain Alt.
  int alt = AllocInst(1);
  if (alt < 0) {
    rune_range_.begin = 0;
    return;
  }
  inst_[alt].InitAlt(rune_range_.begin, id);
  rune_range_.begin = alt;
}
536
int Compiler::AddSuffixRecursive(int root, int id) {
  DCHECK(inst_[root].opcode() == kInstAlt ||
         inst_[root].opcode() == kInstByteRange);

  // If no existing branch shares this suffix's first byte range,
  // just add a new alternative.
  Frag f = FindByteRange(root, id);
  if (IsNoMatch(f)) {
    int alt = AllocInst(1);
    if (alt < 0)
      return 0;
    inst_[alt].InitAlt(root, id);
    return alt;
  }

  // Locate the matching ByteRange instruction; see FindByteRange for
  // how f encodes the parent link.
  int br;
  if (f.end.p == 0)
    br = root;
  else if (f.end.p&1)
    br = inst_[f.begin].out1();
  else
    br = inst_[f.begin].out();

  if (IsCachedRuneByteSuffix(br)) {
    // We can't fiddle with cached suffixes, so make a clone of the head.
    int byterange = AllocInst(1);
    if (byterange < 0)
      return 0;
    inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
                                   inst_[br].foldcase(), inst_[br].out());

    // Ensure that the parent points to the clone, not to the original.
    // Note that this could leave the head unreachable except via the cache.
    br = byterange;
    if (f.end.p == 0)
      root = br;
    else if (f.end.p&1)
      inst_[f.begin].out1_ = br;
    else
      inst_[f.begin].set_out(br);
  }

  int out = inst_[id].out();
  if (!IsCachedRuneByteSuffix(id)) {
    // The head should be the instruction most recently allocated, so free it
    // instead of leaving it unreachable.
    DCHECK_EQ(id, ninst_-1);
    inst_[id].out_opcode_ = 0;
    inst_[id].out1_ = 0;
    ninst_--;
  }

  // Merge the remainder of the new suffix below the shared byte range.
  out = AddSuffixRecursive(inst_[br].out(), out);
  if (out == 0)
    return 0;

  inst_[br].set_out(out);
  return root;
}
594
595bool Compiler::ByteRangeEqual(int id1, int id2) {
596 return inst_[id1].lo() == inst_[id2].lo() &&
597 inst_[id1].hi() == inst_[id2].hi() &&
598 inst_[id1].foldcase() == inst_[id2].foldcase();
599}
600
Frag Compiler::FindByteRange(int root, int id) {
  // Root is itself a ByteRange: either it matches id directly or
  // nothing in this trie branch does.
  if (inst_[root].opcode() == kInstByteRange) {
    if (ByteRangeEqual(root, id))
      return Frag(root, nullPatchList);
    else
      return NoMatch();
  }

  while (inst_[root].opcode() == kInstAlt) {
    int out1 = inst_[root].out1();
    if (ByteRangeEqual(out1, id))
      return Frag(root, PatchList::Mk((root << 1) | 1));

    // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
    // what we're looking for, then we can stop immediately. Unfortunately, we
    // can't short-circuit the search in reverse mode.
    if (!reversed_)
      return NoMatch();

    int out = inst_[root].out();
    if (inst_[out].opcode() == kInstAlt)
      root = out;
    else if (ByteRangeEqual(out, id))
      return Frag(root, PatchList::Mk(root << 1));
    else
      return NoMatch();
  }

  LOG(DFATAL) << "should never happen";
  return NoMatch();
}
632
Frag Compiler::EndRange() {
  // The alternation accumulated since BeginRange() via AddSuffix().
  return rune_range_;
}
636
637// Converts rune range lo-hi into a fragment that recognizes
638// the bytes that would make up those runes in the current
639// encoding (Latin 1 or UTF-8).
640// This lets the machine work byte-by-byte even when
641// using multibyte encodings.
642
643void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
644 switch (encoding_) {
645 default:
646 case kEncodingUTF8:
647 AddRuneRangeUTF8(lo, hi, foldcase);
648 break;
649 case kEncodingLatin1:
650 AddRuneRangeLatin1(lo, hi, foldcase);
651 break;
652 }
653}
654
655void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
656 // Latin-1 is easy: runes *are* bytes.
657 if (lo > hi || lo > 0xFF)
658 return;
659 if (hi > 0xFF)
660 hi = 0xFF;
661 AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
662 static_cast<uint8_t>(hi), foldcase, 0));
663}
664
665// Table describing how to make a UTF-8 matching machine
666// for the rune range 80-10FFFF (Runeself-Runemax).
667// This range happens frequently enough (for example /./ and /[^a-z]/)
668// and the rune_cache_ map is slow enough that this is worth
669// special handling. Makes compilation of a small expression
670// with a dot in it about 10% faster.
671// The * in the comments below mark whole sequences.
static struct ByteRangeProg {
  int next;  // index of the already-built suffix this range chains to, or -1
  int lo;
  int hi;
} prog_80_10ffff[] = {
  // Two-byte
  { -1, 0x80, 0xBF, },  // 0: 80-BF
  {  0, 0xC2, 0xDF, },  // 1: C2-DF 80-BF*

  // Three-byte
  {  0, 0xA0, 0xBF, },  // 2: A0-BF 80-BF
  {  2, 0xE0, 0xE0, },  // 3: E0 A0-BF 80-BF*
  {  0, 0x80, 0xBF, },  // 4: 80-BF 80-BF
  {  4, 0xE1, 0xEF, },  // 5: E1-EF 80-BF 80-BF*

  // Four-byte
  {  4, 0x90, 0xBF, },  // 6: 90-BF 80-BF 80-BF
  {  6, 0xF0, 0xF0, },  // 7: F0 90-BF 80-BF 80-BF*
  {  4, 0x80, 0xBF, },  // 8: 80-BF 80-BF 80-BF
  {  8, 0xF1, 0xF3, },  // 9: F1-F3 80-BF 80-BF 80-BF*
  {  4, 0x80, 0x8F, },  // 10: 80-8F 80-BF 80-BF
  { 10, 0xF4, 0xF4, },  // 11: F4 80-8F 80-BF 80-BF*
};
695
void Compiler::Add_80_10ffff() {
  int inst[arraysize(prog_80_10ffff)] = { 0 };  // does not need to be initialized; silences gcc warning
  for (size_t i = 0; i < arraysize(prog_80_10ffff); i++) {
    const ByteRangeProg& p = prog_80_10ffff[i];
    int next = 0;
    if (p.next >= 0)
      next = inst[p.next];
    inst[i] = UncachedRuneByteSuffix(static_cast<uint8_t>(p.lo),
                                     static_cast<uint8_t>(p.hi), false, next);
    // A row whose lo is not a continuation byte (80-BF) starts a whole
    // UTF-8 sequence (the * rows in the table), so add it as an
    // alternative of the range being built.
    if ((p.lo & 0xC0) != 0x80)
      AddSuffix(inst[i]);
  }
}
709
void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
  if (lo > hi)
    return;

  // Pick off 80-10FFFF as a common special case
  // that can bypass the slow rune_cache_.
  if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
    Add_80_10ffff();
    return;
  }

  // Split range into same-length sized ranges.
  // After this, lo and hi encode to the same number of UTF-8 bytes.
  for (int i = 1; i < UTFmax; i++) {
    Rune max = MaxRune(i);
    if (lo <= max && max < hi) {
      AddRuneRangeUTF8(lo, max, foldcase);
      AddRuneRangeUTF8(max+1, hi, foldcase);
      return;
    }
  }

  // ASCII range is always a special case.
  if (hi < Runeself) {
    AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
                                     static_cast<uint8_t>(hi), foldcase, 0));
    return;
  }

  // Split range into sections that agree on leading bytes.
  // After this, every byte position is either a fixed byte shared by
  // lo and hi or spans its full possible range.
  for (int i = 1; i < UTFmax; i++) {
    uint32_t m = (1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
    if ((lo & ~m) != (hi & ~m)) {
      if ((lo & m) != 0) {
        AddRuneRangeUTF8(lo, lo|m, foldcase);
        AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
        return;
      }
      if ((hi & m) != m) {
        AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
        AddRuneRangeUTF8(hi&~m, hi, foldcase);
        return;
      }
    }
  }

  // Finally. Generate byte matching equivalent for lo-hi.
  uint8_t ulo[UTFmax], uhi[UTFmax];
  int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
  int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
  (void)m;  // USED(m)
  DCHECK_EQ(n, m);

  // The logic below encodes this thinking:
  //
  // 1. When we have built the whole suffix, we know that it cannot
  // possibly be a suffix of anything longer: in forward mode, nothing
  // else can occur before the leading byte; in reverse mode, nothing
  // else can occur after the last continuation byte or else the leading
  // byte would have to change. Thus, there is no benefit to caching
  // the first byte of the suffix whereas there is a cost involved in
  // cloning it if it begins a common prefix, which is fairly likely.
  //
  // 2. Conversely, the last byte of the suffix cannot possibly be a
  // prefix of anything because next == 0, so we will never want to
  // clone it, but it is fairly likely to be a common suffix. Perhaps
  // more so in reverse mode than in forward mode because the former is
  // "converging" towards lower entropy, but caching is still worthwhile
  // for the latter in cases such as 80-BF.
  //
  // 3. Handling the bytes between the first and the last is less
  // straightforward and, again, the approach depends on whether we are
  // "converging" towards lower entropy: in forward mode, a single byte
  // is unlikely to be part of a common suffix whereas a byte range
  // is more likely so; in reverse mode, a byte range is unlikely to
  // be part of a common suffix whereas a single byte is more likely
  // so. The same benefit versus cost argument applies here.
  int id = 0;
  if (reversed_) {
    for (int i = 0; i < n; i++) {
      // In reverse UTF-8 mode: cache the leading byte; don't cache the last
      // continuation byte; cache anything else iff it's a single byte (XX-XX).
      if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
      else
        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
    }
  } else {
    for (int i = n-1; i >= 0; i--) {
      // In forward UTF-8 mode: don't cache the leading byte; cache the last
      // continuation byte; cache anything else iff it's a byte range (XX-YY).
      if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
      else
        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
    }
  }
  AddSuffix(id);
}
808
809// Should not be called.
Frag Compiler::Copy(Frag arg) {
  // We're using WalkExponential; there should be no copying.
  // arg is intentionally unused: reaching this method is itself the bug.
  LOG(DFATAL) << "Compiler::Copy called!";
  failed_ = true;
  return NoMatch();
}
816
817// Visits a node quickly; called once WalkExponential has
818// decided to cut this walk short.
Frag Compiler::ShortVisit(Regexp* re, Frag) {
  // The walk was cut short, so compilation cannot complete; mark failure.
  failed_ = true;
  return NoMatch();
}

// Called before traversing a node's children during the walk.
Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
  // Cut off walk if we've already failed.
  if (failed_)
    *stop = true;

  return Frag();  // not used by caller
}
832
Frag Compiler::Literal(Rune r, bool foldcase) {
  switch (encoding_) {
    default:
      return Frag();

    case kEncodingLatin1:
      return ByteRange(r, r, foldcase);

    case kEncodingUTF8: {
      if (r < Runeself)  // Make common case fast.
        return ByteRange(r, r, foldcase);
      // Multi-byte rune: concatenate one ByteRange per UTF-8 byte.
      // foldcase is dropped here: ASCII case folding never applies to
      // bytes of a multi-byte sequence.
      uint8_t buf[UTFmax];
      int n = runetochar(reinterpret_cast<char*>(buf), &r);
      Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
      for (int i = 1; i < n; i++)
        f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
      return f;
    }
  }
}
853
854// Called after traversing the node's children during the walk.
855// Given their frags, build and return the frag for this re.
Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
                         int nchild_frags) {
  // If a child failed, don't bother going forward, especially
  // since the child_frags might contain Frags with NULLs in them.
  if (failed_)
    return NoMatch();

  // Given the child fragments, return the fragment for this node.
  switch (re->op()) {
    case kRegexpRepeat:
      // Should not see; code at bottom of function will print error
      break;

    case kRegexpNoMatch:
      return NoMatch();

    case kRegexpEmptyMatch:
      return Nop();

    case kRegexpHaveMatch: {
      Frag f = Match(re->match_id());
      if (anchor_ == RE2::ANCHOR_BOTH) {
        // Append \z or else the subexpression will effectively be unanchored.
        // Complemented by the UNANCHORED case in CompileSet().
        f = Cat(EmptyWidth(kEmptyEndText), f);
      }
      return f;
    }

    case kRegexpConcat: {
      // Fold the children left to right; Cat() itself takes care of
      // reversed mode.
      Frag f = child_frags[0];
      for (int i = 1; i < nchild_frags; i++)
        f = Cat(f, child_frags[i]);
      return f;
    }

    case kRegexpAlternate: {
      Frag f = child_frags[0];
      for (int i = 1; i < nchild_frags; i++)
        f = Alt(f, child_frags[i]);
      return f;
    }

    case kRegexpStar:
      return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

    case kRegexpPlus:
      return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

    case kRegexpQuest:
      return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

    case kRegexpLiteral:
      return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);

    case kRegexpLiteralString: {
      // Concatenation of literals.
      if (re->nrunes() == 0)
        return Nop();
      Frag f;
      for (int i = 0; i < re->nrunes(); i++) {
        Frag f1 = Literal(re->runes()[i],
                          (re->parse_flags()&Regexp::FoldCase) != 0);
        if (i == 0)
          f = f1;
        else
          f = Cat(f, f1);
      }
      return f;
    }

    case kRegexpAnyChar:
      BeginRange();
      AddRuneRange(0, Runemax, false);
      return EndRange();

    case kRegexpAnyByte:
      return ByteRange(0x00, 0xFF, false);

    case kRegexpCharClass: {
      CharClass* cc = re->cc();
      if (cc->empty()) {
        // This can't happen.
        LOG(DFATAL) << "No ranges in char class";
        failed_ = true;
        return NoMatch();
      }

      // ASCII case-folding optimization: if the char class
      // behaves the same on A-Z as it does on a-z,
      // discard any ranges wholly contained in A-Z
      // and mark the other ranges as foldascii.
      // This reduces the size of a program for
      // (?i)abc from 3 insts per letter to 1 per letter.
      bool foldascii = cc->FoldsASCII();

      // Character class is just a big OR of the different
      // character ranges in the class.
      BeginRange();
      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
        // ASCII case-folding optimization (see above).
        if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
          continue;

        // If this range contains all of A-Za-z or none of it,
        // the fold flag is unnecessary; don't bother.
        bool fold = foldascii;
        if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
            ('Z' < i->lo && i->hi < 'a'))
          fold = false;

        AddRuneRange(i->lo, i->hi, fold);
      }
      return EndRange();
    }

    case kRegexpCapture:
      // If this is a non-capturing parenthesis -- (?:foo) --
      // just use the inner expression.
      if (re->cap() < 0)
        return child_frags[0];
      return Capture(child_frags[0], re->cap());

    // In reversed mode the program walks the text backward, so
    // begin-of-text/line and end-of-text/line assertions swap.
    case kRegexpBeginLine:
      return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);

    case kRegexpEndLine:
      return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);

    case kRegexpBeginText:
      return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);

    case kRegexpEndText:
      return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);

    case kRegexpWordBoundary:
      return EmptyWidth(kEmptyWordBoundary);

    case kRegexpNoWordBoundary:
      return EmptyWidth(kEmptyNonWordBoundary);
  }
  LOG(DFATAL) << "Missing case in Compiler: " << re->op();
  failed_ = true;
  return NoMatch();
}
1001
1002// Is this regexp required to start at the beginning of the text?
1003// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
1004// but handles (\A(a|b)). Could use the Walker to write a more exact one.
static bool IsAnchorStart(Regexp** pre, int depth) {
  Regexp* re = *pre;
  Regexp* sub;
  // The depth limit makes sure that we don't overflow
  // the stack on a deeply nested regexp. As the comment
  // above says, IsAnchorStart is conservative, so returning
  // a false negative is okay. The exact limit is somewhat arbitrary.
  if (re == NULL || depth >= 4)
    return false;
  switch (re->op()) {
    default:
      break;
    case kRegexpConcat:
      // Only the first subexpression can carry the start anchor.
      if (re->nsub() > 0) {
        sub = re->sub()[0]->Incref();
        if (IsAnchorStart(&sub, depth+1)) {
          // Rebuild the concat around the anchor-stripped first child.
          PODArray<Regexp*> subcopy(re->nsub());
          subcopy[0] = sub;  // already have reference
          for (int i = 1; i < re->nsub(); i++)
            subcopy[i] = re->sub()[i]->Incref();
          *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
          re->Decref();
          return true;
        }
        sub->Decref();
      }
      break;
    case kRegexpCapture:
      sub = re->sub()[0]->Incref();
      if (IsAnchorStart(&sub, depth+1)) {
        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
        re->Decref();
        return true;
      }
      sub->Decref();
      break;
    case kRegexpBeginText:
      // \A itself: strip it by replacing it with an empty literal string.
      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
      re->Decref();
      return true;
  }
  return false;
}
1048
// Is this regexp required to start at the end of the text?
// Only approximate; can return false for complicated regexps like (a\z|b\z),
// but handles ((a|b)\z). Could use the Walker to write a more exact one.
//
// Mirror image of IsAnchorStart: on success, *pre is rewritten in place to
// an equivalent regexp with the trailing \z removed. The old *pre loses the
// caller's reference; the new *pre carries one reference for the caller.
static bool IsAnchorEnd(Regexp** pre, int depth) {
  Regexp* re = *pre;
  Regexp* sub;
  // The depth limit makes sure that we don't overflow
  // the stack on a deeply nested regexp. As the comment
  // above says, IsAnchorEnd is conservative, so returning
  // a false negative is okay. The exact limit is somewhat arbitrary.
  if (re == NULL || depth >= 4)
    return false;
  switch (re->op()) {
  default:
    break;
  case kRegexpConcat:
    // A concatenation is end-anchored iff its last element is.
    if (re->nsub() > 0) {
      sub = re->sub()[re->nsub() - 1]->Incref();
      if (IsAnchorEnd(&sub, depth+1)) {
        // Rebuild the concatenation with the rewritten (de-anchored)
        // last element; the remaining elements are shared via Incref.
        PODArray<Regexp*> subcopy(re->nsub());
        subcopy[re->nsub() - 1] = sub;  // already have reference
        for (int i = 0; i < re->nsub() - 1; i++)
          subcopy[i] = re->sub()[i]->Incref();
        *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
        re->Decref();
        return true;
      }
      // Recursion failed: drop the extra reference taken above.
      sub->Decref();
    }
    break;
  case kRegexpCapture:
    // A capture group is end-anchored iff its body is; keep the
    // capture wrapper (same group number) around the rewritten body.
    sub = re->sub()[0]->Incref();
    if (IsAnchorEnd(&sub, depth+1)) {
      *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
      re->Decref();
      return true;
    }
    sub->Decref();
    break;
  case kRegexpEndText:
    // Found \z itself: replace it with an empty literal string.
    *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
    re->Decref();
    return true;
  }
  return false;
}
1095
1096void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
1097 RE2::Anchor anchor) {
1098 prog_->set_flags(flags);
1099
1100 if (flags & Regexp::Latin1)
1101 encoding_ = kEncodingLatin1;
1102 max_mem_ = max_mem;
1103 if (max_mem <= 0) {
1104 max_ninst_ = 100000; // more than enough
1105 } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
1106 // No room for anything.
1107 max_ninst_ = 0;
1108 } else {
1109 int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
1110 // Limit instruction count so that inst->id() fits nicely in an int.
1111 // SparseArray also assumes that the indices (inst->id()) are ints.
1112 // The call to WalkExponential uses 2*max_ninst_ below,
1113 // and other places in the code use 2 or 3 * prog->size().
1114 // Limiting to 2^24 should avoid overflow in those places.
1115 // (The point of allowing more than 32 bits of memory is to
1116 // have plenty of room for the DFA states, not to use it up
1117 // on the program.)
1118 if (m >= 1<<24)
1119 m = 1<<24;
1120
1121 // Inst imposes its own limit (currently bigger than 2^24 but be safe).
1122 if (m > Prog::Inst::kMaxInst)
1123 m = Prog::Inst::kMaxInst;
1124
1125 max_ninst_ = static_cast<int>(m);
1126 }
1127
1128 anchor_ = anchor;
1129}
1130
// Compiles re, returning program.
// Caller is responsible for deleting prog_.
// If reversed is true, compiles a program that expects
// to run over the input string backward (reverses all concatenations).
// The reversed flag is also recorded in the returned program.
// Returns NULL if simplification or compilation fails.
Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
  Compiler c;
  c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
  c.reversed_ = reversed;

  // Simplify to remove things like counted repetitions
  // and character classes like \d.
  Regexp* sre = re->Simplify();
  if (sre == NULL)
    return NULL;

  // Record whether prog is anchored, removing the anchors.
  // (They get in the way of other optimizations.)
  bool is_anchor_start = IsAnchorStart(&sre, 0);
  bool is_anchor_end = IsAnchorEnd(&sre, 0);

  // Generate fragment for entire regexp.
  Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
  sre->Decref();
  if (c.failed_)
    return NULL;

  // Success!  Finish by putting Match node at end, and record start.
  // Turn off c.reversed_ (if it is set) to force the remaining concatenations
  // to behave normally.
  c.reversed_ = false;
  all = c.Cat(all, c.Match(0));

  c.prog_->set_reversed(reversed);
  if (c.prog_->reversed()) {
    // A reversed program scans the text backward, so the roles of the
    // start and end anchors swap.
    c.prog_->set_anchor_start(is_anchor_end);
    c.prog_->set_anchor_end(is_anchor_start);
  } else {
    c.prog_->set_anchor_start(is_anchor_start);
    c.prog_->set_anchor_end(is_anchor_end);
  }

  c.prog_->set_start(all.begin);
  if (!c.prog_->anchor_start()) {
    // Also create unanchored version, which starts with a .*? loop.
    all = c.Cat(c.DotStar(), all);
  }
  c.prog_->set_start_unanchored(all.begin);

  // Hand ownership of prog_ to caller.
  return c.Finish();
}
1183
// Finalizes the compiled program: hands the instruction array to the Prog,
// runs the Prog's post-compilation passes, computes the memory left over
// for the DFA, and transfers ownership of prog_ to the caller.
// Returns NULL if compilation failed.
Prog* Compiler::Finish() {
  if (failed_)
    return NULL;

  if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
    // No possible matches; keep Fail instruction only.
    ninst_ = 1;
  }

  // Hand off the array to Prog.
  prog_->inst_ = std::move(inst_);
  prog_->size_ = ninst_;

  prog_->Optimize();
  prog_->Flatten();
  prog_->ComputeByteMap();

  // Record remaining memory for DFA.
  if (max_mem_ <= 0) {
    // No explicit budget: give the DFA a fixed 1 MB.
    prog_->set_dfa_mem(1<<20);
  } else {
    // Whatever the budget has left after the Prog and its arrays is
    // available for DFA states (clamped at zero).
    int64_t m = max_mem_ - sizeof(Prog);
    m -= prog_->size_*sizeof(Prog::Inst);  // account for inst_
    if (prog_->CanBitState())
      m -= prog_->size_*sizeof(uint16_t);  // account for list_heads_
    if (m < 0)
      m = 0;
    prog_->set_dfa_mem(m);
  }

  // Transfer ownership: clear prog_ so this Compiler no longer refers to it.
  Prog* p = prog_;
  prog_ = NULL;
  return p;
}
1218
// Converts Regexp to Prog for forward matching.
// Thin wrapper over Compiler::Compile with reversed == false.
Prog* Regexp::CompileToProg(int64_t max_mem) {
  return Compiler::Compile(this, false, max_mem);
}
1223
// Converts Regexp to a Prog that runs over the input backward.
// Thin wrapper over Compiler::Compile with reversed == true.
Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
  return Compiler::Compile(this, true, max_mem);
}
1227
1228Frag Compiler::DotStar() {
1229 return Star(ByteRange(0x00, 0xff, false), true);
1230}
1231
// Compiles RE set to Prog.
// Returns NULL if simplification or compilation fails, or if the DFA
// cannot run within the memory budget (sets have no NFA fallback).
Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
  Compiler c;
  c.Setup(re->parse_flags(), max_mem, anchor);

  // Simplify to remove things like counted repetitions
  // and character classes like \d (same first step as Compile above).
  Regexp* sre = re->Simplify();
  if (sre == NULL)
    return NULL;

  // Generate fragment for the entire set expression.
  Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
  sre->Decref();
  if (c.failed_)
    return NULL;

  // Set programs are marked anchored at both ends.
  c.prog_->set_anchor_start(true);
  c.prog_->set_anchor_end(true);

  if (anchor == RE2::UNANCHORED) {
    // Prepend .* or else the expression will effectively be anchored.
    // Complemented by the ANCHOR_BOTH case in PostVisit().
    all = c.Cat(c.DotStar(), all);
  }
  // Both entry points are the same fragment for a set program.
  c.prog_->set_start(all.begin);
  c.prog_->set_start_unanchored(all.begin);

  Prog* prog = c.Finish();
  if (prog == NULL)
    return NULL;

  // Make sure DFA has enough memory to operate,
  // since we're not going to fall back to the NFA.
  // The probe input is arbitrary; the search exists only to force the
  // DFA to build and report failure up front if it cannot.
  bool dfa_failed = false;
  StringPiece sp = "hello, world";
  prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
                  NULL, &dfa_failed, NULL);
  if (dfa_failed) {
    delete prog;
    return NULL;
  }

  return prog;
}
1274
// Public entry point for set compilation; forwards to Compiler::CompileSet.
Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
  return Compiler::CompileSet(re, anchor, max_mem);
}
1278
1279} // namespace re2
1280