regexp.cc source code [RE2/re2/regexp.cc]

1	// Copyright 2006 The RE2 Authors. All Rights Reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Regular expression representation.
6	// Tested by parse_test.cc
7
8	#include "re2/regexp.h"
9
10	#include <stddef.h>
11	#include <stdint.h>
12	#include <string.h>
13	#include <algorithm>
14	#include <map>
15	#include <mutex>
16	#include <string>
17	#include <vector>
18
19	#include "util/util.h"
20	#include "util/logging.h"
21	#include "util/mutex.h"
22	#include "util/utf.h"
23	#include "re2/stringpiece.h"
24	#include "re2/walker-inl.h"
25
26	namespace re2 {
27
28	// Constructor. Allocates vectors as appropriate for operator.
29	Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
30	: op_(static_cast<uint8_t>(op)),
31	simple_(false),
32	parse_flags_(static_cast<uint16_t>(parse_flags)),
33	ref_(`1`),
34	nsub_(`0`),
35	down_(NULL) {
36	subone_ = NULL;
37	memset(the_union_, `0`, sizeof the_union_);
38	}
39
40	// Destructor. Assumes already cleaned up children.
41	// Private: use Decref() instead of delete to destroy Regexps.
42	// Can't call Decref on the sub-Regexps here because
43	// that could cause arbitrarily deep recursion, so
44	// required Decref() to have handled them for us.
45	Regexp::~Regexp() {
46	if (nsub_ > `0`)
47	LOG(DFATAL) << "Regexp not destroyed.";
48
49	switch (op_) {
50	default:
51	break;
52	case kRegexpCapture:
53	delete name_;
54	break;
55	case kRegexpLiteralString:
56	delete[] runes_;
57	break;
58	case kRegexpCharClass:
59	if (cc_)
60	cc_->Delete();
61	delete ccb_;
62	break;
63	}
64	}
65
66	// If it's possible to destroy this regexp without recurring,
67	// do so and return true. Else return false.
68	bool Regexp::QuickDestroy() {
69	if (nsub_ == `0`) {
70	delete this;
71	return true;
72	}
73	return false;
74	}
75
76	// Lazily allocated.
77	static Mutex* ref_mutex;
78	static std::map<Regexp, int> ref_map;
79
80	int Regexp::Ref() {
81	if (ref_ < kMaxRef)
82	return ref_;
83
84	MutexLock l(ref_mutex);
85	return (ref_map)[this*];
86	}
87
88	// Increments reference count, returns object as convenience.
89	Regexp* Regexp::Incref() {
90	if (ref_ >= kMaxRef-`1`) {
91	static std::once_flag ref_once;
92	std::call_once(ref_once, []() {
93	ref_mutex = new Mutex;
94	ref_map = new std::map<Regexp, int*>;
95	});
96
97	// Store ref count in overflow map.
98	MutexLock l(ref_mutex);
99	if (ref_ == kMaxRef) {
100	// already overflowed
101	(ref_map)[this*]++;
102	} else {
103	// overflowing now
104	(ref_map)[this*] = kMaxRef;
105	ref_ = kMaxRef;
106	}
107	return this;
108	}
109
110	ref_++;
111	return this;
112	}
113
114	// Decrements reference count and deletes this object if count reaches 0.
115	void Regexp::Decref() {
116	if (ref_ == kMaxRef) {
117	// Ref count is stored in overflow map.
118	MutexLock l(ref_mutex);
119	int r = (ref_map)[this*] - `1`;
120	if (r < kMaxRef) {
121	ref_ = static_cast<uint16_t>(r);
122	ref_map->erase(this);
123	} else {
124	(ref_map)[this*] = r;
125	}
126	return;
127	}
128	ref_--;
129	if (ref_ == `0`)
130	Destroy();
131	}
132
133	// Deletes this object; ref count has count reached 0.
134	void Regexp::Destroy() {
135	if (QuickDestroy())
136	return;
137
138	// Handle recursive Destroy with explicit stack
139	// to avoid arbitrarily deep recursion on process stack [sigh].
140	down_ = NULL;
141	Regexp* stack = this;
142	while (stack != NULL) {
143	Regexp* re = stack;
144	stack = re->down_;
145	if (re->ref_ != `0`)
146	LOG(DFATAL) << "Bad reference count " << re->ref_;
147	if (re->nsub_ > `0`) {
148	Regexp** subs = re->sub();
149	for (int i = `0`; i < re->nsub_; i++) {
150	Regexp* sub = subs[i];
151	if (sub == NULL)
152	continue;
153	if (sub->ref_ == kMaxRef)
154	sub->Decref();
155	else
156	--sub->ref_;
157	if (sub->ref_ == `0` && !sub->QuickDestroy()) {
158	sub->down_ = stack;
159	stack = sub;
160	}
161	}
162	if (re->nsub_ > `1`)
163	delete[] subs;
164	re->nsub_ = `0`;
165	}
166	delete re;
167	}
168	}
169
170	void Regexp::AddRuneToString(Rune r) {
171	DCHECK(op_ == kRegexpLiteralString);
172	if (nrunes_ == `0`) {
173	// start with 8
174	runes_ = new Rune[`8`];
175	} else if (nrunes_ >= `8` && (nrunes_ & (nrunes_ - `1`)) == `0`) {
176	// double on powers of two
177	Rune *old = runes_;
178	runes_ = new Rune[nrunes_ * `2`];
179	for (int i = `0`; i < nrunes_; i++)
180	runes_[i] = old[i];
181	delete[] old;
182	}
183
184	runes_[nrunes_++] = r;
185	}
186
187	Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
188	Regexp* re = new Regexp (kRegexpHaveMatch, flags);
189	re->match_id_ = match_id;
190	return re;
191	}
192
193	Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
194	// Squash , ++ and ??.
195	if (op == sub->op() && flags == sub->parse_flags())
196	return sub;
197
198	// Squash +, ?, +, +?, ?* and ?+. They all squash to , so because
199	// op is Star/Plus/Quest, we just have to check that sub->op() is too.
200	if ((sub->op() == kRegexpStar \|\|
201	sub->op() == kRegexpPlus \|\|
202	sub->op() == kRegexpQuest) &&
203	flags == sub->parse_flags()) {
204	// If sub is Star, no need to rewrite it.
205	if (sub->op() == kRegexpStar)
206	return sub;
207
208	// Rewrite sub to Star.
209	Regexp* re = new Regexp (kRegexpStar, flags);
210	re->AllocSub(`1`);
211	re->sub()[`0`] = sub->sub()[`0`]->Incref();
212	sub->Decref(); // We didn't consume the reference after all.
213	return re;
214	}
215
216	Regexp* re = new Regexp (op, flags);
217	re->AllocSub(`1`);
218	re->sub()[`0`] = sub;
219	return re;
220	}
221
222	Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
223	return StarPlusOrQuest(kRegexpPlus, sub, flags);
224	}
225
226	Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
227	return StarPlusOrQuest(kRegexpStar, sub, flags);
228	}
229
230	Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
231	return StarPlusOrQuest(kRegexpQuest, sub, flags);
232	}
233
234	Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
235	ParseFlags flags, bool can_factor) {
236	if (nsub == `1`)
237	return sub[`0`];
238
239	if (nsub == `0`) {
240	if (op == kRegexpAlternate)
241	return new Regexp (kRegexpNoMatch, flags);
242	else
243	return new Regexp (kRegexpEmptyMatch, flags);
244	}
245
246	Regexp** subcopy = NULL;
247	if (op == kRegexpAlternate && can_factor) {
248	// Going to edit sub; make a copy so we don't step on caller.
249	subcopy = new Regexp*[nsub];
250	memmove(subcopy, sub, nsub * sizeof sub[`0`]);
251	sub = subcopy;
252	nsub = FactorAlternation(sub, nsub, flags);
253	if (nsub == `1`) {
254	Regexp* re = sub[`0`];
255	delete[] subcopy;
256	return re;
257	}
258	}
259
260	if (nsub > kMaxNsub) {
261	// Too many subexpressions to fit in a single Regexp.
262	// Make a two-level tree. Two levels gets us to 65535^2.
263	int nbigsub = (nsub+kMaxNsub-`1`)/kMaxNsub;
264	Regexp* re = new Regexp (op, flags);
265	re->AllocSub(nbigsub);
266	Regexp** subs = re->sub();
267	for (int i = `0`; i < nbigsub - `1`; i++)
268	subs[i] = ConcatOrAlternate(op, sub+ikMaxNsub, kMaxNsub, flags, false*);
269	subs[nbigsub - `1`] = ConcatOrAlternate(op, sub+(nbigsub-`1`)*kMaxNsub,
270	nsub - (nbigsub-`1`)*kMaxNsub, flags,
271	false);
272	delete[] subcopy;
273	return re;
274	}
275
276	Regexp* re = new Regexp (op, flags);
277	re->AllocSub(nsub);
278	Regexp** subs = re->sub();
279	for (int i = `0`; i < nsub; i++)
280	subs[i] = sub[i];
281
282	delete[] subcopy;
283	return re;
284	}
285
286	Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
287	return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
288	}
289
290	Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
291	return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
292	}
293
294	Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
295	return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
296	}
297
298	Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
299	Regexp* re = new Regexp (kRegexpCapture, flags);
300	re->AllocSub(`1`);
301	re->sub()[`0`] = sub;
302	re->cap_ = cap;
303	return re;
304	}
305
306	Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
307	Regexp* re = new Regexp (kRegexpRepeat, flags);
308	re->AllocSub(`1`);
309	re->sub()[`0`] = sub;
310	re->min_ = min;
311	re->max_ = max;
312	return re;
313	}
314
315	Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
316	Regexp* re = new Regexp (kRegexpLiteral, flags);
317	re->rune_ = rune;
318	return re;
319	}
320
321	Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
322	if (nrunes <= `0`)
323	return new Regexp (kRegexpEmptyMatch, flags);
324	if (nrunes == `1`)
325	return NewLiteral(runes[`0`], flags);
326	Regexp* re = new Regexp (kRegexpLiteralString, flags);
327	for (int i = `0`; i < nrunes; i++)
328	re->AddRuneToString(runes[i]);
329	return re;
330	}
331
332	Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
333	Regexp* re = new Regexp (kRegexpCharClass, flags);
334	re->cc_ = cc;
335	return re;
336	}
337
338	void Regexp::Swap(Regexp* that) {
339	// Regexp is not trivially copyable, so we cannot freely copy it with
340	// memmove(3), but swapping objects like so is safe for our purposes.
341	char tmp[sizeof *this];
342	void* vthis = reinterpret_cast<void>(this*);
343	void* vthat = reinterpret_cast<void*>(that);
344	memmove(tmp, vthis, sizeof *this);
345	memmove(vthis, vthat, sizeof *this);
346	memmove(vthat, tmp, sizeof *this);
347	}
348
349	// Tests equality of all top-level structure but not subregexps.
350	static bool TopEqual(Regexp* a, Regexp* b) {
351	if (a->op() != b->op())
352	return false;
353
354	switch (a->op()) {
355	case kRegexpNoMatch:
356	case kRegexpEmptyMatch:
357	case kRegexpAnyChar:
358	case kRegexpAnyByte:
359	case kRegexpBeginLine:
360	case kRegexpEndLine:
361	case kRegexpWordBoundary:
362	case kRegexpNoWordBoundary:
363	case kRegexpBeginText:
364	return true;
365
366	case kRegexpEndText:
367	// The parse flags remember whether it's \z or (?-m:$),
368	// which matters when testing against PCRE.
369	return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == `0`;
370
371	case kRegexpLiteral:
372	return a->rune() == b->rune() &&
373	((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == `0`;
374
375	case kRegexpLiteralString:
376	return a->nrunes() == b->nrunes() &&
377	((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == `0` &&
378	memcmp(a->runes(), b->runes(),
379	a->nrunes() * sizeof a->runes()[`0`]) == `0`;
380
381	case kRegexpAlternate:
382	case kRegexpConcat:
383	return a->nsub() == b->nsub();
384
385	case kRegexpStar:
386	case kRegexpPlus:
387	case kRegexpQuest:
388	return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == `0`;
389
390	case kRegexpRepeat:
391	return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == `0` &&
392	a->min() == b->min() &&
393	a->max() == b->max();
394
395	case kRegexpCapture:
396	return a->cap() == b->cap() && a->name() == b->name();
397
398	case kRegexpHaveMatch:
399	return a->match_id() == b->match_id();
400
401	case kRegexpCharClass: {
402	CharClass* acc = a->cc();
403	CharClass* bcc = b->cc();
404	return acc->size() == bcc->size() &&
405	acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
406	memcmp(acc->begin(), bcc->begin(),
407	(acc->end() - acc->begin()) * sizeof acc->begin()[`0`]) == `0`;
408	}
409	}
410
411	LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
412	return `0`;
413	}
414
415	bool Regexp::Equal(Regexp* a, Regexp* b) {
416	if (a == NULL \|\| b == NULL)
417	return a == b;
418
419	if (!TopEqual(a, b))
420	return false;
421
422	// Fast path:
423	// return without allocating vector if there are no subregexps.
424	switch (a->op()) {
425	case kRegexpAlternate:
426	case kRegexpConcat:
427	case kRegexpStar:
428	case kRegexpPlus:
429	case kRegexpQuest:
430	case kRegexpRepeat:
431	case kRegexpCapture:
432	break;
433
434	default:
435	return true;
436	}
437
438	// Committed to doing real work.
439	// The stack (vector) has pairs of regexps waiting to
440	// be compared. The regexps are only equal if
441	// all the pairs end up being equal.
442	std::vector<Regexp*> stk;
443
444	for (;;) {
445	// Invariant: TopEqual(a, b) == true.
446	Regexp* a2;
447	Regexp* b2;
448	switch (a->op()) {
449	default:
450	break;
451	case kRegexpAlternate:
452	case kRegexpConcat:
453	for (int i = `0`; i < a->nsub(); i++) {
454	a2 = a->sub()[i];
455	b2 = b->sub()[i];
456	if (!TopEqual(a2, b2))
457	return false;
458	stk.push_back(a2);
459	stk.push_back(b2);
460	}
461	break;
462
463	case kRegexpStar:
464	case kRegexpPlus:
465	case kRegexpQuest:
466	case kRegexpRepeat:
467	case kRegexpCapture:
468	a2 = a->sub()[`0`];
469	b2 = b->sub()[`0`];
470	if (!TopEqual(a2, b2))
471	return false;
472	// Really:
473	// stk.push_back(a2);
474	// stk.push_back(b2);
475	// break;
476	// but faster to assign directly and loop.
477	a = a2;
478	b = b2;
479	continue;
480	}
481
482	size_t n = stk.size();
483	if (n == `0`)
484	break;
485
486	DCHECK_GE(n, `2`);
487	a = stk [n-`2`];
488	b = stk [n-`1`];
489	stk.resize(n-`2`);
490	}
491
492	return true;
493	}
494
495	// Keep in sync with enum RegexpStatusCode in regexp.h
496	static const char *kErrorStrings[] = {
497	"no error",
498	"unexpected error",
499	"invalid escape sequence",
500	"invalid character class",
501	"invalid character class range",
502	"missing ]",
503	"missing )",
504	"trailing \\",
505	"no argument for repetition operator",
506	"invalid repetition size",
507	"bad repetition operator",
508	"invalid perl operator",
509	"invalid UTF-8",
510	"invalid named capture group",
511	};
512
513	std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
514	if (code < `0` \|\| code >= arraysize(kErrorStrings))
515	code = kRegexpInternalError;
516	return kErrorStrings[code];
517	}
518
519	std::string RegexpStatus::Text() const {
520	if (error_arg_.empty())
521	return CodeText(code_);
522	std::string s;
523	s.append(CodeText(code_));
524	s.append(": ");
525	s.append(error_arg_.data(), error_arg_.size());
526	return s;
527	}
528
529	void RegexpStatus::Copy(const RegexpStatus& status) {
530	code_ = status.code_;
531	error_arg_ = status.error_arg_;
532	}
533
534	typedef int Ignored; // Walker<void> doesn't exist
535
536	// Walker subclass to count capturing parens in regexp.
537	class NumCapturesWalker : public Regexp::Walker<Ignored> {
538	public:
539	NumCapturesWalker() : ncapture_(`0`) {}
540	int ncapture() { return ncapture_; }
541
542	virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
543	if (re->op() == kRegexpCapture)
544	ncapture_++;
545	return ignored;
546	}
547	virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
548	// Should never be called: we use Walk not WalkExponential.
549	LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
550	return ignored;
551	}
552
553	private:
554	int ncapture_;
555
556	NumCapturesWalker(const NumCapturesWalker&) = delete;
557	NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
558	};
559
560	int Regexp::NumCaptures() {
561	NumCapturesWalker w;
562	w.Walk(this, `0`);
563	return w.ncapture();
564	}
565
566	// Walker class to build map of named capture groups and their indices.
567	class NamedCapturesWalker : public Regexp::Walker<Ignored> {
568	public:
569	NamedCapturesWalker() : map_(NULL) {}
570	~NamedCapturesWalker() { delete map_; }
571
572	std::map<std::string, int>* TakeMap() {
573	std::map<std::string, int>* m = map_;
574	map_ = NULL;
575	return m;
576	}
577
578	Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
579	if (re->op() == kRegexpCapture && re->name() != NULL) {
580	// Allocate map once we find a name.
581	if (map_ == NULL)
582	map_ = new std::map<std::string, int>;
583
584	// Record first occurrence of each name.
585	// (The rule is that if you have the same name
586	// multiple times, only the leftmost one counts.)
587	if (map_->find(*re->name()) == map_->end())
588	(map_)[re->name()] = re->cap();
589	}
590	return ignored;
591	}
592
593	virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
594	// Should never be called: we use Walk not WalkExponential.
595	LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
596	return ignored;
597	}
598
599	private:
600	std::map<std::string, int>* map_;
601
602	NamedCapturesWalker(const NamedCapturesWalker&) = delete;
603	NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
604	};
605
606	std::map<std::string, int>* Regexp::NamedCaptures() {
607	NamedCapturesWalker w;
608	w.Walk(this, `0`);
609	return w.TakeMap();
610	}
611
612	// Walker class to build map from capture group indices to their names.
613	class CaptureNamesWalker : public Regexp::Walker<Ignored> {
614	public:
615	CaptureNamesWalker() : map_(NULL) {}
616	~CaptureNamesWalker() { delete map_; }
617
618	std::map<int, std::string>* TakeMap() {
619	std::map<int, std::string>* m = map_;
620	map_ = NULL;
621	return m;
622	}
623
624	Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
625	if (re->op() == kRegexpCapture && re->name() != NULL) {
626	// Allocate map once we find a name.
627	if (map_ == NULL)
628	map_ = new std::map<int, std::string>;
629
630	(map_)[re->cap()] = re->name();
631	}
632	return ignored;
633	}
634
635	virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
636	// Should never be called: we use Walk not WalkExponential.
637	LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
638	return ignored;
639	}
640
641	private:
642	std::map<int, std::string>* map_;
643
644	CaptureNamesWalker(const CaptureNamesWalker&) = delete;
645	CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
646	};
647
648	std::map<int, std::string>* Regexp::CaptureNames() {
649	CaptureNamesWalker w;
650	w.Walk(this, `0`);
651	return w.TakeMap();
652	}
653
654	// Determines whether regexp matches must be anchored
655	// with a fixed string prefix. If so, returns the prefix and
656	// the regexp that remains after the prefix. The prefix might
657	// be ASCII case-insensitive.
658	bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
659	Regexp** suffix) {
660	// No need for a walker: the regexp must be of the form
661	// 1. some number of ^ anchors
662	// 2. a literal char or string
663	// 3. the rest
664	prefix->clear();
665	foldcase = false*;
666	*suffix = NULL;
667	if (op_ != kRegexpConcat)
668	return false;
669
670	// Some number of anchors, then a literal or concatenation.
671	int i = `0`;
672	Regexp sub = this**->sub();
673	while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
674	i++;
675	if (i == `0` \|\| i >= nsub_)
676	return false;
677
678	Regexp* re = sub[i];
679	switch (re->op_) {
680	default:
681	return false;
682
683	case kRegexpLiteralString:
684	// Convert to string in proper encoding.
685	if (re->parse_flags() & Latin1) {
686	prefix->resize(re->nrunes_);
687	for (int j = `0`; j < re->nrunes_; j++)
688	(prefix)[j] = static_cast<char*>(re->runes_[j]);
689	} else {
690	// Convert to UTF-8 in place.
691	// Assume worst-case space and then trim.
692	prefix->resize(re->nrunes_ * UTFmax);
693	char p = &(prefix)[`0`];
694	for (int j = `0`; j < re->nrunes_; j++) {
695	Rune r = re->runes_[j];
696	if (r < Runeself)
697	p++ = static_cast<char*>(r);
698	else
699	p += runetochar(p, &r);
700	}
701	prefix->resize(p - &(*prefix)[`0`]);
702	}
703	break;
704
705	case kRegexpLiteral:
706	if ((re->parse_flags() & Latin1) \|\| re->rune_ < Runeself) {
707	prefix->append(`1`, static_cast<char>(re->rune_));
708	} else {
709	char buf[UTFmax];
710	prefix->append(buf, runetochar(buf, &re->rune_));
711	}
712	break;
713	}
714	*foldcase = (sub[i]->parse_flags() & FoldCase) != `0`;
715	i++;
716
717	// The rest.
718	if (i < nsub_) {
719	for (int j = i; j < nsub_; j++)
720	sub[j]->Incref();
721	re = Concat(sub + i, nsub_ - i, parse_flags());
722	} else {
723	re = new Regexp (kRegexpEmptyMatch, parse_flags());
724	}
725	*suffix = re;
726	return true;
727	}
728
729	// Character class builder is a balanced binary tree (STL set)
730	// containing non-overlapping, non-abutting RuneRanges.
731	// The less-than operator used in the tree treats two
732	// ranges as equal if they overlap at all, so that
733	// lookups for a particular Rune are possible.
734
735	CharClassBuilder::CharClassBuilder() {
736	nrunes_ = `0`;
737	upper_ = `0`;
738	lower_ = `0`;
739	}
740
741	// Add lo-hi to the class; return whether class got bigger.
742	bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
743	if (hi < lo)
744	return false;
745
746	if (lo <= `'z'` && hi >= `'A'`) {
747	// Overlaps some alpha, maybe not all.
748	// Update bitmaps telling which ASCII letters are in the set.
749	Rune lo1 = std::max<Rune>(lo, `'A'`);
750	Rune hi1 = std::min<Rune>(hi, `'Z'`);
751	if (lo1 <= hi1)
752	upper_ \|= ((`1` << (hi1 - lo1 + `1`)) - `1`) << (lo1 - `'A'`);
753
754	lo1 = std::max<Rune>(lo, `'a'`);
755	hi1 = std::min<Rune>(hi, `'z'`);
756	if (lo1 <= hi1)
757	lower_ \|= ((`1` << (hi1 - lo1 + `1`)) - `1`) << (lo1 - `'a'`);
758	}
759
760	{ // Check whether lo, hi is already in the class.
761	iterator it = ranges_.find(RuneRange (lo, lo));
762	if (it != end() && it ->lo <= lo && hi <= it ->hi)
763	return false;
764	}
765
766	// Look for a range abutting lo on the left.
767	// If it exists, take it out and increase our range.
768	if (lo > `0`) {
769	iterator it = ranges_.find(RuneRange (lo-`1`, lo-`1`));
770	if (it != end()) {
771	lo = it ->lo;
772	if (it ->hi > hi)
773	hi = it ->hi;
774	nrunes_ -= it ->hi - it ->lo + `1`;
775	ranges_.erase(it);
776	}
777	}
778
779	// Look for a range abutting hi on the right.
780	// If it exists, take it out and increase our range.
781	if (hi < Runemax) {
782	iterator it = ranges_.find(RuneRange (hi+`1`, hi+`1`));
783	if (it != end()) {
784	hi = it ->hi;
785	nrunes_ -= it ->hi - it ->lo + `1`;
786	ranges_.erase(it);
787	}
788	}
789
790	// Look for ranges between lo and hi. Take them out.
791	// This is only safe because the set has no overlapping ranges.
792	// We've already removed any ranges abutting lo and hi, so
793	// any that overlap [lo, hi] must be contained within it.
794	for (;;) {
795	iterator it = ranges_.find(RuneRange (lo, hi));
796	if (it == end())
797	break;
798	nrunes_ -= it ->hi - it ->lo + `1`;
799	ranges_.erase(it);
800	}
801
802	// Finally, add [lo, hi].
803	nrunes_ += hi - lo + `1`;
804	ranges_.insert(RuneRange (lo, hi));
805	return true;
806	}
807
808	void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
809	for (iterator it = cc->begin(); it != cc->end(); ++it)
810	AddRange(it ->lo, it ->hi);
811	}
812
813	bool CharClassBuilder::Contains(Rune r) {
814	return ranges_.find(RuneRange (r, r)) != end();
815	}
816
817	// Does the character class behave the same on A-Z as on a-z?
818	bool CharClassBuilder::FoldsASCII() {
819	return ((upper_ ^ lower_) & AlphaMask) == `0`;
820	}
821
822	CharClassBuilder* CharClassBuilder::Copy() {
823	CharClassBuilder* cc = new CharClassBuilder;
824	for (iterator it = begin(); it != end(); ++it)
825	cc->ranges_.insert(RuneRange (it ->lo, it ->hi));
826	cc->upper_ = upper_;
827	cc->lower_ = lower_;
828	cc->nrunes_ = nrunes_;
829	return cc;
830	}
831
832
833
834	void CharClassBuilder::RemoveAbove(Rune r) {
835	if (r >= Runemax)
836	return;
837
838	if (r < `'z'`) {
839	if (r < `'a'`)
840	lower_ = `0`;
841	else
842	lower_ &= AlphaMask >> (`'z'` - r);
843	}
844
845	if (r < `'Z'`) {
846	if (r < `'A'`)
847	upper_ = `0`;
848	else
849	upper_ &= AlphaMask >> (`'Z'` - r);
850	}
851
852	for (;;) {
853
854	iterator it = ranges_.find(RuneRange (r + `1`, Runemax));
855	if (it == end())
856	break;
857	RuneRange rr = *it;
858	ranges_.erase(it);
859	nrunes_ -= rr.hi - rr.lo + `1`;
860	if (rr.lo <= r) {
861	rr.hi = r;
862	ranges_.insert(rr);
863	nrunes_ += rr.hi - rr.lo + `1`;
864	}
865	}
866	}
867
868	void CharClassBuilder::Negate() {
869	// Build up negation and then copy in.
870	// Could edit ranges in place, but C++ won't let me.
871	std::vector<RuneRange> v;
872	v.reserve(ranges_.size() + `1`);
873
874	// In negation, first range begins at 0, unless
875	// the current class begins at 0.
876	iterator it = begin();
877	if (it == end()) {
878	v.push_back(RuneRange (`0`, Runemax));
879	} else {
880	int nextlo = `0`;
881	if (it ->lo == `0`) {
882	nextlo = it ->hi + `1`;
883	++it;
884	}
885	for (; it != end(); ++it) {
886	v.push_back(RuneRange (nextlo, it ->lo - `1`));
887	nextlo = it ->hi + `1`;
888	}
889	if (nextlo <= Runemax)
890	v.push_back(RuneRange (nextlo, Runemax));
891	}
892
893	ranges_.clear();
894	for (size_t i = `0`; i < v.size(); i++)
895	ranges_.insert(v [i]);
896
897	upper_ = AlphaMask & ~upper_;
898	lower_ = AlphaMask & ~lower_;
899	nrunes_ = Runemax+`1` - nrunes_;
900	}
901
902	// Character class is a sorted list of ranges.
903	// The ranges are allocated in the same block as the header,
904	// necessitating a special allocator and Delete method.
905
906	CharClass* CharClass::New(int maxranges) {
907	CharClass* cc;
908	uint8_t* data = new uint8_t[sizeof cc + maxrangessizeof cc->ranges_[`0`]];
909	cc = reinterpret_cast<CharClass*>(data);
910	cc->ranges_ = reinterpret_cast<RuneRange>(data + sizeof* *cc);
911	cc->nranges_ = `0`;
912	cc->folds_ascii_ = false;
913	cc->nrunes_ = `0`;
914	return cc;
915	}
916
917	void CharClass::Delete() {
918	uint8_t* data = reinterpret_cast<uint8_t>(this*);
919	delete[] data;
920	}
921
922	CharClass* CharClass::Negate() {
923	CharClass* cc = CharClass::New(nranges_+`1`);
924	cc->folds_ascii_ = folds_ascii_;
925	cc->nrunes_ = Runemax + `1` - nrunes_;
926	int n = `0`;
927	int nextlo = `0`;
928	for (CharClass::iterator it = begin(); it != end(); ++it) {
929	if (it->lo == nextlo) {
930	nextlo = it->hi + `1`;
931	} else {
932	cc->ranges_[n++] = RuneRange (nextlo, it->lo - `1`);
933	nextlo = it->hi + `1`;
934	}
935	}
936	if (nextlo <= Runemax)
937	cc->ranges_[n++] = RuneRange (nextlo, Runemax);
938	cc->nranges_ = n;
939	return cc;
940	}
941
942	bool CharClass::Contains(Rune r) {
943	RuneRange* rr = ranges_;
944	int n = nranges_;
945	while (n > `0`) {
946	int m = n/`2`;
947	if (rr[m].hi < r) {
948	rr += m+`1`;
949	n -= m+`1`;
950	} else if (r < rr[m].lo) {
951	n = m;
952	} else { // rr[m].lo <= r && r <= rr[m].hi
953	return true;
954	}
955	}
956	return false;
957	}
958
959	CharClass* CharClassBuilder::GetCharClass() {
960	CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));
961	int n = `0`;
962	for (iterator it = begin(); it != end(); ++it)
963	cc->ranges_[n++] = *it;
964	cc->nranges_ = n;
965	DCHECK_LE(n, static_cast<int>(ranges_.size()));
966	cc->nrunes_ = nrunes_;
967	cc->folds_ascii_ = FoldsASCII();
968	return cc;
969	}
970
971	} // namespace re2
972

Browse the source code of RE2/re2/regexp.cc