regexp.cc source code [engine/third_party/dart/runtime/vm/regexp.cc]

1	// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2	// for details. All rights reserved. Use of this source code is governed by a
3	// BSD-style license that can be found in the LICENSE file.
4
5	#include "vm/regexp.h"
6
7	#include <memory>
8
9	#include "platform/splay-tree-inl.h"
10	#include "platform/unicode.h"
11
12	#include "unicode/uniset.h"
13
14	#include "vm/dart_entry.h"
15	#include "vm/regexp_assembler.h"
16	#include "vm/regexp_assembler_bytecode.h"
17	#include "vm/regexp_ast.h"
18	#include "vm/symbols.h"
19	#include "vm/thread.h"
20	#include "vm/unibrow-inl.h"
21
22	#if !defined(DART_PRECOMPILED_RUNTIME)
23	#include "vm/regexp_assembler_ir.h"
24	#endif // !defined(DART_PRECOMPILED_RUNTIME)
25
26	#define Z (zone())
27
28	namespace dart {
29
30	// Default to generating optimized regexp code.
31	static const bool kRegexpOptimization = true;
32
33	// More makes code generation slower, less makes V8 benchmark score lower.
34	static const intptr_t kMaxLookaheadForBoyerMoore = `8`;
35
36	ContainedInLattice AddRange(ContainedInLattice containment,
37	const int32_t* ranges,
38	intptr_t ranges_length,
39	Interval new_range) {
40	ASSERT((ranges_length & `1`) == `1`);
41	ASSERT(ranges[ranges_length - `1`] == Utf::kMaxCodePoint + `1`);
42	if (containment == kLatticeUnknown) return containment;
43	bool inside = false;
44	int32_t last = `0`;
45	for (intptr_t i = `0`; i < ranges_length;
46	inside = !inside, last = ranges[i], i++) {
47	// Consider the range from last to ranges[i].
48	// We haven't got to the new range yet.
49	if (ranges[i] <= new_range.from()) continue;
50	// New range is wholly inside last-ranges[i]. Note that new_range.to() is
51	// inclusive, but the values in ranges are not.
52	if (last <= new_range.from() && new_range.to() < ranges[i]) {
53	return Combine(containment, inside ? kLatticeIn : kLatticeOut);
54	}
55	return kLatticeUnknown;
56	}
57	return containment;
58	}
59
60	// -------------------------------------------------------------------
61	// Implementation of the Irregexp regular expression engine.
62	//
63	// The Irregexp regular expression engine is intended to be a complete
64	// implementation of ECMAScript regular expressions. It generates
65	// IR code that is subsequently compiled to native code.
66
67	// The Irregexp regexp engine is structured in three steps.
68	// 1) The parser generates an abstract syntax tree. See regexp_ast.cc.
69	// 2) From the AST a node network is created. The nodes are all
70	// subclasses of RegExpNode. The nodes represent states when
71	// executing a regular expression. Several optimizations are
72	// performed on the node network.
73	// 3) From the nodes we generate IR instructions that can actually
74	// execute the regular expression (perform the search). The
75	// code generation step is described in more detail below.
76
77	// Code generation.
78	//
79	// The nodes are divided into four main categories.
80	// Choice nodes*
81	// These represent places where the regular expression can
82	// match in more than one way. For example on entry to an
83	// alternation (foo\|bar) or a repetition (, +, ? or {}).*
84	// Action nodes*
85	// These represent places where some action should be
86	// performed. Examples include recording the current position
87	// in the input string to a register (in order to implement
88	// captures) or other actions on register for example in order
89	// to implement the counters needed for {} repetitions.
90	// Matching nodes*
91	// These attempt to match some element part of the input string.
92	// Examples of elements include character classes, plain strings
93	// or back references.
94	// End nodes*
95	// These are used to implement the actions required on finding
96	// a successful match or failing to find a match.
97	//
98	// The code generated maintains some state as it runs. This consists of the
99	// following elements:
100	//
101	// The capture registers. Used for string captures.*
102	// Other registers. Used for counters etc.*
103	// The current position.*
104	// The stack of backtracking information. Used when a matching node*
105	// fails to find a match and needs to try an alternative.
106	//
107	// Conceptual regular expression execution model:
108	//
109	// There is a simple conceptual model of regular expression execution
110	// which will be presented first. The actual code generated is a more
111	// efficient simulation of the simple conceptual model:
112	//
113	// Choice nodes are implemented as follows:*
114	// For each choice except the last {
115	// push current position
116	// push backtrack code location
117	// <generate code to test for choice>
118	// backtrack code location:
119	// pop current position
120	// }
121	// <generate code to test for last choice>
122	//
123	// Actions nodes are generated as follows*
124	// <push affected registers on backtrack stack>
125	// <generate code to perform action>
126	// push backtrack code location
127	// <generate code to test for following nodes>
128	// backtrack code location:
129	// <pop affected registers to restore their state>
130	// <pop backtrack location from stack and go to it>
131	//
132	// Matching nodes are generated as follows:*
133	// if input string matches at current position
134	// update current position
135	// <generate code to test for following nodes>
136	// else
137	// <pop backtrack location from stack and go to it>
138	//
139	// Thus it can be seen that the current position is saved and restored
140	// by the choice nodes, whereas the registers are saved and restored by
141	// by the action nodes that manipulate them.
142	//
143	// The other interesting aspect of this model is that nodes are generated
144	// at the point where they are needed by a recursive call to Emit(). If
145	// the node has already been code generated then the Emit() call will
146	// generate a jump to the previously generated code instead. In order to
147	// limit recursion it is possible for the Emit() function to put the node
148	// on a work list for later generation and instead generate a jump. The
149	// destination of the jump is resolved later when the code is generated.
150	//
151	// Actual regular expression code generation.
152	//
153	// Code generation is actually more complicated than the above. In order
154	// to improve the efficiency of the generated code some optimizations are
155	// performed
156	//
157	// Choice nodes have 1-character lookahead.*
158	// A choice node looks at the following character and eliminates some of
159	// the choices immediately based on that character. This is not yet
160	// implemented.
161	// Simple greedy loops store reduced backtracking information.*
162	// A quantifier like /.foo/m will greedily match the whole input. It will*
163	// then need to backtrack to a point where it can match "foo". The naive
164	// implementation of this would push each character position onto the
165	// backtracking stack, then pop them off one by one. This would use space
166	// proportional to the length of the input string. However since the "."
167	// can only match in one way and always has a constant length (in this case
168	// of 1) it suffices to store the current position on the top of the stack
169	// once. Matching now becomes merely incrementing the current position and
170	// backtracking becomes decrementing the current position and checking the
171	// result against the stored current position. This is faster and saves
172	// space.
173	// The current state is virtualized.*
174	// This is used to defer expensive operations until it is clear that they
175	// are needed and to generate code for a node more than once, allowing
176	// specialized an efficient versions of the code to be created. This is
177	// explained in the section below.
178	//
179	// Execution state virtualization.
180	//
181	// Instead of emitting code, nodes that manipulate the state can record their
182	// manipulation in an object called the Trace. The Trace object can record a
183	// current position offset, an optional backtrack code location on the top of
184	// the virtualized backtrack stack and some register changes. When a node is
185	// to be emitted it can flush the Trace or update it. Flushing the Trace
186	// will emit code to bring the actual state into line with the virtual state.
187	// Avoiding flushing the state can postpone some work (e.g. updates of capture
188	// registers). Postponing work can save time when executing the regular
189	// expression since it may be found that the work never has to be done as a
190	// failure to match can occur. In addition it is much faster to jump to a
191	// known backtrack code location than it is to pop an unknown backtrack
192	// location from the stack and jump there.
193	//
194	// The virtual state found in the Trace affects code generation. For example
195	// the virtual state contains the difference between the actual current
196	// position and the virtual current position, and matching code needs to use
197	// this offset to attempt a match in the correct location of the input
198	// string. Therefore code generated for a non-trivial trace is specialized
199	// to that trace. The code generator therefore has the ability to generate
200	// code for each node several times. In order to limit the size of the
201	// generated code there is an arbitrary limit on how many specialized sets of
202	// code may be generated for a given node. If the limit is reached, the
203	// trace is flushed and a generic version of the code for a node is emitted.
204	// This is subsequently used for that node. The code emitted for non-generic
205	// trace is not recorded in the node and so it cannot currently be reused in
206	// the event that code generation is requested for an identical trace.
207
208	void RegExpTree::AppendToText(RegExpText* text) {
209	UNREACHABLE();
210	}
211
212	void RegExpAtom::AppendToText(RegExpText* text) {
213	text->AddElement(TextElement::Atom(this));
214	}
215
216	void RegExpCharacterClass::AppendToText(RegExpText* text) {
217	text->AddElement(TextElement::CharClass(this));
218	}
219
220	void RegExpText::AppendToText(RegExpText* text) {
221	for (intptr_t i = `0`; i < elements()->length(); i++)
222	text->AddElement((*elements())[i]);
223	}
224
225	TextElement TextElement::Atom(RegExpAtom* atom) {
226	return TextElement (ATOM, atom);
227	}
228
229	TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
230	return TextElement (CHAR_CLASS, char_class);
231	}
232
233	intptr_t TextElement::length() const {
234	switch (text_type()) {
235	case ATOM:
236	return atom()->length();
237
238	case CHAR_CLASS:
239	return `1`;
240	}
241	UNREACHABLE();
242	return `0`;
243	}
244
245	class FrequencyCollator : public ValueObject {
246	public:
247	FrequencyCollator() : total_samples_(`0`) {
248	for (intptr_t i = `0`; i < RegExpMacroAssembler::kTableSize; i++) {
249	frequencies_[i] = CharacterFrequency (i);
250	}
251	}
252
253	void CountCharacter(intptr_t character) {
254	intptr_t index = (character & RegExpMacroAssembler::kTableMask);
255	frequencies_[index].Increment();
256	total_samples_++;
257	}
258
259	// Does not measure in percent, but rather per-128 (the table size from the
260	// regexp macro assembler).
261	intptr_t Frequency(intptr_t in_character) {
262	ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
263	if (total_samples_ < `1`) return `1`; // Division by zero.
264	intptr_t freq_in_per128 =
265	(frequencies_[in_character].counter() * `128`) / total_samples_;
266	return freq_in_per128;
267	}
268
269	private:
270	class CharacterFrequency {
271	public:
272	CharacterFrequency() : counter_(`0`), character_(-`1`) {}
273	explicit CharacterFrequency(intptr_t character)
274	: counter_(`0`), character_(character) {}
275
276	void Increment() { counter_++; }
277	intptr_t counter() { return counter_; }
278	intptr_t character() { return character_; }
279
280	private:
281	intptr_t counter_;
282	intptr_t character_;
283
284	DISALLOW_ALLOCATION();
285	};
286
287	private:
288	CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
289	intptr_t total_samples_;
290	};
291
292	class RegExpCompiler : public ValueObject {
293	public:
294	RegExpCompiler(intptr_t capture_count, bool is_one_byte);
295
296	intptr_t AllocateRegister() { return next_register_++; }
297
298	// Lookarounds to match lone surrogates for unicode character class matches
299	// are never nested. We can therefore reuse registers.
300	intptr_t UnicodeLookaroundStackRegister() {
301	if (unicode_lookaround_stack_register_ == kNoRegister) {
302	unicode_lookaround_stack_register_ = AllocateRegister();
303	}
304	return unicode_lookaround_stack_register_;
305	}
306
307	intptr_t UnicodeLookaroundPositionRegister() {
308	if (unicode_lookaround_position_register_ == kNoRegister) {
309	unicode_lookaround_position_register_ = AllocateRegister();
310	}
311	return unicode_lookaround_position_register_;
312	}
313
314	#if !defined(DART_PRECOMPILED_RUNTIME)
315	RegExpEngine::CompilationResult Assemble(IRRegExpMacroAssembler* assembler,
316	RegExpNode* start,
317	intptr_t capture_count,
318	const String& pattern);
319	#endif
320
321	RegExpEngine::CompilationResult Assemble(
322	BytecodeRegExpMacroAssembler* assembler,
323	RegExpNode* start,
324	intptr_t capture_count,
325	const String& pattern);
326
327	inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
328
329	static const intptr_t kImplementationOffset = `0`;
330	static const intptr_t kNumberOfRegistersOffset = `0`;
331	static const intptr_t kCodeOffset = `1`;
332
333	RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
334	EndNode* accept() { return accept_; }
335
336	static const intptr_t kMaxRecursion = `100`;
337	inline intptr_t recursion_depth() { return recursion_depth_; }
338	inline void IncrementRecursionDepth() { recursion_depth_++; }
339	inline void DecrementRecursionDepth() { recursion_depth_--; }
340
341	void SetRegExpTooBig() { reg_exp_too_big_ = true; }
342
343	inline bool one_byte() const { return is_one_byte_; }
344	bool read_backward() { return read_backward_; }
345	void set_read_backward(bool value) { read_backward_ = value; }
346	FrequencyCollator* frequency_collator() { return &frequency_collator_; }
347
348	intptr_t current_expansion_factor() { return current_expansion_factor_; }
349	void set_current_expansion_factor(intptr_t value) {
350	current_expansion_factor_ = value;
351	}
352
353	Zone* zone() const { return zone_; }
354
355	static const intptr_t kNoRegister = -`1`;
356
357	private:
358	EndNode* accept_;
359	intptr_t next_register_;
360	intptr_t unicode_lookaround_stack_register_;
361	intptr_t unicode_lookaround_position_register_;
362	ZoneGrowableArray<RegExpNode> work_list_;
363	intptr_t recursion_depth_;
364	RegExpMacroAssembler* macro_assembler_;
365	bool is_one_byte_;
366	bool reg_exp_too_big_;
367	bool read_backward_;
368	intptr_t current_expansion_factor_;
369	FrequencyCollator frequency_collator_;
370	Zone* zone_;
371	};
372
373	class RecursionCheck : public ValueObject {
374	public:
375	explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
376	compiler->IncrementRecursionDepth();
377	}
378	~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
379
380	private:
381	RegExpCompiler* compiler_;
382	};
383
384	static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
385	return RegExpEngine::CompilationResult ("RegExp too big");
386	}
387
388	// Attempts to compile the regexp using an Irregexp code generator. Returns
389	// a fixed array or a null handle depending on whether it succeeded.
390	RegExpCompiler::RegExpCompiler(intptr_t capture_count, bool is_one_byte)
391	: next_register_(`2` * (capture_count + `1`)),
392	unicode_lookaround_stack_register_(kNoRegister),
393	unicode_lookaround_position_register_(kNoRegister),
394	work_list_(NULL),
395	recursion_depth_(`0`),
396	is_one_byte_(is_one_byte),
397	reg_exp_too_big_(false),
398	read_backward_(false),
399	current_expansion_factor_(`1`),
400	zone_(Thread::Current()->zone()) {
401	accept_ = new (Z) EndNode (EndNode::ACCEPT, Z);
402	}
403
404	#if !defined(DART_PRECOMPILED_RUNTIME)
405	RegExpEngine::CompilationResult RegExpCompiler::Assemble(
406	IRRegExpMacroAssembler* macro_assembler,
407	RegExpNode* start,
408	intptr_t capture_count,
409	const String& pattern) {
410	macro_assembler->set_slow_safe(false / use_slow_safe_regexp_compiler /);
411	macro_assembler_ = macro_assembler;
412
413	ZoneGrowableArray<RegExpNode*> work_list(`0`);
414	work_list_ = &work_list;
415	BlockLabel fail;
416	macro_assembler_->PushBacktrack(&fail);
417	Trace new_trace;
418	start->Emit(this, &new_trace);
419	macro_assembler_->BindBlock(&fail);
420	macro_assembler_->Fail();
421	while (!work_list.is_empty()) {
422	work_list.RemoveLast()->Emit(this, &new_trace);
423	}
424	if (reg_exp_too_big_) return IrregexpRegExpTooBig();
425
426	macro_assembler->GenerateBacktrackBlock();
427	macro_assembler->FinalizeRegistersArray();
428
429	return RegExpEngine::CompilationResult (
430	macro_assembler->backtrack_goto(), macro_assembler->graph_entry(),
431	macro_assembler->num_blocks(), macro_assembler->num_stack_locals(),
432	next_register_);
433	}
434	#endif
435
436	RegExpEngine::CompilationResult RegExpCompiler::Assemble(
437	BytecodeRegExpMacroAssembler* macro_assembler,
438	RegExpNode* start,
439	intptr_t capture_count,
440	const String& pattern) {
441	macro_assembler->set_slow_safe(false / use_slow_safe_regexp_compiler /);
442	macro_assembler_ = macro_assembler;
443
444	ZoneGrowableArray<RegExpNode*> work_list(`0`);
445	work_list_ = &work_list;
446	BlockLabel fail;
447	macro_assembler_->PushBacktrack(&fail);
448	Trace new_trace;
449	start->Emit(this, &new_trace);
450	macro_assembler_->BindBlock(&fail);
451	macro_assembler_->Fail();
452	while (!work_list.is_empty()) {
453	work_list.RemoveLast()->Emit(this, &new_trace);
454	}
455	if (reg_exp_too_big_) return IrregexpRegExpTooBig();
456
457	TypedData& bytecode = TypedData::ZoneHandle(macro_assembler->GetBytecode());
458	return RegExpEngine::CompilationResult (&bytecode, next_register_);
459	}
460
461	bool Trace::DeferredAction::Mentions(intptr_t that) {
462	if (action_type() == ActionNode::CLEAR_CAPTURES) {
463	Interval range = static_cast<DeferredClearCaptures>(this*)->range();
464	return range.Contains(that);
465	} else {
466	return reg() == that;
467	}
468	}
469
470	bool Trace::mentions_reg(intptr_t reg) {
471	for (DeferredAction* action = actions_; action != NULL;
472	action = action->next()) {
473	if (action->Mentions(reg)) return true;
474	}
475	return false;
476	}
477
478	bool Trace::GetStoredPosition(intptr_t reg, intptr_t* cp_offset) {
479	ASSERT(*cp_offset == `0`);
480	for (DeferredAction* action = actions_; action != NULL;
481	action = action->next()) {
482	if (action->Mentions(reg)) {
483	if (action->action_type() == ActionNode::STORE_POSITION) {
484	cp_offset = static_cast<DeferredCapture>(action)->cp_offset();
485	return true;
486	} else {
487	return false;
488	}
489	}
490	}
491	return false;
492	}
493
494	// This is called as we come into a loop choice node and some other tricky
495	// nodes. It normalizes the state of the code generator to ensure we can
496	// generate generic code.
497	intptr_t Trace::FindAffectedRegisters(OutSet* affected_registers, Zone* zone) {
498	intptr_t max_register = RegExpCompiler::kNoRegister;
499	for (DeferredAction* action = actions_; action != NULL;
500	action = action->next()) {
501	if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
502	Interval range = static_cast<DeferredClearCaptures*>(action)->range();
503	for (intptr_t i = range.from(); i <= range.to(); i++)
504	affected_registers->Set(i, zone);
505	if (range.to() > max_register) max_register = range.to();
506	} else {
507	affected_registers->Set(action->reg(), zone);
508	if (action->reg() > max_register) max_register = action->reg();
509	}
510	}
511	return max_register;
512	}
513
514	void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
515	intptr_t max_register,
516	const OutSet& registers_to_pop,
517	const OutSet& registers_to_clear) {
518	for (intptr_t reg = max_register; reg >= `0`; reg--) {
519	if (registers_to_pop.Get(reg)) {
520	assembler->PopRegister(reg);
521	} else if (registers_to_clear.Get(reg)) {
522	intptr_t clear_to = reg;
523	while (reg > `0` && registers_to_clear.Get(reg - `1`)) {
524	reg--;
525	}
526	assembler->ClearRegisters(reg, clear_to);
527	}
528	}
529	}
530
531	void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
532	intptr_t max_register,
533	const OutSet& affected_registers,
534	OutSet* registers_to_pop,
535	OutSet* registers_to_clear,
536	Zone* zone) {
537	for (intptr_t reg = `0`; reg <= max_register; reg++) {
538	if (!affected_registers.Get(reg)) {
539	continue;
540	}
541
542	// The chronologically first deferred action in the trace
543	// is used to infer the action needed to restore a register
544	// to its previous state (or not, if it's safe to ignore it).
545	enum DeferredActionUndoType { ACTION_IGNORE, ACTION_RESTORE, ACTION_CLEAR };
546	DeferredActionUndoType undo_action = ACTION_IGNORE;
547
548	intptr_t value = `0`;
549	bool absolute = false;
550	bool clear = false;
551	static const intptr_t kNoStore = kMinInt32;
552	intptr_t store_position = kNoStore;
553	// This is a little tricky because we are scanning the actions in reverse
554	// historical order (newest first).
555	for (DeferredAction* action = actions_; action != NULL;
556	action = action->next()) {
557	if (action->Mentions(reg)) {
558	switch (action->action_type()) {
559	case ActionNode::SET_REGISTER: {
560	Trace::DeferredSetRegister* psr =
561	static_cast<Trace::DeferredSetRegister*>(action);
562	if (!absolute) {
563	value += psr->value();
564	absolute = true;
565	}
566	// SET_REGISTER is currently only used for newly introduced loop
567	// counters. They can have a significant previous value if they
568	// occour in a loop. TODO(lrn): Propagate this information, so we
569	// can set undo_action to ACTION_IGNORE if we know there is no
570	// value to restore.
571	undo_action = ACTION_RESTORE;
572	ASSERT(store_position == kNoStore);
573	ASSERT(!clear);
574	break;
575	}
576	case ActionNode::INCREMENT_REGISTER:
577	if (!absolute) {
578	value++;
579	}
580	ASSERT(store_position == kNoStore);
581	ASSERT(!clear);
582	undo_action = ACTION_RESTORE;
583	break;
584	case ActionNode::STORE_POSITION: {
585	Trace::DeferredCapture* pc =
586	static_cast<Trace::DeferredCapture*>(action);
587	if (!clear && store_position == kNoStore) {
588	store_position = pc->cp_offset();
589	}
590
591	// For captures we know that stores and clears alternate.
592	// Other register, are never cleared, and if the occur
593	// inside a loop, they might be assigned more than once.
594	if (reg <= `1`) {
595	// Registers zero and one, aka "capture zero", is
596	// always set correctly if we succeed. There is no
597	// need to undo a setting on backtrack, because we
598	// will set it again or fail.
599	undo_action = ACTION_IGNORE;
600	} else {
601	undo_action = pc->is_capture() ? ACTION_CLEAR : ACTION_RESTORE;
602	}
603	ASSERT(!absolute);
604	ASSERT(value == `0`);
605	break;
606	}
607	case ActionNode::CLEAR_CAPTURES: {
608	// Since we're scanning in reverse order, if we've already
609	// set the position we have to ignore historically earlier
610	// clearing operations.
611	if (store_position == kNoStore) {
612	clear = true;
613	}
614	undo_action = ACTION_RESTORE;
615	ASSERT(!absolute);
616	ASSERT(value == `0`);
617	break;
618	}
619	default:
620	UNREACHABLE();
621	break;
622	}
623	}
624	}
625	// Prepare for the undo-action (e.g., push if it's going to be popped).
626	if (undo_action == ACTION_RESTORE) {
627	assembler->PushRegister(reg);
628	registers_to_pop->Set(reg, zone);
629	} else if (undo_action == ACTION_CLEAR) {
630	registers_to_clear->Set(reg, zone);
631	}
632	// Perform the chronologically last action (or accumulated increment)
633	// for the register.
634	if (store_position != kNoStore) {
635	assembler->WriteCurrentPositionToRegister(reg, store_position);
636	} else if (clear) {
637	assembler->ClearRegisters(reg, reg);
638	} else if (absolute) {
639	assembler->SetRegister(reg, value);
640	} else if (value != `0`) {
641	assembler->AdvanceRegister(reg, value);
642	}
643	}
644	}
645
646	// This is called as we come into a loop choice node and some other tricky
647	// nodes. It normalizes the state of the code generator to ensure we can
648	// generate generic code.
649	void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
650	RegExpMacroAssembler* assembler = compiler->macro_assembler();
651
652	ASSERT(!is_trivial());
653
654	if (actions_ == NULL && backtrack() == NULL) {
655	// Here we just have some deferred cp advances to fix and we are back to
656	// a normal situation. We may also have to forget some information gained
657	// through a quick check that was already performed.
658	if (cp_offset_ != `0`) assembler->AdvanceCurrentPosition(cp_offset_);
659	// Create a new trivial state and generate the node with that.
660	Trace new_state;
661	successor->Emit(compiler, &new_state);
662	return;
663	}
664
665	// Generate deferred actions here along with code to undo them again.
666	OutSet affected_registers;
667
668	if (backtrack() != NULL) {
669	// Here we have a concrete backtrack location. These are set up by choice
670	// nodes and so they indicate that we have a deferred save of the current
671	// position which we may need to emit here.
672	assembler->PushCurrentPosition();
673	}
674	Zone* zone = successor->zone();
675	intptr_t max_register = FindAffectedRegisters(&affected_registers, zone);
676	OutSet registers_to_pop;
677	OutSet registers_to_clear;
678	PerformDeferredActions(assembler, max_register, affected_registers,
679	&registers_to_pop, &registers_to_clear, zone);
680	if (cp_offset_ != `0`) {
681	assembler->AdvanceCurrentPosition(cp_offset_);
682	}
683
684	// Create a new trivial state and generate the node with that.
685	BlockLabel undo;
686	assembler->PushBacktrack(&undo);
687	Trace new_state;
688	successor->Emit(compiler, &new_state);
689
690	// On backtrack we need to restore state.
691	assembler->BindBlock(&undo);
692	RestoreAffectedRegisters(assembler, max_register, registers_to_pop,
693	registers_to_clear);
694	if (backtrack() == NULL) {
695	assembler->Backtrack();
696	} else {
697	assembler->PopCurrentPosition();
698	assembler->GoTo(backtrack());
699	}
700	}
701
702	void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
703	RegExpMacroAssembler* assembler = compiler->macro_assembler();
704
705	// Omit flushing the trace. We discard the entire stack frame anyway.
706
707	if (!label()->is_bound()) {
708	// We are completely independent of the trace, since we ignore it,
709	// so this code can be used as the generic version.
710	assembler->BindBlock(label());
711	}
712
713	// Throw away everything on the backtrack stack since the start
714	// of the negative submatch and restore the character position.
715	assembler->ReadCurrentPositionFromRegister(current_position_register_);
716	assembler->ReadStackPointerFromRegister(stack_pointer_register_);
717	if (clear_capture_count_ > `0`) {
718	// Clear any captures that might have been performed during the success
719	// of the body of the negative look-ahead.
720	int clear_capture_end = clear_capture_start_ + clear_capture_count_ - `1`;
721	assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
722	}
723	// Now that we have unwound the stack we find at the top of the stack the
724	// backtrack that the BeginSubmatch node got.
725	assembler->Backtrack();
726	}
727
728	void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
729	if (!trace->is_trivial()) {
730	trace->Flush(compiler, this);
731	return;
732	}
733	RegExpMacroAssembler* assembler = compiler->macro_assembler();
734	if (!label()->is_bound()) {
735	assembler->BindBlock(label());
736	}
737	switch (action_) {
738	case ACCEPT:
739	assembler->Succeed();
740	return;
741	case BACKTRACK:
742	assembler->GoTo(trace->backtrack());
743	return;
744	case NEGATIVE_SUBMATCH_SUCCESS:
745	// This case is handled in a different virtual method.
746	UNREACHABLE();
747	}
748	UNIMPLEMENTED();
749	}
750
751	void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
752	if (guards_ == NULL) guards_ = new (zone) ZoneGrowableArray<Guard*>(`1`);
753	guards_->Add(guard);
754	}
755
756	ActionNode* ActionNode::SetRegister(intptr_t reg,
757	intptr_t val,
758	RegExpNode* on_success) {
759	ActionNode* result =
760	new (on_success->zone()) ActionNode (SET_REGISTER, on_success);
761	result->data_.u_store_register.reg = reg;
762	result->data_.u_store_register.value = val;
763	return result;
764	}
765
766	ActionNode* ActionNode::IncrementRegister(intptr_t reg,
767	RegExpNode* on_success) {
768	ActionNode* result =
769	new (on_success->zone()) ActionNode (INCREMENT_REGISTER, on_success);
770	result->data_.u_increment_register.reg = reg;
771	return result;
772	}
773
774	ActionNode* ActionNode::StorePosition(intptr_t reg,
775	bool is_capture,
776	RegExpNode* on_success) {
777	ActionNode* result =
778	new (on_success->zone()) ActionNode (STORE_POSITION, on_success);
779	result->data_.u_position_register.reg = reg;
780	result->data_.u_position_register.is_capture = is_capture;
781	return result;
782	}
783
784	ActionNode* ActionNode::ClearCaptures(Interval range, RegExpNode* on_success) {
785	ActionNode* result =
786	new (on_success->zone()) ActionNode (CLEAR_CAPTURES, on_success);
787	result->data_.u_clear_captures.range_from = range.from();
788	result->data_.u_clear_captures.range_to = range.to();
789	return result;
790	}
791
792	ActionNode* ActionNode::BeginSubmatch(intptr_t stack_reg,
793	intptr_t position_reg,
794	RegExpNode* on_success) {
795	ActionNode* result =
796	new (on_success->zone()) ActionNode (BEGIN_SUBMATCH, on_success);
797	result->data_.u_submatch.stack_pointer_register = stack_reg;
798	result->data_.u_submatch.current_position_register = position_reg;
799	return result;
800	}
801
802	ActionNode* ActionNode::PositiveSubmatchSuccess(intptr_t stack_reg,
803	intptr_t position_reg,
804	intptr_t clear_register_count,
805	intptr_t clear_register_from,
806	RegExpNode* on_success) {
807	ActionNode* result = new (on_success->zone())
808	ActionNode (POSITIVE_SUBMATCH_SUCCESS, on_success);
809	result->data_.u_submatch.stack_pointer_register = stack_reg;
810	result->data_.u_submatch.current_position_register = position_reg;
811	result->data_.u_submatch.clear_register_count = clear_register_count;
812	result->data_.u_submatch.clear_register_from = clear_register_from;
813	return result;
814	}
815
816	ActionNode* ActionNode::EmptyMatchCheck(intptr_t start_register,
817	intptr_t repetition_register,
818	intptr_t repetition_limit,
819	RegExpNode* on_success) {
820	ActionNode* result =
821	new (on_success->zone()) ActionNode (EMPTY_MATCH_CHECK, on_success);
822	result->data_.u_empty_match_check.start_register = start_register;
823	result->data_.u_empty_match_check.repetition_register = repetition_register;
824	result->data_.u_empty_match_check.repetition_limit = repetition_limit;
825	return result;
826	}
827
828	#define DEFINE_ACCEPT(Type) \
829	void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); }
830	FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
831	#undef DEFINE_ACCEPT
832
833	void LoopChoiceNode::Accept(NodeVisitor* visitor) {
834	visitor->VisitLoopChoice(this);
835	}
836
837	// -------------------------------------------------------------------
838	// Emit code.
839
840	void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
841	Guard* guard,
842	Trace* trace) {
843	switch (guard->op()) {
844	case Guard::LT:
845	ASSERT(!trace->mentions_reg(guard->reg()));
846	macro_assembler->IfRegisterGE(guard->reg(), guard->value(),
847	trace->backtrack());
848	break;
849	case Guard::GEQ:
850	ASSERT(!trace->mentions_reg(guard->reg()));
851	macro_assembler->IfRegisterLT(guard->reg(), guard->value(),
852	trace->backtrack());
853	break;
854	}
855	}
856
857	// Returns the number of characters in the equivalence class, omitting those
858	// that cannot occur in the source string because it is ASCII.
859	static intptr_t GetCaseIndependentLetters(uint16_t character,
860	bool one_byte_subject,
861	int32_t* letters) {
862	unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;
863	intptr_t length = jsregexp_uncanonicalize.get(character, `'\0'`, letters);
864	// Unibrow returns 0 or 1 for characters where case independence is
865	// trivial.
866	if (length == `0`) {
867	letters[`0`] = character;
868	length = `1`;
869	}
870	if (!one_byte_subject \|\| character <= Symbols::kMaxOneCharCodeSymbol) {
871	return length;
872	}
873
874	// The standard requires that non-ASCII characters cannot have ASCII
875	// character codes in their equivalence class.
876	// TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore,
877	// is it? For example, \u00C5 is equivalent to \u212B.
878	return `0`;
879	}
880
881	static inline bool EmitSimpleCharacter(Zone* zone,
882	RegExpCompiler* compiler,
883	uint16_t c,
884	BlockLabel* on_failure,
885	intptr_t cp_offset,
886	bool check,
887	bool preloaded) {
888	RegExpMacroAssembler* assembler = compiler->macro_assembler();
889	bool bound_checked = false;
890	if (!preloaded) {
891	assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
892	bound_checked = true;
893	}
894	assembler->CheckNotCharacter(c, on_failure);
895	return bound_checked;
896	}
897
898	// Only emits non-letters (things that don't have case). Only used for case
899	// independent matches.
900	static inline bool EmitAtomNonLetter(Zone* zone,
901	RegExpCompiler* compiler,
902	uint16_t c,
903	BlockLabel* on_failure,
904	intptr_t cp_offset,
905	bool check,
906	bool preloaded) {
907	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
908	bool one_byte = compiler->one_byte();
909	int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
910	intptr_t length = GetCaseIndependentLetters(c, one_byte, chars);
911	if (length < `1`) {
912	// This can't match. Must be an one-byte subject and a non-one-byte
913	// character. We do not need to do anything since the one-byte pass
914	// already handled this.
915	return false; // Bounds not checked.
916	}
917	bool checked = false;
918	// We handle the length > 1 case in a later pass.
919	if (length == `1`) {
920	if (one_byte && c > Symbols::kMaxOneCharCodeSymbol) {
921	// Can't match - see above.
922	return false; // Bounds not checked.
923	}
924	if (!preloaded) {
925	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
926	checked = check;
927	}
928	macro_assembler->CheckNotCharacter(c, on_failure);
929	}
930	return checked;
931	}
932
933	static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
934	bool one_byte,
935	uint16_t c1,
936	uint16_t c2,
937	BlockLabel* on_failure) {
938	uint16_t char_mask;
939	if (one_byte) {
940	char_mask = Symbols::kMaxOneCharCodeSymbol;
941	} else {
942	char_mask = Utf16::kMaxCodeUnit;
943	}
944	uint16_t exor = c1 ^ c2;
945	// Check whether exor has only one bit set.
946	if (((exor - `1`) & exor) == `0`) {
947	// If c1 and c2 differ only by one bit.
948	// Ecma262UnCanonicalize always gives the highest number last.
949	ASSERT(c2 > c1);
950	uint16_t mask = char_mask ^ exor;
951	macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
952	return true;
953	}
954	ASSERT(c2 > c1);
955	uint16_t diff = c2 - c1;
956	if (((diff - `1`) & diff) == `0` && c1 >= diff) {
957	// If the characters differ by 2^n but don't differ by one bit then
958	// subtract the difference from the found character, then do the or
959	// trick. We avoid the theoretical case where negative numbers are
960	// involved in order to simplify code generation.
961	uint16_t mask = char_mask ^ diff;
962	macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask,
963	on_failure);
964	return true;
965	}
966	return false;
967	}
968
969	typedef bool EmitCharacterFunction(Zone* zone,
970	RegExpCompiler* compiler,
971	uint16_t c,
972	BlockLabel* on_failure,
973	intptr_t cp_offset,
974	bool check,
975	bool preloaded);
976
977	// Only emits letters (things that have case). Only used for case independent
978	// matches.
979	static inline bool EmitAtomLetter(Zone* zone,
980	RegExpCompiler* compiler,
981	uint16_t c,
982	BlockLabel* on_failure,
983	intptr_t cp_offset,
984	bool check,
985	bool preloaded) {
986	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
987	bool one_byte = compiler->one_byte();
988	int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
989	intptr_t length = GetCaseIndependentLetters(c, one_byte, chars);
990	if (length <= `1`) return false;
991	// We may not need to check against the end of the input string
992	// if this character lies before a character that matched.
993	if (!preloaded) {
994	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
995	}
996	BlockLabel ok;
997	ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == `4`);
998	switch (length) {
999	case `2`: {
1000	if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[`0`],
1001	chars[`1`], on_failure)) {
1002	} else {
1003	macro_assembler->CheckCharacter(chars[`0`], &ok);
1004	macro_assembler->CheckNotCharacter(chars[`1`], on_failure);
1005	macro_assembler->BindBlock(&ok);
1006	}
1007	break;
1008	}
1009	case `4`:
1010	macro_assembler->CheckCharacter(chars[`3`], &ok);
1011	FALL_THROUGH;
1012	case `3`:
1013	macro_assembler->CheckCharacter(chars[`0`], &ok);
1014	macro_assembler->CheckCharacter(chars[`1`], &ok);
1015	macro_assembler->CheckNotCharacter(chars[`2`], on_failure);
1016	macro_assembler->BindBlock(&ok);
1017	break;
1018	default:
1019	UNREACHABLE();
1020	break;
1021	}
1022	return true;
1023	}
1024
1025	static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1026	uint16_t border,
1027	BlockLabel* fall_through,
1028	BlockLabel* above_or_equal,
1029	BlockLabel* below) {
1030	if (below != fall_through) {
1031	masm->CheckCharacterLT(border, below);
1032	if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1033	} else {
1034	masm->CheckCharacterGT(border - `1`, above_or_equal);
1035	}
1036	}
1037
1038	static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1039	uint16_t first,
1040	uint16_t last,
1041	BlockLabel* fall_through,
1042	BlockLabel* in_range,
1043	BlockLabel* out_of_range) {
1044	if (in_range == fall_through) {
1045	if (first == last) {
1046	masm->CheckNotCharacter(first, out_of_range);
1047	} else {
1048	masm->CheckCharacterNotInRange(first, last, out_of_range);
1049	}
1050	} else {
1051	if (first == last) {
1052	masm->CheckCharacter(first, in_range);
1053	} else {
1054	masm->CheckCharacterInRange(first, last, in_range);
1055	}
1056	if (out_of_range != fall_through) masm->GoTo(out_of_range);
1057	}
1058	}
1059
1060	// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1061	// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1062	static void EmitUseLookupTable(RegExpMacroAssembler* masm,
1063	ZoneGrowableArray<uint16_t>* ranges,
1064	intptr_t start_index,
1065	intptr_t end_index,
1066	uint16_t min_char,
1067	BlockLabel* fall_through,
1068	BlockLabel* even_label,
1069	BlockLabel* odd_label) {
1070	static const intptr_t kSize = RegExpMacroAssembler::kTableSize;
1071	static const intptr_t kMask = RegExpMacroAssembler::kTableMask;
1072
1073	intptr_t base = (min_char & ~kMask);
1074
1075	// Assert that everything is on one kTableSize page.
1076	for (intptr_t i = start_index; i <= end_index; i++) {
1077	ASSERT((ranges->At(i) & ~kMask) == base);
1078	}
1079	ASSERT(start_index == `0` \|\| (ranges->At(start_index - `1`) & ~kMask) <= base);
1080
1081	char templ[kSize];
1082	BlockLabel* on_bit_set;
1083	BlockLabel* on_bit_clear;
1084	intptr_t bit;
1085	if (even_label == fall_through) {
1086	on_bit_set = odd_label;
1087	on_bit_clear = even_label;
1088	bit = `1`;
1089	} else {
1090	on_bit_set = even_label;
1091	on_bit_clear = odd_label;
1092	bit = `0`;
1093	}
1094	for (intptr_t i = `0`; i < (ranges->At(start_index) & kMask) && i < kSize;
1095	i++) {
1096	templ[i] = bit;
1097	}
1098	intptr_t j = `0`;
1099	bit ^= `1`;
1100	for (intptr_t i = start_index; i < end_index; i++) {
1101	for (j = (ranges->At(i) & kMask); j < (ranges->At(i + `1`) & kMask); j++) {
1102	templ[j] = bit;
1103	}
1104	bit ^= `1`;
1105	}
1106	for (intptr_t i = j; i < kSize; i++) {
1107	templ[i] = bit;
1108	}
1109	// TODO(erikcorry): Cache these.
1110	const TypedData& ba = TypedData::ZoneHandle(
1111	masm->zone(), TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));
1112	for (intptr_t i = `0`; i < kSize; i++) {
1113	ba.SetUint8(i, templ[i]);
1114	}
1115	masm->CheckBitInTable(ba, on_bit_set);
1116	if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1117	}
1118
1119	static void CutOutRange(RegExpMacroAssembler* masm,
1120	ZoneGrowableArray<uint16_t>* ranges,
1121	intptr_t start_index,
1122	intptr_t end_index,
1123	intptr_t cut_index,
1124	BlockLabel* even_label,
1125	BlockLabel* odd_label) {
1126	bool odd = (((cut_index - start_index) & `1`) == `1`);
1127	BlockLabel* in_range_label = odd ? odd_label : even_label;
1128	BlockLabel dummy;
1129	EmitDoubleBoundaryTest(masm, ranges->At(cut_index),
1130	ranges->At(cut_index + `1`) - `1`, &dummy, in_range_label,
1131	&dummy);
1132	ASSERT(!dummy.is_linked());
1133	// Cut out the single range by rewriting the array. This creates a new
1134	// range that is a merger of the two ranges on either side of the one we
1135	// are cutting out. The oddity of the labels is preserved.
1136	for (intptr_t j = cut_index; j > start_index; j--) {
1137	(*ranges)[j] = ranges->At(j - `1`);
1138	}
1139	for (intptr_t j = cut_index + `1`; j < end_index; j++) {
1140	(*ranges)[j] = ranges->At(j + `1`);
1141	}
1142	}
1143
1144	// Unicode case. Split the search space into kSize spaces that are handled
1145	// with recursion.
1146	static void SplitSearchSpace(ZoneGrowableArray<uint16_t>* ranges,
1147	intptr_t start_index,
1148	intptr_t end_index,
1149	intptr_t* new_start_index,
1150	intptr_t* new_end_index,
1151	uint16_t* border) {
1152	static const intptr_t kSize = RegExpMacroAssembler::kTableSize;
1153	static const intptr_t kMask = RegExpMacroAssembler::kTableMask;
1154
1155	uint16_t first = ranges->At(start_index);
1156	uint16_t last = ranges->At(end_index) - `1`;
1157
1158	*new_start_index = start_index;
1159	*border = (ranges->At(start_index) & ~kMask) + kSize;
1160	while (*new_start_index < end_index) {
1161	if (ranges->At(new_start_index) > border) break;
1162	(*new_start_index)++;
1163	}
1164	// new_start_index is the index of the first edge that is beyond the
1165	// current kSize space.
1166
1167	// For very large search spaces we do a binary chop search of the non-Latin1
1168	// space instead of just going to the end of the current kSize space. The
1169	// heuristics are complicated a little by the fact that any 128-character
1170	// encoding space can be quickly tested with a table lookup, so we don't
1171	// wish to do binary chop search at a smaller granularity than that. A
1172	// 128-character space can take up a lot of space in the ranges array if,
1173	// for example, we only want to match every second character (eg. the lower
1174	// case characters on some Unicode pages).
1175	intptr_t binary_chop_index = (end_index + start_index) / `2`;
1176	// The first test ensures that we get to the code that handles the Latin1
1177	// range with a single not-taken branch, speeding up this important
1178	// character range (even non-Latin1 charset-based text has spaces and
1179	// punctuation).
1180	if (border - `1` > Symbols::kMaxOneCharCodeSymbol && // Latin1 case.*
1181	end_index - start_index > (new_start_index - start_index) `2` &&
1182	last - first > kSize * `2` && binary_chop_index > *new_start_index &&
1183	ranges->At(binary_chop_index) >= first + `2` * kSize) {
1184	intptr_t scan_forward_for_section_border = binary_chop_index;
1185	intptr_t new_border = (ranges->At(binary_chop_index) \| kMask) + `1`;
1186
1187	while (scan_forward_for_section_border < end_index) {
1188	if (ranges->At(scan_forward_for_section_border) > new_border) {
1189	*new_start_index = scan_forward_for_section_border;
1190	*border = new_border;
1191	break;
1192	}
1193	scan_forward_for_section_border++;
1194	}
1195	}
1196
1197	ASSERT(*new_start_index > start_index);
1198	new_end_index = new_start_index - `1`;
1199	if (ranges->At(new_end_index) == border) {
1200	(*new_end_index)--;
1201	}
1202	if (*border >= ranges->At(end_index)) {
1203	*border = ranges->At(end_index);
1204	new_start_index = end_index; // Won't be used.*
1205	*new_end_index = end_index - `1`;
1206	}
1207	}
1208
1209	// Gets a series of segment boundaries representing a character class. If the
1210	// character is in the range between an even and an odd boundary (counting from
1211	// start_index) then go to even_label, otherwise go to odd_label. We already
1212	// know that the character is in the range of min_char to max_char inclusive.
1213	// Either label can be NULL indicating backtracking. Either label can also be
1214	// equal to the fall_through label.
1215	static void GenerateBranches(RegExpMacroAssembler* masm,
1216	ZoneGrowableArray<uint16_t>* ranges,
1217	intptr_t start_index,
1218	intptr_t end_index,
1219	uint16_t min_char,
1220	uint16_t max_char,
1221	BlockLabel* fall_through,
1222	BlockLabel* even_label,
1223	BlockLabel* odd_label) {
1224	uint16_t first = ranges->At(start_index);
1225	uint16_t last = ranges->At(end_index) - `1`;
1226
1227	ASSERT(min_char < first);
1228
1229	// Just need to test if the character is before or on-or-after
1230	// a particular character.
1231	if (start_index == end_index) {
1232	EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1233	return;
1234	}
1235
1236	// Another almost trivial case: There is one interval in the middle that is
1237	// different from the end intervals.
1238	if (start_index + `1` == end_index) {
1239	EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label,
1240	odd_label);
1241	return;
1242	}
1243
1244	// It's not worth using table lookup if there are very few intervals in the
1245	// character class.
1246	if (end_index - start_index <= `6`) {
1247	// It is faster to test for individual characters, so we look for those
1248	// first, then try arbitrary ranges in the second round.
1249	static intptr_t kNoCutIndex = -`1`;
1250	intptr_t cut = kNoCutIndex;
1251	for (intptr_t i = start_index; i < end_index; i++) {
1252	if (ranges->At(i) == ranges->At(i + `1`) - `1`) {
1253	cut = i;
1254	break;
1255	}
1256	}
1257	if (cut == kNoCutIndex) cut = start_index;
1258	CutOutRange(masm, ranges, start_index, end_index, cut, even_label,
1259	odd_label);
1260	ASSERT(end_index - start_index >= `2`);
1261	GenerateBranches(masm, ranges, start_index + `1`, end_index - `1`, min_char,
1262	max_char, fall_through, even_label, odd_label);
1263	return;
1264	}
1265
1266	// If there are a lot of intervals in the regexp, then we will use tables to
1267	// determine whether the character is inside or outside the character class.
1268	static const intptr_t kBits = RegExpMacroAssembler::kTableSizeBits;
1269
1270	if ((max_char >> kBits) == (min_char >> kBits)) {
1271	EmitUseLookupTable(masm, ranges, start_index, end_index, min_char,
1272	fall_through, even_label, odd_label);
1273	return;
1274	}
1275
1276	if ((min_char >> kBits) != (first >> kBits)) {
1277	masm->CheckCharacterLT(first, odd_label);
1278	GenerateBranches(masm, ranges, start_index + `1`, end_index, first, max_char,
1279	fall_through, odd_label, even_label);
1280	return;
1281	}
1282
1283	intptr_t new_start_index = `0`;
1284	intptr_t new_end_index = `0`;
1285	uint16_t border = `0`;
1286
1287	SplitSearchSpace(ranges, start_index, end_index, &new_start_index,
1288	&new_end_index, &border);
1289
1290	BlockLabel handle_rest;
1291	BlockLabel* above = &handle_rest;
1292	if (border == last + `1`) {
1293	// We didn't find any section that started after the limit, so everything
1294	// above the border is one of the terminal labels.
1295	above = (end_index & `1`) != (start_index & `1`) ? odd_label : even_label;
1296	ASSERT(new_end_index == end_index - `1`);
1297	}
1298
1299	ASSERT(start_index <= new_end_index);
1300	ASSERT(new_start_index <= end_index);
1301	ASSERT(start_index < new_start_index);
1302	ASSERT(new_end_index < end_index);
1303	ASSERT(new_end_index + `1` == new_start_index \|\|
1304	(new_end_index + `2` == new_start_index &&
1305	border == ranges->At(new_end_index + `1`)));
1306	ASSERT(min_char < border - `1`);
1307	ASSERT(border < max_char);
1308	ASSERT(ranges->At(new_end_index) < border);
1309	ASSERT(border < ranges->At(new_start_index) \|\|
1310	(border == ranges->At(new_start_index) &&
1311	new_start_index == end_index && new_end_index == end_index - `1` &&
1312	border == last + `1`));
1313	ASSERT(new_start_index == `0` \|\| border >= ranges->At(new_start_index - `1`));
1314
1315	masm->CheckCharacterGT(border - `1`, above);
1316	BlockLabel dummy;
1317	GenerateBranches(masm, ranges, start_index, new_end_index, min_char,
1318	border - `1`, &dummy, even_label, odd_label);
1319
1320	if (handle_rest.is_linked()) {
1321	masm->BindBlock(&handle_rest);
1322	bool flip = (new_start_index & `1`) != (start_index & `1`);
1323	GenerateBranches(masm, ranges, new_start_index, end_index, border, max_char,
1324	&dummy, flip ? odd_label : even_label,
1325	flip ? even_label : odd_label);
1326	}
1327	}
1328
1329	static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1330	RegExpCharacterClass* cc,
1331	bool one_byte,
1332	BlockLabel* on_failure,
1333	intptr_t cp_offset,
1334	bool check_offset,
1335	bool preloaded,
1336	Zone* zone) {
1337	ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
1338	if (!CharacterRange::IsCanonical(ranges)) {
1339	CharacterRange::Canonicalize(ranges);
1340	}
1341
1342	uint16_t max_char;
1343	if (one_byte) {
1344	max_char = Symbols::kMaxOneCharCodeSymbol;
1345	} else {
1346	max_char = Utf16::kMaxCodeUnit;
1347	}
1348
1349	intptr_t range_count = ranges->length();
1350
1351	intptr_t last_valid_range = range_count - `1`;
1352	while (last_valid_range >= `0`) {
1353	const CharacterRange& range = ranges->At(last_valid_range);
1354	if (range.from() <= max_char) {
1355	break;
1356	}
1357	last_valid_range--;
1358	}
1359
1360	if (last_valid_range < `0`) {
1361	if (!cc->is_negated()) {
1362	macro_assembler->GoTo(on_failure);
1363	}
1364	if (check_offset) {
1365	macro_assembler->CheckPosition(cp_offset, on_failure);
1366	}
1367	return;
1368	}
1369
1370	if (last_valid_range == `0` && ranges->At(`0`).IsEverything(max_char)) {
1371	if (cc->is_negated()) {
1372	macro_assembler->GoTo(on_failure);
1373	} else {
1374	// This is a common case hit by non-anchored expressions.
1375	if (check_offset) {
1376	macro_assembler->CheckPosition(cp_offset, on_failure);
1377	}
1378	}
1379	return;
1380	}
1381
1382	if (!preloaded) {
1383	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1384	}
1385
1386	if (cc->is_standard() && macro_assembler->CheckSpecialCharacterClass(
1387	cc->standard_type(), on_failure)) {
1388	return;
1389	}
1390
1391	// A new list with ascending entries. Each entry is a code unit
1392	// where there is a boundary between code units that are part of
1393	// the class and code units that are not. Normally we insert an
1394	// entry at zero which goes to the failure label, but if there
1395	// was already one there we fall through for success on that entry.
1396	// Subsequent entries have alternating meaning (success/failure).
1397	ZoneGrowableArray<uint16_t>* range_boundaries =
1398	new (zone) ZoneGrowableArray<uint16_t>(last_valid_range);
1399
1400	bool zeroth_entry_is_failure = !cc->is_negated();
1401
1402	for (intptr_t i = `0`; i <= last_valid_range; i++) {
1403	const CharacterRange& range = ranges->At(i);
1404	if (range.from() == `0`) {
1405	ASSERT(i == `0`);
1406	zeroth_entry_is_failure = !zeroth_entry_is_failure;
1407	} else {
1408	range_boundaries->Add(range.from());
1409	}
1410	if (range.to() + `1` <= max_char) {
1411	range_boundaries->Add(range.to() + `1`);
1412	}
1413	}
1414	intptr_t end_index = range_boundaries->length() - `1`;
1415
1416	BlockLabel fall_through;
1417	GenerateBranches(macro_assembler, range_boundaries,
1418	`0`, // start_index.
1419	end_index,
1420	`0`, // min_char.
1421	max_char, &fall_through,
1422	zeroth_entry_is_failure ? &fall_through : on_failure,
1423	zeroth_entry_is_failure ? on_failure : &fall_through);
1424	macro_assembler->BindBlock(&fall_through);
1425	}
1426
1427	RegExpNode::~RegExpNode() {}
1428
1429	RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1430	Trace* trace) {
1431	// If we are generating a greedy loop then don't stop and don't reuse code.
1432	if (trace->stop_node() != NULL) {
1433	return CONTINUE;
1434	}
1435
1436	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1437	if (trace->is_trivial()) {
1438	if (label_.is_bound()) {
1439	// We are being asked to generate a generic version, but that's already
1440	// been done so just go to it.
1441	macro_assembler->GoTo(&label_);
1442	return DONE;
1443	}
1444	if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1445	// To avoid too deep recursion we push the node to the work queue and just
1446	// generate a goto here.
1447	compiler->AddWork(this);
1448	macro_assembler->GoTo(&label_);
1449	return DONE;
1450	}
1451	// Generate generic version of the node and bind the label for later use.
1452	macro_assembler->BindBlock(&label_);
1453	return CONTINUE;
1454	}
1455
1456	// We are being asked to make a non-generic version. Keep track of how many
1457	// non-generic versions we generate so as not to overdo it.
1458	trace_count_++;
1459	if (kRegexpOptimization && trace_count_ < kMaxCopiesCodeGenerated &&
1460	compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1461	return CONTINUE;
1462	}
1463
1464	// If we get here code has been generated for this node too many times or
1465	// recursion is too deep. Time to switch to a generic version. The code for
1466	// generic versions above can handle deep recursion properly.
1467	trace->Flush(compiler, this);
1468	return DONE;
1469	}
1470
1471	intptr_t ActionNode::EatsAtLeast(intptr_t still_to_find,
1472	intptr_t budget,
1473	bool not_at_start) {
1474	if (budget <= `0`) return `0`;
1475	if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return `0`; // Rewinds input!
1476	return on_success()->EatsAtLeast(still_to_find, budget - `1`, not_at_start);
1477	}
1478
1479	void ActionNode::FillInBMInfo(intptr_t offset,
1480	intptr_t budget,
1481	BoyerMooreLookahead* bm,
1482	bool not_at_start) {
1483	if (action_type_ == BEGIN_SUBMATCH) {
1484	bm->SetRest(offset);
1485	} else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
1486	on_success()->FillInBMInfo(offset, budget - `1`, bm, not_at_start);
1487	}
1488	SaveBMInfo(bm, not_at_start, offset);
1489	}
1490
1491	intptr_t AssertionNode::EatsAtLeast(intptr_t still_to_find,
1492	intptr_t budget,
1493	bool not_at_start) {
1494	if (budget <= `0`) return `0`;
1495	// If we know we are not at the start and we are asked "how many characters
1496	// will you match if you succeed?" then we can answer anything since false
1497	// implies false. So lets just return the max answer (still_to_find) since
1498	// that won't prevent us from preloading a lot of characters for the other
1499	// branches in the node graph.
1500	if (assertion_type() == AT_START && not_at_start) return still_to_find;
1501	return on_success()->EatsAtLeast(still_to_find, budget - `1`, not_at_start);
1502	}
1503
1504	void AssertionNode::FillInBMInfo(intptr_t offset,
1505	intptr_t budget,
1506	BoyerMooreLookahead* bm,
1507	bool not_at_start) {
1508	// Match the behaviour of EatsAtLeast on this node.
1509	if (assertion_type() == AT_START && not_at_start) return;
1510	on_success()->FillInBMInfo(offset, budget - `1`, bm, not_at_start);
1511	SaveBMInfo(bm, not_at_start, offset);
1512	}
1513
1514	intptr_t BackReferenceNode::EatsAtLeast(intptr_t still_to_find,
1515	intptr_t budget,
1516	bool not_at_start) {
1517	if (read_backward()) return `0`;
1518	if (budget <= `0`) return `0`;
1519	return on_success()->EatsAtLeast(still_to_find, budget - `1`, not_at_start);
1520	}
1521
1522	intptr_t TextNode::EatsAtLeast(intptr_t still_to_find,
1523	intptr_t budget,
1524	bool not_at_start) {
1525	if (read_backward()) return `0`;
1526	intptr_t answer = Length();
1527	if (answer >= still_to_find) return answer;
1528	if (budget <= `0`) return answer;
1529	// We are not at start after this node so we set the last argument to 'true'.
1530	return answer +
1531	on_success()->EatsAtLeast(still_to_find - answer, budget - `1`, true);
1532	}
1533
1534	intptr_t NegativeLookaroundChoiceNode::EatsAtLeast(intptr_t still_to_find,
1535	intptr_t budget,
1536	bool not_at_start) {
1537	if (budget <= `0`) return `0`;
1538	// Alternative 0 is the negative lookahead, alternative 1 is what comes
1539	// afterwards.
1540	RegExpNode* node = (*alternatives_)[`1`].node();
1541	return node->EatsAtLeast(still_to_find, budget - `1`, not_at_start);
1542	}
1543
1544	void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
1545	QuickCheckDetails* details,
1546	RegExpCompiler* compiler,
1547	intptr_t filled_in,
1548	bool not_at_start) {
1549	// Alternative 0 is the negative lookahead, alternative 1 is what comes
1550	// afterwards.
1551	RegExpNode* node = (*alternatives_)[`1`].node();
1552	return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1553	}
1554
1555	intptr_t ChoiceNode::EatsAtLeastHelper(intptr_t still_to_find,
1556	intptr_t budget,
1557	RegExpNode* ignore_this_node,
1558	bool not_at_start) {
1559	if (budget <= `0`) return `0`;
1560	intptr_t min = `100`;
1561	intptr_t choice_count = alternatives_->length();
1562	budget = (budget - `1`) / choice_count;
1563	for (intptr_t i = `0`; i < choice_count; i++) {
1564	RegExpNode* node = (*alternatives_)[i].node();
1565	if (node == ignore_this_node) continue;
1566	intptr_t node_eats_at_least =
1567	node->EatsAtLeast(still_to_find, budget, not_at_start);
1568	if (node_eats_at_least < min) min = node_eats_at_least;
1569	if (min == `0`) return `0`;
1570	}
1571	return min;
1572	}
1573
1574	intptr_t LoopChoiceNode::EatsAtLeast(intptr_t still_to_find,
1575	intptr_t budget,
1576	bool not_at_start) {
1577	return EatsAtLeastHelper(still_to_find, budget - `1`, loop_node_, not_at_start);
1578	}
1579
1580	intptr_t ChoiceNode::EatsAtLeast(intptr_t still_to_find,
1581	intptr_t budget,
1582	bool not_at_start) {
1583	return EatsAtLeastHelper(still_to_find, budget, NULL, not_at_start);
1584	}
1585
1586	// Takes the left-most 1-bit and smears it out, setting all bits to its right.
1587	static inline uint32_t SmearBitsRight(uint32_t v) {
1588	v \|= v >> `1`;
1589	v \|= v >> `2`;
1590	v \|= v >> `4`;
1591	v \|= v >> `8`;
1592	v \|= v >> `16`;
1593	return v;
1594	}
1595
1596	bool QuickCheckDetails::Rationalize(bool asc) {
1597	bool found_useful_op = false;
1598	uint32_t char_mask;
1599	if (asc) {
1600	char_mask = Symbols::kMaxOneCharCodeSymbol;
1601	} else {
1602	char_mask = Utf16::kMaxCodeUnit;
1603	}
1604	mask_ = `0`;
1605	value_ = `0`;
1606	intptr_t char_shift = `0`;
1607	for (intptr_t i = `0`; i < characters_; i++) {
1608	Position* pos = &positions_[i];
1609	if ((pos->mask & Symbols::kMaxOneCharCodeSymbol) != `0`) {
1610	found_useful_op = true;
1611	}
1612	mask_ \|= (pos->mask & char_mask) << char_shift;
1613	value_ \|= (pos->value & char_mask) << char_shift;
1614	char_shift += asc ? `8` : `16`;
1615	}
1616	return found_useful_op;
1617	}
1618
1619	bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1620	Trace* bounds_check_trace,
1621	Trace* trace,
1622	bool preload_has_checked_bounds,
1623	BlockLabel* on_possible_success,
1624	QuickCheckDetails* details,
1625	bool fall_through_on_failure) {
1626	if (details->characters() == `0`) return false;
1627	GetQuickCheckDetails(details, compiler, `0`,
1628	trace->at_start() == Trace::FALSE_VALUE);
1629	if (details->cannot_match()) return false;
1630	if (!details->Rationalize(compiler->one_byte())) return false;
1631	ASSERT(details->characters() == `1` \|\|
1632	compiler->macro_assembler()->CanReadUnaligned());
1633	uint32_t mask = details->mask();
1634	uint32_t value = details->value();
1635
1636	RegExpMacroAssembler* assembler = compiler->macro_assembler();
1637
1638	if (trace->characters_preloaded() != details->characters()) {
1639	ASSERT(trace->cp_offset() == bounds_check_trace->cp_offset());
1640	// We are attempting to preload the minimum number of characters
1641	// any choice would eat, so if the bounds check fails, then none of the
1642	// choices can succeed, so we can just immediately backtrack, rather
1643	// than go to the next choice.
1644	assembler->LoadCurrentCharacter(
1645	trace->cp_offset(), bounds_check_trace->backtrack(),
1646	!preload_has_checked_bounds, details->characters());
1647	}
1648
1649	bool need_mask = true;
1650
1651	if (details->characters() == `1`) {
1652	// If number of characters preloaded is 1 then we used a byte or 16 bit
1653	// load so the value is already masked down.
1654	uint32_t char_mask;
1655	if (compiler->one_byte()) {
1656	char_mask = Symbols::kMaxOneCharCodeSymbol;
1657	} else {
1658	char_mask = Utf16::kMaxCodeUnit;
1659	}
1660	if ((mask & char_mask) == char_mask) need_mask = false;
1661	mask &= char_mask;
1662	} else {
1663	// For 2-character preloads in one-byte mode or 1-character preloads in
1664	// two-byte mode we also use a 16 bit load with zero extend.
1665	if (details->characters() == `2` && compiler->one_byte()) {
1666	if ((mask & `0xffff`) == `0xffff`) need_mask = false;
1667	} else if (details->characters() == `1` && !compiler->one_byte()) {
1668	if ((mask & `0xffff`) == `0xffff`) need_mask = false;
1669	} else {
1670	if (mask == `0xffffffff`) need_mask = false;
1671	}
1672	}
1673
1674	if (fall_through_on_failure) {
1675	if (need_mask) {
1676	assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1677	} else {
1678	assembler->CheckCharacter(value, on_possible_success);
1679	}
1680	} else {
1681	if (need_mask) {
1682	assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1683	} else {
1684	assembler->CheckNotCharacter(value, trace->backtrack());
1685	}
1686	}
1687	return true;
1688	}
1689
1690	// Here is the meat of GetQuickCheckDetails (see also the comment on the
1691	// super-class in the .h file).
1692	//
1693	// We iterate along the text object, building up for each character a
1694	// mask and value that can be used to test for a quick failure to match.
1695	// The masks and values for the positions will be combined into a single
1696	// machine word for the current character width in order to be used in
1697	// generating a quick check.
1698	void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1699	RegExpCompiler* compiler,
1700	intptr_t characters_filled_in,
1701	bool not_at_start) {
1702	#if defined(__GNUC__) && defined(__BYTE_ORDER__)
1703	// TODO(zerny): Make the combination code byte-order independent.
1704	ASSERT(details->characters() == `1` \|\|
1705	(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__));
1706	#endif
1707	// Do not collect any quick check details if the text node reads backward,
1708	// since it reads in the opposite direction than we use for quick checks.
1709	if (read_backward()) return;
1710	ASSERT(characters_filled_in < details->characters());
1711	intptr_t characters = details->characters();
1712	int32_t char_mask;
1713	if (compiler->one_byte()) {
1714	char_mask = Symbols::kMaxOneCharCodeSymbol;
1715	} else {
1716	char_mask = Utf16::kMaxCodeUnit;
1717	}
1718	for (intptr_t k = `0`; k < elms_->length(); k++) {
1719	TextElement elm = elms_->At(k);
1720	if (elm.text_type() == TextElement::ATOM) {
1721	ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
1722	for (intptr_t i = `0`; i < characters && i < quarks->length(); i++) {
1723	QuickCheckDetails::Position* pos =
1724	details->positions(characters_filled_in);
1725	uint16_t c = quarks->At(i);
1726	if (c > char_mask) {
1727	// If we expect a non-Latin1 character from an one-byte string,
1728	// there is no way we can match. Not even case independent
1729	// matching can turn an Latin1 character into non-Latin1 or
1730	// vice versa.
1731	// TODO(dcarney): issue 3550. Verify that this works as expected.
1732	// For example, \u0178 is uppercase of \u00ff (y-umlaut).
1733	details->set_cannot_match();
1734	pos->determines_perfectly = false;
1735	return;
1736	}
1737	if (elm.atom()->ignore_case()) {
1738	int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1739	intptr_t length =
1740	GetCaseIndependentLetters(c, compiler->one_byte(), chars);
1741	ASSERT(length != `0`); // Can only happen if c > char_mask (see above).
1742	if (length == `1`) {
1743	// This letter has no case equivalents, so it's nice and simple
1744	// and the mask-compare will determine definitely whether we have
1745	// a match at this character position.
1746	pos->mask = char_mask;
1747	pos->value = c;
1748	pos->determines_perfectly = true;
1749	} else {
1750	uint32_t common_bits = char_mask;
1751	uint32_t bits = chars[`0`];
1752	for (intptr_t j = `1`; j < length; j++) {
1753	uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1754	common_bits ^= differing_bits;
1755	bits &= common_bits;
1756	}
1757	// If length is 2 and common bits has only one zero in it then
1758	// our mask and compare instruction will determine definitely
1759	// whether we have a match at this character position. Otherwise
1760	// it can only be an approximate check.
1761	uint32_t one_zero = (common_bits \| ~char_mask);
1762	if (length == `2` && ((~one_zero) & ((~one_zero) - `1`)) == `0`) {
1763	pos->determines_perfectly = true;
1764	}
1765	pos->mask = common_bits;
1766	pos->value = bits;
1767	}
1768	} else {
1769	// Don't ignore case. Nice simple case where the mask-compare will
1770	// determine definitely whether we have a match at this character
1771	// position.
1772	pos->mask = char_mask;
1773	pos->value = c;
1774	pos->determines_perfectly = true;
1775	}
1776	characters_filled_in++;
1777	ASSERT(characters_filled_in <= details->characters());
1778	if (characters_filled_in == details->characters()) {
1779	return;
1780	}
1781	}
1782	} else {
1783	QuickCheckDetails::Position* pos =
1784	details->positions(characters_filled_in);
1785	RegExpCharacterClass* tree = elm.char_class();
1786	ZoneGrowableArray<CharacterRange>* ranges = tree->ranges();
1787	ASSERT(!ranges->is_empty());
1788	if (tree->is_negated()) {
1789	// A quick check uses multi-character mask and compare. There is no
1790	// useful way to incorporate a negative char class into this scheme
1791	// so we just conservatively create a mask and value that will always
1792	// succeed.
1793	pos->mask = `0`;
1794	pos->value = `0`;
1795	} else {
1796	intptr_t first_range = `0`;
1797	while (ranges->At(first_range).from() > char_mask) {
1798	first_range++;
1799	if (first_range == ranges->length()) {
1800	details->set_cannot_match();
1801	pos->determines_perfectly = false;
1802	return;
1803	}
1804	}
1805	CharacterRange range = ranges->At(first_range);
1806	uint16_t from = range.from();
1807	uint16_t to = range.to();
1808	if (to > char_mask) {
1809	to = char_mask;
1810	}
1811	uint32_t differing_bits = (from ^ to);
1812	// A mask and compare is only perfect if the differing bits form a
1813	// number like 00011111 with one single block of trailing 1s.
1814	if ((differing_bits & (differing_bits + `1`)) == `0` &&
1815	from + differing_bits == to) {
1816	pos->determines_perfectly = true;
1817	}
1818	uint32_t common_bits = ~SmearBitsRight(differing_bits);
1819	uint32_t bits = (from & common_bits);
1820	for (intptr_t i = first_range + `1`; i < ranges->length(); i++) {
1821	CharacterRange range = ranges->At(i);
1822	uint16_t from = range.from();
1823	uint16_t to = range.to();
1824	if (from > char_mask) continue;
1825	if (to > char_mask) to = char_mask;
1826	// Here we are combining more ranges into the mask and compare
1827	// value. With each new range the mask becomes more sparse and
1828	// so the chances of a false positive rise. A character class
1829	// with multiple ranges is assumed never to be equivalent to a
1830	// mask and compare operation.
1831	pos->determines_perfectly = false;
1832	uint32_t new_common_bits = (from ^ to);
1833	new_common_bits = ~SmearBitsRight(new_common_bits);
1834	common_bits &= new_common_bits;
1835	bits &= new_common_bits;
1836	uint32_t differing_bits = (from & common_bits) ^ bits;
1837	common_bits ^= differing_bits;
1838	bits &= common_bits;
1839	}
1840	pos->mask = common_bits;
1841	pos->value = bits;
1842	}
1843	characters_filled_in++;
1844	ASSERT(characters_filled_in <= details->characters());
1845	if (characters_filled_in == details->characters()) {
1846	return;
1847	}
1848	}
1849	}
1850	ASSERT(characters_filled_in != details->characters());
1851	if (!details->cannot_match()) {
1852	on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in,
1853	true);
1854	}
1855	}
1856
1857	void QuickCheckDetails::Clear() {
1858	for (int i = `0`; i < characters_; i++) {
1859	positions_[i].mask = `0`;
1860	positions_[i].value = `0`;
1861	positions_[i].determines_perfectly = false;
1862	}
1863	characters_ = `0`;
1864	}
1865
1866	void QuickCheckDetails::Advance(intptr_t by, bool one_byte) {
1867	if (by >= characters_ \|\| by < `0`) {
1868	// check that by < 0 => characters_ == 0
1869	ASSERT(by >= `0` \|\| characters_ == `0`);
1870	Clear();
1871	return;
1872	}
1873	for (intptr_t i = `0`; i < characters_ - by; i++) {
1874	positions_[i] = positions_[by + i];
1875	}
1876	for (intptr_t i = characters_ - by; i < characters_; i++) {
1877	positions_[i].mask = `0`;
1878	positions_[i].value = `0`;
1879	positions_[i].determines_perfectly = false;
1880	}
1881	characters_ -= by;
1882	// We could change mask_ and value_ here but we would never advance unless
1883	// they had already been used in a check and they won't be used again because
1884	// it would gain us nothing. So there's no point.
1885	}
1886
1887	void QuickCheckDetails::Merge(QuickCheckDetails* other, intptr_t from_index) {
1888	ASSERT(characters_ == other->characters_);
1889	if (other->cannot_match_) {
1890	return;
1891	}
1892	if (cannot_match_) {
1893	*this = *other;
1894	return;
1895	}
1896	for (intptr_t i = from_index; i < characters_; i++) {
1897	QuickCheckDetails::Position* pos = positions(i);
1898	QuickCheckDetails::Position* other_pos = other->positions(i);
1899	if (pos->mask != other_pos->mask \|\| pos->value != other_pos->value \|\|
1900	!other_pos->determines_perfectly) {
1901	// Our mask-compare operation will be approximate unless we have the
1902	// exact same operation on both sides of the alternation.
1903	pos->determines_perfectly = false;
1904	}
1905	pos->mask &= other_pos->mask;
1906	pos->value &= pos->mask;
1907	other_pos->value &= pos->mask;
1908	uint16_t differing_bits = (pos->value ^ other_pos->value);
1909	pos->mask &= ~differing_bits;
1910	pos->value &= pos->mask;
1911	}
1912	}
1913
1914	class VisitMarker : public ValueObject {
1915	public:
1916	explicit VisitMarker(NodeInfo* info) : info_(info) {
1917	ASSERT(!info->visited);
1918	info->visited = true;
1919	}
1920	~VisitMarker() { info_->visited = false; }
1921
1922	private:
1923	NodeInfo* info_;
1924	};
1925
1926	RegExpNode* SeqRegExpNode::FilterOneByte(intptr_t depth) {
1927	if (info()->replacement_calculated) return replacement();
1928	if (depth < `0`) return this;
1929	ASSERT(!info()->visited);
1930	VisitMarker marker(info());
1931	return FilterSuccessor(depth - `1`);
1932	}
1933
1934	RegExpNode* SeqRegExpNode::FilterSuccessor(intptr_t depth) {
1935	RegExpNode* next = on_success_->FilterOneByte(depth - `1`);
1936	if (next == NULL) return set_replacement(NULL);
1937	on_success_ = next;
1938	return set_replacement(this);
1939	}
1940
1941	// We need to check for the following characters: 0x39c 0x3bc 0x178.
1942	static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
1943	// TODO(dcarney): this could be a lot more efficient.
1944	return range.Contains(`0x39c`) \|\| range.Contains(`0x3bc`) \|\|
1945	range.Contains(`0x178`);
1946	}
1947
1948	static bool RangesContainLatin1Equivalents(
1949	ZoneGrowableArray<CharacterRange>* ranges) {
1950	for (intptr_t i = `0`; i < ranges->length(); i++) {
1951	// TODO(dcarney): this could be a lot more efficient.
1952	if (RangeContainsLatin1Equivalents(ranges->At(i))) return true;
1953	}
1954	return false;
1955	}
1956
1957	static uint16_t ConvertNonLatin1ToLatin1(uint16_t c) {
1958	ASSERT(c > Symbols::kMaxOneCharCodeSymbol);
1959	switch (c) {
1960	// This are equivalent characters in unicode.
1961	case `0x39c`:
1962	case `0x3bc`:
1963	return `0xb5`;
1964	// This is an uppercase of a Latin-1 character
1965	// outside of Latin-1.
1966	case `0x178`:
1967	return `0xff`;
1968	}
1969	return `0`;
1970	}
1971
1972	RegExpNode* TextNode::FilterOneByte(intptr_t depth) {
1973	if (info()->replacement_calculated) return replacement();
1974	if (depth < `0`) return this;
1975	ASSERT(!info()->visited);
1976	VisitMarker marker(info());
1977	intptr_t element_count = elms_->length();
1978	for (intptr_t i = `0`; i < element_count; i++) {
1979	TextElement elm = elms_->At(i);
1980	if (elm.text_type() == TextElement::ATOM) {
1981	ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
1982	for (intptr_t j = `0`; j < quarks->length(); j++) {
1983	uint16_t c = quarks->At(j);
1984	if (c <= Symbols::kMaxOneCharCodeSymbol) continue;
1985	if (!elm.atom()->ignore_case()) return set_replacement(NULL);
1986	// Here, we need to check for characters whose upper and lower cases
1987	// are outside the Latin-1 range.
1988	uint16_t converted = ConvertNonLatin1ToLatin1(c);
1989	// Character is outside Latin-1 completely
1990	if (converted == `0`) return set_replacement(NULL);
1991	// Convert quark to Latin-1 in place.
1992	(*quarks)[`0`] = converted;
1993	}
1994	} else {
1995	ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
1996	RegExpCharacterClass* cc = elm.char_class();
1997	ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
1998	if (!CharacterRange::IsCanonical(ranges)) {
1999	CharacterRange::Canonicalize(ranges);
2000	}
2001	// Now they are in order so we only need to look at the first.
2002	intptr_t range_count = ranges->length();
2003	if (cc->is_negated()) {
2004	if (range_count != `0` && ranges->At(`0`).from() == `0` &&
2005	ranges->At(`0`).to() >= Symbols::kMaxOneCharCodeSymbol) {
2006	// This will be handled in a later filter.
2007	if (cc->flags().IgnoreCase() &&
2008	RangesContainLatin1Equivalents(ranges)) {
2009	continue;
2010	}
2011	return set_replacement(NULL);
2012	}
2013	} else {
2014	if (range_count == `0` \|\|
2015	ranges->At(`0`).from() > Symbols::kMaxOneCharCodeSymbol) {
2016	// This will be handled in a later filter.
2017	if (cc->flags().IgnoreCase() &&
2018	RangesContainLatin1Equivalents(ranges))
2019	continue;
2020	return set_replacement(NULL);
2021	}
2022	}
2023	}
2024	}
2025	return FilterSuccessor(depth - `1`);
2026	}
2027
2028	RegExpNode* LoopChoiceNode::FilterOneByte(intptr_t depth) {
2029	if (info()->replacement_calculated) return replacement();
2030	if (depth < `0`) return this;
2031	if (info()->visited) return this;
2032	{
2033	VisitMarker marker(info());
2034
2035	RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - `1`);
2036	// If we can't continue after the loop then there is no sense in doing the
2037	// loop.
2038	if (continue_replacement == NULL) return set_replacement(NULL);
2039	}
2040
2041	return ChoiceNode::FilterOneByte(depth - `1`);
2042	}
2043
2044	RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth) {
2045	if (info()->replacement_calculated) return replacement();
2046	if (depth < `0`) return this;
2047	if (info()->visited) return this;
2048	VisitMarker marker(info());
2049	intptr_t choice_count = alternatives_->length();
2050
2051	for (intptr_t i = `0`; i < choice_count; i++) {
2052	GuardedAlternative alternative = alternatives_->At(i);
2053	if (alternative.guards() != NULL && alternative.guards()->length() != `0`) {
2054	set_replacement(this);
2055	return this;
2056	}
2057	}
2058
2059	intptr_t surviving = `0`;
2060	RegExpNode* survivor = NULL;
2061	for (intptr_t i = `0`; i < choice_count; i++) {
2062	GuardedAlternative alternative = alternatives_->At(i);
2063	RegExpNode* replacement = alternative.node()->FilterOneByte(depth - `1`);
2064	ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
2065	if (replacement != NULL) {
2066	(*alternatives_)[i].set_node(replacement);
2067	surviving++;
2068	survivor = replacement;
2069	}
2070	}
2071	if (surviving < `2`) return set_replacement(survivor);
2072
2073	set_replacement(this);
2074	if (surviving == choice_count) {
2075	return this;
2076	}
2077	// Only some of the nodes survived the filtering. We need to rebuild the
2078	// alternatives list.
2079	ZoneGrowableArray<GuardedAlternative>* new_alternatives =
2080	new (Z) ZoneGrowableArray<GuardedAlternative>(surviving);
2081	for (intptr_t i = `0`; i < choice_count; i++) {
2082	RegExpNode* replacement =
2083	(*alternatives_)[i].node()->FilterOneByte(depth - `1`);
2084	if (replacement != NULL) {
2085	(*alternatives_)[i].set_node(replacement);
2086	new_alternatives->Add((*alternatives_)[i]);
2087	}
2088	}
2089	alternatives_ = new_alternatives;
2090	return this;
2091	}
2092
2093	RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(intptr_t depth) {
2094	if (info()->replacement_calculated) return replacement();
2095	if (depth < `0`) return this;
2096	if (info()->visited) return this;
2097	VisitMarker marker(info());
2098	// Alternative 0 is the negative lookahead, alternative 1 is what comes
2099	// afterwards.
2100	RegExpNode* node = (*alternatives_)[`1`].node();
2101	RegExpNode* replacement = node->FilterOneByte(depth - `1`);
2102	if (replacement == NULL) return set_replacement(NULL);
2103	(*alternatives_)[`1`].set_node(replacement);
2104
2105	RegExpNode* neg_node = (*alternatives_)[`0`].node();
2106	RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - `1`);
2107	// If the negative lookahead is always going to fail then
2108	// we don't need to check it.
2109	if (neg_replacement == NULL) return set_replacement(replacement);
2110	(*alternatives_)[`0`].set_node(neg_replacement);
2111	return set_replacement(this);
2112	}
2113
2114	void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2115	RegExpCompiler* compiler,
2116	intptr_t characters_filled_in,
2117	bool not_at_start) {
2118	if (body_can_be_zero_length_ \|\| info()->visited) return;
2119	VisitMarker marker(info());
2120	return ChoiceNode::GetQuickCheckDetails(details, compiler,
2121	characters_filled_in, not_at_start);
2122	}
2123
2124	void LoopChoiceNode::FillInBMInfo(intptr_t offset,
2125	intptr_t budget,
2126	BoyerMooreLookahead* bm,
2127	bool not_at_start) {
2128	if (body_can_be_zero_length_ \|\| budget <= `0`) {
2129	bm->SetRest(offset);
2130	SaveBMInfo(bm, not_at_start, offset);
2131	return;
2132	}
2133	ChoiceNode::FillInBMInfo(offset, budget - `1`, bm, not_at_start);
2134	SaveBMInfo(bm, not_at_start, offset);
2135	}
2136
2137	void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2138	RegExpCompiler* compiler,
2139	intptr_t characters_filled_in,
2140	bool not_at_start) {
2141	not_at_start = (not_at_start \|\| not_at_start_);
2142	intptr_t choice_count = alternatives_->length();
2143	ASSERT(choice_count > `0`);
2144	(*alternatives_)[`0`].node()->GetQuickCheckDetails(
2145	details, compiler, characters_filled_in, not_at_start);
2146	for (intptr_t i = `1`; i < choice_count; i++) {
2147	QuickCheckDetails new_details(details->characters());
2148	RegExpNode* node = (*alternatives_)[i].node();
2149	node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in,
2150	not_at_start);
2151	// Here we merge the quick match details of the two branches.
2152	details->Merge(&new_details, characters_filled_in);
2153	}
2154	}
2155
2156	// Check for [0-9A-Z_a-z].
2157	static void EmitWordCheck(RegExpMacroAssembler* assembler,
2158	BlockLabel* word,
2159	BlockLabel* non_word,
2160	bool fall_through_on_word) {
2161	if (assembler->CheckSpecialCharacterClass(
2162	fall_through_on_word ? `'w'` : `'W'`,
2163	fall_through_on_word ? non_word : word)) {
2164	// Optimized implementation available.
2165	return;
2166	}
2167	assembler->CheckCharacterGT(`'z'`, non_word);
2168	assembler->CheckCharacterLT(`'0'`, non_word);
2169	assembler->CheckCharacterGT(`'a'` - `1`, word);
2170	assembler->CheckCharacterLT(`'9'` + `1`, word);
2171	assembler->CheckCharacterLT(`'A'`, non_word);
2172	assembler->CheckCharacterLT(`'Z'` + `1`, word);
2173	if (fall_through_on_word) {
2174	assembler->CheckNotCharacter(`'_'`, non_word);
2175	} else {
2176	assembler->CheckCharacter(`'_'`, word);
2177	}
2178	}
2179
2180	// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2181	// that matches newline or the start of input).
2182	static void EmitHat(RegExpCompiler* compiler,
2183	RegExpNode* on_success,
2184	Trace* trace) {
2185	RegExpMacroAssembler* assembler = compiler->macro_assembler();
2186	// We will be loading the previous character into the current character
2187	// register.
2188	Trace new_trace(*trace);
2189	new_trace.InvalidateCurrentCharacter();
2190
2191	BlockLabel ok;
2192	if (new_trace.cp_offset() == `0`) {
2193	// The start of input counts as a newline in this context, so skip to
2194	// ok if we are at the start.
2195	assembler->CheckAtStart(&ok);
2196	}
2197	// We already checked that we are not at the start of input so it must be
2198	// OK to load the previous character.
2199	assembler->LoadCurrentCharacter(new_trace.cp_offset() - `1`,
2200	new_trace.backtrack(), false);
2201	if (!assembler->CheckSpecialCharacterClass(`'n'`, new_trace.backtrack())) {
2202	// Newline means \n, \r, 0x2028 or 0x2029.
2203	if (!compiler->one_byte()) {
2204	assembler->CheckCharacterAfterAnd(`0x2028`, `0xfffe`, &ok);
2205	}
2206	assembler->CheckCharacter(`'\n'`, &ok);
2207	assembler->CheckNotCharacter(`'\r'`, new_trace.backtrack());
2208	}
2209	assembler->BindBlock(&ok);
2210	on_success->Emit(compiler, &new_trace);
2211	}
2212
2213	// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2214	void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2215	RegExpMacroAssembler* assembler = compiler->macro_assembler();
2216	Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2217	bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2218	BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2219	if (lookahead == NULL) {
2220	intptr_t eats_at_least =
2221	Utils::Minimum(kMaxLookaheadForBoyerMoore,
2222	EatsAtLeast(kMaxLookaheadForBoyerMoore, kRecursionBudget,
2223	not_at_start));
2224	if (eats_at_least >= `1`) {
2225	BoyerMooreLookahead* bm =
2226	new (Z) BoyerMooreLookahead (eats_at_least, compiler, Z);
2227	FillInBMInfo(`0`, kRecursionBudget, bm, not_at_start);
2228	if (bm->at(`0`)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE;
2229	if (bm->at(`0`)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2230	}
2231	} else {
2232	if (lookahead->at(`0`)->is_non_word())
2233	next_is_word_character = Trace::FALSE_VALUE;
2234	if (lookahead->at(`0`)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2235	}
2236	bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
2237	if (next_is_word_character == Trace::UNKNOWN) {
2238	BlockLabel before_non_word;
2239	BlockLabel before_word;
2240	if (trace->characters_preloaded() != `1`) {
2241	assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2242	}
2243	// Fall through on non-word.
2244	EmitWordCheck(assembler, &before_word, &before_non_word, false);
2245	// Next character is not a word character.
2246	assembler->BindBlock(&before_non_word);
2247	BlockLabel ok;
2248	// Backtrack on \B (non-boundary check) if previous is a word,
2249	// since we know next is not* a word and this would be a boundary.*
2250	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2251
2252	if (!assembler->IsClosed()) {
2253	assembler->GoTo(&ok);
2254	}
2255
2256	assembler->BindBlock(&before_word);
2257	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2258	assembler->BindBlock(&ok);
2259	} else if (next_is_word_character == Trace::TRUE_VALUE) {
2260	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2261	} else {
2262	ASSERT(next_is_word_character == Trace::FALSE_VALUE);
2263	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2264	}
2265	}
2266
2267	void AssertionNode::BacktrackIfPrevious(
2268	RegExpCompiler* compiler,
2269	Trace* trace,
2270	AssertionNode::IfPrevious backtrack_if_previous) {
2271	RegExpMacroAssembler* assembler = compiler->macro_assembler();
2272	Trace new_trace(*trace);
2273	new_trace.InvalidateCurrentCharacter();
2274
2275	BlockLabel fall_through, dummy;
2276
2277	BlockLabel* non_word = backtrack_if_previous == kIsNonWord
2278	? new_trace.backtrack()
2279	: &fall_through;
2280	BlockLabel* word = backtrack_if_previous == kIsNonWord
2281	? &fall_through
2282	: new_trace.backtrack();
2283
2284	if (new_trace.cp_offset() == `0`) {
2285	// The start of input counts as a non-word character, so the question is
2286	// decided if we are at the start.
2287	assembler->CheckAtStart(non_word);
2288	}
2289	// We already checked that we are not at the start of input so it must be
2290	// OK to load the previous character.
2291	assembler->LoadCurrentCharacter(new_trace.cp_offset() - `1`, &dummy, false);
2292	EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
2293
2294	assembler->BindBlock(&fall_through);
2295	on_success()->Emit(compiler, &new_trace);
2296	}
2297
2298	void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2299	RegExpCompiler* compiler,
2300	intptr_t filled_in,
2301	bool not_at_start) {
2302	if (assertion_type_ == AT_START && not_at_start) {
2303	details->set_cannot_match();
2304	return;
2305	}
2306	return on_success()->GetQuickCheckDetails(details, compiler, filled_in,
2307	not_at_start);
2308	}
2309
2310	void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2311	RegExpMacroAssembler* assembler = compiler->macro_assembler();
2312	switch (assertion_type_) {
2313	case AT_END: {
2314	BlockLabel ok;
2315	assembler->CheckPosition(trace->cp_offset(), &ok);
2316	assembler->GoTo(trace->backtrack());
2317	assembler->BindBlock(&ok);
2318	break;
2319	}
2320	case AT_START: {
2321	if (trace->at_start() == Trace::FALSE_VALUE) {
2322	assembler->GoTo(trace->backtrack());
2323	return;
2324	}
2325	if (trace->at_start() == Trace::UNKNOWN) {
2326	assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
2327	Trace at_start_trace = *trace;
2328	at_start_trace.set_at_start(Trace::TRUE_VALUE);
2329	on_success()->Emit(compiler, &at_start_trace);
2330	return;
2331	}
2332	} break;
2333	case AFTER_NEWLINE:
2334	EmitHat(compiler, on_success(), trace);
2335	return;
2336	case AT_BOUNDARY:
2337	case AT_NON_BOUNDARY: {
2338	EmitBoundaryCheck(compiler, trace);
2339	return;
2340	}
2341	}
2342	on_success()->Emit(compiler, trace);
2343	}
2344
2345	static bool DeterminedAlready(QuickCheckDetails* quick_check, intptr_t offset) {
2346	if (quick_check == NULL) return false;
2347	if (offset >= quick_check->characters()) return false;
2348	return quick_check->positions(offset)->determines_perfectly;
2349	}
2350
2351	static void UpdateBoundsCheck(intptr_t index, intptr_t* checked_up_to) {
2352	if (index > *checked_up_to) {
2353	*checked_up_to = index;
2354	}
2355	}
2356
2357	// We call this repeatedly to generate code for each pass over the text node.
2358	// The passes are in increasing order of difficulty because we hope one
2359	// of the first passes will fail in which case we are saved the work of the
2360	// later passes. for example for the case independent regexp /%[asdfghjkl]a/
2361	// we will check the '%' in the first pass, the case independent 'a' in the
2362	// second pass and the character class in the last pass.
2363	//
2364	// The passes are done from right to left, so for example to test for /bar/
2365	// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2366	// and then a 'b' with offset 0. This means we can avoid the end-of-input
2367	// bounds check most of the time. In the example we only need to check for
2368	// end-of-input when loading the putative 'r'.
2369	//
2370	// A slight complication involves the fact that the first character may already
2371	// be fetched into a register by the previous node. In this case we want to
2372	// do the test for that character first. We do this in separate passes. The
2373	// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2374	// pass has been performed then subsequent passes will have true in
2375	// first_element_checked to indicate that that character does not need to be
2376	// checked again.
2377	//
2378	// In addition to all this we are passed a Trace, which can
2379	// contain an AlternativeGeneration object. In this AlternativeGeneration
2380	// object we can see details of any quick check that was already passed in
2381	// order to get to the code we are now generating. The quick check can involve
2382	// loading characters, which means we do not need to recheck the bounds
2383	// up to the limit the quick check already checked. In addition the quick
2384	// check can have involved a mask and compare operation which may simplify
2385	// or obviate the need for further checks at some character positions.
2386	void TextNode::TextEmitPass(RegExpCompiler* compiler,
2387	TextEmitPassType pass,
2388	bool preloaded,
2389	Trace* trace,
2390	bool first_element_checked,
2391	intptr_t* checked_up_to) {
2392	RegExpMacroAssembler* assembler = compiler->macro_assembler();
2393	bool one_byte = compiler->one_byte();
2394	BlockLabel* backtrack = trace->backtrack();
2395	QuickCheckDetails* quick_check = trace->quick_check_performed();
2396	intptr_t element_count = elms_->length();
2397	intptr_t backward_offset = read_backward() ? -Length() : `0`;
2398	for (intptr_t i = preloaded ? `0` : element_count - `1`; i >= `0`; i--) {
2399	TextElement elm = elms_->At(i);
2400	intptr_t cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
2401	if (elm.text_type() == TextElement::ATOM) {
2402	ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
2403	for (intptr_t j = preloaded ? `0` : quarks->length() - `1`; j >= `0`; j--) {
2404	if (SkipPass(pass, elm.atom()->ignore_case())) continue;
2405	if (first_element_checked && i == `0` && j == `0`) continue;
2406	if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
2407	EmitCharacterFunction* emit_function = NULL;
2408	uint16_t quark = quarks->At(j);
2409	if (elm.atom()->ignore_case()) {
2410	// Everywhere else we assume that a non-Latin-1 character cannot match
2411	// a Latin-1 character. Avoid the cases where this is assumption is
2412	// invalid by using the Latin1 equivalent instead.
2413	quark = Latin1::TryConvertToLatin1(quark);
2414	}
2415	switch (pass) {
2416	case NON_LATIN1_MATCH:
2417	ASSERT(one_byte);
2418	if (quark > Symbols::kMaxOneCharCodeSymbol) {
2419	assembler->GoTo(backtrack);
2420	return;
2421	}
2422	break;
2423	case NON_LETTER_CHARACTER_MATCH:
2424	emit_function = &EmitAtomNonLetter;
2425	break;
2426	case SIMPLE_CHARACTER_MATCH:
2427	emit_function = &EmitSimpleCharacter;
2428	break;
2429	case CASE_CHARACTER_MATCH:
2430	emit_function = &EmitAtomLetter;
2431	break;
2432	default:
2433	break;
2434	}
2435	if (emit_function != NULL) {
2436	const bool bounds_check =
2437	*checked_up_to < (cp_offset + j) \|\| read_backward();
2438	bool bound_checked =
2439	emit_function(Z, compiler, quarks->At(j), backtrack,
2440	cp_offset + j, bounds_check, preloaded);
2441	if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2442	}
2443	}
2444	} else {
2445	ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
2446	if (pass == CHARACTER_CLASS_MATCH) {
2447	if (first_element_checked && i == `0`) continue;
2448	if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
2449	RegExpCharacterClass* cc = elm.char_class();
2450	bool bounds_check = *checked_up_to < cp_offset \|\| read_backward();
2451	EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
2452	bounds_check, preloaded, Z);
2453	UpdateBoundsCheck(cp_offset, checked_up_to);
2454	}
2455	}
2456	}
2457	}
2458
2459	intptr_t TextNode::Length() {
2460	TextElement elm = elms_->Last();
2461	ASSERT(elm.cp_offset() >= `0`);
2462	return elm.cp_offset() + elm.length();
2463	}
2464
2465	bool TextNode::SkipPass(intptr_t intptr_t_pass, bool ignore_case) {
2466	TextEmitPassType pass = static_cast<TextEmitPassType>(intptr_t_pass);
2467	if (ignore_case) {
2468	return pass == SIMPLE_CHARACTER_MATCH;
2469	} else {
2470	return pass == NON_LETTER_CHARACTER_MATCH \|\| pass == CASE_CHARACTER_MATCH;
2471	}
2472	}
2473
2474	TextNode* TextNode::CreateForCharacterRanges(
2475	ZoneGrowableArray<CharacterRange>* ranges,
2476	bool read_backward,
2477	RegExpNode* on_success,
2478	RegExpFlags flags) {
2479	ASSERT(ranges != nullptr);
2480	ZoneGrowableArray<TextElement>* elms = new ZoneGrowableArray<TextElement>(`1`);
2481	elms->Add(TextElement::CharClass(new RegExpCharacterClass (ranges, flags)));
2482	return new TextNode (elms, read_backward, on_success);
2483	}
2484
2485	TextNode* TextNode::CreateForSurrogatePair(CharacterRange lead,
2486	CharacterRange trail,
2487	bool read_backward,
2488	RegExpNode* on_success,
2489	RegExpFlags flags) {
2490	auto lead_ranges = CharacterRange::List(on_success->zone(), lead);
2491	auto trail_ranges = CharacterRange::List(on_success->zone(), trail);
2492	auto elms = new ZoneGrowableArray<TextElement>(`2`);
2493
2494	elms->Add(
2495	TextElement::CharClass(new RegExpCharacterClass (lead_ranges, flags)));
2496	elms->Add(
2497	TextElement::CharClass(new RegExpCharacterClass (trail_ranges, flags)));
2498
2499	return new TextNode (elms, read_backward, on_success);
2500	}
2501
2502	// This generates the code to match a text node. A text node can contain
2503	// straight character sequences (possibly to be matched in a case-independent
2504	// way) and character classes. For efficiency we do not do this in a single
2505	// pass from left to right. Instead we pass over the text node several times,
2506	// emitting code for some character positions every time. See the comment on
2507	// TextEmitPass for details.
2508	void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2509	LimitResult limit_result = LimitVersions(compiler, trace);
2510	if (limit_result == DONE) return;
2511	ASSERT(limit_result == CONTINUE);
2512
2513	if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2514	compiler->SetRegExpTooBig();
2515	return;
2516	}
2517
2518	if (compiler->one_byte()) {
2519	intptr_t dummy = `0`;
2520	TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
2521	}
2522
2523	bool first_elt_done = false;
2524	intptr_t bound_checked_to = trace->cp_offset() - `1`;
2525	bound_checked_to += trace->bound_checked_up_to();
2526
2527	// If a character is preloaded into the current character register then
2528	// check that now.
2529	if (trace->characters_preloaded() == `1`) {
2530	for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
2531	TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
2532	false, &bound_checked_to);
2533	}
2534	first_elt_done = true;
2535	}
2536
2537	for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
2538	TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
2539	first_elt_done, &bound_checked_to);
2540	}
2541
2542	Trace successor_trace(*trace);
2543	// If we advance backward, we may end up at the start.
2544	successor_trace.AdvanceCurrentPositionInTrace(
2545	read_backward() ? -Length() : Length(), compiler);
2546	successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
2547	: Trace::FALSE_VALUE);
2548	RecursionCheck rc(compiler);
2549	on_success()->Emit(compiler, &successor_trace);
2550	}
2551
2552	void Trace::InvalidateCurrentCharacter() {
2553	characters_preloaded_ = `0`;
2554	}
2555
2556	void Trace::AdvanceCurrentPositionInTrace(intptr_t by,
2557	RegExpCompiler* compiler) {
2558	// We don't have an instruction for shifting the current character register
2559	// down or for using a shifted value for anything so lets just forget that
2560	// we preloaded any characters into it.
2561	characters_preloaded_ = `0`;
2562	// Adjust the offsets of the quick check performed information. This
2563	// information is used to find out what we already determined about the
2564	// characters by means of mask and compare.
2565	quick_check_performed_.Advance(by, compiler->one_byte());
2566	cp_offset_ += by;
2567	if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2568	compiler->SetRegExpTooBig();
2569	cp_offset_ = `0`;
2570	}
2571	bound_checked_up_to_ =
2572	Utils::Maximum(static_cast<intptr_t>(`0`), bound_checked_up_to_ - by);
2573	}
2574
2575	void TextNode::MakeCaseIndependent(bool is_one_byte) {
2576	intptr_t element_count = elms_->length();
2577	for (intptr_t i = `0`; i < element_count; i++) {
2578	TextElement elm = elms_->At(i);
2579	if (elm.text_type() == TextElement::CHAR_CLASS) {
2580	RegExpCharacterClass* cc = elm.char_class();
2581	bool case_equivalents_already_added =
2582	cc->flags().NeedsUnicodeCaseEquivalents();
2583	if (cc->flags().IgnoreCase() && !case_equivalents_already_added) {
2584	// None of the standard character classes is different in the case
2585	// independent case and it slows us down if we don't know that.
2586	if (cc->is_standard()) continue;
2587	CharacterRange::AddCaseEquivalents(cc->ranges(), is_one_byte, Z);
2588	}
2589	}
2590	}
2591	}
2592
2593	intptr_t TextNode::GreedyLoopTextLength() {
2594	TextElement elm = elms_->At(elms_->length() - `1`);
2595	return elm.cp_offset() + elm.length();
2596	}
2597
2598	RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
2599	RegExpCompiler* compiler) {
2600	if (read_backward()) return nullptr;
2601	if (elms_->length() != `1`) return NULL;
2602	TextElement elm = elms_->At(`0`);
2603	if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
2604	RegExpCharacterClass* node = elm.char_class();
2605	ZoneGrowableArray<CharacterRange>* ranges = node->ranges();
2606	if (!CharacterRange::IsCanonical(ranges)) {
2607	CharacterRange::Canonicalize(ranges);
2608	}
2609	if (node->is_negated()) {
2610	return ranges->length() == `0` ? on_success() : NULL;
2611	}
2612	if (ranges->length() != `1`) return NULL;
2613	uint32_t max_char;
2614	if (compiler->one_byte()) {
2615	max_char = Symbols::kMaxOneCharCodeSymbol;
2616	} else {
2617	max_char = Utf16::kMaxCodeUnit;
2618	}
2619	return ranges->At(`0`).IsEverything(max_char) ? on_success() : NULL;
2620	}
2621
2622	// Finds the fixed match length of a sequence of nodes that goes from
2623	// this alternative and back to this choice node. If there are variable
2624	// length nodes or other complications in the way then return a sentinel
2625	// value indicating that a greedy loop cannot be constructed.
2626	intptr_t ChoiceNode::GreedyLoopTextLengthForAlternative(
2627	const GuardedAlternative* alternative) {
2628	intptr_t length = `0`;
2629	RegExpNode* node = alternative->node();
2630	// Later we will generate code for all these text nodes using recursion
2631	// so we have to limit the max number.
2632	intptr_t recursion_depth = `0`;
2633	while (node != this) {
2634	if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2635	return kNodeIsTooComplexForGreedyLoops;
2636	}
2637	intptr_t node_length = node->GreedyLoopTextLength();
2638	if (node_length == kNodeIsTooComplexForGreedyLoops) {
2639	return kNodeIsTooComplexForGreedyLoops;
2640	}
2641	length += node_length;
2642	SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2643	node = seq_node->on_success();
2644	}
2645	return read_backward() ? -length : length;
2646	}
2647
2648	void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2649	ASSERT(loop_node_ == NULL);
2650	AddAlternative(alt);
2651	loop_node_ = alt.node();
2652	}
2653
2654	void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2655	ASSERT(continue_node_ == NULL);
2656	AddAlternative(alt);
2657	continue_node_ = alt.node();
2658	}
2659
2660	void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2661	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2662	if (trace->stop_node() == this) {
2663	// Back edge of greedy optimized loop node graph.
2664	intptr_t text_length =
2665	GreedyLoopTextLengthForAlternative(&alternatives_->At(`0`));
2666	ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2667	// Update the counter-based backtracking info on the stack. This is an
2668	// optimization for greedy loops (see below).
2669	ASSERT(trace->cp_offset() == text_length);
2670	macro_assembler->AdvanceCurrentPosition(text_length);
2671	macro_assembler->GoTo(trace->loop_label());
2672	return;
2673	}
2674	ASSERT(trace->stop_node() == NULL);
2675	if (!trace->is_trivial()) {
2676	trace->Flush(compiler, this);
2677	return;
2678	}
2679	ChoiceNode::Emit(compiler, trace);
2680	}
2681
2682	intptr_t ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
2683	intptr_t eats_at_least) {
2684	intptr_t preload_characters =
2685	Utils::Minimum(static_cast<intptr_t>(`4`), eats_at_least);
2686	if (compiler->macro_assembler()->CanReadUnaligned()) {
2687	bool one_byte = compiler->one_byte();
2688	if (one_byte) {
2689	if (preload_characters > `4`) preload_characters = `4`;
2690	// We can't preload 3 characters because there is no machine instruction
2691	// to do that. We can't just load 4 because we could be reading
2692	// beyond the end of the string, which could cause a memory fault.
2693	if (preload_characters == `3`) preload_characters = `2`;
2694	} else {
2695	if (preload_characters > `2`) preload_characters = `2`;
2696	}
2697	} else {
2698	if (preload_characters > `1`) preload_characters = `1`;
2699	}
2700	return preload_characters;
2701	}
2702
2703	// This structure is used when generating the alternatives in a choice node. It
2704	// records the way the alternative is being code generated.
2705	struct AlternativeGeneration {
2706	AlternativeGeneration()
2707	: possible_success (),
2708	expects_preload(false),
2709	after (),
2710	quick_check_details () {}
2711	BlockLabel possible_success;
2712	bool expects_preload;
2713	BlockLabel after;
2714	QuickCheckDetails quick_check_details;
2715	};
2716
2717	// Creates a list of AlternativeGenerations. If the list has a reasonable
2718	// size then it is on the stack, otherwise the excess is on the heap.
2719	class AlternativeGenerationList {
2720	public:
2721	explicit AlternativeGenerationList(intptr_t count) : count_(count) {
2722	ASSERT(count >= `0`);
2723	if (count > kAFew) {
2724	excess_alt_gens_.reset(new AlternativeGeneration[count - kAFew]);
2725	}
2726	}
2727
2728	AlternativeGeneration* at(intptr_t i) {
2729	ASSERT(`0` <= i);
2730	ASSERT(i < count_);
2731	if (i < kAFew) {
2732	return &a_few_alt_gens_[i];
2733	}
2734	return &excess_alt_gens_[i - kAFew];
2735	}
2736
2737	private:
2738	static const intptr_t kAFew = `10`;
2739
2740	intptr_t count_;
2741	AlternativeGeneration a_few_alt_gens_[kAFew];
2742	std::unique_ptr<AlternativeGeneration[]> excess_alt_gens_;
2743
2744	DISALLOW_ALLOCATION();
2745	DISALLOW_COPY_AND_ASSIGN(AlternativeGenerationList);
2746	};
2747
2748	static const int32_t kRangeEndMarker = Utf::kMaxCodePoint + `1`;
2749
2750	// The '2' variant is inclusive from and exclusive to.
2751	// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
2752	// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
2753	// 0x180E has been removed from Unicode's Zs category and thus
2754	// from ECMAScript's WhiteSpace category as of Unicode 6.3.
2755	static const int32_t kSpaceRanges[] = {
2756	`'\t'`, `'\r'` + `1`, `' '`, `' '` + `1`, `0x00A0`, `0x00A1`, `0x1680`,
2757	`0x1681`, `0x2000`, `0x200B`, `0x2028`, `0x202A`, `0x202F`, `0x2030`,
2758	`0x205F`, `0x2060`, `0x3000`, `0x3001`, `0xFEFF`, `0xFF00`, kRangeEndMarker};
2759	static const intptr_t kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
2760	static const int32_t kWordRanges[] = {
2761	`'0'`, `'9'` + `1`, `'A'`, `'Z'` + `1`, `'_'`, `'_'` + `1`, `'a'`, `'z'` + `1`, kRangeEndMarker};
2762	static const intptr_t kWordRangeCount = ARRAY_SIZE(kWordRanges);
2763	static const int32_t kDigitRanges[] = {`'0'`, `'9'` + `1`, kRangeEndMarker};
2764	static const intptr_t kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
2765	static const int32_t kSurrogateRanges[] = {`0xd800`, `0xe000`, kRangeEndMarker};
2766	static const intptr_t kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges);
2767	static const int32_t kLineTerminatorRanges[] = {
2768	`0x000A`, `0x000B`, `0x000D`, `0x000E`, `0x2028`, `0x202A`, kRangeEndMarker};
2769	static const intptr_t kLineTerminatorRangeCount =
2770	ARRAY_SIZE(kLineTerminatorRanges);
2771
2772	void BoyerMoorePositionInfo::Set(intptr_t character) {
2773	SetInterval(Interval (character, character));
2774	}
2775
2776	void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
2777	s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
2778	w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
2779	d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
2780	surrogate_ =
2781	AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
2782	if (interval.to() - interval.from() >= kMapSize - `1`) {
2783	if (map_count_ != kMapSize) {
2784	map_count_ = kMapSize;
2785	for (intptr_t i = `0`; i < kMapSize; i++)
2786	(map_)[i] = true*;
2787	}
2788	return;
2789	}
2790	for (intptr_t i = interval.from(); i <= interval.to(); i++) {
2791	intptr_t mod_character = (i & kMask);
2792	if (!map_->At(mod_character)) {
2793	map_count_++;
2794	(map_)[mod_character] = true*;
2795	}
2796	if (map_count_ == kMapSize) return;
2797	}
2798	}
2799
2800	void BoyerMoorePositionInfo::SetAll() {
2801	s_ = w_ = d_ = kLatticeUnknown;
2802	if (map_count_ != kMapSize) {
2803	map_count_ = kMapSize;
2804	for (intptr_t i = `0`; i < kMapSize; i++)
2805	(map_)[i] = true*;
2806	}
2807	}
2808
2809	BoyerMooreLookahead::BoyerMooreLookahead(intptr_t length,
2810	RegExpCompiler* compiler,
2811	Zone* zone)
2812	: length_(length), compiler_(compiler) {
2813	if (compiler->one_byte()) {
2814	max_char_ = Symbols::kMaxOneCharCodeSymbol;
2815	} else {
2816	max_char_ = Utf16::kMaxCodeUnit;
2817	}
2818	bitmaps_ = new (zone) ZoneGrowableArray<BoyerMoorePositionInfo*>(length);
2819	for (intptr_t i = `0`; i < length; i++) {
2820	bitmaps_->Add(new (zone) BoyerMoorePositionInfo (zone));
2821	}
2822	}
2823
2824	// Find the longest range of lookahead that has the fewest number of different
2825	// characters that can occur at a given position. Since we are optimizing two
2826	// different parameters at once this is a tradeoff.
2827	bool BoyerMooreLookahead::FindWorthwhileInterval(intptr_t* from, intptr_t* to) {
2828	intptr_t biggest_points = `0`;
2829	// If more than 32 characters out of 128 can occur it is unlikely that we can
2830	// be lucky enough to step forwards much of the time.
2831	const intptr_t kMaxMax = `32`;
2832	for (intptr_t max_number_of_chars = `4`; max_number_of_chars < kMaxMax;
2833	max_number_of_chars *= `2`) {
2834	biggest_points =
2835	FindBestInterval(max_number_of_chars, biggest_points, from, to);
2836	}
2837	if (biggest_points == `0`) return false;
2838	return true;
2839	}
2840
2841	// Find the highest-points range between 0 and length_ where the character
2842	// information is not too vague. 'Too vague' means that there are more than
2843	// max_number_of_chars that can occur at this position. Calculates the number
2844	// of points as the product of width-of-the-range and
2845	// probability-of-finding-one-of-the-characters, where the probability is
2846	// calculated using the frequency distribution of the sample subject string.
2847	intptr_t BoyerMooreLookahead::FindBestInterval(intptr_t max_number_of_chars,
2848	intptr_t old_biggest_points,
2849	intptr_t* from,
2850	intptr_t* to) {
2851	intptr_t biggest_points = old_biggest_points;
2852	static const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2853	for (intptr_t i = `0`; i < length_;) {
2854	while (i < length_ && Count(i) > max_number_of_chars)
2855	i++;
2856	if (i == length_) break;
2857	intptr_t remembered_from = i;
2858	bool union_map[kSize];
2859	for (intptr_t j = `0`; j < kSize; j++)
2860	union_map[j] = false;
2861	while (i < length_ && Count(i) <= max_number_of_chars) {
2862	BoyerMoorePositionInfo* map = bitmaps_->At(i);
2863	for (intptr_t j = `0`; j < kSize; j++)
2864	union_map[j] \|= map->at(j);
2865	i++;
2866	}
2867	intptr_t frequency = `0`;
2868	for (intptr_t j = `0`; j < kSize; j++) {
2869	if (union_map[j]) {
2870	// Add 1 to the frequency to give a small per-character boost for
2871	// the cases where our sampling is not good enough and many
2872	// characters have a frequency of zero. This means the frequency
2873	// can theoretically be up to 2kSize though we treat it mostly as*
2874	// a fraction of kSize.
2875	frequency += compiler_->frequency_collator()->Frequency(j) + `1`;
2876	}
2877	}
2878	// We use the probability of skipping times the distance we are skipping to
2879	// judge the effectiveness of this. Actually we have a cut-off: By
2880	// dividing by 2 we switch off the skipping if the probability of skipping
2881	// is less than 50%. This is because the multibyte mask-and-compare
2882	// skipping in quickcheck is more likely to do well on this case.
2883	bool in_quickcheck_range =
2884	((i - remembered_from < `4`) \|\|
2885	(compiler_->one_byte() ? remembered_from <= `4` : remembered_from <= `2`));
2886	// Called 'probability' but it is only a rough estimate and can actually
2887	// be outside the 0-kSize range.
2888	intptr_t probability =
2889	(in_quickcheck_range ? kSize / `2` : kSize) - frequency;
2890	intptr_t points = (i - remembered_from) * probability;
2891	if (points > biggest_points) {
2892	*from = remembered_from;
2893	*to = i - `1`;
2894	biggest_points = points;
2895	}
2896	}
2897	return biggest_points;
2898	}
2899
2900	// Take all the characters that will not prevent a successful match if they
2901	// occur in the subject string in the range between min_lookahead and
2902	// max_lookahead (inclusive) measured from the current position. If the
2903	// character at max_lookahead offset is not one of these characters, then we
2904	// can safely skip forwards by the number of characters in the range.
2905	intptr_t BoyerMooreLookahead::GetSkipTable(
2906	intptr_t min_lookahead,
2907	intptr_t max_lookahead,
2908	const TypedData& boolean_skip_table) {
2909	const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2910
2911	const intptr_t kSkipArrayEntry = `0`;
2912	const intptr_t kDontSkipArrayEntry = `1`;
2913
2914	for (intptr_t i = `0`; i < kSize; i++) {
2915	boolean_skip_table.SetUint8(i, kSkipArrayEntry);
2916	}
2917	intptr_t skip = max_lookahead + `1` - min_lookahead;
2918
2919	for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
2920	BoyerMoorePositionInfo* map = bitmaps_->At(i);
2921	for (intptr_t j = `0`; j < kSize; j++) {
2922	if (map->at(j)) {
2923	boolean_skip_table.SetUint8(j, kDontSkipArrayEntry);
2924	}
2925	}
2926	}
2927
2928	return skip;
2929	}
2930
2931	// See comment above on the implementation of GetSkipTable.
2932	void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
2933	const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2934
2935	intptr_t min_lookahead = `0`;
2936	intptr_t max_lookahead = `0`;
2937
2938	if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
2939
2940	bool found_single_character = false;
2941	intptr_t single_character = `0`;
2942	for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
2943	BoyerMoorePositionInfo* map = bitmaps_->At(i);
2944	if (map->map_count() > `1` \|\|
2945	(found_single_character && map->map_count() != `0`)) {
2946	found_single_character = false;
2947	break;
2948	}
2949	for (intptr_t j = `0`; j < kSize; j++) {
2950	if (map->at(j)) {
2951	found_single_character = true;
2952	single_character = j;
2953	break;
2954	}
2955	}
2956	}
2957
2958	intptr_t lookahead_width = max_lookahead + `1` - min_lookahead;
2959
2960	if (found_single_character && lookahead_width == `1` && max_lookahead < `3`) {
2961	// The mask-compare can probably handle this better.
2962	return;
2963	}
2964
2965	if (found_single_character) {
2966	BlockLabel cont, again;
2967	masm->BindBlock(&again);
2968	masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2969	if (max_char_ > kSize) {
2970	masm->CheckCharacterAfterAnd(single_character,
2971	RegExpMacroAssembler::kTableMask, &cont);
2972	} else {
2973	masm->CheckCharacter(single_character, &cont);
2974	}
2975	masm->AdvanceCurrentPosition(lookahead_width);
2976	masm->GoTo(&again);
2977	masm->BindBlock(&cont);
2978	return;
2979	}
2980
2981	const TypedData& boolean_skip_table = TypedData::ZoneHandle(
2982	compiler_->zone(),
2983	TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));
2984	intptr_t skip_distance =
2985	GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table);
2986	ASSERT(skip_distance != `0`);
2987
2988	BlockLabel cont, again;
2989
2990	masm->BindBlock(&again);
2991	masm->CheckPreemption(/is_backtrack=/false);
2992	masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2993	masm->CheckBitInTable(boolean_skip_table, &cont);
2994	masm->AdvanceCurrentPosition(skip_distance);
2995	masm->GoTo(&again);
2996	masm->BindBlock(&cont);
2997
2998	return;
2999	}
3000
3001	/ Code generation for choice nodes.*
3002	*
3003	* We generate quick checks that do a mask and compare to eliminate a
3004	* choice. If the quick check succeeds then it jumps to the continuation to
3005	* do slow checks and check subsequent nodes. If it fails (the common case)
3006	* it falls through to the next choice.
3007	*
3008	* Here is the desired flow graph. Nodes directly below each other imply
3009	* fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3010	* 3 doesn't have a quick check so we have to call the slow check.
3011	* Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3012	* regexp continuation is generated directly after the Sn node, up to the
3013	* next GoTo if we decide to reuse some already generated code. Some
3014	* nodes expect preload_characters to be preloaded into the current
3015	* character register. R nodes do this preloading. Vertices are marked
3016	* F for failures and S for success (possible success in the case of quick
3017	* nodes). L, V, < and > are used as arrow heads.
3018	*
3019	* ----------> R
3020	* \|
3021	* V
3022	* Q1 -----> S1
3023	* \| S /
3024	* F\| /
3025	* \| F/
3026	* \| /
3027	* \| R
3028	* \| /
3029	* V L
3030	* Q2 -----> S2
3031	* \| S /
3032	* F\| /
3033	* \| F/
3034	* \| /
3035	* \| R
3036	* \| /
3037	* V L
3038	* S3
3039	* \|
3040	* F\|
3041	* \|
3042	* R
3043	* \|
3044	* backtrack V
3045	* <----------Q4
3046	* \ F \|
3047	* \ \|S
3048	* \ F V
3049	* \-----S4
3050	*
3051	* For greedy loops we push the current position, then generate the code that
3052	* eats the input specially in EmitGreedyLoop. The other choice (the
3053	* continuation) is generated by the normal code in EmitChoices, and steps back
3054	* in the input to the starting position when it fails to match. The loop code
3055	* looks like this (U is the unwind code that steps back in the greedy loop).
3056	*
3057	* _____
3058	* / \
3059	* V \|
3060	* ----------> S1 \|
3061	* /\| \|
3062	* / \|S \|
3063	* F/ \_____/
3064	* /
3065	* \|<-----
3066	* \| \
3067	* V \|S
3068	* Q2 ---> U----->backtrack
3069	* \| F /
3070	* S\| /
3071	* V F /
3072	* S2--/
3073	*/
3074
3075	GreedyLoopState::GreedyLoopState(bool not_at_start) {
3076	counter_backtrack_trace_.set_backtrack(&label_);
3077	if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3078	}
3079
3080	void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3081	#ifdef DEBUG
3082	intptr_t choice_count = alternatives_->length();
3083	for (intptr_t i = `0`; i < choice_count - `1`; i++) {
3084	GuardedAlternative alternative = alternatives_->At(i);
3085	ZoneGrowableArray<Guard> guards = alternative.guards();
3086	intptr_t guard_count = (guards == NULL) ? `0` : guards->length();
3087	for (intptr_t j = `0`; j < guard_count; j++) {
3088	ASSERT(!trace->mentions_reg(guards->At(j)->reg()));
3089	}
3090	}
3091	#endif
3092	}
3093
3094	void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3095	Trace* current_trace,
3096	PreloadState* state) {
3097	if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3098	// Save some time by looking at most one machine word ahead.
3099	state->eats_at_least_ =
3100	EatsAtLeast(compiler->one_byte() ? `4` : `2`, kRecursionBudget,
3101	current_trace->at_start() == Trace::FALSE_VALUE);
3102	}
3103	state->preload_characters_ =
3104	CalculatePreloadCharacters(compiler, state->eats_at_least_);
3105
3106	state->preload_is_current_ =
3107	(current_trace->characters_preloaded() == state->preload_characters_);
3108	state->preload_has_checked_bounds_ = state->preload_is_current_;
3109	}
3110
3111	void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3112	intptr_t choice_count = alternatives_->length();
3113
3114	if (choice_count == `1` && alternatives_->At(`0`).guards() == nullptr) {
3115	alternatives_->At(`0`).node()->Emit(compiler, trace);
3116	return;
3117	}
3118
3119	AssertGuardsMentionRegisters(trace);
3120
3121	LimitResult limit_result = LimitVersions(compiler, trace);
3122	if (limit_result == DONE) return;
3123	ASSERT(limit_result == CONTINUE);
3124
3125	// For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3126	// other choice nodes we only flush if we are out of code size budget.
3127	if (trace->flush_budget() == `0` && trace->actions() != NULL) {
3128	trace->Flush(compiler, this);
3129	return;
3130	}
3131
3132	RecursionCheck rc(compiler);
3133
3134	PreloadState preload;
3135	preload.init();
3136	GreedyLoopState greedy_loop_state(not_at_start());
3137
3138	intptr_t text_length =
3139	GreedyLoopTextLengthForAlternative(&alternatives_->At(`0`));
3140	AlternativeGenerationList alt_gens(choice_count);
3141
3142	if (choice_count > `1` && text_length != kNodeIsTooComplexForGreedyLoops) {
3143	trace = EmitGreedyLoop(compiler, trace, &alt_gens, &preload,
3144	&greedy_loop_state, text_length);
3145	} else {
3146	// TODO(erikcorry): Delete this. We don't need this label, but it makes us
3147	// match the traces produced pre-cleanup.
3148	BlockLabel second_choice;
3149	compiler->macro_assembler()->BindBlock(&second_choice);
3150
3151	preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3152
3153	EmitChoices(compiler, &alt_gens, `0`, trace, &preload);
3154	}
3155
3156	// At this point we need to generate slow checks for the alternatives where
3157	// the quick check was inlined. We can recognize these because the associated
3158	// label was bound.
3159	intptr_t new_flush_budget = trace->flush_budget() / choice_count;
3160	for (intptr_t i = `0`; i < choice_count; i++) {
3161	AlternativeGeneration* alt_gen = alt_gens.at(i);
3162	Trace new_trace(*trace);
3163	// If there are actions to be flushed we have to limit how many times
3164	// they are flushed. Take the budget of the parent trace and distribute
3165	// it fairly amongst the children.
3166	if (new_trace.actions() != NULL) {
3167	new_trace.set_flush_budget(new_flush_budget);
3168	}
3169	bool next_expects_preload =
3170	i == choice_count - `1` ? false : alt_gens.at(i + `1`)->expects_preload;
3171	EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->At(i),
3172	alt_gen, preload.preload_characters_,
3173	next_expects_preload);
3174	}
3175	}
3176
3177	Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3178	Trace* trace,
3179	AlternativeGenerationList* alt_gens,
3180	PreloadState* preload,
3181	GreedyLoopState* greedy_loop_state,
3182	intptr_t text_length) {
3183	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3184	// Here we have special handling for greedy loops containing only text nodes
3185	// and other simple nodes. These are handled by pushing the current
3186	// position on the stack and then incrementing the current position each
3187	// time around the switch. On backtrack we decrement the current position
3188	// and check it against the pushed value. This avoids pushing backtrack
3189	// information for each iteration of the loop, which could take up a lot of
3190	// space.
3191	ASSERT(trace->stop_node() == NULL);
3192	macro_assembler->PushCurrentPosition();
3193	BlockLabel greedy_match_failed;
3194	Trace greedy_match_trace;
3195	if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3196	greedy_match_trace.set_backtrack(&greedy_match_failed);
3197	BlockLabel loop_label;
3198	macro_assembler->BindBlock(&loop_label);
3199	macro_assembler->CheckPreemption(/is_backtrack=/false);
3200	greedy_match_trace.set_stop_node(this);
3201	greedy_match_trace.set_loop_label(&loop_label);
3202	(*alternatives_)[`0`].node()->Emit(compiler, &greedy_match_trace);
3203	macro_assembler->BindBlock(&greedy_match_failed);
3204
3205	BlockLabel second_choice; // For use in greedy matches.
3206	macro_assembler->BindBlock(&second_choice);
3207
3208	Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
3209
3210	EmitChoices(compiler, alt_gens, `1`, new_trace, preload);
3211
3212	macro_assembler->BindBlock(greedy_loop_state->label());
3213	// If we have unwound to the bottom then backtrack.
3214	macro_assembler->CheckGreedyLoop(trace->backtrack());
3215	// Otherwise try the second priority at an earlier position.
3216	macro_assembler->AdvanceCurrentPosition(-text_length);
3217	macro_assembler->GoTo(&second_choice);
3218	return new_trace;
3219	}
3220
3221	intptr_t ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
3222	Trace* trace) {
3223	intptr_t eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
3224	if (alternatives_->length() != `2`) return eats_at_least;
3225
3226	GuardedAlternative alt1 = alternatives_->At(`1`);
3227	if (alt1.guards() != NULL && alt1.guards()->length() != `0`) {
3228	return eats_at_least;
3229	}
3230	RegExpNode* eats_anything_node = alt1.node();
3231	if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
3232	return eats_at_least;
3233	}
3234
3235	// Really we should be creating a new trace when we execute this function,
3236	// but there is no need, because the code it generates cannot backtrack, and
3237	// we always arrive here with a trivial trace (since it's the entry to a
3238	// loop. That also implies that there are no preloaded characters, which is
3239	// good, because it means we won't be violating any assumptions by
3240	// overwriting those characters with new load instructions.
3241	ASSERT(trace->is_trivial());
3242
3243	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3244	// At this point we know that we are at a non-greedy loop that will eat
3245	// any character one at a time. Any non-anchored regexp has such a
3246	// loop prepended to it in order to find where it starts. We look for
3247	// a pattern of the form ...abc... where we can look 6 characters ahead
3248	// and step forwards 3 if the character is not one of abc. Abc need
3249	// not be atoms, they can be any reasonably limited character class or
3250	// small alternation.
3251	BoyerMooreLookahead* bm = bm_info(false);
3252	if (bm == NULL) {
3253	eats_at_least = Utils::Minimum(
3254	kMaxLookaheadForBoyerMoore,
3255	EatsAtLeast(kMaxLookaheadForBoyerMoore, kRecursionBudget, false));
3256	if (eats_at_least >= `1`) {
3257	bm = new (Z) BoyerMooreLookahead (eats_at_least, compiler, Z);
3258	GuardedAlternative alt0 = alternatives_->At(`0`);
3259	alt0.node()->FillInBMInfo(`0`, kRecursionBudget, bm, false);
3260	}
3261	}
3262	if (bm != NULL) {
3263	bm->EmitSkipInstructions(macro_assembler);
3264	}
3265	return eats_at_least;
3266	}
3267
3268	void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
3269	AlternativeGenerationList* alt_gens,
3270	intptr_t first_choice,
3271	Trace* trace,
3272	PreloadState* preload) {
3273	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3274	SetUpPreLoad(compiler, trace, preload);
3275
3276	// For now we just call all choices one after the other. The idea ultimately
3277	// is to use the Dispatch table to try only the relevant ones.
3278	intptr_t choice_count = alternatives_->length();
3279
3280	intptr_t new_flush_budget = trace->flush_budget() / choice_count;
3281
3282	for (intptr_t i = first_choice; i < choice_count; i++) {
3283	bool is_last = i == choice_count - `1`;
3284	bool fall_through_on_failure = !is_last;
3285	GuardedAlternative alternative = alternatives_->At(i);
3286	AlternativeGeneration* alt_gen = alt_gens->at(i);
3287	alt_gen->quick_check_details.set_characters(preload->preload_characters_);
3288	ZoneGrowableArray<Guard> guards = alternative.guards();
3289	intptr_t guard_count = (guards == NULL) ? `0` : guards->length();
3290	Trace new_trace(*trace);
3291	new_trace.set_characters_preloaded(
3292	preload->preload_is_current_ ? preload->preload_characters_ : `0`);
3293	if (preload->preload_has_checked_bounds_) {
3294	new_trace.set_bound_checked_up_to(preload->preload_characters_);
3295	}
3296	new_trace.quick_check_performed()->Clear();
3297	if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
3298	if (!is_last) {
3299	new_trace.set_backtrack(&alt_gen->after);
3300	}
3301	alt_gen->expects_preload = preload->preload_is_current_;
3302	bool generate_full_check_inline = false;
3303	if (kRegexpOptimization &&
3304	try_to_emit_quick_check_for_alternative(i == `0`) &&
3305	alternative.node()->EmitQuickCheck(
3306	compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
3307	&alt_gen->possible_success, &alt_gen->quick_check_details,
3308	fall_through_on_failure)) {
3309	// Quick check was generated for this choice.
3310	preload->preload_is_current_ = true;
3311	preload->preload_has_checked_bounds_ = true;
3312	// If we generated the quick check to fall through on possible success,
3313	// we now need to generate the full check inline.
3314	if (!fall_through_on_failure) {
3315	macro_assembler->BindBlock(&alt_gen->possible_success);
3316	new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3317	new_trace.set_characters_preloaded(preload->preload_characters_);
3318	new_trace.set_bound_checked_up_to(preload->preload_characters_);
3319	generate_full_check_inline = true;
3320	}
3321	} else if (alt_gen->quick_check_details.cannot_match()) {
3322	if (!fall_through_on_failure) {
3323	macro_assembler->GoTo(trace->backtrack());
3324	}
3325	continue;
3326	} else {
3327	// No quick check was generated. Put the full code here.
3328	// If this is not the first choice then there could be slow checks from
3329	// previous cases that go here when they fail. There's no reason to
3330	// insist that they preload characters since the slow check we are about
3331	// to generate probably can't use it.
3332	if (i != first_choice) {
3333	alt_gen->expects_preload = false;
3334	new_trace.InvalidateCurrentCharacter();
3335	}
3336	generate_full_check_inline = true;
3337	}
3338	if (generate_full_check_inline) {
3339	if (new_trace.actions() != NULL) {
3340	new_trace.set_flush_budget(new_flush_budget);
3341	}
3342	for (intptr_t j = `0`; j < guard_count; j++) {
3343	GenerateGuard(macro_assembler, guards->At(j), &new_trace);
3344	}
3345	alternative.node()->Emit(compiler, &new_trace);
3346	preload->preload_is_current_ = false;
3347	}
3348	macro_assembler->BindBlock(&alt_gen->after);
3349	}
3350	}
3351
3352	void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
3353	Trace* trace,
3354	GuardedAlternative alternative,
3355	AlternativeGeneration* alt_gen,
3356	intptr_t preload_characters,
3357	bool next_expects_preload) {
3358	if (!alt_gen->possible_success.is_linked()) return;
3359
3360	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3361	macro_assembler->BindBlock(&alt_gen->possible_success);
3362	Trace out_of_line_trace(*trace);
3363	out_of_line_trace.set_characters_preloaded(preload_characters);
3364	out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3365	if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
3366	ZoneGrowableArray<Guard> guards = alternative.guards();
3367	intptr_t guard_count = (guards == NULL) ? `0` : guards->length();
3368	if (next_expects_preload) {
3369	BlockLabel reload_current_char;
3370	out_of_line_trace.set_backtrack(&reload_current_char);
3371	for (intptr_t j = `0`; j < guard_count; j++) {
3372	GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);
3373	}
3374	alternative.node()->Emit(compiler, &out_of_line_trace);
3375	macro_assembler->BindBlock(&reload_current_char);
3376	// Reload the current character, since the next quick check expects that.
3377	// We don't need to check bounds here because we only get into this
3378	// code through a quick check which already did the checked load.
3379	macro_assembler->LoadCurrentCharacter(trace->cp_offset(), NULL, false,
3380	preload_characters);
3381	macro_assembler->GoTo(&(alt_gen->after));
3382	} else {
3383	out_of_line_trace.set_backtrack(&(alt_gen->after));
3384	for (intptr_t j = `0`; j < guard_count; j++) {
3385	GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);
3386	}
3387	alternative.node()->Emit(compiler, &out_of_line_trace);
3388	}
3389	}
3390
3391	void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3392	RegExpMacroAssembler* assembler = compiler->macro_assembler();
3393	LimitResult limit_result = LimitVersions(compiler, trace);
3394	if (limit_result == DONE) return;
3395	ASSERT(limit_result == CONTINUE);
3396
3397	RecursionCheck rc(compiler);
3398
3399	switch (action_type_) {
3400	case STORE_POSITION: {
3401	Trace::DeferredCapture new_capture(data_.u_position_register.reg,
3402	data_.u_position_register.is_capture,
3403	trace);
3404	Trace new_trace = *trace;
3405	new_trace.add_action(&new_capture);
3406	on_success()->Emit(compiler, &new_trace);
3407	break;
3408	}
3409	case INCREMENT_REGISTER: {
3410	Trace::DeferredIncrementRegister new_increment(
3411	data_.u_increment_register.reg);
3412	Trace new_trace = *trace;
3413	new_trace.add_action(&new_increment);
3414	on_success()->Emit(compiler, &new_trace);
3415	break;
3416	}
3417	case SET_REGISTER: {
3418	Trace::DeferredSetRegister new_set(data_.u_store_register.reg,
3419	data_.u_store_register.value);
3420	Trace new_trace = *trace;
3421	new_trace.add_action(&new_set);
3422	on_success()->Emit(compiler, &new_trace);
3423	break;
3424	}
3425	case CLEAR_CAPTURES: {
3426	Trace::DeferredClearCaptures new_capture(Interval (
3427	data_.u_clear_captures.range_from, data_.u_clear_captures.range_to));
3428	Trace new_trace = *trace;
3429	new_trace.add_action(&new_capture);
3430	on_success()->Emit(compiler, &new_trace);
3431	break;
3432	}
3433	case BEGIN_SUBMATCH:
3434	if (!trace->is_trivial()) {
3435	trace->Flush(compiler, this);
3436	} else {
3437	assembler->WriteCurrentPositionToRegister(
3438	data_.u_submatch.current_position_register, `0`);
3439	assembler->WriteStackPointerToRegister(
3440	data_.u_submatch.stack_pointer_register);
3441	on_success()->Emit(compiler, trace);
3442	}
3443	break;
3444	case EMPTY_MATCH_CHECK: {
3445	intptr_t start_pos_reg = data_.u_empty_match_check.start_register;
3446	intptr_t stored_pos = `0`;
3447	intptr_t rep_reg = data_.u_empty_match_check.repetition_register;
3448	bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3449	bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3450	if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3451	// If we know we haven't advanced and there is no minimum we
3452	// can just backtrack immediately.
3453	assembler->GoTo(trace->backtrack());
3454	} else if (know_dist && stored_pos < trace->cp_offset()) {
3455	// If we know we've advanced we can generate the continuation
3456	// immediately.
3457	on_success()->Emit(compiler, trace);
3458	} else if (!trace->is_trivial()) {
3459	trace->Flush(compiler, this);
3460	} else {
3461	BlockLabel skip_empty_check;
3462	// If we have a minimum number of repetitions we check the current
3463	// number first and skip the empty check if it's not enough.
3464	if (has_minimum) {
3465	intptr_t limit = data_.u_empty_match_check.repetition_limit;
3466	assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3467	}
3468	// If the match is empty we bail out, otherwise we fall through
3469	// to the on-success continuation.
3470	assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3471	trace->backtrack());
3472	assembler->BindBlock(&skip_empty_check);
3473	on_success()->Emit(compiler, trace);
3474	}
3475	break;
3476	}
3477	case POSITIVE_SUBMATCH_SUCCESS: {
3478	if (!trace->is_trivial()) {
3479	trace->Flush(compiler, this);
3480	return;
3481	}
3482	assembler->ReadCurrentPositionFromRegister(
3483	data_.u_submatch.current_position_register);
3484	assembler->ReadStackPointerFromRegister(
3485	data_.u_submatch.stack_pointer_register);
3486	intptr_t clear_register_count = data_.u_submatch.clear_register_count;
3487	if (clear_register_count == `0`) {
3488	on_success()->Emit(compiler, trace);
3489	return;
3490	}
3491	intptr_t clear_registers_from = data_.u_submatch.clear_register_from;
3492	BlockLabel clear_registers_backtrack;
3493	Trace new_trace = *trace;
3494	new_trace.set_backtrack(&clear_registers_backtrack);
3495	on_success()->Emit(compiler, &new_trace);
3496
3497	assembler->BindBlock(&clear_registers_backtrack);
3498	intptr_t clear_registers_to =
3499	clear_registers_from + clear_register_count - `1`;
3500	assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3501
3502	ASSERT(trace->backtrack() == NULL);
3503	assembler->Backtrack();
3504	return;
3505	}
3506	default:
3507	UNREACHABLE();
3508	}
3509	}
3510
3511	void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3512	RegExpMacroAssembler* assembler = compiler->macro_assembler();
3513	if (!trace->is_trivial()) {
3514	trace->Flush(compiler, this);
3515	return;
3516	}
3517
3518	LimitResult limit_result = LimitVersions(compiler, trace);
3519	if (limit_result == DONE) return;
3520	ASSERT(limit_result == CONTINUE);
3521
3522	RecursionCheck rc(compiler);
3523
3524	ASSERT(start_reg_ + `1` == end_reg_);
3525	if (flags_.IgnoreCase()) {
3526	assembler->CheckNotBackReferenceIgnoreCase(
3527	start_reg_, read_backward(), flags_.IsUnicode(), trace->backtrack());
3528	} else {
3529	assembler->CheckNotBackReference(start_reg_, read_backward(),
3530	trace->backtrack());
3531	}
3532	// We are going to advance backward, so we may end up at the start.
3533	if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
3534
3535	// Check that the back reference does not end inside a surrogate pair.
3536	if (flags_.IsUnicode() && !compiler->one_byte()) {
3537	assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
3538	}
3539
3540	on_success()->Emit(compiler, trace);
3541	}
3542
3543	// -------------------------------------------------------------------
3544	// Dot/dotty output
3545
3546	#ifdef DEBUG
3547
3548	class DotPrinter : public NodeVisitor {
3549	public:
3550	explicit DotPrinter(bool ignore_case) {}
3551	void PrintNode(const char* label, RegExpNode* node);
3552	void Visit(RegExpNode* node);
3553	void PrintAttributes(RegExpNode* from);
3554	void PrintOnFailure(RegExpNode* from, RegExpNode* to);
3555	#define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that);
3556	FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3557	#undef DECLARE_VISIT
3558	};
3559
3560	void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3561	OS::PrintErr("digraph G {\n graph [label=\"");
3562	for (intptr_t i = `0`; label[i] != `'\0'`; i++) {
3563	switch (label[i]) {
3564	case `'\\'`:
3565	OS::PrintErr("\\\\");
3566	break;
3567	case `'"'`:
3568	OS::PrintErr("\"");
3569	break;
3570	default:
3571	OS::PrintErr("%c", label[i]);
3572	break;
3573	}
3574	}
3575	OS::PrintErr("\"];\n");
3576	Visit(node);
3577	OS::PrintErr("}\n");
3578	}
3579
3580	void DotPrinter::Visit(RegExpNode* node) {
3581	if (node->info()->visited) return;
3582	node->info()->visited = true;
3583	node->Accept(this);
3584	}
3585
3586	void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
3587	OS::PrintErr(" n%p -> n%p [style=dotted];\n", from, on_failure);
3588	Visit(on_failure);
3589	}
3590
3591	class AttributePrinter : public ValueObject {
3592	public:
3593	AttributePrinter() : first_(true) {}
3594	void PrintSeparator() {
3595	if (first_) {
3596	first_ = false;
3597	} else {
3598	OS::PrintErr("\|");
3599	}
3600	}
3601	void PrintBit(const char* name, bool value) {
3602	if (!value) return;
3603	PrintSeparator();
3604	OS::PrintErr("{%s}", name);
3605	}
3606	void PrintPositive(const char* name, intptr_t value) {
3607	if (value < `0`) return;
3608	PrintSeparator();
3609	OS::PrintErr("{%s\|%" Pd "}", name, value);
3610	}
3611
3612	private:
3613	bool first_;
3614	};
3615
3616	void DotPrinter::PrintAttributes(RegExpNode* that) {
3617	OS::PrintErr(
3618	" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3619	"margin=0.1, fontsize=10, label=\"{",
3620	that);
3621	AttributePrinter printer;
3622	NodeInfo* info = that->info();
3623	printer.PrintBit("NI", info->follows_newline_interest);
3624	printer.PrintBit("WI", info->follows_word_interest);
3625	printer.PrintBit("SI", info->follows_start_interest);
3626	BlockLabel* label = that->label();
3627	if (label->is_bound()) printer.PrintPositive("@", label->pos());
3628	OS::PrintErr(
3629	"}\"];\n"
3630	" a%p -> n%p [style=dashed, color=grey, arrowhead=none];\n",
3631	that, that);
3632	}
3633
3634	void DotPrinter::VisitChoice(ChoiceNode* that) {
3635	OS::PrintErr(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3636	for (intptr_t i = `0`; i < that->alternatives()->length(); i++) {
3637	GuardedAlternative alt = that->alternatives()->At(i);
3638	OS::PrintErr(" n%p -> n%p", that, alt.node());
3639	}
3640	for (intptr_t i = `0`; i < that->alternatives()->length(); i++) {
3641	GuardedAlternative alt = that->alternatives()->At(i);
3642	alt.node()->Accept(this);
3643	}
3644	}
3645
3646	void DotPrinter::VisitText(TextNode* that) {
3647	OS::PrintErr(" n%p [label=\"", that);
3648	for (intptr_t i = `0`; i < that->elements()->length(); i++) {
3649	if (i > `0`) OS::PrintErr(" ");
3650	TextElement elm = that->elements()->At(i);
3651	switch (elm.text_type()) {
3652	case TextElement::ATOM: {
3653	ZoneGrowableArray<uint16_t>* data = elm.atom()->data();
3654	for (intptr_t i = `0`; i < data->length(); i++) {
3655	OS::PrintErr("%c", static_cast<char>(data->At(i)));
3656	}
3657	break;
3658	}
3659	case TextElement::CHAR_CLASS: {
3660	RegExpCharacterClass* node = elm.char_class();
3661	OS::PrintErr("[");
3662	if (node->is_negated()) OS::PrintErr("^");
3663	for (intptr_t j = `0`; j < node->ranges()->length(); j++) {
3664	CharacterRange range = node->ranges()->At(j);
3665	PrintUtf16(range.from());
3666	OS::PrintErr("-");
3667	PrintUtf16(range.to());
3668	}
3669	OS::PrintErr("]");
3670	break;
3671	}
3672	default:
3673	UNREACHABLE();
3674	}
3675	}
3676	OS::PrintErr("\", shape=box, peripheries=2];\n");
3677	PrintAttributes(that);
3678	OS::PrintErr(" n%p -> n%p;\n", that, that->on_success());
3679	Visit(that->on_success());
3680	}
3681
3682	void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3683	OS::PrintErr(" n%p [label=\"$%" Pd "..$%" Pd "\", shape=doubleoctagon];\n",
3684	that, that->start_register(), that->end_register());
3685	PrintAttributes(that);
3686	OS::PrintErr(" n%p -> n%p;\n", that, that->on_success());
3687	Visit(that->on_success());
3688	}
3689
3690	void DotPrinter::VisitEnd(EndNode* that) {
3691	OS::PrintErr(" n%p [style=bold, shape=point];\n", that);
3692	PrintAttributes(that);
3693	}
3694
3695	void DotPrinter::VisitAssertion(AssertionNode* that) {
3696	OS::PrintErr(" n%p [", that);
3697	switch (that->assertion_type()) {
3698	case AssertionNode::AT_END:
3699	OS::PrintErr("label=\"$\", shape=septagon");
3700	break;
3701	case AssertionNode::AT_START:
3702	OS::PrintErr("label=\"^\", shape=septagon");
3703	break;
3704	case AssertionNode::AT_BOUNDARY:
3705	OS::PrintErr("label=\"\\b\", shape=septagon");
3706	break;
3707	case AssertionNode::AT_NON_BOUNDARY:
3708	OS::PrintErr("label=\"\\B\", shape=septagon");
3709	break;
3710	case AssertionNode::AFTER_NEWLINE:
3711	OS::PrintErr("label=\"(?<=\\n)\", shape=septagon");
3712	break;
3713	}
3714	OS::PrintErr("];\n");
3715	PrintAttributes(that);
3716	RegExpNode* successor = that->on_success();
3717	OS::PrintErr(" n%p -> n%p;\n", that, successor);
3718	Visit(successor);
3719	}
3720
3721	void DotPrinter::VisitAction(ActionNode* that) {
3722	OS::PrintErr(" n%p [", that);
3723	switch (that->action_type_) {
3724	case ActionNode::SET_REGISTER:
3725	OS::PrintErr("label=\"$%" Pd ":=%" Pd "\", shape=octagon",
3726	that->data_.u_store_register.reg,
3727	that->data_.u_store_register.value);
3728	break;
3729	case ActionNode::INCREMENT_REGISTER:
3730	OS::PrintErr("label=\"$%" Pd "++\", shape=octagon",
3731	that->data_.u_increment_register.reg);
3732	break;
3733	case ActionNode::STORE_POSITION:
3734	OS::PrintErr("label=\"$%" Pd ":=$pos\", shape=octagon",
3735	that->data_.u_position_register.reg);
3736	break;
3737	case ActionNode::BEGIN_SUBMATCH:
3738	OS::PrintErr("label=\"$%" Pd ":=$pos,begin\", shape=septagon",
3739	that->data_.u_submatch.current_position_register);
3740	break;
3741	case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
3742	OS::PrintErr("label=\"escape\", shape=septagon");
3743	break;
3744	case ActionNode::EMPTY_MATCH_CHECK:
3745	OS::PrintErr("label=\"$%" Pd "=$pos?,$%" Pd "<%" Pd "?\", shape=septagon",
3746	that->data_.u_empty_match_check.start_register,
3747	that->data_.u_empty_match_check.repetition_register,
3748	that->data_.u_empty_match_check.repetition_limit);
3749	break;
3750	case ActionNode::CLEAR_CAPTURES: {
3751	OS::PrintErr("label=\"clear $%" Pd " to $%" Pd "\", shape=septagon",
3752	that->data_.u_clear_captures.range_from,
3753	that->data_.u_clear_captures.range_to);
3754	break;
3755	}
3756	}
3757	OS::PrintErr("];\n");
3758	PrintAttributes(that);
3759	RegExpNode* successor = that->on_success();
3760	OS::PrintErr(" n%p -> n%p;\n", that, successor);
3761	Visit(successor);
3762	}
3763
3764	void RegExpEngine::DotPrint(const char* label,
3765	RegExpNode* node,
3766	bool ignore_case) {
3767	DotPrinter printer(ignore_case);
3768	printer.PrintNode(label, node);
3769	}
3770
3771	#endif // DEBUG
3772
3773	// -------------------------------------------------------------------
3774	// Tree to graph conversion
3775
3776	// The zone in which we allocate graph nodes.
3777	#define OZ (on_success->zone())
3778
3779	RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
3780	RegExpNode* on_success) {
3781	ZoneGrowableArray<TextElement>* elms =
3782	new (OZ) ZoneGrowableArray<TextElement>(`1`);
3783	elms->Add(TextElement::Atom(this));
3784	return new (OZ) TextNode (elms, compiler->read_backward(), on_success);
3785	}
3786
3787	RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
3788	RegExpNode* on_success) {
3789	ZoneGrowableArray<TextElement>* elms =
3790	new (OZ) ZoneGrowableArray<TextElement>(`1`);
3791	for (intptr_t i = `0`; i < elements()->length(); i++) {
3792	elms->Add(elements()->At(i));
3793	}
3794	return new (OZ) TextNode (elms, compiler->read_backward(), on_success);
3795	}
3796
3797	static bool CompareInverseRanges(ZoneGrowableArray<CharacterRange>* ranges,
3798	const int32_t* special_class,
3799	intptr_t length) {
3800	length--; // Remove final kRangeEndMarker.
3801	ASSERT(special_class[length] == kRangeEndMarker);
3802	ASSERT(ranges->length() != `0`);
3803	ASSERT(length != `0`);
3804	ASSERT(special_class[`0`] != `0`);
3805	if (ranges->length() != (length >> `1`) + `1`) {
3806	return false;
3807	}
3808	CharacterRange range = ranges->At(`0`);
3809	if (range.from() != `0`) {
3810	return false;
3811	}
3812	for (intptr_t i = `0`; i < length; i += `2`) {
3813	if (special_class[i] != (range.to() + `1`)) {
3814	return false;
3815	}
3816	range = ranges->At((i >> `1`) + `1`);
3817	if (special_class[i + `1`] != range.from()) {
3818	return false;
3819	}
3820	}
3821	if (range.to() != Utf::kMaxCodePoint) {
3822	return false;
3823	}
3824	return true;
3825	}
3826
3827	static bool CompareRanges(ZoneGrowableArray<CharacterRange>* ranges,
3828	const int32_t* special_class,
3829	intptr_t length) {
3830	length--; // Remove final kRangeEndMarker.
3831	ASSERT(special_class[length] == kRangeEndMarker);
3832	if (ranges->length() * `2` != length) {
3833	return false;
3834	}
3835	for (intptr_t i = `0`; i < length; i += `2`) {
3836	CharacterRange range = ranges->At(i >> `1`);
3837	if (range.from() != special_class[i] \|\|
3838	range.to() != special_class[i + `1`] - `1`) {
3839	return false;
3840	}
3841	}
3842	return true;
3843	}
3844
3845	bool RegExpCharacterClass::is_standard() {
3846	// TODO(lrn): Remove need for this function, by not throwing away information
3847	// along the way.
3848	if (is_negated()) {
3849	return false;
3850	}
3851	if (set_.is_standard()) {
3852	return true;
3853	}
3854	if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3855	set_.set_standard_set_type(`'s'`);
3856	return true;
3857	}
3858	if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3859	set_.set_standard_set_type(`'S'`);
3860	return true;
3861	}
3862	if (CompareInverseRanges(set_.ranges(), kLineTerminatorRanges,
3863	kLineTerminatorRangeCount)) {
3864	set_.set_standard_set_type(`'.'`);
3865	return true;
3866	}
3867	if (CompareRanges(set_.ranges(), kLineTerminatorRanges,
3868	kLineTerminatorRangeCount)) {
3869	set_.set_standard_set_type(`'n'`);
3870	return true;
3871	}
3872	if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
3873	set_.set_standard_set_type(`'w'`);
3874	return true;
3875	}
3876	if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
3877	set_.set_standard_set_type(`'W'`);
3878	return true;
3879	}
3880	return false;
3881	}
3882
3883	UnicodeRangeSplitter::UnicodeRangeSplitter(
3884	Zone* zone,
3885	ZoneGrowableArray<CharacterRange>* base)
3886	: zone_(zone),
3887	table_(zone),
3888	bmp_(nullptr),
3889	lead_surrogates_(nullptr),
3890	trail_surrogates_(nullptr),
3891	non_bmp_(nullptr) {
3892	// The unicode range splitter categorizes given character ranges into:
3893	// - Code points from the BMP representable by one code unit.
3894	// - Code points outside the BMP that need to be split into surrogate pairs.
3895	// - Lone lead surrogates.
3896	// - Lone trail surrogates.
3897	// Lone surrogates are valid code points, even though no actual characters.
3898	// They require special matching to make sure we do not split surrogate pairs.
3899	// We use the dispatch table to accomplish this. The base range is split up
3900	// by the table by the overlay ranges, and the Call callback is used to
3901	// filter and collect ranges for each category.
3902	for (intptr_t i = `0`; i < base->length(); i++) {
3903	table_.AddRange(base->At(i), kBase, zone_);
3904	}
3905	// Add overlay ranges.
3906	table_.AddRange(CharacterRange::Range(`0`, Utf16::kLeadSurrogateStart - `1`),
3907	kBmpCodePoints, zone_);
3908	table_.AddRange(CharacterRange::Range(Utf16::kLeadSurrogateStart,
3909	Utf16::kLeadSurrogateEnd),
3910	kLeadSurrogates, zone_);
3911	table_.AddRange(CharacterRange::Range(Utf16::kTrailSurrogateStart,
3912	Utf16::kTrailSurrogateEnd),
3913	kTrailSurrogates, zone_);
3914	table_.AddRange(
3915	CharacterRange::Range(Utf16::kTrailSurrogateEnd + `1`, Utf16::kMaxCodeUnit),
3916	kBmpCodePoints, zone_);
3917	table_.AddRange(
3918	CharacterRange::Range(Utf16::kMaxCodeUnit + `1`, Utf::kMaxCodePoint),
3919	kNonBmpCodePoints, zone_);
3920	table_.ForEach(this);
3921	}
3922
3923	void UnicodeRangeSplitter::Call(uint32_t from, ChoiceTable::Entry entry) {
3924	OutSet* outset = entry.out_set();
3925	if (!outset->Get(kBase)) return;
3926	ZoneGrowableArray<CharacterRange> target = nullptr**;
3927	if (outset->Get(kBmpCodePoints)) {
3928	target = &bmp_;
3929	} else if (outset->Get(kLeadSurrogates)) {
3930	target = &lead_surrogates_;
3931	} else if (outset->Get(kTrailSurrogates)) {
3932	target = &trail_surrogates_;
3933	} else {
3934	ASSERT(outset->Get(kNonBmpCodePoints));
3935	target = &non_bmp_;
3936	}
3937	if (target == nullptr*) {
3938	target = new* (zone_) ZoneGrowableArray<CharacterRange>(`2`);
3939	}
3940	(*target)->Add(CharacterRange::Range(entry.from(), entry.to()));
3941	}
3942
3943	void AddBmpCharacters(RegExpCompiler* compiler,
3944	ChoiceNode* result,
3945	RegExpNode* on_success,
3946	UnicodeRangeSplitter* splitter) {
3947	ZoneGrowableArray<CharacterRange>* bmp = splitter->bmp();
3948	if (bmp == nullptr) return;
3949	result->AddAlternative(GuardedAlternative (TextNode::CreateForCharacterRanges(
3950	bmp, compiler->read_backward(), on_success, RegExpFlags ())));
3951	}
3952
3953	void AddNonBmpSurrogatePairs(RegExpCompiler* compiler,
3954	ChoiceNode* result,
3955	RegExpNode* on_success,
3956	UnicodeRangeSplitter* splitter) {
3957	ZoneGrowableArray<CharacterRange>* non_bmp = splitter->non_bmp();
3958	if (non_bmp == nullptr) return;
3959	ASSERT(!compiler->one_byte());
3960	CharacterRange::Canonicalize(non_bmp);
3961	for (int i = `0`; i < non_bmp->length(); i++) {
3962	// Match surrogate pair.
3963	// E.g. [\u10005-\u11005] becomes
3964	// \ud800[\udc05-\udfff]\|
3965	// [\ud801-\ud803][\udc00-\udfff]\|
3966	// \ud804[\udc00-\udc05]
3967	uint32_t from = non_bmp->At(i).from();
3968	uint32_t to = non_bmp->At(i).to();
3969	uint16_t from_points[`2`];
3970	Utf16::Encode(from, from_points);
3971	uint16_t to_points[`2`];
3972	Utf16::Encode(to, to_points);
3973	if (from_points[`0`] == to_points[`0`]) {
3974	// The lead surrogate is the same.
3975	result->AddAlternative(
3976	GuardedAlternative (TextNode::CreateForSurrogatePair(
3977	CharacterRange::Singleton(from_points[`0`]),
3978	CharacterRange::Range(from_points[`1`], to_points[`1`]),
3979	compiler->read_backward(), on_success, RegExpFlags ())));
3980	} else {
3981	if (from_points[`1`] != Utf16::kTrailSurrogateStart) {
3982	// Add [from_l][from_t-\udfff]
3983	result->AddAlternative(
3984	GuardedAlternative (TextNode::CreateForSurrogatePair(
3985	CharacterRange::Singleton(from_points[`0`]),
3986	CharacterRange::Range(from_points[`1`],
3987	Utf16::kTrailSurrogateEnd),
3988	compiler->read_backward(), on_success, RegExpFlags ())));
3989	from_points[`0`]++;
3990	}
3991	if (to_points[`1`] != Utf16::kTrailSurrogateEnd) {
3992	// Add [to_l][\udc00-to_t]
3993	result->AddAlternative(
3994	GuardedAlternative (TextNode::CreateForSurrogatePair(
3995	CharacterRange::Singleton(to_points[`0`]),
3996	CharacterRange::Range(Utf16::kTrailSurrogateStart,
3997	to_points[`1`]),
3998	compiler->read_backward(), on_success, RegExpFlags ())));
3999	to_points[`0`]--;
4000	}
4001	if (from_points[`0`] <= to_points[`0`]) {
4002	// Add [from_l-to_l][\udc00-\udfff]
4003	result->AddAlternative(
4004	GuardedAlternative (TextNode::CreateForSurrogatePair(
4005	CharacterRange::Range(from_points[`0`], to_points[`0`]),
4006	CharacterRange::Range(Utf16::kTrailSurrogateStart,
4007	Utf16::kTrailSurrogateEnd),
4008	compiler->read_backward(), on_success, RegExpFlags ())));
4009	}
4010	}
4011	}
4012	}
4013
4014	RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
4015	RegExpCompiler* compiler,
4016	ZoneGrowableArray<CharacterRange>* lookbehind,
4017	ZoneGrowableArray<CharacterRange>* match,
4018	RegExpNode* on_success,
4019	bool read_backward,
4020	RegExpFlags flags) {
4021	RegExpNode* match_node = TextNode::CreateForCharacterRanges(
4022	match, read_backward, on_success, flags);
4023	int stack_register = compiler->UnicodeLookaroundStackRegister();
4024	int position_register = compiler->UnicodeLookaroundPositionRegister();
4025	RegExpLookaround::Builder lookaround(false, match_node, stack_register,
4026	position_register);
4027	RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4028	lookbehind, !read_backward, lookaround.on_match_success(), flags);
4029	return lookaround.ForMatch(negative_match);
4030	}
4031
4032	RegExpNode* MatchAndNegativeLookaroundInReadDirection(
4033	RegExpCompiler* compiler,
4034	ZoneGrowableArray<CharacterRange>* match,
4035	ZoneGrowableArray<CharacterRange>* lookahead,
4036	RegExpNode* on_success,
4037	bool read_backward,
4038	RegExpFlags flags) {
4039	int stack_register = compiler->UnicodeLookaroundStackRegister();
4040	int position_register = compiler->UnicodeLookaroundPositionRegister();
4041	RegExpLookaround::Builder lookaround(false, on_success, stack_register,
4042	position_register);
4043	RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4044	lookahead, read_backward, lookaround.on_match_success(), flags);
4045	return TextNode::CreateForCharacterRanges(
4046	match, read_backward, lookaround.ForMatch(negative_match), flags);
4047	}
4048
4049	void AddLoneLeadSurrogates(RegExpCompiler* compiler,
4050	ChoiceNode* result,
4051	RegExpNode* on_success,
4052	UnicodeRangeSplitter* splitter) {
4053	auto lead_surrogates = splitter->lead_surrogates();
4054	if (lead_surrogates == nullptr) return;
4055	// E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
4056	auto trail_surrogates = CharacterRange::List(
4057	on_success->zone(), CharacterRange::Range(Utf16::kTrailSurrogateStart,
4058	Utf16::kTrailSurrogateEnd));
4059
4060	RegExpNode* match;
4061	if (compiler->read_backward()) {
4062	// Reading backward. Assert that reading forward, there is no trail
4063	// surrogate, and then backward match the lead surrogate.
4064	match = NegativeLookaroundAgainstReadDirectionAndMatch(
4065	compiler, trail_surrogates, lead_surrogates, on_success, true,
4066	RegExpFlags ());
4067	} else {
4068	// Reading forward. Forward match the lead surrogate and assert that
4069	// no trail surrogate follows.
4070	match = MatchAndNegativeLookaroundInReadDirection(
4071	compiler, lead_surrogates, trail_surrogates, on_success, false,
4072	RegExpFlags ());
4073	}
4074	result->AddAlternative(GuardedAlternative (match));
4075	}
4076
4077	void AddLoneTrailSurrogates(RegExpCompiler* compiler,
4078	ChoiceNode* result,
4079	RegExpNode* on_success,
4080	UnicodeRangeSplitter* splitter) {
4081	auto trail_surrogates = splitter->trail_surrogates();
4082	if (trail_surrogates == nullptr) return;
4083	// E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
4084	auto lead_surrogates = CharacterRange::List(
4085	on_success->zone(), CharacterRange::Range(Utf16::kLeadSurrogateStart,
4086	Utf16::kLeadSurrogateEnd));
4087
4088	RegExpNode* match;
4089	if (compiler->read_backward()) {
4090	// Reading backward. Backward match the trail surrogate and assert that no
4091	// lead surrogate precedes it.
4092	match = MatchAndNegativeLookaroundInReadDirection(
4093	compiler, trail_surrogates, lead_surrogates, on_success, true,
4094	RegExpFlags ());
4095	} else {
4096	// Reading forward. Assert that reading backward, there is no lead
4097	// surrogate, and then forward match the trail surrogate.
4098	match = NegativeLookaroundAgainstReadDirectionAndMatch(
4099	compiler, lead_surrogates, trail_surrogates, on_success, false,
4100	RegExpFlags ());
4101	}
4102	result->AddAlternative(GuardedAlternative (match));
4103	}
4104
4105	RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
4106	RegExpNode* on_success) {
4107	// This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
4108	ASSERT(!compiler->read_backward());
4109	// Advance any character. If the character happens to be a lead surrogate and
4110	// we advanced into the middle of a surrogate pair, it will work out, as
4111	// nothing will match from there. We will have to advance again, consuming
4112	// the associated trail surrogate.
4113	auto range = CharacterRange::List(
4114	on_success->zone(), CharacterRange::Range(`0`, Utf16::kMaxCodeUnit));
4115	return TextNode::CreateForCharacterRanges(range, false, on_success,
4116	RegExpFlags ());
4117	}
4118
4119	void AddUnicodeCaseEquivalents(ZoneGrowableArray<CharacterRange>* ranges) {
4120	ASSERT(CharacterRange::IsCanonical(ranges));
4121
4122	// Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
4123	// See also https://crbug.com/v8/6727.
4124	// TODO(sstrickl): This only covers the special case of the {0,0x10FFFF}
4125	// range, which we use frequently internally. But large ranges can also easily
4126	// be created by the user. We might want to have a more general caching
4127	// mechanism for such ranges.
4128	if (ranges->length() == `1` && ranges->At(`0`).IsEverything(Utf::kMaxCodePoint)) {
4129	return;
4130	}
4131
4132	icu::UnicodeSet set;
4133	for (int i = `0`; i < ranges->length(); i++) {
4134	set.add(ranges->At(i).from(), ranges->At(i).to());
4135	}
4136	ranges->Clear();
4137	set.closeOver(USET_CASE_INSENSITIVE);
4138	// Full case mapping map single characters to multiple characters.
4139	// Those are represented as strings in the set. Remove them so that
4140	// we end up with only simple and common case mappings.
4141	set.removeAllStrings();
4142	for (int i = `0`; i < set.getRangeCount(); i++) {
4143	ranges->Add(
4144	CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)));
4145	}
4146	// No errors and everything we collected have been ranges.
4147	CharacterRange::Canonicalize(ranges);
4148	}
4149
4150	RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4151	RegExpNode* on_success) {
4152	set_.Canonicalize();
4153	ZoneGrowableArray<CharacterRange>* ranges = this->ranges();
4154	if (flags_.NeedsUnicodeCaseEquivalents()) {
4155	AddUnicodeCaseEquivalents(ranges);
4156	}
4157	if (flags_.IsUnicode() && !compiler->one_byte() &&
4158	!contains_split_surrogate()) {
4159	if (is_negated()) {
4160	ZoneGrowableArray<CharacterRange>* negated =
4161	new ZoneGrowableArray<CharacterRange>(`2`);
4162	CharacterRange::Negate(ranges, negated);
4163	ranges = negated;
4164	}
4165	if (ranges->length() == `0`) {
4166	RegExpCharacterClass* fail =
4167	new RegExpCharacterClass (ranges, RegExpFlags ());
4168	return new TextNode (fail, compiler->read_backward(), on_success);
4169	}
4170	if (standard_type() == `'*'`) {
4171	return UnanchoredAdvance(compiler, on_success);
4172	} else {
4173	ChoiceNode* result = new (OZ) ChoiceNode (`2`, OZ);
4174	UnicodeRangeSplitter splitter(OZ, ranges);
4175	AddBmpCharacters(compiler, result, on_success, &splitter);
4176	AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
4177	AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
4178	AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
4179	return result;
4180	}
4181	} else {
4182	return new TextNode (this, compiler->read_backward(), on_success);
4183	}
4184	return new (OZ) TextNode (this, compiler->read_backward(), on_success);
4185	}
4186
4187	RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
4188	RegExpNode* on_success) {
4189	ZoneGrowableArray<RegExpTree> alternatives = this->alternatives();
4190	intptr_t length = alternatives->length();
4191	ChoiceNode* result = new (OZ) ChoiceNode (length, OZ);
4192	for (intptr_t i = `0`; i < length; i++) {
4193	GuardedAlternative alternative(
4194	alternatives->At(i)->ToNode(compiler, on_success));
4195	result->AddAlternative(alternative);
4196	}
4197	return result;
4198	}
4199
4200	RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
4201	RegExpNode* on_success) {
4202	return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
4203	}
4204
4205	// Scoped object to keep track of how much we unroll quantifier loops in the
4206	// regexp graph generator.
4207	class RegExpExpansionLimiter : public ValueObject {
4208	public:
4209	static const intptr_t kMaxExpansionFactor = `6`;
4210	RegExpExpansionLimiter(RegExpCompiler* compiler, intptr_t factor)
4211	: compiler_(compiler),
4212	saved_expansion_factor_(compiler->current_expansion_factor()),
4213	ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
4214	ASSERT(factor > `0`);
4215	if (ok_to_expand_) {
4216	if (factor > kMaxExpansionFactor) {
4217	// Avoid integer overflow of the current expansion factor.
4218	ok_to_expand_ = false;
4219	compiler->set_current_expansion_factor(kMaxExpansionFactor + `1`);
4220	} else {
4221	intptr_t new_factor = saved_expansion_factor_ * factor;
4222	ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
4223	compiler->set_current_expansion_factor(new_factor);
4224	}
4225	}
4226	}
4227
4228	~RegExpExpansionLimiter() {
4229	compiler_->set_current_expansion_factor(saved_expansion_factor_);
4230	}
4231
4232	bool ok_to_expand() { return ok_to_expand_; }
4233
4234	private:
4235	RegExpCompiler* compiler_;
4236	intptr_t saved_expansion_factor_;
4237	bool ok_to_expand_;
4238
4239	DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
4240	};
4241
4242	RegExpNode* RegExpQuantifier::ToNode(intptr_t min,
4243	intptr_t max,
4244	bool is_greedy,
4245	RegExpTree* body,
4246	RegExpCompiler* compiler,
4247	RegExpNode* on_success,
4248	bool not_at_start) {
4249	// x{f, t} becomes this:
4250	//
4251	// (r++)<-.
4252	// \| `
4253	// \| (x)
4254	// v ^
4255	// (r=0)-->(?)---/ [if r < t]
4256	// \|
4257	// [if r >= f] \----> ...
4258	//
4259
4260	// 15.10.2.5 RepeatMatcher algorithm.
4261	// The parser has already eliminated the case where max is 0. In the case
4262	// where max_match is zero the parser has removed the quantifier if min was
4263	// > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
4264
4265	// If we know that we cannot match zero length then things are a little
4266	// simpler since we don't need to make the special zero length match check
4267	// from step 2.1. If the min and max are small we can unroll a little in
4268	// this case.
4269	// Unroll (foo)+ and (foo){3,}
4270	static const intptr_t kMaxUnrolledMinMatches = `3`;
4271	// Unroll (foo)? and (foo){x,3}
4272	static const intptr_t kMaxUnrolledMaxMatches = `3`;
4273	if (max == `0`) return on_success; // This can happen due to recursion.
4274	bool body_can_be_empty = (body->min_match() == `0`);
4275	intptr_t body_start_reg = RegExpCompiler::kNoRegister;
4276	Interval capture_registers = body->CaptureRegisters();
4277	bool needs_capture_clearing = !capture_registers.is_empty();
4278	Zone* zone = compiler->zone();
4279
4280	if (body_can_be_empty) {
4281	body_start_reg = compiler->AllocateRegister();
4282	} else if (kRegexpOptimization && !needs_capture_clearing) {
4283	// Only unroll if there are no captures and the body can't be
4284	// empty.
4285	{
4286	RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? `1` : `0`));
4287	if (min > `0` && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
4288	intptr_t new_max = (max == kInfinity) ? max : max - min;
4289	// Recurse once to get the loop or optional matches after the fixed
4290	// ones.
4291	RegExpNode* answer =
4292	ToNode(`0`, new_max, is_greedy, body, compiler, on_success, true);
4293	// Unroll the forced matches from 0 to min. This can cause chains of
4294	// TextNodes (which the parser does not generate). These should be
4295	// combined if it turns out they hinder good code generation.
4296	for (intptr_t i = `0`; i < min; i++) {
4297	answer = body->ToNode(compiler, answer);
4298	}
4299	return answer;
4300	}
4301	}
4302	if (max <= kMaxUnrolledMaxMatches && min == `0`) {
4303	ASSERT(max > `0`); // Due to the 'if' above.
4304	RegExpExpansionLimiter limiter(compiler, max);
4305	if (limiter.ok_to_expand()) {
4306	// Unroll the optional matches up to max.
4307	RegExpNode* answer = on_success;
4308	for (intptr_t i = `0`; i < max; i++) {
4309	ChoiceNode* alternation = new (zone) ChoiceNode (`2`, zone);
4310	if (is_greedy) {
4311	alternation->AddAlternative(
4312	GuardedAlternative (body->ToNode(compiler, answer)));
4313	alternation->AddAlternative(GuardedAlternative (on_success));
4314	} else {
4315	alternation->AddAlternative(GuardedAlternative (on_success));
4316	alternation->AddAlternative(
4317	GuardedAlternative (body->ToNode(compiler, answer)));
4318	}
4319	answer = alternation;
4320	if (not_at_start && !compiler->read_backward()) {
4321	alternation->set_not_at_start();
4322	}
4323	}
4324	return answer;
4325	}
4326	}
4327	}
4328	bool has_min = min > `0`;
4329	bool has_max = max < RegExpTree::kInfinity;
4330	bool needs_counter = has_min \|\| has_max;
4331	intptr_t reg_ctr = needs_counter ? compiler->AllocateRegister()
4332	: RegExpCompiler::kNoRegister;
4333	LoopChoiceNode* center = new (zone)
4334	LoopChoiceNode (body->min_match() == `0`, compiler->read_backward(), zone);
4335	if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
4336	RegExpNode* loop_return =
4337	needs_counter ? static_cast<RegExpNode*>(
4338	ActionNode::IncrementRegister(reg_ctr, center))
4339	: static_cast<RegExpNode*>(center);
4340	if (body_can_be_empty) {
4341	// If the body can be empty we need to check if it was and then
4342	// backtrack.
4343	loop_return =
4344	ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
4345	}
4346	RegExpNode* body_node = body->ToNode(compiler, loop_return);
4347	if (body_can_be_empty) {
4348	// If the body can be empty we need to store the start position
4349	// so we can bail out if it was empty.
4350	body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
4351	}
4352	if (needs_capture_clearing) {
4353	// Before entering the body of this loop we need to clear captures.
4354	body_node = ActionNode::ClearCaptures(capture_registers, body_node);
4355	}
4356	GuardedAlternative body_alt(body_node);
4357	if (has_max) {
4358	Guard* body_guard = new (zone) Guard (reg_ctr, Guard::LT, max);
4359	body_alt.AddGuard(body_guard, zone);
4360	}
4361	GuardedAlternative rest_alt(on_success);
4362	if (has_min) {
4363	Guard* rest_guard = new (zone) Guard (reg_ctr, Guard::GEQ, min);
4364	rest_alt.AddGuard(rest_guard, zone);
4365	}
4366	if (is_greedy) {
4367	center->AddLoopAlternative(body_alt);
4368	center->AddContinueAlternative(rest_alt);
4369	} else {
4370	center->AddContinueAlternative(rest_alt);
4371	center->AddLoopAlternative(body_alt);
4372	}
4373	if (needs_counter) {
4374	return ActionNode::SetRegister(reg_ctr, `0`, center);
4375	} else {
4376	return center;
4377	}
4378	}
4379
4380	namespace {
4381	// Desugar \b to (?<=\w)(?=\W)\|(?<=\W)(?=\w) and
4382	// \B to (?<=\w)(?=\w)\|(?<=\W)(?=\W)
4383	RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
4384	RegExpNode* on_success,
4385	RegExpAssertion::AssertionType type,
4386	RegExpFlags flags) {
4387	ASSERT(flags.NeedsUnicodeCaseEquivalents());
4388	ZoneGrowableArray<CharacterRange>* word_range =
4389	new ZoneGrowableArray<CharacterRange>(`2`);
4390	CharacterRange::AddClassEscape(`'w'`, word_range, true);
4391	int stack_register = compiler->UnicodeLookaroundStackRegister();
4392	int position_register = compiler->UnicodeLookaroundPositionRegister();
4393	ChoiceNode* result = new (OZ) ChoiceNode (`2`, OZ);
4394	// Add two choices. The (non-)boundary could start with a word or
4395	// a non-word-character.
4396	for (int i = `0`; i < `2`; i++) {
4397	bool lookbehind_for_word = i == `0`;
4398	bool lookahead_for_word =
4399	(type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
4400	// Look to the left.
4401	RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
4402	stack_register, position_register);
4403	RegExpNode* backward = TextNode::CreateForCharacterRanges(
4404	word_range, true, lookbehind.on_match_success(), flags);
4405	// Look to the right.
4406	RegExpLookaround::Builder lookahead(lookahead_for_word,
4407	lookbehind.ForMatch(backward),
4408	stack_register, position_register);
4409	RegExpNode* forward = TextNode::CreateForCharacterRanges(
4410	word_range, false, lookahead.on_match_success(), flags);
4411	result->AddAlternative(GuardedAlternative (lookahead.ForMatch(forward)));
4412	}
4413	return result;
4414	}
4415	} // anonymous namespace
4416
4417	RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
4418	RegExpNode* on_success) {
4419	switch (assertion_type()) {
4420	case START_OF_LINE:
4421	return AssertionNode::AfterNewline(on_success);
4422	case START_OF_INPUT:
4423	return AssertionNode::AtStart(on_success);
4424	case BOUNDARY:
4425	return flags_.NeedsUnicodeCaseEquivalents()
4426	? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
4427	flags_)
4428	: AssertionNode::AtBoundary(on_success);
4429	case NON_BOUNDARY:
4430	return flags_.NeedsUnicodeCaseEquivalents()
4431	? BoundaryAssertionAsLookaround(compiler, on_success,
4432	NON_BOUNDARY, flags_)
4433	: AssertionNode::AtNonBoundary(on_success);
4434	case END_OF_INPUT:
4435	return AssertionNode::AtEnd(on_success);
4436	case END_OF_LINE: {
4437	// Compile $ in multiline regexps as an alternation with a positive
4438	// lookahead in one side and an end-of-input on the other side.
4439	// We need two registers for the lookahead.
4440	intptr_t stack_pointer_register = compiler->AllocateRegister();
4441	intptr_t position_register = compiler->AllocateRegister();
4442	// The ChoiceNode to distinguish between a newline and end-of-input.
4443	ChoiceNode* result = new ChoiceNode (`2`, on_success->zone());
4444	// Create a newline atom.
4445	ZoneGrowableArray<CharacterRange>* newline_ranges =
4446	new ZoneGrowableArray<CharacterRange>(`3`);
4447	CharacterRange::AddClassEscape(`'n'`, newline_ranges);
4448	RegExpCharacterClass* newline_atom =
4449	new RegExpCharacterClass (`'n'`, RegExpFlags ());
4450	TextNode* newline_matcher =
4451	new TextNode (newline_atom, /read_backwards=/false,
4452	ActionNode::PositiveSubmatchSuccess(
4453	stack_pointer_register, position_register,
4454	`0`, // No captures inside.
4455	-`1`, // Ignored if no captures.
4456	on_success));
4457	// Create an end-of-input matcher.
4458	RegExpNode* end_of_line = ActionNode::BeginSubmatch(
4459	stack_pointer_register, position_register, newline_matcher);
4460	// Add the two alternatives to the ChoiceNode.
4461	GuardedAlternative eol_alternative(end_of_line);
4462	result->AddAlternative(eol_alternative);
4463	GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
4464	result->AddAlternative(end_alternative);
4465	return result;
4466	}
4467	default:
4468	UNREACHABLE();
4469	}
4470	return on_success;
4471	}
4472
4473	RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
4474	RegExpNode* on_success) {
4475	return new (OZ) BackReferenceNode (RegExpCapture::StartRegister(index()),
4476	RegExpCapture::EndRegister(index()), flags_,
4477	compiler->read_backward(), on_success);
4478	}
4479
4480	RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
4481	RegExpNode* on_success) {
4482	return on_success;
4483	}
4484
4485	RegExpLookaround::Builder::Builder(bool is_positive,
4486	RegExpNode* on_success,
4487	intptr_t stack_pointer_register,
4488	intptr_t position_register,
4489	intptr_t capture_register_count,
4490	intptr_t capture_register_start)
4491	: is_positive_(is_positive),
4492	on_success_(on_success),
4493	stack_pointer_register_(stack_pointer_register),
4494	position_register_(position_register) {
4495	if (is_positive_) {
4496	on_match_success_ = ActionNode::PositiveSubmatchSuccess(
4497	stack_pointer_register, position_register, capture_register_count,
4498	capture_register_start, on_success);
4499	} else {
4500	on_match_success_ = new (OZ) NegativeSubmatchSuccess (
4501	stack_pointer_register, position_register, capture_register_count,
4502	capture_register_start, OZ);
4503	}
4504	}
4505
4506	RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
4507	if (is_positive_) {
4508	return ActionNode::BeginSubmatch(stack_pointer_register_,
4509	position_register_, match);
4510	} else {
4511	Zone* zone = on_success_->zone();
4512	// We use a ChoiceNode to represent the negative lookaround. The first
4513	// alternative is the negative match. On success, the end node backtracks.
4514	// On failure, the second alternative is tried and leads to success.
4515	// NegativeLookaroundChoiceNode is a special ChoiceNode that ignores the
4516	// first exit when calculating quick checks.
4517	ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode (
4518	GuardedAlternative (match), GuardedAlternative (on_success_), zone);
4519	return ActionNode::BeginSubmatch(stack_pointer_register_,
4520	position_register_, choice_node);
4521	}
4522	}
4523
4524	RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
4525	RegExpNode* on_success) {
4526	intptr_t stack_pointer_register = compiler->AllocateRegister();
4527	intptr_t position_register = compiler->AllocateRegister();
4528
4529	const intptr_t registers_per_capture = `2`;
4530	const intptr_t register_of_first_capture = `2`;
4531	intptr_t register_count = capture_count_ * registers_per_capture;
4532	intptr_t register_start =
4533	register_of_first_capture + capture_from_ * registers_per_capture;
4534
4535	RegExpNode* result;
4536	bool was_reading_backward = compiler->read_backward();
4537	compiler->set_read_backward(type() == LOOKBEHIND);
4538	Builder builder(is_positive(), on_success, stack_pointer_register,
4539	position_register, register_count, register_start);
4540	RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
4541	result = builder.ForMatch(match);
4542	compiler->set_read_backward(was_reading_backward);
4543	return result;
4544	}
4545
4546	RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
4547	RegExpNode* on_success) {
4548	return ToNode(body(), index(), compiler, on_success);
4549	}
4550
4551	RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
4552	intptr_t index,
4553	RegExpCompiler* compiler,
4554	RegExpNode* on_success) {
4555	ASSERT(body != nullptr);
4556	intptr_t start_reg = RegExpCapture::StartRegister(index);
4557	intptr_t end_reg = RegExpCapture::EndRegister(index);
4558	if (compiler->read_backward()) {
4559	intptr_t tmp = end_reg;
4560	end_reg = start_reg;
4561	start_reg = tmp;
4562	}
4563	RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
4564	RegExpNode* body_node = body->ToNode(compiler, store_end);
4565	return ActionNode::StorePosition(start_reg, true, body_node);
4566	}
4567
4568	RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
4569	RegExpNode* on_success) {
4570	ZoneGrowableArray<RegExpTree> children = nodes();
4571	RegExpNode* current = on_success;
4572	if (compiler->read_backward()) {
4573	for (intptr_t i = `0`; i < children->length(); i++) {
4574	current = children->At(i)->ToNode(compiler, current);
4575	}
4576	} else {
4577	for (intptr_t i = children->length() - `1`; i >= `0`; i--) {
4578	current = children->At(i)->ToNode(compiler, current);
4579	}
4580	}
4581	return current;
4582	}
4583
4584	static void AddClass(const int32_t* elmv,
4585	intptr_t elmc,
4586	ZoneGrowableArray<CharacterRange>* ranges) {
4587	elmc--;
4588	ASSERT(elmv[elmc] == kRangeEndMarker);
4589	for (intptr_t i = `0`; i < elmc; i += `2`) {
4590	ASSERT(elmv[i] < elmv[i + `1`]);
4591	ranges->Add(CharacterRange (elmv[i], elmv[i + `1`] - `1`));
4592	}
4593	}
4594
4595	static void AddClassNegated(const int32_t* elmv,
4596	intptr_t elmc,
4597	ZoneGrowableArray<CharacterRange>* ranges) {
4598	elmc--;
4599	ASSERT(elmv[elmc] == kRangeEndMarker);
4600	ASSERT(elmv[`0`] != `0x0000`);
4601	ASSERT(elmv[elmc - `1`] != Utf::kMaxCodePoint);
4602	uint16_t last = `0x0000`;
4603	for (intptr_t i = `0`; i < elmc; i += `2`) {
4604	ASSERT(last <= elmv[i] - `1`);
4605	ASSERT(elmv[i] < elmv[i + `1`]);
4606	ranges->Add(CharacterRange (last, elmv[i] - `1`));
4607	last = elmv[i + `1`];
4608	}
4609	ranges->Add(CharacterRange (last, Utf::kMaxCodePoint));
4610	}
4611
4612	void CharacterRange::AddClassEscape(uint16_t type,
4613	ZoneGrowableArray<CharacterRange>* ranges,
4614	bool add_unicode_case_equivalents) {
4615	if (add_unicode_case_equivalents && (type == `'w'` \|\| type == `'W'`)) {
4616	// See #sec-runtime-semantics-wordcharacters-abstract-operation
4617	// In case of unicode and ignore_case, we need to create the closure over
4618	// case equivalent characters before negating.
4619	ZoneGrowableArray<CharacterRange>* new_ranges =
4620	new ZoneGrowableArray<CharacterRange>(`2`);
4621	AddClass(kWordRanges, kWordRangeCount, new_ranges);
4622	AddUnicodeCaseEquivalents(new_ranges);
4623	if (type == `'W'`) {
4624	ZoneGrowableArray<CharacterRange>* negated =
4625	new ZoneGrowableArray<CharacterRange>(`2`);
4626	CharacterRange::Negate(new_ranges, negated);
4627	new_ranges = negated;
4628	}
4629	ranges->AddArray(*new_ranges);
4630	return;
4631	}
4632	AddClassEscape(type, ranges);
4633	}
4634
4635	void CharacterRange::AddClassEscape(uint16_t type,
4636	ZoneGrowableArray<CharacterRange>* ranges) {
4637	switch (type) {
4638	case `'s'`:
4639	AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
4640	break;
4641	case `'S'`:
4642	AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
4643	break;
4644	case `'w'`:
4645	AddClass(kWordRanges, kWordRangeCount, ranges);
4646	break;
4647	case `'W'`:
4648	AddClassNegated(kWordRanges, kWordRangeCount, ranges);
4649	break;
4650	case `'d'`:
4651	AddClass(kDigitRanges, kDigitRangeCount, ranges);
4652	break;
4653	case `'D'`:
4654	AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
4655	break;
4656	case `'.'`:
4657	AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges);
4658	break;
4659	// This is not a character range as defined by the spec but a
4660	// convenient shorthand for a character class that matches any
4661	// character.
4662	case `'*'`:
4663	ranges->Add(CharacterRange::Everything());
4664	break;
4665	// This is the set of characters matched by the $ and ^ symbols
4666	// in multiline mode.
4667	case `'n'`:
4668	AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges);
4669	break;
4670	default:
4671	UNREACHABLE();
4672	}
4673	}
4674
4675	void CharacterRange::AddCaseEquivalents(
4676	ZoneGrowableArray<CharacterRange>* ranges,
4677	bool is_one_byte,
4678	Zone* zone) {
4679	CharacterRange::Canonicalize(ranges);
4680	int range_count = ranges->length();
4681	for (intptr_t i = `0`; i < range_count; i++) {
4682	CharacterRange range = ranges->At(i);
4683	int32_t bottom = range.from();
4684	if (bottom > Utf16::kMaxCodeUnit) continue;
4685	int32_t top = Utils::Minimum(range.to(), Utf16::kMaxCodeUnit);
4686	// Nothing to be done for surrogates
4687	if (bottom >= Utf16::kLeadSurrogateStart &&
4688	top <= Utf16::kTrailSurrogateEnd) {
4689	continue;
4690	}
4691	if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
4692	if (bottom > Symbols::kMaxOneCharCodeSymbol) continue;
4693	if (top > Symbols::kMaxOneCharCodeSymbol) {
4694	top = Symbols::kMaxOneCharCodeSymbol;
4695	}
4696	}
4697
4698	unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;
4699	unibrow::Mapping<unibrow::CanonicalizationRange> jsregexp_canonrange;
4700	int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4701	if (top == bottom) {
4702	// If this is a singleton we just expand the one character.
4703	intptr_t length = jsregexp_uncanonicalize.get(bottom, `'\0'`, chars);
4704	for (intptr_t i = `0`; i < length; i++) {
4705	int32_t chr = chars[i];
4706	if (chr != bottom) {
4707	ranges->Add(CharacterRange::Singleton(chars[i]));
4708	}
4709	}
4710	} else {
4711	// If this is a range we expand the characters block by block,
4712	// expanding contiguous subranges (blocks) one at a time.
4713	// The approach is as follows. For a given start character we
4714	// look up the remainder of the block that contains it (represented
4715	// by the end point), for instance we find 'z' if the character
4716	// is 'c'. A block is characterized by the property
4717	// that all characters uncanonicalize in the same way, except that
4718	// each entry in the result is incremented by the distance from the first
4719	// element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A']
4720	// and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
4721	// Once we've found the end point we look up its uncanonicalization
4722	// and produce a range for each element. For instance for [c-f]
4723	// we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
4724	// add a range if it is not already contained in the input, so [c-f]
4725	// will be skipped but [C-F] will be added. If this range is not
4726	// completely contained in a block we do this for all the blocks
4727	// covered by the range (handling characters that is not in a block
4728	// as a "singleton block").
4729	int32_t range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4730	intptr_t pos = bottom;
4731	while (pos <= top) {
4732	intptr_t length = jsregexp_canonrange.get(pos, `'\0'`, range);
4733	int32_t block_end;
4734	if (length == `0`) {
4735	block_end = pos;
4736	} else {
4737	ASSERT(length == `1`);
4738	block_end = range[`0`];
4739	}
4740	intptr_t end = (block_end > top) ? top : block_end;
4741	length = jsregexp_uncanonicalize.get(block_end, `'\0'`, range);
4742	for (intptr_t i = `0`; i < length; i++) {
4743	int32_t c = range[i];
4744	int32_t range_from = c - (block_end - pos);
4745	int32_t range_to = c - (block_end - end);
4746	if (!(bottom <= range_from && range_to <= top)) {
4747	ranges->Add(CharacterRange (range_from, range_to));
4748	}
4749	}
4750	pos = end + `1`;
4751	}
4752	}
4753	}
4754	}
4755
4756	bool CharacterRange::IsCanonical(ZoneGrowableArray<CharacterRange>* ranges) {
4757	ASSERT(ranges != NULL);
4758	intptr_t n = ranges->length();
4759	if (n <= `1`) return true;
4760	intptr_t max = ranges->At(`0`).to();
4761	for (intptr_t i = `1`; i < n; i++) {
4762	CharacterRange next_range = ranges->At(i);
4763	if (next_range.from() <= max + `1`) return false;
4764	max = next_range.to();
4765	}
4766	return true;
4767	}
4768
4769	ZoneGrowableArray<CharacterRange>* CharacterSet::ranges() {
4770	if (ranges_ == NULL) {
4771	ranges_ = new ZoneGrowableArray<CharacterRange>(`2`);
4772	CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4773	}
4774	return ranges_;
4775	}
4776
4777	// Move a number of elements in a zone array to another position
4778	// in the same array. Handles overlapping source and target areas.
4779	static void MoveRanges(ZoneGrowableArray<CharacterRange>* list,
4780	intptr_t from,
4781	intptr_t to,
4782	intptr_t count) {
4783	// Ranges are potentially overlapping.
4784	if (from < to) {
4785	for (intptr_t i = count - `1`; i >= `0`; i--) {
4786	(*list)[to + i] = list->At(from + i);
4787	}
4788	} else {
4789	for (intptr_t i = `0`; i < count; i++) {
4790	(*list)[to + i] = list->At(from + i);
4791	}
4792	}
4793	}
4794
4795	static intptr_t InsertRangeInCanonicalList(
4796	ZoneGrowableArray<CharacterRange>* list,
4797	intptr_t count,
4798	CharacterRange insert) {
4799	// Inserts a range into list[0..count[, which must be sorted
4800	// by from value and non-overlapping and non-adjacent, using at most
4801	// list[0..count] for the result. Returns the number of resulting
4802	// canonicalized ranges. Inserting a range may collapse existing ranges into
4803	// fewer ranges, so the return value can be anything in the range 1..count+1.
4804	int32_t from = insert.from();
4805	int32_t to = insert.to();
4806	intptr_t start_pos = `0`;
4807	intptr_t end_pos = count;
4808	for (intptr_t i = count - `1`; i >= `0`; i--) {
4809	CharacterRange current = list->At(i);
4810	if (current.from() > to + `1`) {
4811	end_pos = i;
4812	} else if (current.to() + `1` < from) {
4813	start_pos = i + `1`;
4814	break;
4815	}
4816	}
4817
4818	// Inserted range overlaps, or is adjacent to, ranges at positions
4819	// [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
4820	// not affected by the insertion.
4821	// If start_pos == end_pos, the range must be inserted before start_pos.
4822	// if start_pos < end_pos, the entire range from start_pos to end_pos
4823	// must be merged with the insert range.
4824
4825	if (start_pos == end_pos) {
4826	// Insert between existing ranges at position start_pos.
4827	if (start_pos < count) {
4828	MoveRanges(list, start_pos, start_pos + `1`, count - start_pos);
4829	}
4830	(*list)[start_pos] = insert;
4831	return count + `1`;
4832	}
4833	if (start_pos + `1` == end_pos) {
4834	// Replace single existing range at position start_pos.
4835	CharacterRange to_replace = list->At(start_pos);
4836	intptr_t new_from = Utils::Minimum(to_replace.from(), from);
4837	intptr_t new_to = Utils::Maximum(to_replace.to(), to);
4838	(*list)[start_pos] = CharacterRange (new_from, new_to);
4839	return count;
4840	}
4841	// Replace a number of existing ranges from start_pos to end_pos - 1.
4842	// Move the remaining ranges down.
4843
4844	intptr_t new_from = Utils::Minimum(list->At(start_pos).from(), from);
4845	intptr_t new_to = Utils::Maximum(list->At(end_pos - `1`).to(), to);
4846	if (end_pos < count) {
4847	MoveRanges(list, end_pos, start_pos + `1`, count - end_pos);
4848	}
4849	(*list)[start_pos] = CharacterRange (new_from, new_to);
4850	return count - (end_pos - start_pos) + `1`;
4851	}
4852
4853	void CharacterSet::Canonicalize() {
4854	// Special/default classes are always considered canonical. The result
4855	// of calling ranges() will be sorted.
4856	if (ranges_ == NULL) return;
4857	CharacterRange::Canonicalize(ranges_);
4858	}
4859
4860	void CharacterRange::Canonicalize(
4861	ZoneGrowableArray<CharacterRange>* character_ranges) {
4862	if (character_ranges->length() <= `1`) return;
4863	// Check whether ranges are already canonical (increasing, non-overlapping,
4864	// non-adjacent).
4865	intptr_t n = character_ranges->length();
4866	intptr_t max = character_ranges->At(`0`).to();
4867	intptr_t i = `1`;
4868	while (i < n) {
4869	CharacterRange current = character_ranges->At(i);
4870	if (current.from() <= max + `1`) {
4871	break;
4872	}
4873	max = current.to();
4874	i++;
4875	}
4876	// Canonical until the i'th range. If that's all of them, we are done.
4877	if (i == n) return;
4878
4879	// The ranges at index i and forward are not canonicalized. Make them so by
4880	// doing the equivalent of insertion sort (inserting each into the previous
4881	// list, in order).
4882	// Notice that inserting a range can reduce the number of ranges in the
4883	// result due to combining of adjacent and overlapping ranges.
4884	intptr_t read = i; // Range to insert.
4885	intptr_t num_canonical = i; // Length of canonicalized part of list.
4886	do {
4887	num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
4888	character_ranges->At(read));
4889	read++;
4890	} while (read < n);
4891	character_ranges->TruncateTo(num_canonical);
4892
4893	ASSERT(CharacterRange::IsCanonical(character_ranges));
4894	}
4895
4896	void CharacterRange::Negate(ZoneGrowableArray<CharacterRange>* ranges,
4897	ZoneGrowableArray<CharacterRange>* negated_ranges) {
4898	ASSERT(CharacterRange::IsCanonical(ranges));
4899	ASSERT(negated_ranges->length() == `0`);
4900	intptr_t range_count = ranges->length();
4901	uint32_t from = `0`;
4902	intptr_t i = `0`;
4903	if (range_count > `0` && ranges->At(`0`).from() == `0`) {
4904	from = ranges->At(`0`).to();
4905	i = `1`;
4906	}
4907	while (i < range_count) {
4908	CharacterRange range = ranges->At(i);
4909	negated_ranges->Add(CharacterRange (from + `1`, range.from() - `1`));
4910	from = range.to();
4911	i++;
4912	}
4913	if (from < Utf::kMaxCodePoint) {
4914	negated_ranges->Add(CharacterRange (from + `1`, Utf::kMaxCodePoint));
4915	}
4916	}
4917
4918	// -------------------------------------------------------------------
4919	// Splay tree
4920
4921	// Workaround for the fact that ZoneGrowableArray does not have contains().
4922	static bool ArrayContains(ZoneGrowableArray<unsigned>* array, unsigned value) {
4923	for (intptr_t i = `0`; i < array->length(); i++) {
4924	if (array->At(i) == value) {
4925	return true;
4926	}
4927	}
4928	return false;
4929	}
4930
4931	OutSet* OutSet::Extend(unsigned value, Zone* zone) {
4932	if (Get(value)) return this;
4933	if (successors() != nullptr) {
4934	for (int i = `0`; i < successors()->length(); i++) {
4935	OutSet* successor = successors()->At(i);
4936	if (successor->Get(value)) return successor;
4937	}
4938	} else {
4939	successors_ = new (zone) ZoneGrowableArray<OutSet*>(`2`);
4940	}
4941	OutSet* result = new (zone) OutSet (first_, remaining_);
4942	result->Set(value, zone);
4943	successors()->Add(result);
4944	return result;
4945	}
4946
4947	void OutSet::Set(unsigned value, Zone* zone) {
4948	if (value < kFirstLimit) {
4949	first_ \|= (`1` << value);
4950	} else {
4951	if (remaining_ == NULL)
4952	remaining_ = new (zone) ZoneGrowableArray<unsigned>(`1`);
4953
4954	bool remaining_contains_value = ArrayContains(remaining_, value);
4955	if (remaining_->is_empty() \|\| !remaining_contains_value) {
4956	remaining_->Add(value);
4957	}
4958	}
4959	}
4960
4961	bool OutSet::Get(unsigned value) const {
4962	if (value < kFirstLimit) {
4963	return (first_ & (`1` << value)) != `0`;
4964	} else if (remaining_ == NULL) {
4965	return false;
4966	} else {
4967	return ArrayContains(remaining_, value);
4968	}
4969	}
4970
4971	const int32_t ChoiceTable::Config::kNoKey = Utf::kInvalidChar;
4972
4973	void ChoiceTable::AddRange(CharacterRange full_range,
4974	int32_t value,
4975	Zone* zone) {
4976	CharacterRange current = full_range;
4977	if (tree()->is_empty()) {
4978	// If this is the first range we just insert into the table.
4979	ZoneSplayTree<Config>::Locator loc;
4980	bool inserted = tree()->Insert(current.from(), &loc);
4981	ASSERT(inserted);
4982	USE(inserted);
4983	loc.set_value(
4984	Entry (current.from(), current.to(), empty()->Extend(value, zone)));
4985	return;
4986	}
4987	// First see if there is a range to the left of this one that
4988	// overlaps.
4989	ZoneSplayTree<Config>::Locator loc;
4990	if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4991	Entry* entry = &loc.value();
4992	// If we've found a range that overlaps with this one, and it
4993	// starts strictly to the left of this one, we have to fix it
4994	// because the following code only handles ranges that start on
4995	// or after the start point of the range we're adding.
4996	if (entry->from() < current.from() && entry->to() >= current.from()) {
4997	// Snap the overlapping range in half around the start point of
4998	// the range we're adding.
4999	CharacterRange left =
5000	CharacterRange::Range(entry->from(), current.from() - `1`);
5001	CharacterRange right = CharacterRange::Range(current.from(), entry->to());
5002	// The left part of the overlapping range doesn't overlap.
5003	// Truncate the whole entry to be just the left part.
5004	entry->set_to(left.to());
5005	// The right part is the one that overlaps. We add this part
5006	// to the map and let the next step deal with merging it with
5007	// the range we're adding.
5008	ZoneSplayTree<Config>::Locator loc;
5009	bool inserted = tree()->Insert(right.from(), &loc);
5010	ASSERT(inserted);
5011	USE(inserted);
5012	loc.set_value(Entry (right.from(), right.to(), entry->out_set()));
5013	}
5014	}
5015	while (current.is_valid()) {
5016	if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
5017	(loc.value().from() <= current.to()) &&
5018	(loc.value().to() >= current.from())) {
5019	Entry* entry = &loc.value();
5020	// We have overlap. If there is space between the start point of
5021	// the range we're adding and where the overlapping range starts
5022	// then we have to add a range covering just that space.
5023	if (current.from() < entry->from()) {
5024	ZoneSplayTree<Config>::Locator ins;
5025	bool inserted = tree()->Insert(current.from(), &ins);
5026	ASSERT(inserted);
5027	USE(inserted);
5028	ins.set_value(Entry (current.from(), entry->from() - `1`,
5029	empty()->Extend(value, zone)));
5030	current.set_from(entry->from());
5031	}
5032	ASSERT(current.from() == entry->from());
5033	// If the overlapping range extends beyond the one we want to add
5034	// we have to snap the right part off and add it separately.
5035	if (entry->to() > current.to()) {
5036	ZoneSplayTree<Config>::Locator ins;
5037	bool inserted = tree()->Insert(current.to() + `1`, &ins);
5038	ASSERT(inserted);
5039	USE(inserted);
5040	ins.set_value(Entry (current.to() + `1`, entry->to(), entry->out_set()));
5041	entry->set_to(current.to());
5042	}
5043	ASSERT(entry->to() <= current.to());
5044	// The overlapping range is now completely contained by the range
5045	// we're adding so we can just update it and move the start point
5046	// of the range we're adding just past it.
5047	entry->AddValue(value, zone);
5048	ASSERT(entry->to() + `1` > current.from());
5049	current.set_from(entry->to() + `1`);
5050	} else {
5051	// There is no overlap so we can just add the range
5052	ZoneSplayTree<Config>::Locator ins;
5053	bool inserted = tree()->Insert(current.from(), &ins);
5054	ASSERT(inserted);
5055	USE(inserted);
5056	ins.set_value(
5057	Entry (current.from(), current.to(), empty()->Extend(value, zone)));
5058	break;
5059	}
5060	}
5061	}
5062
5063	OutSet* ChoiceTable::Get(int32_t value) {
5064	ZoneSplayTree<Config>::Locator loc;
5065	if (!tree()->FindGreatestLessThan(value, &loc)) return empty();
5066	Entry* entry = &loc.value();
5067	if (value <= entry->to())
5068	return entry->out_set();
5069	else
5070	return empty();
5071	}
5072
5073	// -------------------------------------------------------------------
5074	// Analysis
5075
5076	void Analysis::EnsureAnalyzed(RegExpNode* that) {
5077	if (that->info()->been_analyzed \|\| that->info()->being_analyzed) return;
5078	that->info()->being_analyzed = true;
5079	that->Accept(this);
5080	that->info()->being_analyzed = false;
5081	that->info()->been_analyzed = true;
5082	}
5083
5084	void Analysis::VisitEnd(EndNode* that) {
5085	// nothing to do
5086	}
5087
5088	void TextNode::CalculateOffsets() {
5089	intptr_t element_count = elements()->length();
5090	// Set up the offsets of the elements relative to the start. This is a fixed
5091	// quantity since a TextNode can only contain fixed-width things.
5092	intptr_t cp_offset = `0`;
5093	for (intptr_t i = `0`; i < element_count; i++) {
5094	TextElement& elm = (*elements())[i];
5095	elm.set_cp_offset(cp_offset);
5096	cp_offset += elm.length();
5097	}
5098	}
5099
5100	void Analysis::VisitText(TextNode* that) {
5101	that->MakeCaseIndependent(is_one_byte_);
5102	EnsureAnalyzed(that->on_success());
5103	if (!has_failed()) {
5104	that->CalculateOffsets();
5105	}
5106	}
5107
5108	void Analysis::VisitAction(ActionNode* that) {
5109	RegExpNode* target = that->on_success();
5110	EnsureAnalyzed(target);
5111	if (!has_failed()) {
5112	// If the next node is interested in what it follows then this node
5113	// has to be interested too so it can pass the information on.
5114	that->info()->AddFromFollowing(target->info());
5115	}
5116	}
5117
5118	void Analysis::VisitChoice(ChoiceNode* that) {
5119	NodeInfo* info = that->info();
5120	for (intptr_t i = `0`; i < that->alternatives()->length(); i++) {
5121	RegExpNode* node = (*that->alternatives())[i].node();
5122	EnsureAnalyzed(node);
5123	if (has_failed()) return;
5124	// Anything the following nodes need to know has to be known by
5125	// this node also, so it can pass it on.
5126	info->AddFromFollowing(node->info());
5127	}
5128	}
5129
5130	void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
5131	NodeInfo* info = that->info();
5132	for (intptr_t i = `0`; i < that->alternatives()->length(); i++) {
5133	RegExpNode* node = (*that->alternatives())[i].node();
5134	if (node != that->loop_node()) {
5135	EnsureAnalyzed(node);
5136	if (has_failed()) return;
5137	info->AddFromFollowing(node->info());
5138	}
5139	}
5140	// Check the loop last since it may need the value of this node
5141	// to get a correct result.
5142	EnsureAnalyzed(that->loop_node());
5143	if (!has_failed()) {
5144	info->AddFromFollowing(that->loop_node()->info());
5145	}
5146	}
5147
5148	void Analysis::VisitBackReference(BackReferenceNode* that) {
5149	EnsureAnalyzed(that->on_success());
5150	}
5151
5152	void Analysis::VisitAssertion(AssertionNode* that) {
5153	EnsureAnalyzed(that->on_success());
5154	}
5155
5156	void BackReferenceNode::FillInBMInfo(intptr_t offset,
5157	intptr_t budget,
5158	BoyerMooreLookahead* bm,
5159	bool not_at_start) {
5160	// Working out the set of characters that a backreference can match is too
5161	// hard, so we just say that any character can match.
5162	bm->SetRest(offset);
5163	SaveBMInfo(bm, not_at_start, offset);
5164	}
5165
5166	COMPILE_ASSERT(BoyerMoorePositionInfo::kMapSize ==
5167	RegExpMacroAssembler::kTableSize);
5168
5169	void ChoiceNode::FillInBMInfo(intptr_t offset,
5170	intptr_t budget,
5171	BoyerMooreLookahead* bm,
5172	bool not_at_start) {
5173	ZoneGrowableArray<GuardedAlternative>* alts = alternatives();
5174	budget = (budget - `1`) / alts->length();
5175	for (intptr_t i = `0`; i < alts->length(); i++) {
5176	GuardedAlternative& alt = (*alts)[i];
5177	if (alt.guards() != NULL && alt.guards()->length() != `0`) {
5178	bm->SetRest(offset); // Give up trying to fill in info.
5179	SaveBMInfo(bm, not_at_start, offset);
5180	return;
5181	}
5182	alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);
5183	}
5184	SaveBMInfo(bm, not_at_start, offset);
5185	}
5186
5187	void TextNode::FillInBMInfo(intptr_t initial_offset,
5188	intptr_t budget,
5189	BoyerMooreLookahead* bm,
5190	bool not_at_start) {
5191	if (initial_offset >= bm->length()) return;
5192	intptr_t offset = initial_offset;
5193	intptr_t max_char = bm->max_char();
5194	for (intptr_t i = `0`; i < elements()->length(); i++) {
5195	if (offset >= bm->length()) {
5196	if (initial_offset == `0`) set_bm_info(not_at_start, bm);
5197	return;
5198	}
5199	TextElement text = elements()->At(i);
5200	if (text.text_type() == TextElement::ATOM) {
5201	RegExpAtom* atom = text.atom();
5202	for (intptr_t j = `0`; j < atom->length(); j++, offset++) {
5203	if (offset >= bm->length()) {
5204	if (initial_offset == `0`) set_bm_info(not_at_start, bm);
5205	return;
5206	}
5207	uint16_t character = atom->data()->At(j);
5208	if (atom->flags().IgnoreCase()) {
5209	int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5210	intptr_t length = GetCaseIndependentLetters(
5211	character, bm->max_char() == Symbols::kMaxOneCharCodeSymbol,
5212	chars);
5213	for (intptr_t j = `0`; j < length; j++) {
5214	bm->Set(offset, chars[j]);
5215	}
5216	} else {
5217	if (character <= max_char) bm->Set(offset, character);
5218	}
5219	}
5220	} else {
5221	ASSERT(text.text_type() == TextElement::CHAR_CLASS);
5222	RegExpCharacterClass* char_class = text.char_class();
5223	ZoneGrowableArray<CharacterRange>* ranges = char_class->ranges();
5224	if (char_class->is_negated()) {
5225	bm->SetAll(offset);
5226	} else {
5227	for (intptr_t k = `0`; k < ranges->length(); k++) {
5228	const CharacterRange& range = ranges->At(k);
5229	if (range.from() > max_char) continue;
5230	intptr_t to =
5231	Utils::Minimum(max_char, static_cast<intptr_t>(range.to()));
5232	bm->SetInterval(offset, Interval (range.from(), to));
5233	}
5234	}
5235	offset++;
5236	}
5237	}
5238	if (offset >= bm->length()) {
5239	if (initial_offset == `0`) set_bm_info(not_at_start, bm);
5240	return;
5241	}
5242	on_success()->FillInBMInfo(offset, budget - `1`, bm,
5243	true); // Not at start after a text node.
5244	if (initial_offset == `0`) set_bm_info(not_at_start, bm);
5245	}
5246
5247	RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
5248	RegExpNode* on_success,
5249	RegExpFlags flags) {
5250	// If the regexp matching starts within a surrogate pair, step back
5251	// to the lead surrogate and start matching from there.
5252	ASSERT(!compiler->read_backward());
5253	Zone* zone = compiler->zone();
5254
5255	auto lead_surrogates = CharacterRange::List(
5256	on_success->zone(), CharacterRange::Range(Utf16::kLeadSurrogateStart,
5257	Utf16::kLeadSurrogateEnd));
5258	auto trail_surrogates = CharacterRange::List(
5259	on_success->zone(), CharacterRange::Range(Utf16::kTrailSurrogateStart,
5260	Utf16::kTrailSurrogateEnd));
5261
5262	ChoiceNode* optional_step_back = new (zone) ChoiceNode (`2`, zone);
5263
5264	int stack_register = compiler->UnicodeLookaroundStackRegister();
5265	int position_register = compiler->UnicodeLookaroundPositionRegister();
5266	RegExpNode* step_back = TextNode::CreateForCharacterRanges(
5267	lead_surrogates, /read_backward=/true, on_success, flags);
5268	RegExpLookaround::Builder builder(/is_positive=/true, step_back,
5269	stack_register, position_register);
5270	RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
5271	trail_surrogates, /read_backward=/false, builder.on_match_success(),
5272	flags);
5273
5274	optional_step_back->AddAlternative(
5275	GuardedAlternative (builder.ForMatch(match_trail)));
5276	optional_step_back->AddAlternative(GuardedAlternative (on_success));
5277
5278	return optional_step_back;
5279	}
5280
5281	#if !defined(DART_PRECOMPILED_RUNTIME)
5282	RegExpEngine::CompilationResult RegExpEngine::CompileIR(
5283	RegExpCompileData* data,
5284	const ParsedFunction* parsed_function,
5285	const ZoneGrowableArray<const ICData*>& ic_data_array,
5286	intptr_t osr_id) {
5287	ASSERT(!FLAG_interpret_irregexp);
5288	Zone* zone = Thread::Current()->zone();
5289
5290	const Function& function = parsed_function->function();
5291	const intptr_t specialization_cid = function.string_specialization_cid();
5292	const bool is_sticky = function.is_sticky_specialization();
5293	const bool is_one_byte = (specialization_cid == kOneByteStringCid \|\|
5294	specialization_cid == kExternalOneByteStringCid);
5295	RegExp& regexp = RegExp::Handle(zone, function.regexp());
5296	const String& pattern = String::Handle(zone, regexp.pattern());
5297
5298	ASSERT(!regexp.IsNull());
5299	ASSERT(!pattern.IsNull());
5300
5301	const bool is_global = regexp.flags().IsGlobal();
5302	const bool is_unicode = regexp.flags().IsUnicode();
5303
5304	RegExpCompiler compiler(data->capture_count, is_one_byte);
5305
5306	// TODO(zerny): Frequency sampling is currently disabled because of several
5307	// issues. We do not want to store subject strings in the regexp object since
5308	// they might be long and we should not prevent their garbage collection.
5309	// Passing them to this function explicitly does not help, since we must
5310	// generate exactly the same IR for both the unoptimizing and optimizing
5311	// pipelines (otherwise it gets confused when i.e. deopt id's differ).
5312	// An option would be to store sampling results in the regexp object, but
5313	// I'm not sure the performance gains are relevant enough.
5314
5315	// Wrap the body of the regexp in capture #0.
5316	RegExpNode* captured_body =
5317	RegExpCapture::ToNode(data->tree, `0`, &compiler, compiler.accept());
5318
5319	RegExpNode* node = captured_body;
5320	const bool is_end_anchored = data->tree->IsAnchoredAtEnd();
5321	const bool is_start_anchored = data->tree->IsAnchoredAtStart();
5322	intptr_t max_length = data->tree->max_match();
5323	if (!is_start_anchored && !is_sticky) {
5324	// Add a .? at the beginning, outside the body capture, unless*
5325	// this expression is anchored at the beginning or is sticky.
5326	RegExpNode* loop_node = RegExpQuantifier::ToNode(
5327	`0`, RegExpTree::kInfinity, false,
5328	new (zone) RegExpCharacterClass (`'*'`, RegExpFlags ()), &compiler,
5329	captured_body, data->contains_anchor);
5330
5331	if (data->contains_anchor) {
5332	// Unroll loop once, to take care of the case that might start
5333	// at the start of input.
5334	ChoiceNode* first_step_node = new (zone) ChoiceNode (`2`, zone);
5335	first_step_node->AddAlternative(GuardedAlternative (captured_body));
5336	first_step_node->AddAlternative(GuardedAlternative (new (zone) TextNode (
5337	new (zone) RegExpCharacterClass (`'*'`, RegExpFlags ()),
5338	/read_backwards=/false, loop_node)));
5339	node = first_step_node;
5340	} else {
5341	node = loop_node;
5342	}
5343	}
5344	if (is_one_byte) {
5345	node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
5346	// Do it again to propagate the new nodes to places where they were not
5347	// put because they had not been calculated yet.
5348	if (node != NULL) {
5349	node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
5350	}
5351	} else if (is_unicode && (is_global \|\| is_sticky)) {
5352	node = OptionallyStepBackToLeadSurrogate(&compiler, node, regexp.flags());
5353	}
5354
5355	if (node == NULL) node = new (zone) EndNode (EndNode::BACKTRACK, zone);
5356	data->node = node;
5357	Analysis analysis(is_one_byte);
5358	analysis.EnsureAnalyzed(node);
5359	if (analysis.has_failed()) {
5360	const char* error_message = analysis.error_message();
5361	return CompilationResult (error_message);
5362	}
5363
5364	// Native regexp implementation.
5365
5366	IRRegExpMacroAssembler* macro_assembler = new (zone)
5367	IRRegExpMacroAssembler (specialization_cid, data->capture_count,
5368	parsed_function, ic_data_array, osr_id, zone);
5369
5370	// Inserted here, instead of in Assembler, because it depends on information
5371	// in the AST that isn't replicated in the Node structure.
5372	static const intptr_t kMaxBacksearchLimit = `1024`;
5373	if (is_end_anchored && !is_start_anchored && !is_sticky &&
5374	max_length < kMaxBacksearchLimit) {
5375	macro_assembler->SetCurrentPositionFromEnd(max_length);
5376	}
5377
5378	if (is_global) {
5379	RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
5380	if (data->tree->min_match() > `0`) {
5381	mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
5382	} else if (is_unicode) {
5383	mode = RegExpMacroAssembler::GLOBAL_UNICODE;
5384	}
5385	macro_assembler->set_global_mode(mode);
5386	}
5387
5388	RegExpEngine::CompilationResult result =
5389	compiler.Assemble(macro_assembler, node, data->capture_count, pattern);
5390
5391	if (FLAG_trace_irregexp) {
5392	macro_assembler->PrintBlocks();
5393	}
5394
5395	return result;
5396	}
5397	#endif // !defined(DART_PRECOMPILED_RUNTIME)
5398
5399	RegExpEngine::CompilationResult RegExpEngine::CompileBytecode(
5400	RegExpCompileData* data,
5401	const RegExp& regexp,
5402	bool is_one_byte,
5403	bool is_sticky,
5404	Zone* zone) {
5405	ASSERT(FLAG_interpret_irregexp);
5406	const String& pattern = String::Handle(zone, regexp.pattern());
5407
5408	ASSERT(!regexp.IsNull());
5409	ASSERT(!pattern.IsNull());
5410
5411	const bool is_global = regexp.flags().IsGlobal();
5412	const bool is_unicode = regexp.flags().IsUnicode();
5413
5414	RegExpCompiler compiler(data->capture_count, is_one_byte);
5415
5416	// TODO(zerny): Frequency sampling is currently disabled because of several
5417	// issues. We do not want to store subject strings in the regexp object since
5418	// they might be long and we should not prevent their garbage collection.
5419	// Passing them to this function explicitly does not help, since we must
5420	// generate exactly the same IR for both the unoptimizing and optimizing
5421	// pipelines (otherwise it gets confused when i.e. deopt id's differ).
5422	// An option would be to store sampling results in the regexp object, but
5423	// I'm not sure the performance gains are relevant enough.
5424
5425	// Wrap the body of the regexp in capture #0.
5426	RegExpNode* captured_body =
5427	RegExpCapture::ToNode(data->tree, `0`, &compiler, compiler.accept());
5428
5429	RegExpNode* node = captured_body;
5430	bool is_end_anchored = data->tree->IsAnchoredAtEnd();
5431	bool is_start_anchored = data->tree->IsAnchoredAtStart();
5432	intptr_t max_length = data->tree->max_match();
5433	if (!is_start_anchored && !is_sticky) {
5434	// Add a .? at the beginning, outside the body capture, unless*
5435	// this expression is anchored at the beginning.
5436	RegExpNode* loop_node = RegExpQuantifier::ToNode(
5437	`0`, RegExpTree::kInfinity, false,
5438	new (zone) RegExpCharacterClass (`'*'`, RegExpFlags ()), &compiler,
5439	captured_body, data->contains_anchor);
5440
5441	if (data->contains_anchor) {
5442	// Unroll loop once, to take care of the case that might start
5443	// at the start of input.
5444	ChoiceNode* first_step_node = new (zone) ChoiceNode (`2`, zone);
5445	first_step_node->AddAlternative(GuardedAlternative (captured_body));
5446	first_step_node->AddAlternative(GuardedAlternative (new (zone) TextNode (
5447	new (zone) RegExpCharacterClass (`'*'`, RegExpFlags ()),
5448	/read_backwards=/false, loop_node)));
5449	node = first_step_node;
5450	} else {
5451	node = loop_node;
5452	}
5453	}
5454	if (is_one_byte) {
5455	node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
5456	// Do it again to propagate the new nodes to places where they were not
5457	// put because they had not been calculated yet.
5458	if (node != NULL) {
5459	node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
5460	}
5461	} else if (is_unicode && (is_global \|\| is_sticky)) {
5462	node = OptionallyStepBackToLeadSurrogate(&compiler, node, regexp.flags());
5463	}
5464
5465	if (node == NULL) node = new (zone) EndNode (EndNode::BACKTRACK, zone);
5466	data->node = node;
5467	Analysis analysis(is_one_byte);
5468	analysis.EnsureAnalyzed(node);
5469	if (analysis.has_failed()) {
5470	const char* error_message = analysis.error_message();
5471	return CompilationResult (error_message);
5472	}
5473
5474	// Bytecode regexp implementation.
5475
5476	ZoneGrowableArray<uint8_t> buffer(zone, `1024`);
5477	BytecodeRegExpMacroAssembler* macro_assembler =
5478	new (zone) BytecodeRegExpMacroAssembler (&buffer, zone);
5479
5480	// Inserted here, instead of in Assembler, because it depends on information
5481	// in the AST that isn't replicated in the Node structure.
5482	static const intptr_t kMaxBacksearchLimit = `1024`;
5483	if (is_end_anchored && !is_start_anchored && !is_sticky &&
5484	max_length < kMaxBacksearchLimit) {
5485	macro_assembler->SetCurrentPositionFromEnd(max_length);
5486	}
5487
5488	if (is_global) {
5489	RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
5490	if (data->tree->min_match() > `0`) {
5491	mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
5492	} else if (is_unicode) {
5493	mode = RegExpMacroAssembler::GLOBAL_UNICODE;
5494	}
5495	macro_assembler->set_global_mode(mode);
5496	}
5497
5498	RegExpEngine::CompilationResult result =
5499	compiler.Assemble(macro_assembler, node, data->capture_count, pattern);
5500
5501	if (FLAG_trace_irregexp) {
5502	macro_assembler->PrintBlocks();
5503	}
5504
5505	return result;
5506	}
5507
5508	static void CreateSpecializedFunction(Thread* thread,
5509	Zone* zone,
5510	const RegExp& regexp,
5511	intptr_t specialization_cid,
5512	bool sticky,
5513	const Object& owner) {
5514	const intptr_t kParamCount = RegExpMacroAssembler::kParamCount;
5515
5516	Function& fn =
5517	Function::Handle(zone, Function::New(Symbols::ColonMatcher(),
5518	FunctionLayout::kIrregexpFunction,
5519	true, // Static.
5520	false, // Not const.
5521	false, // Not abstract.
5522	false, // Not external.
5523	false, // Not native.
5524	owner, TokenPosition::kMinSource));
5525
5526	// TODO(zerny): Share these arrays between all irregexp functions.
5527	fn.set_num_fixed_parameters(kParamCount);
5528	fn.set_parameter_types(
5529	Array::Handle(zone, Array::New(kParamCount, Heap::kOld)));
5530	fn.set_parameter_names(
5531	Array::Handle(zone, Array::New(kParamCount, Heap::kOld)));
5532	fn.SetParameterTypeAt(RegExpMacroAssembler::kParamRegExpIndex,
5533	Object::dynamic_type());
5534	fn.SetParameterNameAt(RegExpMacroAssembler::kParamRegExpIndex,
5535	Symbols::This());
5536	fn.SetParameterTypeAt(RegExpMacroAssembler::kParamStringIndex,
5537	Object::dynamic_type());
5538	fn.SetParameterNameAt(RegExpMacroAssembler::kParamStringIndex,
5539	Symbols::string_param());
5540	fn.SetParameterTypeAt(RegExpMacroAssembler::kParamStartOffsetIndex,
5541	Object::dynamic_type());
5542	fn.SetParameterNameAt(RegExpMacroAssembler::kParamStartOffsetIndex,
5543	Symbols::start_index_param());
5544	fn.set_result_type(Type::Handle(zone, Type::ArrayType()));
5545
5546	// Cache the result.
5547	regexp.set_function(specialization_cid, sticky, fn);
5548
5549	fn.SetRegExpData(regexp, specialization_cid, sticky);
5550	fn.set_is_debuggable(false);
5551
5552	// The function is compiled lazily during the first call.
5553	}
5554
5555	RegExpPtr RegExpEngine::CreateRegExp(Thread* thread,
5556	const String& pattern,
5557	RegExpFlags flags) {
5558	Zone* zone = thread->zone();
5559	const RegExp& regexp = RegExp::Handle(RegExp::New());
5560
5561	regexp.set_pattern(pattern);
5562	regexp.set_flags(flags);
5563
5564	// TODO(zerny): We might want to use normal string searching algorithms
5565	// for simple patterns.
5566	regexp.set_is_complex();
5567	regexp.set_is_global(); // All dart regexps are global.
5568
5569	if (!FLAG_interpret_irregexp) {
5570	const Library& lib = Library::Handle(zone, Library::CoreLibrary());
5571	const Class& owner =
5572	Class::Handle(zone, lib.LookupClass(Symbols::RegExp()));
5573
5574	for (intptr_t cid = kOneByteStringCid; cid <= kExternalTwoByteStringCid;
5575	cid++) {
5576	CreateSpecializedFunction(thread, zone, regexp, cid, /sticky=/false,
5577	owner);
5578	CreateSpecializedFunction(thread, zone, regexp, cid, /sticky=/true,
5579	owner);
5580	}
5581	}
5582
5583	return regexp.raw();
5584	}
5585
5586	} // namespace dart
5587

Browse the source code of engine/third_party/dart/runtime/vm/regexp.cc