1// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2// for details. All rights reserved. Use of this source code is governed by a
3// BSD-style license that can be found in the LICENSE file.
4
5#include "vm/regexp.h"
6
7#include <memory>
8
9#include "platform/splay-tree-inl.h"
10#include "platform/unicode.h"
11
12#include "unicode/uniset.h"
13
14#include "vm/dart_entry.h"
15#include "vm/regexp_assembler.h"
16#include "vm/regexp_assembler_bytecode.h"
17#include "vm/regexp_ast.h"
18#include "vm/symbols.h"
19#include "vm/thread.h"
20#include "vm/unibrow-inl.h"
21
22#if !defined(DART_PRECOMPILED_RUNTIME)
23#include "vm/regexp_assembler_ir.h"
24#endif // !defined(DART_PRECOMPILED_RUNTIME)
25
26#define Z (zone())
27
28namespace dart {
29
30// Default to generating optimized regexp code.
31static const bool kRegexpOptimization = true;
32
33// More makes code generation slower, less makes V8 benchmark score lower.
34static const intptr_t kMaxLookaheadForBoyerMoore = 8;
35
36ContainedInLattice AddRange(ContainedInLattice containment,
37 const int32_t* ranges,
38 intptr_t ranges_length,
39 Interval new_range) {
40 ASSERT((ranges_length & 1) == 1);
41 ASSERT(ranges[ranges_length - 1] == Utf::kMaxCodePoint + 1);
42 if (containment == kLatticeUnknown) return containment;
43 bool inside = false;
44 int32_t last = 0;
45 for (intptr_t i = 0; i < ranges_length;
46 inside = !inside, last = ranges[i], i++) {
47 // Consider the range from last to ranges[i].
48 // We haven't got to the new range yet.
49 if (ranges[i] <= new_range.from()) continue;
50 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
51 // inclusive, but the values in ranges are not.
52 if (last <= new_range.from() && new_range.to() < ranges[i]) {
53 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
54 }
55 return kLatticeUnknown;
56 }
57 return containment;
58}
59
60// -------------------------------------------------------------------
61// Implementation of the Irregexp regular expression engine.
62//
63// The Irregexp regular expression engine is intended to be a complete
64// implementation of ECMAScript regular expressions. It generates
65// IR code that is subsequently compiled to native code.
66
67// The Irregexp regexp engine is structured in three steps.
68// 1) The parser generates an abstract syntax tree. See regexp_ast.cc.
69// 2) From the AST a node network is created. The nodes are all
70// subclasses of RegExpNode. The nodes represent states when
71// executing a regular expression. Several optimizations are
72// performed on the node network.
73// 3) From the nodes we generate IR instructions that can actually
74// execute the regular expression (perform the search). The
75// code generation step is described in more detail below.
76
77// Code generation.
78//
79// The nodes are divided into four main categories.
80// * Choice nodes
81// These represent places where the regular expression can
82// match in more than one way. For example on entry to an
83// alternation (foo|bar) or a repetition (*, +, ? or {}).
84// * Action nodes
85// These represent places where some action should be
86// performed. Examples include recording the current position
87// in the input string to a register (in order to implement
88// captures) or other actions on register for example in order
89// to implement the counters needed for {} repetitions.
90// * Matching nodes
91// These attempt to match some element part of the input string.
92// Examples of elements include character classes, plain strings
93// or back references.
94// * End nodes
95// These are used to implement the actions required on finding
96// a successful match or failing to find a match.
97//
98// The code generated maintains some state as it runs. This consists of the
99// following elements:
100//
101// * The capture registers. Used for string captures.
102// * Other registers. Used for counters etc.
103// * The current position.
104// * The stack of backtracking information. Used when a matching node
105// fails to find a match and needs to try an alternative.
106//
107// Conceptual regular expression execution model:
108//
109// There is a simple conceptual model of regular expression execution
110// which will be presented first. The actual code generated is a more
111// efficient simulation of the simple conceptual model:
112//
113// * Choice nodes are implemented as follows:
114// For each choice except the last {
115// push current position
116// push backtrack code location
117// <generate code to test for choice>
118// backtrack code location:
119// pop current position
120// }
121// <generate code to test for last choice>
122//
123// * Actions nodes are generated as follows
124// <push affected registers on backtrack stack>
125// <generate code to perform action>
126// push backtrack code location
127// <generate code to test for following nodes>
128// backtrack code location:
129// <pop affected registers to restore their state>
130// <pop backtrack location from stack and go to it>
131//
132// * Matching nodes are generated as follows:
133// if input string matches at current position
134// update current position
135// <generate code to test for following nodes>
136// else
137// <pop backtrack location from stack and go to it>
138//
139// Thus it can be seen that the current position is saved and restored
140// by the choice nodes, whereas the registers are saved and restored by
141// by the action nodes that manipulate them.
142//
143// The other interesting aspect of this model is that nodes are generated
144// at the point where they are needed by a recursive call to Emit(). If
145// the node has already been code generated then the Emit() call will
146// generate a jump to the previously generated code instead. In order to
147// limit recursion it is possible for the Emit() function to put the node
148// on a work list for later generation and instead generate a jump. The
149// destination of the jump is resolved later when the code is generated.
150//
151// Actual regular expression code generation.
152//
153// Code generation is actually more complicated than the above. In order
154// to improve the efficiency of the generated code some optimizations are
155// performed
156//
157// * Choice nodes have 1-character lookahead.
158// A choice node looks at the following character and eliminates some of
159// the choices immediately based on that character. This is not yet
160// implemented.
161// * Simple greedy loops store reduced backtracking information.
162// A quantifier like /.*foo/m will greedily match the whole input. It will
163// then need to backtrack to a point where it can match "foo". The naive
164// implementation of this would push each character position onto the
165// backtracking stack, then pop them off one by one. This would use space
166// proportional to the length of the input string. However since the "."
167// can only match in one way and always has a constant length (in this case
168// of 1) it suffices to store the current position on the top of the stack
169// once. Matching now becomes merely incrementing the current position and
170// backtracking becomes decrementing the current position and checking the
171// result against the stored current position. This is faster and saves
172// space.
173// * The current state is virtualized.
174// This is used to defer expensive operations until it is clear that they
175// are needed and to generate code for a node more than once, allowing
176// specialized an efficient versions of the code to be created. This is
177// explained in the section below.
178//
179// Execution state virtualization.
180//
181// Instead of emitting code, nodes that manipulate the state can record their
182// manipulation in an object called the Trace. The Trace object can record a
183// current position offset, an optional backtrack code location on the top of
184// the virtualized backtrack stack and some register changes. When a node is
185// to be emitted it can flush the Trace or update it. Flushing the Trace
186// will emit code to bring the actual state into line with the virtual state.
187// Avoiding flushing the state can postpone some work (e.g. updates of capture
188// registers). Postponing work can save time when executing the regular
189// expression since it may be found that the work never has to be done as a
190// failure to match can occur. In addition it is much faster to jump to a
191// known backtrack code location than it is to pop an unknown backtrack
192// location from the stack and jump there.
193//
194// The virtual state found in the Trace affects code generation. For example
195// the virtual state contains the difference between the actual current
196// position and the virtual current position, and matching code needs to use
197// this offset to attempt a match in the correct location of the input
198// string. Therefore code generated for a non-trivial trace is specialized
199// to that trace. The code generator therefore has the ability to generate
200// code for each node several times. In order to limit the size of the
201// generated code there is an arbitrary limit on how many specialized sets of
202// code may be generated for a given node. If the limit is reached, the
203// trace is flushed and a generic version of the code for a node is emitted.
204// This is subsequently used for that node. The code emitted for non-generic
205// trace is not recorded in the node and so it cannot currently be reused in
206// the event that code generation is requested for an identical trace.
207
208void RegExpTree::AppendToText(RegExpText* text) {
209 UNREACHABLE();
210}
211
212void RegExpAtom::AppendToText(RegExpText* text) {
213 text->AddElement(TextElement::Atom(this));
214}
215
216void RegExpCharacterClass::AppendToText(RegExpText* text) {
217 text->AddElement(TextElement::CharClass(this));
218}
219
220void RegExpText::AppendToText(RegExpText* text) {
221 for (intptr_t i = 0; i < elements()->length(); i++)
222 text->AddElement((*elements())[i]);
223}
224
225TextElement TextElement::Atom(RegExpAtom* atom) {
226 return TextElement(ATOM, atom);
227}
228
229TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
230 return TextElement(CHAR_CLASS, char_class);
231}
232
233intptr_t TextElement::length() const {
234 switch (text_type()) {
235 case ATOM:
236 return atom()->length();
237
238 case CHAR_CLASS:
239 return 1;
240 }
241 UNREACHABLE();
242 return 0;
243}
244
245class FrequencyCollator : public ValueObject {
246 public:
247 FrequencyCollator() : total_samples_(0) {
248 for (intptr_t i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
249 frequencies_[i] = CharacterFrequency(i);
250 }
251 }
252
253 void CountCharacter(intptr_t character) {
254 intptr_t index = (character & RegExpMacroAssembler::kTableMask);
255 frequencies_[index].Increment();
256 total_samples_++;
257 }
258
259 // Does not measure in percent, but rather per-128 (the table size from the
260 // regexp macro assembler).
261 intptr_t Frequency(intptr_t in_character) {
262 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
263 if (total_samples_ < 1) return 1; // Division by zero.
264 intptr_t freq_in_per128 =
265 (frequencies_[in_character].counter() * 128) / total_samples_;
266 return freq_in_per128;
267 }
268
269 private:
270 class CharacterFrequency {
271 public:
272 CharacterFrequency() : counter_(0), character_(-1) {}
273 explicit CharacterFrequency(intptr_t character)
274 : counter_(0), character_(character) {}
275
276 void Increment() { counter_++; }
277 intptr_t counter() { return counter_; }
278 intptr_t character() { return character_; }
279
280 private:
281 intptr_t counter_;
282 intptr_t character_;
283
284 DISALLOW_ALLOCATION();
285 };
286
287 private:
288 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
289 intptr_t total_samples_;
290};
291
292class RegExpCompiler : public ValueObject {
293 public:
294 RegExpCompiler(intptr_t capture_count, bool is_one_byte);
295
296 intptr_t AllocateRegister() { return next_register_++; }
297
298 // Lookarounds to match lone surrogates for unicode character class matches
299 // are never nested. We can therefore reuse registers.
300 intptr_t UnicodeLookaroundStackRegister() {
301 if (unicode_lookaround_stack_register_ == kNoRegister) {
302 unicode_lookaround_stack_register_ = AllocateRegister();
303 }
304 return unicode_lookaround_stack_register_;
305 }
306
307 intptr_t UnicodeLookaroundPositionRegister() {
308 if (unicode_lookaround_position_register_ == kNoRegister) {
309 unicode_lookaround_position_register_ = AllocateRegister();
310 }
311 return unicode_lookaround_position_register_;
312 }
313
314#if !defined(DART_PRECOMPILED_RUNTIME)
315 RegExpEngine::CompilationResult Assemble(IRRegExpMacroAssembler* assembler,
316 RegExpNode* start,
317 intptr_t capture_count,
318 const String& pattern);
319#endif
320
321 RegExpEngine::CompilationResult Assemble(
322 BytecodeRegExpMacroAssembler* assembler,
323 RegExpNode* start,
324 intptr_t capture_count,
325 const String& pattern);
326
327 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
328
329 static const intptr_t kImplementationOffset = 0;
330 static const intptr_t kNumberOfRegistersOffset = 0;
331 static const intptr_t kCodeOffset = 1;
332
333 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
334 EndNode* accept() { return accept_; }
335
336 static const intptr_t kMaxRecursion = 100;
337 inline intptr_t recursion_depth() { return recursion_depth_; }
338 inline void IncrementRecursionDepth() { recursion_depth_++; }
339 inline void DecrementRecursionDepth() { recursion_depth_--; }
340
341 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
342
343 inline bool one_byte() const { return is_one_byte_; }
344 bool read_backward() { return read_backward_; }
345 void set_read_backward(bool value) { read_backward_ = value; }
346 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
347
348 intptr_t current_expansion_factor() { return current_expansion_factor_; }
349 void set_current_expansion_factor(intptr_t value) {
350 current_expansion_factor_ = value;
351 }
352
353 Zone* zone() const { return zone_; }
354
355 static const intptr_t kNoRegister = -1;
356
357 private:
358 EndNode* accept_;
359 intptr_t next_register_;
360 intptr_t unicode_lookaround_stack_register_;
361 intptr_t unicode_lookaround_position_register_;
362 ZoneGrowableArray<RegExpNode*>* work_list_;
363 intptr_t recursion_depth_;
364 RegExpMacroAssembler* macro_assembler_;
365 bool is_one_byte_;
366 bool reg_exp_too_big_;
367 bool read_backward_;
368 intptr_t current_expansion_factor_;
369 FrequencyCollator frequency_collator_;
370 Zone* zone_;
371};
372
373class RecursionCheck : public ValueObject {
374 public:
375 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
376 compiler->IncrementRecursionDepth();
377 }
378 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
379
380 private:
381 RegExpCompiler* compiler_;
382};
383
384static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
385 return RegExpEngine::CompilationResult("RegExp too big");
386}
387
388// Attempts to compile the regexp using an Irregexp code generator. Returns
389// a fixed array or a null handle depending on whether it succeeded.
390RegExpCompiler::RegExpCompiler(intptr_t capture_count, bool is_one_byte)
391 : next_register_(2 * (capture_count + 1)),
392 unicode_lookaround_stack_register_(kNoRegister),
393 unicode_lookaround_position_register_(kNoRegister),
394 work_list_(NULL),
395 recursion_depth_(0),
396 is_one_byte_(is_one_byte),
397 reg_exp_too_big_(false),
398 read_backward_(false),
399 current_expansion_factor_(1),
400 zone_(Thread::Current()->zone()) {
401 accept_ = new (Z) EndNode(EndNode::ACCEPT, Z);
402}
403
404#if !defined(DART_PRECOMPILED_RUNTIME)
405RegExpEngine::CompilationResult RegExpCompiler::Assemble(
406 IRRegExpMacroAssembler* macro_assembler,
407 RegExpNode* start,
408 intptr_t capture_count,
409 const String& pattern) {
410 macro_assembler->set_slow_safe(false /* use_slow_safe_regexp_compiler */);
411 macro_assembler_ = macro_assembler;
412
413 ZoneGrowableArray<RegExpNode*> work_list(0);
414 work_list_ = &work_list;
415 BlockLabel fail;
416 macro_assembler_->PushBacktrack(&fail);
417 Trace new_trace;
418 start->Emit(this, &new_trace);
419 macro_assembler_->BindBlock(&fail);
420 macro_assembler_->Fail();
421 while (!work_list.is_empty()) {
422 work_list.RemoveLast()->Emit(this, &new_trace);
423 }
424 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
425
426 macro_assembler->GenerateBacktrackBlock();
427 macro_assembler->FinalizeRegistersArray();
428
429 return RegExpEngine::CompilationResult(
430 macro_assembler->backtrack_goto(), macro_assembler->graph_entry(),
431 macro_assembler->num_blocks(), macro_assembler->num_stack_locals(),
432 next_register_);
433}
434#endif
435
436RegExpEngine::CompilationResult RegExpCompiler::Assemble(
437 BytecodeRegExpMacroAssembler* macro_assembler,
438 RegExpNode* start,
439 intptr_t capture_count,
440 const String& pattern) {
441 macro_assembler->set_slow_safe(false /* use_slow_safe_regexp_compiler */);
442 macro_assembler_ = macro_assembler;
443
444 ZoneGrowableArray<RegExpNode*> work_list(0);
445 work_list_ = &work_list;
446 BlockLabel fail;
447 macro_assembler_->PushBacktrack(&fail);
448 Trace new_trace;
449 start->Emit(this, &new_trace);
450 macro_assembler_->BindBlock(&fail);
451 macro_assembler_->Fail();
452 while (!work_list.is_empty()) {
453 work_list.RemoveLast()->Emit(this, &new_trace);
454 }
455 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
456
457 TypedData& bytecode = TypedData::ZoneHandle(macro_assembler->GetBytecode());
458 return RegExpEngine::CompilationResult(&bytecode, next_register_);
459}
460
461bool Trace::DeferredAction::Mentions(intptr_t that) {
462 if (action_type() == ActionNode::CLEAR_CAPTURES) {
463 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
464 return range.Contains(that);
465 } else {
466 return reg() == that;
467 }
468}
469
470bool Trace::mentions_reg(intptr_t reg) {
471 for (DeferredAction* action = actions_; action != NULL;
472 action = action->next()) {
473 if (action->Mentions(reg)) return true;
474 }
475 return false;
476}
477
478bool Trace::GetStoredPosition(intptr_t reg, intptr_t* cp_offset) {
479 ASSERT(*cp_offset == 0);
480 for (DeferredAction* action = actions_; action != NULL;
481 action = action->next()) {
482 if (action->Mentions(reg)) {
483 if (action->action_type() == ActionNode::STORE_POSITION) {
484 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
485 return true;
486 } else {
487 return false;
488 }
489 }
490 }
491 return false;
492}
493
494// This is called as we come into a loop choice node and some other tricky
495// nodes. It normalizes the state of the code generator to ensure we can
496// generate generic code.
497intptr_t Trace::FindAffectedRegisters(OutSet* affected_registers, Zone* zone) {
498 intptr_t max_register = RegExpCompiler::kNoRegister;
499 for (DeferredAction* action = actions_; action != NULL;
500 action = action->next()) {
501 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
502 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
503 for (intptr_t i = range.from(); i <= range.to(); i++)
504 affected_registers->Set(i, zone);
505 if (range.to() > max_register) max_register = range.to();
506 } else {
507 affected_registers->Set(action->reg(), zone);
508 if (action->reg() > max_register) max_register = action->reg();
509 }
510 }
511 return max_register;
512}
513
514void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
515 intptr_t max_register,
516 const OutSet& registers_to_pop,
517 const OutSet& registers_to_clear) {
518 for (intptr_t reg = max_register; reg >= 0; reg--) {
519 if (registers_to_pop.Get(reg)) {
520 assembler->PopRegister(reg);
521 } else if (registers_to_clear.Get(reg)) {
522 intptr_t clear_to = reg;
523 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
524 reg--;
525 }
526 assembler->ClearRegisters(reg, clear_to);
527 }
528 }
529}
530
531void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
532 intptr_t max_register,
533 const OutSet& affected_registers,
534 OutSet* registers_to_pop,
535 OutSet* registers_to_clear,
536 Zone* zone) {
537 for (intptr_t reg = 0; reg <= max_register; reg++) {
538 if (!affected_registers.Get(reg)) {
539 continue;
540 }
541
542 // The chronologically first deferred action in the trace
543 // is used to infer the action needed to restore a register
544 // to its previous state (or not, if it's safe to ignore it).
545 enum DeferredActionUndoType { ACTION_IGNORE, ACTION_RESTORE, ACTION_CLEAR };
546 DeferredActionUndoType undo_action = ACTION_IGNORE;
547
548 intptr_t value = 0;
549 bool absolute = false;
550 bool clear = false;
551 static const intptr_t kNoStore = kMinInt32;
552 intptr_t store_position = kNoStore;
553 // This is a little tricky because we are scanning the actions in reverse
554 // historical order (newest first).
555 for (DeferredAction* action = actions_; action != NULL;
556 action = action->next()) {
557 if (action->Mentions(reg)) {
558 switch (action->action_type()) {
559 case ActionNode::SET_REGISTER: {
560 Trace::DeferredSetRegister* psr =
561 static_cast<Trace::DeferredSetRegister*>(action);
562 if (!absolute) {
563 value += psr->value();
564 absolute = true;
565 }
566 // SET_REGISTER is currently only used for newly introduced loop
567 // counters. They can have a significant previous value if they
568 // occour in a loop. TODO(lrn): Propagate this information, so we
569 // can set undo_action to ACTION_IGNORE if we know there is no
570 // value to restore.
571 undo_action = ACTION_RESTORE;
572 ASSERT(store_position == kNoStore);
573 ASSERT(!clear);
574 break;
575 }
576 case ActionNode::INCREMENT_REGISTER:
577 if (!absolute) {
578 value++;
579 }
580 ASSERT(store_position == kNoStore);
581 ASSERT(!clear);
582 undo_action = ACTION_RESTORE;
583 break;
584 case ActionNode::STORE_POSITION: {
585 Trace::DeferredCapture* pc =
586 static_cast<Trace::DeferredCapture*>(action);
587 if (!clear && store_position == kNoStore) {
588 store_position = pc->cp_offset();
589 }
590
591 // For captures we know that stores and clears alternate.
592 // Other register, are never cleared, and if the occur
593 // inside a loop, they might be assigned more than once.
594 if (reg <= 1) {
595 // Registers zero and one, aka "capture zero", is
596 // always set correctly if we succeed. There is no
597 // need to undo a setting on backtrack, because we
598 // will set it again or fail.
599 undo_action = ACTION_IGNORE;
600 } else {
601 undo_action = pc->is_capture() ? ACTION_CLEAR : ACTION_RESTORE;
602 }
603 ASSERT(!absolute);
604 ASSERT(value == 0);
605 break;
606 }
607 case ActionNode::CLEAR_CAPTURES: {
608 // Since we're scanning in reverse order, if we've already
609 // set the position we have to ignore historically earlier
610 // clearing operations.
611 if (store_position == kNoStore) {
612 clear = true;
613 }
614 undo_action = ACTION_RESTORE;
615 ASSERT(!absolute);
616 ASSERT(value == 0);
617 break;
618 }
619 default:
620 UNREACHABLE();
621 break;
622 }
623 }
624 }
625 // Prepare for the undo-action (e.g., push if it's going to be popped).
626 if (undo_action == ACTION_RESTORE) {
627 assembler->PushRegister(reg);
628 registers_to_pop->Set(reg, zone);
629 } else if (undo_action == ACTION_CLEAR) {
630 registers_to_clear->Set(reg, zone);
631 }
632 // Perform the chronologically last action (or accumulated increment)
633 // for the register.
634 if (store_position != kNoStore) {
635 assembler->WriteCurrentPositionToRegister(reg, store_position);
636 } else if (clear) {
637 assembler->ClearRegisters(reg, reg);
638 } else if (absolute) {
639 assembler->SetRegister(reg, value);
640 } else if (value != 0) {
641 assembler->AdvanceRegister(reg, value);
642 }
643 }
644}
645
646// This is called as we come into a loop choice node and some other tricky
647// nodes. It normalizes the state of the code generator to ensure we can
648// generate generic code.
649void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
650 RegExpMacroAssembler* assembler = compiler->macro_assembler();
651
652 ASSERT(!is_trivial());
653
654 if (actions_ == NULL && backtrack() == NULL) {
655 // Here we just have some deferred cp advances to fix and we are back to
656 // a normal situation. We may also have to forget some information gained
657 // through a quick check that was already performed.
658 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
659 // Create a new trivial state and generate the node with that.
660 Trace new_state;
661 successor->Emit(compiler, &new_state);
662 return;
663 }
664
665 // Generate deferred actions here along with code to undo them again.
666 OutSet affected_registers;
667
668 if (backtrack() != NULL) {
669 // Here we have a concrete backtrack location. These are set up by choice
670 // nodes and so they indicate that we have a deferred save of the current
671 // position which we may need to emit here.
672 assembler->PushCurrentPosition();
673 }
674 Zone* zone = successor->zone();
675 intptr_t max_register = FindAffectedRegisters(&affected_registers, zone);
676 OutSet registers_to_pop;
677 OutSet registers_to_clear;
678 PerformDeferredActions(assembler, max_register, affected_registers,
679 &registers_to_pop, &registers_to_clear, zone);
680 if (cp_offset_ != 0) {
681 assembler->AdvanceCurrentPosition(cp_offset_);
682 }
683
684 // Create a new trivial state and generate the node with that.
685 BlockLabel undo;
686 assembler->PushBacktrack(&undo);
687 Trace new_state;
688 successor->Emit(compiler, &new_state);
689
690 // On backtrack we need to restore state.
691 assembler->BindBlock(&undo);
692 RestoreAffectedRegisters(assembler, max_register, registers_to_pop,
693 registers_to_clear);
694 if (backtrack() == NULL) {
695 assembler->Backtrack();
696 } else {
697 assembler->PopCurrentPosition();
698 assembler->GoTo(backtrack());
699 }
700}
701
702void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
703 RegExpMacroAssembler* assembler = compiler->macro_assembler();
704
705 // Omit flushing the trace. We discard the entire stack frame anyway.
706
707 if (!label()->is_bound()) {
708 // We are completely independent of the trace, since we ignore it,
709 // so this code can be used as the generic version.
710 assembler->BindBlock(label());
711 }
712
713 // Throw away everything on the backtrack stack since the start
714 // of the negative submatch and restore the character position.
715 assembler->ReadCurrentPositionFromRegister(current_position_register_);
716 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
717 if (clear_capture_count_ > 0) {
718 // Clear any captures that might have been performed during the success
719 // of the body of the negative look-ahead.
720 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
721 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
722 }
723 // Now that we have unwound the stack we find at the top of the stack the
724 // backtrack that the BeginSubmatch node got.
725 assembler->Backtrack();
726}
727
728void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
729 if (!trace->is_trivial()) {
730 trace->Flush(compiler, this);
731 return;
732 }
733 RegExpMacroAssembler* assembler = compiler->macro_assembler();
734 if (!label()->is_bound()) {
735 assembler->BindBlock(label());
736 }
737 switch (action_) {
738 case ACCEPT:
739 assembler->Succeed();
740 return;
741 case BACKTRACK:
742 assembler->GoTo(trace->backtrack());
743 return;
744 case NEGATIVE_SUBMATCH_SUCCESS:
745 // This case is handled in a different virtual method.
746 UNREACHABLE();
747 }
748 UNIMPLEMENTED();
749}
750
751void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
752 if (guards_ == NULL) guards_ = new (zone) ZoneGrowableArray<Guard*>(1);
753 guards_->Add(guard);
754}
755
756ActionNode* ActionNode::SetRegister(intptr_t reg,
757 intptr_t val,
758 RegExpNode* on_success) {
759 ActionNode* result =
760 new (on_success->zone()) ActionNode(SET_REGISTER, on_success);
761 result->data_.u_store_register.reg = reg;
762 result->data_.u_store_register.value = val;
763 return result;
764}
765
766ActionNode* ActionNode::IncrementRegister(intptr_t reg,
767 RegExpNode* on_success) {
768 ActionNode* result =
769 new (on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
770 result->data_.u_increment_register.reg = reg;
771 return result;
772}
773
774ActionNode* ActionNode::StorePosition(intptr_t reg,
775 bool is_capture,
776 RegExpNode* on_success) {
777 ActionNode* result =
778 new (on_success->zone()) ActionNode(STORE_POSITION, on_success);
779 result->data_.u_position_register.reg = reg;
780 result->data_.u_position_register.is_capture = is_capture;
781 return result;
782}
783
784ActionNode* ActionNode::ClearCaptures(Interval range, RegExpNode* on_success) {
785 ActionNode* result =
786 new (on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
787 result->data_.u_clear_captures.range_from = range.from();
788 result->data_.u_clear_captures.range_to = range.to();
789 return result;
790}
791
792ActionNode* ActionNode::BeginSubmatch(intptr_t stack_reg,
793 intptr_t position_reg,
794 RegExpNode* on_success) {
795 ActionNode* result =
796 new (on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
797 result->data_.u_submatch.stack_pointer_register = stack_reg;
798 result->data_.u_submatch.current_position_register = position_reg;
799 return result;
800}
801
802ActionNode* ActionNode::PositiveSubmatchSuccess(intptr_t stack_reg,
803 intptr_t position_reg,
804 intptr_t clear_register_count,
805 intptr_t clear_register_from,
806 RegExpNode* on_success) {
807 ActionNode* result = new (on_success->zone())
808 ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
809 result->data_.u_submatch.stack_pointer_register = stack_reg;
810 result->data_.u_submatch.current_position_register = position_reg;
811 result->data_.u_submatch.clear_register_count = clear_register_count;
812 result->data_.u_submatch.clear_register_from = clear_register_from;
813 return result;
814}
815
816ActionNode* ActionNode::EmptyMatchCheck(intptr_t start_register,
817 intptr_t repetition_register,
818 intptr_t repetition_limit,
819 RegExpNode* on_success) {
820 ActionNode* result =
821 new (on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
822 result->data_.u_empty_match_check.start_register = start_register;
823 result->data_.u_empty_match_check.repetition_register = repetition_register;
824 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
825 return result;
826}
827
828#define DEFINE_ACCEPT(Type) \
829 void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); }
830FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
831#undef DEFINE_ACCEPT
832
833void LoopChoiceNode::Accept(NodeVisitor* visitor) {
834 visitor->VisitLoopChoice(this);
835}
836
837// -------------------------------------------------------------------
838// Emit code.
839
840void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
841 Guard* guard,
842 Trace* trace) {
843 switch (guard->op()) {
844 case Guard::LT:
845 ASSERT(!trace->mentions_reg(guard->reg()));
846 macro_assembler->IfRegisterGE(guard->reg(), guard->value(),
847 trace->backtrack());
848 break;
849 case Guard::GEQ:
850 ASSERT(!trace->mentions_reg(guard->reg()));
851 macro_assembler->IfRegisterLT(guard->reg(), guard->value(),
852 trace->backtrack());
853 break;
854 }
855}
856
857// Returns the number of characters in the equivalence class, omitting those
858// that cannot occur in the source string because it is ASCII.
859static intptr_t GetCaseIndependentLetters(uint16_t character,
860 bool one_byte_subject,
861 int32_t* letters) {
862 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;
863 intptr_t length = jsregexp_uncanonicalize.get(character, '\0', letters);
864 // Unibrow returns 0 or 1 for characters where case independence is
865 // trivial.
866 if (length == 0) {
867 letters[0] = character;
868 length = 1;
869 }
870 if (!one_byte_subject || character <= Symbols::kMaxOneCharCodeSymbol) {
871 return length;
872 }
873
874 // The standard requires that non-ASCII characters cannot have ASCII
875 // character codes in their equivalence class.
876 // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore,
877 // is it? For example, \u00C5 is equivalent to \u212B.
878 return 0;
879}
880
881static inline bool EmitSimpleCharacter(Zone* zone,
882 RegExpCompiler* compiler,
883 uint16_t c,
884 BlockLabel* on_failure,
885 intptr_t cp_offset,
886 bool check,
887 bool preloaded) {
888 RegExpMacroAssembler* assembler = compiler->macro_assembler();
889 bool bound_checked = false;
890 if (!preloaded) {
891 assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
892 bound_checked = true;
893 }
894 assembler->CheckNotCharacter(c, on_failure);
895 return bound_checked;
896}
897
898// Only emits non-letters (things that don't have case). Only used for case
899// independent matches.
900static inline bool EmitAtomNonLetter(Zone* zone,
901 RegExpCompiler* compiler,
902 uint16_t c,
903 BlockLabel* on_failure,
904 intptr_t cp_offset,
905 bool check,
906 bool preloaded) {
907 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
908 bool one_byte = compiler->one_byte();
909 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
910 intptr_t length = GetCaseIndependentLetters(c, one_byte, chars);
911 if (length < 1) {
912 // This can't match. Must be an one-byte subject and a non-one-byte
913 // character. We do not need to do anything since the one-byte pass
914 // already handled this.
915 return false; // Bounds not checked.
916 }
917 bool checked = false;
918 // We handle the length > 1 case in a later pass.
919 if (length == 1) {
920 if (one_byte && c > Symbols::kMaxOneCharCodeSymbol) {
921 // Can't match - see above.
922 return false; // Bounds not checked.
923 }
924 if (!preloaded) {
925 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
926 checked = check;
927 }
928 macro_assembler->CheckNotCharacter(c, on_failure);
929 }
930 return checked;
931}
932
933static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
934 bool one_byte,
935 uint16_t c1,
936 uint16_t c2,
937 BlockLabel* on_failure) {
938 uint16_t char_mask;
939 if (one_byte) {
940 char_mask = Symbols::kMaxOneCharCodeSymbol;
941 } else {
942 char_mask = Utf16::kMaxCodeUnit;
943 }
944 uint16_t exor = c1 ^ c2;
945 // Check whether exor has only one bit set.
946 if (((exor - 1) & exor) == 0) {
947 // If c1 and c2 differ only by one bit.
948 // Ecma262UnCanonicalize always gives the highest number last.
949 ASSERT(c2 > c1);
950 uint16_t mask = char_mask ^ exor;
951 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
952 return true;
953 }
954 ASSERT(c2 > c1);
955 uint16_t diff = c2 - c1;
956 if (((diff - 1) & diff) == 0 && c1 >= diff) {
957 // If the characters differ by 2^n but don't differ by one bit then
958 // subtract the difference from the found character, then do the or
959 // trick. We avoid the theoretical case where negative numbers are
960 // involved in order to simplify code generation.
961 uint16_t mask = char_mask ^ diff;
962 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask,
963 on_failure);
964 return true;
965 }
966 return false;
967}
968
969typedef bool EmitCharacterFunction(Zone* zone,
970 RegExpCompiler* compiler,
971 uint16_t c,
972 BlockLabel* on_failure,
973 intptr_t cp_offset,
974 bool check,
975 bool preloaded);
976
977// Only emits letters (things that have case). Only used for case independent
978// matches.
979static inline bool EmitAtomLetter(Zone* zone,
980 RegExpCompiler* compiler,
981 uint16_t c,
982 BlockLabel* on_failure,
983 intptr_t cp_offset,
984 bool check,
985 bool preloaded) {
986 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
987 bool one_byte = compiler->one_byte();
988 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
989 intptr_t length = GetCaseIndependentLetters(c, one_byte, chars);
990 if (length <= 1) return false;
991 // We may not need to check against the end of the input string
992 // if this character lies before a character that matched.
993 if (!preloaded) {
994 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
995 }
996 BlockLabel ok;
997 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
998 switch (length) {
999 case 2: {
1000 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1001 chars[1], on_failure)) {
1002 } else {
1003 macro_assembler->CheckCharacter(chars[0], &ok);
1004 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1005 macro_assembler->BindBlock(&ok);
1006 }
1007 break;
1008 }
1009 case 4:
1010 macro_assembler->CheckCharacter(chars[3], &ok);
1011 FALL_THROUGH;
1012 case 3:
1013 macro_assembler->CheckCharacter(chars[0], &ok);
1014 macro_assembler->CheckCharacter(chars[1], &ok);
1015 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1016 macro_assembler->BindBlock(&ok);
1017 break;
1018 default:
1019 UNREACHABLE();
1020 break;
1021 }
1022 return true;
1023}
1024
1025static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1026 uint16_t border,
1027 BlockLabel* fall_through,
1028 BlockLabel* above_or_equal,
1029 BlockLabel* below) {
1030 if (below != fall_through) {
1031 masm->CheckCharacterLT(border, below);
1032 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1033 } else {
1034 masm->CheckCharacterGT(border - 1, above_or_equal);
1035 }
1036}
1037
1038static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1039 uint16_t first,
1040 uint16_t last,
1041 BlockLabel* fall_through,
1042 BlockLabel* in_range,
1043 BlockLabel* out_of_range) {
1044 if (in_range == fall_through) {
1045 if (first == last) {
1046 masm->CheckNotCharacter(first, out_of_range);
1047 } else {
1048 masm->CheckCharacterNotInRange(first, last, out_of_range);
1049 }
1050 } else {
1051 if (first == last) {
1052 masm->CheckCharacter(first, in_range);
1053 } else {
1054 masm->CheckCharacterInRange(first, last, in_range);
1055 }
1056 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1057 }
1058}
1059
1060// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1061// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1062static void EmitUseLookupTable(RegExpMacroAssembler* masm,
1063 ZoneGrowableArray<uint16_t>* ranges,
1064 intptr_t start_index,
1065 intptr_t end_index,
1066 uint16_t min_char,
1067 BlockLabel* fall_through,
1068 BlockLabel* even_label,
1069 BlockLabel* odd_label) {
1070 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;
1071 static const intptr_t kMask = RegExpMacroAssembler::kTableMask;
1072
1073 intptr_t base = (min_char & ~kMask);
1074
1075 // Assert that everything is on one kTableSize page.
1076 for (intptr_t i = start_index; i <= end_index; i++) {
1077 ASSERT((ranges->At(i) & ~kMask) == base);
1078 }
1079 ASSERT(start_index == 0 || (ranges->At(start_index - 1) & ~kMask) <= base);
1080
1081 char templ[kSize];
1082 BlockLabel* on_bit_set;
1083 BlockLabel* on_bit_clear;
1084 intptr_t bit;
1085 if (even_label == fall_through) {
1086 on_bit_set = odd_label;
1087 on_bit_clear = even_label;
1088 bit = 1;
1089 } else {
1090 on_bit_set = even_label;
1091 on_bit_clear = odd_label;
1092 bit = 0;
1093 }
1094 for (intptr_t i = 0; i < (ranges->At(start_index) & kMask) && i < kSize;
1095 i++) {
1096 templ[i] = bit;
1097 }
1098 intptr_t j = 0;
1099 bit ^= 1;
1100 for (intptr_t i = start_index; i < end_index; i++) {
1101 for (j = (ranges->At(i) & kMask); j < (ranges->At(i + 1) & kMask); j++) {
1102 templ[j] = bit;
1103 }
1104 bit ^= 1;
1105 }
1106 for (intptr_t i = j; i < kSize; i++) {
1107 templ[i] = bit;
1108 }
1109 // TODO(erikcorry): Cache these.
1110 const TypedData& ba = TypedData::ZoneHandle(
1111 masm->zone(), TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));
1112 for (intptr_t i = 0; i < kSize; i++) {
1113 ba.SetUint8(i, templ[i]);
1114 }
1115 masm->CheckBitInTable(ba, on_bit_set);
1116 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1117}
1118
1119static void CutOutRange(RegExpMacroAssembler* masm,
1120 ZoneGrowableArray<uint16_t>* ranges,
1121 intptr_t start_index,
1122 intptr_t end_index,
1123 intptr_t cut_index,
1124 BlockLabel* even_label,
1125 BlockLabel* odd_label) {
1126 bool odd = (((cut_index - start_index) & 1) == 1);
1127 BlockLabel* in_range_label = odd ? odd_label : even_label;
1128 BlockLabel dummy;
1129 EmitDoubleBoundaryTest(masm, ranges->At(cut_index),
1130 ranges->At(cut_index + 1) - 1, &dummy, in_range_label,
1131 &dummy);
1132 ASSERT(!dummy.is_linked());
1133 // Cut out the single range by rewriting the array. This creates a new
1134 // range that is a merger of the two ranges on either side of the one we
1135 // are cutting out. The oddity of the labels is preserved.
1136 for (intptr_t j = cut_index; j > start_index; j--) {
1137 (*ranges)[j] = ranges->At(j - 1);
1138 }
1139 for (intptr_t j = cut_index + 1; j < end_index; j++) {
1140 (*ranges)[j] = ranges->At(j + 1);
1141 }
1142}
1143
1144// Unicode case. Split the search space into kSize spaces that are handled
1145// with recursion.
1146static void SplitSearchSpace(ZoneGrowableArray<uint16_t>* ranges,
1147 intptr_t start_index,
1148 intptr_t end_index,
1149 intptr_t* new_start_index,
1150 intptr_t* new_end_index,
1151 uint16_t* border) {
1152 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;
1153 static const intptr_t kMask = RegExpMacroAssembler::kTableMask;
1154
1155 uint16_t first = ranges->At(start_index);
1156 uint16_t last = ranges->At(end_index) - 1;
1157
1158 *new_start_index = start_index;
1159 *border = (ranges->At(start_index) & ~kMask) + kSize;
1160 while (*new_start_index < end_index) {
1161 if (ranges->At(*new_start_index) > *border) break;
1162 (*new_start_index)++;
1163 }
1164 // new_start_index is the index of the first edge that is beyond the
1165 // current kSize space.
1166
1167 // For very large search spaces we do a binary chop search of the non-Latin1
1168 // space instead of just going to the end of the current kSize space. The
1169 // heuristics are complicated a little by the fact that any 128-character
1170 // encoding space can be quickly tested with a table lookup, so we don't
1171 // wish to do binary chop search at a smaller granularity than that. A
1172 // 128-character space can take up a lot of space in the ranges array if,
1173 // for example, we only want to match every second character (eg. the lower
1174 // case characters on some Unicode pages).
1175 intptr_t binary_chop_index = (end_index + start_index) / 2;
1176 // The first test ensures that we get to the code that handles the Latin1
1177 // range with a single not-taken branch, speeding up this important
1178 // character range (even non-Latin1 charset-based text has spaces and
1179 // punctuation).
1180 if (*border - 1 > Symbols::kMaxOneCharCodeSymbol && // Latin1 case.
1181 end_index - start_index > (*new_start_index - start_index) * 2 &&
1182 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1183 ranges->At(binary_chop_index) >= first + 2 * kSize) {
1184 intptr_t scan_forward_for_section_border = binary_chop_index;
1185 intptr_t new_border = (ranges->At(binary_chop_index) | kMask) + 1;
1186
1187 while (scan_forward_for_section_border < end_index) {
1188 if (ranges->At(scan_forward_for_section_border) > new_border) {
1189 *new_start_index = scan_forward_for_section_border;
1190 *border = new_border;
1191 break;
1192 }
1193 scan_forward_for_section_border++;
1194 }
1195 }
1196
1197 ASSERT(*new_start_index > start_index);
1198 *new_end_index = *new_start_index - 1;
1199 if (ranges->At(*new_end_index) == *border) {
1200 (*new_end_index)--;
1201 }
1202 if (*border >= ranges->At(end_index)) {
1203 *border = ranges->At(end_index);
1204 *new_start_index = end_index; // Won't be used.
1205 *new_end_index = end_index - 1;
1206 }
1207}
1208
1209// Gets a series of segment boundaries representing a character class. If the
1210// character is in the range between an even and an odd boundary (counting from
1211// start_index) then go to even_label, otherwise go to odd_label. We already
1212// know that the character is in the range of min_char to max_char inclusive.
1213// Either label can be NULL indicating backtracking. Either label can also be
1214// equal to the fall_through label.
1215static void GenerateBranches(RegExpMacroAssembler* masm,
1216 ZoneGrowableArray<uint16_t>* ranges,
1217 intptr_t start_index,
1218 intptr_t end_index,
1219 uint16_t min_char,
1220 uint16_t max_char,
1221 BlockLabel* fall_through,
1222 BlockLabel* even_label,
1223 BlockLabel* odd_label) {
1224 uint16_t first = ranges->At(start_index);
1225 uint16_t last = ranges->At(end_index) - 1;
1226
1227 ASSERT(min_char < first);
1228
1229 // Just need to test if the character is before or on-or-after
1230 // a particular character.
1231 if (start_index == end_index) {
1232 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1233 return;
1234 }
1235
1236 // Another almost trivial case: There is one interval in the middle that is
1237 // different from the end intervals.
1238 if (start_index + 1 == end_index) {
1239 EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label,
1240 odd_label);
1241 return;
1242 }
1243
1244 // It's not worth using table lookup if there are very few intervals in the
1245 // character class.
1246 if (end_index - start_index <= 6) {
1247 // It is faster to test for individual characters, so we look for those
1248 // first, then try arbitrary ranges in the second round.
1249 static intptr_t kNoCutIndex = -1;
1250 intptr_t cut = kNoCutIndex;
1251 for (intptr_t i = start_index; i < end_index; i++) {
1252 if (ranges->At(i) == ranges->At(i + 1) - 1) {
1253 cut = i;
1254 break;
1255 }
1256 }
1257 if (cut == kNoCutIndex) cut = start_index;
1258 CutOutRange(masm, ranges, start_index, end_index, cut, even_label,
1259 odd_label);
1260 ASSERT(end_index - start_index >= 2);
1261 GenerateBranches(masm, ranges, start_index + 1, end_index - 1, min_char,
1262 max_char, fall_through, even_label, odd_label);
1263 return;
1264 }
1265
1266 // If there are a lot of intervals in the regexp, then we will use tables to
1267 // determine whether the character is inside or outside the character class.
1268 static const intptr_t kBits = RegExpMacroAssembler::kTableSizeBits;
1269
1270 if ((max_char >> kBits) == (min_char >> kBits)) {
1271 EmitUseLookupTable(masm, ranges, start_index, end_index, min_char,
1272 fall_through, even_label, odd_label);
1273 return;
1274 }
1275
1276 if ((min_char >> kBits) != (first >> kBits)) {
1277 masm->CheckCharacterLT(first, odd_label);
1278 GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char,
1279 fall_through, odd_label, even_label);
1280 return;
1281 }
1282
1283 intptr_t new_start_index = 0;
1284 intptr_t new_end_index = 0;
1285 uint16_t border = 0;
1286
1287 SplitSearchSpace(ranges, start_index, end_index, &new_start_index,
1288 &new_end_index, &border);
1289
1290 BlockLabel handle_rest;
1291 BlockLabel* above = &handle_rest;
1292 if (border == last + 1) {
1293 // We didn't find any section that started after the limit, so everything
1294 // above the border is one of the terminal labels.
1295 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
1296 ASSERT(new_end_index == end_index - 1);
1297 }
1298
1299 ASSERT(start_index <= new_end_index);
1300 ASSERT(new_start_index <= end_index);
1301 ASSERT(start_index < new_start_index);
1302 ASSERT(new_end_index < end_index);
1303 ASSERT(new_end_index + 1 == new_start_index ||
1304 (new_end_index + 2 == new_start_index &&
1305 border == ranges->At(new_end_index + 1)));
1306 ASSERT(min_char < border - 1);
1307 ASSERT(border < max_char);
1308 ASSERT(ranges->At(new_end_index) < border);
1309 ASSERT(border < ranges->At(new_start_index) ||
1310 (border == ranges->At(new_start_index) &&
1311 new_start_index == end_index && new_end_index == end_index - 1 &&
1312 border == last + 1));
1313 ASSERT(new_start_index == 0 || border >= ranges->At(new_start_index - 1));
1314
1315 masm->CheckCharacterGT(border - 1, above);
1316 BlockLabel dummy;
1317 GenerateBranches(masm, ranges, start_index, new_end_index, min_char,
1318 border - 1, &dummy, even_label, odd_label);
1319
1320 if (handle_rest.is_linked()) {
1321 masm->BindBlock(&handle_rest);
1322 bool flip = (new_start_index & 1) != (start_index & 1);
1323 GenerateBranches(masm, ranges, new_start_index, end_index, border, max_char,
1324 &dummy, flip ? odd_label : even_label,
1325 flip ? even_label : odd_label);
1326 }
1327}
1328
1329static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1330 RegExpCharacterClass* cc,
1331 bool one_byte,
1332 BlockLabel* on_failure,
1333 intptr_t cp_offset,
1334 bool check_offset,
1335 bool preloaded,
1336 Zone* zone) {
1337 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
1338 if (!CharacterRange::IsCanonical(ranges)) {
1339 CharacterRange::Canonicalize(ranges);
1340 }
1341
1342 uint16_t max_char;
1343 if (one_byte) {
1344 max_char = Symbols::kMaxOneCharCodeSymbol;
1345 } else {
1346 max_char = Utf16::kMaxCodeUnit;
1347 }
1348
1349 intptr_t range_count = ranges->length();
1350
1351 intptr_t last_valid_range = range_count - 1;
1352 while (last_valid_range >= 0) {
1353 const CharacterRange& range = ranges->At(last_valid_range);
1354 if (range.from() <= max_char) {
1355 break;
1356 }
1357 last_valid_range--;
1358 }
1359
1360 if (last_valid_range < 0) {
1361 if (!cc->is_negated()) {
1362 macro_assembler->GoTo(on_failure);
1363 }
1364 if (check_offset) {
1365 macro_assembler->CheckPosition(cp_offset, on_failure);
1366 }
1367 return;
1368 }
1369
1370 if (last_valid_range == 0 && ranges->At(0).IsEverything(max_char)) {
1371 if (cc->is_negated()) {
1372 macro_assembler->GoTo(on_failure);
1373 } else {
1374 // This is a common case hit by non-anchored expressions.
1375 if (check_offset) {
1376 macro_assembler->CheckPosition(cp_offset, on_failure);
1377 }
1378 }
1379 return;
1380 }
1381
1382 if (!preloaded) {
1383 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1384 }
1385
1386 if (cc->is_standard() && macro_assembler->CheckSpecialCharacterClass(
1387 cc->standard_type(), on_failure)) {
1388 return;
1389 }
1390
1391 // A new list with ascending entries. Each entry is a code unit
1392 // where there is a boundary between code units that are part of
1393 // the class and code units that are not. Normally we insert an
1394 // entry at zero which goes to the failure label, but if there
1395 // was already one there we fall through for success on that entry.
1396 // Subsequent entries have alternating meaning (success/failure).
1397 ZoneGrowableArray<uint16_t>* range_boundaries =
1398 new (zone) ZoneGrowableArray<uint16_t>(last_valid_range);
1399
1400 bool zeroth_entry_is_failure = !cc->is_negated();
1401
1402 for (intptr_t i = 0; i <= last_valid_range; i++) {
1403 const CharacterRange& range = ranges->At(i);
1404 if (range.from() == 0) {
1405 ASSERT(i == 0);
1406 zeroth_entry_is_failure = !zeroth_entry_is_failure;
1407 } else {
1408 range_boundaries->Add(range.from());
1409 }
1410 if (range.to() + 1 <= max_char) {
1411 range_boundaries->Add(range.to() + 1);
1412 }
1413 }
1414 intptr_t end_index = range_boundaries->length() - 1;
1415
1416 BlockLabel fall_through;
1417 GenerateBranches(macro_assembler, range_boundaries,
1418 0, // start_index.
1419 end_index,
1420 0, // min_char.
1421 max_char, &fall_through,
1422 zeroth_entry_is_failure ? &fall_through : on_failure,
1423 zeroth_entry_is_failure ? on_failure : &fall_through);
1424 macro_assembler->BindBlock(&fall_through);
1425}
1426
1427RegExpNode::~RegExpNode() {}
1428
1429RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1430 Trace* trace) {
1431 // If we are generating a greedy loop then don't stop and don't reuse code.
1432 if (trace->stop_node() != NULL) {
1433 return CONTINUE;
1434 }
1435
1436 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1437 if (trace->is_trivial()) {
1438 if (label_.is_bound()) {
1439 // We are being asked to generate a generic version, but that's already
1440 // been done so just go to it.
1441 macro_assembler->GoTo(&label_);
1442 return DONE;
1443 }
1444 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1445 // To avoid too deep recursion we push the node to the work queue and just
1446 // generate a goto here.
1447 compiler->AddWork(this);
1448 macro_assembler->GoTo(&label_);
1449 return DONE;
1450 }
1451 // Generate generic version of the node and bind the label for later use.
1452 macro_assembler->BindBlock(&label_);
1453 return CONTINUE;
1454 }
1455
1456 // We are being asked to make a non-generic version. Keep track of how many
1457 // non-generic versions we generate so as not to overdo it.
1458 trace_count_++;
1459 if (kRegexpOptimization && trace_count_ < kMaxCopiesCodeGenerated &&
1460 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1461 return CONTINUE;
1462 }
1463
1464 // If we get here code has been generated for this node too many times or
1465 // recursion is too deep. Time to switch to a generic version. The code for
1466 // generic versions above can handle deep recursion properly.
1467 trace->Flush(compiler, this);
1468 return DONE;
1469}
1470
1471intptr_t ActionNode::EatsAtLeast(intptr_t still_to_find,
1472 intptr_t budget,
1473 bool not_at_start) {
1474 if (budget <= 0) return 0;
1475 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
1476 return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start);
1477}
1478
1479void ActionNode::FillInBMInfo(intptr_t offset,
1480 intptr_t budget,
1481 BoyerMooreLookahead* bm,
1482 bool not_at_start) {
1483 if (action_type_ == BEGIN_SUBMATCH) {
1484 bm->SetRest(offset);
1485 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
1486 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
1487 }
1488 SaveBMInfo(bm, not_at_start, offset);
1489}
1490
1491intptr_t AssertionNode::EatsAtLeast(intptr_t still_to_find,
1492 intptr_t budget,
1493 bool not_at_start) {
1494 if (budget <= 0) return 0;
1495 // If we know we are not at the start and we are asked "how many characters
1496 // will you match if you succeed?" then we can answer anything since false
1497 // implies false. So lets just return the max answer (still_to_find) since
1498 // that won't prevent us from preloading a lot of characters for the other
1499 // branches in the node graph.
1500 if (assertion_type() == AT_START && not_at_start) return still_to_find;
1501 return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start);
1502}
1503
1504void AssertionNode::FillInBMInfo(intptr_t offset,
1505 intptr_t budget,
1506 BoyerMooreLookahead* bm,
1507 bool not_at_start) {
1508 // Match the behaviour of EatsAtLeast on this node.
1509 if (assertion_type() == AT_START && not_at_start) return;
1510 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
1511 SaveBMInfo(bm, not_at_start, offset);
1512}
1513
1514intptr_t BackReferenceNode::EatsAtLeast(intptr_t still_to_find,
1515 intptr_t budget,
1516 bool not_at_start) {
1517 if (read_backward()) return 0;
1518 if (budget <= 0) return 0;
1519 return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start);
1520}
1521
1522intptr_t TextNode::EatsAtLeast(intptr_t still_to_find,
1523 intptr_t budget,
1524 bool not_at_start) {
1525 if (read_backward()) return 0;
1526 intptr_t answer = Length();
1527 if (answer >= still_to_find) return answer;
1528 if (budget <= 0) return answer;
1529 // We are not at start after this node so we set the last argument to 'true'.
1530 return answer +
1531 on_success()->EatsAtLeast(still_to_find - answer, budget - 1, true);
1532}
1533
1534intptr_t NegativeLookaroundChoiceNode::EatsAtLeast(intptr_t still_to_find,
1535 intptr_t budget,
1536 bool not_at_start) {
1537 if (budget <= 0) return 0;
1538 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1539 // afterwards.
1540 RegExpNode* node = (*alternatives_)[1].node();
1541 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
1542}
1543
1544void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
1545 QuickCheckDetails* details,
1546 RegExpCompiler* compiler,
1547 intptr_t filled_in,
1548 bool not_at_start) {
1549 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1550 // afterwards.
1551 RegExpNode* node = (*alternatives_)[1].node();
1552 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1553}
1554
1555intptr_t ChoiceNode::EatsAtLeastHelper(intptr_t still_to_find,
1556 intptr_t budget,
1557 RegExpNode* ignore_this_node,
1558 bool not_at_start) {
1559 if (budget <= 0) return 0;
1560 intptr_t min = 100;
1561 intptr_t choice_count = alternatives_->length();
1562 budget = (budget - 1) / choice_count;
1563 for (intptr_t i = 0; i < choice_count; i++) {
1564 RegExpNode* node = (*alternatives_)[i].node();
1565 if (node == ignore_this_node) continue;
1566 intptr_t node_eats_at_least =
1567 node->EatsAtLeast(still_to_find, budget, not_at_start);
1568 if (node_eats_at_least < min) min = node_eats_at_least;
1569 if (min == 0) return 0;
1570 }
1571 return min;
1572}
1573
1574intptr_t LoopChoiceNode::EatsAtLeast(intptr_t still_to_find,
1575 intptr_t budget,
1576 bool not_at_start) {
1577 return EatsAtLeastHelper(still_to_find, budget - 1, loop_node_, not_at_start);
1578}
1579
1580intptr_t ChoiceNode::EatsAtLeast(intptr_t still_to_find,
1581 intptr_t budget,
1582 bool not_at_start) {
1583 return EatsAtLeastHelper(still_to_find, budget, NULL, not_at_start);
1584}
1585
1586// Takes the left-most 1-bit and smears it out, setting all bits to its right.
1587static inline uint32_t SmearBitsRight(uint32_t v) {
1588 v |= v >> 1;
1589 v |= v >> 2;
1590 v |= v >> 4;
1591 v |= v >> 8;
1592 v |= v >> 16;
1593 return v;
1594}
1595
1596bool QuickCheckDetails::Rationalize(bool asc) {
1597 bool found_useful_op = false;
1598 uint32_t char_mask;
1599 if (asc) {
1600 char_mask = Symbols::kMaxOneCharCodeSymbol;
1601 } else {
1602 char_mask = Utf16::kMaxCodeUnit;
1603 }
1604 mask_ = 0;
1605 value_ = 0;
1606 intptr_t char_shift = 0;
1607 for (intptr_t i = 0; i < characters_; i++) {
1608 Position* pos = &positions_[i];
1609 if ((pos->mask & Symbols::kMaxOneCharCodeSymbol) != 0) {
1610 found_useful_op = true;
1611 }
1612 mask_ |= (pos->mask & char_mask) << char_shift;
1613 value_ |= (pos->value & char_mask) << char_shift;
1614 char_shift += asc ? 8 : 16;
1615 }
1616 return found_useful_op;
1617}
1618
1619bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1620 Trace* bounds_check_trace,
1621 Trace* trace,
1622 bool preload_has_checked_bounds,
1623 BlockLabel* on_possible_success,
1624 QuickCheckDetails* details,
1625 bool fall_through_on_failure) {
1626 if (details->characters() == 0) return false;
1627 GetQuickCheckDetails(details, compiler, 0,
1628 trace->at_start() == Trace::FALSE_VALUE);
1629 if (details->cannot_match()) return false;
1630 if (!details->Rationalize(compiler->one_byte())) return false;
1631 ASSERT(details->characters() == 1 ||
1632 compiler->macro_assembler()->CanReadUnaligned());
1633 uint32_t mask = details->mask();
1634 uint32_t value = details->value();
1635
1636 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1637
1638 if (trace->characters_preloaded() != details->characters()) {
1639 ASSERT(trace->cp_offset() == bounds_check_trace->cp_offset());
1640 // We are attempting to preload the minimum number of characters
1641 // any choice would eat, so if the bounds check fails, then none of the
1642 // choices can succeed, so we can just immediately backtrack, rather
1643 // than go to the next choice.
1644 assembler->LoadCurrentCharacter(
1645 trace->cp_offset(), bounds_check_trace->backtrack(),
1646 !preload_has_checked_bounds, details->characters());
1647 }
1648
1649 bool need_mask = true;
1650
1651 if (details->characters() == 1) {
1652 // If number of characters preloaded is 1 then we used a byte or 16 bit
1653 // load so the value is already masked down.
1654 uint32_t char_mask;
1655 if (compiler->one_byte()) {
1656 char_mask = Symbols::kMaxOneCharCodeSymbol;
1657 } else {
1658 char_mask = Utf16::kMaxCodeUnit;
1659 }
1660 if ((mask & char_mask) == char_mask) need_mask = false;
1661 mask &= char_mask;
1662 } else {
1663 // For 2-character preloads in one-byte mode or 1-character preloads in
1664 // two-byte mode we also use a 16 bit load with zero extend.
1665 if (details->characters() == 2 && compiler->one_byte()) {
1666 if ((mask & 0xffff) == 0xffff) need_mask = false;
1667 } else if (details->characters() == 1 && !compiler->one_byte()) {
1668 if ((mask & 0xffff) == 0xffff) need_mask = false;
1669 } else {
1670 if (mask == 0xffffffff) need_mask = false;
1671 }
1672 }
1673
1674 if (fall_through_on_failure) {
1675 if (need_mask) {
1676 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1677 } else {
1678 assembler->CheckCharacter(value, on_possible_success);
1679 }
1680 } else {
1681 if (need_mask) {
1682 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1683 } else {
1684 assembler->CheckNotCharacter(value, trace->backtrack());
1685 }
1686 }
1687 return true;
1688}
1689
1690// Here is the meat of GetQuickCheckDetails (see also the comment on the
1691// super-class in the .h file).
1692//
1693// We iterate along the text object, building up for each character a
1694// mask and value that can be used to test for a quick failure to match.
1695// The masks and values for the positions will be combined into a single
1696// machine word for the current character width in order to be used in
1697// generating a quick check.
1698void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1699 RegExpCompiler* compiler,
1700 intptr_t characters_filled_in,
1701 bool not_at_start) {
1702#if defined(__GNUC__) && defined(__BYTE_ORDER__)
1703 // TODO(zerny): Make the combination code byte-order independent.
1704 ASSERT(details->characters() == 1 ||
1705 (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__));
1706#endif
1707 // Do not collect any quick check details if the text node reads backward,
1708 // since it reads in the opposite direction than we use for quick checks.
1709 if (read_backward()) return;
1710 ASSERT(characters_filled_in < details->characters());
1711 intptr_t characters = details->characters();
1712 int32_t char_mask;
1713 if (compiler->one_byte()) {
1714 char_mask = Symbols::kMaxOneCharCodeSymbol;
1715 } else {
1716 char_mask = Utf16::kMaxCodeUnit;
1717 }
1718 for (intptr_t k = 0; k < elms_->length(); k++) {
1719 TextElement elm = elms_->At(k);
1720 if (elm.text_type() == TextElement::ATOM) {
1721 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
1722 for (intptr_t i = 0; i < characters && i < quarks->length(); i++) {
1723 QuickCheckDetails::Position* pos =
1724 details->positions(characters_filled_in);
1725 uint16_t c = quarks->At(i);
1726 if (c > char_mask) {
1727 // If we expect a non-Latin1 character from an one-byte string,
1728 // there is no way we can match. Not even case independent
1729 // matching can turn an Latin1 character into non-Latin1 or
1730 // vice versa.
1731 // TODO(dcarney): issue 3550. Verify that this works as expected.
1732 // For example, \u0178 is uppercase of \u00ff (y-umlaut).
1733 details->set_cannot_match();
1734 pos->determines_perfectly = false;
1735 return;
1736 }
1737 if (elm.atom()->ignore_case()) {
1738 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1739 intptr_t length =
1740 GetCaseIndependentLetters(c, compiler->one_byte(), chars);
1741 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
1742 if (length == 1) {
1743 // This letter has no case equivalents, so it's nice and simple
1744 // and the mask-compare will determine definitely whether we have
1745 // a match at this character position.
1746 pos->mask = char_mask;
1747 pos->value = c;
1748 pos->determines_perfectly = true;
1749 } else {
1750 uint32_t common_bits = char_mask;
1751 uint32_t bits = chars[0];
1752 for (intptr_t j = 1; j < length; j++) {
1753 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1754 common_bits ^= differing_bits;
1755 bits &= common_bits;
1756 }
1757 // If length is 2 and common bits has only one zero in it then
1758 // our mask and compare instruction will determine definitely
1759 // whether we have a match at this character position. Otherwise
1760 // it can only be an approximate check.
1761 uint32_t one_zero = (common_bits | ~char_mask);
1762 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1763 pos->determines_perfectly = true;
1764 }
1765 pos->mask = common_bits;
1766 pos->value = bits;
1767 }
1768 } else {
1769 // Don't ignore case. Nice simple case where the mask-compare will
1770 // determine definitely whether we have a match at this character
1771 // position.
1772 pos->mask = char_mask;
1773 pos->value = c;
1774 pos->determines_perfectly = true;
1775 }
1776 characters_filled_in++;
1777 ASSERT(characters_filled_in <= details->characters());
1778 if (characters_filled_in == details->characters()) {
1779 return;
1780 }
1781 }
1782 } else {
1783 QuickCheckDetails::Position* pos =
1784 details->positions(characters_filled_in);
1785 RegExpCharacterClass* tree = elm.char_class();
1786 ZoneGrowableArray<CharacterRange>* ranges = tree->ranges();
1787 ASSERT(!ranges->is_empty());
1788 if (tree->is_negated()) {
1789 // A quick check uses multi-character mask and compare. There is no
1790 // useful way to incorporate a negative char class into this scheme
1791 // so we just conservatively create a mask and value that will always
1792 // succeed.
1793 pos->mask = 0;
1794 pos->value = 0;
1795 } else {
1796 intptr_t first_range = 0;
1797 while (ranges->At(first_range).from() > char_mask) {
1798 first_range++;
1799 if (first_range == ranges->length()) {
1800 details->set_cannot_match();
1801 pos->determines_perfectly = false;
1802 return;
1803 }
1804 }
1805 CharacterRange range = ranges->At(first_range);
1806 uint16_t from = range.from();
1807 uint16_t to = range.to();
1808 if (to > char_mask) {
1809 to = char_mask;
1810 }
1811 uint32_t differing_bits = (from ^ to);
1812 // A mask and compare is only perfect if the differing bits form a
1813 // number like 00011111 with one single block of trailing 1s.
1814 if ((differing_bits & (differing_bits + 1)) == 0 &&
1815 from + differing_bits == to) {
1816 pos->determines_perfectly = true;
1817 }
1818 uint32_t common_bits = ~SmearBitsRight(differing_bits);
1819 uint32_t bits = (from & common_bits);
1820 for (intptr_t i = first_range + 1; i < ranges->length(); i++) {
1821 CharacterRange range = ranges->At(i);
1822 uint16_t from = range.from();
1823 uint16_t to = range.to();
1824 if (from > char_mask) continue;
1825 if (to > char_mask) to = char_mask;
1826 // Here we are combining more ranges into the mask and compare
1827 // value. With each new range the mask becomes more sparse and
1828 // so the chances of a false positive rise. A character class
1829 // with multiple ranges is assumed never to be equivalent to a
1830 // mask and compare operation.
1831 pos->determines_perfectly = false;
1832 uint32_t new_common_bits = (from ^ to);
1833 new_common_bits = ~SmearBitsRight(new_common_bits);
1834 common_bits &= new_common_bits;
1835 bits &= new_common_bits;
1836 uint32_t differing_bits = (from & common_bits) ^ bits;
1837 common_bits ^= differing_bits;
1838 bits &= common_bits;
1839 }
1840 pos->mask = common_bits;
1841 pos->value = bits;
1842 }
1843 characters_filled_in++;
1844 ASSERT(characters_filled_in <= details->characters());
1845 if (characters_filled_in == details->characters()) {
1846 return;
1847 }
1848 }
1849 }
1850 ASSERT(characters_filled_in != details->characters());
1851 if (!details->cannot_match()) {
1852 on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in,
1853 true);
1854 }
1855}
1856
1857void QuickCheckDetails::Clear() {
1858 for (int i = 0; i < characters_; i++) {
1859 positions_[i].mask = 0;
1860 positions_[i].value = 0;
1861 positions_[i].determines_perfectly = false;
1862 }
1863 characters_ = 0;
1864}
1865
1866void QuickCheckDetails::Advance(intptr_t by, bool one_byte) {
1867 if (by >= characters_ || by < 0) {
1868 // check that by < 0 => characters_ == 0
1869 ASSERT(by >= 0 || characters_ == 0);
1870 Clear();
1871 return;
1872 }
1873 for (intptr_t i = 0; i < characters_ - by; i++) {
1874 positions_[i] = positions_[by + i];
1875 }
1876 for (intptr_t i = characters_ - by; i < characters_; i++) {
1877 positions_[i].mask = 0;
1878 positions_[i].value = 0;
1879 positions_[i].determines_perfectly = false;
1880 }
1881 characters_ -= by;
1882 // We could change mask_ and value_ here but we would never advance unless
1883 // they had already been used in a check and they won't be used again because
1884 // it would gain us nothing. So there's no point.
1885}
1886
1887void QuickCheckDetails::Merge(QuickCheckDetails* other, intptr_t from_index) {
1888 ASSERT(characters_ == other->characters_);
1889 if (other->cannot_match_) {
1890 return;
1891 }
1892 if (cannot_match_) {
1893 *this = *other;
1894 return;
1895 }
1896 for (intptr_t i = from_index; i < characters_; i++) {
1897 QuickCheckDetails::Position* pos = positions(i);
1898 QuickCheckDetails::Position* other_pos = other->positions(i);
1899 if (pos->mask != other_pos->mask || pos->value != other_pos->value ||
1900 !other_pos->determines_perfectly) {
1901 // Our mask-compare operation will be approximate unless we have the
1902 // exact same operation on both sides of the alternation.
1903 pos->determines_perfectly = false;
1904 }
1905 pos->mask &= other_pos->mask;
1906 pos->value &= pos->mask;
1907 other_pos->value &= pos->mask;
1908 uint16_t differing_bits = (pos->value ^ other_pos->value);
1909 pos->mask &= ~differing_bits;
1910 pos->value &= pos->mask;
1911 }
1912}
1913
1914class VisitMarker : public ValueObject {
1915 public:
1916 explicit VisitMarker(NodeInfo* info) : info_(info) {
1917 ASSERT(!info->visited);
1918 info->visited = true;
1919 }
1920 ~VisitMarker() { info_->visited = false; }
1921
1922 private:
1923 NodeInfo* info_;
1924};
1925
1926RegExpNode* SeqRegExpNode::FilterOneByte(intptr_t depth) {
1927 if (info()->replacement_calculated) return replacement();
1928 if (depth < 0) return this;
1929 ASSERT(!info()->visited);
1930 VisitMarker marker(info());
1931 return FilterSuccessor(depth - 1);
1932}
1933
1934RegExpNode* SeqRegExpNode::FilterSuccessor(intptr_t depth) {
1935 RegExpNode* next = on_success_->FilterOneByte(depth - 1);
1936 if (next == NULL) return set_replacement(NULL);
1937 on_success_ = next;
1938 return set_replacement(this);
1939}
1940
1941// We need to check for the following characters: 0x39c 0x3bc 0x178.
1942static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
1943 // TODO(dcarney): this could be a lot more efficient.
1944 return range.Contains(0x39c) || range.Contains(0x3bc) ||
1945 range.Contains(0x178);
1946}
1947
1948static bool RangesContainLatin1Equivalents(
1949 ZoneGrowableArray<CharacterRange>* ranges) {
1950 for (intptr_t i = 0; i < ranges->length(); i++) {
1951 // TODO(dcarney): this could be a lot more efficient.
1952 if (RangeContainsLatin1Equivalents(ranges->At(i))) return true;
1953 }
1954 return false;
1955}
1956
1957static uint16_t ConvertNonLatin1ToLatin1(uint16_t c) {
1958 ASSERT(c > Symbols::kMaxOneCharCodeSymbol);
1959 switch (c) {
1960 // This are equivalent characters in unicode.
1961 case 0x39c:
1962 case 0x3bc:
1963 return 0xb5;
1964 // This is an uppercase of a Latin-1 character
1965 // outside of Latin-1.
1966 case 0x178:
1967 return 0xff;
1968 }
1969 return 0;
1970}
1971
1972RegExpNode* TextNode::FilterOneByte(intptr_t depth) {
1973 if (info()->replacement_calculated) return replacement();
1974 if (depth < 0) return this;
1975 ASSERT(!info()->visited);
1976 VisitMarker marker(info());
1977 intptr_t element_count = elms_->length();
1978 for (intptr_t i = 0; i < element_count; i++) {
1979 TextElement elm = elms_->At(i);
1980 if (elm.text_type() == TextElement::ATOM) {
1981 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
1982 for (intptr_t j = 0; j < quarks->length(); j++) {
1983 uint16_t c = quarks->At(j);
1984 if (c <= Symbols::kMaxOneCharCodeSymbol) continue;
1985 if (!elm.atom()->ignore_case()) return set_replacement(NULL);
1986 // Here, we need to check for characters whose upper and lower cases
1987 // are outside the Latin-1 range.
1988 uint16_t converted = ConvertNonLatin1ToLatin1(c);
1989 // Character is outside Latin-1 completely
1990 if (converted == 0) return set_replacement(NULL);
1991 // Convert quark to Latin-1 in place.
1992 (*quarks)[0] = converted;
1993 }
1994 } else {
1995 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
1996 RegExpCharacterClass* cc = elm.char_class();
1997 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
1998 if (!CharacterRange::IsCanonical(ranges)) {
1999 CharacterRange::Canonicalize(ranges);
2000 }
2001 // Now they are in order so we only need to look at the first.
2002 intptr_t range_count = ranges->length();
2003 if (cc->is_negated()) {
2004 if (range_count != 0 && ranges->At(0).from() == 0 &&
2005 ranges->At(0).to() >= Symbols::kMaxOneCharCodeSymbol) {
2006 // This will be handled in a later filter.
2007 if (cc->flags().IgnoreCase() &&
2008 RangesContainLatin1Equivalents(ranges)) {
2009 continue;
2010 }
2011 return set_replacement(NULL);
2012 }
2013 } else {
2014 if (range_count == 0 ||
2015 ranges->At(0).from() > Symbols::kMaxOneCharCodeSymbol) {
2016 // This will be handled in a later filter.
2017 if (cc->flags().IgnoreCase() &&
2018 RangesContainLatin1Equivalents(ranges))
2019 continue;
2020 return set_replacement(NULL);
2021 }
2022 }
2023 }
2024 }
2025 return FilterSuccessor(depth - 1);
2026}
2027
2028RegExpNode* LoopChoiceNode::FilterOneByte(intptr_t depth) {
2029 if (info()->replacement_calculated) return replacement();
2030 if (depth < 0) return this;
2031 if (info()->visited) return this;
2032 {
2033 VisitMarker marker(info());
2034
2035 RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
2036 // If we can't continue after the loop then there is no sense in doing the
2037 // loop.
2038 if (continue_replacement == NULL) return set_replacement(NULL);
2039 }
2040
2041 return ChoiceNode::FilterOneByte(depth - 1);
2042}
2043
2044RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth) {
2045 if (info()->replacement_calculated) return replacement();
2046 if (depth < 0) return this;
2047 if (info()->visited) return this;
2048 VisitMarker marker(info());
2049 intptr_t choice_count = alternatives_->length();
2050
2051 for (intptr_t i = 0; i < choice_count; i++) {
2052 GuardedAlternative alternative = alternatives_->At(i);
2053 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2054 set_replacement(this);
2055 return this;
2056 }
2057 }
2058
2059 intptr_t surviving = 0;
2060 RegExpNode* survivor = NULL;
2061 for (intptr_t i = 0; i < choice_count; i++) {
2062 GuardedAlternative alternative = alternatives_->At(i);
2063 RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
2064 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
2065 if (replacement != NULL) {
2066 (*alternatives_)[i].set_node(replacement);
2067 surviving++;
2068 survivor = replacement;
2069 }
2070 }
2071 if (surviving < 2) return set_replacement(survivor);
2072
2073 set_replacement(this);
2074 if (surviving == choice_count) {
2075 return this;
2076 }
2077 // Only some of the nodes survived the filtering. We need to rebuild the
2078 // alternatives list.
2079 ZoneGrowableArray<GuardedAlternative>* new_alternatives =
2080 new (Z) ZoneGrowableArray<GuardedAlternative>(surviving);
2081 for (intptr_t i = 0; i < choice_count; i++) {
2082 RegExpNode* replacement =
2083 (*alternatives_)[i].node()->FilterOneByte(depth - 1);
2084 if (replacement != NULL) {
2085 (*alternatives_)[i].set_node(replacement);
2086 new_alternatives->Add((*alternatives_)[i]);
2087 }
2088 }
2089 alternatives_ = new_alternatives;
2090 return this;
2091}
2092
2093RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(intptr_t depth) {
2094 if (info()->replacement_calculated) return replacement();
2095 if (depth < 0) return this;
2096 if (info()->visited) return this;
2097 VisitMarker marker(info());
2098 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2099 // afterwards.
2100 RegExpNode* node = (*alternatives_)[1].node();
2101 RegExpNode* replacement = node->FilterOneByte(depth - 1);
2102 if (replacement == NULL) return set_replacement(NULL);
2103 (*alternatives_)[1].set_node(replacement);
2104
2105 RegExpNode* neg_node = (*alternatives_)[0].node();
2106 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
2107 // If the negative lookahead is always going to fail then
2108 // we don't need to check it.
2109 if (neg_replacement == NULL) return set_replacement(replacement);
2110 (*alternatives_)[0].set_node(neg_replacement);
2111 return set_replacement(this);
2112}
2113
2114void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2115 RegExpCompiler* compiler,
2116 intptr_t characters_filled_in,
2117 bool not_at_start) {
2118 if (body_can_be_zero_length_ || info()->visited) return;
2119 VisitMarker marker(info());
2120 return ChoiceNode::GetQuickCheckDetails(details, compiler,
2121 characters_filled_in, not_at_start);
2122}
2123
2124void LoopChoiceNode::FillInBMInfo(intptr_t offset,
2125 intptr_t budget,
2126 BoyerMooreLookahead* bm,
2127 bool not_at_start) {
2128 if (body_can_be_zero_length_ || budget <= 0) {
2129 bm->SetRest(offset);
2130 SaveBMInfo(bm, not_at_start, offset);
2131 return;
2132 }
2133 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);
2134 SaveBMInfo(bm, not_at_start, offset);
2135}
2136
2137void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2138 RegExpCompiler* compiler,
2139 intptr_t characters_filled_in,
2140 bool not_at_start) {
2141 not_at_start = (not_at_start || not_at_start_);
2142 intptr_t choice_count = alternatives_->length();
2143 ASSERT(choice_count > 0);
2144 (*alternatives_)[0].node()->GetQuickCheckDetails(
2145 details, compiler, characters_filled_in, not_at_start);
2146 for (intptr_t i = 1; i < choice_count; i++) {
2147 QuickCheckDetails new_details(details->characters());
2148 RegExpNode* node = (*alternatives_)[i].node();
2149 node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in,
2150 not_at_start);
2151 // Here we merge the quick match details of the two branches.
2152 details->Merge(&new_details, characters_filled_in);
2153 }
2154}
2155
2156// Check for [0-9A-Z_a-z].
2157static void EmitWordCheck(RegExpMacroAssembler* assembler,
2158 BlockLabel* word,
2159 BlockLabel* non_word,
2160 bool fall_through_on_word) {
2161 if (assembler->CheckSpecialCharacterClass(
2162 fall_through_on_word ? 'w' : 'W',
2163 fall_through_on_word ? non_word : word)) {
2164 // Optimized implementation available.
2165 return;
2166 }
2167 assembler->CheckCharacterGT('z', non_word);
2168 assembler->CheckCharacterLT('0', non_word);
2169 assembler->CheckCharacterGT('a' - 1, word);
2170 assembler->CheckCharacterLT('9' + 1, word);
2171 assembler->CheckCharacterLT('A', non_word);
2172 assembler->CheckCharacterLT('Z' + 1, word);
2173 if (fall_through_on_word) {
2174 assembler->CheckNotCharacter('_', non_word);
2175 } else {
2176 assembler->CheckCharacter('_', word);
2177 }
2178}
2179
2180// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2181// that matches newline or the start of input).
2182static void EmitHat(RegExpCompiler* compiler,
2183 RegExpNode* on_success,
2184 Trace* trace) {
2185 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2186 // We will be loading the previous character into the current character
2187 // register.
2188 Trace new_trace(*trace);
2189 new_trace.InvalidateCurrentCharacter();
2190
2191 BlockLabel ok;
2192 if (new_trace.cp_offset() == 0) {
2193 // The start of input counts as a newline in this context, so skip to
2194 // ok if we are at the start.
2195 assembler->CheckAtStart(&ok);
2196 }
2197 // We already checked that we are not at the start of input so it must be
2198 // OK to load the previous character.
2199 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2200 new_trace.backtrack(), false);
2201 if (!assembler->CheckSpecialCharacterClass('n', new_trace.backtrack())) {
2202 // Newline means \n, \r, 0x2028 or 0x2029.
2203 if (!compiler->one_byte()) {
2204 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2205 }
2206 assembler->CheckCharacter('\n', &ok);
2207 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2208 }
2209 assembler->BindBlock(&ok);
2210 on_success->Emit(compiler, &new_trace);
2211}
2212
2213// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2214void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2215 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2216 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2217 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2218 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2219 if (lookahead == NULL) {
2220 intptr_t eats_at_least =
2221 Utils::Minimum(kMaxLookaheadForBoyerMoore,
2222 EatsAtLeast(kMaxLookaheadForBoyerMoore, kRecursionBudget,
2223 not_at_start));
2224 if (eats_at_least >= 1) {
2225 BoyerMooreLookahead* bm =
2226 new (Z) BoyerMooreLookahead(eats_at_least, compiler, Z);
2227 FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
2228 if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE;
2229 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2230 }
2231 } else {
2232 if (lookahead->at(0)->is_non_word())
2233 next_is_word_character = Trace::FALSE_VALUE;
2234 if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2235 }
2236 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
2237 if (next_is_word_character == Trace::UNKNOWN) {
2238 BlockLabel before_non_word;
2239 BlockLabel before_word;
2240 if (trace->characters_preloaded() != 1) {
2241 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2242 }
2243 // Fall through on non-word.
2244 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2245 // Next character is not a word character.
2246 assembler->BindBlock(&before_non_word);
2247 BlockLabel ok;
2248 // Backtrack on \B (non-boundary check) if previous is a word,
2249 // since we know next *is not* a word and this would be a boundary.
2250 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2251
2252 if (!assembler->IsClosed()) {
2253 assembler->GoTo(&ok);
2254 }
2255
2256 assembler->BindBlock(&before_word);
2257 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2258 assembler->BindBlock(&ok);
2259 } else if (next_is_word_character == Trace::TRUE_VALUE) {
2260 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2261 } else {
2262 ASSERT(next_is_word_character == Trace::FALSE_VALUE);
2263 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2264 }
2265}
2266
2267void AssertionNode::BacktrackIfPrevious(
2268 RegExpCompiler* compiler,
2269 Trace* trace,
2270 AssertionNode::IfPrevious backtrack_if_previous) {
2271 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2272 Trace new_trace(*trace);
2273 new_trace.InvalidateCurrentCharacter();
2274
2275 BlockLabel fall_through, dummy;
2276
2277 BlockLabel* non_word = backtrack_if_previous == kIsNonWord
2278 ? new_trace.backtrack()
2279 : &fall_through;
2280 BlockLabel* word = backtrack_if_previous == kIsNonWord
2281 ? &fall_through
2282 : new_trace.backtrack();
2283
2284 if (new_trace.cp_offset() == 0) {
2285 // The start of input counts as a non-word character, so the question is
2286 // decided if we are at the start.
2287 assembler->CheckAtStart(non_word);
2288 }
2289 // We already checked that we are not at the start of input so it must be
2290 // OK to load the previous character.
2291 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
2292 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
2293
2294 assembler->BindBlock(&fall_through);
2295 on_success()->Emit(compiler, &new_trace);
2296}
2297
2298void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2299 RegExpCompiler* compiler,
2300 intptr_t filled_in,
2301 bool not_at_start) {
2302 if (assertion_type_ == AT_START && not_at_start) {
2303 details->set_cannot_match();
2304 return;
2305 }
2306 return on_success()->GetQuickCheckDetails(details, compiler, filled_in,
2307 not_at_start);
2308}
2309
2310void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2311 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2312 switch (assertion_type_) {
2313 case AT_END: {
2314 BlockLabel ok;
2315 assembler->CheckPosition(trace->cp_offset(), &ok);
2316 assembler->GoTo(trace->backtrack());
2317 assembler->BindBlock(&ok);
2318 break;
2319 }
2320 case AT_START: {
2321 if (trace->at_start() == Trace::FALSE_VALUE) {
2322 assembler->GoTo(trace->backtrack());
2323 return;
2324 }
2325 if (trace->at_start() == Trace::UNKNOWN) {
2326 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
2327 Trace at_start_trace = *trace;
2328 at_start_trace.set_at_start(Trace::TRUE_VALUE);
2329 on_success()->Emit(compiler, &at_start_trace);
2330 return;
2331 }
2332 } break;
2333 case AFTER_NEWLINE:
2334 EmitHat(compiler, on_success(), trace);
2335 return;
2336 case AT_BOUNDARY:
2337 case AT_NON_BOUNDARY: {
2338 EmitBoundaryCheck(compiler, trace);
2339 return;
2340 }
2341 }
2342 on_success()->Emit(compiler, trace);
2343}
2344
2345static bool DeterminedAlready(QuickCheckDetails* quick_check, intptr_t offset) {
2346 if (quick_check == NULL) return false;
2347 if (offset >= quick_check->characters()) return false;
2348 return quick_check->positions(offset)->determines_perfectly;
2349}
2350
2351static void UpdateBoundsCheck(intptr_t index, intptr_t* checked_up_to) {
2352 if (index > *checked_up_to) {
2353 *checked_up_to = index;
2354 }
2355}
2356
2357// We call this repeatedly to generate code for each pass over the text node.
2358// The passes are in increasing order of difficulty because we hope one
2359// of the first passes will fail in which case we are saved the work of the
2360// later passes. for example for the case independent regexp /%[asdfghjkl]a/
2361// we will check the '%' in the first pass, the case independent 'a' in the
2362// second pass and the character class in the last pass.
2363//
2364// The passes are done from right to left, so for example to test for /bar/
2365// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2366// and then a 'b' with offset 0. This means we can avoid the end-of-input
2367// bounds check most of the time. In the example we only need to check for
2368// end-of-input when loading the putative 'r'.
2369//
2370// A slight complication involves the fact that the first character may already
2371// be fetched into a register by the previous node. In this case we want to
2372// do the test for that character first. We do this in separate passes. The
2373// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2374// pass has been performed then subsequent passes will have true in
2375// first_element_checked to indicate that that character does not need to be
2376// checked again.
2377//
2378// In addition to all this we are passed a Trace, which can
2379// contain an AlternativeGeneration object. In this AlternativeGeneration
2380// object we can see details of any quick check that was already passed in
2381// order to get to the code we are now generating. The quick check can involve
2382// loading characters, which means we do not need to recheck the bounds
2383// up to the limit the quick check already checked. In addition the quick
2384// check can have involved a mask and compare operation which may simplify
2385// or obviate the need for further checks at some character positions.
2386void TextNode::TextEmitPass(RegExpCompiler* compiler,
2387 TextEmitPassType pass,
2388 bool preloaded,
2389 Trace* trace,
2390 bool first_element_checked,
2391 intptr_t* checked_up_to) {
2392 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2393 bool one_byte = compiler->one_byte();
2394 BlockLabel* backtrack = trace->backtrack();
2395 QuickCheckDetails* quick_check = trace->quick_check_performed();
2396 intptr_t element_count = elms_->length();
2397 intptr_t backward_offset = read_backward() ? -Length() : 0;
2398 for (intptr_t i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2399 TextElement elm = elms_->At(i);
2400 intptr_t cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
2401 if (elm.text_type() == TextElement::ATOM) {
2402 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
2403 for (intptr_t j = preloaded ? 0 : quarks->length() - 1; j >= 0; j--) {
2404 if (SkipPass(pass, elm.atom()->ignore_case())) continue;
2405 if (first_element_checked && i == 0 && j == 0) continue;
2406 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
2407 EmitCharacterFunction* emit_function = NULL;
2408 uint16_t quark = quarks->At(j);
2409 if (elm.atom()->ignore_case()) {
2410 // Everywhere else we assume that a non-Latin-1 character cannot match
2411 // a Latin-1 character. Avoid the cases where this is assumption is
2412 // invalid by using the Latin1 equivalent instead.
2413 quark = Latin1::TryConvertToLatin1(quark);
2414 }
2415 switch (pass) {
2416 case NON_LATIN1_MATCH:
2417 ASSERT(one_byte);
2418 if (quark > Symbols::kMaxOneCharCodeSymbol) {
2419 assembler->GoTo(backtrack);
2420 return;
2421 }
2422 break;
2423 case NON_LETTER_CHARACTER_MATCH:
2424 emit_function = &EmitAtomNonLetter;
2425 break;
2426 case SIMPLE_CHARACTER_MATCH:
2427 emit_function = &EmitSimpleCharacter;
2428 break;
2429 case CASE_CHARACTER_MATCH:
2430 emit_function = &EmitAtomLetter;
2431 break;
2432 default:
2433 break;
2434 }
2435 if (emit_function != NULL) {
2436 const bool bounds_check =
2437 *checked_up_to < (cp_offset + j) || read_backward();
2438 bool bound_checked =
2439 emit_function(Z, compiler, quarks->At(j), backtrack,
2440 cp_offset + j, bounds_check, preloaded);
2441 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2442 }
2443 }
2444 } else {
2445 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
2446 if (pass == CHARACTER_CLASS_MATCH) {
2447 if (first_element_checked && i == 0) continue;
2448 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
2449 RegExpCharacterClass* cc = elm.char_class();
2450 bool bounds_check = *checked_up_to < cp_offset || read_backward();
2451 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
2452 bounds_check, preloaded, Z);
2453 UpdateBoundsCheck(cp_offset, checked_up_to);
2454 }
2455 }
2456 }
2457}
2458
2459intptr_t TextNode::Length() {
2460 TextElement elm = elms_->Last();
2461 ASSERT(elm.cp_offset() >= 0);
2462 return elm.cp_offset() + elm.length();
2463}
2464
2465bool TextNode::SkipPass(intptr_t intptr_t_pass, bool ignore_case) {
2466 TextEmitPassType pass = static_cast<TextEmitPassType>(intptr_t_pass);
2467 if (ignore_case) {
2468 return pass == SIMPLE_CHARACTER_MATCH;
2469 } else {
2470 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2471 }
2472}
2473
2474TextNode* TextNode::CreateForCharacterRanges(
2475 ZoneGrowableArray<CharacterRange>* ranges,
2476 bool read_backward,
2477 RegExpNode* on_success,
2478 RegExpFlags flags) {
2479 ASSERT(ranges != nullptr);
2480 ZoneGrowableArray<TextElement>* elms = new ZoneGrowableArray<TextElement>(1);
2481 elms->Add(TextElement::CharClass(new RegExpCharacterClass(ranges, flags)));
2482 return new TextNode(elms, read_backward, on_success);
2483}
2484
2485TextNode* TextNode::CreateForSurrogatePair(CharacterRange lead,
2486 CharacterRange trail,
2487 bool read_backward,
2488 RegExpNode* on_success,
2489 RegExpFlags flags) {
2490 auto lead_ranges = CharacterRange::List(on_success->zone(), lead);
2491 auto trail_ranges = CharacterRange::List(on_success->zone(), trail);
2492 auto elms = new ZoneGrowableArray<TextElement>(2);
2493
2494 elms->Add(
2495 TextElement::CharClass(new RegExpCharacterClass(lead_ranges, flags)));
2496 elms->Add(
2497 TextElement::CharClass(new RegExpCharacterClass(trail_ranges, flags)));
2498
2499 return new TextNode(elms, read_backward, on_success);
2500}
2501
2502// This generates the code to match a text node. A text node can contain
2503// straight character sequences (possibly to be matched in a case-independent
2504// way) and character classes. For efficiency we do not do this in a single
2505// pass from left to right. Instead we pass over the text node several times,
2506// emitting code for some character positions every time. See the comment on
2507// TextEmitPass for details.
2508void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2509 LimitResult limit_result = LimitVersions(compiler, trace);
2510 if (limit_result == DONE) return;
2511 ASSERT(limit_result == CONTINUE);
2512
2513 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2514 compiler->SetRegExpTooBig();
2515 return;
2516 }
2517
2518 if (compiler->one_byte()) {
2519 intptr_t dummy = 0;
2520 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
2521 }
2522
2523 bool first_elt_done = false;
2524 intptr_t bound_checked_to = trace->cp_offset() - 1;
2525 bound_checked_to += trace->bound_checked_up_to();
2526
2527 // If a character is preloaded into the current character register then
2528 // check that now.
2529 if (trace->characters_preloaded() == 1) {
2530 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
2531 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
2532 false, &bound_checked_to);
2533 }
2534 first_elt_done = true;
2535 }
2536
2537 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
2538 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
2539 first_elt_done, &bound_checked_to);
2540 }
2541
2542 Trace successor_trace(*trace);
2543 // If we advance backward, we may end up at the start.
2544 successor_trace.AdvanceCurrentPositionInTrace(
2545 read_backward() ? -Length() : Length(), compiler);
2546 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
2547 : Trace::FALSE_VALUE);
2548 RecursionCheck rc(compiler);
2549 on_success()->Emit(compiler, &successor_trace);
2550}
2551
2552void Trace::InvalidateCurrentCharacter() {
2553 characters_preloaded_ = 0;
2554}
2555
2556void Trace::AdvanceCurrentPositionInTrace(intptr_t by,
2557 RegExpCompiler* compiler) {
2558 // We don't have an instruction for shifting the current character register
2559 // down or for using a shifted value for anything so lets just forget that
2560 // we preloaded any characters into it.
2561 characters_preloaded_ = 0;
2562 // Adjust the offsets of the quick check performed information. This
2563 // information is used to find out what we already determined about the
2564 // characters by means of mask and compare.
2565 quick_check_performed_.Advance(by, compiler->one_byte());
2566 cp_offset_ += by;
2567 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2568 compiler->SetRegExpTooBig();
2569 cp_offset_ = 0;
2570 }
2571 bound_checked_up_to_ =
2572 Utils::Maximum(static_cast<intptr_t>(0), bound_checked_up_to_ - by);
2573}
2574
2575void TextNode::MakeCaseIndependent(bool is_one_byte) {
2576 intptr_t element_count = elms_->length();
2577 for (intptr_t i = 0; i < element_count; i++) {
2578 TextElement elm = elms_->At(i);
2579 if (elm.text_type() == TextElement::CHAR_CLASS) {
2580 RegExpCharacterClass* cc = elm.char_class();
2581 bool case_equivalents_already_added =
2582 cc->flags().NeedsUnicodeCaseEquivalents();
2583 if (cc->flags().IgnoreCase() && !case_equivalents_already_added) {
2584 // None of the standard character classes is different in the case
2585 // independent case and it slows us down if we don't know that.
2586 if (cc->is_standard()) continue;
2587 CharacterRange::AddCaseEquivalents(cc->ranges(), is_one_byte, Z);
2588 }
2589 }
2590 }
2591}
2592
2593intptr_t TextNode::GreedyLoopTextLength() {
2594 TextElement elm = elms_->At(elms_->length() - 1);
2595 return elm.cp_offset() + elm.length();
2596}
2597
2598RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
2599 RegExpCompiler* compiler) {
2600 if (read_backward()) return nullptr;
2601 if (elms_->length() != 1) return NULL;
2602 TextElement elm = elms_->At(0);
2603 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
2604 RegExpCharacterClass* node = elm.char_class();
2605 ZoneGrowableArray<CharacterRange>* ranges = node->ranges();
2606 if (!CharacterRange::IsCanonical(ranges)) {
2607 CharacterRange::Canonicalize(ranges);
2608 }
2609 if (node->is_negated()) {
2610 return ranges->length() == 0 ? on_success() : NULL;
2611 }
2612 if (ranges->length() != 1) return NULL;
2613 uint32_t max_char;
2614 if (compiler->one_byte()) {
2615 max_char = Symbols::kMaxOneCharCodeSymbol;
2616 } else {
2617 max_char = Utf16::kMaxCodeUnit;
2618 }
2619 return ranges->At(0).IsEverything(max_char) ? on_success() : NULL;
2620}
2621
2622// Finds the fixed match length of a sequence of nodes that goes from
2623// this alternative and back to this choice node. If there are variable
2624// length nodes or other complications in the way then return a sentinel
2625// value indicating that a greedy loop cannot be constructed.
2626intptr_t ChoiceNode::GreedyLoopTextLengthForAlternative(
2627 const GuardedAlternative* alternative) {
2628 intptr_t length = 0;
2629 RegExpNode* node = alternative->node();
2630 // Later we will generate code for all these text nodes using recursion
2631 // so we have to limit the max number.
2632 intptr_t recursion_depth = 0;
2633 while (node != this) {
2634 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2635 return kNodeIsTooComplexForGreedyLoops;
2636 }
2637 intptr_t node_length = node->GreedyLoopTextLength();
2638 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2639 return kNodeIsTooComplexForGreedyLoops;
2640 }
2641 length += node_length;
2642 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2643 node = seq_node->on_success();
2644 }
2645 return read_backward() ? -length : length;
2646}
2647
2648void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2649 ASSERT(loop_node_ == NULL);
2650 AddAlternative(alt);
2651 loop_node_ = alt.node();
2652}
2653
2654void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2655 ASSERT(continue_node_ == NULL);
2656 AddAlternative(alt);
2657 continue_node_ = alt.node();
2658}
2659
2660void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2661 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2662 if (trace->stop_node() == this) {
2663 // Back edge of greedy optimized loop node graph.
2664 intptr_t text_length =
2665 GreedyLoopTextLengthForAlternative(&alternatives_->At(0));
2666 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2667 // Update the counter-based backtracking info on the stack. This is an
2668 // optimization for greedy loops (see below).
2669 ASSERT(trace->cp_offset() == text_length);
2670 macro_assembler->AdvanceCurrentPosition(text_length);
2671 macro_assembler->GoTo(trace->loop_label());
2672 return;
2673 }
2674 ASSERT(trace->stop_node() == NULL);
2675 if (!trace->is_trivial()) {
2676 trace->Flush(compiler, this);
2677 return;
2678 }
2679 ChoiceNode::Emit(compiler, trace);
2680}
2681
2682intptr_t ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
2683 intptr_t eats_at_least) {
2684 intptr_t preload_characters =
2685 Utils::Minimum(static_cast<intptr_t>(4), eats_at_least);
2686 if (compiler->macro_assembler()->CanReadUnaligned()) {
2687 bool one_byte = compiler->one_byte();
2688 if (one_byte) {
2689 if (preload_characters > 4) preload_characters = 4;
2690 // We can't preload 3 characters because there is no machine instruction
2691 // to do that. We can't just load 4 because we could be reading
2692 // beyond the end of the string, which could cause a memory fault.
2693 if (preload_characters == 3) preload_characters = 2;
2694 } else {
2695 if (preload_characters > 2) preload_characters = 2;
2696 }
2697 } else {
2698 if (preload_characters > 1) preload_characters = 1;
2699 }
2700 return preload_characters;
2701}
2702
2703// This structure is used when generating the alternatives in a choice node. It
2704// records the way the alternative is being code generated.
2705struct AlternativeGeneration {
2706 AlternativeGeneration()
2707 : possible_success(),
2708 expects_preload(false),
2709 after(),
2710 quick_check_details() {}
2711 BlockLabel possible_success;
2712 bool expects_preload;
2713 BlockLabel after;
2714 QuickCheckDetails quick_check_details;
2715};
2716
2717// Creates a list of AlternativeGenerations. If the list has a reasonable
2718// size then it is on the stack, otherwise the excess is on the heap.
2719class AlternativeGenerationList {
2720 public:
2721 explicit AlternativeGenerationList(intptr_t count) : count_(count) {
2722 ASSERT(count >= 0);
2723 if (count > kAFew) {
2724 excess_alt_gens_.reset(new AlternativeGeneration[count - kAFew]);
2725 }
2726 }
2727
2728 AlternativeGeneration* at(intptr_t i) {
2729 ASSERT(0 <= i);
2730 ASSERT(i < count_);
2731 if (i < kAFew) {
2732 return &a_few_alt_gens_[i];
2733 }
2734 return &excess_alt_gens_[i - kAFew];
2735 }
2736
2737 private:
2738 static const intptr_t kAFew = 10;
2739
2740 intptr_t count_;
2741 AlternativeGeneration a_few_alt_gens_[kAFew];
2742 std::unique_ptr<AlternativeGeneration[]> excess_alt_gens_;
2743
2744 DISALLOW_ALLOCATION();
2745 DISALLOW_COPY_AND_ASSIGN(AlternativeGenerationList);
2746};
2747
2748static const int32_t kRangeEndMarker = Utf::kMaxCodePoint + 1;
2749
2750// The '2' variant is inclusive from and exclusive to.
2751// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
2752// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
2753// 0x180E has been removed from Unicode's Zs category and thus
2754// from ECMAScript's WhiteSpace category as of Unicode 6.3.
2755static const int32_t kSpaceRanges[] = {
2756 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
2757 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
2758 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
2759static const intptr_t kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
2760static const int32_t kWordRanges[] = {
2761 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
2762static const intptr_t kWordRangeCount = ARRAY_SIZE(kWordRanges);
2763static const int32_t kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
2764static const intptr_t kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
2765static const int32_t kSurrogateRanges[] = {0xd800, 0xe000, kRangeEndMarker};
2766static const intptr_t kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges);
2767static const int32_t kLineTerminatorRanges[] = {
2768 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
2769static const intptr_t kLineTerminatorRangeCount =
2770 ARRAY_SIZE(kLineTerminatorRanges);
2771
2772void BoyerMoorePositionInfo::Set(intptr_t character) {
2773 SetInterval(Interval(character, character));
2774}
2775
2776void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
2777 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
2778 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
2779 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
2780 surrogate_ =
2781 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
2782 if (interval.to() - interval.from() >= kMapSize - 1) {
2783 if (map_count_ != kMapSize) {
2784 map_count_ = kMapSize;
2785 for (intptr_t i = 0; i < kMapSize; i++)
2786 (*map_)[i] = true;
2787 }
2788 return;
2789 }
2790 for (intptr_t i = interval.from(); i <= interval.to(); i++) {
2791 intptr_t mod_character = (i & kMask);
2792 if (!map_->At(mod_character)) {
2793 map_count_++;
2794 (*map_)[mod_character] = true;
2795 }
2796 if (map_count_ == kMapSize) return;
2797 }
2798}
2799
2800void BoyerMoorePositionInfo::SetAll() {
2801 s_ = w_ = d_ = kLatticeUnknown;
2802 if (map_count_ != kMapSize) {
2803 map_count_ = kMapSize;
2804 for (intptr_t i = 0; i < kMapSize; i++)
2805 (*map_)[i] = true;
2806 }
2807}
2808
2809BoyerMooreLookahead::BoyerMooreLookahead(intptr_t length,
2810 RegExpCompiler* compiler,
2811 Zone* zone)
2812 : length_(length), compiler_(compiler) {
2813 if (compiler->one_byte()) {
2814 max_char_ = Symbols::kMaxOneCharCodeSymbol;
2815 } else {
2816 max_char_ = Utf16::kMaxCodeUnit;
2817 }
2818 bitmaps_ = new (zone) ZoneGrowableArray<BoyerMoorePositionInfo*>(length);
2819 for (intptr_t i = 0; i < length; i++) {
2820 bitmaps_->Add(new (zone) BoyerMoorePositionInfo(zone));
2821 }
2822}
2823
2824// Find the longest range of lookahead that has the fewest number of different
2825// characters that can occur at a given position. Since we are optimizing two
2826// different parameters at once this is a tradeoff.
2827bool BoyerMooreLookahead::FindWorthwhileInterval(intptr_t* from, intptr_t* to) {
2828 intptr_t biggest_points = 0;
2829 // If more than 32 characters out of 128 can occur it is unlikely that we can
2830 // be lucky enough to step forwards much of the time.
2831 const intptr_t kMaxMax = 32;
2832 for (intptr_t max_number_of_chars = 4; max_number_of_chars < kMaxMax;
2833 max_number_of_chars *= 2) {
2834 biggest_points =
2835 FindBestInterval(max_number_of_chars, biggest_points, from, to);
2836 }
2837 if (biggest_points == 0) return false;
2838 return true;
2839}
2840
2841// Find the highest-points range between 0 and length_ where the character
2842// information is not too vague. 'Too vague' means that there are more than
2843// max_number_of_chars that can occur at this position. Calculates the number
2844// of points as the product of width-of-the-range and
2845// probability-of-finding-one-of-the-characters, where the probability is
2846// calculated using the frequency distribution of the sample subject string.
2847intptr_t BoyerMooreLookahead::FindBestInterval(intptr_t max_number_of_chars,
2848 intptr_t old_biggest_points,
2849 intptr_t* from,
2850 intptr_t* to) {
2851 intptr_t biggest_points = old_biggest_points;
2852 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2853 for (intptr_t i = 0; i < length_;) {
2854 while (i < length_ && Count(i) > max_number_of_chars)
2855 i++;
2856 if (i == length_) break;
2857 intptr_t remembered_from = i;
2858 bool union_map[kSize];
2859 for (intptr_t j = 0; j < kSize; j++)
2860 union_map[j] = false;
2861 while (i < length_ && Count(i) <= max_number_of_chars) {
2862 BoyerMoorePositionInfo* map = bitmaps_->At(i);
2863 for (intptr_t j = 0; j < kSize; j++)
2864 union_map[j] |= map->at(j);
2865 i++;
2866 }
2867 intptr_t frequency = 0;
2868 for (intptr_t j = 0; j < kSize; j++) {
2869 if (union_map[j]) {
2870 // Add 1 to the frequency to give a small per-character boost for
2871 // the cases where our sampling is not good enough and many
2872 // characters have a frequency of zero. This means the frequency
2873 // can theoretically be up to 2*kSize though we treat it mostly as
2874 // a fraction of kSize.
2875 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
2876 }
2877 }
2878 // We use the probability of skipping times the distance we are skipping to
2879 // judge the effectiveness of this. Actually we have a cut-off: By
2880 // dividing by 2 we switch off the skipping if the probability of skipping
2881 // is less than 50%. This is because the multibyte mask-and-compare
2882 // skipping in quickcheck is more likely to do well on this case.
2883 bool in_quickcheck_range =
2884 ((i - remembered_from < 4) ||
2885 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
2886 // Called 'probability' but it is only a rough estimate and can actually
2887 // be outside the 0-kSize range.
2888 intptr_t probability =
2889 (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
2890 intptr_t points = (i - remembered_from) * probability;
2891 if (points > biggest_points) {
2892 *from = remembered_from;
2893 *to = i - 1;
2894 biggest_points = points;
2895 }
2896 }
2897 return biggest_points;
2898}
2899
2900// Take all the characters that will not prevent a successful match if they
2901// occur in the subject string in the range between min_lookahead and
2902// max_lookahead (inclusive) measured from the current position. If the
2903// character at max_lookahead offset is not one of these characters, then we
2904// can safely skip forwards by the number of characters in the range.
2905intptr_t BoyerMooreLookahead::GetSkipTable(
2906 intptr_t min_lookahead,
2907 intptr_t max_lookahead,
2908 const TypedData& boolean_skip_table) {
2909 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2910
2911 const intptr_t kSkipArrayEntry = 0;
2912 const intptr_t kDontSkipArrayEntry = 1;
2913
2914 for (intptr_t i = 0; i < kSize; i++) {
2915 boolean_skip_table.SetUint8(i, kSkipArrayEntry);
2916 }
2917 intptr_t skip = max_lookahead + 1 - min_lookahead;
2918
2919 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
2920 BoyerMoorePositionInfo* map = bitmaps_->At(i);
2921 for (intptr_t j = 0; j < kSize; j++) {
2922 if (map->at(j)) {
2923 boolean_skip_table.SetUint8(j, kDontSkipArrayEntry);
2924 }
2925 }
2926 }
2927
2928 return skip;
2929}
2930
2931// See comment above on the implementation of GetSkipTable.
2932void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
2933 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2934
2935 intptr_t min_lookahead = 0;
2936 intptr_t max_lookahead = 0;
2937
2938 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
2939
2940 bool found_single_character = false;
2941 intptr_t single_character = 0;
2942 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
2943 BoyerMoorePositionInfo* map = bitmaps_->At(i);
2944 if (map->map_count() > 1 ||
2945 (found_single_character && map->map_count() != 0)) {
2946 found_single_character = false;
2947 break;
2948 }
2949 for (intptr_t j = 0; j < kSize; j++) {
2950 if (map->at(j)) {
2951 found_single_character = true;
2952 single_character = j;
2953 break;
2954 }
2955 }
2956 }
2957
2958 intptr_t lookahead_width = max_lookahead + 1 - min_lookahead;
2959
2960 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
2961 // The mask-compare can probably handle this better.
2962 return;
2963 }
2964
2965 if (found_single_character) {
2966 BlockLabel cont, again;
2967 masm->BindBlock(&again);
2968 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2969 if (max_char_ > kSize) {
2970 masm->CheckCharacterAfterAnd(single_character,
2971 RegExpMacroAssembler::kTableMask, &cont);
2972 } else {
2973 masm->CheckCharacter(single_character, &cont);
2974 }
2975 masm->AdvanceCurrentPosition(lookahead_width);
2976 masm->GoTo(&again);
2977 masm->BindBlock(&cont);
2978 return;
2979 }
2980
2981 const TypedData& boolean_skip_table = TypedData::ZoneHandle(
2982 compiler_->zone(),
2983 TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));
2984 intptr_t skip_distance =
2985 GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table);
2986 ASSERT(skip_distance != 0);
2987
2988 BlockLabel cont, again;
2989
2990 masm->BindBlock(&again);
2991 masm->CheckPreemption(/*is_backtrack=*/false);
2992 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2993 masm->CheckBitInTable(boolean_skip_table, &cont);
2994 masm->AdvanceCurrentPosition(skip_distance);
2995 masm->GoTo(&again);
2996 masm->BindBlock(&cont);
2997
2998 return;
2999}
3000
3001/* Code generation for choice nodes.
3002 *
3003 * We generate quick checks that do a mask and compare to eliminate a
3004 * choice. If the quick check succeeds then it jumps to the continuation to
3005 * do slow checks and check subsequent nodes. If it fails (the common case)
3006 * it falls through to the next choice.
3007 *
3008 * Here is the desired flow graph. Nodes directly below each other imply
3009 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3010 * 3 doesn't have a quick check so we have to call the slow check.
3011 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3012 * regexp continuation is generated directly after the Sn node, up to the
3013 * next GoTo if we decide to reuse some already generated code. Some
3014 * nodes expect preload_characters to be preloaded into the current
3015 * character register. R nodes do this preloading. Vertices are marked
3016 * F for failures and S for success (possible success in the case of quick
3017 * nodes). L, V, < and > are used as arrow heads.
3018 *
3019 * ----------> R
3020 * |
3021 * V
3022 * Q1 -----> S1
3023 * | S /
3024 * F| /
3025 * | F/
3026 * | /
3027 * | R
3028 * | /
3029 * V L
3030 * Q2 -----> S2
3031 * | S /
3032 * F| /
3033 * | F/
3034 * | /
3035 * | R
3036 * | /
3037 * V L
3038 * S3
3039 * |
3040 * F|
3041 * |
3042 * R
3043 * |
3044 * backtrack V
3045 * <----------Q4
3046 * \ F |
3047 * \ |S
3048 * \ F V
3049 * \-----S4
3050 *
3051 * For greedy loops we push the current position, then generate the code that
3052 * eats the input specially in EmitGreedyLoop. The other choice (the
3053 * continuation) is generated by the normal code in EmitChoices, and steps back
3054 * in the input to the starting position when it fails to match. The loop code
3055 * looks like this (U is the unwind code that steps back in the greedy loop).
3056 *
3057 * _____
3058 * / \
3059 * V |
3060 * ----------> S1 |
3061 * /| |
3062 * / |S |
3063 * F/ \_____/
3064 * /
3065 * |<-----
3066 * | \
3067 * V |S
3068 * Q2 ---> U----->backtrack
3069 * | F /
3070 * S| /
3071 * V F /
3072 * S2--/
3073 */
3074
3075GreedyLoopState::GreedyLoopState(bool not_at_start) {
3076 counter_backtrack_trace_.set_backtrack(&label_);
3077 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3078}
3079
3080void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3081#ifdef DEBUG
3082 intptr_t choice_count = alternatives_->length();
3083 for (intptr_t i = 0; i < choice_count - 1; i++) {
3084 GuardedAlternative alternative = alternatives_->At(i);
3085 ZoneGrowableArray<Guard*>* guards = alternative.guards();
3086 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();
3087 for (intptr_t j = 0; j < guard_count; j++) {
3088 ASSERT(!trace->mentions_reg(guards->At(j)->reg()));
3089 }
3090 }
3091#endif
3092}
3093
3094void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3095 Trace* current_trace,
3096 PreloadState* state) {
3097 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3098 // Save some time by looking at most one machine word ahead.
3099 state->eats_at_least_ =
3100 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3101 current_trace->at_start() == Trace::FALSE_VALUE);
3102 }
3103 state->preload_characters_ =
3104 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3105
3106 state->preload_is_current_ =
3107 (current_trace->characters_preloaded() == state->preload_characters_);
3108 state->preload_has_checked_bounds_ = state->preload_is_current_;
3109}
3110
3111void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3112 intptr_t choice_count = alternatives_->length();
3113
3114 if (choice_count == 1 && alternatives_->At(0).guards() == nullptr) {
3115 alternatives_->At(0).node()->Emit(compiler, trace);
3116 return;
3117 }
3118
3119 AssertGuardsMentionRegisters(trace);
3120
3121 LimitResult limit_result = LimitVersions(compiler, trace);
3122 if (limit_result == DONE) return;
3123 ASSERT(limit_result == CONTINUE);
3124
3125 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3126 // other choice nodes we only flush if we are out of code size budget.
3127 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3128 trace->Flush(compiler, this);
3129 return;
3130 }
3131
3132 RecursionCheck rc(compiler);
3133
3134 PreloadState preload;
3135 preload.init();
3136 GreedyLoopState greedy_loop_state(not_at_start());
3137
3138 intptr_t text_length =
3139 GreedyLoopTextLengthForAlternative(&alternatives_->At(0));
3140 AlternativeGenerationList alt_gens(choice_count);
3141
3142 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3143 trace = EmitGreedyLoop(compiler, trace, &alt_gens, &preload,
3144 &greedy_loop_state, text_length);
3145 } else {
3146 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3147 // match the traces produced pre-cleanup.
3148 BlockLabel second_choice;
3149 compiler->macro_assembler()->BindBlock(&second_choice);
3150
3151 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3152
3153 EmitChoices(compiler, &alt_gens, 0, trace, &preload);
3154 }
3155
3156 // At this point we need to generate slow checks for the alternatives where
3157 // the quick check was inlined. We can recognize these because the associated
3158 // label was bound.
3159 intptr_t new_flush_budget = trace->flush_budget() / choice_count;
3160 for (intptr_t i = 0; i < choice_count; i++) {
3161 AlternativeGeneration* alt_gen = alt_gens.at(i);
3162 Trace new_trace(*trace);
3163 // If there are actions to be flushed we have to limit how many times
3164 // they are flushed. Take the budget of the parent trace and distribute
3165 // it fairly amongst the children.
3166 if (new_trace.actions() != NULL) {
3167 new_trace.set_flush_budget(new_flush_budget);
3168 }
3169 bool next_expects_preload =
3170 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3171 EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->At(i),
3172 alt_gen, preload.preload_characters_,
3173 next_expects_preload);
3174 }
3175}
3176
3177Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3178 Trace* trace,
3179 AlternativeGenerationList* alt_gens,
3180 PreloadState* preload,
3181 GreedyLoopState* greedy_loop_state,
3182 intptr_t text_length) {
3183 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3184 // Here we have special handling for greedy loops containing only text nodes
3185 // and other simple nodes. These are handled by pushing the current
3186 // position on the stack and then incrementing the current position each
3187 // time around the switch. On backtrack we decrement the current position
3188 // and check it against the pushed value. This avoids pushing backtrack
3189 // information for each iteration of the loop, which could take up a lot of
3190 // space.
3191 ASSERT(trace->stop_node() == NULL);
3192 macro_assembler->PushCurrentPosition();
3193 BlockLabel greedy_match_failed;
3194 Trace greedy_match_trace;
3195 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3196 greedy_match_trace.set_backtrack(&greedy_match_failed);
3197 BlockLabel loop_label;
3198 macro_assembler->BindBlock(&loop_label);
3199 macro_assembler->CheckPreemption(/*is_backtrack=*/false);
3200 greedy_match_trace.set_stop_node(this);
3201 greedy_match_trace.set_loop_label(&loop_label);
3202 (*alternatives_)[0].node()->Emit(compiler, &greedy_match_trace);
3203 macro_assembler->BindBlock(&greedy_match_failed);
3204
3205 BlockLabel second_choice; // For use in greedy matches.
3206 macro_assembler->BindBlock(&second_choice);
3207
3208 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
3209
3210 EmitChoices(compiler, alt_gens, 1, new_trace, preload);
3211
3212 macro_assembler->BindBlock(greedy_loop_state->label());
3213 // If we have unwound to the bottom then backtrack.
3214 macro_assembler->CheckGreedyLoop(trace->backtrack());
3215 // Otherwise try the second priority at an earlier position.
3216 macro_assembler->AdvanceCurrentPosition(-text_length);
3217 macro_assembler->GoTo(&second_choice);
3218 return new_trace;
3219}
3220
3221intptr_t ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
3222 Trace* trace) {
3223 intptr_t eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
3224 if (alternatives_->length() != 2) return eats_at_least;
3225
3226 GuardedAlternative alt1 = alternatives_->At(1);
3227 if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
3228 return eats_at_least;
3229 }
3230 RegExpNode* eats_anything_node = alt1.node();
3231 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
3232 return eats_at_least;
3233 }
3234
3235 // Really we should be creating a new trace when we execute this function,
3236 // but there is no need, because the code it generates cannot backtrack, and
3237 // we always arrive here with a trivial trace (since it's the entry to a
3238 // loop. That also implies that there are no preloaded characters, which is
3239 // good, because it means we won't be violating any assumptions by
3240 // overwriting those characters with new load instructions.
3241 ASSERT(trace->is_trivial());
3242
3243 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3244 // At this point we know that we are at a non-greedy loop that will eat
3245 // any character one at a time. Any non-anchored regexp has such a
3246 // loop prepended to it in order to find where it starts. We look for
3247 // a pattern of the form ...abc... where we can look 6 characters ahead
3248 // and step forwards 3 if the character is not one of abc. Abc need
3249 // not be atoms, they can be any reasonably limited character class or
3250 // small alternation.
3251 BoyerMooreLookahead* bm = bm_info(false);
3252 if (bm == NULL) {
3253 eats_at_least = Utils::Minimum(
3254 kMaxLookaheadForBoyerMoore,
3255 EatsAtLeast(kMaxLookaheadForBoyerMoore, kRecursionBudget, false));
3256 if (eats_at_least >= 1) {
3257 bm = new (Z) BoyerMooreLookahead(eats_at_least, compiler, Z);
3258 GuardedAlternative alt0 = alternatives_->At(0);
3259 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, false);
3260 }
3261 }
3262 if (bm != NULL) {
3263 bm->EmitSkipInstructions(macro_assembler);
3264 }
3265 return eats_at_least;
3266}
3267
3268void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
3269 AlternativeGenerationList* alt_gens,
3270 intptr_t first_choice,
3271 Trace* trace,
3272 PreloadState* preload) {
3273 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3274 SetUpPreLoad(compiler, trace, preload);
3275
3276 // For now we just call all choices one after the other. The idea ultimately
3277 // is to use the Dispatch table to try only the relevant ones.
3278 intptr_t choice_count = alternatives_->length();
3279
3280 intptr_t new_flush_budget = trace->flush_budget() / choice_count;
3281
3282 for (intptr_t i = first_choice; i < choice_count; i++) {
3283 bool is_last = i == choice_count - 1;
3284 bool fall_through_on_failure = !is_last;
3285 GuardedAlternative alternative = alternatives_->At(i);
3286 AlternativeGeneration* alt_gen = alt_gens->at(i);
3287 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
3288 ZoneGrowableArray<Guard*>* guards = alternative.guards();
3289 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();
3290 Trace new_trace(*trace);
3291 new_trace.set_characters_preloaded(
3292 preload->preload_is_current_ ? preload->preload_characters_ : 0);
3293 if (preload->preload_has_checked_bounds_) {
3294 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3295 }
3296 new_trace.quick_check_performed()->Clear();
3297 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
3298 if (!is_last) {
3299 new_trace.set_backtrack(&alt_gen->after);
3300 }
3301 alt_gen->expects_preload = preload->preload_is_current_;
3302 bool generate_full_check_inline = false;
3303 if (kRegexpOptimization &&
3304 try_to_emit_quick_check_for_alternative(i == 0) &&
3305 alternative.node()->EmitQuickCheck(
3306 compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
3307 &alt_gen->possible_success, &alt_gen->quick_check_details,
3308 fall_through_on_failure)) {
3309 // Quick check was generated for this choice.
3310 preload->preload_is_current_ = true;
3311 preload->preload_has_checked_bounds_ = true;
3312 // If we generated the quick check to fall through on possible success,
3313 // we now need to generate the full check inline.
3314 if (!fall_through_on_failure) {
3315 macro_assembler->BindBlock(&alt_gen->possible_success);
3316 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3317 new_trace.set_characters_preloaded(preload->preload_characters_);
3318 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3319 generate_full_check_inline = true;
3320 }
3321 } else if (alt_gen->quick_check_details.cannot_match()) {
3322 if (!fall_through_on_failure) {
3323 macro_assembler->GoTo(trace->backtrack());
3324 }
3325 continue;
3326 } else {
3327 // No quick check was generated. Put the full code here.
3328 // If this is not the first choice then there could be slow checks from
3329 // previous cases that go here when they fail. There's no reason to
3330 // insist that they preload characters since the slow check we are about
3331 // to generate probably can't use it.
3332 if (i != first_choice) {
3333 alt_gen->expects_preload = false;
3334 new_trace.InvalidateCurrentCharacter();
3335 }
3336 generate_full_check_inline = true;
3337 }
3338 if (generate_full_check_inline) {
3339 if (new_trace.actions() != NULL) {
3340 new_trace.set_flush_budget(new_flush_budget);
3341 }
3342 for (intptr_t j = 0; j < guard_count; j++) {
3343 GenerateGuard(macro_assembler, guards->At(j), &new_trace);
3344 }
3345 alternative.node()->Emit(compiler, &new_trace);
3346 preload->preload_is_current_ = false;
3347 }
3348 macro_assembler->BindBlock(&alt_gen->after);
3349 }
3350}
3351
3352void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
3353 Trace* trace,
3354 GuardedAlternative alternative,
3355 AlternativeGeneration* alt_gen,
3356 intptr_t preload_characters,
3357 bool next_expects_preload) {
3358 if (!alt_gen->possible_success.is_linked()) return;
3359
3360 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3361 macro_assembler->BindBlock(&alt_gen->possible_success);
3362 Trace out_of_line_trace(*trace);
3363 out_of_line_trace.set_characters_preloaded(preload_characters);
3364 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3365 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
3366 ZoneGrowableArray<Guard*>* guards = alternative.guards();
3367 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();
3368 if (next_expects_preload) {
3369 BlockLabel reload_current_char;
3370 out_of_line_trace.set_backtrack(&reload_current_char);
3371 for (intptr_t j = 0; j < guard_count; j++) {
3372 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);
3373 }
3374 alternative.node()->Emit(compiler, &out_of_line_trace);
3375 macro_assembler->BindBlock(&reload_current_char);
3376 // Reload the current character, since the next quick check expects that.
3377 // We don't need to check bounds here because we only get into this
3378 // code through a quick check which already did the checked load.
3379 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), NULL, false,
3380 preload_characters);
3381 macro_assembler->GoTo(&(alt_gen->after));
3382 } else {
3383 out_of_line_trace.set_backtrack(&(alt_gen->after));
3384 for (intptr_t j = 0; j < guard_count; j++) {
3385 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);
3386 }
3387 alternative.node()->Emit(compiler, &out_of_line_trace);
3388 }
3389}
3390
3391void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3392 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3393 LimitResult limit_result = LimitVersions(compiler, trace);
3394 if (limit_result == DONE) return;
3395 ASSERT(limit_result == CONTINUE);
3396
3397 RecursionCheck rc(compiler);
3398
3399 switch (action_type_) {
3400 case STORE_POSITION: {
3401 Trace::DeferredCapture new_capture(data_.u_position_register.reg,
3402 data_.u_position_register.is_capture,
3403 trace);
3404 Trace new_trace = *trace;
3405 new_trace.add_action(&new_capture);
3406 on_success()->Emit(compiler, &new_trace);
3407 break;
3408 }
3409 case INCREMENT_REGISTER: {
3410 Trace::DeferredIncrementRegister new_increment(
3411 data_.u_increment_register.reg);
3412 Trace new_trace = *trace;
3413 new_trace.add_action(&new_increment);
3414 on_success()->Emit(compiler, &new_trace);
3415 break;
3416 }
3417 case SET_REGISTER: {
3418 Trace::DeferredSetRegister new_set(data_.u_store_register.reg,
3419 data_.u_store_register.value);
3420 Trace new_trace = *trace;
3421 new_trace.add_action(&new_set);
3422 on_success()->Emit(compiler, &new_trace);
3423 break;
3424 }
3425 case CLEAR_CAPTURES: {
3426 Trace::DeferredClearCaptures new_capture(Interval(
3427 data_.u_clear_captures.range_from, data_.u_clear_captures.range_to));
3428 Trace new_trace = *trace;
3429 new_trace.add_action(&new_capture);
3430 on_success()->Emit(compiler, &new_trace);
3431 break;
3432 }
3433 case BEGIN_SUBMATCH:
3434 if (!trace->is_trivial()) {
3435 trace->Flush(compiler, this);
3436 } else {
3437 assembler->WriteCurrentPositionToRegister(
3438 data_.u_submatch.current_position_register, 0);
3439 assembler->WriteStackPointerToRegister(
3440 data_.u_submatch.stack_pointer_register);
3441 on_success()->Emit(compiler, trace);
3442 }
3443 break;
3444 case EMPTY_MATCH_CHECK: {
3445 intptr_t start_pos_reg = data_.u_empty_match_check.start_register;
3446 intptr_t stored_pos = 0;
3447 intptr_t rep_reg = data_.u_empty_match_check.repetition_register;
3448 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3449 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3450 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3451 // If we know we haven't advanced and there is no minimum we
3452 // can just backtrack immediately.
3453 assembler->GoTo(trace->backtrack());
3454 } else if (know_dist && stored_pos < trace->cp_offset()) {
3455 // If we know we've advanced we can generate the continuation
3456 // immediately.
3457 on_success()->Emit(compiler, trace);
3458 } else if (!trace->is_trivial()) {
3459 trace->Flush(compiler, this);
3460 } else {
3461 BlockLabel skip_empty_check;
3462 // If we have a minimum number of repetitions we check the current
3463 // number first and skip the empty check if it's not enough.
3464 if (has_minimum) {
3465 intptr_t limit = data_.u_empty_match_check.repetition_limit;
3466 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3467 }
3468 // If the match is empty we bail out, otherwise we fall through
3469 // to the on-success continuation.
3470 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3471 trace->backtrack());
3472 assembler->BindBlock(&skip_empty_check);
3473 on_success()->Emit(compiler, trace);
3474 }
3475 break;
3476 }
3477 case POSITIVE_SUBMATCH_SUCCESS: {
3478 if (!trace->is_trivial()) {
3479 trace->Flush(compiler, this);
3480 return;
3481 }
3482 assembler->ReadCurrentPositionFromRegister(
3483 data_.u_submatch.current_position_register);
3484 assembler->ReadStackPointerFromRegister(
3485 data_.u_submatch.stack_pointer_register);
3486 intptr_t clear_register_count = data_.u_submatch.clear_register_count;
3487 if (clear_register_count == 0) {
3488 on_success()->Emit(compiler, trace);
3489 return;
3490 }
3491 intptr_t clear_registers_from = data_.u_submatch.clear_register_from;
3492 BlockLabel clear_registers_backtrack;
3493 Trace new_trace = *trace;
3494 new_trace.set_backtrack(&clear_registers_backtrack);
3495 on_success()->Emit(compiler, &new_trace);
3496
3497 assembler->BindBlock(&clear_registers_backtrack);
3498 intptr_t clear_registers_to =
3499 clear_registers_from + clear_register_count - 1;
3500 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3501
3502 ASSERT(trace->backtrack() == NULL);
3503 assembler->Backtrack();
3504 return;
3505 }
3506 default:
3507 UNREACHABLE();
3508 }
3509}
3510
3511void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3512 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3513 if (!trace->is_trivial()) {
3514 trace->Flush(compiler, this);
3515 return;
3516 }
3517
3518 LimitResult limit_result = LimitVersions(compiler, trace);
3519 if (limit_result == DONE) return;
3520 ASSERT(limit_result == CONTINUE);
3521
3522 RecursionCheck rc(compiler);
3523
3524 ASSERT(start_reg_ + 1 == end_reg_);
3525 if (flags_.IgnoreCase()) {
3526 assembler->CheckNotBackReferenceIgnoreCase(
3527 start_reg_, read_backward(), flags_.IsUnicode(), trace->backtrack());
3528 } else {
3529 assembler->CheckNotBackReference(start_reg_, read_backward(),
3530 trace->backtrack());
3531 }
3532 // We are going to advance backward, so we may end up at the start.
3533 if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
3534
3535 // Check that the back reference does not end inside a surrogate pair.
3536 if (flags_.IsUnicode() && !compiler->one_byte()) {
3537 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
3538 }
3539
3540 on_success()->Emit(compiler, trace);
3541}
3542
3543// -------------------------------------------------------------------
3544// Dot/dotty output
3545
3546#ifdef DEBUG
3547
3548class DotPrinter : public NodeVisitor {
3549 public:
3550 explicit DotPrinter(bool ignore_case) {}
3551 void PrintNode(const char* label, RegExpNode* node);
3552 void Visit(RegExpNode* node);
3553 void PrintAttributes(RegExpNode* from);
3554 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
3555#define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that);
3556 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3557#undef DECLARE_VISIT
3558};
3559
3560void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3561 OS::PrintErr("digraph G {\n graph [label=\"");
3562 for (intptr_t i = 0; label[i] != '\0'; i++) {
3563 switch (label[i]) {
3564 case '\\':
3565 OS::PrintErr("\\\\");
3566 break;
3567 case '"':
3568 OS::PrintErr("\"");
3569 break;
3570 default:
3571 OS::PrintErr("%c", label[i]);
3572 break;
3573 }
3574 }
3575 OS::PrintErr("\"];\n");
3576 Visit(node);
3577 OS::PrintErr("}\n");
3578}
3579
3580void DotPrinter::Visit(RegExpNode* node) {
3581 if (node->info()->visited) return;
3582 node->info()->visited = true;
3583 node->Accept(this);
3584}
3585
3586void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
3587 OS::PrintErr(" n%p -> n%p [style=dotted];\n", from, on_failure);
3588 Visit(on_failure);
3589}
3590
3591class AttributePrinter : public ValueObject {
3592 public:
3593 AttributePrinter() : first_(true) {}
3594 void PrintSeparator() {
3595 if (first_) {
3596 first_ = false;
3597 } else {
3598 OS::PrintErr("|");
3599 }
3600 }
3601 void PrintBit(const char* name, bool value) {
3602 if (!value) return;
3603 PrintSeparator();
3604 OS::PrintErr("{%s}", name);
3605 }
3606 void PrintPositive(const char* name, intptr_t value) {
3607 if (value < 0) return;
3608 PrintSeparator();
3609 OS::PrintErr("{%s|%" Pd "}", name, value);
3610 }
3611
3612 private:
3613 bool first_;
3614};
3615
3616void DotPrinter::PrintAttributes(RegExpNode* that) {
3617 OS::PrintErr(
3618 " a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3619 "margin=0.1, fontsize=10, label=\"{",
3620 that);
3621 AttributePrinter printer;
3622 NodeInfo* info = that->info();
3623 printer.PrintBit("NI", info->follows_newline_interest);
3624 printer.PrintBit("WI", info->follows_word_interest);
3625 printer.PrintBit("SI", info->follows_start_interest);
3626 BlockLabel* label = that->label();
3627 if (label->is_bound()) printer.PrintPositive("@", label->pos());
3628 OS::PrintErr(
3629 "}\"];\n"
3630 " a%p -> n%p [style=dashed, color=grey, arrowhead=none];\n",
3631 that, that);
3632}
3633
3634void DotPrinter::VisitChoice(ChoiceNode* that) {
3635 OS::PrintErr(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3636 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
3637 GuardedAlternative alt = that->alternatives()->At(i);
3638 OS::PrintErr(" n%p -> n%p", that, alt.node());
3639 }
3640 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
3641 GuardedAlternative alt = that->alternatives()->At(i);
3642 alt.node()->Accept(this);
3643 }
3644}
3645
3646void DotPrinter::VisitText(TextNode* that) {
3647 OS::PrintErr(" n%p [label=\"", that);
3648 for (intptr_t i = 0; i < that->elements()->length(); i++) {
3649 if (i > 0) OS::PrintErr(" ");
3650 TextElement elm = that->elements()->At(i);
3651 switch (elm.text_type()) {
3652 case TextElement::ATOM: {
3653 ZoneGrowableArray<uint16_t>* data = elm.atom()->data();
3654 for (intptr_t i = 0; i < data->length(); i++) {
3655 OS::PrintErr("%c", static_cast<char>(data->At(i)));
3656 }
3657 break;
3658 }
3659 case TextElement::CHAR_CLASS: {
3660 RegExpCharacterClass* node = elm.char_class();
3661 OS::PrintErr("[");
3662 if (node->is_negated()) OS::PrintErr("^");
3663 for (intptr_t j = 0; j < node->ranges()->length(); j++) {
3664 CharacterRange range = node->ranges()->At(j);
3665 PrintUtf16(range.from());
3666 OS::PrintErr("-");
3667 PrintUtf16(range.to());
3668 }
3669 OS::PrintErr("]");
3670 break;
3671 }
3672 default:
3673 UNREACHABLE();
3674 }
3675 }
3676 OS::PrintErr("\", shape=box, peripheries=2];\n");
3677 PrintAttributes(that);
3678 OS::PrintErr(" n%p -> n%p;\n", that, that->on_success());
3679 Visit(that->on_success());
3680}
3681
3682void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3683 OS::PrintErr(" n%p [label=\"$%" Pd "..$%" Pd "\", shape=doubleoctagon];\n",
3684 that, that->start_register(), that->end_register());
3685 PrintAttributes(that);
3686 OS::PrintErr(" n%p -> n%p;\n", that, that->on_success());
3687 Visit(that->on_success());
3688}
3689
3690void DotPrinter::VisitEnd(EndNode* that) {
3691 OS::PrintErr(" n%p [style=bold, shape=point];\n", that);
3692 PrintAttributes(that);
3693}
3694
3695void DotPrinter::VisitAssertion(AssertionNode* that) {
3696 OS::PrintErr(" n%p [", that);
3697 switch (that->assertion_type()) {
3698 case AssertionNode::AT_END:
3699 OS::PrintErr("label=\"$\", shape=septagon");
3700 break;
3701 case AssertionNode::AT_START:
3702 OS::PrintErr("label=\"^\", shape=septagon");
3703 break;
3704 case AssertionNode::AT_BOUNDARY:
3705 OS::PrintErr("label=\"\\b\", shape=septagon");
3706 break;
3707 case AssertionNode::AT_NON_BOUNDARY:
3708 OS::PrintErr("label=\"\\B\", shape=septagon");
3709 break;
3710 case AssertionNode::AFTER_NEWLINE:
3711 OS::PrintErr("label=\"(?<=\\n)\", shape=septagon");
3712 break;
3713 }
3714 OS::PrintErr("];\n");
3715 PrintAttributes(that);
3716 RegExpNode* successor = that->on_success();
3717 OS::PrintErr(" n%p -> n%p;\n", that, successor);
3718 Visit(successor);
3719}
3720
3721void DotPrinter::VisitAction(ActionNode* that) {
3722 OS::PrintErr(" n%p [", that);
3723 switch (that->action_type_) {
3724 case ActionNode::SET_REGISTER:
3725 OS::PrintErr("label=\"$%" Pd ":=%" Pd "\", shape=octagon",
3726 that->data_.u_store_register.reg,
3727 that->data_.u_store_register.value);
3728 break;
3729 case ActionNode::INCREMENT_REGISTER:
3730 OS::PrintErr("label=\"$%" Pd "++\", shape=octagon",
3731 that->data_.u_increment_register.reg);
3732 break;
3733 case ActionNode::STORE_POSITION:
3734 OS::PrintErr("label=\"$%" Pd ":=$pos\", shape=octagon",
3735 that->data_.u_position_register.reg);
3736 break;
3737 case ActionNode::BEGIN_SUBMATCH:
3738 OS::PrintErr("label=\"$%" Pd ":=$pos,begin\", shape=septagon",
3739 that->data_.u_submatch.current_position_register);
3740 break;
3741 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
3742 OS::PrintErr("label=\"escape\", shape=septagon");
3743 break;
3744 case ActionNode::EMPTY_MATCH_CHECK:
3745 OS::PrintErr("label=\"$%" Pd "=$pos?,$%" Pd "<%" Pd "?\", shape=septagon",
3746 that->data_.u_empty_match_check.start_register,
3747 that->data_.u_empty_match_check.repetition_register,
3748 that->data_.u_empty_match_check.repetition_limit);
3749 break;
3750 case ActionNode::CLEAR_CAPTURES: {
3751 OS::PrintErr("label=\"clear $%" Pd " to $%" Pd "\", shape=septagon",
3752 that->data_.u_clear_captures.range_from,
3753 that->data_.u_clear_captures.range_to);
3754 break;
3755 }
3756 }
3757 OS::PrintErr("];\n");
3758 PrintAttributes(that);
3759 RegExpNode* successor = that->on_success();
3760 OS::PrintErr(" n%p -> n%p;\n", that, successor);
3761 Visit(successor);
3762}
3763
3764void RegExpEngine::DotPrint(const char* label,
3765 RegExpNode* node,
3766 bool ignore_case) {
3767 DotPrinter printer(ignore_case);
3768 printer.PrintNode(label, node);
3769}
3770
3771#endif // DEBUG
3772
3773// -------------------------------------------------------------------
3774// Tree to graph conversion
3775
3776// The zone in which we allocate graph nodes.
3777#define OZ (on_success->zone())
3778
3779RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
3780 RegExpNode* on_success) {
3781 ZoneGrowableArray<TextElement>* elms =
3782 new (OZ) ZoneGrowableArray<TextElement>(1);
3783 elms->Add(TextElement::Atom(this));
3784 return new (OZ) TextNode(elms, compiler->read_backward(), on_success);
3785}
3786
3787RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
3788 RegExpNode* on_success) {
3789 ZoneGrowableArray<TextElement>* elms =
3790 new (OZ) ZoneGrowableArray<TextElement>(1);
3791 for (intptr_t i = 0; i < elements()->length(); i++) {
3792 elms->Add(elements()->At(i));
3793 }
3794 return new (OZ) TextNode(elms, compiler->read_backward(), on_success);
3795}
3796
3797static bool CompareInverseRanges(ZoneGrowableArray<CharacterRange>* ranges,
3798 const int32_t* special_class,
3799 intptr_t length) {
3800 length--; // Remove final kRangeEndMarker.
3801 ASSERT(special_class[length] == kRangeEndMarker);
3802 ASSERT(ranges->length() != 0);
3803 ASSERT(length != 0);
3804 ASSERT(special_class[0] != 0);
3805 if (ranges->length() != (length >> 1) + 1) {
3806 return false;
3807 }
3808 CharacterRange range = ranges->At(0);
3809 if (range.from() != 0) {
3810 return false;
3811 }
3812 for (intptr_t i = 0; i < length; i += 2) {
3813 if (special_class[i] != (range.to() + 1)) {
3814 return false;
3815 }
3816 range = ranges->At((i >> 1) + 1);
3817 if (special_class[i + 1] != range.from()) {
3818 return false;
3819 }
3820 }
3821 if (range.to() != Utf::kMaxCodePoint) {
3822 return false;
3823 }
3824 return true;
3825}
3826
3827static bool CompareRanges(ZoneGrowableArray<CharacterRange>* ranges,
3828 const int32_t* special_class,
3829 intptr_t length) {
3830 length--; // Remove final kRangeEndMarker.
3831 ASSERT(special_class[length] == kRangeEndMarker);
3832 if (ranges->length() * 2 != length) {
3833 return false;
3834 }
3835 for (intptr_t i = 0; i < length; i += 2) {
3836 CharacterRange range = ranges->At(i >> 1);
3837 if (range.from() != special_class[i] ||
3838 range.to() != special_class[i + 1] - 1) {
3839 return false;
3840 }
3841 }
3842 return true;
3843}
3844
3845bool RegExpCharacterClass::is_standard() {
3846 // TODO(lrn): Remove need for this function, by not throwing away information
3847 // along the way.
3848 if (is_negated()) {
3849 return false;
3850 }
3851 if (set_.is_standard()) {
3852 return true;
3853 }
3854 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3855 set_.set_standard_set_type('s');
3856 return true;
3857 }
3858 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3859 set_.set_standard_set_type('S');
3860 return true;
3861 }
3862 if (CompareInverseRanges(set_.ranges(), kLineTerminatorRanges,
3863 kLineTerminatorRangeCount)) {
3864 set_.set_standard_set_type('.');
3865 return true;
3866 }
3867 if (CompareRanges(set_.ranges(), kLineTerminatorRanges,
3868 kLineTerminatorRangeCount)) {
3869 set_.set_standard_set_type('n');
3870 return true;
3871 }
3872 if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
3873 set_.set_standard_set_type('w');
3874 return true;
3875 }
3876 if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
3877 set_.set_standard_set_type('W');
3878 return true;
3879 }
3880 return false;
3881}
3882
3883UnicodeRangeSplitter::UnicodeRangeSplitter(
3884 Zone* zone,
3885 ZoneGrowableArray<CharacterRange>* base)
3886 : zone_(zone),
3887 table_(zone),
3888 bmp_(nullptr),
3889 lead_surrogates_(nullptr),
3890 trail_surrogates_(nullptr),
3891 non_bmp_(nullptr) {
3892 // The unicode range splitter categorizes given character ranges into:
3893 // - Code points from the BMP representable by one code unit.
3894 // - Code points outside the BMP that need to be split into surrogate pairs.
3895 // - Lone lead surrogates.
3896 // - Lone trail surrogates.
3897 // Lone surrogates are valid code points, even though no actual characters.
3898 // They require special matching to make sure we do not split surrogate pairs.
3899 // We use the dispatch table to accomplish this. The base range is split up
3900 // by the table by the overlay ranges, and the Call callback is used to
3901 // filter and collect ranges for each category.
3902 for (intptr_t i = 0; i < base->length(); i++) {
3903 table_.AddRange(base->At(i), kBase, zone_);
3904 }
3905 // Add overlay ranges.
3906 table_.AddRange(CharacterRange::Range(0, Utf16::kLeadSurrogateStart - 1),
3907 kBmpCodePoints, zone_);
3908 table_.AddRange(CharacterRange::Range(Utf16::kLeadSurrogateStart,
3909 Utf16::kLeadSurrogateEnd),
3910 kLeadSurrogates, zone_);
3911 table_.AddRange(CharacterRange::Range(Utf16::kTrailSurrogateStart,
3912 Utf16::kTrailSurrogateEnd),
3913 kTrailSurrogates, zone_);
3914 table_.AddRange(
3915 CharacterRange::Range(Utf16::kTrailSurrogateEnd + 1, Utf16::kMaxCodeUnit),
3916 kBmpCodePoints, zone_);
3917 table_.AddRange(
3918 CharacterRange::Range(Utf16::kMaxCodeUnit + 1, Utf::kMaxCodePoint),
3919 kNonBmpCodePoints, zone_);
3920 table_.ForEach(this);
3921}
3922
3923void UnicodeRangeSplitter::Call(uint32_t from, ChoiceTable::Entry entry) {
3924 OutSet* outset = entry.out_set();
3925 if (!outset->Get(kBase)) return;
3926 ZoneGrowableArray<CharacterRange>** target = nullptr;
3927 if (outset->Get(kBmpCodePoints)) {
3928 target = &bmp_;
3929 } else if (outset->Get(kLeadSurrogates)) {
3930 target = &lead_surrogates_;
3931 } else if (outset->Get(kTrailSurrogates)) {
3932 target = &trail_surrogates_;
3933 } else {
3934 ASSERT(outset->Get(kNonBmpCodePoints));
3935 target = &non_bmp_;
3936 }
3937 if (*target == nullptr) {
3938 *target = new (zone_) ZoneGrowableArray<CharacterRange>(2);
3939 }
3940 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()));
3941}
3942
3943void AddBmpCharacters(RegExpCompiler* compiler,
3944 ChoiceNode* result,
3945 RegExpNode* on_success,
3946 UnicodeRangeSplitter* splitter) {
3947 ZoneGrowableArray<CharacterRange>* bmp = splitter->bmp();
3948 if (bmp == nullptr) return;
3949 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
3950 bmp, compiler->read_backward(), on_success, RegExpFlags())));
3951}
3952
3953void AddNonBmpSurrogatePairs(RegExpCompiler* compiler,
3954 ChoiceNode* result,
3955 RegExpNode* on_success,
3956 UnicodeRangeSplitter* splitter) {
3957 ZoneGrowableArray<CharacterRange>* non_bmp = splitter->non_bmp();
3958 if (non_bmp == nullptr) return;
3959 ASSERT(!compiler->one_byte());
3960 CharacterRange::Canonicalize(non_bmp);
3961 for (int i = 0; i < non_bmp->length(); i++) {
3962 // Match surrogate pair.
3963 // E.g. [\u10005-\u11005] becomes
3964 // \ud800[\udc05-\udfff]|
3965 // [\ud801-\ud803][\udc00-\udfff]|
3966 // \ud804[\udc00-\udc05]
3967 uint32_t from = non_bmp->At(i).from();
3968 uint32_t to = non_bmp->At(i).to();
3969 uint16_t from_points[2];
3970 Utf16::Encode(from, from_points);
3971 uint16_t to_points[2];
3972 Utf16::Encode(to, to_points);
3973 if (from_points[0] == to_points[0]) {
3974 // The lead surrogate is the same.
3975 result->AddAlternative(
3976 GuardedAlternative(TextNode::CreateForSurrogatePair(
3977 CharacterRange::Singleton(from_points[0]),
3978 CharacterRange::Range(from_points[1], to_points[1]),
3979 compiler->read_backward(), on_success, RegExpFlags())));
3980 } else {
3981 if (from_points[1] != Utf16::kTrailSurrogateStart) {
3982 // Add [from_l][from_t-\udfff]
3983 result->AddAlternative(
3984 GuardedAlternative(TextNode::CreateForSurrogatePair(
3985 CharacterRange::Singleton(from_points[0]),
3986 CharacterRange::Range(from_points[1],
3987 Utf16::kTrailSurrogateEnd),
3988 compiler->read_backward(), on_success, RegExpFlags())));
3989 from_points[0]++;
3990 }
3991 if (to_points[1] != Utf16::kTrailSurrogateEnd) {
3992 // Add [to_l][\udc00-to_t]
3993 result->AddAlternative(
3994 GuardedAlternative(TextNode::CreateForSurrogatePair(
3995 CharacterRange::Singleton(to_points[0]),
3996 CharacterRange::Range(Utf16::kTrailSurrogateStart,
3997 to_points[1]),
3998 compiler->read_backward(), on_success, RegExpFlags())));
3999 to_points[0]--;
4000 }
4001 if (from_points[0] <= to_points[0]) {
4002 // Add [from_l-to_l][\udc00-\udfff]
4003 result->AddAlternative(
4004 GuardedAlternative(TextNode::CreateForSurrogatePair(
4005 CharacterRange::Range(from_points[0], to_points[0]),
4006 CharacterRange::Range(Utf16::kTrailSurrogateStart,
4007 Utf16::kTrailSurrogateEnd),
4008 compiler->read_backward(), on_success, RegExpFlags())));
4009 }
4010 }
4011 }
4012}
4013
4014RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
4015 RegExpCompiler* compiler,
4016 ZoneGrowableArray<CharacterRange>* lookbehind,
4017 ZoneGrowableArray<CharacterRange>* match,
4018 RegExpNode* on_success,
4019 bool read_backward,
4020 RegExpFlags flags) {
4021 RegExpNode* match_node = TextNode::CreateForCharacterRanges(
4022 match, read_backward, on_success, flags);
4023 int stack_register = compiler->UnicodeLookaroundStackRegister();
4024 int position_register = compiler->UnicodeLookaroundPositionRegister();
4025 RegExpLookaround::Builder lookaround(false, match_node, stack_register,
4026 position_register);
4027 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4028 lookbehind, !read_backward, lookaround.on_match_success(), flags);
4029 return lookaround.ForMatch(negative_match);
4030}
4031
4032RegExpNode* MatchAndNegativeLookaroundInReadDirection(
4033 RegExpCompiler* compiler,
4034 ZoneGrowableArray<CharacterRange>* match,
4035 ZoneGrowableArray<CharacterRange>* lookahead,
4036 RegExpNode* on_success,
4037 bool read_backward,
4038 RegExpFlags flags) {
4039 int stack_register = compiler->UnicodeLookaroundStackRegister();
4040 int position_register = compiler->UnicodeLookaroundPositionRegister();
4041 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
4042 position_register);
4043 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4044 lookahead, read_backward, lookaround.on_match_success(), flags);
4045 return TextNode::CreateForCharacterRanges(
4046 match, read_backward, lookaround.ForMatch(negative_match), flags);
4047}
4048
4049void AddLoneLeadSurrogates(RegExpCompiler* compiler,
4050 ChoiceNode* result,
4051 RegExpNode* on_success,
4052 UnicodeRangeSplitter* splitter) {
4053 auto lead_surrogates = splitter->lead_surrogates();
4054 if (lead_surrogates == nullptr) return;
4055 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
4056 auto trail_surrogates = CharacterRange::List(
4057 on_success->zone(), CharacterRange::Range(Utf16::kTrailSurrogateStart,
4058 Utf16::kTrailSurrogateEnd));
4059
4060 RegExpNode* match;
4061 if (compiler->read_backward()) {
4062 // Reading backward. Assert that reading forward, there is no trail
4063 // surrogate, and then backward match the lead surrogate.
4064 match = NegativeLookaroundAgainstReadDirectionAndMatch(
4065 compiler, trail_surrogates, lead_surrogates, on_success, true,
4066 RegExpFlags());
4067 } else {
4068 // Reading forward. Forward match the lead surrogate and assert that
4069 // no trail surrogate follows.
4070 match = MatchAndNegativeLookaroundInReadDirection(
4071 compiler, lead_surrogates, trail_surrogates, on_success, false,
4072 RegExpFlags());
4073 }
4074 result->AddAlternative(GuardedAlternative(match));
4075}
4076
4077void AddLoneTrailSurrogates(RegExpCompiler* compiler,
4078 ChoiceNode* result,
4079 RegExpNode* on_success,
4080 UnicodeRangeSplitter* splitter) {
4081 auto trail_surrogates = splitter->trail_surrogates();
4082 if (trail_surrogates == nullptr) return;
4083 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
4084 auto lead_surrogates = CharacterRange::List(
4085 on_success->zone(), CharacterRange::Range(Utf16::kLeadSurrogateStart,
4086 Utf16::kLeadSurrogateEnd));
4087
4088 RegExpNode* match;
4089 if (compiler->read_backward()) {
4090 // Reading backward. Backward match the trail surrogate and assert that no
4091 // lead surrogate precedes it.
4092 match = MatchAndNegativeLookaroundInReadDirection(
4093 compiler, trail_surrogates, lead_surrogates, on_success, true,
4094 RegExpFlags());
4095 } else {
4096 // Reading forward. Assert that reading backward, there is no lead
4097 // surrogate, and then forward match the trail surrogate.
4098 match = NegativeLookaroundAgainstReadDirectionAndMatch(
4099 compiler, lead_surrogates, trail_surrogates, on_success, false,
4100 RegExpFlags());
4101 }
4102 result->AddAlternative(GuardedAlternative(match));
4103}
4104
4105RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
4106 RegExpNode* on_success) {
4107 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
4108 ASSERT(!compiler->read_backward());
4109 // Advance any character. If the character happens to be a lead surrogate and
4110 // we advanced into the middle of a surrogate pair, it will work out, as
4111 // nothing will match from there. We will have to advance again, consuming
4112 // the associated trail surrogate.
4113 auto range = CharacterRange::List(
4114 on_success->zone(), CharacterRange::Range(0, Utf16::kMaxCodeUnit));
4115 return TextNode::CreateForCharacterRanges(range, false, on_success,
4116 RegExpFlags());
4117}
4118
4119void AddUnicodeCaseEquivalents(ZoneGrowableArray<CharacterRange>* ranges) {
4120 ASSERT(CharacterRange::IsCanonical(ranges));
4121
4122 // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
4123 // See also https://crbug.com/v8/6727.
4124 // TODO(sstrickl): This only covers the special case of the {0,0x10FFFF}
4125 // range, which we use frequently internally. But large ranges can also easily
4126 // be created by the user. We might want to have a more general caching
4127 // mechanism for such ranges.
4128 if (ranges->length() == 1 && ranges->At(0).IsEverything(Utf::kMaxCodePoint)) {
4129 return;
4130 }
4131
4132 icu::UnicodeSet set;
4133 for (int i = 0; i < ranges->length(); i++) {
4134 set.add(ranges->At(i).from(), ranges->At(i).to());
4135 }
4136 ranges->Clear();
4137 set.closeOver(USET_CASE_INSENSITIVE);
4138 // Full case mapping map single characters to multiple characters.
4139 // Those are represented as strings in the set. Remove them so that
4140 // we end up with only simple and common case mappings.
4141 set.removeAllStrings();
4142 for (int i = 0; i < set.getRangeCount(); i++) {
4143 ranges->Add(
4144 CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)));
4145 }
4146 // No errors and everything we collected have been ranges.
4147 CharacterRange::Canonicalize(ranges);
4148}
4149
4150RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4151 RegExpNode* on_success) {
4152 set_.Canonicalize();
4153 ZoneGrowableArray<CharacterRange>* ranges = this->ranges();
4154 if (flags_.NeedsUnicodeCaseEquivalents()) {
4155 AddUnicodeCaseEquivalents(ranges);
4156 }
4157 if (flags_.IsUnicode() && !compiler->one_byte() &&
4158 !contains_split_surrogate()) {
4159 if (is_negated()) {
4160 ZoneGrowableArray<CharacterRange>* negated =
4161 new ZoneGrowableArray<CharacterRange>(2);
4162 CharacterRange::Negate(ranges, negated);
4163 ranges = negated;
4164 }
4165 if (ranges->length() == 0) {
4166 RegExpCharacterClass* fail =
4167 new RegExpCharacterClass(ranges, RegExpFlags());
4168 return new TextNode(fail, compiler->read_backward(), on_success);
4169 }
4170 if (standard_type() == '*') {
4171 return UnanchoredAdvance(compiler, on_success);
4172 } else {
4173 ChoiceNode* result = new (OZ) ChoiceNode(2, OZ);
4174 UnicodeRangeSplitter splitter(OZ, ranges);
4175 AddBmpCharacters(compiler, result, on_success, &splitter);
4176 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
4177 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
4178 AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
4179 return result;
4180 }
4181 } else {
4182 return new TextNode(this, compiler->read_backward(), on_success);
4183 }
4184 return new (OZ) TextNode(this, compiler->read_backward(), on_success);
4185}
4186
4187RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
4188 RegExpNode* on_success) {
4189 ZoneGrowableArray<RegExpTree*>* alternatives = this->alternatives();
4190 intptr_t length = alternatives->length();
4191 ChoiceNode* result = new (OZ) ChoiceNode(length, OZ);
4192 for (intptr_t i = 0; i < length; i++) {
4193 GuardedAlternative alternative(
4194 alternatives->At(i)->ToNode(compiler, on_success));
4195 result->AddAlternative(alternative);
4196 }
4197 return result;
4198}
4199
4200RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
4201 RegExpNode* on_success) {
4202 return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
4203}
4204
4205// Scoped object to keep track of how much we unroll quantifier loops in the
4206// regexp graph generator.
4207class RegExpExpansionLimiter : public ValueObject {
4208 public:
4209 static const intptr_t kMaxExpansionFactor = 6;
4210 RegExpExpansionLimiter(RegExpCompiler* compiler, intptr_t factor)
4211 : compiler_(compiler),
4212 saved_expansion_factor_(compiler->current_expansion_factor()),
4213 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
4214 ASSERT(factor > 0);
4215 if (ok_to_expand_) {
4216 if (factor > kMaxExpansionFactor) {
4217 // Avoid integer overflow of the current expansion factor.
4218 ok_to_expand_ = false;
4219 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
4220 } else {
4221 intptr_t new_factor = saved_expansion_factor_ * factor;
4222 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
4223 compiler->set_current_expansion_factor(new_factor);
4224 }
4225 }
4226 }
4227
4228 ~RegExpExpansionLimiter() {
4229 compiler_->set_current_expansion_factor(saved_expansion_factor_);
4230 }
4231
4232 bool ok_to_expand() { return ok_to_expand_; }
4233
4234 private:
4235 RegExpCompiler* compiler_;
4236 intptr_t saved_expansion_factor_;
4237 bool ok_to_expand_;
4238
4239 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
4240};
4241
4242RegExpNode* RegExpQuantifier::ToNode(intptr_t min,
4243 intptr_t max,
4244 bool is_greedy,
4245 RegExpTree* body,
4246 RegExpCompiler* compiler,
4247 RegExpNode* on_success,
4248 bool not_at_start) {
4249 // x{f, t} becomes this:
4250 //
4251 // (r++)<-.
4252 // | `
4253 // | (x)
4254 // v ^
4255 // (r=0)-->(?)---/ [if r < t]
4256 // |
4257 // [if r >= f] \----> ...
4258 //
4259
4260 // 15.10.2.5 RepeatMatcher algorithm.
4261 // The parser has already eliminated the case where max is 0. In the case
4262 // where max_match is zero the parser has removed the quantifier if min was
4263 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
4264
4265 // If we know that we cannot match zero length then things are a little
4266 // simpler since we don't need to make the special zero length match check
4267 // from step 2.1. If the min and max are small we can unroll a little in
4268 // this case.
4269 // Unroll (foo)+ and (foo){3,}
4270 static const intptr_t kMaxUnrolledMinMatches = 3;
4271 // Unroll (foo)? and (foo){x,3}
4272 static const intptr_t kMaxUnrolledMaxMatches = 3;
4273 if (max == 0) return on_success; // This can happen due to recursion.
4274 bool body_can_be_empty = (body->min_match() == 0);
4275 intptr_t body_start_reg = RegExpCompiler::kNoRegister;
4276 Interval capture_registers = body->CaptureRegisters();
4277 bool needs_capture_clearing = !capture_registers.is_empty();
4278 Zone* zone = compiler->zone();
4279
4280 if (body_can_be_empty) {
4281 body_start_reg = compiler->AllocateRegister();
4282 } else if (kRegexpOptimization && !needs_capture_clearing) {
4283 // Only unroll if there are no captures and the body can't be
4284 // empty.
4285 {
4286 RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
4287 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
4288 intptr_t new_max = (max == kInfinity) ? max : max - min;
4289 // Recurse once to get the loop or optional matches after the fixed
4290 // ones.
4291 RegExpNode* answer =
4292 ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
4293 // Unroll the forced matches from 0 to min. This can cause chains of
4294 // TextNodes (which the parser does not generate). These should be
4295 // combined if it turns out they hinder good code generation.
4296 for (intptr_t i = 0; i < min; i++) {
4297 answer = body->ToNode(compiler, answer);
4298 }
4299 return answer;
4300 }
4301 }
4302 if (max <= kMaxUnrolledMaxMatches && min == 0) {
4303 ASSERT(max > 0); // Due to the 'if' above.
4304 RegExpExpansionLimiter limiter(compiler, max);
4305 if (limiter.ok_to_expand()) {
4306 // Unroll the optional matches up to max.
4307 RegExpNode* answer = on_success;
4308 for (intptr_t i = 0; i < max; i++) {
4309 ChoiceNode* alternation = new (zone) ChoiceNode(2, zone);
4310 if (is_greedy) {
4311 alternation->AddAlternative(
4312 GuardedAlternative(body->ToNode(compiler, answer)));
4313 alternation->AddAlternative(GuardedAlternative(on_success));
4314 } else {
4315 alternation->AddAlternative(GuardedAlternative(on_success));
4316 alternation->AddAlternative(
4317 GuardedAlternative(body->ToNode(compiler, answer)));
4318 }
4319 answer = alternation;
4320 if (not_at_start && !compiler->read_backward()) {
4321 alternation->set_not_at_start();
4322 }
4323 }
4324 return answer;
4325 }
4326 }
4327 }
4328 bool has_min = min > 0;
4329 bool has_max = max < RegExpTree::kInfinity;
4330 bool needs_counter = has_min || has_max;
4331 intptr_t reg_ctr = needs_counter ? compiler->AllocateRegister()
4332 : RegExpCompiler::kNoRegister;
4333 LoopChoiceNode* center = new (zone)
4334 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
4335 if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
4336 RegExpNode* loop_return =
4337 needs_counter ? static_cast<RegExpNode*>(
4338 ActionNode::IncrementRegister(reg_ctr, center))
4339 : static_cast<RegExpNode*>(center);
4340 if (body_can_be_empty) {
4341 // If the body can be empty we need to check if it was and then
4342 // backtrack.
4343 loop_return =
4344 ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
4345 }
4346 RegExpNode* body_node = body->ToNode(compiler, loop_return);
4347 if (body_can_be_empty) {
4348 // If the body can be empty we need to store the start position
4349 // so we can bail out if it was empty.
4350 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
4351 }
4352 if (needs_capture_clearing) {
4353 // Before entering the body of this loop we need to clear captures.
4354 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
4355 }
4356 GuardedAlternative body_alt(body_node);
4357 if (has_max) {
4358 Guard* body_guard = new (zone) Guard(reg_ctr, Guard::LT, max);
4359 body_alt.AddGuard(body_guard, zone);
4360 }
4361 GuardedAlternative rest_alt(on_success);
4362 if (has_min) {
4363 Guard* rest_guard = new (zone) Guard(reg_ctr, Guard::GEQ, min);
4364 rest_alt.AddGuard(rest_guard, zone);
4365 }
4366 if (is_greedy) {
4367 center->AddLoopAlternative(body_alt);
4368 center->AddContinueAlternative(rest_alt);
4369 } else {
4370 center->AddContinueAlternative(rest_alt);
4371 center->AddLoopAlternative(body_alt);
4372 }
4373 if (needs_counter) {
4374 return ActionNode::SetRegister(reg_ctr, 0, center);
4375 } else {
4376 return center;
4377 }
4378}
4379
4380namespace {
4381// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
4382// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
4383RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
4384 RegExpNode* on_success,
4385 RegExpAssertion::AssertionType type,
4386 RegExpFlags flags) {
4387 ASSERT(flags.NeedsUnicodeCaseEquivalents());
4388 ZoneGrowableArray<CharacterRange>* word_range =
4389 new ZoneGrowableArray<CharacterRange>(2);
4390 CharacterRange::AddClassEscape('w', word_range, true);
4391 int stack_register = compiler->UnicodeLookaroundStackRegister();
4392 int position_register = compiler->UnicodeLookaroundPositionRegister();
4393 ChoiceNode* result = new (OZ) ChoiceNode(2, OZ);
4394 // Add two choices. The (non-)boundary could start with a word or
4395 // a non-word-character.
4396 for (int i = 0; i < 2; i++) {
4397 bool lookbehind_for_word = i == 0;
4398 bool lookahead_for_word =
4399 (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
4400 // Look to the left.
4401 RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
4402 stack_register, position_register);
4403 RegExpNode* backward = TextNode::CreateForCharacterRanges(
4404 word_range, true, lookbehind.on_match_success(), flags);
4405 // Look to the right.
4406 RegExpLookaround::Builder lookahead(lookahead_for_word,
4407 lookbehind.ForMatch(backward),
4408 stack_register, position_register);
4409 RegExpNode* forward = TextNode::CreateForCharacterRanges(
4410 word_range, false, lookahead.on_match_success(), flags);
4411 result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
4412 }
4413 return result;
4414}
4415} // anonymous namespace
4416
4417RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
4418 RegExpNode* on_success) {
4419 switch (assertion_type()) {
4420 case START_OF_LINE:
4421 return AssertionNode::AfterNewline(on_success);
4422 case START_OF_INPUT:
4423 return AssertionNode::AtStart(on_success);
4424 case BOUNDARY:
4425 return flags_.NeedsUnicodeCaseEquivalents()
4426 ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
4427 flags_)
4428 : AssertionNode::AtBoundary(on_success);
4429 case NON_BOUNDARY:
4430 return flags_.NeedsUnicodeCaseEquivalents()
4431 ? BoundaryAssertionAsLookaround(compiler, on_success,
4432 NON_BOUNDARY, flags_)
4433 : AssertionNode::AtNonBoundary(on_success);
4434 case END_OF_INPUT:
4435 return AssertionNode::AtEnd(on_success);
4436 case END_OF_LINE: {
4437 // Compile $ in multiline regexps as an alternation with a positive
4438 // lookahead in one side and an end-of-input on the other side.
4439 // We need two registers for the lookahead.
4440 intptr_t stack_pointer_register = compiler->AllocateRegister();
4441 intptr_t position_register = compiler->AllocateRegister();
4442 // The ChoiceNode to distinguish between a newline and end-of-input.
4443 ChoiceNode* result = new ChoiceNode(2, on_success->zone());
4444 // Create a newline atom.
4445 ZoneGrowableArray<CharacterRange>* newline_ranges =
4446 new ZoneGrowableArray<CharacterRange>(3);
4447 CharacterRange::AddClassEscape('n', newline_ranges);
4448 RegExpCharacterClass* newline_atom =
4449 new RegExpCharacterClass('n', RegExpFlags());
4450 TextNode* newline_matcher =
4451 new TextNode(newline_atom, /*read_backwards=*/false,
4452 ActionNode::PositiveSubmatchSuccess(
4453 stack_pointer_register, position_register,
4454 0, // No captures inside.
4455 -1, // Ignored if no captures.
4456 on_success));
4457 // Create an end-of-input matcher.
4458 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
4459 stack_pointer_register, position_register, newline_matcher);
4460 // Add the two alternatives to the ChoiceNode.
4461 GuardedAlternative eol_alternative(end_of_line);
4462 result->AddAlternative(eol_alternative);
4463 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
4464 result->AddAlternative(end_alternative);
4465 return result;
4466 }
4467 default:
4468 UNREACHABLE();
4469 }
4470 return on_success;
4471}
4472
4473RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
4474 RegExpNode* on_success) {
4475 return new (OZ) BackReferenceNode(RegExpCapture::StartRegister(index()),
4476 RegExpCapture::EndRegister(index()), flags_,
4477 compiler->read_backward(), on_success);
4478}
4479
4480RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
4481 RegExpNode* on_success) {
4482 return on_success;
4483}
4484
4485RegExpLookaround::Builder::Builder(bool is_positive,
4486 RegExpNode* on_success,
4487 intptr_t stack_pointer_register,
4488 intptr_t position_register,
4489 intptr_t capture_register_count,
4490 intptr_t capture_register_start)
4491 : is_positive_(is_positive),
4492 on_success_(on_success),
4493 stack_pointer_register_(stack_pointer_register),
4494 position_register_(position_register) {
4495 if (is_positive_) {
4496 on_match_success_ = ActionNode::PositiveSubmatchSuccess(
4497 stack_pointer_register, position_register, capture_register_count,
4498 capture_register_start, on_success);
4499 } else {
4500 on_match_success_ = new (OZ) NegativeSubmatchSuccess(
4501 stack_pointer_register, position_register, capture_register_count,
4502 capture_register_start, OZ);
4503 }
4504}
4505
4506RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
4507 if (is_positive_) {
4508 return ActionNode::BeginSubmatch(stack_pointer_register_,
4509 position_register_, match);
4510 } else {
4511 Zone* zone = on_success_->zone();
4512 // We use a ChoiceNode to represent the negative lookaround. The first
4513 // alternative is the negative match. On success, the end node backtracks.
4514 // On failure, the second alternative is tried and leads to success.
4515 // NegativeLookaroundChoiceNode is a special ChoiceNode that ignores the
4516 // first exit when calculating quick checks.
4517 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
4518 GuardedAlternative(match), GuardedAlternative(on_success_), zone);
4519 return ActionNode::BeginSubmatch(stack_pointer_register_,
4520 position_register_, choice_node);
4521 }
4522}
4523
4524RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
4525 RegExpNode* on_success) {
4526 intptr_t stack_pointer_register = compiler->AllocateRegister();
4527 intptr_t position_register = compiler->AllocateRegister();
4528
4529 const intptr_t registers_per_capture = 2;
4530 const intptr_t register_of_first_capture = 2;
4531 intptr_t register_count = capture_count_ * registers_per_capture;
4532 intptr_t register_start =
4533 register_of_first_capture + capture_from_ * registers_per_capture;
4534
4535 RegExpNode* result;
4536 bool was_reading_backward = compiler->read_backward();
4537 compiler->set_read_backward(type() == LOOKBEHIND);
4538 Builder builder(is_positive(), on_success, stack_pointer_register,
4539 position_register, register_count, register_start);
4540 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
4541 result = builder.ForMatch(match);
4542 compiler->set_read_backward(was_reading_backward);
4543 return result;
4544}
4545
4546RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
4547 RegExpNode* on_success) {
4548 return ToNode(body(), index(), compiler, on_success);
4549}
4550
4551RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
4552 intptr_t index,
4553 RegExpCompiler* compiler,
4554 RegExpNode* on_success) {
4555 ASSERT(body != nullptr);
4556 intptr_t start_reg = RegExpCapture::StartRegister(index);
4557 intptr_t end_reg = RegExpCapture::EndRegister(index);
4558 if (compiler->read_backward()) {
4559 intptr_t tmp = end_reg;
4560 end_reg = start_reg;
4561 start_reg = tmp;
4562 }
4563 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
4564 RegExpNode* body_node = body->ToNode(compiler, store_end);
4565 return ActionNode::StorePosition(start_reg, true, body_node);
4566}
4567
4568RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
4569 RegExpNode* on_success) {
4570 ZoneGrowableArray<RegExpTree*>* children = nodes();
4571 RegExpNode* current = on_success;
4572 if (compiler->read_backward()) {
4573 for (intptr_t i = 0; i < children->length(); i++) {
4574 current = children->At(i)->ToNode(compiler, current);
4575 }
4576 } else {
4577 for (intptr_t i = children->length() - 1; i >= 0; i--) {
4578 current = children->At(i)->ToNode(compiler, current);
4579 }
4580 }
4581 return current;
4582}
4583
4584static void AddClass(const int32_t* elmv,
4585 intptr_t elmc,
4586 ZoneGrowableArray<CharacterRange>* ranges) {
4587 elmc--;
4588 ASSERT(elmv[elmc] == kRangeEndMarker);
4589 for (intptr_t i = 0; i < elmc; i += 2) {
4590 ASSERT(elmv[i] < elmv[i + 1]);
4591 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1));
4592 }
4593}
4594
4595static void AddClassNegated(const int32_t* elmv,
4596 intptr_t elmc,
4597 ZoneGrowableArray<CharacterRange>* ranges) {
4598 elmc--;
4599 ASSERT(elmv[elmc] == kRangeEndMarker);
4600 ASSERT(elmv[0] != 0x0000);
4601 ASSERT(elmv[elmc - 1] != Utf::kMaxCodePoint);
4602 uint16_t last = 0x0000;
4603 for (intptr_t i = 0; i < elmc; i += 2) {
4604 ASSERT(last <= elmv[i] - 1);
4605 ASSERT(elmv[i] < elmv[i + 1]);
4606 ranges->Add(CharacterRange(last, elmv[i] - 1));
4607 last = elmv[i + 1];
4608 }
4609 ranges->Add(CharacterRange(last, Utf::kMaxCodePoint));
4610}
4611
4612void CharacterRange::AddClassEscape(uint16_t type,
4613 ZoneGrowableArray<CharacterRange>* ranges,
4614 bool add_unicode_case_equivalents) {
4615 if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
4616 // See #sec-runtime-semantics-wordcharacters-abstract-operation
4617 // In case of unicode and ignore_case, we need to create the closure over
4618 // case equivalent characters before negating.
4619 ZoneGrowableArray<CharacterRange>* new_ranges =
4620 new ZoneGrowableArray<CharacterRange>(2);
4621 AddClass(kWordRanges, kWordRangeCount, new_ranges);
4622 AddUnicodeCaseEquivalents(new_ranges);
4623 if (type == 'W') {
4624 ZoneGrowableArray<CharacterRange>* negated =
4625 new ZoneGrowableArray<CharacterRange>(2);
4626 CharacterRange::Negate(new_ranges, negated);
4627 new_ranges = negated;
4628 }
4629 ranges->AddArray(*new_ranges);
4630 return;
4631 }
4632 AddClassEscape(type, ranges);
4633}
4634
4635void CharacterRange::AddClassEscape(uint16_t type,
4636 ZoneGrowableArray<CharacterRange>* ranges) {
4637 switch (type) {
4638 case 's':
4639 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
4640 break;
4641 case 'S':
4642 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
4643 break;
4644 case 'w':
4645 AddClass(kWordRanges, kWordRangeCount, ranges);
4646 break;
4647 case 'W':
4648 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
4649 break;
4650 case 'd':
4651 AddClass(kDigitRanges, kDigitRangeCount, ranges);
4652 break;
4653 case 'D':
4654 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
4655 break;
4656 case '.':
4657 AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges);
4658 break;
4659 // This is not a character range as defined by the spec but a
4660 // convenient shorthand for a character class that matches any
4661 // character.
4662 case '*':
4663 ranges->Add(CharacterRange::Everything());
4664 break;
4665 // This is the set of characters matched by the $ and ^ symbols
4666 // in multiline mode.
4667 case 'n':
4668 AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges);
4669 break;
4670 default:
4671 UNREACHABLE();
4672 }
4673}
4674
4675void CharacterRange::AddCaseEquivalents(
4676 ZoneGrowableArray<CharacterRange>* ranges,
4677 bool is_one_byte,
4678 Zone* zone) {
4679 CharacterRange::Canonicalize(ranges);
4680 int range_count = ranges->length();
4681 for (intptr_t i = 0; i < range_count; i++) {
4682 CharacterRange range = ranges->At(i);
4683 int32_t bottom = range.from();
4684 if (bottom > Utf16::kMaxCodeUnit) continue;
4685 int32_t top = Utils::Minimum(range.to(), Utf16::kMaxCodeUnit);
4686 // Nothing to be done for surrogates
4687 if (bottom >= Utf16::kLeadSurrogateStart &&
4688 top <= Utf16::kTrailSurrogateEnd) {
4689 continue;
4690 }
4691 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
4692 if (bottom > Symbols::kMaxOneCharCodeSymbol) continue;
4693 if (top > Symbols::kMaxOneCharCodeSymbol) {
4694 top = Symbols::kMaxOneCharCodeSymbol;
4695 }
4696 }
4697
4698 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;
4699 unibrow::Mapping<unibrow::CanonicalizationRange> jsregexp_canonrange;
4700 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4701 if (top == bottom) {
4702 // If this is a singleton we just expand the one character.
4703 intptr_t length = jsregexp_uncanonicalize.get(bottom, '\0', chars);
4704 for (intptr_t i = 0; i < length; i++) {
4705 int32_t chr = chars[i];
4706 if (chr != bottom) {
4707 ranges->Add(CharacterRange::Singleton(chars[i]));
4708 }
4709 }
4710 } else {
4711 // If this is a range we expand the characters block by block,
4712 // expanding contiguous subranges (blocks) one at a time.
4713 // The approach is as follows. For a given start character we
4714 // look up the remainder of the block that contains it (represented
4715 // by the end point), for instance we find 'z' if the character
4716 // is 'c'. A block is characterized by the property
4717 // that all characters uncanonicalize in the same way, except that
4718 // each entry in the result is incremented by the distance from the first
4719 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A']
4720 // and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
4721 // Once we've found the end point we look up its uncanonicalization
4722 // and produce a range for each element. For instance for [c-f]
4723 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
4724 // add a range if it is not already contained in the input, so [c-f]
4725 // will be skipped but [C-F] will be added. If this range is not
4726 // completely contained in a block we do this for all the blocks
4727 // covered by the range (handling characters that is not in a block
4728 // as a "singleton block").
4729 int32_t range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4730 intptr_t pos = bottom;
4731 while (pos <= top) {
4732 intptr_t length = jsregexp_canonrange.get(pos, '\0', range);
4733 int32_t block_end;
4734 if (length == 0) {
4735 block_end = pos;
4736 } else {
4737 ASSERT(length == 1);
4738 block_end = range[0];
4739 }
4740 intptr_t end = (block_end > top) ? top : block_end;
4741 length = jsregexp_uncanonicalize.get(block_end, '\0', range);
4742 for (intptr_t i = 0; i < length; i++) {
4743 int32_t c = range[i];
4744 int32_t range_from = c - (block_end - pos);
4745 int32_t range_to = c - (block_end - end);
4746 if (!(bottom <= range_from && range_to <= top)) {
4747 ranges->Add(CharacterRange(range_from, range_to));
4748 }
4749 }
4750 pos = end + 1;
4751 }
4752 }
4753 }
4754}
4755
4756bool CharacterRange::IsCanonical(ZoneGrowableArray<CharacterRange>* ranges) {
4757 ASSERT(ranges != NULL);
4758 intptr_t n = ranges->length();
4759 if (n <= 1) return true;
4760 intptr_t max = ranges->At(0).to();
4761 for (intptr_t i = 1; i < n; i++) {
4762 CharacterRange next_range = ranges->At(i);
4763 if (next_range.from() <= max + 1) return false;
4764 max = next_range.to();
4765 }
4766 return true;
4767}
4768
4769ZoneGrowableArray<CharacterRange>* CharacterSet::ranges() {
4770 if (ranges_ == NULL) {
4771 ranges_ = new ZoneGrowableArray<CharacterRange>(2);
4772 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4773 }
4774 return ranges_;
4775}
4776
4777// Move a number of elements in a zone array to another position
4778// in the same array. Handles overlapping source and target areas.
4779static void MoveRanges(ZoneGrowableArray<CharacterRange>* list,
4780 intptr_t from,
4781 intptr_t to,
4782 intptr_t count) {
4783 // Ranges are potentially overlapping.
4784 if (from < to) {
4785 for (intptr_t i = count - 1; i >= 0; i--) {
4786 (*list)[to + i] = list->At(from + i);
4787 }
4788 } else {
4789 for (intptr_t i = 0; i < count; i++) {
4790 (*list)[to + i] = list->At(from + i);
4791 }
4792 }
4793}
4794
4795static intptr_t InsertRangeInCanonicalList(
4796 ZoneGrowableArray<CharacterRange>* list,
4797 intptr_t count,
4798 CharacterRange insert) {
4799 // Inserts a range into list[0..count[, which must be sorted
4800 // by from value and non-overlapping and non-adjacent, using at most
4801 // list[0..count] for the result. Returns the number of resulting
4802 // canonicalized ranges. Inserting a range may collapse existing ranges into
4803 // fewer ranges, so the return value can be anything in the range 1..count+1.
4804 int32_t from = insert.from();
4805 int32_t to = insert.to();
4806 intptr_t start_pos = 0;
4807 intptr_t end_pos = count;
4808 for (intptr_t i = count - 1; i >= 0; i--) {
4809 CharacterRange current = list->At(i);
4810 if (current.from() > to + 1) {
4811 end_pos = i;
4812 } else if (current.to() + 1 < from) {
4813 start_pos = i + 1;
4814 break;
4815 }
4816 }
4817
4818 // Inserted range overlaps, or is adjacent to, ranges at positions
4819 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
4820 // not affected by the insertion.
4821 // If start_pos == end_pos, the range must be inserted before start_pos.
4822 // if start_pos < end_pos, the entire range from start_pos to end_pos
4823 // must be merged with the insert range.
4824
4825 if (start_pos == end_pos) {
4826 // Insert between existing ranges at position start_pos.
4827 if (start_pos < count) {
4828 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
4829 }
4830 (*list)[start_pos] = insert;
4831 return count + 1;
4832 }
4833 if (start_pos + 1 == end_pos) {
4834 // Replace single existing range at position start_pos.
4835 CharacterRange to_replace = list->At(start_pos);
4836 intptr_t new_from = Utils::Minimum(to_replace.from(), from);
4837 intptr_t new_to = Utils::Maximum(to_replace.to(), to);
4838 (*list)[start_pos] = CharacterRange(new_from, new_to);
4839 return count;
4840 }
4841 // Replace a number of existing ranges from start_pos to end_pos - 1.
4842 // Move the remaining ranges down.
4843
4844 intptr_t new_from = Utils::Minimum(list->At(start_pos).from(), from);
4845 intptr_t new_to = Utils::Maximum(list->At(end_pos - 1).to(), to);
4846 if (end_pos < count) {
4847 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
4848 }
4849 (*list)[start_pos] = CharacterRange(new_from, new_to);
4850 return count - (end_pos - start_pos) + 1;
4851}
4852
4853void CharacterSet::Canonicalize() {
4854 // Special/default classes are always considered canonical. The result
4855 // of calling ranges() will be sorted.
4856 if (ranges_ == NULL) return;
4857 CharacterRange::Canonicalize(ranges_);
4858}
4859
4860void CharacterRange::Canonicalize(
4861 ZoneGrowableArray<CharacterRange>* character_ranges) {
4862 if (character_ranges->length() <= 1) return;
4863 // Check whether ranges are already canonical (increasing, non-overlapping,
4864 // non-adjacent).
4865 intptr_t n = character_ranges->length();
4866 intptr_t max = character_ranges->At(0).to();
4867 intptr_t i = 1;
4868 while (i < n) {
4869 CharacterRange current = character_ranges->At(i);
4870 if (current.from() <= max + 1) {
4871 break;
4872 }
4873 max = current.to();
4874 i++;
4875 }
4876 // Canonical until the i'th range. If that's all of them, we are done.
4877 if (i == n) return;
4878
4879 // The ranges at index i and forward are not canonicalized. Make them so by
4880 // doing the equivalent of insertion sort (inserting each into the previous
4881 // list, in order).
4882 // Notice that inserting a range can reduce the number of ranges in the
4883 // result due to combining of adjacent and overlapping ranges.
4884 intptr_t read = i; // Range to insert.
4885 intptr_t num_canonical = i; // Length of canonicalized part of list.
4886 do {
4887 num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
4888 character_ranges->At(read));
4889 read++;
4890 } while (read < n);
4891 character_ranges->TruncateTo(num_canonical);
4892
4893 ASSERT(CharacterRange::IsCanonical(character_ranges));
4894}
4895
4896void CharacterRange::Negate(ZoneGrowableArray<CharacterRange>* ranges,
4897 ZoneGrowableArray<CharacterRange>* negated_ranges) {
4898 ASSERT(CharacterRange::IsCanonical(ranges));
4899 ASSERT(negated_ranges->length() == 0);
4900 intptr_t range_count = ranges->length();
4901 uint32_t from = 0;
4902 intptr_t i = 0;
4903 if (range_count > 0 && ranges->At(0).from() == 0) {
4904 from = ranges->At(0).to();
4905 i = 1;
4906 }
4907 while (i < range_count) {
4908 CharacterRange range = ranges->At(i);
4909 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
4910 from = range.to();
4911 i++;
4912 }
4913 if (from < Utf::kMaxCodePoint) {
4914 negated_ranges->Add(CharacterRange(from + 1, Utf::kMaxCodePoint));
4915 }
4916}
4917
4918// -------------------------------------------------------------------
4919// Splay tree
4920
4921// Workaround for the fact that ZoneGrowableArray does not have contains().
4922static bool ArrayContains(ZoneGrowableArray<unsigned>* array, unsigned value) {
4923 for (intptr_t i = 0; i < array->length(); i++) {
4924 if (array->At(i) == value) {
4925 return true;
4926 }
4927 }
4928 return false;
4929}
4930
4931OutSet* OutSet::Extend(unsigned value, Zone* zone) {
4932 if (Get(value)) return this;
4933 if (successors() != nullptr) {
4934 for (int i = 0; i < successors()->length(); i++) {
4935 OutSet* successor = successors()->At(i);
4936 if (successor->Get(value)) return successor;
4937 }
4938 } else {
4939 successors_ = new (zone) ZoneGrowableArray<OutSet*>(2);
4940 }
4941 OutSet* result = new (zone) OutSet(first_, remaining_);
4942 result->Set(value, zone);
4943 successors()->Add(result);
4944 return result;
4945}
4946
4947void OutSet::Set(unsigned value, Zone* zone) {
4948 if (value < kFirstLimit) {
4949 first_ |= (1 << value);
4950 } else {
4951 if (remaining_ == NULL)
4952 remaining_ = new (zone) ZoneGrowableArray<unsigned>(1);
4953
4954 bool remaining_contains_value = ArrayContains(remaining_, value);
4955 if (remaining_->is_empty() || !remaining_contains_value) {
4956 remaining_->Add(value);
4957 }
4958 }
4959}
4960
4961bool OutSet::Get(unsigned value) const {
4962 if (value < kFirstLimit) {
4963 return (first_ & (1 << value)) != 0;
4964 } else if (remaining_ == NULL) {
4965 return false;
4966 } else {
4967 return ArrayContains(remaining_, value);
4968 }
4969}
4970
4971const int32_t ChoiceTable::Config::kNoKey = Utf::kInvalidChar;
4972
4973void ChoiceTable::AddRange(CharacterRange full_range,
4974 int32_t value,
4975 Zone* zone) {
4976 CharacterRange current = full_range;
4977 if (tree()->is_empty()) {
4978 // If this is the first range we just insert into the table.
4979 ZoneSplayTree<Config>::Locator loc;
4980 bool inserted = tree()->Insert(current.from(), &loc);
4981 ASSERT(inserted);
4982 USE(inserted);
4983 loc.set_value(
4984 Entry(current.from(), current.to(), empty()->Extend(value, zone)));
4985 return;
4986 }
4987 // First see if there is a range to the left of this one that
4988 // overlaps.
4989 ZoneSplayTree<Config>::Locator loc;
4990 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4991 Entry* entry = &loc.value();
4992 // If we've found a range that overlaps with this one, and it
4993 // starts strictly to the left of this one, we have to fix it
4994 // because the following code only handles ranges that start on
4995 // or after the start point of the range we're adding.
4996 if (entry->from() < current.from() && entry->to() >= current.from()) {
4997 // Snap the overlapping range in half around the start point of
4998 // the range we're adding.
4999 CharacterRange left =
5000 CharacterRange::Range(entry->from(), current.from() - 1);
5001 CharacterRange right = CharacterRange::Range(current.from(), entry->to());
5002 // The left part of the overlapping range doesn't overlap.
5003 // Truncate the whole entry to be just the left part.
5004 entry->set_to(left.to());
5005 // The right part is the one that overlaps. We add this part
5006 // to the map and let the next step deal with merging it with
5007 // the range we're adding.
5008 ZoneSplayTree<Config>::Locator loc;
5009 bool inserted = tree()->Insert(right.from(), &loc);
5010 ASSERT(inserted);
5011 USE(inserted);
5012 loc.set_value(Entry(right.from(), right.to(), entry->out_set()));
5013 }
5014 }
5015 while (current.is_valid()) {
5016 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
5017 (loc.value().from() <= current.to()) &&
5018 (loc.value().to() >= current.from())) {
5019 Entry* entry = &loc.value();
5020 // We have overlap. If there is space between the start point of
5021 // the range we're adding and where the overlapping range starts
5022 // then we have to add a range covering just that space.
5023 if (current.from() < entry->from()) {
5024 ZoneSplayTree<Config>::Locator ins;
5025 bool inserted = tree()->Insert(current.from(), &ins);
5026 ASSERT(inserted);
5027 USE(inserted);
5028 ins.set_value(Entry(current.from(), entry->from() - 1,
5029 empty()->Extend(value, zone)));
5030 current.set_from(entry->from());
5031 }
5032 ASSERT(current.from() == entry->from());
5033 // If the overlapping range extends beyond the one we want to add
5034 // we have to snap the right part off and add it separately.
5035 if (entry->to() > current.to()) {
5036 ZoneSplayTree<Config>::Locator ins;
5037 bool inserted = tree()->Insert(current.to() + 1, &ins);
5038 ASSERT(inserted);
5039 USE(inserted);
5040 ins.set_value(Entry(current.to() + 1, entry->to(), entry->out_set()));
5041 entry->set_to(current.to());
5042 }
5043 ASSERT(entry->to() <= current.to());
5044 // The overlapping range is now completely contained by the range
5045 // we're adding so we can just update it and move the start point
5046 // of the range we're adding just past it.
5047 entry->AddValue(value, zone);
5048 ASSERT(entry->to() + 1 > current.from());
5049 current.set_from(entry->to() + 1);
5050 } else {
5051 // There is no overlap so we can just add the range
5052 ZoneSplayTree<Config>::Locator ins;
5053 bool inserted = tree()->Insert(current.from(), &ins);
5054 ASSERT(inserted);
5055 USE(inserted);
5056 ins.set_value(
5057 Entry(current.from(), current.to(), empty()->Extend(value, zone)));
5058 break;
5059 }
5060 }
5061}
5062
5063OutSet* ChoiceTable::Get(int32_t value) {
5064 ZoneSplayTree<Config>::Locator loc;
5065 if (!tree()->FindGreatestLessThan(value, &loc)) return empty();
5066 Entry* entry = &loc.value();
5067 if (value <= entry->to())
5068 return entry->out_set();
5069 else
5070 return empty();
5071}
5072
5073// -------------------------------------------------------------------
5074// Analysis
5075
5076void Analysis::EnsureAnalyzed(RegExpNode* that) {
5077 if (that->info()->been_analyzed || that->info()->being_analyzed) return;
5078 that->info()->being_analyzed = true;
5079 that->Accept(this);
5080 that->info()->being_analyzed = false;
5081 that->info()->been_analyzed = true;
5082}
5083
5084void Analysis::VisitEnd(EndNode* that) {
5085 // nothing to do
5086}
5087
5088void TextNode::CalculateOffsets() {
5089 intptr_t element_count = elements()->length();
5090 // Set up the offsets of the elements relative to the start. This is a fixed
5091 // quantity since a TextNode can only contain fixed-width things.
5092 intptr_t cp_offset = 0;
5093 for (intptr_t i = 0; i < element_count; i++) {
5094 TextElement& elm = (*elements())[i];
5095 elm.set_cp_offset(cp_offset);
5096 cp_offset += elm.length();
5097 }
5098}
5099
5100void Analysis::VisitText(TextNode* that) {
5101 that->MakeCaseIndependent(is_one_byte_);
5102 EnsureAnalyzed(that->on_success());
5103 if (!has_failed()) {
5104 that->CalculateOffsets();
5105 }
5106}
5107
5108void Analysis::VisitAction(ActionNode* that) {
5109 RegExpNode* target = that->on_success();
5110 EnsureAnalyzed(target);
5111 if (!has_failed()) {
5112 // If the next node is interested in what it follows then this node
5113 // has to be interested too so it can pass the information on.
5114 that->info()->AddFromFollowing(target->info());
5115 }
5116}
5117
5118void Analysis::VisitChoice(ChoiceNode* that) {
5119 NodeInfo* info = that->info();
5120 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
5121 RegExpNode* node = (*that->alternatives())[i].node();
5122 EnsureAnalyzed(node);
5123 if (has_failed()) return;
5124 // Anything the following nodes need to know has to be known by
5125 // this node also, so it can pass it on.
5126 info->AddFromFollowing(node->info());
5127 }
5128}
5129
5130void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
5131 NodeInfo* info = that->info();
5132 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
5133 RegExpNode* node = (*that->alternatives())[i].node();
5134 if (node != that->loop_node()) {
5135 EnsureAnalyzed(node);
5136 if (has_failed()) return;
5137 info->AddFromFollowing(node->info());
5138 }
5139 }
5140 // Check the loop last since it may need the value of this node
5141 // to get a correct result.
5142 EnsureAnalyzed(that->loop_node());
5143 if (!has_failed()) {
5144 info->AddFromFollowing(that->loop_node()->info());
5145 }
5146}
5147
5148void Analysis::VisitBackReference(BackReferenceNode* that) {
5149 EnsureAnalyzed(that->on_success());
5150}
5151
5152void Analysis::VisitAssertion(AssertionNode* that) {
5153 EnsureAnalyzed(that->on_success());
5154}
5155
5156void BackReferenceNode::FillInBMInfo(intptr_t offset,
5157 intptr_t budget,
5158 BoyerMooreLookahead* bm,
5159 bool not_at_start) {
5160 // Working out the set of characters that a backreference can match is too
5161 // hard, so we just say that any character can match.
5162 bm->SetRest(offset);
5163 SaveBMInfo(bm, not_at_start, offset);
5164}
5165
5166COMPILE_ASSERT(BoyerMoorePositionInfo::kMapSize ==
5167 RegExpMacroAssembler::kTableSize);
5168
5169void ChoiceNode::FillInBMInfo(intptr_t offset,
5170 intptr_t budget,
5171 BoyerMooreLookahead* bm,
5172 bool not_at_start) {
5173 ZoneGrowableArray<GuardedAlternative>* alts = alternatives();
5174 budget = (budget - 1) / alts->length();
5175 for (intptr_t i = 0; i < alts->length(); i++) {
5176 GuardedAlternative& alt = (*alts)[i];
5177 if (alt.guards() != NULL && alt.guards()->length() != 0) {
5178 bm->SetRest(offset); // Give up trying to fill in info.
5179 SaveBMInfo(bm, not_at_start, offset);
5180 return;
5181 }
5182 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);
5183 }
5184 SaveBMInfo(bm, not_at_start, offset);
5185}
5186
5187void TextNode::FillInBMInfo(intptr_t initial_offset,
5188 intptr_t budget,
5189 BoyerMooreLookahead* bm,
5190 bool not_at_start) {
5191 if (initial_offset >= bm->length()) return;
5192 intptr_t offset = initial_offset;
5193 intptr_t max_char = bm->max_char();
5194 for (intptr_t i = 0; i < elements()->length(); i++) {
5195 if (offset >= bm->length()) {
5196 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5197 return;
5198 }
5199 TextElement text = elements()->At(i);
5200 if (text.text_type() == TextElement::ATOM) {
5201 RegExpAtom* atom = text.atom();
5202 for (intptr_t j = 0; j < atom->length(); j++, offset++) {
5203 if (offset >= bm->length()) {
5204 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5205 return;
5206 }
5207 uint16_t character = atom->data()->At(j);
5208 if (atom->flags().IgnoreCase()) {
5209 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5210 intptr_t length = GetCaseIndependentLetters(
5211 character, bm->max_char() == Symbols::kMaxOneCharCodeSymbol,
5212 chars);
5213 for (intptr_t j = 0; j < length; j++) {
5214 bm->Set(offset, chars[j]);
5215 }
5216 } else {
5217 if (character <= max_char) bm->Set(offset, character);
5218 }
5219 }
5220 } else {
5221 ASSERT(text.text_type() == TextElement::CHAR_CLASS);
5222 RegExpCharacterClass* char_class = text.char_class();
5223 ZoneGrowableArray<CharacterRange>* ranges = char_class->ranges();
5224 if (char_class->is_negated()) {
5225 bm->SetAll(offset);
5226 } else {
5227 for (intptr_t k = 0; k < ranges->length(); k++) {
5228 const CharacterRange& range = ranges->At(k);
5229 if (range.from() > max_char) continue;
5230 intptr_t to =
5231 Utils::Minimum(max_char, static_cast<intptr_t>(range.to()));
5232 bm->SetInterval(offset, Interval(range.from(), to));
5233 }
5234 }
5235 offset++;
5236 }
5237 }
5238 if (offset >= bm->length()) {
5239 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5240 return;
5241 }
5242 on_success()->FillInBMInfo(offset, budget - 1, bm,
5243 true); // Not at start after a text node.
5244 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5245}
5246
5247RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
5248 RegExpNode* on_success,
5249 RegExpFlags flags) {
5250 // If the regexp matching starts within a surrogate pair, step back
5251 // to the lead surrogate and start matching from there.
5252 ASSERT(!compiler->read_backward());
5253 Zone* zone = compiler->zone();
5254
5255 auto lead_surrogates = CharacterRange::List(
5256 on_success->zone(), CharacterRange::Range(Utf16::kLeadSurrogateStart,
5257 Utf16::kLeadSurrogateEnd));
5258 auto trail_surrogates = CharacterRange::List(
5259 on_success->zone(), CharacterRange::Range(Utf16::kTrailSurrogateStart,
5260 Utf16::kTrailSurrogateEnd));
5261
5262 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
5263
5264 int stack_register = compiler->UnicodeLookaroundStackRegister();
5265 int position_register = compiler->UnicodeLookaroundPositionRegister();
5266 RegExpNode* step_back = TextNode::CreateForCharacterRanges(
5267 lead_surrogates, /*read_backward=*/true, on_success, flags);
5268 RegExpLookaround::Builder builder(/*is_positive=*/true, step_back,
5269 stack_register, position_register);
5270 RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
5271 trail_surrogates, /*read_backward=*/false, builder.on_match_success(),
5272 flags);
5273
5274 optional_step_back->AddAlternative(
5275 GuardedAlternative(builder.ForMatch(match_trail)));
5276 optional_step_back->AddAlternative(GuardedAlternative(on_success));
5277
5278 return optional_step_back;
5279}
5280
5281#if !defined(DART_PRECOMPILED_RUNTIME)
5282RegExpEngine::CompilationResult RegExpEngine::CompileIR(
5283 RegExpCompileData* data,
5284 const ParsedFunction* parsed_function,
5285 const ZoneGrowableArray<const ICData*>& ic_data_array,
5286 intptr_t osr_id) {
5287 ASSERT(!FLAG_interpret_irregexp);
5288 Zone* zone = Thread::Current()->zone();
5289
5290 const Function& function = parsed_function->function();
5291 const intptr_t specialization_cid = function.string_specialization_cid();
5292 const bool is_sticky = function.is_sticky_specialization();
5293 const bool is_one_byte = (specialization_cid == kOneByteStringCid ||
5294 specialization_cid == kExternalOneByteStringCid);
5295 RegExp& regexp = RegExp::Handle(zone, function.regexp());
5296 const String& pattern = String::Handle(zone, regexp.pattern());
5297
5298 ASSERT(!regexp.IsNull());
5299 ASSERT(!pattern.IsNull());
5300
5301 const bool is_global = regexp.flags().IsGlobal();
5302 const bool is_unicode = regexp.flags().IsUnicode();
5303
5304 RegExpCompiler compiler(data->capture_count, is_one_byte);
5305
5306 // TODO(zerny): Frequency sampling is currently disabled because of several
5307 // issues. We do not want to store subject strings in the regexp object since
5308 // they might be long and we should not prevent their garbage collection.
5309 // Passing them to this function explicitly does not help, since we must
5310 // generate exactly the same IR for both the unoptimizing and optimizing
5311 // pipelines (otherwise it gets confused when i.e. deopt id's differ).
5312 // An option would be to store sampling results in the regexp object, but
5313 // I'm not sure the performance gains are relevant enough.
5314
5315 // Wrap the body of the regexp in capture #0.
5316 RegExpNode* captured_body =
5317 RegExpCapture::ToNode(data->tree, 0, &compiler, compiler.accept());
5318
5319 RegExpNode* node = captured_body;
5320 const bool is_end_anchored = data->tree->IsAnchoredAtEnd();
5321 const bool is_start_anchored = data->tree->IsAnchoredAtStart();
5322 intptr_t max_length = data->tree->max_match();
5323 if (!is_start_anchored && !is_sticky) {
5324 // Add a .*? at the beginning, outside the body capture, unless
5325 // this expression is anchored at the beginning or is sticky.
5326 RegExpNode* loop_node = RegExpQuantifier::ToNode(
5327 0, RegExpTree::kInfinity, false,
5328 new (zone) RegExpCharacterClass('*', RegExpFlags()), &compiler,
5329 captured_body, data->contains_anchor);
5330
5331 if (data->contains_anchor) {
5332 // Unroll loop once, to take care of the case that might start
5333 // at the start of input.
5334 ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone);
5335 first_step_node->AddAlternative(GuardedAlternative(captured_body));
5336 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
5337 new (zone) RegExpCharacterClass('*', RegExpFlags()),
5338 /*read_backwards=*/false, loop_node)));
5339 node = first_step_node;
5340 } else {
5341 node = loop_node;
5342 }
5343 }
5344 if (is_one_byte) {
5345 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
5346 // Do it again to propagate the new nodes to places where they were not
5347 // put because they had not been calculated yet.
5348 if (node != NULL) {
5349 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
5350 }
5351 } else if (is_unicode && (is_global || is_sticky)) {
5352 node = OptionallyStepBackToLeadSurrogate(&compiler, node, regexp.flags());
5353 }
5354
5355 if (node == NULL) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
5356 data->node = node;
5357 Analysis analysis(is_one_byte);
5358 analysis.EnsureAnalyzed(node);
5359 if (analysis.has_failed()) {
5360 const char* error_message = analysis.error_message();
5361 return CompilationResult(error_message);
5362 }
5363
5364 // Native regexp implementation.
5365
5366 IRRegExpMacroAssembler* macro_assembler = new (zone)
5367 IRRegExpMacroAssembler(specialization_cid, data->capture_count,
5368 parsed_function, ic_data_array, osr_id, zone);
5369
5370 // Inserted here, instead of in Assembler, because it depends on information
5371 // in the AST that isn't replicated in the Node structure.
5372 static const intptr_t kMaxBacksearchLimit = 1024;
5373 if (is_end_anchored && !is_start_anchored && !is_sticky &&
5374 max_length < kMaxBacksearchLimit) {
5375 macro_assembler->SetCurrentPositionFromEnd(max_length);
5376 }
5377
5378 if (is_global) {
5379 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
5380 if (data->tree->min_match() > 0) {
5381 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
5382 } else if (is_unicode) {
5383 mode = RegExpMacroAssembler::GLOBAL_UNICODE;
5384 }
5385 macro_assembler->set_global_mode(mode);
5386 }
5387
5388 RegExpEngine::CompilationResult result =
5389 compiler.Assemble(macro_assembler, node, data->capture_count, pattern);
5390
5391 if (FLAG_trace_irregexp) {
5392 macro_assembler->PrintBlocks();
5393 }
5394
5395 return result;
5396}
5397#endif // !defined(DART_PRECOMPILED_RUNTIME)
5398
5399RegExpEngine::CompilationResult RegExpEngine::CompileBytecode(
5400 RegExpCompileData* data,
5401 const RegExp& regexp,
5402 bool is_one_byte,
5403 bool is_sticky,
5404 Zone* zone) {
5405 ASSERT(FLAG_interpret_irregexp);
5406 const String& pattern = String::Handle(zone, regexp.pattern());
5407
5408 ASSERT(!regexp.IsNull());
5409 ASSERT(!pattern.IsNull());
5410
5411 const bool is_global = regexp.flags().IsGlobal();
5412 const bool is_unicode = regexp.flags().IsUnicode();
5413
5414 RegExpCompiler compiler(data->capture_count, is_one_byte);
5415
5416 // TODO(zerny): Frequency sampling is currently disabled because of several
5417 // issues. We do not want to store subject strings in the regexp object since
5418 // they might be long and we should not prevent their garbage collection.
5419 // Passing them to this function explicitly does not help, since we must
5420 // generate exactly the same IR for both the unoptimizing and optimizing
5421 // pipelines (otherwise it gets confused when i.e. deopt id's differ).
5422 // An option would be to store sampling results in the regexp object, but
5423 // I'm not sure the performance gains are relevant enough.
5424
5425 // Wrap the body of the regexp in capture #0.
5426 RegExpNode* captured_body =
5427 RegExpCapture::ToNode(data->tree, 0, &compiler, compiler.accept());
5428
5429 RegExpNode* node = captured_body;
5430 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
5431 bool is_start_anchored = data->tree->IsAnchoredAtStart();
5432 intptr_t max_length = data->tree->max_match();
5433 if (!is_start_anchored && !is_sticky) {
5434 // Add a .*? at the beginning, outside the body capture, unless
5435 // this expression is anchored at the beginning.
5436 RegExpNode* loop_node = RegExpQuantifier::ToNode(
5437 0, RegExpTree::kInfinity, false,
5438 new (zone) RegExpCharacterClass('*', RegExpFlags()), &compiler,
5439 captured_body, data->contains_anchor);
5440
5441 if (data->contains_anchor) {
5442 // Unroll loop once, to take care of the case that might start
5443 // at the start of input.
5444 ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone);
5445 first_step_node->AddAlternative(GuardedAlternative(captured_body));
5446 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
5447 new (zone) RegExpCharacterClass('*', RegExpFlags()),
5448 /*read_backwards=*/false, loop_node)));
5449 node = first_step_node;
5450 } else {
5451 node = loop_node;
5452 }
5453 }
5454 if (is_one_byte) {
5455 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
5456 // Do it again to propagate the new nodes to places where they were not
5457 // put because they had not been calculated yet.
5458 if (node != NULL) {
5459 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
5460 }
5461 } else if (is_unicode && (is_global || is_sticky)) {
5462 node = OptionallyStepBackToLeadSurrogate(&compiler, node, regexp.flags());
5463 }
5464
5465 if (node == NULL) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
5466 data->node = node;
5467 Analysis analysis(is_one_byte);
5468 analysis.EnsureAnalyzed(node);
5469 if (analysis.has_failed()) {
5470 const char* error_message = analysis.error_message();
5471 return CompilationResult(error_message);
5472 }
5473
5474 // Bytecode regexp implementation.
5475
5476 ZoneGrowableArray<uint8_t> buffer(zone, 1024);
5477 BytecodeRegExpMacroAssembler* macro_assembler =
5478 new (zone) BytecodeRegExpMacroAssembler(&buffer, zone);
5479
5480 // Inserted here, instead of in Assembler, because it depends on information
5481 // in the AST that isn't replicated in the Node structure.
5482 static const intptr_t kMaxBacksearchLimit = 1024;
5483 if (is_end_anchored && !is_start_anchored && !is_sticky &&
5484 max_length < kMaxBacksearchLimit) {
5485 macro_assembler->SetCurrentPositionFromEnd(max_length);
5486 }
5487
5488 if (is_global) {
5489 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
5490 if (data->tree->min_match() > 0) {
5491 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
5492 } else if (is_unicode) {
5493 mode = RegExpMacroAssembler::GLOBAL_UNICODE;
5494 }
5495 macro_assembler->set_global_mode(mode);
5496 }
5497
5498 RegExpEngine::CompilationResult result =
5499 compiler.Assemble(macro_assembler, node, data->capture_count, pattern);
5500
5501 if (FLAG_trace_irregexp) {
5502 macro_assembler->PrintBlocks();
5503 }
5504
5505 return result;
5506}
5507
5508static void CreateSpecializedFunction(Thread* thread,
5509 Zone* zone,
5510 const RegExp& regexp,
5511 intptr_t specialization_cid,
5512 bool sticky,
5513 const Object& owner) {
5514 const intptr_t kParamCount = RegExpMacroAssembler::kParamCount;
5515
5516 Function& fn =
5517 Function::Handle(zone, Function::New(Symbols::ColonMatcher(),
5518 FunctionLayout::kIrregexpFunction,
5519 true, // Static.
5520 false, // Not const.
5521 false, // Not abstract.
5522 false, // Not external.
5523 false, // Not native.
5524 owner, TokenPosition::kMinSource));
5525
5526 // TODO(zerny): Share these arrays between all irregexp functions.
5527 fn.set_num_fixed_parameters(kParamCount);
5528 fn.set_parameter_types(
5529 Array::Handle(zone, Array::New(kParamCount, Heap::kOld)));
5530 fn.set_parameter_names(
5531 Array::Handle(zone, Array::New(kParamCount, Heap::kOld)));
5532 fn.SetParameterTypeAt(RegExpMacroAssembler::kParamRegExpIndex,
5533 Object::dynamic_type());
5534 fn.SetParameterNameAt(RegExpMacroAssembler::kParamRegExpIndex,
5535 Symbols::This());
5536 fn.SetParameterTypeAt(RegExpMacroAssembler::kParamStringIndex,
5537 Object::dynamic_type());
5538 fn.SetParameterNameAt(RegExpMacroAssembler::kParamStringIndex,
5539 Symbols::string_param());
5540 fn.SetParameterTypeAt(RegExpMacroAssembler::kParamStartOffsetIndex,
5541 Object::dynamic_type());
5542 fn.SetParameterNameAt(RegExpMacroAssembler::kParamStartOffsetIndex,
5543 Symbols::start_index_param());
5544 fn.set_result_type(Type::Handle(zone, Type::ArrayType()));
5545
5546 // Cache the result.
5547 regexp.set_function(specialization_cid, sticky, fn);
5548
5549 fn.SetRegExpData(regexp, specialization_cid, sticky);
5550 fn.set_is_debuggable(false);
5551
5552 // The function is compiled lazily during the first call.
5553}
5554
5555RegExpPtr RegExpEngine::CreateRegExp(Thread* thread,
5556 const String& pattern,
5557 RegExpFlags flags) {
5558 Zone* zone = thread->zone();
5559 const RegExp& regexp = RegExp::Handle(RegExp::New());
5560
5561 regexp.set_pattern(pattern);
5562 regexp.set_flags(flags);
5563
5564 // TODO(zerny): We might want to use normal string searching algorithms
5565 // for simple patterns.
5566 regexp.set_is_complex();
5567 regexp.set_is_global(); // All dart regexps are global.
5568
5569 if (!FLAG_interpret_irregexp) {
5570 const Library& lib = Library::Handle(zone, Library::CoreLibrary());
5571 const Class& owner =
5572 Class::Handle(zone, lib.LookupClass(Symbols::RegExp()));
5573
5574 for (intptr_t cid = kOneByteStringCid; cid <= kExternalTwoByteStringCid;
5575 cid++) {
5576 CreateSpecializedFunction(thread, zone, regexp, cid, /*sticky=*/false,
5577 owner);
5578 CreateSpecializedFunction(thread, zone, regexp, cid, /*sticky=*/true,
5579 owner);
5580 }
5581 }
5582
5583 return regexp.raw();
5584}
5585
5586} // namespace dart
5587