1// Copyright (c) 2015, the Dart project authors. Please see the AUTHORS file
2// for details. All rights reserved. Use of this source code is governed by a
3// BSD-style license that can be found in the LICENSE file.
4
5#include "vm/regexp_assembler_bytecode.h"
6
7#include "vm/exceptions.h"
8#include "vm/object_store.h"
9#include "vm/regexp.h"
10#include "vm/regexp_assembler.h"
11#include "vm/regexp_assembler_bytecode_inl.h"
12#include "vm/regexp_bytecodes.h"
13#include "vm/regexp_interpreter.h"
14#include "vm/regexp_parser.h"
15#include "vm/timeline.h"
16
17namespace dart {
18
19BytecodeRegExpMacroAssembler::BytecodeRegExpMacroAssembler(
20 ZoneGrowableArray<uint8_t>* buffer,
21 Zone* zone)
22 : RegExpMacroAssembler(zone),
23 buffer_(buffer),
24 pc_(0),
25 advance_current_end_(kInvalidPC) {}
26
27BytecodeRegExpMacroAssembler::~BytecodeRegExpMacroAssembler() {
28 if (backtrack_.is_linked()) backtrack_.Unuse();
29}
30
31BytecodeRegExpMacroAssembler::IrregexpImplementation
32BytecodeRegExpMacroAssembler::Implementation() {
33 return kBytecodeImplementation;
34}
35
36void BytecodeRegExpMacroAssembler::BindBlock(BlockLabel* l) {
37 advance_current_end_ = kInvalidPC;
38 ASSERT(!l->is_bound());
39 if (l->is_linked()) {
40 intptr_t pos = l->pos();
41 while (pos != 0) {
42 intptr_t fixup = pos;
43 pos = *reinterpret_cast<int32_t*>(buffer_->data() + fixup);
44 *reinterpret_cast<uint32_t*>(buffer_->data() + fixup) = pc_;
45 }
46 }
47 l->BindTo(pc_);
48}
49
50void BytecodeRegExpMacroAssembler::EmitOrLink(BlockLabel* l) {
51 if (l == NULL) l = &backtrack_;
52 if (l->is_bound()) {
53 Emit32(l->pos());
54 } else {
55 int pos = 0;
56 if (l->is_linked()) {
57 pos = l->pos();
58 }
59 l->LinkTo(pc_);
60 Emit32(pos);
61 }
62}
63
64void BytecodeRegExpMacroAssembler::PopRegister(intptr_t register_index) {
65 ASSERT(register_index >= 0);
66 ASSERT(register_index <= kMaxRegister);
67 Emit(BC_POP_REGISTER, register_index);
68}
69
70void BytecodeRegExpMacroAssembler::PushRegister(intptr_t register_index) {
71 ASSERT(register_index >= 0);
72 ASSERT(register_index <= kMaxRegister);
73 Emit(BC_PUSH_REGISTER, register_index);
74}
75
76void BytecodeRegExpMacroAssembler::WriteCurrentPositionToRegister(
77 intptr_t register_index,
78 intptr_t cp_offset) {
79 ASSERT(register_index >= 0);
80 ASSERT(register_index <= kMaxRegister);
81 Emit(BC_SET_REGISTER_TO_CP, register_index);
82 Emit32(cp_offset); // Current position offset.
83}
84
85void BytecodeRegExpMacroAssembler::ClearRegisters(intptr_t reg_from,
86 intptr_t reg_to) {
87 ASSERT(reg_from <= reg_to);
88 for (int reg = reg_from; reg <= reg_to; reg++) {
89 SetRegister(reg, -1);
90 }
91}
92
93void BytecodeRegExpMacroAssembler::ReadCurrentPositionFromRegister(
94 intptr_t register_index) {
95 ASSERT(register_index >= 0);
96 ASSERT(register_index <= kMaxRegister);
97 Emit(BC_SET_CP_TO_REGISTER, register_index);
98}
99
100void BytecodeRegExpMacroAssembler::WriteStackPointerToRegister(
101 intptr_t register_index) {
102 ASSERT(register_index >= 0);
103 ASSERT(register_index <= kMaxRegister);
104 Emit(BC_SET_REGISTER_TO_SP, register_index);
105}
106
107void BytecodeRegExpMacroAssembler::ReadStackPointerFromRegister(
108 intptr_t register_index) {
109 ASSERT(register_index >= 0);
110 ASSERT(register_index <= kMaxRegister);
111 Emit(BC_SET_SP_TO_REGISTER, register_index);
112}
113
114void BytecodeRegExpMacroAssembler::SetCurrentPositionFromEnd(intptr_t by) {
115 ASSERT(Utils::IsUint(24, by));
116 Emit(BC_SET_CURRENT_POSITION_FROM_END, by);
117}
118
119void BytecodeRegExpMacroAssembler::SetRegister(intptr_t register_index,
120 intptr_t to) {
121 ASSERT(register_index >= 0);
122 ASSERT(register_index <= kMaxRegister);
123 Emit(BC_SET_REGISTER, register_index);
124 Emit32(to);
125}
126
127void BytecodeRegExpMacroAssembler::AdvanceRegister(intptr_t register_index,
128 intptr_t by) {
129 ASSERT(register_index >= 0);
130 ASSERT(register_index <= kMaxRegister);
131 Emit(BC_ADVANCE_REGISTER, register_index);
132 Emit32(by);
133}
134
135void BytecodeRegExpMacroAssembler::PopCurrentPosition() {
136 Emit(BC_POP_CP, 0);
137}
138
139void BytecodeRegExpMacroAssembler::PushCurrentPosition() {
140 Emit(BC_PUSH_CP, 0);
141}
142
143void BytecodeRegExpMacroAssembler::Backtrack() {
144 Emit(BC_POP_BT, 0);
145}
146
147void BytecodeRegExpMacroAssembler::GoTo(BlockLabel* l) {
148 if (advance_current_end_ == pc_) {
149 // Combine advance current and goto.
150 pc_ = advance_current_start_;
151 Emit(BC_ADVANCE_CP_AND_GOTO, advance_current_offset_);
152 EmitOrLink(l);
153 advance_current_end_ = kInvalidPC;
154 } else {
155 // Regular goto.
156 Emit(BC_GOTO, 0);
157 EmitOrLink(l);
158 }
159}
160
161void BytecodeRegExpMacroAssembler::PushBacktrack(BlockLabel* l) {
162 Emit(BC_PUSH_BT, 0);
163 EmitOrLink(l);
164}
165
166bool BytecodeRegExpMacroAssembler::Succeed() {
167 Emit(BC_SUCCEED, 0);
168 return false; // Restart matching for global regexp not supported.
169}
170
171void BytecodeRegExpMacroAssembler::Fail() {
172 Emit(BC_FAIL, 0);
173}
174
175void BytecodeRegExpMacroAssembler::AdvanceCurrentPosition(intptr_t by) {
176 ASSERT(by >= kMinCPOffset);
177 ASSERT(by <= kMaxCPOffset);
178 advance_current_start_ = pc_;
179 advance_current_offset_ = by;
180 Emit(BC_ADVANCE_CP, by);
181 advance_current_end_ = pc_;
182}
183
184void BytecodeRegExpMacroAssembler::CheckGreedyLoop(
185 BlockLabel* on_tos_equals_current_position) {
186 Emit(BC_CHECK_GREEDY, 0);
187 EmitOrLink(on_tos_equals_current_position);
188}
189
190void BytecodeRegExpMacroAssembler::LoadCurrentCharacter(intptr_t cp_offset,
191 BlockLabel* on_failure,
192 bool check_bounds,
193 intptr_t characters) {
194 ASSERT(cp_offset >= kMinCPOffset);
195 ASSERT(cp_offset <= kMaxCPOffset);
196 int bytecode;
197 if (check_bounds) {
198 if (characters == 4) {
199 bytecode = BC_LOAD_4_CURRENT_CHARS;
200 } else if (characters == 2) {
201 bytecode = BC_LOAD_2_CURRENT_CHARS;
202 } else {
203 ASSERT(characters == 1);
204 bytecode = BC_LOAD_CURRENT_CHAR;
205 }
206 } else {
207 if (characters == 4) {
208 bytecode = BC_LOAD_4_CURRENT_CHARS_UNCHECKED;
209 } else if (characters == 2) {
210 bytecode = BC_LOAD_2_CURRENT_CHARS_UNCHECKED;
211 } else {
212 ASSERT(characters == 1);
213 bytecode = BC_LOAD_CURRENT_CHAR_UNCHECKED;
214 }
215 }
216 Emit(bytecode, cp_offset);
217 if (check_bounds) EmitOrLink(on_failure);
218}
219
220void BytecodeRegExpMacroAssembler::CheckCharacterLT(uint16_t limit,
221 BlockLabel* on_less) {
222 Emit(BC_CHECK_LT, limit);
223 EmitOrLink(on_less);
224}
225
226void BytecodeRegExpMacroAssembler::CheckCharacterGT(uint16_t limit,
227 BlockLabel* on_greater) {
228 Emit(BC_CHECK_GT, limit);
229 EmitOrLink(on_greater);
230}
231
232void BytecodeRegExpMacroAssembler::CheckCharacter(uint32_t c,
233 BlockLabel* on_equal) {
234 if (c > MAX_FIRST_ARG) {
235 Emit(BC_CHECK_4_CHARS, 0);
236 Emit32(c);
237 } else {
238 Emit(BC_CHECK_CHAR, c);
239 }
240 EmitOrLink(on_equal);
241}
242
243void BytecodeRegExpMacroAssembler::CheckAtStart(BlockLabel* on_at_start) {
244 Emit(BC_CHECK_AT_START, 0);
245 EmitOrLink(on_at_start);
246}
247
248void BytecodeRegExpMacroAssembler::CheckNotAtStart(
249 intptr_t cp_offset,
250 BlockLabel* on_not_at_start) {
251 Emit(BC_CHECK_NOT_AT_START, cp_offset);
252 EmitOrLink(on_not_at_start);
253}
254
255void BytecodeRegExpMacroAssembler::CheckNotCharacter(uint32_t c,
256 BlockLabel* on_not_equal) {
257 if (c > MAX_FIRST_ARG) {
258 Emit(BC_CHECK_NOT_4_CHARS, 0);
259 Emit32(c);
260 } else {
261 Emit(BC_CHECK_NOT_CHAR, c);
262 }
263 EmitOrLink(on_not_equal);
264}
265
266void BytecodeRegExpMacroAssembler::CheckCharacterAfterAnd(
267 uint32_t c,
268 uint32_t mask,
269 BlockLabel* on_equal) {
270 if (c > MAX_FIRST_ARG) {
271 Emit(BC_AND_CHECK_4_CHARS, 0);
272 Emit32(c);
273 } else {
274 Emit(BC_AND_CHECK_CHAR, c);
275 }
276 Emit32(mask);
277 EmitOrLink(on_equal);
278}
279
280void BytecodeRegExpMacroAssembler::CheckNotCharacterAfterAnd(
281 uint32_t c,
282 uint32_t mask,
283 BlockLabel* on_not_equal) {
284 if (c > MAX_FIRST_ARG) {
285 Emit(BC_AND_CHECK_NOT_4_CHARS, 0);
286 Emit32(c);
287 } else {
288 Emit(BC_AND_CHECK_NOT_CHAR, c);
289 }
290 Emit32(mask);
291 EmitOrLink(on_not_equal);
292}
293
294void BytecodeRegExpMacroAssembler::CheckNotCharacterAfterMinusAnd(
295 uint16_t c,
296 uint16_t minus,
297 uint16_t mask,
298 BlockLabel* on_not_equal) {
299 Emit(BC_MINUS_AND_CHECK_NOT_CHAR, c);
300 Emit16(minus);
301 Emit16(mask);
302 EmitOrLink(on_not_equal);
303}
304
305void BytecodeRegExpMacroAssembler::CheckCharacterInRange(
306 uint16_t from,
307 uint16_t to,
308 BlockLabel* on_in_range) {
309 Emit(BC_CHECK_CHAR_IN_RANGE, 0);
310 Emit16(from);
311 Emit16(to);
312 EmitOrLink(on_in_range);
313}
314
315void BytecodeRegExpMacroAssembler::CheckCharacterNotInRange(
316 uint16_t from,
317 uint16_t to,
318 BlockLabel* on_not_in_range) {
319 Emit(BC_CHECK_CHAR_NOT_IN_RANGE, 0);
320 Emit16(from);
321 Emit16(to);
322 EmitOrLink(on_not_in_range);
323}
324
325void BytecodeRegExpMacroAssembler::CheckBitInTable(const TypedData& table,
326 BlockLabel* on_bit_set) {
327 Emit(BC_CHECK_BIT_IN_TABLE, 0);
328 EmitOrLink(on_bit_set);
329 for (int i = 0; i < kTableSize; i += kBitsPerByte) {
330 int byte = 0;
331 for (int j = 0; j < kBitsPerByte; j++) {
332 if (table.GetUint8(i + j) != 0) byte |= 1 << j;
333 }
334 Emit8(byte);
335 }
336}
337
338void BytecodeRegExpMacroAssembler::CheckNotBackReference(
339 intptr_t start_reg,
340 bool read_backward,
341 BlockLabel* on_not_equal) {
342 ASSERT(start_reg >= 0);
343 ASSERT(start_reg <= kMaxRegister);
344 Emit(read_backward ? BC_CHECK_NOT_BACK_REF_BACKWARD : BC_CHECK_NOT_BACK_REF,
345 start_reg);
346 EmitOrLink(on_not_equal);
347}
348
349void BytecodeRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase(
350 intptr_t start_reg,
351 bool read_backward,
352 bool unicode,
353 BlockLabel* on_not_equal) {
354 ASSERT(start_reg >= 0);
355 ASSERT(start_reg <= kMaxRegister);
356 Emit(read_backward ? (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD
357 : BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD)
358 : (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE
359 : BC_CHECK_NOT_BACK_REF_NO_CASE),
360 start_reg);
361 EmitOrLink(on_not_equal);
362}
363
364void BytecodeRegExpMacroAssembler::IfRegisterLT(intptr_t register_index,
365 intptr_t comparand,
366 BlockLabel* on_less_than) {
367 ASSERT(register_index >= 0);
368 ASSERT(register_index <= kMaxRegister);
369 Emit(BC_CHECK_REGISTER_LT, register_index);
370 Emit32(comparand);
371 EmitOrLink(on_less_than);
372}
373
374void BytecodeRegExpMacroAssembler::IfRegisterGE(
375 intptr_t register_index,
376 intptr_t comparand,
377 BlockLabel* on_greater_or_equal) {
378 ASSERT(register_index >= 0);
379 ASSERT(register_index <= kMaxRegister);
380 Emit(BC_CHECK_REGISTER_GE, register_index);
381 Emit32(comparand);
382 EmitOrLink(on_greater_or_equal);
383}
384
385void BytecodeRegExpMacroAssembler::IfRegisterEqPos(intptr_t register_index,
386 BlockLabel* on_eq) {
387 ASSERT(register_index >= 0);
388 ASSERT(register_index <= kMaxRegister);
389 Emit(BC_CHECK_REGISTER_EQ_POS, register_index);
390 EmitOrLink(on_eq);
391}
392
393TypedDataPtr BytecodeRegExpMacroAssembler::GetBytecode() {
394 BindBlock(&backtrack_);
395 Emit(BC_POP_BT, 0);
396
397 intptr_t len = length();
398 const TypedData& bytecode =
399 TypedData::Handle(TypedData::New(kTypedDataUint8ArrayCid, len));
400
401 NoSafepointScope no_safepoint;
402 memmove(bytecode.DataAddr(0), buffer_->data(), len);
403
404 return bytecode.raw();
405}
406
407intptr_t BytecodeRegExpMacroAssembler::length() {
408 return pc_;
409}
410
411void BytecodeRegExpMacroAssembler::Expand() {
412 // BOGUS
413 buffer_->Add(0);
414 buffer_->Add(0);
415 buffer_->Add(0);
416 buffer_->Add(0);
417 intptr_t x = buffer_->length();
418 for (intptr_t i = 0; i < x; i++)
419 buffer_->Add(0);
420}
421
422static intptr_t Prepare(const RegExp& regexp,
423 const String& subject,
424 bool sticky,
425 Zone* zone) {
426 bool is_one_byte =
427 subject.IsOneByteString() || subject.IsExternalOneByteString();
428
429 if (regexp.bytecode(is_one_byte, sticky) == TypedData::null()) {
430 const String& pattern = String::Handle(zone, regexp.pattern());
431#if defined(SUPPORT_TIMELINE)
432 TimelineBeginEndScope tbes(Thread::Current(), Timeline::GetCompilerStream(),
433 "CompileIrregexpBytecode");
434 if (tbes.enabled()) {
435 tbes.SetNumArguments(1);
436 tbes.CopyArgument(0, "pattern", pattern.ToCString());
437 }
438#endif // !defined(PRODUCT)
439
440 RegExpCompileData* compile_data = new (zone) RegExpCompileData();
441
442 // Parsing failures are handled in the RegExp factory constructor.
443 RegExpParser::ParseRegExp(pattern, regexp.flags(), compile_data);
444
445 regexp.set_num_bracket_expressions(compile_data->capture_count);
446 regexp.set_capture_name_map(compile_data->capture_name_map);
447 if (compile_data->simple) {
448 regexp.set_is_simple();
449 } else {
450 regexp.set_is_complex();
451 }
452
453 RegExpEngine::CompilationResult result = RegExpEngine::CompileBytecode(
454 compile_data, regexp, is_one_byte, sticky, zone);
455 ASSERT(result.bytecode != NULL);
456 ASSERT(regexp.num_registers(is_one_byte) == -1 ||
457 regexp.num_registers(is_one_byte) == result.num_registers);
458 regexp.set_num_registers(is_one_byte, result.num_registers);
459 regexp.set_bytecode(is_one_byte, sticky, *(result.bytecode));
460 }
461
462 ASSERT(regexp.num_registers(is_one_byte) != -1);
463
464 return regexp.num_registers(is_one_byte) +
465 (Smi::Value(regexp.num_bracket_expressions()) + 1) * 2;
466}
467
468static IrregexpInterpreter::IrregexpResult ExecRaw(const RegExp& regexp,
469 const String& subject,
470 intptr_t index,
471 bool sticky,
472 int32_t* output,
473 intptr_t output_size,
474 Zone* zone) {
475 bool is_one_byte =
476 subject.IsOneByteString() || subject.IsExternalOneByteString();
477
478 ASSERT(regexp.num_bracket_expressions() != Smi::null());
479
480 // We must have done EnsureCompiledIrregexp, so we can get the number of
481 // registers.
482 int number_of_capture_registers =
483 (Smi::Value(regexp.num_bracket_expressions()) + 1) * 2;
484 int32_t* raw_output = &output[number_of_capture_registers];
485
486 // We do not touch the actual capture result registers until we know there
487 // has been a match so that we can use those capture results to set the
488 // last match info.
489 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
490 raw_output[i] = -1;
491 }
492
493 const TypedData& bytecode =
494 TypedData::Handle(zone, regexp.bytecode(is_one_byte, sticky));
495 ASSERT(!bytecode.IsNull());
496 IrregexpInterpreter::IrregexpResult result =
497 IrregexpInterpreter::Match(bytecode, subject, raw_output, index, zone);
498
499 if (result == IrregexpInterpreter::RE_SUCCESS) {
500 // Copy capture results to the start of the registers array.
501 memmove(output, raw_output, number_of_capture_registers * sizeof(int32_t));
502 }
503 if (result == IrregexpInterpreter::RE_EXCEPTION) {
504 Thread* thread = Thread::Current();
505 Isolate* isolate = thread->isolate();
506 const Instance& exception =
507 Instance::Handle(isolate->object_store()->stack_overflow());
508 Exceptions::Throw(thread, exception);
509 UNREACHABLE();
510 }
511 return result;
512}
513
514InstancePtr BytecodeRegExpMacroAssembler::Interpret(const RegExp& regexp,
515 const String& subject,
516 const Smi& start_index,
517 bool sticky,
518 Zone* zone) {
519 intptr_t required_registers = Prepare(regexp, subject, sticky, zone);
520 if (required_registers < 0) {
521 // Compiling failed with an exception.
522 UNREACHABLE();
523 }
524
525 // V8 uses a shared copy on the isolate when smaller than some threshold.
526 int32_t* output_registers = zone->Alloc<int32_t>(required_registers);
527
528 IrregexpInterpreter::IrregexpResult result =
529 ExecRaw(regexp, subject, start_index.Value(), sticky, output_registers,
530 required_registers, zone);
531
532 if (result == IrregexpInterpreter::RE_SUCCESS) {
533 intptr_t capture_count = Smi::Value(regexp.num_bracket_expressions());
534 intptr_t capture_register_count = (capture_count + 1) * 2;
535 ASSERT(required_registers >= capture_register_count);
536
537 const TypedData& result = TypedData::Handle(
538 TypedData::New(kTypedDataInt32ArrayCid, capture_register_count));
539 {
540#ifdef DEBUG
541 // These indices will be used with substring operations that don't check
542 // bounds, so sanity check them here.
543 for (intptr_t i = 0; i < capture_register_count; i++) {
544 int32_t val = output_registers[i];
545 ASSERT(val == -1 || (val >= 0 && val <= subject.Length()));
546 }
547#endif
548
549 NoSafepointScope no_safepoint;
550 memmove(result.DataAddr(0), output_registers,
551 capture_register_count * sizeof(int32_t));
552 }
553
554 return result.raw();
555 }
556 if (result == IrregexpInterpreter::RE_EXCEPTION) {
557 UNREACHABLE();
558 }
559 ASSERT(result == IrregexpInterpreter::RE_FAILURE);
560 return Instance::null();
561}
562
563} // namespace dart
564