Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * vim: set ts=8 sts=4 et sw=4 tw=99: */
3 :
4 : // Copyright 2012 the V8 project authors. All rights reserved.
5 : // Redistribution and use in source and binary forms, with or without
6 : // modification, are permitted provided that the following conditions are
7 : // met:
8 : //
9 : // * Redistributions of source code must retain the above copyright
10 : // notice, this list of conditions and the following disclaimer.
11 : // * Redistributions in binary form must reproduce the above
12 : // copyright notice, this list of conditions and the following
13 : // disclaimer in the documentation and/or other materials provided
14 : // with the distribution.
15 : // * Neither the name of Google Inc. nor the names of its
16 : // contributors may be used to endorse or promote products derived
17 : // from this software without specific prior written permission.
18 : //
19 : // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 : // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 : // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 : // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 : // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 : // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 : // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 : // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 : // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 : // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 : // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 :
31 : #include "irregexp/RegExpEngine.h"
32 :
33 : #include "irregexp/NativeRegExpMacroAssembler.h"
34 : #include "irregexp/RegExpCharacters.h"
35 : #include "irregexp/RegExpMacroAssembler.h"
36 : #include "jit/ExecutableAllocator.h"
37 : #include "jit/JitCommon.h"
38 :
39 : #include "irregexp/RegExpCharacters-inl.h"
40 :
41 : using namespace js;
42 : using namespace js::irregexp;
43 :
44 : using mozilla::ArrayLength;
45 : using mozilla::DebugOnly;
46 : using mozilla::Maybe;
47 :
48 : #define DEFINE_ACCEPT(Type) \
49 : void Type##Node::Accept(NodeVisitor* visitor) { \
50 : visitor->Visit##Type(this); \
51 : }
52 501 : FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
53 : #undef DEFINE_ACCEPT
54 :
55 56 : void LoopChoiceNode::Accept(NodeVisitor* visitor) {
56 56 : visitor->VisitLoopChoice(this);
57 56 : }
58 :
59 : static const int kMaxLookaheadForBoyerMoore = 8;
60 :
61 566 : RegExpNode::RegExpNode(LifoAlloc* alloc)
62 566 : : replacement_(nullptr), trace_count_(0), alloc_(alloc)
63 : {
64 566 : bm_info_[0] = bm_info_[1] = nullptr;
65 566 : }
66 :
67 : static const int kMaxOneByteCharCode = 0xff;
68 : static const int kMaxUtf16CodeUnit = 0xffff;
69 :
70 : static char16_t
71 599 : MaximumCharacter(bool latin1)
72 : {
73 599 : return latin1 ? kMaxOneByteCharCode : kMaxUtf16CodeUnit;
74 : }
75 :
76 : static void
77 225 : AddClass(const int* elmv, int elmc,
78 : CharacterRangeVector* ranges)
79 : {
80 225 : elmc--;
81 225 : MOZ_ASSERT(elmv[elmc] == 0x10000);
82 1629 : for (int i = 0; i < elmc; i += 2) {
83 1404 : MOZ_ASSERT(elmv[i] < elmv[i + 1]);
84 1404 : ranges->append(CharacterRange(elmv[i], elmv[i + 1] - 1));
85 : }
86 225 : }
87 :
88 : void
89 159 : js::irregexp::AddClassNegated(const int* elmv,
90 : int elmc,
91 : CharacterRangeVector* ranges)
92 : {
93 159 : elmc--;
94 159 : MOZ_ASSERT(elmv[elmc] == 0x10000);
95 159 : MOZ_ASSERT(elmv[0] != 0x0000);
96 159 : MOZ_ASSERT(elmv[elmc-1] != kMaxUtf16CodeUnit);
97 159 : char16_t last = 0x0000;
98 816 : for (int i = 0; i < elmc; i += 2) {
99 657 : MOZ_ASSERT(last <= elmv[i] - 1);
100 657 : MOZ_ASSERT(elmv[i] < elmv[i + 1]);
101 657 : ranges->append(CharacterRange(last, elmv[i] - 1));
102 657 : last = elmv[i + 1];
103 : }
104 159 : ranges->append(CharacterRange(last, kMaxUtf16CodeUnit));
105 159 : }
106 :
107 : void
108 402 : CharacterRange::AddClassEscape(LifoAlloc* alloc, char16_t type,
109 : CharacterRangeVector* ranges)
110 : {
111 402 : switch (type) {
112 : case 's':
113 124 : AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
114 124 : break;
115 : case 'S':
116 24 : AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
117 24 : break;
118 : case 'w':
119 21 : AddClass(kWordRanges, kWordRangeCount, ranges);
120 21 : break;
121 : case 'W':
122 12 : AddClassNegated(kWordRanges, kWordRangeCount, ranges);
123 12 : break;
124 : case 'd':
125 80 : AddClass(kDigitRanges, kDigitRangeCount, ranges);
126 80 : break;
127 : case 'D':
128 0 : AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
129 0 : break;
130 : case '.':
131 123 : AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges);
132 123 : break;
133 : // This is not a character range as defined by the spec but a
134 : // convenient shorthand for a character class that matches any
135 : // character.
136 : case '*':
137 18 : ranges->append(CharacterRange::Everything());
138 18 : break;
139 : // This is the set of characters matched by the $ and ^ symbols
140 : // in multiline mode.
141 : case 'n':
142 0 : AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges);
143 0 : break;
144 : default:
145 0 : MOZ_CRASH("Bad character class escape");
146 : }
147 402 : }
148 :
149 : // Add class escape, excluding surrogate pair range.
150 : void
151 0 : CharacterRange::AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type,
152 : CharacterRangeVector* ranges, bool ignore_case)
153 : {
154 0 : switch (type) {
155 : case 's':
156 : case 'd':
157 0 : return AddClassEscape(alloc, type, ranges);
158 : break;
159 : case 'S':
160 0 : AddClassNegated(kSpaceAndSurrogateRanges, kSpaceAndSurrogateRangeCount, ranges);
161 0 : break;
162 : case 'w':
163 0 : if (ignore_case)
164 0 : AddClass(kIgnoreCaseWordRanges, kIgnoreCaseWordRangeCount, ranges);
165 : else
166 0 : AddClassEscape(alloc, type, ranges);
167 0 : break;
168 : case 'W':
169 0 : if (ignore_case) {
170 : AddClass(kNegatedIgnoreCaseWordAndSurrogateRanges,
171 0 : kNegatedIgnoreCaseWordAndSurrogateRangeCount, ranges);
172 : } else {
173 0 : AddClassNegated(kWordAndSurrogateRanges, kWordAndSurrogateRangeCount, ranges);
174 : }
175 0 : break;
176 : case 'D':
177 0 : AddClassNegated(kDigitAndSurrogateRanges, kDigitAndSurrogateRangeCount, ranges);
178 0 : break;
179 : default:
180 0 : MOZ_CRASH("Bad type!");
181 : }
182 : }
183 :
184 : static bool
185 0 : RangesContainLatin1Equivalents(const CharacterRangeVector& ranges, bool unicode)
186 : {
187 0 : for (size_t i = 0; i < ranges.length(); i++) {
188 : // TODO(dcarney): this could be a lot more efficient.
189 0 : if (RangeContainsLatin1Equivalents(ranges[i], unicode))
190 0 : return true;
191 : }
192 0 : return false;
193 : }
194 :
195 : static const size_t kEcma262UnCanonicalizeMaxWidth = 4;
196 :
197 : // Returns the number of characters in the equivalence class, omitting those
198 : // that cannot occur in the source string if it is a one byte string.
199 : static MOZ_ALWAYS_INLINE int
200 562 : GetCaseIndependentLetters(char16_t character,
201 : bool latin1_subject,
202 : bool unicode,
203 : const char16_t* choices,
204 : size_t choices_length,
205 : char16_t* letters)
206 : {
207 562 : size_t count = 0;
208 3372 : for (size_t i = 0; i < choices_length; i++) {
209 2810 : char16_t c = choices[i];
210 :
211 : // Skip characters that can't appear in one byte strings.
212 2810 : if (!unicode && latin1_subject && c > kMaxOneByteCharCode)
213 0 : continue;
214 :
215 : // Watch for duplicates.
216 2810 : bool found = false;
217 3212 : for (size_t j = 0; j < count; j++) {
218 2248 : if (letters[j] == c) {
219 1846 : found = true;
220 1846 : break;
221 : }
222 : }
223 2810 : if (found)
224 1846 : continue;
225 :
226 964 : letters[count++] = c;
227 : }
228 :
229 562 : return count;
230 : }
231 :
232 : static int
233 562 : GetCaseIndependentLetters(char16_t character,
234 : bool latin1_subject,
235 : bool unicode,
236 : char16_t* letters)
237 : {
238 562 : if (unicode) {
239 : const char16_t choices[] = {
240 : character,
241 0 : unicode::FoldCase(character),
242 0 : unicode::ReverseFoldCase1(character),
243 0 : unicode::ReverseFoldCase2(character),
244 0 : unicode::ReverseFoldCase3(character),
245 0 : };
246 0 : return GetCaseIndependentLetters(character, latin1_subject, unicode,
247 0 : choices, ArrayLength(choices), letters);
248 : }
249 :
250 562 : char16_t upper = unicode::ToUpperCase(character);
251 562 : unicode::CodepointsWithSameUpperCase others(character);
252 562 : char16_t other1 = others.other1();
253 562 : char16_t other2 = others.other2();
254 562 : char16_t other3 = others.other3();
255 :
256 : // ES 2017 draft 996af87b7072b3c3dd2b1def856c66f456102215 21.2.4.2
257 : // step 3.g.
258 : // The standard requires that non-ASCII characters cannot have ASCII
259 : // character codes in their equivalence class, even though this
260 : // situation occurs multiple times in the Unicode tables.
261 : static const unsigned kMaxAsciiCharCode = 127;
262 562 : if (upper <= kMaxAsciiCharCode) {
263 562 : if (character > kMaxAsciiCharCode) {
264 : // If Canonicalize(character) == character, all other characters
265 : // should be ignored.
266 0 : return GetCaseIndependentLetters(character, latin1_subject, unicode,
267 0 : &character, 1, letters);
268 : }
269 :
270 562 : if (other1 > kMaxAsciiCharCode)
271 0 : other1 = character;
272 562 : if (other2 > kMaxAsciiCharCode)
273 59 : other2 = character;
274 562 : if (other3 > kMaxAsciiCharCode)
275 0 : other3 = character;
276 : }
277 :
278 : const char16_t choices[] = {
279 : character,
280 : upper,
281 : other1,
282 : other2,
283 : other3
284 562 : };
285 562 : return GetCaseIndependentLetters(character, latin1_subject, unicode,
286 562 : choices, ArrayLength(choices), letters);
287 : }
288 :
289 : void
290 25 : CharacterRange::AddCaseEquivalents(bool is_latin1, bool unicode, CharacterRangeVector* ranges)
291 : {
292 25 : char16_t bottom = from();
293 25 : char16_t top = to();
294 :
295 25 : if (is_latin1 && !RangeContainsLatin1Equivalents(*this, unicode)) {
296 15 : if (bottom > kMaxOneByteCharCode)
297 0 : return;
298 15 : if (top > kMaxOneByteCharCode)
299 0 : top = kMaxOneByteCharCode;
300 : } else {
301 : // Nothing to do for surrogates.
302 10 : if (bottom >= unicode::LeadSurrogateMin && top <= unicode::TrailSurrogateMax)
303 0 : return;
304 : }
305 :
306 190 : for (char16_t c = bottom;; c++) {
307 : char16_t chars[kEcma262UnCanonicalizeMaxWidth];
308 190 : size_t length = GetCaseIndependentLetters(c, is_latin1, unicode, chars);
309 :
310 488 : for (size_t i = 0; i < length; i++) {
311 298 : char16_t other = chars[i];
312 298 : if (other == c)
313 190 : continue;
314 :
315 : // Try to combine with an existing range.
316 108 : bool found = false;
317 454 : for (size_t i = 0; i < ranges->length(); i++) {
318 448 : CharacterRange& range = (*ranges)[i];
319 448 : if (range.Contains(other)) {
320 52 : found = true;
321 52 : break;
322 396 : } else if (other == range.from() - 1) {
323 0 : range.set_from(other);
324 0 : found = true;
325 0 : break;
326 396 : } else if (other == range.to() + 1) {
327 50 : range.set_to(other);
328 50 : found = true;
329 50 : break;
330 : }
331 : }
332 :
333 108 : if (!found)
334 6 : ranges->append(CharacterRange::Singleton(other));
335 : }
336 :
337 190 : if (c == top)
338 25 : break;
339 165 : }
340 : }
341 :
342 : static bool
343 208 : CompareInverseRanges(const CharacterRangeVector& ranges, const int* special_class, size_t length)
344 : {
345 208 : length--; // Remove final 0x10000.
346 208 : MOZ_ASSERT(special_class[length] == 0x10000);
347 208 : MOZ_ASSERT(ranges.length() != 0);
348 208 : MOZ_ASSERT(length != 0);
349 208 : MOZ_ASSERT(special_class[0] != 0);
350 208 : if (ranges.length() != (length >> 1) + 1)
351 192 : return false;
352 16 : CharacterRange range = ranges[0];
353 16 : if (range.from() != 0)
354 7 : return false;
355 40 : for (size_t i = 0; i < length; i += 2) {
356 32 : if (special_class[i] != (range.to() + 1))
357 1 : return false;
358 31 : range = ranges[(i >> 1) + 1];
359 31 : if (special_class[i+1] != range.from())
360 0 : return false;
361 : }
362 8 : if (range.to() != 0xffff)
363 0 : return false;
364 8 : return true;
365 : }
366 :
367 : static bool
368 193 : CompareRanges(const CharacterRangeVector& ranges, const int* special_class, size_t length)
369 : {
370 193 : length--; // Remove final 0x10000.
371 193 : MOZ_ASSERT(special_class[length] == 0x10000);
372 193 : if (ranges.length() * 2 != length)
373 181 : return false;
374 32 : for (size_t i = 0; i < length; i += 2) {
375 30 : CharacterRange range = ranges[i >> 1];
376 30 : if (range.from() != special_class[i] || range.to() != special_class[i + 1] - 1)
377 10 : return false;
378 : }
379 2 : return true;
380 : }
381 :
382 : bool
383 96 : RegExpCharacterClass::is_standard(LifoAlloc* alloc)
384 : {
385 : // TODO(lrn): Remove need for this function, by not throwing away information
386 : // along the way.
387 96 : if (is_negated_)
388 15 : return false;
389 81 : if (set_.is_standard())
390 10 : return true;
391 71 : if (CompareRanges(set_.ranges(alloc), kSpaceRanges, kSpaceRangeCount)) {
392 2 : set_.set_standard_set_type('s');
393 2 : return true;
394 : }
395 69 : if (CompareInverseRanges(set_.ranges(alloc), kSpaceRanges, kSpaceRangeCount)) {
396 1 : set_.set_standard_set_type('S');
397 1 : return true;
398 : }
399 68 : if (CompareInverseRanges(set_.ranges(alloc),
400 : kLineTerminatorRanges,
401 : kLineTerminatorRangeCount)) {
402 7 : set_.set_standard_set_type('.');
403 7 : return true;
404 : }
405 61 : if (CompareRanges(set_.ranges(alloc),
406 : kLineTerminatorRanges,
407 : kLineTerminatorRangeCount)) {
408 0 : set_.set_standard_set_type('n');
409 0 : return true;
410 : }
411 61 : if (CompareRanges(set_.ranges(alloc), kWordRanges, kWordRangeCount)) {
412 0 : set_.set_standard_set_type('w');
413 0 : return true;
414 : }
415 61 : if (CompareInverseRanges(set_.ranges(alloc), kWordRanges, kWordRangeCount)) {
416 0 : set_.set_standard_set_type('W');
417 0 : return true;
418 : }
419 61 : return false;
420 : }
421 :
422 : bool
423 207 : CharacterRange::IsCanonical(const CharacterRangeVector& ranges)
424 : {
425 207 : int n = ranges.length();
426 207 : if (n <= 1)
427 100 : return true;
428 :
429 107 : int max = ranges[0].to();
430 416 : for (int i = 1; i < n; i++) {
431 329 : CharacterRange next_range = ranges[i];
432 329 : if (next_range.from() <= max + 1)
433 20 : return false;
434 309 : max = next_range.to();
435 : }
436 87 : return true;
437 : }
438 :
439 : // Move a number of elements in a zonelist to another position
440 : // in the same list. Handles overlapping source and target areas.
441 : static
442 24 : void MoveRanges(CharacterRangeVector& list, int from, int to, int count)
443 : {
444 : // Ranges are potentially overlapping.
445 24 : if (from < to) {
446 68 : for (int i = count - 1; i >= 0; i--)
447 45 : list[to + i] = list[from + i];
448 : } else {
449 4 : for (int i = 0; i < count; i++)
450 3 : list[to + i] = list[from + i];
451 : }
452 24 : }
453 :
454 : static int
455 53 : InsertRangeInCanonicalList(CharacterRangeVector& list,
456 : int count,
457 : CharacterRange insert)
458 : {
459 : // Inserts a range into list[0..count[, which must be sorted
460 : // by from value and non-overlapping and non-adjacent, using at most
461 : // list[0..count] for the result. Returns the number of resulting
462 : // canonicalized ranges. Inserting a range may collapse existing ranges into
463 : // fewer ranges, so the return value can be anything in the range 1..count+1.
464 53 : char16_t from = insert.from();
465 53 : char16_t to = insert.to();
466 53 : int start_pos = 0;
467 53 : int end_pos = count;
468 140 : for (int i = count - 1; i >= 0; i--) {
469 120 : CharacterRange current = list[i];
470 120 : if (current.from() > to + 1) {
471 65 : end_pos = i;
472 55 : } else if (current.to() + 1 < from) {
473 33 : start_pos = i + 1;
474 33 : break;
475 : }
476 : }
477 :
478 : // Inserted range overlaps, or is adjacent to, ranges at positions
479 : // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
480 : // not affected by the insertion.
481 : // If start_pos == end_pos, the range must be inserted before start_pos.
482 : // if start_pos < end_pos, the entire range from start_pos to end_pos
483 : // must be merged with the insert range.
484 :
485 53 : if (start_pos == end_pos) {
486 : // Insert between existing ranges at position start_pos.
487 32 : if (start_pos < count) {
488 23 : MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
489 : }
490 32 : list[start_pos] = insert;
491 32 : return count + 1;
492 : }
493 21 : if (start_pos + 1 == end_pos) {
494 : // Replace single existing range at position start_pos.
495 20 : CharacterRange to_replace = list[start_pos];
496 20 : int new_from = Min(to_replace.from(), from);
497 20 : int new_to = Max(to_replace.to(), to);
498 20 : list[start_pos] = CharacterRange(new_from, new_to);
499 20 : return count;
500 : }
501 : // Replace a number of existing ranges from start_pos to end_pos - 1.
502 : // Move the remaining ranges down.
503 :
504 1 : int new_from = Min(list[start_pos].from(), from);
505 1 : int new_to = Max(list[end_pos - 1].to(), to);
506 1 : if (end_pos < count) {
507 1 : MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
508 : }
509 1 : list[start_pos] = CharacterRange(new_from, new_to);
510 1 : return count - (end_pos - start_pos) + 1;
511 : }
512 :
513 : void
514 20 : CharacterRange::Canonicalize(CharacterRangeVector& character_ranges)
515 : {
516 20 : if (character_ranges.length() <= 1) return;
517 : // Check whether ranges are already canonical (increasing, non-overlapping,
518 : // non-adjacent).
519 20 : int n = character_ranges.length();
520 20 : int max = character_ranges[0].to();
521 20 : int i = 1;
522 56 : while (i < n) {
523 38 : CharacterRange current = character_ranges[i];
524 38 : if (current.from() <= max + 1) {
525 20 : break;
526 : }
527 18 : max = current.to();
528 18 : i++;
529 : }
530 : // Canonical until the i'th range. If that's all of them, we are done.
531 20 : if (i == n) return;
532 :
533 : // The ranges at index i and forward are not canonicalized. Make them so by
534 : // doing the equivalent of insertion sort (inserting each into the previous
535 : // list, in order).
536 : // Notice that inserting a range can reduce the number of ranges in the
537 : // result due to combining of adjacent and overlapping ranges.
538 20 : int read = i; // Range to insert.
539 20 : size_t num_canonical = i; // Length of canonicalized part of list.
540 33 : do {
541 53 : num_canonical = InsertRangeInCanonicalList(character_ranges,
542 : num_canonical,
543 53 : character_ranges[read]);
544 53 : read++;
545 53 : } while (read < n);
546 :
547 22 : while (character_ranges.length() > num_canonical)
548 22 : character_ranges.popBack();
549 :
550 20 : MOZ_ASSERT(CharacterRange::IsCanonical(character_ranges));
551 : }
552 :
553 : // -------------------------------------------------------------------
554 : // SeqRegExpNode
555 :
556 : class VisitMarker
557 : {
558 : public:
559 473 : explicit VisitMarker(NodeInfo* info)
560 473 : : info_(info)
561 : {
562 473 : MOZ_ASSERT(!info->visited);
563 473 : info->visited = true;
564 473 : }
565 946 : ~VisitMarker() {
566 473 : info_->visited = false;
567 473 : }
568 : private:
569 : NodeInfo* info_;
570 : };
571 :
572 : bool
573 0 : SeqRegExpNode::FillInBMInfo(int offset,
574 : int budget,
575 : BoyerMooreLookahead* bm,
576 : bool not_at_start)
577 : {
578 0 : if (!bm->CheckOverRecursed())
579 0 : return false;
580 0 : if (!on_success_->FillInBMInfo(offset, budget - 1, bm, not_at_start))
581 0 : return false;
582 0 : if (offset == 0)
583 0 : set_bm_info(not_at_start, bm);
584 0 : return true;
585 : }
586 :
587 : RegExpNode*
588 249 : SeqRegExpNode::FilterLATIN1(int depth, bool ignore_case, bool unicode)
589 : {
590 249 : if (info()->replacement_calculated)
591 86 : return replacement();
592 :
593 163 : if (depth < 0)
594 0 : return this;
595 :
596 163 : MOZ_ASSERT(!info()->visited);
597 326 : VisitMarker marker(info());
598 163 : return FilterSuccessor(depth - 1, ignore_case, unicode);
599 : }
600 :
601 : RegExpNode*
602 330 : SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case, bool unicode)
603 : {
604 330 : RegExpNode* next = on_success_->FilterLATIN1(depth - 1, ignore_case, unicode);
605 330 : if (next == nullptr)
606 1 : return set_replacement(nullptr);
607 :
608 329 : on_success_ = next;
609 329 : return set_replacement(this);
610 : }
611 :
612 : // -------------------------------------------------------------------
613 : // ActionNode
614 :
615 : int
616 256 : ActionNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start)
617 : {
618 256 : if (budget <= 0)
619 10 : return 0;
620 246 : if (action_type_ == POSITIVE_SUBMATCH_SUCCESS)
621 10 : return 0; // Rewinds input!
622 472 : return on_success()->EatsAtLeast(still_to_find,
623 : budget - 1,
624 472 : not_at_start);
625 : }
626 :
627 : bool
628 23 : ActionNode::FillInBMInfo(int offset,
629 : int budget,
630 : BoyerMooreLookahead* bm,
631 : bool not_at_start)
632 : {
633 23 : if (!bm->CheckOverRecursed())
634 0 : return false;
635 :
636 23 : if (action_type_ == BEGIN_SUBMATCH) {
637 0 : bm->SetRest(offset);
638 23 : } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
639 23 : if (!on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start))
640 0 : return false;
641 : }
642 23 : SaveBMInfo(bm, not_at_start, offset);
643 :
644 23 : return true;
645 : }
646 :
647 : /* static */ ActionNode*
648 11 : ActionNode::SetRegister(int reg,
649 : int val,
650 : RegExpNode* on_success)
651 : {
652 11 : ActionNode* result = on_success->alloc()->newInfallible<ActionNode>(SET_REGISTER, on_success);
653 11 : result->data_.u_store_register.reg = reg;
654 11 : result->data_.u_store_register.value = val;
655 11 : return result;
656 : }
657 :
658 : /* static */ ActionNode*
659 11 : ActionNode::IncrementRegister(int reg, RegExpNode* on_success)
660 : {
661 11 : ActionNode* result = on_success->alloc()->newInfallible<ActionNode>(INCREMENT_REGISTER, on_success);
662 11 : result->data_.u_increment_register.reg = reg;
663 11 : return result;
664 : }
665 :
666 : /* static */ ActionNode*
667 128 : ActionNode::StorePosition(int reg, bool is_capture, RegExpNode* on_success)
668 : {
669 128 : ActionNode* result = on_success->alloc()->newInfallible<ActionNode>(STORE_POSITION, on_success);
670 128 : result->data_.u_position_register.reg = reg;
671 128 : result->data_.u_position_register.is_capture = is_capture;
672 128 : return result;
673 : }
674 :
675 : /* static */ ActionNode*
676 6 : ActionNode::ClearCaptures(Interval range, RegExpNode* on_success)
677 : {
678 6 : ActionNode* result = on_success->alloc()->newInfallible<ActionNode>(CLEAR_CAPTURES, on_success);
679 6 : result->data_.u_clear_captures.range_from = range.from();
680 6 : result->data_.u_clear_captures.range_to = range.to();
681 6 : return result;
682 : }
683 :
684 : /* static */ ActionNode*
685 12 : ActionNode::BeginSubmatch(int stack_pointer_reg, int position_reg, RegExpNode* on_success)
686 : {
687 12 : ActionNode* result = on_success->alloc()->newInfallible<ActionNode>(BEGIN_SUBMATCH, on_success);
688 12 : result->data_.u_submatch.stack_pointer_register = stack_pointer_reg;
689 12 : result->data_.u_submatch.current_position_register = position_reg;
690 12 : return result;
691 : }
692 :
693 : /* static */ ActionNode*
694 10 : ActionNode::PositiveSubmatchSuccess(int stack_pointer_reg,
695 : int restore_reg,
696 : int clear_capture_count,
697 : int clear_capture_from,
698 : RegExpNode* on_success)
699 : {
700 10 : ActionNode* result = on_success->alloc()->newInfallible<ActionNode>(POSITIVE_SUBMATCH_SUCCESS, on_success);
701 10 : result->data_.u_submatch.stack_pointer_register = stack_pointer_reg;
702 10 : result->data_.u_submatch.current_position_register = restore_reg;
703 10 : result->data_.u_submatch.clear_register_count = clear_capture_count;
704 10 : result->data_.u_submatch.clear_register_from = clear_capture_from;
705 10 : return result;
706 : }
707 :
708 : /* static */ ActionNode*
709 0 : ActionNode::EmptyMatchCheck(int start_register,
710 : int repetition_register,
711 : int repetition_limit,
712 : RegExpNode* on_success)
713 : {
714 0 : ActionNode* result = on_success->alloc()->newInfallible<ActionNode>(EMPTY_MATCH_CHECK, on_success);
715 0 : result->data_.u_empty_match_check.start_register = start_register;
716 0 : result->data_.u_empty_match_check.repetition_register = repetition_register;
717 0 : result->data_.u_empty_match_check.repetition_limit = repetition_limit;
718 0 : return result;
719 : }
720 :
721 : // -------------------------------------------------------------------
722 : // TextNode
723 :
724 : int
725 183 : TextNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start)
726 : {
727 183 : int answer = Length();
728 183 : if (answer >= still_to_find)
729 59 : return answer;
730 124 : if (budget <= 0)
731 0 : return answer;
732 :
733 : // We are not at start after this node so we set the last argument to 'true'.
734 248 : return answer + on_success()->EatsAtLeast(still_to_find - answer,
735 : budget - 1,
736 248 : true);
737 : }
738 :
739 : int
740 87 : TextNode::GreedyLoopTextLength()
741 : {
742 87 : TextElement elm = elements()[elements().length() - 1];
743 87 : return elm.cp_offset() + elm.length();
744 : }
745 :
746 : RegExpNode*
747 177 : TextNode::FilterLATIN1(int depth, bool ignore_case, bool unicode)
748 : {
749 177 : if (info()->replacement_calculated)
750 7 : return replacement();
751 :
752 170 : if (depth < 0)
753 0 : return this;
754 :
755 170 : MOZ_ASSERT(!info()->visited);
756 340 : VisitMarker marker(info());
757 170 : int element_count = elements().length();
758 337 : for (int i = 0; i < element_count; i++) {
759 170 : TextElement elm = elements()[i];
760 170 : if (elm.text_type() == TextElement::ATOM) {
761 103 : CharacterVector& quarks = const_cast<CharacterVector&>(elm.atom()->data());
762 675 : for (size_t j = 0; j < quarks.length(); j++) {
763 573 : uint16_t c = quarks[j];
764 573 : if (c <= kMaxOneByteCharCode)
765 572 : continue;
766 1 : if (!ignore_case)
767 4 : return set_replacement(nullptr);
768 :
769 : // Here, we need to check for characters whose upper and lower cases
770 : // are outside the Latin-1 range.
771 0 : char16_t converted = ConvertNonLatin1ToLatin1(c, unicode);
772 0 : if (converted == 0) {
773 : // Character is outside Latin-1 completely
774 0 : return set_replacement(nullptr);
775 : }
776 :
777 : // Convert quark to Latin-1 in place.
778 0 : quarks[j] = converted;
779 : }
780 : } else {
781 67 : MOZ_ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
782 67 : RegExpCharacterClass* cc = elm.char_class();
783 :
784 67 : CharacterRangeVector& ranges = cc->ranges(alloc());
785 67 : if (!CharacterRange::IsCanonical(ranges))
786 14 : CharacterRange::Canonicalize(ranges);
787 :
788 : // Now they are in order so we only need to look at the first.
789 67 : int range_count = ranges.length();
790 67 : if (cc->is_negated()) {
791 20 : if (range_count != 0 &&
792 10 : ranges[0].from() == 0 &&
793 0 : ranges[0].to() >= kMaxOneByteCharCode)
794 : {
795 : // This will be handled in a later filter.
796 0 : if (ignore_case && RangesContainLatin1Equivalents(ranges, unicode))
797 0 : continue;
798 0 : return set_replacement(nullptr);
799 : }
800 : } else {
801 114 : if (range_count == 0 ||
802 57 : ranges[0].from() > kMaxOneByteCharCode)
803 : {
804 : // This will be handled in a later filter.
805 2 : if (ignore_case && RangesContainLatin1Equivalents(ranges, unicode))
806 0 : continue;
807 2 : return set_replacement(nullptr);
808 : }
809 : }
810 : }
811 : }
812 167 : return FilterSuccessor(depth - 1, ignore_case, unicode);
813 : }
814 :
815 : void
816 211 : TextNode::CalculateOffsets()
817 : {
818 211 : int element_count = elements().length();
819 :
820 : // Set up the offsets of the elements relative to the start. This is a fixed
821 : // quantity since a TextNode can only contain fixed-width things.
822 211 : int cp_offset = 0;
823 423 : for (int i = 0; i < element_count; i++) {
824 212 : TextElement& elm = elements()[i];
825 212 : elm.set_cp_offset(cp_offset);
826 212 : cp_offset += elm.length();
827 : }
828 211 : }
829 :
830 62 : void TextNode::MakeCaseIndependent(bool is_latin1, bool unicode)
831 : {
832 62 : int element_count = elements().length();
833 124 : for (int i = 0; i < element_count; i++) {
834 62 : TextElement elm = elements()[i];
835 62 : if (elm.text_type() == TextElement::CHAR_CLASS) {
836 14 : RegExpCharacterClass* cc = elm.char_class();
837 :
838 : // None of the standard character classes is different in the case
839 : // independent case and it slows us down if we don't know that.
840 14 : if (cc->is_standard(alloc()))
841 8 : continue;
842 :
843 : // Similarly, there's nothing to do for the character class
844 : // containing all characters except line terminators and surrogates.
845 : // This one is added by UnicodeEverythingAtom.
846 10 : CharacterRangeVector& ranges = cc->ranges(alloc());
847 10 : if (CompareInverseRanges(ranges,
848 : kLineTerminatorAndSurrogateRanges,
849 : kLineTerminatorAndSurrogateRangeCount))
850 : {
851 0 : continue;
852 : }
853 :
854 10 : int range_count = ranges.length();
855 35 : for (int j = 0; j < range_count; j++)
856 25 : ranges[j].AddCaseEquivalents(is_latin1, unicode, &ranges);
857 : }
858 : }
859 62 : }
860 :
861 : // -------------------------------------------------------------------
862 : // AssertionNode
863 :
864 : int
865 47 : AssertionNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start)
866 : {
867 47 : if (budget <= 0)
868 0 : return 0;
869 :
870 : // If we know we are not at the start and we are asked "how many characters
871 : // will you match if you succeed?" then we can answer anything since false
872 : // implies false. So lets just return the max answer (still_to_find) since
873 : // that won't prevent us from preloading a lot of characters for the other
874 : // branches in the node graph.
875 47 : if (assertion_type() == AT_START && not_at_start)
876 0 : return still_to_find;
877 :
878 47 : return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start);
879 : }
880 :
881 : bool
882 3 : AssertionNode::FillInBMInfo(int offset, int budget, BoyerMooreLookahead* bm, bool not_at_start)
883 : {
884 3 : if (!bm->CheckOverRecursed())
885 0 : return false;
886 :
887 : // Match the behaviour of EatsAtLeast on this node.
888 3 : if (assertion_type() == AT_START && not_at_start)
889 0 : return true;
890 :
891 3 : if (!on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start))
892 0 : return false;
893 3 : SaveBMInfo(bm, not_at_start, offset);
894 3 : return true;
895 : }
896 :
897 : // -------------------------------------------------------------------
898 : // BackReferenceNode
899 :
900 : int
901 0 : BackReferenceNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start)
902 : {
903 0 : if (budget <= 0)
904 0 : return 0;
905 0 : return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start);
906 : }
907 :
908 : bool
909 0 : BackReferenceNode::FillInBMInfo(int offset, int budget, BoyerMooreLookahead* bm, bool not_at_start)
910 : {
911 : // Working out the set of characters that a backreference can match is too
912 : // hard, so we just say that any character can match.
913 0 : bm->SetRest(offset);
914 0 : SaveBMInfo(bm, not_at_start, offset);
915 0 : return true;
916 : }
917 :
918 : // -------------------------------------------------------------------
919 : // ChoiceNode
920 :
921 : int
922 180 : ChoiceNode::EatsAtLeastHelper(int still_to_find,
923 : int budget,
924 : RegExpNode* ignore_this_node,
925 : bool not_at_start)
926 : {
927 180 : if (budget <= 0)
928 0 : return 0;
929 :
930 180 : int min = 100;
931 180 : size_t choice_count = alternatives().length();
932 180 : budget = (budget - 1) / choice_count;
933 490 : for (size_t i = 0; i < choice_count; i++) {
934 403 : RegExpNode* node = alternatives()[i].node();
935 403 : if (node == ignore_this_node) continue;
936 : int node_eats_at_least =
937 279 : node->EatsAtLeast(still_to_find, budget, not_at_start);
938 279 : if (node_eats_at_least < min)
939 221 : min = node_eats_at_least;
940 279 : if (min == 0)
941 93 : return 0;
942 : }
943 87 : return min;
944 : }
945 :
946 : int
947 52 : ChoiceNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start)
948 : {
949 52 : return EatsAtLeastHelper(still_to_find,
950 : budget,
951 : nullptr,
952 52 : not_at_start);
953 : }
954 :
955 : void
956 37 : ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
957 : RegExpCompiler* compiler,
958 : int characters_filled_in,
959 : bool not_at_start)
960 : {
961 37 : not_at_start = (not_at_start || not_at_start_);
962 37 : int choice_count = alternatives().length();
963 37 : MOZ_ASSERT(choice_count > 0);
964 74 : alternatives()[0].node()->GetQuickCheckDetails(details,
965 : compiler,
966 : characters_filled_in,
967 74 : not_at_start);
968 83 : for (int i = 1; i < choice_count; i++) {
969 46 : QuickCheckDetails new_details(details->characters());
970 46 : RegExpNode* node = alternatives()[i].node();
971 46 : node->GetQuickCheckDetails(&new_details, compiler,
972 : characters_filled_in,
973 92 : not_at_start);
974 : // Here we merge the quick match details of the two branches.
975 46 : details->Merge(&new_details, characters_filled_in);
976 : }
977 37 : }
978 :
979 : bool
980 4 : ChoiceNode::FillInBMInfo(int offset,
981 : int budget,
982 : BoyerMooreLookahead* bm,
983 : bool not_at_start)
984 : {
985 4 : if (!bm->CheckOverRecursed())
986 0 : return false;
987 :
988 4 : const GuardedAlternativeVector& alts = alternatives();
989 4 : budget = (budget - 1) / alts.length();
990 21 : for (size_t i = 0; i < alts.length(); i++) {
991 17 : const GuardedAlternative& alt = alts[i];
992 17 : if (alt.guards() != nullptr && alt.guards()->length() != 0) {
993 0 : bm->SetRest(offset); // Give up trying to fill in info.
994 0 : SaveBMInfo(bm, not_at_start, offset);
995 0 : return true;
996 : }
997 17 : if (!alt.node()->FillInBMInfo(offset, budget, bm, not_at_start))
998 0 : return false;
999 : }
1000 4 : SaveBMInfo(bm, not_at_start, offset);
1001 4 : return true;
1002 : }
1003 :
1004 : RegExpNode*
1005 84 : ChoiceNode::FilterLATIN1(int depth, bool ignore_case, bool unicode)
1006 : {
1007 84 : if (info()->replacement_calculated)
1008 19 : return replacement();
1009 65 : if (depth < 0)
1010 0 : return this;
1011 65 : if (info()->visited)
1012 0 : return this;
1013 130 : VisitMarker marker(info());
1014 65 : int choice_count = alternatives().length();
1015 :
1016 232 : for (int i = 0; i < choice_count; i++) {
1017 173 : const GuardedAlternative alternative = alternatives()[i];
1018 173 : if (alternative.guards() != nullptr && alternative.guards()->length() != 0) {
1019 6 : set_replacement(this);
1020 6 : return this;
1021 : }
1022 : }
1023 :
1024 59 : int surviving = 0;
1025 59 : RegExpNode* survivor = nullptr;
1026 226 : for (int i = 0; i < choice_count; i++) {
1027 167 : GuardedAlternative alternative = alternatives()[i];
1028 : RegExpNode* replacement =
1029 167 : alternative.node()->FilterLATIN1(depth - 1, ignore_case, unicode);
1030 167 : MOZ_ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
1031 167 : if (replacement != nullptr) {
1032 165 : alternatives()[i].set_node(replacement);
1033 165 : surviving++;
1034 165 : survivor = replacement;
1035 : }
1036 : }
1037 59 : if (surviving < 2)
1038 1 : return set_replacement(survivor);
1039 :
1040 58 : set_replacement(this);
1041 58 : if (surviving == choice_count)
1042 58 : return this;
1043 :
1044 : // Only some of the nodes survived the filtering. We need to rebuild the
1045 : // alternatives list.
1046 0 : GuardedAlternativeVector new_alternatives(*alloc());
1047 0 : new_alternatives.reserve(surviving);
1048 0 : for (int i = 0; i < choice_count; i++) {
1049 : RegExpNode* replacement =
1050 0 : alternatives()[i].node()->FilterLATIN1(depth - 1, ignore_case, unicode);
1051 0 : if (replacement != nullptr) {
1052 0 : alternatives()[i].set_node(replacement);
1053 0 : new_alternatives.append(alternatives()[i]);
1054 : }
1055 : }
1056 :
1057 0 : alternatives_ = Move(new_alternatives);
1058 0 : return this;
1059 : }
1060 :
1061 : // -------------------------------------------------------------------
1062 : // NegativeLookaheadChoiceNode
1063 :
1064 : bool
1065 0 : NegativeLookaheadChoiceNode::FillInBMInfo(int offset,
1066 : int budget,
1067 : BoyerMooreLookahead* bm,
1068 : bool not_at_start)
1069 : {
1070 0 : if (!bm->CheckOverRecursed())
1071 0 : return false;
1072 :
1073 0 : if (!alternatives()[1].node()->FillInBMInfo(offset, budget - 1, bm, not_at_start))
1074 0 : return false;
1075 0 : if (offset == 0)
1076 0 : set_bm_info(not_at_start, bm);
1077 0 : return true;
1078 : }
1079 :
1080 : int
1081 6 : NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start)
1082 : {
1083 6 : if (budget <= 0)
1084 0 : return 0;
1085 :
1086 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
1087 : // afterwards.
1088 6 : RegExpNode* node = alternatives()[1].node();
1089 6 : return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
1090 : }
1091 :
1092 : void
1093 0 : NegativeLookaheadChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
1094 : RegExpCompiler* compiler,
1095 : int filled_in,
1096 : bool not_at_start)
1097 : {
1098 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
1099 : // afterwards.
1100 0 : RegExpNode* node = alternatives()[1].node();
1101 0 : return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1102 : }
1103 :
1104 : RegExpNode*
1105 2 : NegativeLookaheadChoiceNode::FilterLATIN1(int depth, bool ignore_case, bool unicode)
1106 : {
1107 2 : if (info()->replacement_calculated)
1108 0 : return replacement();
1109 2 : if (depth < 0)
1110 0 : return this;
1111 2 : if (info()->visited)
1112 0 : return this;
1113 :
1114 4 : VisitMarker marker(info());
1115 :
1116 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
1117 : // afterwards.
1118 2 : RegExpNode* node = alternatives()[1].node();
1119 2 : RegExpNode* replacement = node->FilterLATIN1(depth - 1, ignore_case, unicode);
1120 :
1121 2 : if (replacement == nullptr)
1122 0 : return set_replacement(nullptr);
1123 2 : alternatives()[1].set_node(replacement);
1124 :
1125 2 : RegExpNode* neg_node = alternatives()[0].node();
1126 2 : RegExpNode* neg_replacement = neg_node->FilterLATIN1(depth - 1, ignore_case, unicode);
1127 :
1128 : // If the negative lookahead is always going to fail then
1129 : // we don't need to check it.
1130 2 : if (neg_replacement == nullptr)
1131 0 : return set_replacement(replacement);
1132 :
1133 2 : alternatives()[0].set_node(neg_replacement);
1134 2 : return set_replacement(this);
1135 : }
1136 :
1137 : // -------------------------------------------------------------------
1138 : // LoopChoiceNode
1139 :
1140 : void
1141 16 : GuardedAlternative::AddGuard(LifoAlloc* alloc, Guard* guard)
1142 : {
1143 16 : if (guards_ == nullptr)
1144 16 : guards_ = alloc->newInfallible<GuardVector>(*alloc);
1145 16 : guards_->append(guard);
1146 16 : }
1147 :
1148 : void
1149 57 : LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt)
1150 : {
1151 57 : MOZ_ASSERT(loop_node_ == nullptr);
1152 57 : AddAlternative(alt);
1153 57 : loop_node_ = alt.node();
1154 57 : }
1155 :
1156 :
1157 : void
1158 57 : LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt)
1159 : {
1160 57 : MOZ_ASSERT(continue_node_ == nullptr);
1161 57 : AddAlternative(alt);
1162 57 : continue_node_ = alt.node();
1163 57 : }
1164 :
1165 : int
1166 128 : LoopChoiceNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start)
1167 : {
1168 128 : return EatsAtLeastHelper(still_to_find,
1169 : budget - 1,
1170 : loop_node_,
1171 128 : not_at_start);
1172 : }
1173 :
1174 : void
1175 35 : LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
1176 : RegExpCompiler* compiler,
1177 : int characters_filled_in,
1178 : bool not_at_start)
1179 : {
1180 35 : if (body_can_be_zero_length_ || info()->visited)
1181 4 : return;
1182 62 : VisitMarker marker(info());
1183 31 : return ChoiceNode::GetQuickCheckDetails(details,
1184 : compiler,
1185 : characters_filled_in,
1186 31 : not_at_start);
1187 : }
1188 :
1189 : bool
1190 2 : LoopChoiceNode::FillInBMInfo(int offset,
1191 : int budget,
1192 : BoyerMooreLookahead* bm,
1193 : bool not_at_start)
1194 : {
1195 2 : if (body_can_be_zero_length_ || budget <= 0) {
1196 0 : bm->SetRest(offset);
1197 0 : SaveBMInfo(bm, not_at_start, offset);
1198 0 : return true;
1199 : }
1200 2 : if (!ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start))
1201 0 : return false;
1202 2 : SaveBMInfo(bm, not_at_start, offset);
1203 2 : return true;
1204 : }
1205 :
1206 : RegExpNode*
1207 95 : LoopChoiceNode::FilterLATIN1(int depth, bool ignore_case, bool unicode)
1208 : {
1209 95 : if (info()->replacement_calculated)
1210 18 : return replacement();
1211 77 : if (depth < 0)
1212 0 : return this;
1213 77 : if (info()->visited)
1214 35 : return this;
1215 :
1216 : {
1217 83 : VisitMarker marker(info());
1218 :
1219 : RegExpNode* continue_replacement =
1220 42 : continue_node_->FilterLATIN1(depth - 1, ignore_case, unicode);
1221 :
1222 : // If we can't continue after the loop then there is no sense in doing the
1223 : // loop.
1224 42 : if (continue_replacement == nullptr)
1225 1 : return set_replacement(nullptr);
1226 : }
1227 :
1228 41 : return ChoiceNode::FilterLATIN1(depth - 1, ignore_case, unicode);
1229 : }
1230 :
1231 : // -------------------------------------------------------------------
1232 : // Analysis
1233 :
1234 : void
1235 687 : Analysis::EnsureAnalyzed(RegExpNode* that)
1236 : {
1237 687 : if (!CheckRecursionLimit(cx)) {
1238 0 : failASCII("Stack overflow");
1239 0 : return;
1240 : }
1241 :
1242 687 : if (that->info()->been_analyzed || that->info()->being_analyzed)
1243 130 : return;
1244 557 : that->info()->being_analyzed = true;
1245 557 : that->Accept(this);
1246 557 : that->info()->being_analyzed = false;
1247 557 : that->info()->been_analyzed = true;
1248 : }
1249 :
1250 : void
1251 42 : Analysis::VisitEnd(EndNode* that)
1252 : {
1253 : // nothing to do
1254 42 : }
1255 :
1256 : void
1257 211 : Analysis::VisitText(TextNode* that)
1258 : {
1259 211 : if (ignore_case_)
1260 62 : that->MakeCaseIndependent(is_latin1_, unicode_);
1261 211 : EnsureAnalyzed(that->on_success());
1262 211 : if (!has_failed()) {
1263 211 : that->CalculateOffsets();
1264 : }
1265 211 : }
1266 :
1267 : void
1268 176 : Analysis::VisitAction(ActionNode* that)
1269 : {
1270 176 : RegExpNode* target = that->on_success();
1271 176 : EnsureAnalyzed(target);
1272 :
1273 176 : if (!has_failed()) {
1274 : // If the next node is interested in what it follows then this node
1275 : // has to be interested too so it can pass the information on.
1276 176 : that->info()->AddFromFollowing(target->info());
1277 : }
1278 176 : }
1279 :
1280 : void
1281 28 : Analysis::VisitChoice(ChoiceNode* that)
1282 : {
1283 28 : NodeInfo* info = that->info();
1284 :
1285 132 : for (size_t i = 0; i < that->alternatives().length(); i++) {
1286 104 : RegExpNode* node = that->alternatives()[i].node();
1287 104 : EnsureAnalyzed(node);
1288 104 : if (has_failed()) return;
1289 :
1290 : // Anything the following nodes need to know has to be known by
1291 : // this node also, so it can pass it on.
1292 104 : info->AddFromFollowing(node->info());
1293 : }
1294 : }
1295 :
1296 : void
1297 56 : Analysis::VisitLoopChoice(LoopChoiceNode* that)
1298 : {
1299 56 : NodeInfo* info = that->info();
1300 168 : for (size_t i = 0; i < that->alternatives().length(); i++) {
1301 112 : RegExpNode* node = that->alternatives()[i].node();
1302 112 : if (node != that->loop_node()) {
1303 56 : EnsureAnalyzed(node);
1304 56 : if (has_failed()) return;
1305 56 : info->AddFromFollowing(node->info());
1306 : }
1307 : }
1308 :
1309 : // Check the loop last since it may need the value of this node
1310 : // to get a correct result.
1311 56 : EnsureAnalyzed(that->loop_node());
1312 56 : if (!has_failed())
1313 56 : info->AddFromFollowing(that->loop_node()->info());
1314 : }
1315 :
1316 : void
1317 0 : Analysis::VisitBackReference(BackReferenceNode* that)
1318 : {
1319 0 : EnsureAnalyzed(that->on_success());
1320 0 : }
1321 :
1322 : void
1323 44 : Analysis::VisitAssertion(AssertionNode* that)
1324 : {
1325 44 : EnsureAnalyzed(that->on_success());
1326 44 : }
1327 :
1328 : // -------------------------------------------------------------------
1329 : // Implementation of the Irregexp regular expression engine.
1330 : //
1331 : // The Irregexp regular expression engine is intended to be a complete
1332 : // implementation of ECMAScript regular expressions. It generates either
1333 : // bytecodes or native code.
1334 :
1335 : // The Irregexp regexp engine is structured in three steps.
1336 : // 1) The parser generates an abstract syntax tree. See RegExpAST.cpp.
1337 : // 2) From the AST a node network is created. The nodes are all
1338 : // subclasses of RegExpNode. The nodes represent states when
1339 : // executing a regular expression. Several optimizations are
1340 : // performed on the node network.
1341 : // 3) From the nodes we generate either byte codes or native code
1342 : // that can actually execute the regular expression (perform
1343 : // the search). The code generation step is described in more
1344 : // detail below.
1345 :
1346 : // Code generation.
1347 : //
1348 : // The nodes are divided into four main categories.
1349 : // * Choice nodes
1350 : // These represent places where the regular expression can
1351 : // match in more than one way. For example on entry to an
1352 : // alternation (foo|bar) or a repetition (*, +, ? or {}).
1353 : // * Action nodes
1354 : // These represent places where some action should be
1355 : // performed. Examples include recording the current position
1356 : // in the input string to a register (in order to implement
1357 : // captures) or other actions on register for example in order
1358 : // to implement the counters needed for {} repetitions.
1359 : // * Matching nodes
1360 : // These attempt to match some element part of the input string.
1361 : // Examples of elements include character classes, plain strings
1362 : // or back references.
1363 : // * End nodes
1364 : // These are used to implement the actions required on finding
1365 : // a successful match or failing to find a match.
1366 : //
1367 : // The code generated (whether as byte codes or native code) maintains
1368 : // some state as it runs. This consists of the following elements:
1369 : //
1370 : // * The capture registers. Used for string captures.
1371 : // * Other registers. Used for counters etc.
1372 : // * The current position.
1373 : // * The stack of backtracking information. Used when a matching node
1374 : // fails to find a match and needs to try an alternative.
1375 : //
1376 : // Conceptual regular expression execution model:
1377 : //
1378 : // There is a simple conceptual model of regular expression execution
1379 : // which will be presented first. The actual code generated is a more
1380 : // efficient simulation of the simple conceptual model:
1381 : //
1382 : // * Choice nodes are implemented as follows:
1383 : // For each choice except the last {
1384 : // push current position
1385 : // push backtrack code location
1386 : // <generate code to test for choice>
1387 : // backtrack code location:
1388 : // pop current position
1389 : // }
1390 : // <generate code to test for last choice>
1391 : //
1392 : // * Actions nodes are generated as follows
1393 : // <push affected registers on backtrack stack>
1394 : // <generate code to perform action>
1395 : // push backtrack code location
1396 : // <generate code to test for following nodes>
1397 : // backtrack code location:
1398 : // <pop affected registers to restore their state>
1399 : // <pop backtrack location from stack and go to it>
1400 : //
1401 : // * Matching nodes are generated as follows:
1402 : // if input string matches at current position
1403 : // update current position
1404 : // <generate code to test for following nodes>
1405 : // else
1406 : // <pop backtrack location from stack and go to it>
1407 : //
1408 : // Thus it can be seen that the current position is saved and restored
1409 : // by the choice nodes, whereas the registers are saved and restored by
1410 : // by the action nodes that manipulate them.
1411 : //
1412 : // The other interesting aspect of this model is that nodes are generated
1413 : // at the point where they are needed by a recursive call to Emit(). If
1414 : // the node has already been code generated then the Emit() call will
1415 : // generate a jump to the previously generated code instead. In order to
1416 : // limit recursion it is possible for the Emit() function to put the node
1417 : // on a work list for later generation and instead generate a jump. The
1418 : // destination of the jump is resolved later when the code is generated.
1419 : //
1420 : // Actual regular expression code generation.
1421 : //
1422 : // Code generation is actually more complicated than the above. In order
1423 : // to improve the efficiency of the generated code some optimizations are
1424 : // performed
1425 : //
1426 : // * Choice nodes have 1-character lookahead.
1427 : // A choice node looks at the following character and eliminates some of
1428 : // the choices immediately based on that character. This is not yet
1429 : // implemented.
1430 : // * Simple greedy loops store reduced backtracking information.
1431 : // A quantifier like /.*foo/m will greedily match the whole input. It will
1432 : // then need to backtrack to a point where it can match "foo". The naive
1433 : // implementation of this would push each character position onto the
1434 : // backtracking stack, then pop them off one by one. This would use space
1435 : // proportional to the length of the input string. However since the "."
1436 : // can only match in one way and always has a constant length (in this case
1437 : // of 1) it suffices to store the current position on the top of the stack
1438 : // once. Matching now becomes merely incrementing the current position and
1439 : // backtracking becomes decrementing the current position and checking the
1440 : // result against the stored current position. This is faster and saves
1441 : // space.
1442 : // * The current state is virtualized.
1443 : // This is used to defer expensive operations until it is clear that they
1444 : // are needed and to generate code for a node more than once, allowing
1445 : // specialized an efficient versions of the code to be created. This is
1446 : // explained in the section below.
1447 : //
1448 : // Execution state virtualization.
1449 : //
1450 : // Instead of emitting code, nodes that manipulate the state can record their
1451 : // manipulation in an object called the Trace. The Trace object can record a
1452 : // current position offset, an optional backtrack code location on the top of
1453 : // the virtualized backtrack stack and some register changes. When a node is
1454 : // to be emitted it can flush the Trace or update it. Flushing the Trace
1455 : // will emit code to bring the actual state into line with the virtual state.
1456 : // Avoiding flushing the state can postpone some work (e.g. updates of capture
1457 : // registers). Postponing work can save time when executing the regular
1458 : // expression since it may be found that the work never has to be done as a
1459 : // failure to match can occur. In addition it is much faster to jump to a
1460 : // known backtrack code location than it is to pop an unknown backtrack
1461 : // location from the stack and jump there.
1462 : //
1463 : // The virtual state found in the Trace affects code generation. For example
1464 : // the virtual state contains the difference between the actual current
1465 : // position and the virtual current position, and matching code needs to use
1466 : // this offset to attempt a match in the correct location of the input
1467 : // string. Therefore code generated for a non-trivial trace is specialized
1468 : // to that trace. The code generator therefore has the ability to generate
1469 : // code for each node several times. In order to limit the size of the
1470 : // generated code there is an arbitrary limit on how many specialized sets of
1471 : // code may be generated for a given node. If the limit is reached, the
1472 : // trace is flushed and a generic version of the code for a node is emitted.
1473 : // This is subsequently used for that node. The code emitted for non-generic
1474 : // trace is not recorded in the node and so it cannot currently be reused in
1475 : // the event that code generation is requested for an identical trace.
1476 :
1477 : /* static */ TextElement
1478 156 : TextElement::Atom(RegExpAtom* atom)
1479 : {
1480 156 : return TextElement(ATOM, atom);
1481 : }
1482 :
1483 : /* static */ TextElement
1484 131 : TextElement::CharClass(RegExpCharacterClass* char_class)
1485 : {
1486 131 : return TextElement(CHAR_CLASS, char_class);
1487 : }
1488 :
1489 : int
1490 1021 : TextElement::length() const
1491 : {
1492 1021 : switch (text_type()) {
1493 : case ATOM:
1494 554 : return atom()->length();
1495 : case CHAR_CLASS:
1496 467 : return 1;
1497 : }
1498 0 : MOZ_CRASH("Bad text type");
1499 : }
1500 :
1501 : class FrequencyCollator
1502 : {
1503 : public:
1504 40 : FrequencyCollator() : total_samples_(0) {
1505 5160 : for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
1506 5120 : frequencies_[i] = CharacterFrequency(i);
1507 : }
1508 40 : }
1509 :
1510 1012 : void CountCharacter(int character) {
1511 1012 : int index = (character & RegExpMacroAssembler::kTableMask);
1512 1012 : frequencies_[index].Increment();
1513 1012 : total_samples_++;
1514 1012 : }
1515 :
1516 : // Does not measure in percent, but rather per-128 (the table size from the
1517 : // regexp macro assembler).
1518 179 : int Frequency(int in_character) {
1519 179 : MOZ_ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
1520 179 : if (total_samples_ < 1) return 1; // Division by zero.
1521 : int freq_in_per128 =
1522 179 : (frequencies_[in_character].counter() * 128) / total_samples_;
1523 179 : return freq_in_per128;
1524 : }
1525 :
1526 : private:
1527 : class CharacterFrequency {
1528 : public:
1529 5120 : CharacterFrequency() : counter_(0), character_(-1) { }
1530 5120 : explicit CharacterFrequency(int character)
1531 5120 : : counter_(0), character_(character)
1532 5120 : {}
1533 :
1534 1012 : void Increment() { counter_++; }
1535 179 : int counter() { return counter_; }
1536 : int character() { return character_; }
1537 :
1538 : private:
1539 : int counter_;
1540 : int character_;
1541 : };
1542 :
1543 : private:
1544 : CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
1545 : int total_samples_;
1546 : };
1547 :
1548 40 : class irregexp::RegExpCompiler
1549 : {
1550 : public:
1551 : RegExpCompiler(JSContext* cx, LifoAlloc* alloc, int capture_count,
1552 : bool ignore_case, bool is_latin1, bool match_only, bool unicode);
1553 :
1554 35 : int AllocateRegister() {
1555 35 : if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
1556 0 : reg_exp_too_big_ = true;
1557 0 : return next_register_;
1558 : }
1559 35 : return next_register_++;
1560 : }
1561 :
1562 : RegExpCode Assemble(JSContext* cx,
1563 : RegExpMacroAssembler* assembler,
1564 : RegExpNode* start,
1565 : int capture_count);
1566 :
1567 0 : inline void AddWork(RegExpNode* node) {
1568 0 : AutoEnterOOMUnsafeRegion oomUnsafe;
1569 0 : if (!work_list_.append(node))
1570 0 : oomUnsafe.crash("AddWork");
1571 0 : }
1572 :
1573 : static const int kImplementationOffset = 0;
1574 : static const int kNumberOfRegistersOffset = 0;
1575 : static const int kCodeOffset = 1;
1576 :
1577 3243 : RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1578 40 : EndNode* accept() { return accept_; }
1579 :
1580 : static const int kMaxRecursion = 100;
1581 547 : inline int recursion_depth() { return recursion_depth_; }
1582 573 : inline void IncrementRecursionDepth() { recursion_depth_++; }
1583 573 : inline void DecrementRecursionDepth() { recursion_depth_--; }
1584 :
1585 0 : void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1586 40 : inline bool isRegExpTooBig() { return reg_exp_too_big_; }
1587 :
1588 1350 : inline bool ignore_case() { return ignore_case_; }
1589 1989 : inline bool latin1() { return latin1_; }
1590 393 : inline bool unicode() { return unicode_; }
1591 219 : FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1592 :
1593 80 : int current_expansion_factor() { return current_expansion_factor_; }
1594 160 : void set_current_expansion_factor(int value) {
1595 160 : current_expansion_factor_ = value;
1596 160 : }
1597 :
1598 300 : JSContext* cx() const { return cx_; }
1599 1008 : LifoAlloc* alloc() const { return alloc_; }
1600 :
1601 : static const int kNoRegister = -1;
1602 :
1603 : bool CheckOverRecursed();
1604 :
1605 : private:
1606 : EndNode* accept_;
1607 : int next_register_;
1608 : Vector<RegExpNode*, 4, SystemAllocPolicy> work_list_;
1609 : int recursion_depth_;
1610 : RegExpMacroAssembler* macro_assembler_;
1611 : bool ignore_case_;
1612 : bool latin1_;
1613 : bool match_only_;
1614 : bool unicode_;
1615 : bool reg_exp_too_big_;
1616 : int current_expansion_factor_;
1617 : FrequencyCollator frequency_collator_;
1618 : JSContext* cx_;
1619 : LifoAlloc* alloc_;
1620 : };
1621 :
1622 : class RecursionCheck
1623 : {
1624 : public:
1625 573 : explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1626 573 : compiler->IncrementRecursionDepth();
1627 573 : }
1628 573 : ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1629 :
1630 : private:
1631 : RegExpCompiler* compiler_;
1632 : };
1633 :
1634 : static inline bool
1635 377 : IsLatin1Equivalent(char16_t c, RegExpCompiler* compiler)
1636 : {
1637 377 : if (c <= kMaxOneByteCharCode)
1638 377 : return true;
1639 :
1640 0 : if (!compiler->ignore_case())
1641 0 : return false;
1642 :
1643 0 : char16_t converted = ConvertNonLatin1ToLatin1(c, compiler->unicode());
1644 :
1645 0 : return converted != 0 && converted <= kMaxOneByteCharCode;
1646 : }
1647 :
1648 : // Attempts to compile the regexp using an Irregexp code generator. Returns
1649 : // a fixed array or a null handle depending on whether it succeeded.
1650 40 : RegExpCompiler::RegExpCompiler(JSContext* cx, LifoAlloc* alloc, int capture_count,
1651 40 : bool ignore_case, bool latin1, bool match_only, bool unicode)
1652 40 : : next_register_(2 * (capture_count + 1)),
1653 : recursion_depth_(0),
1654 : ignore_case_(ignore_case),
1655 : latin1_(latin1),
1656 : match_only_(match_only),
1657 : unicode_(unicode),
1658 : reg_exp_too_big_(false),
1659 : current_expansion_factor_(1),
1660 : frequency_collator_(),
1661 : cx_(cx),
1662 40 : alloc_(alloc)
1663 : {
1664 40 : accept_ = alloc->newInfallible<EndNode>(alloc, EndNode::ACCEPT);
1665 40 : MOZ_ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1666 40 : }
1667 :
1668 : RegExpCode
1669 40 : RegExpCompiler::Assemble(JSContext* cx,
1670 : RegExpMacroAssembler* assembler,
1671 : RegExpNode* start,
1672 : int capture_count)
1673 : {
1674 40 : macro_assembler_ = assembler;
1675 40 : macro_assembler_->set_slow_safe(false);
1676 :
1677 : // The LifoAlloc used by the regexp compiler is infallible and is currently
1678 : // expected to crash on OOM. Thus we have to disable the assertions made to
1679 : // prevent us from allocating any new chunk in the LifoAlloc. This is needed
1680 : // because the jit::MacroAssembler turns these assertions on by default.
1681 80 : LifoAlloc::AutoFallibleScope fallibleAllocator(alloc());
1682 :
1683 80 : jit::Label fail;
1684 40 : macro_assembler_->PushBacktrack(&fail);
1685 40 : Trace new_trace;
1686 40 : start->Emit(this, &new_trace);
1687 40 : macro_assembler_->BindBacktrack(&fail);
1688 40 : macro_assembler_->Fail();
1689 :
1690 40 : while (!work_list_.empty())
1691 0 : work_list_.popCopy()->Emit(this, &new_trace);
1692 :
1693 40 : RegExpCode code = macro_assembler_->GenerateCode(cx, match_only_);
1694 40 : if (code.empty())
1695 0 : return RegExpCode();
1696 :
1697 40 : if (reg_exp_too_big_) {
1698 0 : code.destroy();
1699 0 : JS_ReportErrorASCII(cx, "regexp too big");
1700 0 : return RegExpCode();
1701 : }
1702 :
1703 40 : return code;
1704 : }
1705 :
1706 : template <typename CharT>
1707 : static void
1708 40 : SampleChars(FrequencyCollator* collator, const CharT* chars, size_t length)
1709 : {
1710 : // Sample some characters from the middle of the string.
1711 : static const int kSampleSize = 128;
1712 :
1713 40 : int chars_sampled = 0;
1714 40 : int half_way = (int(length) - kSampleSize) / 2;
1715 1052 : for (size_t i = Max(0, half_way);
1716 1052 : i < length && chars_sampled < kSampleSize;
1717 : i++, chars_sampled++)
1718 : {
1719 1012 : collator->CountCharacter(chars[i]);
1720 : }
1721 40 : }
1722 :
1723 : static bool
1724 40 : IsNativeRegExpEnabled(JSContext* cx)
1725 : {
1726 : #ifdef JS_CODEGEN_NONE
1727 : return false;
1728 : #else
1729 40 : return cx->options().nativeRegExp();
1730 : #endif
1731 : }
1732 :
1733 : RegExpCode
1734 40 : irregexp::CompilePattern(JSContext* cx, HandleRegExpShared shared, RegExpCompileData* data,
1735 : HandleLinearString sample, bool is_global, bool ignore_case,
1736 : bool is_latin1, bool match_only, bool force_bytecode, bool sticky,
1737 : bool unicode, RegExpShared::JitCodeTables& tables)
1738 : {
1739 40 : if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
1740 0 : JS_ReportErrorASCII(cx, "regexp too big");
1741 0 : return RegExpCode();
1742 : }
1743 :
1744 40 : LifoAlloc& alloc = cx->tempLifoAlloc();
1745 : RegExpCompiler compiler(cx, &alloc, data->capture_count, ignore_case, is_latin1, match_only,
1746 80 : unicode);
1747 :
1748 : // Sample some characters from the middle of the string.
1749 40 : if (sample->hasLatin1Chars()) {
1750 70 : JS::AutoCheckCannotGC nogc;
1751 35 : SampleChars(compiler.frequency_collator(), sample->latin1Chars(nogc), sample->length());
1752 : } else {
1753 10 : JS::AutoCheckCannotGC nogc;
1754 5 : SampleChars(compiler.frequency_collator(), sample->twoByteChars(nogc), sample->length());
1755 : }
1756 :
1757 : // Wrap the body of the regexp in capture #0.
1758 40 : RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
1759 : 0,
1760 : &compiler,
1761 80 : compiler.accept());
1762 40 : RegExpNode* node = captured_body;
1763 40 : bool is_end_anchored = data->tree->IsAnchoredAtEnd();
1764 40 : bool is_start_anchored = sticky || data->tree->IsAnchoredAtStart();
1765 40 : int max_length = data->tree->max_match();
1766 40 : if (!is_start_anchored) {
1767 : // Add a .*? at the beginning, outside the body capture, unless
1768 : // this expression is anchored at the beginning.
1769 : RegExpNode* loop_node =
1770 38 : RegExpQuantifier::ToNode(0,
1771 : RegExpTree::kInfinity,
1772 : false,
1773 38 : alloc.newInfallible<RegExpCharacterClass>('*'),
1774 : &compiler,
1775 : captured_body,
1776 38 : data->contains_anchor);
1777 :
1778 19 : if (data->contains_anchor) {
1779 : // Unroll loop once, to take care of the case that might start
1780 : // at the start of input.
1781 0 : ChoiceNode* first_step_node = alloc.newInfallible<ChoiceNode>(&alloc, 2);
1782 : RegExpNode* char_class =
1783 0 : alloc.newInfallible<TextNode>(alloc.newInfallible<RegExpCharacterClass>('*'), loop_node);
1784 0 : first_step_node->AddAlternative(GuardedAlternative(captured_body));
1785 0 : first_step_node->AddAlternative(GuardedAlternative(char_class));
1786 0 : node = first_step_node;
1787 : } else {
1788 19 : node = loop_node;
1789 : }
1790 : }
1791 :
1792 40 : if (compiler.isRegExpTooBig()) {
1793 0 : MOZ_ASSERT(compiler.cx()->isExceptionPending()); // over recursed
1794 0 : JS_ReportErrorASCII(cx, "regexp too big");
1795 0 : return RegExpCode();
1796 : }
1797 :
1798 40 : if (is_latin1) {
1799 35 : node = node->FilterLATIN1(RegExpCompiler::kMaxRecursion, ignore_case, unicode);
1800 : // Do it again to propagate the new nodes to places where they were not
1801 : // put because they had not been calculated yet.
1802 35 : if (node != nullptr) {
1803 34 : node = node->FilterLATIN1(RegExpCompiler::kMaxRecursion, ignore_case, unicode);
1804 : }
1805 : }
1806 :
1807 40 : if (node == nullptr)
1808 1 : node = alloc.newInfallible<EndNode>(&alloc, EndNode::BACKTRACK);
1809 :
1810 80 : Analysis analysis(cx, ignore_case, is_latin1, unicode);
1811 40 : analysis.EnsureAnalyzed(node);
1812 40 : if (analysis.has_failed()) {
1813 0 : JS_ReportErrorASCII(cx, "%s", analysis.errorMessage());
1814 0 : return RegExpCode();
1815 : }
1816 :
1817 80 : Maybe<jit::JitContext> ctx;
1818 80 : Maybe<NativeRegExpMacroAssembler> native_assembler;
1819 80 : Maybe<InterpretedRegExpMacroAssembler> interpreted_assembler;
1820 :
1821 : RegExpMacroAssembler* assembler;
1822 120 : if (IsNativeRegExpEnabled(cx) &&
1823 80 : !force_bytecode &&
1824 120 : jit::CanLikelyAllocateMoreExecutableMemory() &&
1825 40 : shared->getSource()->length() < 32 * 1024)
1826 : {
1827 : NativeRegExpMacroAssembler::Mode mode =
1828 40 : is_latin1 ? NativeRegExpMacroAssembler::LATIN1
1829 40 : : NativeRegExpMacroAssembler::CHAR16;
1830 :
1831 40 : ctx.emplace(cx, (jit::TempAllocator*) nullptr);
1832 40 : native_assembler.emplace(cx, &alloc, mode, (data->capture_count + 1) * 2, tables);
1833 40 : assembler = native_assembler.ptr();
1834 : } else {
1835 0 : interpreted_assembler.emplace(cx, &alloc, (data->capture_count + 1) * 2);
1836 0 : assembler = interpreted_assembler.ptr();
1837 : }
1838 :
1839 : // Inserted here, instead of in Assembler, because it depends on information
1840 : // in the AST that isn't replicated in the Node structure.
1841 : static const int kMaxBacksearchLimit = 1024;
1842 49 : if (is_end_anchored &&
1843 11 : !is_start_anchored &&
1844 : max_length < kMaxBacksearchLimit) {
1845 0 : assembler->SetCurrentPositionFromEnd(max_length);
1846 : }
1847 :
1848 40 : if (is_global) {
1849 0 : assembler->set_global_mode((data->tree->min_match() > 0)
1850 : ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
1851 0 : : RegExpMacroAssembler::GLOBAL);
1852 : }
1853 :
1854 40 : return compiler.Assemble(cx, assembler, node, data->capture_count);
1855 : }
1856 :
1857 : template <typename CharT>
1858 : RegExpRunStatus
1859 245 : irregexp::ExecuteCode(JSContext* cx, jit::JitCode* codeBlock, const CharT* chars, size_t start,
1860 : size_t length, MatchPairs* matches, size_t* endIndex)
1861 : {
1862 : typedef void (*RegExpCodeSignature)(InputOutputData*);
1863 :
1864 245 : InputOutputData data(chars, chars + length, start, matches, endIndex);
1865 :
1866 245 : RegExpCodeSignature function = reinterpret_cast<RegExpCodeSignature>(codeBlock->raw());
1867 :
1868 : {
1869 490 : JS::AutoSuppressGCAnalysis nogc;
1870 245 : CALL_GENERATED_1(function, &data);
1871 : }
1872 :
1873 245 : return (RegExpRunStatus) data.result;
1874 : }
1875 :
1876 : template RegExpRunStatus
1877 : irregexp::ExecuteCode(JSContext* cx, jit::JitCode* codeBlock, const Latin1Char* chars, size_t start,
1878 : size_t length, MatchPairs* matches, size_t* endIndex);
1879 :
1880 : template RegExpRunStatus
1881 : irregexp::ExecuteCode(JSContext* cx, jit::JitCode* codeBlock, const char16_t* chars, size_t start,
1882 : size_t length, MatchPairs* matches, size_t* endIndex);
1883 :
1884 : // -------------------------------------------------------------------
1885 : // Tree to graph conversion
1886 :
1887 : RegExpNode*
1888 122 : RegExpAtom::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
1889 : {
1890 : TextElementVector* elms =
1891 122 : compiler->alloc()->newInfallible<TextElementVector>(*compiler->alloc());
1892 122 : elms->append(TextElement::Atom(this));
1893 122 : return compiler->alloc()->newInfallible<TextNode>(elms, on_success);
1894 : }
1895 :
1896 : RegExpNode*
1897 3 : RegExpText::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
1898 : {
1899 3 : return compiler->alloc()->newInfallible<TextNode>(&elements_, on_success);
1900 : }
1901 :
1902 : RegExpNode*
1903 90 : RegExpCharacterClass::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
1904 : {
1905 90 : return compiler->alloc()->newInfallible<TextNode>(this, on_success);
1906 : }
1907 :
1908 : RegExpNode*
1909 24 : RegExpDisjunction::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
1910 : {
1911 24 : if (!compiler->CheckOverRecursed())
1912 0 : return on_success;
1913 :
1914 24 : const RegExpTreeVector& alternatives = this->alternatives();
1915 24 : size_t length = alternatives.length();
1916 24 : ChoiceNode* result = compiler->alloc()->newInfallible<ChoiceNode>(compiler->alloc(), length);
1917 121 : for (size_t i = 0; i < length; i++) {
1918 97 : GuardedAlternative alternative(alternatives[i]->ToNode(compiler, on_success));
1919 97 : result->AddAlternative(alternative);
1920 : }
1921 24 : return result;
1922 : }
1923 :
1924 : RegExpNode*
1925 42 : RegExpQuantifier::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
1926 : {
1927 84 : return ToNode(min(),
1928 : max(),
1929 42 : is_greedy(),
1930 : body(),
1931 : compiler,
1932 42 : on_success);
1933 : }
1934 :
1935 : // Scoped object to keep track of how much we unroll quantifier loops in the
1936 : // regexp graph generator.
1937 : class RegExpExpansionLimiter
1938 : {
1939 : public:
1940 : static const int kMaxExpansionFactor = 6;
1941 80 : RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
1942 80 : : compiler_(compiler),
1943 80 : saved_expansion_factor_(compiler->current_expansion_factor()),
1944 160 : ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor)
1945 : {
1946 80 : MOZ_ASSERT(factor > 0);
1947 80 : if (ok_to_expand_) {
1948 80 : if (factor > kMaxExpansionFactor) {
1949 : // Avoid integer overflow of the current expansion factor.
1950 2 : ok_to_expand_ = false;
1951 2 : compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
1952 : } else {
1953 78 : int new_factor = saved_expansion_factor_ * factor;
1954 78 : ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
1955 78 : compiler->set_current_expansion_factor(new_factor);
1956 : }
1957 : }
1958 80 : }
1959 :
1960 160 : ~RegExpExpansionLimiter() {
1961 80 : compiler_->set_current_expansion_factor(saved_expansion_factor_);
1962 80 : }
1963 :
1964 26 : bool ok_to_expand() { return ok_to_expand_; }
1965 :
1966 : private:
1967 : RegExpCompiler* compiler_;
1968 : int saved_expansion_factor_;
1969 : bool ok_to_expand_;
1970 : };
1971 :
1972 : /* static */ RegExpNode*
1973 84 : RegExpQuantifier::ToNode(int min,
1974 : int max,
1975 : bool is_greedy,
1976 : RegExpTree* body,
1977 : RegExpCompiler* compiler,
1978 : RegExpNode* on_success,
1979 : bool not_at_start /* = false */)
1980 : {
1981 : // x{f, t} becomes this:
1982 : //
1983 : // (r++)<-.
1984 : // | `
1985 : // | (x)
1986 : // v ^
1987 : // (r=0)-->(?)---/ [if r < t]
1988 : // |
1989 : // [if r >= f] \----> ...
1990 : //
1991 :
1992 : // 15.10.2.5 RepeatMatcher algorithm.
1993 : // The parser has already eliminated the case where max is 0. In the case
1994 : // where max_match is zero the parser has removed the quantifier if min was
1995 : // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
1996 :
1997 : // If we know that we cannot match zero length then things are a little
1998 : // simpler since we don't need to make the special zero length match check
1999 : // from step 2.1. If the min and max are small we can unroll a little in
2000 : // this case.
2001 : static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
2002 : static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
2003 :
2004 84 : if (max == 0)
2005 1 : return on_success; // This can happen due to recursion.
2006 :
2007 83 : if (!compiler->CheckOverRecursed())
2008 0 : return on_success;
2009 :
2010 83 : bool body_can_be_empty = (body->min_match() == 0);
2011 83 : int body_start_reg = RegExpCompiler::kNoRegister;
2012 83 : Interval capture_registers = body->CaptureRegisters();
2013 83 : bool needs_capture_clearing = !capture_registers.is_empty();
2014 83 : LifoAlloc* alloc = compiler->alloc();
2015 :
2016 83 : if (body_can_be_empty) {
2017 0 : body_start_reg = compiler->AllocateRegister();
2018 83 : } else if (!needs_capture_clearing) {
2019 : // Only unroll if there are no captures and the body can't be
2020 : // empty.
2021 : {
2022 131 : RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
2023 77 : if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
2024 23 : int new_max = (max == kInfinity) ? max : max - min;
2025 : // Recurse once to get the loop or optional matches after the fixed
2026 : // ones.
2027 23 : RegExpNode* answer = ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
2028 : // Unroll the forced matches from 0 to min. This can cause chains of
2029 : // TextNodes (which the parser does not generate). These should be
2030 : // combined if it turns out they hinder good code generation.
2031 47 : for (int i = 0; i < min; i++)
2032 24 : answer = body->ToNode(compiler, answer);
2033 23 : return answer;
2034 : }
2035 : }
2036 54 : if (max <= kMaxUnrolledMaxMatches && min == 0) {
2037 3 : MOZ_ASSERT(max > 0); // Due to the 'if' above.
2038 3 : RegExpExpansionLimiter limiter(compiler, max);
2039 3 : if (limiter.ok_to_expand()) {
2040 : // Unroll the optional matches up to max.
2041 3 : RegExpNode* answer = on_success;
2042 6 : for (int i = 0; i < max; i++) {
2043 3 : ChoiceNode* alternation = alloc->newInfallible<ChoiceNode>(alloc, 2);
2044 3 : if (is_greedy) {
2045 3 : alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler, answer)));
2046 3 : alternation->AddAlternative(GuardedAlternative(on_success));
2047 : } else {
2048 0 : alternation->AddAlternative(GuardedAlternative(on_success));
2049 0 : alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler, answer)));
2050 : }
2051 3 : answer = alternation;
2052 3 : if (not_at_start) alternation->set_not_at_start();
2053 : }
2054 3 : return answer;
2055 : }
2056 : }
2057 : }
2058 57 : bool has_min = min > 0;
2059 57 : bool has_max = max < RegExpTree::kInfinity;
2060 57 : bool needs_counter = has_min || has_max;
2061 : int reg_ctr = needs_counter
2062 57 : ? compiler->AllocateRegister()
2063 57 : : RegExpCompiler::kNoRegister;
2064 57 : LoopChoiceNode* center = alloc->newInfallible<LoopChoiceNode>(alloc, body->min_match() == 0);
2065 57 : if (not_at_start)
2066 22 : center->set_not_at_start();
2067 : RegExpNode* loop_return = needs_counter
2068 57 : ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
2069 57 : : static_cast<RegExpNode*>(center);
2070 57 : if (body_can_be_empty) {
2071 : // If the body can be empty we need to check if it was and then
2072 : // backtrack.
2073 0 : loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
2074 : reg_ctr,
2075 : min,
2076 0 : loop_return);
2077 : }
2078 57 : RegExpNode* body_node = body->ToNode(compiler, loop_return);
2079 57 : if (body_can_be_empty) {
2080 : // If the body can be empty we need to store the start position
2081 : // so we can bail out if it was empty.
2082 0 : body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
2083 : }
2084 57 : if (needs_capture_clearing) {
2085 : // Before entering the body of this loop we need to clear captures.
2086 6 : body_node = ActionNode::ClearCaptures(capture_registers, body_node);
2087 : }
2088 57 : GuardedAlternative body_alt(body_node);
2089 57 : if (has_max) {
2090 11 : Guard* body_guard = alloc->newInfallible<Guard>(reg_ctr, Guard::LT, max);
2091 11 : body_alt.AddGuard(alloc, body_guard);
2092 : }
2093 57 : GuardedAlternative rest_alt(on_success);
2094 57 : if (has_min) {
2095 5 : Guard* rest_guard = alloc->newInfallible<Guard>(reg_ctr, Guard::GEQ, min);
2096 5 : rest_alt.AddGuard(alloc, rest_guard);
2097 : }
2098 57 : if (is_greedy) {
2099 37 : center->AddLoopAlternative(body_alt);
2100 37 : center->AddContinueAlternative(rest_alt);
2101 : } else {
2102 20 : center->AddContinueAlternative(rest_alt);
2103 20 : center->AddLoopAlternative(body_alt);
2104 : }
2105 57 : if (needs_counter)
2106 11 : return ActionNode::SetRegister(reg_ctr, 0, center);
2107 46 : return center;
2108 : }
2109 :
2110 : RegExpNode*
2111 44 : RegExpAssertion::ToNode(RegExpCompiler* compiler,
2112 : RegExpNode* on_success)
2113 : {
2114 44 : NodeInfo info;
2115 44 : LifoAlloc* alloc = compiler->alloc();
2116 :
2117 44 : switch (assertion_type()) {
2118 : case START_OF_LINE:
2119 0 : return AssertionNode::AfterNewline(on_success);
2120 : case START_OF_INPUT:
2121 21 : return AssertionNode::AtStart(on_success);
2122 : case BOUNDARY:
2123 2 : return AssertionNode::AtBoundary(on_success);
2124 : case NON_BOUNDARY:
2125 0 : return AssertionNode::AtNonBoundary(on_success);
2126 : case END_OF_INPUT:
2127 21 : return AssertionNode::AtEnd(on_success);
2128 : case END_OF_LINE: {
2129 : // Compile $ in multiline regexps as an alternation with a positive
2130 : // lookahead in one side and an end-of-input on the other side.
2131 : // We need two registers for the lookahead.
2132 0 : int stack_pointer_register = compiler->AllocateRegister();
2133 0 : int position_register = compiler->AllocateRegister();
2134 : // The ChoiceNode to distinguish between a newline and end-of-input.
2135 0 : ChoiceNode* result = alloc->newInfallible<ChoiceNode>(alloc, 2);
2136 : // Create a newline atom.
2137 0 : CharacterRangeVector* newline_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
2138 0 : CharacterRange::AddClassEscape(alloc, 'n', newline_ranges);
2139 0 : RegExpCharacterClass* newline_atom = alloc->newInfallible<RegExpCharacterClass>('n');
2140 : TextNode* newline_matcher =
2141 0 : alloc->newInfallible<TextNode>(newline_atom,
2142 0 : ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
2143 : position_register,
2144 : 0, // No captures inside.
2145 : -1, // Ignored if no captures.
2146 0 : on_success));
2147 : // Create an end-of-input matcher.
2148 : RegExpNode* end_of_line =
2149 0 : ActionNode::BeginSubmatch(stack_pointer_register, position_register, newline_matcher);
2150 :
2151 : // Add the two alternatives to the ChoiceNode.
2152 0 : GuardedAlternative eol_alternative(end_of_line);
2153 0 : result->AddAlternative(eol_alternative);
2154 0 : GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
2155 0 : result->AddAlternative(end_alternative);
2156 0 : return result;
2157 : }
2158 : case NOT_AFTER_LEAD_SURROGATE:
2159 0 : return AssertionNode::NotAfterLeadSurrogate(on_success);
2160 : case NOT_IN_SURROGATE_PAIR:
2161 0 : return AssertionNode::NotInSurrogatePair(on_success);
2162 : default:
2163 0 : MOZ_CRASH("Bad assertion type");
2164 : }
2165 : return on_success;
2166 : }
2167 :
2168 : RegExpNode*
2169 0 : RegExpBackReference::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
2170 : {
2171 0 : return compiler->alloc()->newInfallible<BackReferenceNode>(RegExpCapture::StartRegister(index()),
2172 0 : RegExpCapture::EndRegister(index()),
2173 0 : on_success);
2174 : }
2175 :
2176 : RegExpNode*
2177 1 : RegExpEmpty::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
2178 : {
2179 1 : return on_success;
2180 : }
2181 :
2182 : RegExpNode*
2183 12 : RegExpLookahead::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
2184 : {
2185 12 : int stack_pointer_register = compiler->AllocateRegister();
2186 12 : int position_register = compiler->AllocateRegister();
2187 :
2188 12 : const int registers_per_capture = 2;
2189 12 : const int register_of_first_capture = 2;
2190 12 : int register_count = capture_count_ * registers_per_capture;
2191 : int register_start =
2192 12 : register_of_first_capture + capture_from_ * registers_per_capture;
2193 :
2194 12 : if (!compiler->CheckOverRecursed())
2195 0 : return on_success;
2196 :
2197 12 : if (is_positive()) {
2198 : RegExpNode* bodyNode =
2199 20 : body()->ToNode(compiler,
2200 10 : ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
2201 : position_register,
2202 : register_count,
2203 : register_start,
2204 20 : on_success));
2205 10 : return ActionNode::BeginSubmatch(stack_pointer_register,
2206 : position_register,
2207 10 : bodyNode);
2208 : }
2209 :
2210 : // We use a ChoiceNode for a negative lookahead because it has most of
2211 : // the characteristics we need. It has the body of the lookahead as its
2212 : // first alternative and the expression after the lookahead of the second
2213 : // alternative. If the first alternative succeeds then the
2214 : // NegativeSubmatchSuccess will unwind the stack including everything the
2215 : // choice node set up and backtrack. If the first alternative fails then
2216 : // the second alternative is tried, which is exactly the desired result
2217 : // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
2218 : // ChoiceNode that knows to ignore the first exit when calculating quick
2219 : // checks.
2220 2 : LifoAlloc* alloc = compiler->alloc();
2221 :
2222 : RegExpNode* success =
2223 2 : alloc->newInfallible<NegativeSubmatchSuccess>(alloc,
2224 : stack_pointer_register,
2225 : position_register,
2226 : register_count,
2227 2 : register_start);
2228 2 : GuardedAlternative body_alt(body()->ToNode(compiler, success));
2229 :
2230 : ChoiceNode* choice_node =
2231 2 : alloc->newInfallible<NegativeLookaheadChoiceNode>(alloc, body_alt, GuardedAlternative(on_success));
2232 :
2233 2 : return ActionNode::BeginSubmatch(stack_pointer_register,
2234 : position_register,
2235 2 : choice_node);
2236 : }
2237 :
2238 : RegExpNode*
2239 24 : RegExpCapture::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
2240 : {
2241 24 : return ToNode(body(), index(), compiler, on_success);
2242 : }
2243 :
2244 : /* static */ RegExpNode*
2245 64 : RegExpCapture::ToNode(RegExpTree* body,
2246 : int index,
2247 : RegExpCompiler* compiler,
2248 : RegExpNode* on_success)
2249 : {
2250 64 : if (!compiler->CheckOverRecursed())
2251 0 : return on_success;
2252 :
2253 64 : int start_reg = RegExpCapture::StartRegister(index);
2254 64 : int end_reg = RegExpCapture::EndRegister(index);
2255 64 : RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
2256 64 : RegExpNode* body_node = body->ToNode(compiler, store_end);
2257 64 : return ActionNode::StorePosition(start_reg, true, body_node);
2258 : }
2259 :
2260 : RegExpNode*
2261 49 : RegExpAlternative::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
2262 : {
2263 49 : if (!compiler->CheckOverRecursed())
2264 0 : return on_success;
2265 :
2266 49 : const RegExpTreeVector& children = nodes();
2267 49 : RegExpNode* current = on_success;
2268 203 : for (int i = children.length() - 1; i >= 0; i--)
2269 154 : current = children[i]->ToNode(compiler, current);
2270 49 : return current;
2271 : }
2272 :
2273 : // -------------------------------------------------------------------
2274 : // BoyerMooreLookahead
2275 :
2276 : ContainedInLattice
2277 404 : irregexp::AddRange(ContainedInLattice containment,
2278 : const int* ranges,
2279 : int ranges_length,
2280 : Interval new_range)
2281 : {
2282 404 : MOZ_ASSERT((ranges_length & 1) == 1);
2283 404 : MOZ_ASSERT(ranges[ranges_length - 1] == kMaxUtf16CodeUnit + 1);
2284 404 : if (containment == kLatticeUnknown) return containment;
2285 393 : bool inside = false;
2286 393 : int last = 0;
2287 1279 : for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
2288 : // Consider the range from last to ranges[i].
2289 : // We haven't got to the new range yet.
2290 1279 : if (ranges[i] <= new_range.from())
2291 886 : continue;
2292 :
2293 : // New range is wholly inside last-ranges[i]. Note that new_range.to() is
2294 : // inclusive, but the values in ranges are not.
2295 393 : if (last <= new_range.from() && new_range.to() < ranges[i])
2296 392 : return Combine(containment, inside ? kLatticeIn : kLatticeOut);
2297 :
2298 1 : return kLatticeUnknown;
2299 : }
2300 0 : return containment;
2301 : }
2302 :
2303 : void
2304 66 : BoyerMoorePositionInfo::Set(int character)
2305 : {
2306 66 : SetInterval(Interval(character, character));
2307 66 : }
2308 :
2309 : void
2310 101 : BoyerMoorePositionInfo::SetInterval(const Interval& interval)
2311 : {
2312 101 : s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
2313 101 : if (unicode_ignore_case_)
2314 0 : w_ = AddRange(w_, kIgnoreCaseWordRanges, kIgnoreCaseWordRangeCount, interval);
2315 : else
2316 101 : w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
2317 101 : d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
2318 101 : surrogate_ =
2319 101 : AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
2320 101 : if (interval.to() - interval.from() >= kMapSize - 1) {
2321 0 : if (map_count_ != kMapSize) {
2322 0 : map_count_ = kMapSize;
2323 0 : for (int i = 0; i < kMapSize; i++)
2324 0 : map_[i] = true;
2325 : }
2326 0 : return;
2327 : }
2328 384 : for (int i = interval.from(); i <= interval.to(); i++) {
2329 283 : int mod_character = (i & kMask);
2330 283 : if (!map_[mod_character]) {
2331 237 : map_count_++;
2332 237 : map_[mod_character] = true;
2333 : }
2334 283 : if (map_count_ == kMapSize)
2335 0 : return;
2336 : }
2337 : }
2338 :
2339 : void
2340 2 : BoyerMoorePositionInfo::SetAll()
2341 : {
2342 2 : s_ = w_ = d_ = kLatticeUnknown;
2343 2 : if (map_count_ != kMapSize) {
2344 2 : map_count_ = kMapSize;
2345 258 : for (int i = 0; i < kMapSize; i++)
2346 256 : map_[i] = true;
2347 : }
2348 2 : }
2349 :
2350 19 : BoyerMooreLookahead::BoyerMooreLookahead(LifoAlloc* alloc, size_t length, RegExpCompiler* compiler)
2351 19 : : length_(length), compiler_(compiler), bitmaps_(*alloc)
2352 : {
2353 19 : bool unicode_ignore_case = compiler->unicode() && compiler->ignore_case();
2354 19 : max_char_ = MaximumCharacter(compiler->latin1());
2355 :
2356 19 : bitmaps_.reserve(length);
2357 67 : for (size_t i = 0; i < length; i++)
2358 48 : bitmaps_.append(alloc->newInfallible<BoyerMoorePositionInfo>(alloc, unicode_ignore_case));
2359 19 : }
2360 :
2361 : // Find the longest range of lookahead that has the fewest number of different
2362 : // characters that can occur at a given position. Since we are optimizing two
2363 : // different parameters at once this is a tradeoff.
2364 18 : bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
2365 18 : int biggest_points = 0;
2366 : // If more than 32 characters out of 128 can occur it is unlikely that we can
2367 : // be lucky enough to step forwards much of the time.
2368 18 : const int kMaxMax = 32;
2369 72 : for (int max_number_of_chars = 4;
2370 72 : max_number_of_chars < kMaxMax;
2371 54 : max_number_of_chars *= 2) {
2372 : biggest_points =
2373 54 : FindBestInterval(max_number_of_chars, biggest_points, from, to);
2374 : }
2375 18 : if (biggest_points == 0) return false;
2376 15 : return true;
2377 : }
2378 :
2379 : // Find the highest-points range between 0 and length_ where the character
2380 : // information is not too vague. 'Too vague' means that there are more than
2381 : // max_number_of_chars that can occur at this position. Calculates the number
2382 : // of points as the product of width-of-the-range and
2383 : // probability-of-finding-one-of-the-characters, where the probability is
2384 : // calculated using the frequency distribution of the sample subject string.
2385 : int
2386 54 : BoyerMooreLookahead::FindBestInterval(int max_number_of_chars, int old_biggest_points,
2387 : int* from, int* to)
2388 : {
2389 54 : int biggest_points = old_biggest_points;
2390 : static const int kSize = RegExpMacroAssembler::kTableSize;
2391 96 : for (int i = 0; i < length_; ) {
2392 90 : while (i < length_ && Count(i) > max_number_of_chars) i++;
2393 63 : if (i == length_) break;
2394 42 : int remembered_from = i;
2395 : bool union_map[kSize];
2396 42 : for (int j = 0; j < kSize; j++) union_map[j] = false;
2397 270 : while (i < length_ && Count(i) <= max_number_of_chars) {
2398 114 : BoyerMoorePositionInfo* map = bitmaps_[i];
2399 114 : for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
2400 114 : i++;
2401 : }
2402 42 : int frequency = 0;
2403 5418 : for (int j = 0; j < kSize; j++) {
2404 5376 : if (union_map[j]) {
2405 : // Add 1 to the frequency to give a small per-character boost for
2406 : // the cases where our sampling is not good enough and many
2407 : // characters have a frequency of zero. This means the frequency
2408 : // can theoretically be up to 2*kSize though we treat it mostly as
2409 : // a fraction of kSize.
2410 179 : frequency += compiler_->frequency_collator()->Frequency(j) + 1;
2411 : }
2412 : }
2413 : // We use the probability of skipping times the distance we are skipping to
2414 : // judge the effectiveness of this. Actually we have a cut-off: By
2415 : // dividing by 2 we switch off the skipping if the probability of skipping
2416 : // is less than 50%. This is because the multibyte mask-and-compare
2417 : // skipping in quickcheck is more likely to do well on this case.
2418 99 : bool in_quickcheck_range = ((i - remembered_from < 4) ||
2419 60 : (compiler_->latin1() ? remembered_from <= 4 : remembered_from <= 2));
2420 : // Called 'probability' but it is only a rough estimate and can actually
2421 : // be outside the 0-kSize range.
2422 42 : int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
2423 42 : int points = (i - remembered_from) * probability;
2424 42 : if (points > biggest_points) {
2425 15 : *from = remembered_from;
2426 15 : *to = i - 1;
2427 15 : biggest_points = points;
2428 : }
2429 : }
2430 54 : return biggest_points;
2431 : }
2432 :
2433 : // Take all the characters that will not prevent a successful match if they
2434 : // occur in the subject string in the range between min_lookahead and
2435 : // max_lookahead (inclusive) measured from the current position. If the
2436 : // character at max_lookahead offset is not one of these characters, then we
2437 : // can safely skip forwards by the number of characters in the range.
2438 10 : int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
2439 : int max_lookahead,
2440 : uint8_t* boolean_skip_table)
2441 : {
2442 10 : const int kSize = RegExpMacroAssembler::kTableSize;
2443 :
2444 10 : const int kSkipArrayEntry = 0;
2445 10 : const int kDontSkipArrayEntry = 1;
2446 :
2447 1290 : for (int i = 0; i < kSize; i++)
2448 1280 : boolean_skip_table[i] = kSkipArrayEntry;
2449 10 : int skip = max_lookahead + 1 - min_lookahead;
2450 :
2451 43 : for (int i = max_lookahead; i >= min_lookahead; i--) {
2452 33 : BoyerMoorePositionInfo* map = bitmaps_[i];
2453 4257 : for (int j = 0; j < kSize; j++) {
2454 4224 : if (map->at(j))
2455 62 : boolean_skip_table[j] = kDontSkipArrayEntry;
2456 : }
2457 : }
2458 :
2459 10 : return skip;
2460 : }
2461 :
2462 : // See comment on the implementation of GetSkipTable.
2463 : bool
2464 18 : BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm)
2465 : {
2466 18 : const int kSize = RegExpMacroAssembler::kTableSize;
2467 :
2468 18 : int min_lookahead = 0;
2469 18 : int max_lookahead = 0;
2470 :
2471 18 : if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead))
2472 3 : return false;
2473 :
2474 15 : bool found_single_character = false;
2475 15 : int single_character = 0;
2476 23 : for (int i = max_lookahead; i >= min_lookahead; i--) {
2477 18 : BoyerMoorePositionInfo* map = bitmaps_[i];
2478 28 : if (map->map_count() > 1 ||
2479 3 : (found_single_character && map->map_count() != 0)) {
2480 10 : found_single_character = false;
2481 10 : break;
2482 : }
2483 619 : for (int j = 0; j < kSize; j++) {
2484 619 : if (map->at(j)) {
2485 8 : found_single_character = true;
2486 8 : single_character = j;
2487 8 : break;
2488 : }
2489 : }
2490 : }
2491 :
2492 15 : int lookahead_width = max_lookahead + 1 - min_lookahead;
2493 :
2494 15 : if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
2495 : // The mask-compare can probably handle this better.
2496 5 : return false;
2497 : }
2498 :
2499 10 : if (found_single_character) {
2500 0 : jit::Label cont, again;
2501 0 : masm->Bind(&again);
2502 0 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2503 0 : if (max_char_ > kSize) {
2504 0 : masm->CheckCharacterAfterAnd(single_character,
2505 : RegExpMacroAssembler::kTableMask,
2506 0 : &cont);
2507 : } else {
2508 0 : masm->CheckCharacter(single_character, &cont);
2509 : }
2510 0 : masm->AdvanceCurrentPosition(lookahead_width);
2511 0 : masm->JumpOrBacktrack(&again);
2512 0 : masm->Bind(&cont);
2513 0 : return true;
2514 : }
2515 :
2516 20 : RegExpShared::JitCodeTable boolean_skip_table;
2517 : {
2518 20 : AutoEnterOOMUnsafeRegion oomUnsafe;
2519 10 : boolean_skip_table.reset(static_cast<uint8_t*>(js_malloc(kSize)));
2520 10 : if (!boolean_skip_table)
2521 0 : oomUnsafe.crash("Table malloc");
2522 : }
2523 :
2524 10 : int skip_distance = GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table.get());
2525 10 : MOZ_ASSERT(skip_distance != 0);
2526 :
2527 20 : jit::Label cont, again;
2528 10 : masm->Bind(&again);
2529 10 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2530 10 : masm->CheckBitInTable(Move(boolean_skip_table), &cont);
2531 10 : masm->AdvanceCurrentPosition(skip_distance);
2532 10 : masm->JumpOrBacktrack(&again);
2533 10 : masm->Bind(&cont);
2534 :
2535 10 : return true;
2536 : }
2537 :
2538 : bool
2539 300 : RegExpCompiler::CheckOverRecursed()
2540 : {
2541 300 : if (!CheckRecursionLimit(cx())) {
2542 0 : SetRegExpTooBig();
2543 0 : return false;
2544 : }
2545 :
2546 300 : return true;
2547 : }
2548 :
2549 : bool
2550 68 : BoyerMooreLookahead::CheckOverRecursed()
2551 : {
2552 68 : return compiler()->CheckOverRecursed();
2553 : }
2554 :
2555 : // -------------------------------------------------------------------
2556 : // Trace
2557 :
2558 589 : bool Trace::DeferredAction::Mentions(int that)
2559 : {
2560 589 : if (action_type() == ActionNode::CLEAR_CAPTURES) {
2561 24 : Interval range = static_cast<DeferredClearCaptures*>(this)->range();
2562 24 : return range.Contains(that);
2563 : }
2564 565 : return reg() == that;
2565 : }
2566 :
2567 40 : bool Trace::mentions_reg(int reg)
2568 : {
2569 40 : for (DeferredAction* action = actions_; action != nullptr; action = action->next()) {
2570 0 : if (action->Mentions(reg))
2571 0 : return true;
2572 : }
2573 40 : return false;
2574 : }
2575 :
2576 : bool
2577 0 : Trace::GetStoredPosition(int reg, int* cp_offset)
2578 : {
2579 0 : MOZ_ASSERT(0 == *cp_offset);
2580 0 : for (DeferredAction* action = actions_; action != nullptr; action = action->next()) {
2581 0 : if (action->Mentions(reg)) {
2582 0 : if (action->action_type() == ActionNode::STORE_POSITION) {
2583 0 : *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
2584 0 : return true;
2585 : }
2586 0 : return false;
2587 : }
2588 : }
2589 0 : return false;
2590 : }
2591 :
2592 : int
2593 166 : Trace::FindAffectedRegisters(LifoAlloc* alloc, OutSet* affected_registers)
2594 : {
2595 166 : int max_register = RegExpCompiler::kNoRegister;
2596 439 : for (DeferredAction* action = actions_; action != nullptr; action = action->next()) {
2597 273 : if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
2598 8 : Interval range = static_cast<DeferredClearCaptures*>(action)->range();
2599 28 : for (int i = range.from(); i <= range.to(); i++)
2600 20 : affected_registers->Set(alloc, i);
2601 8 : if (range.to() > max_register) max_register = range.to();
2602 : } else {
2603 265 : affected_registers->Set(alloc, action->reg());
2604 265 : if (action->reg() > max_register) max_register = action->reg();
2605 : }
2606 : }
2607 166 : return max_register;
2608 : }
2609 :
2610 : void
2611 166 : Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
2612 : int max_register,
2613 : OutSet& registers_to_pop,
2614 : OutSet& registers_to_clear)
2615 : {
2616 870 : for (int reg = max_register; reg >= 0; reg--) {
2617 704 : if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
2618 660 : else if (registers_to_clear.Get(reg)) {
2619 52 : int clear_to = reg;
2620 82 : while (reg > 0 && registers_to_clear.Get(reg - 1))
2621 15 : reg--;
2622 52 : assembler->ClearRegisters(reg, clear_to);
2623 : }
2624 : }
2625 166 : }
2626 :
2627 : enum DeferredActionUndoType {
2628 : DEFER_IGNORE,
2629 : DEFER_RESTORE,
2630 : DEFER_CLEAR
2631 : };
2632 :
2633 : void
2634 166 : Trace::PerformDeferredActions(LifoAlloc* alloc,
2635 : RegExpMacroAssembler* assembler,
2636 : int max_register,
2637 : OutSet& affected_registers,
2638 : OutSet* registers_to_pop,
2639 : OutSet* registers_to_clear)
2640 : {
2641 : // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
2642 166 : const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
2643 :
2644 : // Count pushes performed to force a stack limit check occasionally.
2645 166 : int pushes = 0;
2646 :
2647 885 : for (int reg = 0; reg <= max_register; reg++) {
2648 719 : if (!affected_registers.Get(reg))
2649 446 : continue;
2650 :
2651 : // The chronologically first deferred action in the trace
2652 : // is used to infer the action needed to restore a register
2653 : // to its previous state (or not, if it's safe to ignore it).
2654 273 : DeferredActionUndoType undo_action = DEFER_IGNORE;
2655 :
2656 273 : int value = 0;
2657 273 : bool absolute = false;
2658 273 : bool clear = false;
2659 273 : int store_position = -1;
2660 : // This is a little tricky because we are scanning the actions in reverse
2661 : // historical order (newest first).
2662 862 : for (DeferredAction* action = actions_;
2663 862 : action != nullptr;
2664 : action = action->next()) {
2665 589 : if (action->Mentions(reg)) {
2666 285 : switch (action->action_type()) {
2667 : case ActionNode::SET_REGISTER: {
2668 : Trace::DeferredSetRegister* psr =
2669 11 : static_cast<Trace::DeferredSetRegister*>(action);
2670 11 : if (!absolute) {
2671 11 : value += psr->value();
2672 11 : absolute = true;
2673 : }
2674 : // SET_REGISTER is currently only used for newly introduced loop
2675 : // counters. They can have a significant previous value if they
2676 : // occour in a loop. TODO(lrn): Propagate this information, so
2677 : // we can set undo_action to IGNORE if we know there is no value to
2678 : // restore.
2679 11 : undo_action = DEFER_RESTORE;
2680 11 : MOZ_ASSERT(store_position == -1);
2681 11 : MOZ_ASSERT(!clear);
2682 11 : break;
2683 : }
2684 : case ActionNode::INCREMENT_REGISTER:
2685 13 : if (!absolute) {
2686 13 : value++;
2687 : }
2688 13 : MOZ_ASSERT(store_position == -1);
2689 13 : MOZ_ASSERT(!clear);
2690 13 : undo_action = DEFER_RESTORE;
2691 13 : break;
2692 : case ActionNode::STORE_POSITION: {
2693 : Trace::DeferredCapture* pc =
2694 241 : static_cast<Trace::DeferredCapture*>(action);
2695 241 : if (!clear && store_position == -1) {
2696 241 : store_position = pc->cp_offset();
2697 : }
2698 :
2699 : // For captures we know that stores and clears alternate.
2700 : // Other register, are never cleared, and if the occur
2701 : // inside a loop, they might be assigned more than once.
2702 241 : if (reg <= 1) {
2703 : // Registers zero and one, aka "capture zero", is
2704 : // always set correctly if we succeed. There is no
2705 : // need to undo a setting on backtrack, because we
2706 : // will set it again or fail.
2707 162 : undo_action = DEFER_IGNORE;
2708 : } else {
2709 79 : undo_action = pc->is_capture() ? DEFER_CLEAR : DEFER_RESTORE;
2710 : }
2711 241 : MOZ_ASSERT(!absolute);
2712 241 : MOZ_ASSERT(value == 0);
2713 241 : break;
2714 : }
2715 : case ActionNode::CLEAR_CAPTURES: {
2716 : // Since we're scanning in reverse order, if we've already
2717 : // set the position we have to ignore historically earlier
2718 : // clearing operations.
2719 20 : if (store_position == -1) {
2720 8 : clear = true;
2721 : }
2722 20 : undo_action = DEFER_RESTORE;
2723 20 : MOZ_ASSERT(!absolute);
2724 20 : MOZ_ASSERT(value == 0);
2725 20 : break;
2726 : }
2727 : default:
2728 0 : MOZ_CRASH("Bad action");
2729 : }
2730 : }
2731 : }
2732 : // Prepare for the undo-action (e.g., push if it's going to be popped).
2733 273 : if (undo_action == DEFER_RESTORE) {
2734 44 : pushes++;
2735 : RegExpMacroAssembler::StackCheckFlag stack_check =
2736 44 : RegExpMacroAssembler::kNoStackLimitCheck;
2737 44 : if (pushes == push_limit) {
2738 0 : stack_check = RegExpMacroAssembler::kCheckStackLimit;
2739 0 : pushes = 0;
2740 : }
2741 :
2742 44 : assembler->PushRegister(reg, stack_check);
2743 44 : registers_to_pop->Set(alloc, reg);
2744 229 : } else if (undo_action == DEFER_CLEAR) {
2745 67 : registers_to_clear->Set(alloc, reg);
2746 : }
2747 : // Perform the chronologically last action (or accumulated increment)
2748 : // for the register.
2749 273 : if (store_position != -1) {
2750 241 : assembler->WriteCurrentPositionToRegister(reg, store_position);
2751 32 : } else if (clear) {
2752 8 : assembler->ClearRegisters(reg, reg);
2753 24 : } else if (absolute) {
2754 11 : assembler->SetRegister(reg, value);
2755 13 : } else if (value != 0) {
2756 13 : assembler->AdvanceRegister(reg, value);
2757 : }
2758 : }
2759 166 : }
2760 :
2761 : // This is called as we come into a loop choice node and some other tricky
2762 : // nodes. It normalizes the state of the code generator to ensure we can
2763 : // generate generic code.
2764 196 : void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor)
2765 : {
2766 196 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2767 :
2768 196 : MOZ_ASSERT(!is_trivial());
2769 :
2770 196 : if (actions_ == nullptr && backtrack() == nullptr) {
2771 : // Here we just have some deferred cp advances to fix and we are back to
2772 : // a normal situation. We may also have to forget some information gained
2773 : // through a quick check that was already performed.
2774 30 : if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
2775 : // Create a new trivial state and generate the node with that.
2776 30 : Trace new_state;
2777 30 : successor->Emit(compiler, &new_state);
2778 30 : return;
2779 : }
2780 :
2781 : // Generate deferred actions here along with code to undo them again.
2782 166 : OutSet affected_registers;
2783 :
2784 166 : if (backtrack() != nullptr) {
2785 : // Here we have a concrete backtrack location. These are set up by choice
2786 : // nodes and so they indicate that we have a deferred save of the current
2787 : // position which we may need to emit here.
2788 129 : assembler->PushCurrentPosition();
2789 : }
2790 :
2791 166 : int max_register = FindAffectedRegisters(compiler->alloc(), &affected_registers);
2792 166 : OutSet registers_to_pop;
2793 166 : OutSet registers_to_clear;
2794 166 : PerformDeferredActions(compiler->alloc(),
2795 : assembler,
2796 : max_register,
2797 : affected_registers,
2798 : ®isters_to_pop,
2799 166 : ®isters_to_clear);
2800 166 : if (cp_offset_ != 0)
2801 129 : assembler->AdvanceCurrentPosition(cp_offset_);
2802 :
2803 : // Create a new trivial state and generate the node with that.
2804 332 : jit::Label undo;
2805 166 : assembler->PushBacktrack(&undo);
2806 166 : Trace new_state;
2807 166 : successor->Emit(compiler, &new_state);
2808 :
2809 : // On backtrack we need to restore state.
2810 166 : assembler->BindBacktrack(&undo);
2811 : RestoreAffectedRegisters(assembler,
2812 : max_register,
2813 : registers_to_pop,
2814 166 : registers_to_clear);
2815 166 : if (backtrack() == nullptr) {
2816 37 : assembler->Backtrack();
2817 : } else {
2818 129 : assembler->PopCurrentPosition();
2819 129 : assembler->JumpOrBacktrack(backtrack());
2820 : }
2821 : }
2822 :
2823 : void
2824 58 : Trace::InvalidateCurrentCharacter()
2825 : {
2826 58 : characters_preloaded_ = 0;
2827 58 : }
2828 :
2829 : void
2830 232 : Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler)
2831 : {
2832 232 : MOZ_ASSERT(by > 0);
2833 : // We don't have an instruction for shifting the current character register
2834 : // down or for using a shifted value for anything so lets just forget that
2835 : // we preloaded any characters into it.
2836 232 : characters_preloaded_ = 0;
2837 : // Adjust the offsets of the quick check performed information. This
2838 : // information is used to find out what we already determined about the
2839 : // characters by means of mask and compare.
2840 232 : quick_check_performed_.Advance(by);
2841 232 : cp_offset_ += by;
2842 232 : if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2843 0 : compiler->SetRegExpTooBig();
2844 0 : cp_offset_ = 0;
2845 : }
2846 232 : bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
2847 232 : }
2848 :
2849 : void
2850 396 : OutSet::Set(LifoAlloc* alloc, unsigned value)
2851 : {
2852 396 : if (value < kFirstLimit) {
2853 396 : first_ |= (1 << value);
2854 : } else {
2855 0 : if (remaining_ == nullptr)
2856 0 : remaining_ = alloc->newInfallible<RemainingVector>(*alloc);
2857 :
2858 0 : for (size_t i = 0; i < remaining().length(); i++) {
2859 0 : if (remaining()[i] == value)
2860 0 : return;
2861 : }
2862 0 : remaining().append(value);
2863 : }
2864 : }
2865 :
2866 : bool
2867 2150 : OutSet::Get(unsigned value)
2868 : {
2869 2150 : if (value < kFirstLimit)
2870 2150 : return (first_ & (1 << value)) != 0;
2871 0 : if (remaining_ == nullptr)
2872 0 : return false;
2873 0 : for (size_t i = 0; i < remaining().length(); i++) {
2874 0 : if (remaining()[i] == value)
2875 0 : return true;
2876 : }
2877 0 : return false;
2878 : }
2879 :
2880 : // -------------------------------------------------------------------
2881 : // Graph emitting
2882 :
2883 : void
2884 12 : NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace)
2885 : {
2886 12 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2887 :
2888 : // Omit flushing the trace. We discard the entire stack frame anyway.
2889 :
2890 12 : if (!label()->bound()) {
2891 : // We are completely independent of the trace, since we ignore it,
2892 : // so this code can be used as the generic version.
2893 2 : assembler->Bind(label());
2894 : }
2895 :
2896 : // Throw away everything on the backtrack stack since the start
2897 : // of the negative submatch and restore the character position.
2898 12 : assembler->ReadCurrentPositionFromRegister(current_position_register_);
2899 12 : assembler->ReadBacktrackStackPointerFromRegister(stack_pointer_register_);
2900 :
2901 12 : if (clear_capture_count_ > 0) {
2902 : // Clear any captures that might have been performed during the success
2903 : // of the body of the negative look-ahead.
2904 0 : int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
2905 0 : assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
2906 : }
2907 :
2908 : // Now that we have unwound the stack we find at the top of the stack the
2909 : // backtrack that the BeginSubmatch node got.
2910 12 : assembler->Backtrack();
2911 12 : }
2912 :
2913 : void
2914 133 : EndNode::Emit(RegExpCompiler* compiler, Trace* trace)
2915 : {
2916 133 : if (!trace->is_trivial()) {
2917 66 : trace->Flush(compiler, this);
2918 66 : return;
2919 : }
2920 67 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2921 67 : if (!label()->bound()) {
2922 40 : assembler->Bind(label());
2923 : }
2924 67 : switch (action_) {
2925 : case ACCEPT:
2926 66 : assembler->Succeed();
2927 66 : return;
2928 : case BACKTRACK:
2929 1 : assembler->JumpOrBacktrack(trace->backtrack());
2930 1 : return;
2931 : case NEGATIVE_SUBMATCH_SUCCESS:
2932 : // This case is handled in a different virtual method.
2933 0 : MOZ_CRASH("Bad action: NEGATIVE_SUBMATCH_SUCCESS");
2934 : }
2935 0 : MOZ_CRASH("Bad action");
2936 : }
2937 :
2938 : // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2939 : // that matches newline or the start of input).
2940 : static void
2941 0 : EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace)
2942 : {
2943 0 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2944 :
2945 : // We will be loading the previous character into the current character
2946 : // register.
2947 0 : Trace new_trace(*trace);
2948 0 : new_trace.InvalidateCurrentCharacter();
2949 :
2950 0 : jit::Label ok;
2951 0 : if (new_trace.cp_offset() == 0) {
2952 : // The start of input counts as a newline in this context, so skip to
2953 : // ok if we are at the start.
2954 0 : assembler->CheckAtStart(&ok);
2955 : }
2956 :
2957 : // We already checked that we are not at the start of input so it must be
2958 : // OK to load the previous character.
2959 0 : assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, new_trace.backtrack(), false);
2960 :
2961 0 : if (!assembler->CheckSpecialCharacterClass('n', new_trace.backtrack())) {
2962 : // Newline means \n, \r, 0x2028 or 0x2029.
2963 0 : if (!compiler->latin1())
2964 0 : assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2965 0 : assembler->CheckCharacter('\n', &ok);
2966 0 : assembler->CheckNotCharacter('\r', new_trace.backtrack());
2967 : }
2968 0 : assembler->Bind(&ok);
2969 0 : on_success->Emit(compiler, &new_trace);
2970 0 : }
2971 :
2972 : // Assert that the next character cannot be a part of a surrogate pair.
2973 : static void
2974 0 : EmitNotAfterLeadSurrogate(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace)
2975 : {
2976 0 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2977 :
2978 : // We will be loading the previous character into the current character
2979 : // register.
2980 0 : Trace new_trace(*trace);
2981 0 : new_trace.InvalidateCurrentCharacter();
2982 :
2983 0 : jit::Label ok;
2984 0 : if (new_trace.cp_offset() == 0)
2985 0 : assembler->CheckAtStart(&ok);
2986 :
2987 : // We already checked that we are not at the start of input so it must be
2988 : // OK to load the previous character.
2989 0 : assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, new_trace.backtrack(), false);
2990 0 : assembler->CheckCharacterInRange(unicode::LeadSurrogateMin, unicode::LeadSurrogateMax,
2991 0 : new_trace.backtrack());
2992 :
2993 0 : assembler->Bind(&ok);
2994 0 : on_success->Emit(compiler, &new_trace);
2995 0 : }
2996 :
2997 : // Assert that the next character is not a trail surrogate that has a
2998 : // corresponding lead surrogate.
2999 : static void
3000 0 : EmitNotInSurrogatePair(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace)
3001 : {
3002 0 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3003 :
3004 0 : jit::Label ok;
3005 0 : assembler->CheckPosition(trace->cp_offset(), &ok);
3006 :
3007 : // We will be loading the next and previous characters into the current
3008 : // character register.
3009 0 : Trace new_trace(*trace);
3010 0 : new_trace.InvalidateCurrentCharacter();
3011 :
3012 0 : if (new_trace.cp_offset() == 0)
3013 0 : assembler->CheckAtStart(&ok);
3014 :
3015 : // First check if next character is a trail surrogate.
3016 0 : assembler->LoadCurrentCharacter(new_trace.cp_offset(), new_trace.backtrack(), false);
3017 : assembler->CheckCharacterNotInRange(unicode::TrailSurrogateMin, unicode::TrailSurrogateMax,
3018 0 : &ok);
3019 :
3020 : // Next check if previous character is a lead surrogate.
3021 : // We already checked that we are not at the start of input so it must be
3022 : // OK to load the previous character.
3023 0 : assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, new_trace.backtrack(), false);
3024 0 : assembler->CheckCharacterInRange(unicode::LeadSurrogateMin, unicode::LeadSurrogateMax,
3025 0 : new_trace.backtrack());
3026 :
3027 0 : assembler->Bind(&ok);
3028 0 : on_success->Emit(compiler, &new_trace);
3029 0 : }
3030 :
3031 : // Check for [0-9A-Z_a-z].
3032 : static void
3033 2 : EmitWordCheck(RegExpMacroAssembler* assembler,
3034 : jit::Label* word, jit::Label* non_word, bool fall_through_on_word,
3035 : bool unicode_ignore_case)
3036 : {
3037 4 : if (!unicode_ignore_case &&
3038 2 : assembler->CheckSpecialCharacterClass(fall_through_on_word ? 'w' : 'W',
3039 2 : fall_through_on_word ? non_word : word))
3040 : {
3041 : // Optimized implementation available.
3042 2 : return;
3043 : }
3044 :
3045 0 : if (unicode_ignore_case) {
3046 0 : assembler->CheckCharacter(0x017F, word);
3047 0 : assembler->CheckCharacter(0x212A, word);
3048 : }
3049 :
3050 0 : assembler->CheckCharacterGT('z', non_word);
3051 0 : assembler->CheckCharacterLT('0', non_word);
3052 0 : assembler->CheckCharacterGT('a' - 1, word);
3053 0 : assembler->CheckCharacterLT('9' + 1, word);
3054 0 : assembler->CheckCharacterLT('A', non_word);
3055 0 : assembler->CheckCharacterLT('Z' + 1, word);
3056 :
3057 0 : if (fall_through_on_word)
3058 0 : assembler->CheckNotCharacter('_', non_word);
3059 : else
3060 0 : assembler->CheckCharacter('_', word);
3061 : }
3062 :
3063 : // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3064 : void
3065 2 : AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace)
3066 : {
3067 2 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3068 2 : Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3069 2 : bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3070 2 : BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3071 2 : if (lookahead == nullptr) {
3072 : int eats_at_least =
3073 1 : Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3074 : kRecursionBudget,
3075 2 : not_at_start));
3076 1 : if (eats_at_least >= 1) {
3077 : BoyerMooreLookahead* bm =
3078 1 : alloc()->newInfallible<BoyerMooreLookahead>(alloc(), eats_at_least, compiler);
3079 1 : FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
3080 1 : if (bm->at(0)->is_non_word())
3081 1 : next_is_word_character = Trace::FALSE_VALUE;
3082 1 : if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3083 : }
3084 : } else {
3085 1 : if (lookahead->at(0)->is_non_word())
3086 0 : next_is_word_character = Trace::FALSE_VALUE;
3087 1 : if (lookahead->at(0)->is_word())
3088 1 : next_is_word_character = Trace::TRUE_VALUE;
3089 : }
3090 2 : bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3091 2 : if (next_is_word_character == Trace::UNKNOWN) {
3092 0 : jit::Label before_non_word;
3093 0 : jit::Label before_word;
3094 0 : if (trace->characters_preloaded() != 1) {
3095 0 : assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3096 : }
3097 : // Fall through on non-word.
3098 0 : EmitWordCheck(assembler, &before_word, &before_non_word, false,
3099 0 : compiler->unicode() && compiler->ignore_case());
3100 : // Next character is not a word character.
3101 0 : assembler->Bind(&before_non_word);
3102 0 : jit::Label ok;
3103 0 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3104 0 : assembler->JumpOrBacktrack(&ok);
3105 :
3106 0 : assembler->Bind(&before_word);
3107 0 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3108 0 : assembler->Bind(&ok);
3109 2 : } else if (next_is_word_character == Trace::TRUE_VALUE) {
3110 1 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3111 : } else {
3112 1 : MOZ_ASSERT(next_is_word_character == Trace::FALSE_VALUE);
3113 1 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3114 : }
3115 2 : }
3116 :
3117 : void
3118 2 : AssertionNode::BacktrackIfPrevious(RegExpCompiler* compiler,
3119 : Trace* trace,
3120 : AssertionNode::IfPrevious backtrack_if_previous)
3121 : {
3122 2 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3123 2 : Trace new_trace(*trace);
3124 2 : new_trace.InvalidateCurrentCharacter();
3125 :
3126 4 : jit::Label fall_through, dummy;
3127 :
3128 2 : jit::Label* non_word = backtrack_if_previous == kIsNonWord ? new_trace.backtrack() : &fall_through;
3129 2 : jit::Label* word = backtrack_if_previous == kIsNonWord ? &fall_through : new_trace.backtrack();
3130 :
3131 2 : if (new_trace.cp_offset() == 0) {
3132 : // The start of input counts as a non-word character, so the question is
3133 : // decided if we are at the start.
3134 1 : assembler->CheckAtStart(non_word);
3135 : }
3136 : // We already checked that we are not at the start of input so it must be
3137 : // OK to load the previous character.
3138 2 : assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3139 2 : EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord,
3140 4 : compiler->unicode() && compiler->ignore_case());
3141 :
3142 2 : assembler->Bind(&fall_through);
3143 2 : on_success()->Emit(compiler, &new_trace);
3144 2 : }
3145 :
3146 : void
3147 2 : AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3148 : RegExpCompiler* compiler,
3149 : int filled_in,
3150 : bool not_at_start)
3151 : {
3152 2 : if (assertion_type_ == AT_START && not_at_start) {
3153 0 : details->set_cannot_match();
3154 0 : return;
3155 : }
3156 2 : return on_success()->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
3157 : }
3158 :
3159 : void
3160 54 : AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace)
3161 : {
3162 54 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3163 54 : switch (assertion_type_) {
3164 : case AT_END: {
3165 62 : jit::Label ok;
3166 31 : assembler->CheckPosition(trace->cp_offset(), &ok);
3167 31 : assembler->JumpOrBacktrack(trace->backtrack());
3168 31 : assembler->Bind(&ok);
3169 31 : break;
3170 : }
3171 : case AT_START: {
3172 21 : if (trace->at_start() == Trace::FALSE_VALUE) {
3173 0 : assembler->JumpOrBacktrack(trace->backtrack());
3174 0 : return;
3175 : }
3176 21 : if (trace->at_start() == Trace::UNKNOWN) {
3177 21 : assembler->CheckNotAtStart(trace->backtrack());
3178 21 : Trace at_start_trace = *trace;
3179 21 : at_start_trace.set_at_start(true);
3180 21 : on_success()->Emit(compiler, &at_start_trace);
3181 21 : return;
3182 : }
3183 : }
3184 0 : break;
3185 : case AFTER_NEWLINE:
3186 0 : EmitHat(compiler, on_success(), trace);
3187 0 : return;
3188 : case AT_BOUNDARY:
3189 : case AT_NON_BOUNDARY: {
3190 2 : EmitBoundaryCheck(compiler, trace);
3191 2 : return;
3192 : }
3193 : case NOT_AFTER_LEAD_SURROGATE:
3194 0 : EmitNotAfterLeadSurrogate(compiler, on_success(), trace);
3195 0 : return;
3196 : case NOT_IN_SURROGATE_PAIR:
3197 0 : EmitNotInSurrogatePair(compiler, on_success(), trace);
3198 0 : return;
3199 : }
3200 31 : on_success()->Emit(compiler, trace);
3201 : }
3202 :
3203 : static bool
3204 2168 : DeterminedAlready(QuickCheckDetails* quick_check, int offset)
3205 : {
3206 2168 : if (quick_check == nullptr)
3207 0 : return false;
3208 2168 : if (offset >= quick_check->characters())
3209 1343 : return false;
3210 825 : return quick_check->positions(offset)->determines_perfectly;
3211 : }
3212 :
3213 : static void
3214 496 : UpdateBoundsCheck(int index, int* checked_up_to)
3215 : {
3216 496 : if (index > *checked_up_to)
3217 145 : *checked_up_to = index;
3218 496 : }
3219 :
3220 : static void
3221 1 : EmitBoundaryTest(RegExpMacroAssembler* masm,
3222 : int border,
3223 : jit::Label* fall_through,
3224 : jit::Label* above_or_equal,
3225 : jit::Label* below)
3226 : {
3227 1 : if (below != fall_through) {
3228 0 : masm->CheckCharacterLT(border, below);
3229 0 : if (above_or_equal != fall_through)
3230 0 : masm->JumpOrBacktrack(above_or_equal);
3231 : } else {
3232 1 : masm->CheckCharacterGT(border - 1, above_or_equal);
3233 : }
3234 1 : }
3235 :
3236 : static void
3237 106 : EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
3238 : int first,
3239 : int last,
3240 : jit::Label* fall_through,
3241 : jit::Label* in_range,
3242 : jit::Label* out_of_range)
3243 : {
3244 106 : if (in_range == fall_through) {
3245 44 : if (first == last)
3246 13 : masm->CheckNotCharacter(first, out_of_range);
3247 : else
3248 31 : masm->CheckCharacterNotInRange(first, last, out_of_range);
3249 : } else {
3250 62 : if (first == last)
3251 48 : masm->CheckCharacter(first, in_range);
3252 : else
3253 14 : masm->CheckCharacterInRange(first, last, in_range);
3254 62 : if (out_of_range != fall_through)
3255 0 : masm->JumpOrBacktrack(out_of_range);
3256 : }
3257 106 : }
3258 :
3259 : typedef InfallibleVector<int, 4> RangeBoundaryVector;
3260 :
3261 : // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
3262 : // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
3263 : static void
3264 6 : EmitUseLookupTable(RegExpMacroAssembler* masm,
3265 : RangeBoundaryVector& ranges,
3266 : int start_index,
3267 : int end_index,
3268 : int min_char,
3269 : jit::Label* fall_through,
3270 : jit::Label* even_label,
3271 : jit::Label* odd_label)
3272 : {
3273 : static const int kSize = RegExpMacroAssembler::kTableSize;
3274 : static const int kMask = RegExpMacroAssembler::kTableMask;
3275 :
3276 12 : DebugOnly<int> base = (min_char & ~kMask);
3277 :
3278 : // Assert that everything is on one kTableSize page.
3279 68 : for (int i = start_index; i <= end_index; i++)
3280 62 : MOZ_ASSERT((ranges[i] & ~kMask) == base);
3281 6 : MOZ_ASSERT(start_index == 0 || (ranges[start_index - 1] & ~kMask) <= base);
3282 :
3283 : char templ[kSize];
3284 : jit::Label* on_bit_set;
3285 : jit::Label* on_bit_clear;
3286 : int bit;
3287 6 : if (even_label == fall_through) {
3288 0 : on_bit_set = odd_label;
3289 0 : on_bit_clear = even_label;
3290 0 : bit = 1;
3291 : } else {
3292 6 : on_bit_set = even_label;
3293 6 : on_bit_clear = odd_label;
3294 6 : bit = 0;
3295 : }
3296 305 : for (int i = 0; i < (ranges[start_index] & kMask) && i < kSize; i++)
3297 299 : templ[i] = bit;
3298 6 : int j = 0;
3299 6 : bit ^= 1;
3300 62 : for (int i = start_index; i < end_index; i++) {
3301 390 : for (j = (ranges[i] & kMask); j < (ranges[i + 1] & kMask); j++) {
3302 334 : templ[j] = bit;
3303 : }
3304 56 : bit ^= 1;
3305 : }
3306 141 : for (int i = j; i < kSize; i++) {
3307 135 : templ[i] = bit;
3308 : }
3309 :
3310 : // TODO(erikcorry): Cache these.
3311 12 : RegExpShared::JitCodeTable ba;
3312 : {
3313 12 : AutoEnterOOMUnsafeRegion oomUnsafe;
3314 6 : ba.reset(static_cast<uint8_t*>(js_malloc(kSize)));
3315 6 : if (!ba)
3316 0 : oomUnsafe.crash("Table malloc");
3317 : }
3318 :
3319 774 : for (int i = 0; i < kSize; i++)
3320 768 : ba[i] = templ[i];
3321 :
3322 6 : masm->CheckBitInTable(Move(ba), on_bit_set);
3323 6 : if (on_bit_clear != fall_through)
3324 6 : masm->JumpOrBacktrack(on_bit_clear);
3325 6 : }
3326 :
3327 : static void
3328 44 : CutOutRange(RegExpMacroAssembler* masm,
3329 : RangeBoundaryVector& ranges,
3330 : int start_index,
3331 : int end_index,
3332 : int cut_index,
3333 : jit::Label* even_label,
3334 : jit::Label* odd_label)
3335 : {
3336 44 : bool odd = (((cut_index - start_index) & 1) == 1);
3337 44 : jit::Label* in_range_label = odd ? odd_label : even_label;
3338 88 : jit::Label dummy;
3339 88 : EmitDoubleBoundaryTest(masm,
3340 44 : ranges[cut_index],
3341 44 : ranges[cut_index + 1] - 1,
3342 : &dummy,
3343 : in_range_label,
3344 44 : &dummy);
3345 44 : MOZ_ASSERT(!dummy.used());
3346 : // Cut out the single range by rewriting the array. This creates a new
3347 : // range that is a merger of the two ranges on either side of the one we
3348 : // are cutting out. The oddity of the labels is preserved.
3349 67 : for (int j = cut_index; j > start_index; j--)
3350 23 : ranges[j] = ranges[j - 1];
3351 134 : for (int j = cut_index + 1; j < end_index; j++)
3352 90 : ranges[j] = ranges[j + 1];
3353 44 : }
3354 :
3355 : // Unicode case. Split the search space into kSize spaces that are handled
3356 : // with recursion.
3357 : static void
3358 6 : SplitSearchSpace(RangeBoundaryVector& ranges,
3359 : int start_index,
3360 : int end_index,
3361 : int* new_start_index,
3362 : int* new_end_index,
3363 : int* border)
3364 : {
3365 : static const int kSize = RegExpMacroAssembler::kTableSize;
3366 : static const int kMask = RegExpMacroAssembler::kTableMask;
3367 :
3368 6 : int first = ranges[start_index];
3369 6 : int last = ranges[end_index] - 1;
3370 :
3371 6 : *new_start_index = start_index;
3372 6 : *border = (ranges[start_index] & ~kMask) + kSize;
3373 130 : while (*new_start_index < end_index) {
3374 62 : if (ranges[*new_start_index] > *border)
3375 0 : break;
3376 62 : (*new_start_index)++;
3377 : }
3378 : // new_start_index is the index of the first edge that is beyond the
3379 : // current kSize space.
3380 :
3381 : // For very large search spaces we do a binary chop search of the non-ASCII
3382 : // space instead of just going to the end of the current kSize space. The
3383 : // heuristics are complicated a little by the fact that any 128-character
3384 : // encoding space can be quickly tested with a table lookup, so we don't
3385 : // wish to do binary chop search at a smaller granularity than that. A
3386 : // 128-character space can take up a lot of space in the ranges array if,
3387 : // for example, we only want to match every second character (eg. the lower
3388 : // case characters on some Unicode pages).
3389 6 : int binary_chop_index = (end_index + start_index) / 2;
3390 : // The first test ensures that we get to the code that handles the ASCII
3391 : // range with a single not-taken branch, speeding up this important
3392 : // character range (even non-ASCII charset-based text has spaces and
3393 : // punctuation).
3394 12 : if (*border - 1 > kMaxOneByteCharCode && // ASCII case.
3395 0 : end_index - start_index > (*new_start_index - start_index) * 2 &&
3396 0 : last - first > kSize * 2 &&
3397 6 : binary_chop_index > *new_start_index &&
3398 0 : ranges[binary_chop_index] >= first + 2 * kSize)
3399 : {
3400 0 : int scan_forward_for_section_border = binary_chop_index;;
3401 0 : int new_border = (ranges[binary_chop_index] | kMask) + 1;
3402 :
3403 0 : while (scan_forward_for_section_border < end_index) {
3404 0 : if (ranges[scan_forward_for_section_border] > new_border) {
3405 0 : *new_start_index = scan_forward_for_section_border;
3406 0 : *border = new_border;
3407 0 : break;
3408 : }
3409 0 : scan_forward_for_section_border++;
3410 : }
3411 : }
3412 :
3413 6 : MOZ_ASSERT(*new_start_index > start_index);
3414 6 : *new_end_index = *new_start_index - 1;
3415 6 : if (ranges[*new_end_index] == *border)
3416 0 : (*new_end_index)--;
3417 6 : if (*border >= ranges[end_index]) {
3418 6 : *border = ranges[end_index];
3419 6 : *new_start_index = end_index; // Won't be used.
3420 6 : *new_end_index = end_index - 1;
3421 : }
3422 6 : }
3423 :
3424 : // Gets a series of segment boundaries representing a character class. If the
3425 : // character is in the range between an even and an odd boundary (counting from
3426 : // start_index) then go to even_label, otherwise go to odd_label. We already
3427 : // know that the character is in the range of min_char to max_char inclusive.
3428 : // Either label can be nullptr indicating backtracking. Either label can also be
3429 : // equal to the fall_through label.
3430 : static void
3431 119 : GenerateBranches(RegExpMacroAssembler* masm,
3432 : RangeBoundaryVector& ranges,
3433 : int start_index,
3434 : int end_index,
3435 : char16_t min_char,
3436 : char16_t max_char,
3437 : jit::Label* fall_through,
3438 : jit::Label* even_label,
3439 : jit::Label* odd_label)
3440 : {
3441 119 : int first = ranges[start_index];
3442 119 : int last = ranges[end_index] - 1;
3443 :
3444 119 : MOZ_ASSERT(min_char < first);
3445 :
3446 : // Just need to test if the character is before or on-or-after
3447 : // a particular character.
3448 119 : if (start_index == end_index) {
3449 1 : EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
3450 1 : return;
3451 : }
3452 :
3453 : // Another almost trivial case: There is one interval in the middle that is
3454 : // different from the end intervals.
3455 118 : if (start_index + 1 == end_index) {
3456 62 : EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label, odd_label);
3457 62 : return;
3458 : }
3459 :
3460 : // It's not worth using table lookup if there are very few intervals in the
3461 : // character class.
3462 56 : if (end_index - start_index <= 6) {
3463 : // It is faster to test for individual characters, so we look for those
3464 : // first, then try arbitrary ranges in the second round.
3465 : static int kNoCutIndex = -1;
3466 44 : int cut = kNoCutIndex;
3467 109 : for (int i = start_index; i < end_index; i++) {
3468 98 : if (ranges[i] == ranges[i + 1] - 1) {
3469 33 : cut = i;
3470 33 : break;
3471 : }
3472 : }
3473 44 : if (cut == kNoCutIndex) cut = start_index;
3474 44 : CutOutRange(masm, ranges, start_index, end_index, cut, even_label, odd_label);
3475 44 : MOZ_ASSERT(end_index - start_index >= 2);
3476 44 : GenerateBranches(masm,
3477 : ranges,
3478 : start_index + 1,
3479 : end_index - 1,
3480 : min_char,
3481 : max_char,
3482 : fall_through,
3483 : even_label,
3484 44 : odd_label);
3485 44 : return;
3486 : }
3487 :
3488 : // If there are a lot of intervals in the regexp, then we will use tables to
3489 : // determine whether the character is inside or outside the character class.
3490 : static const int kBits = RegExpMacroAssembler::kTableSizeBits;
3491 :
3492 12 : if ((max_char >> kBits) == (min_char >> kBits)) {
3493 6 : EmitUseLookupTable(masm,
3494 : ranges,
3495 : start_index,
3496 : end_index,
3497 : min_char,
3498 : fall_through,
3499 : even_label,
3500 6 : odd_label);
3501 6 : return;
3502 : }
3503 :
3504 6 : if ((min_char >> kBits) != (first >> kBits)) {
3505 0 : masm->CheckCharacterLT(first, odd_label);
3506 0 : GenerateBranches(masm,
3507 : ranges,
3508 : start_index + 1,
3509 : end_index,
3510 : first,
3511 : max_char,
3512 : fall_through,
3513 : odd_label,
3514 0 : even_label);
3515 0 : return;
3516 : }
3517 :
3518 6 : int new_start_index = 0;
3519 6 : int new_end_index = 0;
3520 6 : int border = 0;
3521 :
3522 : SplitSearchSpace(ranges,
3523 : start_index,
3524 : end_index,
3525 : &new_start_index,
3526 : &new_end_index,
3527 6 : &border);
3528 :
3529 12 : jit::Label handle_rest;
3530 6 : jit::Label* above = &handle_rest;
3531 6 : if (border == last + 1) {
3532 : // We didn't find any section that started after the limit, so everything
3533 : // above the border is one of the terminal labels.
3534 6 : above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
3535 6 : MOZ_ASSERT(new_end_index == end_index - 1);
3536 : }
3537 :
3538 6 : MOZ_ASSERT(start_index <= new_end_index);
3539 6 : MOZ_ASSERT(new_start_index <= end_index);
3540 6 : MOZ_ASSERT(start_index < new_start_index);
3541 6 : MOZ_ASSERT(new_end_index < end_index);
3542 6 : MOZ_ASSERT(new_end_index + 1 == new_start_index ||
3543 : (new_end_index + 2 == new_start_index &&
3544 : border == ranges[new_end_index + 1]));
3545 6 : MOZ_ASSERT(min_char < border - 1);
3546 6 : MOZ_ASSERT(border < max_char);
3547 6 : MOZ_ASSERT(ranges[new_end_index] < border);
3548 6 : MOZ_ASSERT(border < ranges[new_start_index] ||
3549 : (border == ranges[new_start_index] &&
3550 : new_start_index == end_index &&
3551 : new_end_index == end_index - 1 &&
3552 : border == last + 1));
3553 6 : MOZ_ASSERT(new_start_index == 0 || border >= ranges[new_start_index - 1]);
3554 :
3555 6 : masm->CheckCharacterGT(border - 1, above);
3556 12 : jit::Label dummy;
3557 6 : GenerateBranches(masm,
3558 : ranges,
3559 : start_index,
3560 : new_end_index,
3561 : min_char,
3562 6 : border - 1,
3563 : &dummy,
3564 : even_label,
3565 6 : odd_label);
3566 6 : if (handle_rest.used()) {
3567 0 : masm->Bind(&handle_rest);
3568 0 : bool flip = (new_start_index & 1) != (start_index & 1);
3569 0 : GenerateBranches(masm,
3570 : ranges,
3571 : new_start_index,
3572 : end_index,
3573 : border,
3574 : max_char,
3575 : &dummy,
3576 : flip ? odd_label : even_label,
3577 0 : flip ? even_label : odd_label);
3578 : }
3579 : }
3580 :
3581 : static void
3582 100 : EmitCharClass(LifoAlloc* alloc,
3583 : RegExpMacroAssembler* macro_assembler,
3584 : RegExpCharacterClass* cc,
3585 : bool latin1,
3586 : jit::Label* on_failure,
3587 : int cp_offset,
3588 : bool check_offset,
3589 : bool preloaded)
3590 : {
3591 100 : CharacterRangeVector& ranges = cc->ranges(alloc);
3592 100 : if (!CharacterRange::IsCanonical(ranges)) {
3593 6 : CharacterRange::Canonicalize(ranges);
3594 : }
3595 :
3596 100 : int max_char = MaximumCharacter(latin1);
3597 100 : int range_count = ranges.length();
3598 :
3599 100 : int last_valid_range = range_count - 1;
3600 196 : while (last_valid_range >= 0) {
3601 148 : CharacterRange& range = ranges[last_valid_range];
3602 148 : if (range.from() <= max_char) {
3603 100 : break;
3604 : }
3605 48 : last_valid_range--;
3606 : }
3607 :
3608 100 : if (last_valid_range < 0) {
3609 0 : if (!cc->is_negated()) {
3610 0 : macro_assembler->JumpOrBacktrack(on_failure);
3611 : }
3612 0 : if (check_offset) {
3613 0 : macro_assembler->CheckPosition(cp_offset, on_failure);
3614 : }
3615 31 : return;
3616 : }
3617 :
3618 150 : if (last_valid_range == 0 &&
3619 50 : ranges[0].IsEverything(max_char)) {
3620 18 : if (cc->is_negated()) {
3621 0 : macro_assembler->JumpOrBacktrack(on_failure);
3622 : } else {
3623 : // This is a common case hit by non-anchored expressions.
3624 18 : if (check_offset) {
3625 3 : macro_assembler->CheckPosition(cp_offset, on_failure);
3626 : }
3627 : }
3628 18 : return;
3629 : }
3630 114 : if (last_valid_range == 0 &&
3631 101 : !cc->is_negated() &&
3632 19 : ranges[0].IsEverything(max_char)) {
3633 : // This is a common case hit by non-anchored expressions.
3634 0 : if (check_offset) {
3635 0 : macro_assembler->CheckPosition(cp_offset, on_failure);
3636 : }
3637 0 : return;
3638 : }
3639 :
3640 82 : if (!preloaded) {
3641 74 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
3642 : }
3643 :
3644 98 : if (cc->is_standard(alloc) &&
3645 16 : macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
3646 16 : on_failure)) {
3647 13 : return;
3648 : }
3649 :
3650 : // A new list with ascending entries. Each entry is a code unit
3651 : // where there is a boundary between code units that are part of
3652 : // the class and code units that are not. Normally we insert an
3653 : // entry at zero which goes to the failure label, but if there
3654 : // was already one there we fall through for success on that entry.
3655 : // Subsequent entries have alternating meaning (success/failure).
3656 : RangeBoundaryVector* range_boundaries =
3657 69 : alloc->newInfallible<RangeBoundaryVector>(*alloc);
3658 :
3659 69 : bool zeroth_entry_is_failure = !cc->is_negated();
3660 :
3661 69 : range_boundaries->reserve(last_valid_range);
3662 213 : for (int i = 0; i <= last_valid_range; i++) {
3663 144 : CharacterRange& range = ranges[i];
3664 144 : if (range.from() == 0) {
3665 4 : MOZ_ASSERT(i == 0);
3666 4 : zeroth_entry_is_failure = !zeroth_entry_is_failure;
3667 : } else {
3668 140 : range_boundaries->append(range.from());
3669 : }
3670 144 : range_boundaries->append(range.to() + 1);
3671 : }
3672 69 : int end_index = range_boundaries->length() - 1;
3673 69 : if ((*range_boundaries)[end_index] > max_char)
3674 3 : end_index--;
3675 :
3676 138 : jit::Label fall_through;
3677 69 : GenerateBranches(macro_assembler,
3678 : *range_boundaries,
3679 : 0, // start_index.
3680 : end_index,
3681 : 0, // min_char.
3682 : max_char,
3683 : &fall_through,
3684 : zeroth_entry_is_failure ? &fall_through : on_failure,
3685 69 : zeroth_entry_is_failure ? on_failure : &fall_through);
3686 69 : macro_assembler->Bind(&fall_through);
3687 : }
3688 :
3689 : typedef bool EmitCharacterFunction(RegExpCompiler* compiler,
3690 : char16_t c,
3691 : jit::Label* on_failure,
3692 : int cp_offset,
3693 : bool check,
3694 : bool preloaded);
3695 :
3696 : static inline bool
3697 303 : EmitSimpleCharacter(RegExpCompiler* compiler,
3698 : char16_t c,
3699 : jit::Label* on_failure,
3700 : int cp_offset,
3701 : bool check,
3702 : bool preloaded)
3703 : {
3704 303 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3705 303 : bool bound_checked = false;
3706 303 : if (!preloaded) {
3707 303 : assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
3708 303 : bound_checked = true;
3709 : }
3710 303 : assembler->CheckNotCharacter(c, on_failure);
3711 303 : return bound_checked;
3712 : }
3713 :
3714 : // Emit character for case independent match, when GetCaseIndependentLetters
3715 : // returns single character.
3716 : // This is used by the following 2 cases:
3717 : // * non-letters (things that don't have case)
3718 : // * letters that map across Latin1 and non-Latin1, and non-Latin1 case is
3719 : // filtered out because of Latin1 match
3720 : static inline bool
3721 93 : EmitAtomSingle(RegExpCompiler* compiler,
3722 : char16_t c,
3723 : jit::Label* on_failure,
3724 : int cp_offset,
3725 : bool check,
3726 : bool preloaded)
3727 : {
3728 93 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3729 93 : bool latin1 = compiler->latin1();
3730 : char16_t chars[kEcma262UnCanonicalizeMaxWidth];
3731 93 : int length = GetCaseIndependentLetters(c, latin1, compiler->unicode(), chars);
3732 93 : if (length != 1)
3733 93 : return false;
3734 :
3735 0 : bool checked = false;
3736 0 : if (!preloaded) {
3737 0 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
3738 0 : checked = check;
3739 : }
3740 0 : macro_assembler->CheckNotCharacter(chars[0], on_failure);
3741 0 : return checked;
3742 : }
3743 :
3744 : static bool
3745 93 : ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
3746 : bool latin1,
3747 : char16_t c1,
3748 : char16_t c2,
3749 : jit::Label* on_failure)
3750 : {
3751 93 : char16_t char_mask = MaximumCharacter(latin1);
3752 :
3753 93 : MOZ_ASSERT(c1 != c2);
3754 93 : if (c1 > c2) {
3755 93 : char16_t tmp = c1;
3756 93 : c1 = c2;
3757 93 : c2 = tmp;
3758 : }
3759 :
3760 93 : char16_t exor = c1 ^ c2;
3761 : // Check whether exor has only one bit set.
3762 93 : if (((exor - 1) & exor) == 0) {
3763 : // If c1 and c2 differ only by one bit.
3764 93 : char16_t mask = char_mask ^ exor;
3765 93 : macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
3766 93 : return true;
3767 : }
3768 :
3769 0 : char16_t diff = c2 - c1;
3770 0 : if (((diff - 1) & diff) == 0 && c1 >= diff) {
3771 : // If the characters differ by 2^n but don't differ by one bit then
3772 : // subtract the difference from the found character, then do the or
3773 : // trick. We avoid the theoretical case where negative numbers are
3774 : // involved in order to simplify code generation.
3775 0 : char16_t mask = char_mask ^ diff;
3776 0 : macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
3777 : diff,
3778 : mask,
3779 0 : on_failure);
3780 0 : return true;
3781 : }
3782 0 : return false;
3783 : }
3784 :
3785 : // Emit character for case independent match, when GetCaseIndependentLetters
3786 : // returns multiple characters.
3787 : static inline bool
3788 93 : EmitAtomMulti(RegExpCompiler* compiler,
3789 : char16_t c,
3790 : jit::Label* on_failure,
3791 : int cp_offset,
3792 : bool check,
3793 : bool preloaded)
3794 : {
3795 93 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3796 93 : bool latin1 = compiler->latin1();
3797 : char16_t chars[kEcma262UnCanonicalizeMaxWidth];
3798 93 : int length = GetCaseIndependentLetters(c, latin1, compiler->unicode(), chars);
3799 93 : if (length <= 1) return false;
3800 : // We may not need to check against the end of the input string
3801 : // if this character lies before a character that matched.
3802 93 : if (!preloaded)
3803 93 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
3804 186 : jit::Label ok;
3805 : MOZ_ASSERT(kEcma262UnCanonicalizeMaxWidth == 4);
3806 93 : switch (length) {
3807 : case 2: {
3808 186 : if (ShortCutEmitCharacterPair(macro_assembler,
3809 : latin1,
3810 93 : chars[0],
3811 93 : chars[1],
3812 : on_failure)) {
3813 : } else {
3814 0 : macro_assembler->CheckCharacter(chars[0], &ok);
3815 0 : macro_assembler->CheckNotCharacter(chars[1], on_failure);
3816 0 : macro_assembler->Bind(&ok);
3817 : }
3818 93 : break;
3819 : }
3820 : case 4:
3821 0 : macro_assembler->CheckCharacter(chars[3], &ok);
3822 : MOZ_FALLTHROUGH;
3823 : case 3:
3824 0 : macro_assembler->CheckCharacter(chars[0], &ok);
3825 0 : macro_assembler->CheckCharacter(chars[1], &ok);
3826 0 : macro_assembler->CheckNotCharacter(chars[2], on_failure);
3827 0 : macro_assembler->Bind(&ok);
3828 0 : break;
3829 : default:
3830 0 : MOZ_CRASH("Bad length");
3831 : }
3832 93 : return true;
3833 : }
3834 :
3835 : // We call this repeatedly to generate code for each pass over the text node.
3836 : // The passes are in increasing order of difficulty because we hope one
3837 : // of the first passes will fail in which case we are saved the work of the
3838 : // later passes. for example for the case independent regexp /%[asdfghjkl]a/
3839 : // we will check the '%' in the first pass, the case independent 'a' in the
3840 : // second pass and the character class in the last pass.
3841 : //
3842 : // The passes are done from right to left, so for example to test for /bar/
3843 : // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3844 : // and then a 'b' with offset 0. This means we can avoid the end-of-input
3845 : // bounds check most of the time. In the example we only need to check for
3846 : // end-of-input when loading the putative 'r'.
3847 : //
3848 : // A slight complication involves the fact that the first character may already
3849 : // be fetched into a register by the previous node. In this case we want to
3850 : // do the test for that character first. We do this in separate passes. The
3851 : // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3852 : // pass has been performed then subsequent passes will have true in
3853 : // first_element_checked to indicate that that character does not need to be
3854 : // checked again.
3855 : //
3856 : // In addition to all this we are passed a Trace, which can
3857 : // contain an AlternativeGeneration object. In this AlternativeGeneration
3858 : // object we can see details of any quick check that was already passed in
3859 : // order to get to the code we are now generating. The quick check can involve
3860 : // loading characters, which means we do not need to recheck the bounds
3861 : // up to the limit the quick check already checked. In addition the quick
3862 : // check can have involved a mask and compare operation which may simplify
3863 : // or obviate the need for further checks at some character positions.
3864 : void
3865 785 : TextNode::TextEmitPass(RegExpCompiler* compiler,
3866 : TextEmitPassType pass,
3867 : bool preloaded,
3868 : Trace* trace,
3869 : bool first_element_checked,
3870 : int* checked_up_to)
3871 : {
3872 785 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3873 785 : bool latin1 = compiler->latin1();
3874 785 : jit::Label* backtrack = trace->backtrack();
3875 785 : QuickCheckDetails* quick_check = trace->quick_check_performed();
3876 785 : int element_count = elements().length();
3877 1573 : for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3878 788 : TextElement elm = elements()[i];
3879 788 : int cp_offset = trace->cp_offset() + elm.cp_offset();
3880 788 : if (elm.text_type() == TextElement::ATOM) {
3881 453 : const CharacterVector& quarks = elm.atom()->data();
3882 2545 : for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3883 2092 : if (first_element_checked && i == 0 && j == 0) continue;
3884 2066 : if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3885 1262 : EmitCharacterFunction* emit_function = nullptr;
3886 1262 : switch (pass) {
3887 : case NON_LATIN1_MATCH:
3888 377 : MOZ_ASSERT(latin1);
3889 377 : if (!IsLatin1Equivalent(quarks[j], compiler)) {
3890 0 : assembler->JumpOrBacktrack(backtrack);
3891 0 : return;
3892 : }
3893 377 : break;
3894 : case CASE_SINGLE_CHARACTER_MATCH:
3895 93 : emit_function = &EmitAtomSingle;
3896 93 : break;
3897 : case SIMPLE_CHARACTER_MATCH:
3898 303 : emit_function = &EmitSimpleCharacter;
3899 303 : break;
3900 : case CASE_MUTLI_CHARACTER_MATCH:
3901 93 : emit_function = &EmitAtomMulti;
3902 93 : break;
3903 : default:
3904 396 : break;
3905 : }
3906 1262 : if (emit_function != nullptr) {
3907 1467 : bool bound_checked = emit_function(compiler,
3908 489 : quarks[j],
3909 : backtrack,
3910 : cp_offset + j,
3911 489 : *checked_up_to < cp_offset + j,
3912 489 : preloaded);
3913 489 : if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3914 : }
3915 : }
3916 : } else {
3917 335 : MOZ_ASSERT(TextElement::CHAR_CLASS == elm.text_type());
3918 335 : if (pass == CHARACTER_CLASS_MATCH) {
3919 124 : if (first_element_checked && i == 0) continue;
3920 102 : if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3921 100 : RegExpCharacterClass* cc = elm.char_class();
3922 200 : EmitCharClass(alloc(),
3923 : assembler,
3924 : cc,
3925 : latin1,
3926 : backtrack,
3927 : cp_offset,
3928 100 : *checked_up_to < cp_offset,
3929 100 : preloaded);
3930 100 : UpdateBoundsCheck(cp_offset, checked_up_to);
3931 : }
3932 : }
3933 : }
3934 : }
3935 :
3936 : int
3937 647 : TextNode::Length()
3938 : {
3939 647 : TextElement elm = elements()[elements().length() - 1];
3940 647 : MOZ_ASSERT(elm.cp_offset() >= 0);
3941 647 : return elm.cp_offset() + elm.length();
3942 : }
3943 :
3944 : bool
3945 1016 : TextNode::SkipPass(int int_pass, bool ignore_case)
3946 : {
3947 1016 : TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3948 1016 : if (ignore_case)
3949 300 : return pass == SIMPLE_CHARACTER_MATCH;
3950 716 : return pass == CASE_SINGLE_CHARACTER_MATCH || pass == CASE_MUTLI_CHARACTER_MATCH;
3951 : }
3952 :
3953 : // This generates the code to match a text node. A text node can contain
3954 : // straight character sequences (possibly to be matched in a case-independent
3955 : // way) and character classes. For efficiency we do not do this in a single
3956 : // pass from left to right. Instead we pass over the text node several times,
3957 : // emitting code for some character positions every time. See the comment on
3958 : // TextEmitPass for details.
3959 : void
3960 233 : TextNode::Emit(RegExpCompiler* compiler, Trace* trace)
3961 : {
3962 233 : LimitResult limit_result = LimitVersions(compiler, trace);
3963 234 : if (limit_result == DONE) return;
3964 232 : MOZ_ASSERT(limit_result == CONTINUE);
3965 :
3966 232 : if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3967 0 : compiler->SetRegExpTooBig();
3968 0 : return;
3969 : }
3970 :
3971 232 : if (compiler->latin1()) {
3972 202 : int dummy = 0;
3973 202 : TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3974 : }
3975 :
3976 232 : bool first_elt_done = false;
3977 232 : int bound_checked_to = trace->cp_offset() - 1;
3978 232 : bound_checked_to += trace->bound_checked_up_to();
3979 :
3980 : // If a character is preloaded into the current character register then
3981 : // check that now.
3982 232 : if (trace->characters_preloaded() == 1) {
3983 110 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3984 88 : if (!SkipPass(pass, compiler->ignore_case())) {
3985 48 : TextEmitPass(compiler,
3986 : static_cast<TextEmitPassType>(pass),
3987 : true,
3988 : trace,
3989 : false,
3990 48 : &bound_checked_to);
3991 : }
3992 : }
3993 22 : first_elt_done = true;
3994 : }
3995 :
3996 1160 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3997 928 : if (!SkipPass(pass, compiler->ignore_case())) {
3998 535 : TextEmitPass(compiler,
3999 : static_cast<TextEmitPassType>(pass),
4000 : false,
4001 : trace,
4002 : first_elt_done,
4003 535 : &bound_checked_to);
4004 : }
4005 : }
4006 :
4007 232 : Trace successor_trace(*trace);
4008 232 : successor_trace.set_at_start(false);
4009 232 : successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
4010 464 : RecursionCheck rc(compiler);
4011 232 : on_success()->Emit(compiler, &successor_trace);
4012 : }
4013 :
4014 : void
4015 194 : LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace)
4016 : {
4017 194 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4018 194 : if (trace->stop_node() == this) {
4019 : int text_length =
4020 26 : GreedyLoopTextLengthForAlternative(&alternatives()[0]);
4021 26 : MOZ_ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
4022 : // Update the counter-based backtracking info on the stack. This is an
4023 : // optimization for greedy loops (see below).
4024 26 : MOZ_ASSERT(trace->cp_offset() == text_length);
4025 26 : macro_assembler->AdvanceCurrentPosition(text_length);
4026 26 : macro_assembler->JumpOrBacktrack(trace->loop_label());
4027 26 : return;
4028 : }
4029 168 : MOZ_ASSERT(trace->stop_node() == nullptr);
4030 168 : if (!trace->is_trivial()) {
4031 75 : trace->Flush(compiler, this);
4032 75 : return;
4033 : }
4034 93 : ChoiceNode::Emit(compiler, trace);
4035 : }
4036 :
4037 : /* Code generation for choice nodes.
4038 : *
4039 : * We generate quick checks that do a mask and compare to eliminate a
4040 : * choice. If the quick check succeeds then it jumps to the continuation to
4041 : * do slow checks and check subsequent nodes. If it fails (the common case)
4042 : * it falls through to the next choice.
4043 : *
4044 : * Here is the desired flow graph. Nodes directly below each other imply
4045 : * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
4046 : * 3 doesn't have a quick check so we have to call the slow check.
4047 : * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
4048 : * regexp continuation is generated directly after the Sn node, up to the
4049 : * next JumpOrBacktrack if we decide to reuse some already generated code. Some
4050 : * nodes expect preload_characters to be preloaded into the current
4051 : * character register. R nodes do this preloading. Vertices are marked
4052 : * F for failures and S for success (possible success in the case of quick
4053 : * nodes). L, V, < and > are used as arrow heads.
4054 : *
4055 : * ----------> R
4056 : * |
4057 : * V
4058 : * Q1 -----> S1
4059 : * | S /
4060 : * F| /
4061 : * | F/
4062 : * | /
4063 : * | R
4064 : * | /
4065 : * V L
4066 : * Q2 -----> S2
4067 : * | S /
4068 : * F| /
4069 : * | F/
4070 : * | /
4071 : * | R
4072 : * | /
4073 : * V L
4074 : * S3
4075 : * |
4076 : * F|
4077 : * |
4078 : * R
4079 : * |
4080 : * backtrack V
4081 : * <----------Q4
4082 : * \ F |
4083 : * \ |S
4084 : * \ F V
4085 : * \-----S4
4086 : *
4087 : * For greedy loops we reverse our expectation and expect to match rather
4088 : * than fail. Therefore we want the loop code to look like this (U is the
4089 : * unwind code that steps back in the greedy loop). The following alternatives
4090 : * look the same as above.
4091 : * _____
4092 : * / \
4093 : * V |
4094 : * ----------> S1 |
4095 : * /| |
4096 : * / |S |
4097 : * F/ \_____/
4098 : * /
4099 : * |<-----------
4100 : * | \
4101 : * V \
4102 : * Q2 ---> S2 \
4103 : * | S / |
4104 : * F| / |
4105 : * | F/ |
4106 : * | / |
4107 : * | R |
4108 : * | / |
4109 : * F VL |
4110 : * <------U |
4111 : * back |S |
4112 : * \______________/
4113 : */
4114 :
4115 : // This class is used when generating the alternatives in a choice node. It
4116 : // records the way the alternative is being code generated.
4117 942 : class irregexp::AlternativeGeneration
4118 : {
4119 : public:
4120 942 : AlternativeGeneration()
4121 942 : : possible_success(),
4122 : expects_preload(false),
4123 : after(),
4124 942 : quick_check_details()
4125 942 : {}
4126 :
4127 : jit::Label possible_success;
4128 : bool expects_preload;
4129 : jit::Label after;
4130 : QuickCheckDetails quick_check_details;
4131 : };
4132 :
4133 : void
4134 16 : ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
4135 : Guard* guard, Trace* trace)
4136 : {
4137 16 : switch (guard->op()) {
4138 : case Guard::LT:
4139 11 : MOZ_ASSERT(!trace->mentions_reg(guard->reg()));
4140 11 : macro_assembler->IfRegisterGE(guard->reg(),
4141 : guard->value(),
4142 22 : trace->backtrack());
4143 11 : break;
4144 : case Guard::GEQ:
4145 5 : MOZ_ASSERT(!trace->mentions_reg(guard->reg()));
4146 5 : macro_assembler->IfRegisterLT(guard->reg(),
4147 : guard->value(),
4148 10 : trace->backtrack());
4149 5 : break;
4150 : }
4151 16 : }
4152 :
4153 : int
4154 93 : ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least)
4155 : {
4156 93 : int preload_characters = Min(4, eats_at_least);
4157 93 : if (compiler->macro_assembler()->CanReadUnaligned()) {
4158 93 : bool latin1 = compiler->latin1();
4159 93 : if (latin1) {
4160 81 : if (preload_characters > 4)
4161 0 : preload_characters = 4;
4162 : // We can't preload 3 characters because there is no machine instruction
4163 : // to do that. We can't just load 4 because we could be reading
4164 : // beyond the end of the string, which could cause a memory fault.
4165 81 : if (preload_characters == 3)
4166 1 : preload_characters = 2;
4167 : } else {
4168 12 : if (preload_characters > 2)
4169 2 : preload_characters = 2;
4170 : }
4171 : } else {
4172 0 : if (preload_characters > 1)
4173 0 : preload_characters = 1;
4174 : }
4175 93 : return preload_characters;
4176 : }
4177 :
4178 : RegExpNode*
4179 34 : TextNode::GetSuccessorOfOmnivorousTextNode(RegExpCompiler* compiler)
4180 : {
4181 34 : if (elements().length() != 1)
4182 1 : return nullptr;
4183 :
4184 33 : TextElement elm = elements()[0];
4185 33 : if (elm.text_type() != TextElement::CHAR_CLASS)
4186 13 : return nullptr;
4187 :
4188 20 : RegExpCharacterClass* node = elm.char_class();
4189 20 : CharacterRangeVector& ranges = node->ranges(alloc());
4190 :
4191 20 : if (!CharacterRange::IsCanonical(ranges))
4192 0 : CharacterRange::Canonicalize(ranges);
4193 :
4194 20 : if (node->is_negated())
4195 0 : return ranges.length() == 0 ? on_success() : nullptr;
4196 :
4197 20 : if (ranges.length() != 1)
4198 2 : return nullptr;
4199 :
4200 18 : uint32_t max_char = MaximumCharacter(compiler->latin1());
4201 18 : return ranges[0].IsEverything(max_char) ? on_success() : nullptr;
4202 : }
4203 :
4204 : // Finds the fixed match length of a sequence of nodes that goes from
4205 : // this alternative and back to this choice node. If there are variable
4206 : // length nodes or other complications in the way then return a sentinel
4207 : // value indicating that a greedy loop cannot be constructed.
4208 : int
4209 119 : ChoiceNode::GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative)
4210 : {
4211 119 : int length = 0;
4212 119 : RegExpNode* node = alternative->node();
4213 : // Later we will generate code for all these text nodes using recursion
4214 : // so we have to limit the max number.
4215 119 : int recursion_depth = 0;
4216 293 : while (node != this) {
4217 154 : if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
4218 0 : return kNodeIsTooComplexForGreedyLoops;
4219 : }
4220 154 : int node_length = node->GreedyLoopTextLength();
4221 154 : if (node_length == kNodeIsTooComplexForGreedyLoops) {
4222 67 : return kNodeIsTooComplexForGreedyLoops;
4223 : }
4224 87 : length += node_length;
4225 87 : SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
4226 87 : node = seq_node->on_success();
4227 : }
4228 52 : return length;
4229 : }
4230 :
4231 : // Creates a list of AlternativeGenerations. If the list has a reasonable
4232 : // size then it is on the stack, otherwise the excess is on the heap.
4233 : class AlternativeGenerationList
4234 : {
4235 : public:
4236 93 : AlternativeGenerationList(LifoAlloc* alloc, size_t count)
4237 93 : : alt_gens_(*alloc)
4238 : {
4239 93 : alt_gens_.reserve(count);
4240 315 : for (size_t i = 0; i < count && i < kAFew; i++)
4241 222 : alt_gens_.append(a_few_alt_gens_ + i);
4242 105 : for (size_t i = kAFew; i < count; i++) {
4243 24 : AutoEnterOOMUnsafeRegion oomUnsafe;
4244 12 : AlternativeGeneration* gen = js_new<AlternativeGeneration>();
4245 12 : if (!gen)
4246 0 : oomUnsafe.crash("AlternativeGenerationList js_new");
4247 12 : alt_gens_.append(gen);
4248 : }
4249 93 : }
4250 :
4251 186 : ~AlternativeGenerationList() {
4252 105 : for (size_t i = kAFew; i < alt_gens_.length(); i++) {
4253 12 : js_delete(alt_gens_[i]);
4254 12 : alt_gens_[i] = nullptr;
4255 : }
4256 93 : }
4257 :
4258 438 : AlternativeGeneration* at(int i) {
4259 438 : return alt_gens_[i];
4260 : }
4261 :
4262 : private:
4263 : static const size_t kAFew = 10;
4264 : InfallibleVector<AlternativeGeneration*, 1> alt_gens_;
4265 : AlternativeGeneration a_few_alt_gens_[kAFew];
4266 : };
4267 :
4268 : void
4269 149 : ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace)
4270 : {
4271 149 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4272 149 : size_t choice_count = alternatives().length();
4273 : #ifdef DEBUG
4274 346 : for (size_t i = 0; i < choice_count - 1; i++) {
4275 197 : const GuardedAlternative& alternative = alternatives()[i];
4276 197 : const GuardVector* guards = alternative.guards();
4277 197 : if (guards) {
4278 48 : for (size_t j = 0; j < guards->length(); j++)
4279 24 : MOZ_ASSERT(!trace->mentions_reg((*guards)[j]->reg()));
4280 : }
4281 : }
4282 : #endif
4283 :
4284 149 : LimitResult limit_result = LimitVersions(compiler, trace);
4285 205 : if (limit_result == DONE) return;
4286 93 : MOZ_ASSERT(limit_result == CONTINUE);
4287 :
4288 93 : int new_flush_budget = trace->flush_budget() / choice_count;
4289 93 : if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
4290 0 : trace->Flush(compiler, this);
4291 0 : return;
4292 : }
4293 :
4294 186 : RecursionCheck rc(compiler);
4295 :
4296 93 : Trace* current_trace = trace;
4297 :
4298 93 : int text_length = GreedyLoopTextLengthForAlternative(&alternatives()[0]);
4299 93 : bool greedy_loop = false;
4300 186 : jit::Label greedy_loop_label;
4301 93 : Trace counter_backtrack_trace;
4302 93 : counter_backtrack_trace.set_backtrack(&greedy_loop_label);
4303 93 : if (not_at_start()) counter_backtrack_trace.set_at_start(false);
4304 :
4305 93 : if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
4306 : // Here we have special handling for greedy loops containing only text nodes
4307 : // and other simple nodes. These are handled by pushing the current
4308 : // position on the stack and then incrementing the current position each
4309 : // time around the switch. On backtrack we decrement the current position
4310 : // and check it against the pushed value. This avoids pushing backtrack
4311 : // information for each iteration of the loop, which could take up a lot of
4312 : // space.
4313 26 : greedy_loop = true;
4314 26 : MOZ_ASSERT(trace->stop_node() == nullptr);
4315 26 : macro_assembler->PushCurrentPosition();
4316 26 : current_trace = &counter_backtrack_trace;
4317 52 : jit::Label greedy_match_failed;
4318 26 : Trace greedy_match_trace;
4319 26 : if (not_at_start()) greedy_match_trace.set_at_start(false);
4320 26 : greedy_match_trace.set_backtrack(&greedy_match_failed);
4321 52 : jit::Label loop_label;
4322 26 : macro_assembler->Bind(&loop_label);
4323 26 : greedy_match_trace.set_stop_node(this);
4324 26 : greedy_match_trace.set_loop_label(&loop_label);
4325 26 : alternatives()[0].node()->Emit(compiler, &greedy_match_trace);
4326 26 : macro_assembler->Bind(&greedy_match_failed);
4327 : }
4328 :
4329 186 : jit::Label second_choice; // For use in greedy matches.
4330 93 : macro_assembler->Bind(&second_choice);
4331 :
4332 93 : size_t first_normal_choice = greedy_loop ? 1 : 0;
4333 :
4334 93 : bool not_at_start = current_trace->at_start() == Trace::FALSE_VALUE;
4335 93 : const int kEatsAtLeastNotYetInitialized = -1;
4336 93 : int eats_at_least = kEatsAtLeastNotYetInitialized;
4337 :
4338 93 : bool skip_was_emitted = false;
4339 :
4340 93 : if (!greedy_loop && choice_count == 2) {
4341 60 : GuardedAlternative alt1 = alternatives()[1];
4342 60 : if (!alt1.guards() || alt1.guards()->length() == 0) {
4343 55 : RegExpNode* eats_anything_node = alt1.node();
4344 55 : if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == this) {
4345 : // At this point we know that we are at a non-greedy loop that will eat
4346 : // any character one at a time. Any non-anchored regexp has such a
4347 : // loop prepended to it in order to find where it starts. We look for
4348 : // a pattern of the form ...abc... where we can look 6 characters ahead
4349 : // and step forwards 3 if the character is not one of abc. Abc need
4350 : // not be atoms, they can be any reasonably limited character class or
4351 : // small alternation.
4352 18 : MOZ_ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.
4353 18 : BoyerMooreLookahead* lookahead = bm_info(not_at_start);
4354 18 : if (lookahead == nullptr) {
4355 18 : eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4356 : EatsAtLeast(kMaxLookaheadForBoyerMoore,
4357 : kRecursionBudget,
4358 18 : not_at_start));
4359 18 : if (eats_at_least >= 1) {
4360 : BoyerMooreLookahead* bm =
4361 18 : alloc()->newInfallible<BoyerMooreLookahead>(alloc(), eats_at_least, compiler);
4362 18 : GuardedAlternative alt0 = alternatives()[0];
4363 18 : alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
4364 18 : skip_was_emitted = bm->EmitSkipInstructions(macro_assembler);
4365 : }
4366 : } else {
4367 0 : skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler);
4368 : }
4369 : }
4370 : }
4371 : }
4372 :
4373 93 : if (eats_at_least == kEatsAtLeastNotYetInitialized) {
4374 : // Save some time by looking at most one machine word ahead.
4375 75 : eats_at_least =
4376 75 : EatsAtLeast(compiler->latin1() ? 4 : 2, kRecursionBudget, not_at_start);
4377 : }
4378 93 : int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);
4379 :
4380 176 : bool preload_is_current = !skip_was_emitted &&
4381 176 : (current_trace->characters_preloaded() == preload_characters);
4382 93 : bool preload_has_checked_bounds = preload_is_current;
4383 :
4384 186 : AlternativeGenerationList alt_gens(alloc(), choice_count);
4385 :
4386 : // For now we just call all choices one after the other. The idea ultimately
4387 : // is to use the Dispatch table to try only the relevant ones.
4388 301 : for (size_t i = first_normal_choice; i < choice_count; i++) {
4389 208 : GuardedAlternative alternative = alternatives()[i];
4390 208 : AlternativeGeneration* alt_gen = alt_gens.at(i);
4391 208 : alt_gen->quick_check_details.set_characters(preload_characters);
4392 208 : const GuardVector* guards = alternative.guards();
4393 208 : Trace new_trace(*current_trace);
4394 208 : new_trace.set_characters_preloaded(preload_is_current ?
4395 : preload_characters :
4396 208 : 0);
4397 208 : if (preload_has_checked_bounds) {
4398 150 : new_trace.set_bound_checked_up_to(preload_characters);
4399 : }
4400 208 : new_trace.quick_check_performed()->Clear();
4401 208 : if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4402 208 : alt_gen->expects_preload = preload_is_current;
4403 208 : bool generate_full_check_inline = false;
4404 414 : if (try_to_emit_quick_check_for_alternative(i) &&
4405 206 : alternative.node()->EmitQuickCheck(compiler,
4406 : &new_trace,
4407 : preload_has_checked_bounds,
4408 : &alt_gen->possible_success,
4409 : &alt_gen->quick_check_details,
4410 206 : i < choice_count - 1)) {
4411 : // Quick check was generated for this choice.
4412 110 : preload_is_current = true;
4413 110 : preload_has_checked_bounds = true;
4414 : // On the last choice in the ChoiceNode we generated the quick
4415 : // check to fall through on possible success. So now we need to
4416 : // generate the full check inline.
4417 110 : if (i == choice_count - 1) {
4418 36 : macro_assembler->Bind(&alt_gen->possible_success);
4419 36 : new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4420 36 : new_trace.set_characters_preloaded(preload_characters);
4421 36 : new_trace.set_bound_checked_up_to(preload_characters);
4422 36 : generate_full_check_inline = true;
4423 : }
4424 98 : } else if (alt_gen->quick_check_details.cannot_match()) {
4425 0 : if (i == choice_count - 1 && !greedy_loop) {
4426 0 : macro_assembler->JumpOrBacktrack(trace->backtrack());
4427 : }
4428 0 : continue;
4429 : } else {
4430 : // No quick check was generated. Put the full code here.
4431 : // If this is not the first choice then there could be slow checks from
4432 : // previous cases that go here when they fail. There's no reason to
4433 : // insist that they preload characters since the slow check we are about
4434 : // to generate probably can't use it.
4435 98 : if (i != first_normal_choice) {
4436 56 : alt_gen->expects_preload = false;
4437 56 : new_trace.InvalidateCurrentCharacter();
4438 : }
4439 98 : if (i < choice_count - 1) {
4440 41 : new_trace.set_backtrack(&alt_gen->after);
4441 : }
4442 98 : generate_full_check_inline = true;
4443 : }
4444 208 : if (generate_full_check_inline) {
4445 134 : if (new_trace.actions() != nullptr)
4446 41 : new_trace.set_flush_budget(new_flush_budget);
4447 134 : if (guards) {
4448 14 : for (size_t j = 0; j < guards->length(); j++)
4449 7 : GenerateGuard(macro_assembler, (*guards)[j], &new_trace);
4450 : }
4451 134 : alternative.node()->Emit(compiler, &new_trace);
4452 134 : preload_is_current = false;
4453 : }
4454 208 : macro_assembler->Bind(&alt_gen->after);
4455 : }
4456 93 : if (greedy_loop) {
4457 26 : macro_assembler->Bind(&greedy_loop_label);
4458 : // If we have unwound to the bottom then backtrack.
4459 26 : macro_assembler->CheckGreedyLoop(trace->backtrack());
4460 : // Otherwise try the second priority at an earlier position.
4461 26 : macro_assembler->AdvanceCurrentPosition(-text_length);
4462 26 : macro_assembler->JumpOrBacktrack(&second_choice);
4463 : }
4464 :
4465 : // At this point we need to generate slow checks for the alternatives where
4466 : // the quick check was inlined. We can recognize these because the associated
4467 : // label was bound.
4468 208 : for (size_t i = first_normal_choice; i < choice_count - 1; i++) {
4469 115 : AlternativeGeneration* alt_gen = alt_gens.at(i);
4470 115 : Trace new_trace(*current_trace);
4471 : // If there are actions to be flushed we have to limit how many times
4472 : // they are flushed. Take the budget of the parent trace and distribute
4473 : // it fairly amongst the children.
4474 115 : if (new_trace.actions() != nullptr) {
4475 60 : new_trace.set_flush_budget(new_flush_budget);
4476 : }
4477 115 : EmitOutOfLineContinuation(compiler,
4478 : &new_trace,
4479 115 : alternatives()[i],
4480 : alt_gen,
4481 : preload_characters,
4482 230 : alt_gens.at(i + 1)->expects_preload);
4483 : }
4484 : }
4485 :
4486 : void
4487 115 : ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4488 : Trace* trace,
4489 : GuardedAlternative alternative,
4490 : AlternativeGeneration* alt_gen,
4491 : int preload_characters,
4492 : bool next_expects_preload)
4493 : {
4494 115 : if (!alt_gen->possible_success.used())
4495 41 : return;
4496 :
4497 74 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4498 74 : macro_assembler->Bind(&alt_gen->possible_success);
4499 74 : Trace out_of_line_trace(*trace);
4500 74 : out_of_line_trace.set_characters_preloaded(preload_characters);
4501 74 : out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4502 74 : if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4503 74 : const GuardVector* guards = alternative.guards();
4504 74 : if (next_expects_preload) {
4505 118 : jit::Label reload_current_char;
4506 59 : out_of_line_trace.set_backtrack(&reload_current_char);
4507 59 : if (guards) {
4508 18 : for (size_t j = 0; j < guards->length(); j++)
4509 9 : GenerateGuard(macro_assembler, (*guards)[j], &out_of_line_trace);
4510 : }
4511 59 : alternative.node()->Emit(compiler, &out_of_line_trace);
4512 59 : macro_assembler->Bind(&reload_current_char);
4513 : // Reload the current character, since the next quick check expects that.
4514 : // We don't need to check bounds here because we only get into this
4515 : // code through a quick check which already did the checked load.
4516 59 : macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4517 : nullptr,
4518 : false,
4519 118 : preload_characters);
4520 59 : macro_assembler->JumpOrBacktrack(&(alt_gen->after));
4521 : } else {
4522 15 : out_of_line_trace.set_backtrack(&(alt_gen->after));
4523 15 : if (guards) {
4524 0 : for (size_t j = 0; j < guards->length(); j++)
4525 0 : GenerateGuard(macro_assembler, (*guards)[j], &out_of_line_trace);
4526 : }
4527 15 : alternative.node()->Emit(compiler, &out_of_line_trace);
4528 : }
4529 : }
4530 :
4531 : void
4532 290 : ActionNode::Emit(RegExpCompiler* compiler, Trace* trace)
4533 : {
4534 290 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4535 290 : LimitResult limit_result = LimitVersions(compiler, trace);
4536 362 : if (limit_result == DONE) return;
4537 248 : MOZ_ASSERT(limit_result == CONTINUE);
4538 :
4539 466 : RecursionCheck rc(compiler);
4540 :
4541 248 : switch (action_type_) {
4542 : case STORE_POSITION: {
4543 : Trace::DeferredCapture
4544 : new_capture(data_.u_position_register.reg,
4545 164 : data_.u_position_register.is_capture,
4546 164 : trace);
4547 164 : Trace new_trace = *trace;
4548 164 : new_trace.add_action(&new_capture);
4549 164 : on_success()->Emit(compiler, &new_trace);
4550 164 : break;
4551 : }
4552 : case INCREMENT_REGISTER: {
4553 : Trace::DeferredIncrementRegister
4554 13 : new_increment(data_.u_increment_register.reg);
4555 13 : Trace new_trace = *trace;
4556 13 : new_trace.add_action(&new_increment);
4557 13 : on_success()->Emit(compiler, &new_trace);
4558 13 : break;
4559 : }
4560 : case SET_REGISTER: {
4561 : Trace::DeferredSetRegister
4562 11 : new_set(data_.u_store_register.reg, data_.u_store_register.value);
4563 11 : Trace new_trace = *trace;
4564 11 : new_trace.add_action(&new_set);
4565 11 : on_success()->Emit(compiler, &new_trace);
4566 11 : break;
4567 : }
4568 : case CLEAR_CAPTURES: {
4569 : Trace::DeferredClearCaptures
4570 : new_capture(Interval(data_.u_clear_captures.range_from,
4571 6 : data_.u_clear_captures.range_to));
4572 6 : Trace new_trace = *trace;
4573 6 : new_trace.add_action(&new_capture);
4574 6 : on_success()->Emit(compiler, &new_trace);
4575 6 : break;
4576 : }
4577 : case BEGIN_SUBMATCH:
4578 24 : if (!trace->is_trivial()) {
4579 12 : trace->Flush(compiler, this);
4580 : } else {
4581 12 : assembler->WriteCurrentPositionToRegister(data_.u_submatch.current_position_register, 0);
4582 12 : assembler->WriteBacktrackStackPointerToRegister(data_.u_submatch.stack_pointer_register);
4583 12 : on_success()->Emit(compiler, trace);
4584 : }
4585 24 : break;
4586 : case EMPTY_MATCH_CHECK: {
4587 0 : int start_pos_reg = data_.u_empty_match_check.start_register;
4588 0 : int stored_pos = 0;
4589 0 : int rep_reg = data_.u_empty_match_check.repetition_register;
4590 0 : bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4591 0 : bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4592 0 : if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4593 : // If we know we haven't advanced and there is no minimum we
4594 : // can just backtrack immediately.
4595 0 : assembler->JumpOrBacktrack(trace->backtrack());
4596 0 : } else if (know_dist && stored_pos < trace->cp_offset()) {
4597 : // If we know we've advanced we can generate the continuation
4598 : // immediately.
4599 0 : on_success()->Emit(compiler, trace);
4600 0 : } else if (!trace->is_trivial()) {
4601 0 : trace->Flush(compiler, this);
4602 : } else {
4603 0 : jit::Label skip_empty_check;
4604 : // If we have a minimum number of repetitions we check the current
4605 : // number first and skip the empty check if it's not enough.
4606 0 : if (has_minimum) {
4607 0 : int limit = data_.u_empty_match_check.repetition_limit;
4608 0 : assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4609 : }
4610 : // If the match is empty we bail out, otherwise we fall through
4611 : // to the on-success continuation.
4612 0 : assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4613 0 : trace->backtrack());
4614 0 : assembler->Bind(&skip_empty_check);
4615 0 : on_success()->Emit(compiler, trace);
4616 : }
4617 0 : break;
4618 : }
4619 : case POSITIVE_SUBMATCH_SUCCESS: {
4620 30 : if (!trace->is_trivial()) {
4621 20 : trace->Flush(compiler, this);
4622 20 : return;
4623 : }
4624 10 : assembler->ReadCurrentPositionFromRegister(data_.u_submatch.current_position_register);
4625 10 : assembler->ReadBacktrackStackPointerFromRegister(data_.u_submatch.stack_pointer_register);
4626 10 : int clear_register_count = data_.u_submatch.clear_register_count;
4627 10 : if (clear_register_count == 0) {
4628 10 : on_success()->Emit(compiler, trace);
4629 10 : return;
4630 : }
4631 0 : int clear_registers_from = data_.u_submatch.clear_register_from;
4632 0 : jit::Label clear_registers_backtrack;
4633 0 : Trace new_trace = *trace;
4634 0 : new_trace.set_backtrack(&clear_registers_backtrack);
4635 0 : on_success()->Emit(compiler, &new_trace);
4636 :
4637 0 : assembler->Bind(&clear_registers_backtrack);
4638 0 : int clear_registers_to = clear_registers_from + clear_register_count - 1;
4639 0 : assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4640 :
4641 0 : MOZ_ASSERT(trace->backtrack() == nullptr);
4642 0 : assembler->Backtrack();
4643 0 : return;
4644 : }
4645 : default:
4646 0 : MOZ_CRASH("Bad action");
4647 : }
4648 : }
4649 :
4650 : void
4651 0 : BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace)
4652 : {
4653 0 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4654 0 : if (!trace->is_trivial()) {
4655 0 : trace->Flush(compiler, this);
4656 0 : return;
4657 : }
4658 :
4659 0 : LimitResult limit_result = LimitVersions(compiler, trace);
4660 0 : if (limit_result == DONE) return;
4661 0 : MOZ_ASSERT(limit_result == CONTINUE);
4662 :
4663 0 : RecursionCheck rc(compiler);
4664 :
4665 0 : MOZ_ASSERT(start_reg_ + 1 == end_reg_);
4666 0 : if (compiler->ignore_case()) {
4667 0 : assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
4668 : trace->backtrack(),
4669 0 : compiler->unicode());
4670 : } else {
4671 0 : assembler->CheckNotBackReference(start_reg_, trace->backtrack());
4672 : }
4673 0 : on_success()->Emit(compiler, trace);
4674 : }
4675 :
4676 : RegExpNode::LimitResult
4677 672 : RegExpNode::LimitVersions(RegExpCompiler* compiler, Trace* trace)
4678 : {
4679 : // If we are generating a greedy loop then don't stop and don't reuse code.
4680 672 : if (trace->stop_node() != nullptr)
4681 26 : return CONTINUE;
4682 :
4683 646 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4684 646 : if (trace->is_trivial()) {
4685 209 : if (label()->bound()) {
4686 : // We are being asked to generate a generic version, but that's already
4687 : // been done so just go to it.
4688 76 : macro_assembler->JumpOrBacktrack(label());
4689 76 : return DONE;
4690 : }
4691 133 : if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
4692 : // To avoid too deep recursion we push the node to the work queue and just
4693 : // generate a goto here.
4694 0 : compiler->AddWork(this);
4695 0 : macro_assembler->JumpOrBacktrack(label());
4696 0 : return DONE;
4697 : }
4698 : // Generate generic version of the node and bind the label for later use.
4699 133 : macro_assembler->Bind(label());
4700 133 : return CONTINUE;
4701 : }
4702 :
4703 : // We are being asked to make a non-generic version. Keep track of how many
4704 : // non-generic versions we generate so as not to overdo it.
4705 437 : trace_count_++;
4706 851 : if (trace_count_ < kMaxCopiesCodeGenerated &&
4707 414 : compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
4708 414 : return CONTINUE;
4709 : }
4710 :
4711 : // If we get here code has been generated for this node too many times or
4712 : // recursion is too deep. Time to switch to a generic version. The code for
4713 : // generic versions above can handle deep recursion properly.
4714 23 : trace->Flush(compiler, this);
4715 23 : return DONE;
4716 : }
4717 :
4718 : bool
4719 206 : RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
4720 : Trace* trace,
4721 : bool preload_has_checked_bounds,
4722 : jit::Label* on_possible_success,
4723 : QuickCheckDetails* details,
4724 : bool fall_through_on_failure)
4725 : {
4726 206 : if (details->characters() == 0) return false;
4727 133 : GetQuickCheckDetails(
4728 266 : details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
4729 133 : if (details->cannot_match()) return false;
4730 133 : if (!details->Rationalize(compiler->latin1())) return false;
4731 110 : MOZ_ASSERT(details->characters() == 1 ||
4732 : compiler->macro_assembler()->CanReadUnaligned());
4733 110 : uint32_t mask = details->mask();
4734 110 : uint32_t value = details->value();
4735 :
4736 110 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4737 :
4738 110 : if (trace->characters_preloaded() != details->characters()) {
4739 100 : assembler->LoadCurrentCharacter(trace->cp_offset(),
4740 : trace->backtrack(),
4741 50 : !preload_has_checked_bounds,
4742 100 : details->characters());
4743 : }
4744 :
4745 110 : bool need_mask = true;
4746 :
4747 110 : if (details->characters() == 1) {
4748 : // If number of characters preloaded is 1 then we used a byte or 16 bit
4749 : // load so the value is already masked down.
4750 30 : uint32_t char_mask = MaximumCharacter(compiler->latin1());
4751 30 : if ((mask & char_mask) == char_mask) need_mask = false;
4752 30 : mask &= char_mask;
4753 : } else {
4754 : // For 2-character preloads in LATIN1 mode or 1-character preloads in
4755 : // TWO_BYTE mode we also use a 16 bit load with zero extend.
4756 80 : if (details->characters() == 2 && compiler->latin1()) {
4757 29 : if ((mask & 0xffff) == 0xffff) need_mask = false;
4758 51 : } else if (details->characters() == 1 && !compiler->latin1()) {
4759 0 : if ((mask & 0xffff) == 0xffff) need_mask = false;
4760 : } else {
4761 51 : if (mask == 0xffffffff) need_mask = false;
4762 : }
4763 : }
4764 :
4765 110 : if (fall_through_on_failure) {
4766 74 : if (need_mask) {
4767 49 : assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
4768 : } else {
4769 25 : assembler->CheckCharacter(value, on_possible_success);
4770 : }
4771 : } else {
4772 36 : if (need_mask) {
4773 18 : assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
4774 : } else {
4775 18 : assembler->CheckNotCharacter(value, trace->backtrack());
4776 : }
4777 : }
4778 110 : return true;
4779 : }
4780 :
4781 : bool
4782 38 : TextNode::FillInBMInfo(int initial_offset,
4783 : int budget,
4784 : BoyerMooreLookahead* bm,
4785 : bool not_at_start)
4786 : {
4787 38 : if (!bm->CheckOverRecursed())
4788 0 : return false;
4789 :
4790 38 : if (initial_offset >= bm->length())
4791 0 : return true;
4792 :
4793 38 : int offset = initial_offset;
4794 38 : int max_char = bm->max_char();
4795 74 : for (size_t i = 0; i < elements().length(); i++) {
4796 38 : if (offset >= bm->length()) {
4797 0 : if (initial_offset == 0)
4798 0 : set_bm_info(not_at_start, bm);
4799 2 : return true;
4800 : }
4801 38 : TextElement text = elements()[i];
4802 38 : if (text.text_type() == TextElement::ATOM) {
4803 22 : RegExpAtom* atom = text.atom();
4804 77 : for (int j = 0; j < atom->length(); j++, offset++) {
4805 57 : if (offset >= bm->length()) {
4806 2 : if (initial_offset == 0)
4807 2 : set_bm_info(not_at_start, bm);
4808 2 : return true;
4809 : }
4810 55 : char16_t character = atom->data()[j];
4811 55 : if (bm->compiler()->ignore_case()) {
4812 : char16_t chars[kEcma262UnCanonicalizeMaxWidth];
4813 60 : int length = GetCaseIndependentLetters(character,
4814 30 : bm->max_char() == kMaxOneByteCharCode,
4815 30 : bm->compiler()->unicode(),
4816 30 : chars);
4817 71 : for (int j = 0; j < length; j++)
4818 41 : bm->Set(offset, chars[j]);
4819 : } else {
4820 25 : if (character <= max_char) bm->Set(offset, character);
4821 : }
4822 : }
4823 : } else {
4824 16 : MOZ_ASSERT(TextElement::CHAR_CLASS == text.text_type());
4825 16 : RegExpCharacterClass* char_class = text.char_class();
4826 16 : const CharacterRangeVector& ranges = char_class->ranges(alloc());
4827 16 : if (char_class->is_negated()) {
4828 2 : bm->SetAll(offset);
4829 : } else {
4830 79 : for (size_t k = 0; k < ranges.length(); k++) {
4831 65 : const CharacterRange& range = ranges[k];
4832 65 : if (range.from() > max_char)
4833 30 : continue;
4834 35 : int to = Min(max_char, static_cast<int>(range.to()));
4835 35 : bm->SetInterval(offset, Interval(range.from(), to));
4836 : }
4837 : }
4838 16 : offset++;
4839 : }
4840 : }
4841 36 : if (offset >= bm->length()) {
4842 30 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
4843 30 : return true;
4844 : }
4845 12 : if (!on_success()->FillInBMInfo(offset,
4846 : budget - 1,
4847 : bm,
4848 6 : true)) // Not at start after a text node.
4849 0 : return false;
4850 6 : if (initial_offset == 0)
4851 4 : set_bm_info(not_at_start, bm);
4852 6 : return true;
4853 : }
4854 :
4855 : // -------------------------------------------------------------------
4856 : // QuickCheckDetails
4857 :
4858 : // Takes the left-most 1-bit and smears it out, setting all bits to its right.
4859 : static inline uint32_t
4860 162 : SmearBitsRight(uint32_t v)
4861 : {
4862 162 : v |= v >> 1;
4863 162 : v |= v >> 2;
4864 162 : v |= v >> 4;
4865 162 : v |= v >> 8;
4866 162 : v |= v >> 16;
4867 162 : return v;
4868 : }
4869 :
4870 : // Here is the meat of GetQuickCheckDetails (see also the comment on the
4871 : // super-class in the .h file).
4872 : //
4873 : // We iterate along the text object, building up for each character a
4874 : // mask and value that can be used to test for a quick failure to match.
4875 : // The masks and values for the positions will be combined into a single
4876 : // machine word for the current character width in order to be used in
4877 : // generating a quick check.
4878 : void
4879 206 : TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
4880 : RegExpCompiler* compiler,
4881 : int characters_filled_in,
4882 : bool not_at_start)
4883 : {
4884 206 : MOZ_ASSERT(characters_filled_in < details->characters());
4885 206 : int characters = details->characters();
4886 206 : int char_mask = MaximumCharacter(compiler->latin1());
4887 :
4888 238 : for (size_t k = 0; k < elements().length(); k++) {
4889 207 : TextElement elm = elements()[k];
4890 207 : if (elm.text_type() == TextElement::ATOM) {
4891 134 : const CharacterVector& quarks = elm.atom()->data();
4892 296 : for (size_t i = 0; i < (size_t) characters && i < quarks.length(); i++) {
4893 : QuickCheckDetails::Position* pos =
4894 279 : details->positions(characters_filled_in);
4895 279 : char16_t c = quarks[i];
4896 279 : if (c > char_mask) {
4897 : // If we expect a non-LATIN1 character from an LATIN1 string,
4898 : // there is no way we can match. Not even case independent
4899 : // matching can turn an LATIN1 character into non-LATIN1 or
4900 : // vice versa.
4901 0 : details->set_cannot_match();
4902 0 : pos->determines_perfectly = false;
4903 175 : return;
4904 : }
4905 279 : if (compiler->ignore_case()) {
4906 : char16_t chars[kEcma262UnCanonicalizeMaxWidth];
4907 156 : size_t length = GetCaseIndependentLetters(c, compiler->latin1(),
4908 312 : compiler->unicode(), chars);
4909 156 : MOZ_ASSERT(length != 0); // Can only happen if c > char_mask (see above).
4910 156 : if (length == 1) {
4911 : // This letter has no case equivalents, so it's nice and simple
4912 : // and the mask-compare will determine definitely whether we have
4913 : // a match at this character position.
4914 59 : pos->mask = char_mask;
4915 59 : pos->value = c;
4916 59 : pos->determines_perfectly = true;
4917 : } else {
4918 97 : uint32_t common_bits = char_mask;
4919 97 : uint32_t bits = chars[0];
4920 194 : for (size_t j = 1; j < length; j++) {
4921 97 : uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
4922 97 : common_bits ^= differing_bits;
4923 97 : bits &= common_bits;
4924 : }
4925 : // If length is 2 and common bits has only one zero in it then
4926 : // our mask and compare instruction will determine definitely
4927 : // whether we have a match at this character position. Otherwise
4928 : // it can only be an approximate check.
4929 97 : uint32_t one_zero = (common_bits | ~char_mask);
4930 97 : if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
4931 97 : pos->determines_perfectly = true;
4932 : }
4933 97 : pos->mask = common_bits;
4934 97 : pos->value = bits;
4935 : }
4936 : } else {
4937 : // Don't ignore case. Nice simple case where the mask-compare will
4938 : // determine definitely whether we have a match at this character
4939 : // position.
4940 123 : pos->mask = char_mask;
4941 123 : pos->value = c;
4942 123 : pos->determines_perfectly = true;
4943 : }
4944 279 : characters_filled_in++;
4945 279 : MOZ_ASSERT(characters_filled_in <= details->characters());
4946 279 : if (characters_filled_in == details->characters()) {
4947 117 : return;
4948 : }
4949 : }
4950 : } else {
4951 : QuickCheckDetails::Position* pos =
4952 73 : details->positions(characters_filled_in);
4953 73 : RegExpCharacterClass* tree = elm.char_class();
4954 73 : const CharacterRangeVector& ranges = tree->ranges(alloc());
4955 73 : if (tree->is_negated()) {
4956 : // A quick check uses multi-character mask and compare. There is no
4957 : // useful way to incorporate a negative char class into this scheme
4958 : // so we just conservatively create a mask and value that will always
4959 : // succeed.
4960 6 : pos->mask = 0;
4961 6 : pos->value = 0;
4962 : } else {
4963 67 : size_t first_range = 0;
4964 67 : while (ranges[first_range].from() > char_mask) {
4965 0 : first_range++;
4966 0 : if (first_range == ranges.length()) {
4967 0 : details->set_cannot_match();
4968 0 : pos->determines_perfectly = false;
4969 0 : return;
4970 : }
4971 : }
4972 67 : CharacterRange range = ranges[first_range];
4973 67 : char16_t from = range.from();
4974 67 : char16_t to = range.to();
4975 67 : if (to > char_mask) {
4976 21 : to = char_mask;
4977 : }
4978 67 : uint32_t differing_bits = (from ^ to);
4979 : // A mask and compare is only perfect if the differing bits form a
4980 : // number like 00011111 with one single block of trailing 1s.
4981 109 : if ((differing_bits & (differing_bits + 1)) == 0 &&
4982 42 : from + differing_bits == to) {
4983 32 : pos->determines_perfectly = true;
4984 : }
4985 67 : uint32_t common_bits = ~SmearBitsRight(differing_bits);
4986 67 : uint32_t bits = (from & common_bits);
4987 200 : for (size_t i = first_range + 1; i < ranges.length(); i++) {
4988 133 : CharacterRange range = ranges[i];
4989 133 : char16_t from = range.from();
4990 133 : char16_t to = range.to();
4991 133 : if (from > char_mask) continue;
4992 95 : if (to > char_mask) to = char_mask;
4993 : // Here we are combining more ranges into the mask and compare
4994 : // value. With each new range the mask becomes more sparse and
4995 : // so the chances of a false positive rise. A character class
4996 : // with multiple ranges is assumed never to be equivalent to a
4997 : // mask and compare operation.
4998 95 : pos->determines_perfectly = false;
4999 95 : uint32_t new_common_bits = (from ^ to);
5000 95 : new_common_bits = ~SmearBitsRight(new_common_bits);
5001 95 : common_bits &= new_common_bits;
5002 95 : bits &= new_common_bits;
5003 95 : uint32_t differing_bits = (from & common_bits) ^ bits;
5004 95 : common_bits ^= differing_bits;
5005 95 : bits &= common_bits;
5006 : }
5007 67 : pos->mask = common_bits;
5008 67 : pos->value = bits;
5009 : }
5010 73 : characters_filled_in++;
5011 73 : MOZ_ASSERT(characters_filled_in <= details->characters());
5012 73 : if (characters_filled_in == details->characters()) {
5013 58 : return;
5014 : }
5015 : }
5016 : }
5017 31 : MOZ_ASSERT(characters_filled_in != details->characters());
5018 31 : if (!details->cannot_match()) {
5019 31 : on_success()-> GetQuickCheckDetails(details,
5020 : compiler,
5021 : characters_filled_in,
5022 31 : true);
5023 : }
5024 : }
5025 :
5026 : void
5027 420 : QuickCheckDetails::Clear()
5028 : {
5029 657 : for (int i = 0; i < characters_; i++) {
5030 237 : positions_[i].mask = 0;
5031 237 : positions_[i].value = 0;
5032 237 : positions_[i].determines_perfectly = false;
5033 : }
5034 420 : characters_ = 0;
5035 420 : }
5036 :
5037 : void
5038 232 : QuickCheckDetails::Advance(int by)
5039 : {
5040 232 : MOZ_ASSERT(by >= 0);
5041 232 : if (by >= characters_) {
5042 212 : Clear();
5043 212 : return;
5044 : }
5045 40 : for (int i = 0; i < characters_ - by; i++) {
5046 20 : positions_[i] = positions_[by + i];
5047 : }
5048 50 : for (int i = characters_ - by; i < characters_; i++) {
5049 30 : positions_[i].mask = 0;
5050 30 : positions_[i].value = 0;
5051 30 : positions_[i].determines_perfectly = false;
5052 : }
5053 20 : characters_ -= by;
5054 : // We could change mask_ and value_ here but we would never advance unless
5055 : // they had already been used in a check and they won't be used again because
5056 : // it would gain us nothing. So there's no point.
5057 : }
5058 :
5059 : bool
5060 133 : QuickCheckDetails::Rationalize(bool is_latin1)
5061 : {
5062 133 : bool found_useful_op = false;
5063 133 : uint32_t char_mask = MaximumCharacter(is_latin1);
5064 :
5065 133 : mask_ = 0;
5066 133 : value_ = 0;
5067 133 : int char_shift = 0;
5068 439 : for (int i = 0; i < characters_; i++) {
5069 306 : Position* pos = &positions_[i];
5070 306 : if ((pos->mask & kMaxOneByteCharCode) != 0)
5071 263 : found_useful_op = true;
5072 306 : mask_ |= (pos->mask & char_mask) << char_shift;
5073 306 : value_ |= (pos->value & char_mask) << char_shift;
5074 306 : char_shift += is_latin1 ? 8 : 16;
5075 : }
5076 133 : return found_useful_op;
5077 : }
5078 :
5079 46 : void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index)
5080 : {
5081 46 : MOZ_ASSERT(characters_ == other->characters_);
5082 46 : if (other->cannot_match_)
5083 0 : return;
5084 46 : if (cannot_match_) {
5085 0 : *this = *other;
5086 0 : return;
5087 : }
5088 100 : for (int i = from_index; i < characters_; i++) {
5089 54 : QuickCheckDetails::Position* pos = positions(i);
5090 54 : QuickCheckDetails::Position* other_pos = other->positions(i);
5091 58 : if (pos->mask != other_pos->mask ||
5092 4 : pos->value != other_pos->value ||
5093 0 : !other_pos->determines_perfectly) {
5094 : // Our mask-compare operation will be approximate unless we have the
5095 : // exact same operation on both sides of the alternation.
5096 54 : pos->determines_perfectly = false;
5097 : }
5098 54 : pos->mask &= other_pos->mask;
5099 54 : pos->value &= pos->mask;
5100 54 : other_pos->value &= pos->mask;
5101 54 : char16_t differing_bits = (pos->value ^ other_pos->value);
5102 54 : pos->mask &= ~differing_bits;
5103 54 : pos->value &= pos->mask;
5104 : }
5105 : }
|