Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * vim: set ts=8 sts=4 et sw=4 tw=99: */
3 :
4 : // Copyright 2012 the V8 project authors. All rights reserved.
5 : // Redistribution and use in source and binary forms, with or without
6 : // modification, are permitted provided that the following conditions are
7 : // met:
8 : //
9 : // * Redistributions of source code must retain the above copyright
10 : // notice, this list of conditions and the following disclaimer.
11 : // * Redistributions in binary form must reproduce the above
12 : // copyright notice, this list of conditions and the following
13 : // disclaimer in the documentation and/or other materials provided
14 : // with the distribution.
15 : // * Neither the name of Google Inc. nor the names of its
16 : // contributors may be used to endorse or promote products derived
17 : // from this software without specific prior written permission.
18 : //
19 : // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 : // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 : // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 : // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 : // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 : // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 : // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 : // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 : // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 : // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 : // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 :
31 : #include "irregexp/RegExpParser.h"
32 :
33 : #include "mozilla/ArrayUtils.h"
34 : #include "mozilla/Move.h"
35 :
36 : #include "frontend/TokenStream.h"
37 : #include "irregexp/RegExpCharacters.h"
38 : #include "vm/ErrorReporting.h"
39 : #include "vm/StringBuffer.h"
40 :
41 : using namespace js;
42 : using namespace js::irregexp;
43 :
44 : using mozilla::Move;
45 : using mozilla::PointerRangeSize;
46 :
47 : // ----------------------------------------------------------------------------
48 : // RegExpBuilder
49 :
50 1121 : RegExpBuilder::RegExpBuilder(LifoAlloc* alloc)
51 : : alloc(alloc),
52 : pending_empty_(false),
53 : characters_(nullptr)
54 : #ifdef DEBUG
55 1121 : , last_added_(ADD_NONE)
56 : #endif
57 1121 : {}
58 :
59 : void
60 3717 : RegExpBuilder::FlushCharacters()
61 : {
62 3717 : pending_empty_ = false;
63 3717 : if (characters_ != nullptr) {
64 1134 : RegExpTree* atom = alloc->newInfallible<RegExpAtom>(characters_);
65 1134 : characters_ = nullptr;
66 1134 : text_.Add(alloc, atom);
67 : #ifdef DEBUG
68 1134 : last_added_ = ADD_ATOM;
69 : #endif
70 : }
71 3717 : }
72 :
73 : void
74 2991 : RegExpBuilder::FlushText()
75 : {
76 2991 : FlushCharacters();
77 2991 : int num_text = text_.length();
78 2991 : if (num_text == 0)
79 1670 : return;
80 1321 : if (num_text == 1) {
81 1285 : terms_.Add(alloc, text_.last());
82 : } else {
83 36 : RegExpText* text = alloc->newInfallible<RegExpText>(alloc);
84 111 : for (int i = 0; i < num_text; i++)
85 75 : text_.Get(i)->AppendToText(text);
86 36 : terms_.Add(alloc, text);
87 : }
88 1321 : text_.Clear();
89 : }
90 :
91 : void
92 5130 : RegExpBuilder::AddCharacter(char16_t c)
93 : {
94 5130 : pending_empty_ = false;
95 5130 : if (characters_ == nullptr)
96 1195 : characters_ = alloc->newInfallible<CharacterVector>(*alloc);
97 5130 : characters_->append(c);
98 : #ifdef DEBUG
99 5130 : last_added_ = ADD_CHAR;
100 : #endif
101 5130 : }
102 :
103 : void
104 0 : RegExpBuilder::AddEmpty()
105 : {
106 0 : pending_empty_ = true;
107 0 : }
108 :
109 : void
110 1088 : RegExpBuilder::AddAtom(RegExpTree* term)
111 : {
112 1088 : if (term->IsEmpty()) {
113 0 : AddEmpty();
114 0 : return;
115 : }
116 1088 : if (term->IsTextElement()) {
117 726 : FlushCharacters();
118 726 : text_.Add(alloc, term);
119 : } else {
120 362 : FlushText();
121 362 : terms_.Add(alloc, term);
122 : }
123 : #ifdef DEBUG
124 1088 : last_added_ = ADD_ATOM;
125 : #endif
126 : }
127 :
128 : void
129 489 : RegExpBuilder::AddAssertion(RegExpTree* assert)
130 : {
131 489 : FlushText();
132 489 : terms_.Add(alloc, assert);
133 : #ifdef DEBUG
134 489 : last_added_ = ADD_ASSERT;
135 : #endif
136 489 : }
137 :
138 : void
139 430 : RegExpBuilder::NewAlternative()
140 : {
141 430 : FlushTerms();
142 430 : }
143 :
144 : void
145 1551 : RegExpBuilder::FlushTerms()
146 : {
147 1551 : FlushText();
148 1551 : int num_terms = terms_.length();
149 : RegExpTree* alternative;
150 1551 : if (num_terms == 0)
151 2 : alternative = RegExpEmpty::GetInstance();
152 1549 : else if (num_terms == 1)
153 988 : alternative = terms_.last();
154 : else
155 561 : alternative = alloc->newInfallible<RegExpAlternative>(terms_.GetList(alloc));
156 1551 : alternatives_.Add(alloc, alternative);
157 1551 : terms_.Clear();
158 : #ifdef DEBUG
159 1551 : last_added_ = ADD_NONE;
160 : #endif
161 1551 : }
162 :
163 : RegExpTree*
164 1121 : RegExpBuilder::ToRegExp()
165 : {
166 1121 : FlushTerms();
167 1121 : int num_alternatives = alternatives_.length();
168 1121 : if (num_alternatives == 0) {
169 0 : return RegExpEmpty::GetInstance();
170 : }
171 1121 : if (num_alternatives == 1) {
172 957 : return alternatives_.last();
173 : }
174 164 : return alloc->newInfallible<RegExpDisjunction>(alternatives_.GetList(alloc));
175 : }
176 :
177 : void
178 652 : RegExpBuilder::AddQuantifierToAtom(int min, int max,
179 : RegExpQuantifier::QuantifierType quantifier_type)
180 : {
181 652 : if (pending_empty_) {
182 0 : pending_empty_ = false;
183 0 : return;
184 : }
185 : RegExpTree* atom;
186 652 : if (characters_ != nullptr) {
187 61 : MOZ_ASSERT(last_added_ == ADD_CHAR);
188 : // Last atom was character.
189 61 : CharacterVector* char_vector = characters_;
190 61 : int num_chars = char_vector->length();
191 61 : if (num_chars > 1) {
192 28 : CharacterVector* prefix = alloc->newInfallible<CharacterVector>(*alloc);
193 28 : prefix->append(char_vector->begin(), num_chars - 1);
194 28 : text_.Add(alloc, alloc->newInfallible<RegExpAtom>(prefix));
195 28 : char_vector = alloc->newInfallible<CharacterVector>(*alloc);
196 28 : char_vector->append((*characters_)[num_chars - 1]);
197 : }
198 61 : characters_ = nullptr;
199 61 : atom = alloc->newInfallible<RegExpAtom>(char_vector);
200 61 : FlushText();
201 591 : } else if (text_.length() > 0) {
202 528 : MOZ_ASSERT(last_added_ == ADD_ATOM);
203 528 : atom = text_.RemoveLast();
204 528 : FlushText();
205 63 : } else if (terms_.length() > 0) {
206 63 : MOZ_ASSERT(last_added_ == ADD_ATOM);
207 63 : atom = terms_.RemoveLast();
208 63 : if (atom->max_match() == 0) {
209 : // Guaranteed to only match an empty string.
210 : #ifdef DEBUG
211 0 : last_added_ = ADD_TERM;
212 : #endif
213 0 : if (min == 0)
214 0 : return;
215 0 : terms_.Add(alloc, atom);
216 0 : return;
217 : }
218 : } else {
219 : // Only call immediately after adding an atom or character!
220 0 : MOZ_CRASH("Bad call");
221 : }
222 652 : terms_.Add(alloc, alloc->newInfallible<RegExpQuantifier>(min, max, quantifier_type, atom));
223 : #ifdef DEBUG
224 652 : last_added_ = ADD_TERM;
225 : #endif
226 : }
227 :
228 : // ----------------------------------------------------------------------------
229 : // RegExpParser
230 :
231 : template <typename CharT>
232 757 : RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
233 : const CharT* chars, const CharT* end, bool multiline_mode,
234 : bool unicode, bool ignore_case)
235 : : ts(ts),
236 : alloc(alloc),
237 : captures_(nullptr),
238 : start_(chars),
239 757 : next_pos_(start_),
240 : end_(end),
241 : current_(kEndMarker),
242 : capture_count_(0),
243 : has_more_(true),
244 : multiline_(multiline_mode),
245 : unicode_(unicode),
246 : ignore_case_(ignore_case),
247 : simple_(false),
248 : contains_anchor_(false),
249 1514 : is_scanned_for_captures_(false)
250 : {
251 757 : Advance();
252 757 : }
253 :
254 : template <typename CharT>
255 : void
256 0 : RegExpParser<CharT>::SyntaxError(unsigned errorNumber, ...)
257 : {
258 0 : ErrorMetadata err;
259 :
260 0 : ts.fillExcludingContext(&err, ts.currentToken().pos.begin);
261 :
262 : // For most error reporting, the line of context derives from the token
263 : // stream. So when location information doesn't come from the token
264 : // stream, we can't give a line of context. But here the "line of context"
265 : // can be (and is) derived from the pattern text, so we can provide it no
266 : // matter if the location is derived from the caller.
267 0 : size_t offset = PointerRangeSize(start_, next_pos_ - 1);
268 0 : size_t end = PointerRangeSize(start_, end_);
269 :
270 : const CharT* windowStart = (offset > ErrorMetadata::lineOfContextRadius)
271 0 : ? start_ + (offset - ErrorMetadata::lineOfContextRadius)
272 0 : : start_;
273 :
274 0 : const CharT* windowEnd = (end - offset > ErrorMetadata::lineOfContextRadius)
275 0 : ? start_ + offset + ErrorMetadata::lineOfContextRadius
276 0 : : end_;
277 :
278 0 : size_t windowLength = PointerRangeSize(windowStart, windowEnd);
279 0 : MOZ_ASSERT(windowLength <= ErrorMetadata::lineOfContextRadius * 2);
280 :
281 : // Create the windowed string, not including the potential line
282 : // terminator.
283 0 : StringBuffer windowBuf(ts.context());
284 0 : if (!windowBuf.append(windowStart, windowEnd))
285 0 : return;
286 :
287 : // The line of context must be null-terminated, and StringBuffer doesn't
288 : // make that happen unless we force it to.
289 0 : if (!windowBuf.append('\0'))
290 0 : return;
291 :
292 0 : err.lineOfContext.reset(windowBuf.stealChars());
293 0 : if (!err.lineOfContext)
294 0 : return;
295 :
296 0 : err.lineLength = windowLength;
297 0 : err.tokenOffset = offset - (windowStart - start_);
298 :
299 : va_list args;
300 0 : va_start(args, errorNumber);
301 :
302 0 : ReportCompileError(ts.context(), Move(err), nullptr, JSREPORT_ERROR, errorNumber, args);
303 :
304 0 : va_end(args);
305 : }
306 :
307 : template <typename CharT>
308 : RegExpTree*
309 0 : RegExpParser<CharT>::ReportError(unsigned errorNumber, const char* param /* = nullptr */)
310 : {
311 0 : gc::AutoSuppressGC suppressGC(ts.context());
312 0 : SyntaxError(errorNumber, param);
313 0 : return nullptr;
314 : }
315 :
316 : template <typename CharT>
317 : void
318 12344 : RegExpParser<CharT>::Advance()
319 : {
320 12344 : if (next_pos_ < end_) {
321 11587 : current_ = *next_pos_;
322 11587 : next_pos_++;
323 : } else {
324 757 : current_ = kEndMarker;
325 757 : has_more_ = false;
326 : }
327 12344 : }
328 :
329 : // Returns the value (0 .. 15) of a hexadecimal character c.
330 : // If c is not a legal hexadecimal character, returns a value < 0.
331 : inline int
332 638 : HexValue(uint32_t c)
333 : {
334 638 : c -= '0';
335 638 : if (static_cast<unsigned>(c) <= 9) return c;
336 239 : c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
337 239 : if (static_cast<unsigned>(c) <= 5) return c + 10;
338 0 : return -1;
339 : }
340 :
341 : template <typename CharT>
342 : widechar
343 6 : RegExpParser<CharT>::ParseOctalLiteral()
344 : {
345 6 : MOZ_ASSERT('0' <= current() && current() <= '7');
346 : // For compatibility with some other browsers (not all), we parse
347 : // up to three octal digits with a value below 256.
348 6 : widechar value = current() - '0';
349 6 : Advance();
350 6 : if ('0' <= current() && current() <= '7') {
351 0 : value = value * 8 + current() - '0';
352 0 : Advance();
353 0 : if (value < 32 && '0' <= current() && current() <= '7') {
354 0 : value = value * 8 + current() - '0';
355 0 : Advance();
356 : }
357 : }
358 6 : return value;
359 : }
360 :
361 : template <typename CharT>
362 : bool
363 167 : RegExpParser<CharT>::ParseHexEscape(int length, widechar* value)
364 : {
365 167 : const CharT* start = position();
366 167 : uint32_t val = 0;
367 167 : bool done = false;
368 805 : for (int i = 0; !done; i++) {
369 638 : widechar c = current();
370 638 : int d = HexValue(c);
371 638 : if (d < 0) {
372 0 : Reset(start);
373 0 : return false;
374 : }
375 638 : val = val * 16 + d;
376 638 : Advance();
377 638 : if (i == length - 1) {
378 167 : done = true;
379 : }
380 : }
381 167 : *value = val;
382 167 : return true;
383 : }
384 :
385 : template <typename CharT>
386 : bool
387 0 : RegExpParser<CharT>::ParseBracedHexEscape(widechar* value)
388 : {
389 0 : MOZ_ASSERT(current() == '{');
390 0 : Advance();
391 :
392 0 : bool first = true;
393 0 : uint32_t code = 0;
394 0 : while (true) {
395 0 : widechar c = current();
396 0 : if (c == kEndMarker) {
397 0 : ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
398 0 : return false;
399 : }
400 0 : if (c == '}') {
401 0 : if (first) {
402 0 : ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
403 0 : return false;
404 : }
405 0 : Advance();
406 0 : break;
407 : }
408 :
409 0 : int d = HexValue(c);
410 0 : if (d < 0) {
411 0 : ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
412 0 : return false;
413 : }
414 0 : code = (code << 4) | d;
415 0 : if (code > unicode::NonBMPMax) {
416 0 : ReportError(JSMSG_UNICODE_OVERFLOW, "regular expression");
417 0 : return false;
418 : }
419 0 : Advance();
420 0 : first = false;
421 : }
422 :
423 0 : *value = code;
424 0 : return true;
425 : }
426 :
427 : template <typename CharT>
428 : bool
429 0 : RegExpParser<CharT>::ParseTrailSurrogate(widechar* value)
430 : {
431 0 : if (current() != '\\')
432 0 : return false;
433 :
434 0 : const CharT* start = position();
435 0 : Advance();
436 0 : if (current() != 'u') {
437 0 : Reset(start);
438 0 : return false;
439 : }
440 0 : Advance();
441 0 : if (!ParseHexEscape(4, value)) {
442 0 : Reset(start);
443 0 : return false;
444 : }
445 0 : if (!unicode::IsTrailSurrogate(*value)) {
446 0 : Reset(start);
447 0 : return false;
448 : }
449 0 : return true;
450 : }
451 :
452 : template <typename CharT>
453 : bool
454 0 : RegExpParser<CharT>::ParseRawSurrogatePair(char16_t* lead, char16_t* trail)
455 : {
456 0 : widechar c1 = current();
457 0 : if (!unicode::IsLeadSurrogate(c1))
458 0 : return false;
459 :
460 0 : const CharT* start = position();
461 0 : Advance();
462 0 : widechar c2 = current();
463 0 : if (!unicode::IsTrailSurrogate(c2)) {
464 0 : Reset(start);
465 0 : return false;
466 : }
467 0 : Advance();
468 0 : *lead = c1;
469 0 : *trail = c2;
470 0 : return true;
471 : }
472 :
473 : static inline RegExpTree*
474 0 : RangeAtom(LifoAlloc* alloc, char16_t from, char16_t to)
475 : {
476 0 : CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
477 0 : ranges->append(CharacterRange::Range(from, to));
478 0 : return alloc->newInfallible<RegExpCharacterClass>(ranges, false);
479 : }
480 :
481 : static inline RegExpTree*
482 0 : NegativeLookahead(LifoAlloc* alloc, char16_t from, char16_t to)
483 : {
484 0 : return alloc->newInfallible<RegExpLookahead>(RangeAtom(alloc, from, to), false, 0, 0);
485 : }
486 :
487 : static bool
488 0 : IsSyntaxCharacter(widechar c)
489 : {
490 0 : switch (c) {
491 : case '^':
492 : case '$':
493 : case '\\':
494 : case '.':
495 : case '*':
496 : case '+':
497 : case '?':
498 : case '(':
499 : case ')':
500 : case '[':
501 : case ']':
502 : case '{':
503 : case '}':
504 : case '|':
505 : case '/':
506 0 : return true;
507 : default:
508 0 : return false;
509 : }
510 : }
511 :
512 : inline bool
513 324 : IsInRange(int value, int lower_limit, int higher_limit)
514 : {
515 324 : MOZ_ASSERT(lower_limit <= higher_limit);
516 324 : return static_cast<unsigned int>(value - lower_limit) <=
517 324 : static_cast<unsigned int>(higher_limit - lower_limit);
518 : }
519 :
520 : inline bool
521 324 : IsDecimalDigit(widechar c)
522 : {
523 : // ECMA-262, 3rd, 7.8.3 (p 16)
524 324 : return IsInRange(c, '0', '9');
525 : }
526 :
527 : #ifdef DEBUG
528 : // Currently only used in an assert.kASSERT.
529 : static bool
530 362 : IsSpecialClassEscape(widechar c)
531 : {
532 362 : switch (c) {
533 : case 'd': case 'D':
534 : case 's': case 'S':
535 : case 'w': case 'W':
536 0 : return true;
537 : default:
538 362 : return false;
539 : }
540 : }
541 : #endif
542 :
543 : template <typename CharT>
544 : bool
545 362 : RegExpParser<CharT>::ParseClassCharacterEscape(widechar* code)
546 : {
547 362 : MOZ_ASSERT(current() == '\\');
548 362 : MOZ_ASSERT(has_next() && !IsSpecialClassEscape(Next()));
549 362 : Advance();
550 362 : switch (current()) {
551 : case 'b':
552 0 : Advance();
553 0 : *code = '\b';
554 0 : return true;
555 : // ControlEscape :: one of
556 : // f n r t v
557 : case 'f':
558 4 : Advance();
559 4 : *code = '\f';
560 4 : return true;
561 : case 'n':
562 9 : Advance();
563 9 : *code = '\n';
564 9 : return true;
565 : case 'r':
566 9 : Advance();
567 9 : *code = '\r';
568 9 : return true;
569 : case 't':
570 7 : Advance();
571 7 : *code = '\t';
572 7 : return true;
573 : case 'v':
574 0 : Advance();
575 0 : *code = '\v';
576 0 : return true;
577 : case 'c': {
578 0 : widechar controlLetter = Next();
579 0 : widechar letter = controlLetter & ~('A' ^ 'a');
580 : // For compatibility with JSC, inside a character class
581 : // we also accept digits and underscore as control characters,
582 : // but only in non-unicode mode
583 0 : if ((!unicode_ &&
584 0 : ((controlLetter >= '0' && controlLetter <= '9') ||
585 0 : controlLetter == '_')) ||
586 0 : (letter >= 'A' && letter <= 'Z'))
587 : {
588 0 : Advance(2);
589 : // Control letters mapped to ASCII control characters in the range
590 : // 0x00-0x1f.
591 0 : *code = controlLetter & 0x1f;
592 0 : return true;
593 : }
594 0 : if (unicode_) {
595 0 : ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
596 0 : return false;
597 : }
598 : // We match JSC in reading the backslash as a literal
599 : // character instead of as starting an escape.
600 0 : *code = '\\';
601 0 : return true;
602 : }
603 : case '0':
604 4 : if (unicode_) {
605 0 : Advance();
606 0 : if (IsDecimalDigit(current()))
607 0 : return ReportError(JSMSG_INVALID_DECIMAL_ESCAPE);
608 0 : *code = 0;
609 0 : return true;
610 : }
611 : MOZ_FALLTHROUGH;
612 : case '1': case '2': case '3': case '4': case '5': case '6': case '7':
613 4 : if (unicode_) {
614 0 : ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
615 0 : return false;
616 : }
617 : // For compatibility, outside of unicode mode, we interpret a decimal
618 : // escape that isn't a back reference (and therefore either \0 or not
619 : // valid according to the specification) as a 1..3 digit octal
620 : // character code.
621 4 : *code = ParseOctalLiteral();
622 4 : return true;
623 : case 'x': {
624 9 : Advance();
625 : widechar value;
626 9 : if (ParseHexEscape(2, &value)) {
627 9 : *code = value;
628 9 : return true;
629 : }
630 0 : if (unicode_) {
631 0 : ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
632 0 : return false;
633 : }
634 : // If \x is not followed by a two-digit hexadecimal, treat it
635 : // as an identity escape in non-unicode mode.
636 0 : *code = 'x';
637 0 : return true;
638 : }
639 : case 'u': {
640 147 : Advance();
641 : widechar value;
642 147 : if (unicode_) {
643 0 : if (current() == '{') {
644 0 : if (!ParseBracedHexEscape(&value))
645 0 : return false;
646 0 : *code = value;
647 0 : return true;
648 : }
649 0 : if (ParseHexEscape(4, &value)) {
650 0 : if (unicode::IsLeadSurrogate(value)) {
651 : widechar trail;
652 0 : if (ParseTrailSurrogate(&trail)) {
653 0 : *code = unicode::UTF16Decode(value, trail);
654 0 : return true;
655 : }
656 : }
657 0 : *code = value;
658 0 : return true;
659 : }
660 0 : ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
661 0 : return false;
662 : }
663 147 : if (ParseHexEscape(4, &value)) {
664 147 : *code = value;
665 147 : return true;
666 : }
667 : // If \u is not followed by a four-digit or braced hexadecimal, treat it
668 : // as an identity escape.
669 0 : *code = 'u';
670 0 : return true;
671 : }
672 : default: {
673 : // Extended identity escape (non-unicode only). We accept any character
674 : // that hasn't been matched by a more specific case, not just the subset
675 : // required by the ECMAScript specification.
676 173 : widechar result = current();
677 173 : if (unicode_ && result != '-' && !IsSyntaxCharacter(result)) {
678 0 : ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
679 0 : return false;
680 : }
681 173 : Advance();
682 173 : *code = result;
683 173 : return true;
684 : }
685 : }
686 : return true;
687 : }
688 :
689 : class WideCharRange
690 : {
691 : public:
692 : WideCharRange()
693 : : from_(0), to_(0)
694 : {}
695 :
696 0 : WideCharRange(widechar from, widechar to)
697 0 : : from_(from), to_(to)
698 0 : {}
699 :
700 0 : static inline WideCharRange Singleton(widechar value) {
701 0 : return WideCharRange(value, value);
702 : }
703 0 : static inline WideCharRange Range(widechar from, widechar to) {
704 0 : MOZ_ASSERT(from <= to);
705 0 : return WideCharRange(from, to);
706 : }
707 :
708 0 : bool Contains(widechar i) const { return from_ <= i && i <= to_; }
709 0 : widechar from() const { return from_; }
710 0 : widechar to() const { return to_; }
711 :
712 : private:
713 : widechar from_;
714 : widechar to_;
715 : };
716 :
717 : typedef InfallibleVector<WideCharRange, 1> WideCharRangeVector;
718 :
719 : static inline CharacterRange
720 0 : LeadSurrogateRange()
721 : {
722 0 : return CharacterRange::Range(unicode::LeadSurrogateMin, unicode::LeadSurrogateMax);
723 : }
724 :
725 : static inline CharacterRange
726 0 : TrailSurrogateRange()
727 : {
728 0 : return CharacterRange::Range(unicode::TrailSurrogateMin, unicode::TrailSurrogateMax);
729 : }
730 :
731 : static inline WideCharRange
732 0 : NonBMPRange()
733 : {
734 0 : return WideCharRange::Range(unicode::NonBMPMin, unicode::NonBMPMax);
735 : }
736 :
737 : static const char16_t kNoCharClass = 0;
738 :
739 : // Adds a character or pre-defined character class to character ranges.
740 : // If char_class is not kInvalidClass, it's interpreted as a class
741 : // escape (i.e., 's' means whitespace, from '\s').
742 : static inline void
743 615 : AddCharOrEscape(LifoAlloc* alloc,
744 : CharacterRangeVector* ranges,
745 : char16_t char_class,
746 : widechar c)
747 : {
748 615 : if (char_class != kNoCharClass)
749 37 : CharacterRange::AddClassEscape(alloc, char_class, ranges);
750 : else
751 578 : ranges->append(CharacterRange::Singleton(c));
752 615 : }
753 :
754 : static inline void
755 0 : AddCharOrEscapeUnicode(LifoAlloc* alloc,
756 : CharacterRangeVector* ranges,
757 : CharacterRangeVector* lead_ranges,
758 : CharacterRangeVector* trail_ranges,
759 : WideCharRangeVector* wide_ranges,
760 : char16_t char_class,
761 : widechar c,
762 : bool ignore_case)
763 : {
764 0 : if (char_class != kNoCharClass) {
765 0 : CharacterRange::AddClassEscapeUnicode(alloc, char_class, ranges, ignore_case);
766 0 : switch (char_class) {
767 : case 'S':
768 : case 'W':
769 : case 'D':
770 0 : lead_ranges->append(LeadSurrogateRange());
771 0 : trail_ranges->append(TrailSurrogateRange());
772 0 : wide_ranges->append(NonBMPRange());
773 0 : break;
774 : case '.':
775 0 : MOZ_CRASH("Bad char_class!");
776 : }
777 0 : return;
778 : }
779 :
780 0 : if (unicode::IsLeadSurrogate(c))
781 0 : lead_ranges->append(CharacterRange::Singleton(c));
782 0 : else if (unicode::IsTrailSurrogate(c))
783 0 : trail_ranges->append(CharacterRange::Singleton(c));
784 0 : else if (c >= unicode::NonBMPMin)
785 0 : wide_ranges->append(WideCharRange::Singleton(c));
786 : else
787 0 : ranges->append(CharacterRange::Singleton(c));
788 : }
789 :
790 : static inline void
791 0 : AddUnicodeRange(LifoAlloc* alloc,
792 : CharacterRangeVector* ranges,
793 : CharacterRangeVector* lead_ranges,
794 : CharacterRangeVector* trail_ranges,
795 : WideCharRangeVector* wide_ranges,
796 : widechar first,
797 : widechar next)
798 : {
799 0 : MOZ_ASSERT(first <= next);
800 0 : if (first < unicode::LeadSurrogateMin) {
801 0 : if (next < unicode::LeadSurrogateMin) {
802 0 : ranges->append(CharacterRange::Range(first, next));
803 0 : return;
804 : }
805 0 : ranges->append(CharacterRange::Range(first, unicode::LeadSurrogateMin - 1));
806 0 : first = unicode::LeadSurrogateMin;
807 : }
808 0 : if (first <= unicode::LeadSurrogateMax) {
809 0 : if (next <= unicode::LeadSurrogateMax) {
810 0 : lead_ranges->append(CharacterRange::Range(first, next));
811 0 : return;
812 : }
813 0 : lead_ranges->append(CharacterRange::Range(first, unicode::LeadSurrogateMax));
814 0 : first = unicode::LeadSurrogateMax + 1;
815 : }
816 : MOZ_ASSERT(unicode::LeadSurrogateMax + 1 == unicode::TrailSurrogateMin);
817 0 : if (first <= unicode::TrailSurrogateMax) {
818 0 : if (next <= unicode::TrailSurrogateMax) {
819 0 : trail_ranges->append(CharacterRange::Range(first, next));
820 0 : return;
821 : }
822 0 : trail_ranges->append(CharacterRange::Range(first, unicode::TrailSurrogateMax));
823 0 : first = unicode::TrailSurrogateMax + 1;
824 : }
825 0 : if (first <= unicode::UTF16Max) {
826 0 : if (next <= unicode::UTF16Max) {
827 0 : ranges->append(CharacterRange::Range(first, next));
828 0 : return;
829 : }
830 0 : ranges->append(CharacterRange::Range(first, unicode::UTF16Max));
831 0 : first = unicode::NonBMPMin;
832 : }
833 : MOZ_ASSERT(unicode::UTF16Max + 1 == unicode::NonBMPMin);
834 0 : wide_ranges->append(WideCharRange::Range(first, next));
835 : }
836 :
837 : // Negate a vector of ranges by subtracting its ranges from a range
838 : // encompassing the full range of possible values.
839 : template <typename RangeType>
840 : static inline void
841 0 : NegateUnicodeRanges(LifoAlloc* alloc, InfallibleVector<RangeType, 1>** ranges,
842 : RangeType full_range)
843 : {
844 : typedef InfallibleVector<RangeType, 1> RangeVector;
845 0 : RangeVector* tmp_ranges = alloc->newInfallible<RangeVector>(*alloc);
846 0 : tmp_ranges->append(full_range);
847 0 : RangeVector* result_ranges = alloc->newInfallible<RangeVector>(*alloc);
848 :
849 : // Perform the following calculation:
850 : // result_ranges = tmp_ranges - ranges
851 : // with the following steps:
852 : // result_ranges = tmp_ranges - ranges[0]
853 : // SWAP(result_ranges, tmp_ranges)
854 : // result_ranges = tmp_ranges - ranges[1]
855 : // SWAP(result_ranges, tmp_ranges)
856 : // ...
857 : // result_ranges = tmp_ranges - ranges[N-1]
858 : // SWAP(result_ranges, tmp_ranges)
859 : // The last SWAP is just for simplicity of the loop.
860 0 : for (size_t i = 0; i < (*ranges)->length(); i++) {
861 0 : result_ranges->clear();
862 :
863 0 : const RangeType& range = (**ranges)[i];
864 0 : for (size_t j = 0; j < tmp_ranges->length(); j++) {
865 0 : const RangeType& tmpRange = (*tmp_ranges)[j];
866 0 : auto from1 = tmpRange.from();
867 0 : auto to1 = tmpRange.to();
868 0 : auto from2 = range.from();
869 0 : auto to2 = range.to();
870 :
871 0 : if (from1 < from2) {
872 0 : if (to1 < from2) {
873 0 : result_ranges->append(tmpRange);
874 0 : } else if (to1 <= to2) {
875 0 : result_ranges->append(RangeType::Range(from1, from2 - 1));
876 : } else {
877 0 : result_ranges->append(RangeType::Range(from1, from2 - 1));
878 0 : result_ranges->append(RangeType::Range(to2 + 1, to1));
879 : }
880 0 : } else if (from1 <= to2) {
881 0 : if (to1 > to2)
882 0 : result_ranges->append(RangeType::Range(to2 + 1, to1));
883 : } else {
884 0 : result_ranges->append(tmpRange);
885 : }
886 : }
887 :
888 0 : auto tmp = tmp_ranges;
889 0 : tmp_ranges = result_ranges;
890 0 : result_ranges = tmp;
891 : }
892 :
893 : // After the loop, result is pointed at by tmp_ranges, instead of
894 : // result_ranges.
895 0 : *ranges = tmp_ranges;
896 0 : }
897 :
898 : static bool
899 0 : WideCharRangesContain(WideCharRangeVector* wide_ranges, widechar c)
900 : {
901 0 : for (size_t i = 0; i < wide_ranges->length(); i++) {
902 0 : const WideCharRange& range = (*wide_ranges)[i];
903 0 : if (range.Contains(c))
904 0 : return true;
905 : }
906 0 : return false;
907 : }
908 :
909 : static void
910 0 : CalculateCaseInsensitiveRanges(LifoAlloc* alloc, widechar from, widechar to, int32_t diff,
911 : WideCharRangeVector* wide_ranges,
912 : WideCharRangeVector** tmp_wide_ranges)
913 : {
914 0 : widechar contains_from = 0;
915 0 : widechar contains_to = 0;
916 0 : for (widechar c = from; c <= to; c++) {
917 0 : if (WideCharRangesContain(wide_ranges, c) &&
918 0 : !WideCharRangesContain(wide_ranges, c + diff))
919 : {
920 0 : if (contains_from == 0)
921 0 : contains_from = c;
922 0 : contains_to = c;
923 0 : } else if (contains_from != 0) {
924 0 : if (!*tmp_wide_ranges)
925 0 : *tmp_wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
926 :
927 0 : (*tmp_wide_ranges)->append(WideCharRange::Range(contains_from + diff,
928 0 : contains_to + diff));
929 0 : contains_from = 0;
930 : }
931 : }
932 :
933 0 : if (contains_from != 0) {
934 0 : if (!*tmp_wide_ranges)
935 0 : *tmp_wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
936 :
937 0 : (*tmp_wide_ranges)->append(WideCharRange::Range(contains_from + diff,
938 0 : contains_to + diff));
939 : }
940 0 : }
941 :
942 : static RegExpTree*
943 0 : UnicodeRangesAtom(LifoAlloc* alloc,
944 : CharacterRangeVector* ranges,
945 : CharacterRangeVector* lead_ranges,
946 : CharacterRangeVector* trail_ranges,
947 : WideCharRangeVector* wide_ranges,
948 : bool is_negated,
949 : bool ignore_case)
950 : {
951 : // Calculate case folding for non-BMP first and negate the range if needed.
952 0 : if (ignore_case) {
953 0 : WideCharRangeVector* tmp_wide_ranges = nullptr;
954 : #define CALL_CALC(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
955 : CalculateCaseInsensitiveRanges(alloc, FROM, TO, DIFF, wide_ranges, &tmp_wide_ranges);
956 0 : FOR_EACH_NON_BMP_CASE_FOLDING(CALL_CALC)
957 0 : FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_CALC)
958 : #undef CALL_CALC
959 :
960 0 : if (tmp_wide_ranges) {
961 0 : for (size_t i = 0; i < tmp_wide_ranges->length(); i++)
962 0 : wide_ranges->append((*tmp_wide_ranges)[i]);
963 : }
964 : }
965 :
966 0 : if (is_negated) {
967 0 : NegateUnicodeRanges(alloc, &lead_ranges, LeadSurrogateRange());
968 0 : NegateUnicodeRanges(alloc, &trail_ranges, TrailSurrogateRange());
969 0 : NegateUnicodeRanges(alloc, &wide_ranges, NonBMPRange());
970 : }
971 :
972 0 : RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
973 :
974 0 : bool added = false;
975 :
976 0 : if (is_negated) {
977 0 : ranges->append(LeadSurrogateRange());
978 0 : ranges->append(TrailSurrogateRange());
979 : }
980 0 : if (ranges->length() > 0) {
981 0 : builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, is_negated));
982 0 : added = true;
983 : }
984 :
985 0 : if (lead_ranges->length() > 0) {
986 0 : if (added)
987 0 : builder->NewAlternative();
988 0 : builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(lead_ranges, false));
989 0 : builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
990 0 : unicode::TrailSurrogateMax));
991 0 : added = true;
992 : }
993 :
994 0 : if (trail_ranges->length() > 0) {
995 0 : if (added)
996 0 : builder->NewAlternative();
997 0 : builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
998 0 : RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
999 0 : builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(trail_ranges, false));
1000 0 : added = true;
1001 : }
1002 :
1003 0 : for (size_t i = 0; i < wide_ranges->length(); i++) {
1004 0 : if (added)
1005 0 : builder->NewAlternative();
1006 :
1007 0 : const WideCharRange& range = (*wide_ranges)[i];
1008 0 : widechar from = range.from();
1009 0 : widechar to = range.to();
1010 : char16_t from_lead, from_trail;
1011 : char16_t to_lead, to_trail;
1012 :
1013 0 : unicode::UTF16Encode(from, &from_lead, &from_trail);
1014 0 : if (from == to) {
1015 0 : builder->AddCharacter(from_lead);
1016 0 : builder->AddCharacter(from_trail);
1017 : } else {
1018 0 : unicode::UTF16Encode(to, &to_lead, &to_trail);
1019 0 : if (from_lead == to_lead) {
1020 0 : MOZ_ASSERT(from_trail != to_trail);
1021 0 : builder->AddCharacter(from_lead);
1022 0 : builder->AddAtom(RangeAtom(alloc, from_trail, to_trail));
1023 0 : } else if (from_trail == unicode::TrailSurrogateMin &&
1024 0 : to_trail == unicode::TrailSurrogateMax)
1025 : {
1026 0 : builder->AddAtom(RangeAtom(alloc, from_lead, to_lead));
1027 0 : builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin,
1028 0 : unicode::TrailSurrogateMax));
1029 0 : } else if (from_lead + 1 == to_lead) {
1030 0 : builder->AddCharacter(from_lead);
1031 0 : builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax));
1032 :
1033 0 : builder->NewAlternative();
1034 :
1035 0 : builder->AddCharacter(to_lead);
1036 0 : builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail));
1037 0 : } else if (from_lead + 2 == to_lead) {
1038 0 : builder->AddCharacter(from_lead);
1039 0 : builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax));
1040 :
1041 0 : builder->NewAlternative();
1042 :
1043 0 : builder->AddCharacter(from_lead + 1);
1044 0 : builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin,
1045 0 : unicode::TrailSurrogateMax));
1046 :
1047 0 : builder->NewAlternative();
1048 :
1049 0 : builder->AddCharacter(to_lead);
1050 0 : builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail));
1051 : } else {
1052 0 : builder->AddCharacter(from_lead);
1053 0 : builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax));
1054 :
1055 0 : builder->NewAlternative();
1056 :
1057 0 : builder->AddAtom(RangeAtom(alloc, from_lead + 1, to_lead - 1));
1058 0 : builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin,
1059 0 : unicode::TrailSurrogateMax));
1060 :
1061 0 : builder->NewAlternative();
1062 :
1063 0 : builder->AddCharacter(to_lead);
1064 0 : builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail));
1065 : }
1066 : }
1067 0 : added = true;
1068 : }
1069 :
1070 0 : return builder->ToRegExp();
1071 : }
1072 :
1073 : template <typename CharT>
1074 : RegExpTree*
1075 377 : RegExpParser<CharT>::ParseCharacterClass()
1076 : {
1077 377 : MOZ_ASSERT(current() == '[');
1078 377 : Advance();
1079 377 : bool is_negated = false;
1080 377 : if (current() == '^') {
1081 91 : is_negated = true;
1082 91 : Advance();
1083 : }
1084 377 : CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1085 377 : CharacterRangeVector* lead_ranges = nullptr;
1086 377 : CharacterRangeVector* trail_ranges = nullptr;
1087 377 : WideCharRangeVector* wide_ranges = nullptr;
1088 :
1089 377 : if (unicode_) {
1090 0 : lead_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1091 0 : trail_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1092 0 : wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
1093 : }
1094 :
1095 2259 : while (has_more() && current() != ']') {
1096 946 : char16_t char_class = kNoCharClass;
1097 946 : widechar first = 0;
1098 946 : if (!ParseClassAtom(&char_class, &first))
1099 0 : return nullptr;
1100 946 : if (current() == '-') {
1101 336 : Advance();
1102 336 : if (current() == kEndMarker) {
1103 : // If we reach the end we break out of the loop and let the
1104 : // following code report an error.
1105 5 : break;
1106 336 : } else if (current() == ']') {
1107 5 : if (unicode_) {
1108 0 : AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges,
1109 0 : char_class, first, ignore_case_);
1110 : } else {
1111 5 : AddCharOrEscape(alloc, ranges, char_class, first);
1112 : }
1113 5 : ranges->append(CharacterRange::Singleton('-'));
1114 5 : break;
1115 : }
1116 331 : char16_t char_class_2 = kNoCharClass;
1117 331 : widechar next = 0;
1118 331 : if (!ParseClassAtom(&char_class_2, &next))
1119 0 : return nullptr;
1120 331 : if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
1121 0 : if (unicode_)
1122 0 : return ReportError(JSMSG_RANGE_WITH_CLASS_ESCAPE);
1123 :
1124 : // Either end is an escaped character class. Treat the '-' verbatim.
1125 0 : AddCharOrEscape(alloc, ranges, char_class, first);
1126 0 : ranges->append(CharacterRange::Singleton('-'));
1127 0 : AddCharOrEscape(alloc, ranges, char_class_2, next);
1128 0 : continue;
1129 : }
1130 331 : if (first > next)
1131 0 : return ReportError(JSMSG_BAD_CLASS_RANGE);
1132 331 : if (unicode_)
1133 0 : AddUnicodeRange(alloc, ranges, lead_ranges, trail_ranges,wide_ranges, first, next);
1134 : else
1135 331 : ranges->append(CharacterRange::Range(first, next));
1136 : } else {
1137 610 : if (unicode_) {
1138 0 : AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges,
1139 0 : char_class, first, ignore_case_);
1140 : } else {
1141 610 : AddCharOrEscape(alloc, ranges, char_class, first);
1142 : }
1143 : }
1144 : }
1145 377 : if (!has_more())
1146 0 : return ReportError(JSMSG_UNTERM_CLASS);
1147 377 : Advance();
1148 377 : if (!unicode_) {
1149 377 : if (ranges->length() == 0) {
1150 0 : ranges->append(CharacterRange::Everything());
1151 0 : is_negated = !is_negated;
1152 : }
1153 377 : return alloc->newInfallible<RegExpCharacterClass>(ranges, is_negated);
1154 : }
1155 :
1156 0 : if (!is_negated && ranges->length() == 0 && lead_ranges->length() == 0 &&
1157 0 : trail_ranges->length() == 0 && wide_ranges->length() == 0)
1158 : {
1159 0 : ranges->append(CharacterRange::Everything());
1160 0 : return alloc->newInfallible<RegExpCharacterClass>(ranges, true);
1161 : }
1162 :
1163 0 : return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, is_negated,
1164 0 : ignore_case_);
1165 : }
1166 :
1167 : template <typename CharT>
1168 : bool
1169 1277 : RegExpParser<CharT>::ParseClassAtom(char16_t* char_class, widechar* value)
1170 : {
1171 1277 : MOZ_ASSERT(*char_class == kNoCharClass);
1172 1277 : widechar first = current();
1173 1277 : if (first == '\\') {
1174 399 : switch (Next()) {
1175 : case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {
1176 37 : *char_class = Next();
1177 37 : Advance(2);
1178 37 : return true;
1179 : }
1180 : case kEndMarker:
1181 0 : return ReportError(JSMSG_ESCAPE_AT_END_OF_REGEXP);
1182 : default:
1183 362 : if (!ParseClassCharacterEscape(value))
1184 0 : return false;
1185 362 : return true;
1186 : }
1187 : } else {
1188 878 : if (unicode_) {
1189 : char16_t lead, trail;
1190 0 : if (ParseRawSurrogatePair(&lead, &trail)) {
1191 0 : *value = unicode::UTF16Decode(lead, trail);
1192 0 : return true;
1193 : }
1194 : }
1195 878 : Advance();
1196 878 : *value = first;
1197 878 : return true;
1198 : }
1199 : }
1200 :
1201 : // In order to know whether an escape is a backreference or not we have to scan
1202 : // the entire regexp and find the number of capturing parentheses. However we
1203 : // don't want to scan the regexp twice unless it is necessary. This mini-parser
1204 : // is called when needed. It can see the difference between capturing and
1205 : // noncapturing parentheses and can skip character classes and backslash-escaped
1206 : // characters.
1207 : template <typename CharT>
1208 : void
1209 0 : RegExpParser<CharT>::ScanForCaptures()
1210 : {
1211 : // Start with captures started previous to current position
1212 0 : int capture_count = captures_started();
1213 : // Add count of captures after this position.
1214 : widechar n;
1215 0 : while ((n = current()) != kEndMarker) {
1216 0 : Advance();
1217 0 : switch (n) {
1218 : case '\\':
1219 0 : Advance();
1220 0 : break;
1221 : case '[': {
1222 : widechar c;
1223 0 : while ((c = current()) != kEndMarker) {
1224 0 : Advance();
1225 0 : if (c == '\\') {
1226 0 : Advance();
1227 : } else {
1228 0 : if (c == ']') break;
1229 : }
1230 : }
1231 0 : break;
1232 : }
1233 : case '(':
1234 0 : if (current() != '?') capture_count++;
1235 0 : break;
1236 : }
1237 : }
1238 0 : capture_count_ = capture_count;
1239 0 : is_scanned_for_captures_ = true;
1240 0 : }
1241 :
1242 : template <typename CharT>
1243 : bool
1244 0 : RegExpParser<CharT>::ParseBackReferenceIndex(int* index_out)
1245 : {
1246 0 : MOZ_ASSERT('\\' == current());
1247 0 : MOZ_ASSERT('1' <= Next() && Next() <= '9');
1248 :
1249 : // Try to parse a decimal literal that is no greater than the total number
1250 : // of left capturing parentheses in the input.
1251 0 : const CharT* start = position();
1252 0 : int value = Next() - '0';
1253 0 : Advance(2);
1254 0 : while (true) {
1255 0 : widechar c = current();
1256 0 : if (IsDecimalDigit(c)) {
1257 0 : value = 10 * value + (c - '0');
1258 0 : if (value > kMaxCaptures) {
1259 0 : Reset(start);
1260 0 : return false;
1261 : }
1262 0 : Advance();
1263 : } else {
1264 0 : break;
1265 : }
1266 : }
1267 0 : if (value > captures_started()) {
1268 0 : if (!is_scanned_for_captures_) {
1269 0 : const CharT* saved_position = position();
1270 0 : ScanForCaptures();
1271 0 : Reset(saved_position);
1272 : }
1273 0 : if (value > capture_count_) {
1274 0 : Reset(start);
1275 0 : return false;
1276 : }
1277 : }
1278 0 : *index_out = value;
1279 0 : return true;
1280 : }
1281 :
1282 : // QuantifierPrefix ::
1283 : // { DecimalDigits }
1284 : // { DecimalDigits , }
1285 : // { DecimalDigits , DecimalDigits }
1286 : //
1287 : // Returns true if parsing succeeds, and set the min_out and max_out
1288 : // values. Values are truncated to RegExpTree::kInfinity if they overflow.
1289 : template <typename CharT>
1290 : bool
1291 94 : RegExpParser<CharT>::ParseIntervalQuantifier(int* min_out, int* max_out)
1292 : {
1293 94 : MOZ_ASSERT(current() == '{');
1294 94 : const CharT* start = position();
1295 94 : Advance();
1296 94 : int min = 0;
1297 94 : if (!IsDecimalDigit(current())) {
1298 0 : Reset(start);
1299 0 : return false;
1300 : }
1301 116 : while (IsDecimalDigit(current())) {
1302 116 : int next = current() - '0';
1303 116 : if (min > (RegExpTree::kInfinity - next) / 10) {
1304 : // Overflow. Skip past remaining decimal digits and return -1.
1305 0 : do {
1306 0 : Advance();
1307 0 : } while (IsDecimalDigit(current()));
1308 0 : min = RegExpTree::kInfinity;
1309 0 : break;
1310 : }
1311 116 : min = 10 * min + next;
1312 116 : Advance();
1313 : }
1314 94 : int max = 0;
1315 94 : if (current() == '}') {
1316 74 : max = min;
1317 74 : Advance();
1318 20 : } else if (current() == ',') {
1319 20 : Advance();
1320 20 : if (current() == '}') {
1321 10 : max = RegExpTree::kInfinity;
1322 10 : Advance();
1323 : } else {
1324 10 : while (IsDecimalDigit(current())) {
1325 10 : int next = current() - '0';
1326 10 : if (max > (RegExpTree::kInfinity - next) / 10) {
1327 0 : do {
1328 0 : Advance();
1329 0 : } while (IsDecimalDigit(current()));
1330 0 : max = RegExpTree::kInfinity;
1331 0 : break;
1332 : }
1333 10 : max = 10 * max + next;
1334 10 : Advance();
1335 : }
1336 10 : if (current() != '}') {
1337 0 : Reset(start);
1338 0 : return false;
1339 : }
1340 10 : Advance();
1341 : }
1342 : } else {
1343 0 : Reset(start);
1344 0 : return false;
1345 : }
1346 94 : *min_out = min;
1347 94 : *max_out = max;
1348 94 : return true;
1349 : }
1350 :
1351 : // Pattern ::
1352 : // Disjunction
1353 : template <typename CharT>
1354 : RegExpTree*
1355 757 : RegExpParser<CharT>::ParsePattern()
1356 : {
1357 757 : RegExpTree* result = ParseDisjunction();
1358 757 : MOZ_ASSERT_IF(result, !has_more());
1359 757 : return result;
1360 : }
1361 :
1362 : static inline RegExpTree*
1363 0 : CaseFoldingSurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, int32_t diff)
1364 : {
1365 0 : RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
1366 :
1367 0 : builder->AddCharacter(lead);
1368 0 : CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1369 0 : ranges->append(CharacterRange::Range(trail, trail));
1370 0 : ranges->append(CharacterRange::Range(trail + diff, trail + diff));
1371 0 : builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
1372 :
1373 0 : return builder->ToRegExp();
1374 : }
1375 :
1376 : static inline RegExpTree*
1377 0 : SurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, bool ignore_case)
1378 : {
1379 0 : if (ignore_case) {
1380 : #define CALL_ATOM(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
1381 : if (lead == LEAD &&trail >= TRAIL_FROM && trail <= TRAIL_TO) \
1382 : return CaseFoldingSurrogatePairAtom(alloc, lead, trail, DIFF);
1383 0 : FOR_EACH_NON_BMP_CASE_FOLDING(CALL_ATOM)
1384 0 : FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_ATOM)
1385 : #undef CALL_ATOM
1386 : }
1387 :
1388 0 : RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
1389 0 : builder->AddCharacter(lead);
1390 0 : builder->AddCharacter(trail);
1391 0 : return builder->ToRegExp();
1392 : }
1393 :
1394 : static inline RegExpTree*
1395 0 : LeadSurrogateAtom(LifoAlloc* alloc, char16_t value)
1396 : {
1397 0 : RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
1398 0 : builder->AddCharacter(value);
1399 0 : builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
1400 0 : unicode::TrailSurrogateMax));
1401 0 : return builder->ToRegExp();
1402 : }
1403 :
1404 : static inline RegExpTree*
1405 0 : TrailSurrogateAtom(LifoAlloc* alloc, char16_t value)
1406 : {
1407 0 : RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
1408 0 : builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
1409 0 : RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
1410 0 : builder->AddCharacter(value);
1411 0 : return builder->ToRegExp();
1412 : }
1413 :
1414 : static inline RegExpTree*
1415 0 : UnicodeEverythingAtom(LifoAlloc* alloc)
1416 : {
1417 0 : RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
1418 :
1419 : // everything except \x0a, \x0d, \u2028 and \u2029
1420 :
1421 0 : CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1422 0 : AddClassNegated(kLineTerminatorAndSurrogateRanges,
1423 : kLineTerminatorAndSurrogateRangeCount,
1424 0 : ranges);
1425 0 : builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
1426 :
1427 0 : builder->NewAlternative();
1428 :
1429 0 : builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
1430 0 : builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
1431 0 : unicode::TrailSurrogateMax));
1432 :
1433 0 : builder->NewAlternative();
1434 :
1435 0 : builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
1436 0 : RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
1437 0 : builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
1438 :
1439 0 : builder->NewAlternative();
1440 :
1441 0 : builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
1442 0 : builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
1443 :
1444 0 : return builder->ToRegExp();
1445 : }
1446 :
1447 : RegExpTree*
1448 0 : UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool ignore_case)
1449 : {
1450 0 : CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1451 0 : CharacterRangeVector* lead_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1452 0 : CharacterRangeVector* trail_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1453 0 : WideCharRangeVector* wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
1454 0 : AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, char_class, 0,
1455 0 : ignore_case);
1456 :
1457 0 : return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, false, false);
1458 : }
1459 :
1460 : static inline RegExpTree*
1461 0 : UnicodeBackReferenceAtom(LifoAlloc* alloc, RegExpTree* atom)
1462 : {
1463 : // If a back reference has a standalone lead surrogate as its last
1464 : // character, then that lead surrogate shouldn't match lead surrogates that
1465 : // are paired with a corresponding trail surrogate.
1466 0 : RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
1467 :
1468 0 : builder->AddAtom(atom);
1469 0 : builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
1470 0 : RegExpAssertion::NOT_IN_SURROGATE_PAIR));
1471 :
1472 0 : return builder->ToRegExp();
1473 : }
1474 :
1475 : // Disjunction ::
1476 : // Alternative
1477 : // Alternative | Disjunction
1478 : // Alternative ::
1479 : // [empty]
1480 : // Term Alternative
1481 : // Term ::
1482 : // Assertion
1483 : // Atom
1484 : // Atom Quantifier
1485 : template <typename CharT>
1486 : RegExpTree*
1487 757 : RegExpParser<CharT>::ParseDisjunction()
1488 : {
1489 : // Used to store current state while parsing subexpressions.
1490 757 : RegExpParserState initial_state(alloc, nullptr, INITIAL, 0);
1491 757 : RegExpParserState* stored_state = &initial_state;
1492 : // Cache the builder in a local variable for quick access.
1493 757 : RegExpBuilder* builder = initial_state.builder();
1494 7501 : while (true) {
1495 8258 : switch (current()) {
1496 : case kEndMarker:
1497 757 : if (stored_state->IsSubexpression()) {
1498 : // Inside a parenthesized group when hitting end of input.
1499 757 : return ReportError(JSMSG_MISSING_PAREN);
1500 : }
1501 757 : MOZ_ASSERT(INITIAL == stored_state->group_type());
1502 : // Parsing completed successfully.
1503 757 : return builder->ToRegExp();
1504 : case ')': {
1505 364 : if (!stored_state->IsSubexpression())
1506 0 : return ReportError(JSMSG_UNMATCHED_RIGHT_PAREN);
1507 364 : MOZ_ASSERT(INITIAL != stored_state->group_type());
1508 :
1509 364 : Advance();
1510 : // End disjunction parsing and convert builder content to new single
1511 : // regexp atom.
1512 364 : RegExpTree* body = builder->ToRegExp();
1513 :
1514 364 : int end_capture_index = captures_started();
1515 :
1516 364 : int capture_index = stored_state->capture_index();
1517 364 : SubexpressionType group_type = stored_state->group_type();
1518 :
1519 : // Restore previous state.
1520 364 : stored_state = stored_state->previous_state();
1521 364 : builder = stored_state->builder();
1522 :
1523 : // Build result of subexpression.
1524 364 : if (group_type == CAPTURE) {
1525 271 : RegExpCapture* capture = alloc->newInfallible<RegExpCapture>(body, capture_index);
1526 271 : (*captures_)[capture_index - 1] = capture;
1527 271 : body = capture;
1528 93 : } else if (group_type != GROUPING) {
1529 27 : MOZ_ASSERT(group_type == POSITIVE_LOOKAHEAD ||
1530 : group_type == NEGATIVE_LOOKAHEAD);
1531 27 : bool is_positive = (group_type == POSITIVE_LOOKAHEAD);
1532 27 : body = alloc->newInfallible<RegExpLookahead>(body,
1533 : is_positive,
1534 54 : end_capture_index - capture_index,
1535 : capture_index);
1536 : }
1537 364 : builder->AddAtom(body);
1538 364 : if (unicode_ && (group_type == POSITIVE_LOOKAHEAD || group_type == NEGATIVE_LOOKAHEAD))
1539 0 : continue;
1540 : // For compatability with JSC and ES3, we allow quantifiers after
1541 : // lookaheads, and break in all cases.
1542 364 : break;
1543 : }
1544 : case '|': {
1545 430 : Advance();
1546 430 : builder->NewAlternative();
1547 430 : continue;
1548 : }
1549 : case '*':
1550 : case '+':
1551 : case '?':
1552 0 : return ReportError(JSMSG_NOTHING_TO_REPEAT);
1553 : case '^': {
1554 259 : Advance();
1555 259 : if (multiline_) {
1556 0 : builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::START_OF_LINE));
1557 : } else {
1558 259 : builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::START_OF_INPUT));
1559 259 : set_contains_anchor();
1560 : }
1561 259 : continue;
1562 : }
1563 : case '$': {
1564 212 : Advance();
1565 : RegExpAssertion::AssertionType assertion_type =
1566 212 : multiline_ ? RegExpAssertion::END_OF_LINE :
1567 212 : RegExpAssertion::END_OF_INPUT;
1568 212 : builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(assertion_type));
1569 212 : continue;
1570 : }
1571 : case '.': {
1572 123 : Advance();
1573 : // everything except \x0a, \x0d, \u2028 and \u2029
1574 123 : if (unicode_) {
1575 0 : builder->AddAtom(UnicodeEverythingAtom(alloc));
1576 0 : break;
1577 : }
1578 123 : CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1579 123 : CharacterRange::AddClassEscape(alloc, '.', ranges);
1580 123 : RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
1581 123 : builder->AddAtom(atom);
1582 123 : break;
1583 : }
1584 : case '(': {
1585 364 : SubexpressionType subexpr_type = CAPTURE;
1586 364 : Advance();
1587 364 : if (current() == '?') {
1588 93 : switch (Next()) {
1589 : case ':':
1590 66 : subexpr_type = GROUPING;
1591 66 : break;
1592 : case '=':
1593 20 : subexpr_type = POSITIVE_LOOKAHEAD;
1594 20 : break;
1595 : case '!':
1596 7 : subexpr_type = NEGATIVE_LOOKAHEAD;
1597 7 : break;
1598 : default:
1599 0 : return ReportError(JSMSG_INVALID_GROUP);
1600 : }
1601 93 : Advance(2);
1602 : } else {
1603 271 : if (captures_ == nullptr)
1604 184 : captures_ = alloc->newInfallible<RegExpCaptureVector>(*alloc);
1605 271 : if (captures_started() >= kMaxCaptures)
1606 0 : return ReportError(JSMSG_TOO_MANY_PARENS);
1607 271 : captures_->append((RegExpCapture*) nullptr);
1608 : }
1609 : // Store current state and begin new disjunction parsing.
1610 364 : stored_state = alloc->newInfallible<RegExpParserState>(alloc, stored_state, subexpr_type,
1611 728 : captures_started());
1612 364 : builder = stored_state->builder();
1613 364 : continue;
1614 : }
1615 : case '[': {
1616 377 : RegExpTree* atom = ParseCharacterClass();
1617 377 : if (!atom)
1618 0 : return nullptr;
1619 377 : builder->AddAtom(atom);
1620 377 : break;
1621 : }
1622 : // Atom ::
1623 : // \ AtomEscape
1624 : case '\\':
1625 599 : switch (Next()) {
1626 : case kEndMarker:
1627 0 : return ReportError(JSMSG_ESCAPE_AT_END_OF_REGEXP);
1628 : case 'b':
1629 18 : Advance(2);
1630 18 : builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::BOUNDARY));
1631 18 : continue;
1632 : case 'B':
1633 0 : Advance(2);
1634 0 : builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::NON_BOUNDARY));
1635 0 : continue;
1636 : // AtomEscape ::
1637 : // CharacterClassEscape
1638 : //
1639 : // CharacterClassEscape :: one of
1640 : // d D s S w W
1641 : case 'D': case 'S': case 'W':
1642 36 : if (unicode_) {
1643 0 : Advance();
1644 0 : builder->AddAtom(UnicodeCharacterClassEscapeAtom(alloc, current(),
1645 0 : ignore_case_));
1646 0 : Advance();
1647 0 : break;
1648 : }
1649 : MOZ_FALLTHROUGH;
1650 : case 'd': case 's': case 'w': {
1651 224 : widechar c = Next();
1652 224 : Advance(2);
1653 : CharacterRangeVector* ranges =
1654 224 : alloc->newInfallible<CharacterRangeVector>(*alloc);
1655 224 : if (unicode_)
1656 0 : CharacterRange::AddClassEscapeUnicode(alloc, c, ranges, ignore_case_);
1657 : else
1658 224 : CharacterRange::AddClassEscape(alloc, c, ranges);
1659 224 : RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
1660 224 : builder->AddAtom(atom);
1661 224 : break;
1662 : }
1663 : case '1': case '2': case '3': case '4': case '5': case '6':
1664 : case '7': case '8': case '9': {
1665 0 : int index = 0;
1666 0 : if (ParseBackReferenceIndex(&index)) {
1667 0 : RegExpCapture* capture = nullptr;
1668 0 : if (captures_ != nullptr && index <= (int) captures_->length()) {
1669 0 : capture = (*captures_)[index - 1];
1670 : }
1671 0 : if (capture == nullptr) {
1672 0 : builder->AddEmpty();
1673 0 : break;
1674 : }
1675 0 : RegExpTree* atom = alloc->newInfallible<RegExpBackReference>(capture);
1676 0 : if (unicode_)
1677 0 : builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom));
1678 : else
1679 0 : builder->AddAtom(atom);
1680 0 : break;
1681 : }
1682 0 : if (unicode_)
1683 0 : return ReportError(JSMSG_BACK_REF_OUT_OF_RANGE);
1684 0 : widechar first_digit = Next();
1685 0 : if (first_digit == '8' || first_digit == '9') {
1686 : // Treat as identity escape
1687 0 : builder->AddCharacter(first_digit);
1688 0 : Advance(2);
1689 0 : break;
1690 : }
1691 : MOZ_FALLTHROUGH;
1692 : }
1693 : case '0': {
1694 2 : if (unicode_) {
1695 0 : Advance(2);
1696 0 : if (IsDecimalDigit(current()))
1697 0 : return ReportError(JSMSG_INVALID_DECIMAL_ESCAPE);
1698 0 : builder->AddCharacter(0);
1699 0 : break;
1700 : }
1701 :
1702 2 : Advance();
1703 2 : widechar octal = ParseOctalLiteral();
1704 2 : builder->AddCharacter(octal);
1705 2 : break;
1706 : }
1707 : // ControlEscape :: one of
1708 : // f n r t v
1709 : case 'f':
1710 0 : Advance(2);
1711 0 : builder->AddCharacter('\f');
1712 0 : break;
1713 : case 'n':
1714 28 : Advance(2);
1715 28 : builder->AddCharacter('\n');
1716 28 : break;
1717 : case 'r':
1718 9 : Advance(2);
1719 9 : builder->AddCharacter('\r');
1720 9 : break;
1721 : case 't':
1722 4 : Advance(2);
1723 4 : builder->AddCharacter('\t');
1724 4 : break;
1725 : case 'v':
1726 0 : Advance(2);
1727 0 : builder->AddCharacter('\v');
1728 0 : break;
1729 : case 'c': {
1730 0 : Advance();
1731 0 : widechar controlLetter = Next();
1732 : // Special case if it is an ASCII letter.
1733 : // Convert lower case letters to uppercase.
1734 0 : widechar letter = controlLetter & ~('a' ^ 'A');
1735 0 : if (letter < 'A' || 'Z' < letter) {
1736 0 : if (unicode_)
1737 0 : return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
1738 : // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
1739 : // This is outside the specification. We match JSC in
1740 : // reading the backslash as a literal character instead
1741 : // of as starting an escape.
1742 0 : builder->AddCharacter('\\');
1743 : } else {
1744 0 : Advance(2);
1745 0 : builder->AddCharacter(controlLetter & 0x1f);
1746 : }
1747 0 : break;
1748 : }
1749 : case 'x': {
1750 6 : Advance(2);
1751 : widechar value;
1752 6 : if (ParseHexEscape(2, &value)) {
1753 6 : builder->AddCharacter(value);
1754 : } else {
1755 0 : if (unicode_)
1756 0 : return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
1757 0 : builder->AddCharacter('x');
1758 : }
1759 6 : break;
1760 : }
1761 : case 'u': {
1762 5 : Advance(2);
1763 : widechar value;
1764 5 : if (unicode_) {
1765 0 : if (current() == '{') {
1766 0 : if (!ParseBracedHexEscape(&value))
1767 0 : return nullptr;
1768 0 : if (unicode::IsLeadSurrogate(value)) {
1769 0 : builder->AddAtom(LeadSurrogateAtom(alloc, value));
1770 0 : } else if (unicode::IsTrailSurrogate(value)) {
1771 0 : builder->AddAtom(TrailSurrogateAtom(alloc, value));
1772 0 : } else if (value >= unicode::NonBMPMin) {
1773 : char16_t lead, trail;
1774 0 : unicode::UTF16Encode(value, &lead, &trail);
1775 0 : builder->AddAtom(SurrogatePairAtom(alloc, lead, trail,
1776 0 : ignore_case_));
1777 : } else {
1778 0 : builder->AddCharacter(value);
1779 : }
1780 0 : } else if (ParseHexEscape(4, &value)) {
1781 0 : if (unicode::IsLeadSurrogate(value)) {
1782 : widechar trail;
1783 0 : if (ParseTrailSurrogate(&trail)) {
1784 0 : builder->AddAtom(SurrogatePairAtom(alloc, value, trail,
1785 0 : ignore_case_));
1786 : } else {
1787 0 : builder->AddAtom(LeadSurrogateAtom(alloc, value));
1788 : }
1789 0 : } else if (unicode::IsTrailSurrogate(value)) {
1790 0 : builder->AddAtom(TrailSurrogateAtom(alloc, value));
1791 : } else {
1792 0 : builder->AddCharacter(value);
1793 : }
1794 : } else {
1795 0 : return ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
1796 : }
1797 5 : break;
1798 : }
1799 5 : if (ParseHexEscape(4, &value)) {
1800 5 : builder->AddCharacter(value);
1801 : } else {
1802 0 : builder->AddCharacter('u');
1803 : }
1804 5 : break;
1805 : }
1806 : default:
1807 : // Identity escape.
1808 303 : if (unicode_ && !IsSyntaxCharacter(Next()))
1809 0 : return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
1810 303 : builder->AddCharacter(Next());
1811 303 : Advance(2);
1812 303 : break;
1813 : }
1814 581 : break;
1815 : case '{': {
1816 0 : if (unicode_)
1817 0 : return ReportError(JSMSG_RAW_BRACE_IN_REGEP);
1818 : int dummy;
1819 0 : if (ParseIntervalQuantifier(&dummy, &dummy))
1820 0 : return ReportError(JSMSG_NOTHING_TO_REPEAT);
1821 : MOZ_FALLTHROUGH;
1822 : }
1823 : default:
1824 4773 : if (unicode_) {
1825 : char16_t lead, trail;
1826 0 : if (ParseRawSurrogatePair(&lead, &trail)) {
1827 0 : builder->AddAtom(SurrogatePairAtom(alloc, lead, trail, ignore_case_));
1828 : } else {
1829 0 : widechar c = current();
1830 0 : if (unicode::IsLeadSurrogate(c))
1831 0 : builder->AddAtom(LeadSurrogateAtom(alloc, c));
1832 0 : else if (unicode::IsTrailSurrogate(c))
1833 0 : builder->AddAtom(TrailSurrogateAtom(alloc, c));
1834 0 : else if (c == ']')
1835 0 : return ReportError(JSMSG_RAW_BRACKET_IN_REGEP);
1836 0 : else if (c == '}')
1837 0 : return ReportError(JSMSG_RAW_BRACE_IN_REGEP);
1838 : else
1839 0 : builder->AddCharacter(c);
1840 0 : Advance();
1841 : }
1842 0 : break;
1843 : }
1844 4773 : builder->AddCharacter(current());
1845 4773 : Advance();
1846 4773 : break;
1847 : } // end switch(current())
1848 :
1849 : int min;
1850 : int max;
1851 6218 : switch (current()) {
1852 : // QuantifierPrefix ::
1853 : // *
1854 : // +
1855 : // ?
1856 : // {
1857 : case '*':
1858 176 : min = 0;
1859 176 : max = RegExpTree::kInfinity;
1860 176 : Advance();
1861 176 : break;
1862 : case '+':
1863 297 : min = 1;
1864 297 : max = RegExpTree::kInfinity;
1865 297 : Advance();
1866 297 : break;
1867 : case '?':
1868 85 : min = 0;
1869 85 : max = 1;
1870 85 : Advance();
1871 85 : break;
1872 : case '{':
1873 94 : if (ParseIntervalQuantifier(&min, &max)) {
1874 94 : if (max < min)
1875 0 : return ReportError(JSMSG_NUMBERS_OUT_OF_ORDER);
1876 94 : break;
1877 : } else {
1878 0 : continue;
1879 : }
1880 : default:
1881 5566 : continue;
1882 : }
1883 652 : RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
1884 652 : if (current() == '?') {
1885 18 : quantifier_type = RegExpQuantifier::NON_GREEDY;
1886 18 : Advance();
1887 : }
1888 652 : builder->AddQuantifierToAtom(min, max, quantifier_type);
1889 : }
1890 : }
1891 :
1892 : template class irregexp::RegExpParser<Latin1Char>;
1893 : template class irregexp::RegExpParser<char16_t>;
1894 :
1895 : template <typename CharT>
1896 : static bool
1897 40 : ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length,
1898 : bool multiline, bool match_only, bool unicode, bool ignore_case,
1899 : bool global, bool sticky, RegExpCompileData* data)
1900 : {
1901 : // We shouldn't strip pattern for exec, or test with global/sticky,
1902 : // to reflect correct match position and lastIndex.
1903 40 : if (match_only && !global && !sticky) {
1904 : // Try to strip a leading '.*' from the RegExp, but only if it is not
1905 : // followed by a '?' (which will affect how the .* is parsed). This
1906 : // pattern will affect the captures produced by the RegExp, but not
1907 : // whether there is a match or not.
1908 18 : if (length >= 3 && chars[0] == '.' && chars[1] == '*' && chars[2] != '?') {
1909 0 : chars += 2;
1910 0 : length -= 2;
1911 : }
1912 :
1913 : // Try to strip a trailing '.*' from the RegExp, which as above will
1914 : // affect the captures but not whether there is a match. Only do this
1915 : // when there are no other meta characters in the RegExp, so that we
1916 : // are sure this will not affect how the RegExp is parsed.
1917 37 : if (length >= 3 && !HasRegExpMetaChars(chars, length - 2) &&
1918 19 : chars[length - 2] == '.' && chars[length - 1] == '*')
1919 : {
1920 0 : length -= 2;
1921 : }
1922 : }
1923 :
1924 40 : RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case);
1925 40 : data->tree = parser.ParsePattern();
1926 40 : if (!data->tree)
1927 0 : return false;
1928 :
1929 40 : data->simple = parser.simple();
1930 40 : data->contains_anchor = parser.contains_anchor();
1931 40 : data->capture_count = parser.captures_started();
1932 40 : return true;
1933 : }
1934 :
1935 : bool
1936 40 : irregexp::ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
1937 : bool multiline, bool match_only, bool unicode, bool ignore_case,
1938 : bool global, bool sticky, RegExpCompileData* data)
1939 : {
1940 80 : JS::AutoCheckCannotGC nogc;
1941 40 : return str->hasLatin1Chars()
1942 40 : ? ::ParsePattern(ts, alloc, str->latin1Chars(nogc), str->length(),
1943 : multiline, match_only, unicode, ignore_case, global, sticky, data)
1944 0 : : ::ParsePattern(ts, alloc, str->twoByteChars(nogc), str->length(),
1945 80 : multiline, match_only, unicode, ignore_case, global, sticky, data);
1946 : }
1947 :
1948 : template <typename CharT>
1949 : static bool
1950 717 : ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length,
1951 : bool unicode)
1952 : {
1953 1434 : LifoAllocScope scope(&alloc);
1954 :
1955 717 : RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, false);
1956 1434 : return parser.ParsePattern() != nullptr;
1957 : }
1958 :
1959 : bool
1960 676 : irregexp::ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
1961 : bool unicode)
1962 : {
1963 1352 : JS::AutoCheckCannotGC nogc;
1964 676 : return str->hasLatin1Chars()
1965 676 : ? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode)
1966 1352 : : ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode);
1967 : }
1968 :
1969 : bool
1970 41 : irregexp::ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc,
1971 : const mozilla::Range<const char16_t> chars, bool unicode)
1972 : {
1973 41 : return ::ParsePatternSyntax(ts, alloc, chars.begin().get(), chars.length(), unicode);
1974 : }
|