Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * vim: set ts=8 sts=4 et sw=4 tw=99: */
3 :
4 : // Copyright 2012 the V8 project authors. All rights reserved.
5 : // Redistribution and use in source and binary forms, with or without
6 : // modification, are permitted provided that the following conditions are
7 : // met:
8 : //
9 : // * Redistributions of source code must retain the above copyright
10 : // notice, this list of conditions and the following disclaimer.
11 : // * Redistributions in binary form must reproduce the above
12 : // copyright notice, this list of conditions and the following
13 : // disclaimer in the documentation and/or other materials provided
14 : // with the distribution.
15 : // * Neither the name of Google Inc. nor the names of its
16 : // contributors may be used to endorse or promote products derived
17 : // from this software without specific prior written permission.
18 : //
19 : // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 : // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 : // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 : // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 : // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 : // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 : // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 : // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 : // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 : // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 : // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 :
31 : #ifndef V8_JSREGEXP_H_
32 : #define V8_JSREGEXP_H_
33 :
34 : #include "jscntxt.h"
35 :
36 : #include "ds/SplayTree.h"
37 : #include "jit/Label.h"
38 : #include "vm/RegExpObject.h"
39 :
40 : namespace js {
41 :
42 : class MatchPairs;
43 : class RegExpShared;
44 :
45 : namespace jit {
46 : class Label;
47 : class JitCode;
48 : }
49 :
50 : namespace irregexp {
51 :
52 : class RegExpTree;
53 : class RegExpMacroAssembler;
54 :
55 : struct RegExpCompileData
56 : {
57 40 : RegExpCompileData()
58 40 : : tree(nullptr),
59 : simple(true),
60 : contains_anchor(false),
61 40 : capture_count(0)
62 40 : {}
63 :
64 : RegExpTree* tree;
65 : bool simple;
66 : bool contains_anchor;
67 : int capture_count;
68 : };
69 :
70 : struct RegExpCode
71 : {
72 : jit::JitCode* jitCode;
73 : uint8_t* byteCode;
74 :
75 40 : RegExpCode()
76 40 : : jitCode(nullptr), byteCode(nullptr)
77 40 : {}
78 :
79 80 : bool empty() {
80 80 : return !jitCode && !byteCode;
81 : }
82 :
83 0 : void destroy() {
84 0 : js_free(byteCode);
85 0 : }
86 : };
87 :
88 : RegExpCode
89 : CompilePattern(JSContext* cx, HandleRegExpShared shared, RegExpCompileData* data,
90 : HandleLinearString sample, bool is_global, bool ignore_case,
91 : bool is_latin1, bool match_only, bool force_bytecode, bool sticky,
92 : bool unicode, RegExpShared::JitCodeTables& tables);
93 :
94 : // Note: this may return RegExpRunStatus_Error if an interrupt was requested
95 : // while the code was executing.
96 : template <typename CharT>
97 : RegExpRunStatus
98 : ExecuteCode(JSContext* cx, jit::JitCode* codeBlock, const CharT* chars, size_t start,
99 : size_t length, MatchPairs* matches, size_t* endIndex);
100 :
101 : template <typename CharT>
102 : RegExpRunStatus
103 : InterpretCode(JSContext* cx, const uint8_t* byteCode, const CharT* chars, size_t start,
104 : size_t length, MatchPairs* matches, size_t* endIndex);
105 :
106 : #define FOR_EACH_NODE_TYPE(VISIT) \
107 : VISIT(End) \
108 : VISIT(Action) \
109 : VISIT(Choice) \
110 : VISIT(BackReference) \
111 : VISIT(Assertion) \
112 : VISIT(Text)
113 :
114 : #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
115 : VISIT(Disjunction) \
116 : VISIT(Alternative) \
117 : VISIT(Assertion) \
118 : VISIT(CharacterClass) \
119 : VISIT(Atom) \
120 : VISIT(Quantifier) \
121 : VISIT(Capture) \
122 : VISIT(Lookahead) \
123 : VISIT(BackReference) \
124 : VISIT(Empty) \
125 : VISIT(Text)
126 :
127 : #define FORWARD_DECLARE(Name) class RegExp##Name;
128 : FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
129 : #undef FORWARD_DECLARE
130 :
131 : // InfallibleVector is like Vector, but all its methods are infallible (they
132 : // crash on OOM). We use this class instead of Vector to avoid a ton of
133 : // MOZ_MUST_USE warnings in irregexp code (imported from V8).
134 : template<typename T, size_t N>
135 93 : class InfallibleVector
136 : {
137 : Vector<T, N, LifoAllocPolicy<Infallible>> vector_;
138 :
139 : InfallibleVector(const InfallibleVector&) = delete;
140 : void operator=(const InfallibleVector&) = delete;
141 :
142 : public:
143 3706 : explicit InfallibleVector(const LifoAllocPolicy<Infallible>& alloc) : vector_(alloc) {}
144 :
145 18465 : void append(const T& t) { MOZ_ALWAYS_TRUE(vector_.append(t)); }
146 28 : void append(const T* begin, size_t length) { MOZ_ALWAYS_TRUE(vector_.append(begin, length)); }
147 :
148 0 : void clear() { vector_.clear(); }
149 22 : void popBack() { vector_.popBack(); }
150 1265 : void reserve(size_t n) { MOZ_ALWAYS_TRUE(vector_.reserve(n)); }
151 :
152 16132 : size_t length() const { return vector_.length(); }
153 238 : T popCopy() { return vector_.popCopy(); }
154 :
155 28 : T* begin() { return vector_.begin(); }
156 0 : const T* begin() const { return vector_.begin(); }
157 :
158 31696 : T& operator[](size_t index) { return vector_[index]; }
159 2439 : const T& operator[](size_t index) const { return vector_[index]; }
160 :
161 0 : InfallibleVector& operator=(InfallibleVector&& rhs) { vector_ = Move(rhs.vector_); return *this; }
162 : };
163 :
164 : class CharacterRange;
165 : typedef InfallibleVector<CharacterRange, 1> CharacterRangeVector;
166 :
167 : // Represents code units in the range from from_ to to_, both ends are
168 : // inclusive.
169 : class CharacterRange
170 : {
171 : public:
172 : CharacterRange()
173 : : from_(0), to_(0)
174 : {}
175 :
176 3179 : CharacterRange(char16_t from, char16_t to)
177 3179 : : from_(from), to_(to)
178 3179 : {}
179 :
180 : static void AddClassEscape(LifoAlloc* alloc, char16_t type, CharacterRangeVector* ranges);
181 : static void AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type,
182 : CharacterRangeVector* ranges, bool ignoreCase);
183 :
184 589 : static inline CharacterRange Singleton(char16_t value) {
185 589 : return CharacterRange(value, value);
186 : }
187 331 : static inline CharacterRange Range(char16_t from, char16_t to) {
188 331 : MOZ_ASSERT(from <= to);
189 331 : return CharacterRange(from, to);
190 : }
191 18 : static inline CharacterRange Everything() {
192 18 : return CharacterRange(0, 0xFFFF);
193 : }
194 493 : bool Contains(char16_t i) { return from_ <= i && i <= to_; }
195 1925 : char16_t from() const { return from_; }
196 0 : void set_from(char16_t value) { from_ = value; }
197 1443 : char16_t to() const { return to_; }
198 50 : void set_to(char16_t value) { to_ = value; }
199 : bool is_valid() { return from_ <= to_; }
200 87 : bool IsEverything(char16_t max) { return from_ == 0 && to_ >= max; }
201 : bool IsSingleton() { return (from_ == to_); }
202 : void AddCaseEquivalents(bool is_latin1, bool unicode, CharacterRangeVector* ranges);
203 :
204 : static void Split(const LifoAlloc* alloc,
205 : CharacterRangeVector base,
206 : const Vector<int>& overlay,
207 : CharacterRangeVector* included,
208 : CharacterRangeVector* excluded);
209 :
210 : // Whether a range list is in canonical form: Ranges ordered by from value,
211 : // and ranges non-overlapping and non-adjacent.
212 : static bool IsCanonical(const CharacterRangeVector& ranges);
213 :
214 : // Convert range list to canonical form. The characters covered by the ranges
215 : // will still be the same, but no character is in more than one range, and
216 : // adjacent ranges are merged. The resulting list may be shorter than the
217 : // original, but cannot be longer.
218 : static void Canonicalize(CharacterRangeVector& ranges);
219 :
220 : // Negate the contents of a character range in canonical form.
221 : static void Negate(const LifoAlloc* alloc,
222 : CharacterRangeVector src,
223 : CharacterRangeVector* dst);
224 :
225 : static const int kStartMarker = (1 << 24);
226 : static const int kPayloadMask = (1 << 24) - 1;
227 :
228 : private:
229 : char16_t from_;
230 : char16_t to_;
231 : };
232 :
233 : // A set of unsigned integers that behaves especially well on small
234 : // integers (< 32).
235 : class OutSet
236 : {
237 : public:
238 498 : OutSet()
239 498 : : first_(0), remaining_(nullptr), successors_(nullptr)
240 498 : {}
241 :
242 : OutSet* Extend(LifoAlloc* alloc, unsigned value);
243 : bool Get(unsigned value);
244 : static const unsigned kFirstLimit = 32;
245 :
246 : private:
247 : typedef InfallibleVector<OutSet*, 1> OutSetVector;
248 : typedef InfallibleVector<unsigned, 1> RemainingVector;
249 :
250 : // Destructively set a value in this set. In most cases you want
251 : // to use Extend instead to ensure that only one instance exists
252 : // that contains the same values.
253 : void Set(LifoAlloc* alloc, unsigned value);
254 :
255 : // The successors are a list of sets that contain the same values
256 : // as this set and the one more value that is not present in this
257 : // set.
258 : OutSetVector* successors() { return successors_; }
259 :
260 : OutSet(uint32_t first, RemainingVector* remaining)
261 : : first_(first), remaining_(remaining), successors_(nullptr)
262 : {}
263 :
264 0 : RemainingVector& remaining() { return *remaining_; }
265 :
266 : uint32_t first_;
267 : RemainingVector* remaining_;
268 : OutSetVector* successors_;
269 : friend class Trace;
270 : };
271 :
272 : // A mapping from integers, specified as ranges, to a set of integers.
273 : // Used for mapping character ranges to choices.
274 : class DispatchTable
275 : {
276 : public:
277 : explicit DispatchTable(LifoAlloc* alloc)
278 : {}
279 :
280 : class Entry {
281 : public:
282 : Entry()
283 : : from_(0), to_(0), out_set_(nullptr)
284 : {}
285 :
286 : Entry(char16_t from, char16_t to, OutSet* out_set)
287 : : from_(from), to_(to), out_set_(out_set)
288 : {}
289 :
290 : char16_t from() { return from_; }
291 : char16_t to() { return to_; }
292 : void set_to(char16_t value) { to_ = value; }
293 : void AddValue(LifoAlloc* alloc, int value) {
294 : out_set_ = out_set_->Extend(alloc, value);
295 : }
296 : OutSet* out_set() { return out_set_; }
297 : private:
298 : char16_t from_;
299 : char16_t to_;
300 : OutSet* out_set_;
301 : };
302 :
303 : void AddRange(LifoAlloc* alloc, CharacterRange range, int value);
304 : OutSet* Get(char16_t value);
305 : void Dump();
306 :
307 : private:
308 : // There can't be a static empty set since it allocates its
309 : // successors in a LifoAlloc and caches them.
310 : OutSet* empty() { return &empty_; }
311 : OutSet empty_;
312 : };
313 :
314 : class TextElement
315 : {
316 : public:
317 : enum TextType {
318 : ATOM,
319 : CHAR_CLASS
320 : };
321 :
322 : static TextElement Atom(RegExpAtom* atom);
323 : static TextElement CharClass(RegExpCharacterClass* char_class);
324 :
325 4337 : int cp_offset() const { return cp_offset_; }
326 212 : void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
327 : int length() const;
328 :
329 4293 : TextType text_type() const { return text_type_; }
330 :
331 1556 : RegExpTree* tree() const { return tree_; }
332 :
333 1266 : RegExpAtom* atom() const {
334 1266 : MOZ_ASSERT(text_type() == ATOM);
335 1266 : return reinterpret_cast<RegExpAtom*>(tree());
336 : }
337 :
338 290 : RegExpCharacterClass* char_class() const {
339 290 : MOZ_ASSERT(text_type() == CHAR_CLASS);
340 290 : return reinterpret_cast<RegExpCharacterClass*>(tree());
341 : }
342 :
343 : private:
344 287 : TextElement(TextType text_type, RegExpTree* tree)
345 287 : : cp_offset_(-1), text_type_(text_type), tree_(tree)
346 287 : {}
347 :
348 : int cp_offset_;
349 : TextType text_type_;
350 : RegExpTree* tree_;
351 : };
352 :
353 : typedef InfallibleVector<TextElement, 1> TextElementVector;
354 :
355 : class NodeVisitor;
356 : class RegExpCompiler;
357 : class Trace;
358 : class BoyerMooreLookahead;
359 :
360 : struct NodeInfo
361 : {
362 610 : NodeInfo()
363 610 : : being_analyzed(false),
364 : been_analyzed(false),
365 : follows_word_interest(false),
366 : follows_newline_interest(false),
367 : follows_start_interest(false),
368 : at_end(false),
369 : visited(false),
370 610 : replacement_calculated(false)
371 610 : {}
372 :
373 : // Returns true if the interests and assumptions of this node
374 : // matches the given one.
375 : bool Matches(NodeInfo* that) {
376 : return (at_end == that->at_end) &&
377 : (follows_word_interest == that->follows_word_interest) &&
378 : (follows_newline_interest == that->follows_newline_interest) &&
379 : (follows_start_interest == that->follows_start_interest);
380 : }
381 :
382 : // Updates the interests of this node given the interests of the
383 : // node preceding it.
384 : void AddFromPreceding(NodeInfo* that) {
385 : at_end |= that->at_end;
386 : follows_word_interest |= that->follows_word_interest;
387 : follows_newline_interest |= that->follows_newline_interest;
388 : follows_start_interest |= that->follows_start_interest;
389 : }
390 :
391 : bool HasLookbehind() {
392 : return follows_word_interest ||
393 : follows_newline_interest ||
394 : follows_start_interest;
395 : }
396 :
397 : // Sets the interests of this node to include the interests of the
398 : // following node.
399 392 : void AddFromFollowing(NodeInfo* that) {
400 392 : follows_word_interest |= that->follows_word_interest;
401 392 : follows_newline_interest |= that->follows_newline_interest;
402 392 : follows_start_interest |= that->follows_start_interest;
403 392 : }
404 :
405 : void ResetCompilationState() {
406 : being_analyzed = false;
407 : been_analyzed = false;
408 : }
409 :
410 : bool being_analyzed: 1;
411 : bool been_analyzed: 1;
412 :
413 : // These bits are set of this node has to know what the preceding
414 : // character was.
415 : bool follows_word_interest: 1;
416 : bool follows_newline_interest: 1;
417 : bool follows_start_interest: 1;
418 :
419 : bool at_end: 1;
420 : bool visited: 1;
421 : bool replacement_calculated: 1;
422 : };
423 :
424 : // Details of a quick mask-compare check that can look ahead in the
425 : // input stream.
426 : class QuickCheckDetails
427 : {
428 : public:
429 1297 : QuickCheckDetails()
430 1297 : : characters_(0),
431 : mask_(0),
432 : value_(0),
433 1297 : cannot_match_(false)
434 1297 : {}
435 :
436 46 : explicit QuickCheckDetails(int characters)
437 46 : : characters_(characters),
438 : mask_(0),
439 : value_(0),
440 46 : cannot_match_(false)
441 46 : {}
442 :
443 : bool Rationalize(bool latin1);
444 :
445 : // Merge in the information from another branch of an alternation.
446 : void Merge(QuickCheckDetails* other, int from_index);
447 :
448 : // Advance the current position by some amount.
449 : void Advance(int by);
450 :
451 : void Clear();
452 :
453 262 : bool cannot_match() { return cannot_match_; }
454 0 : void set_cannot_match() { cannot_match_ = true; }
455 :
456 4488 : int characters() { return characters_; }
457 208 : void set_characters(int characters) { characters_ = characters; }
458 :
459 : struct Position {
460 5372 : Position() : mask(0), value(0), determines_perfectly(false) { }
461 : char16_t mask;
462 : char16_t value;
463 : bool determines_perfectly;
464 : };
465 :
466 1285 : Position* positions(int index) {
467 1285 : MOZ_ASSERT(index >= 0);
468 1285 : MOZ_ASSERT(index < characters_);
469 1285 : return positions_ + index;
470 : }
471 :
472 110 : uint32_t mask() { return mask_; }
473 110 : uint32_t value() { return value_; }
474 :
475 : private:
476 : // How many characters do we have quick check information from. This is
477 : // the same for all branches of a choice node.
478 : int characters_;
479 : Position positions_[4];
480 :
481 : // These values are the condensate of the above array after Rationalize().
482 : uint32_t mask_;
483 : uint32_t value_;
484 :
485 : // If set to true, there is no way this quick check can match at all.
486 : // E.g., if it requires to be at the start of the input, and isn't.
487 : bool cannot_match_;
488 : };
489 :
490 : class RegExpNode
491 : {
492 : public:
493 : explicit RegExpNode(LifoAlloc* alloc);
494 0 : virtual ~RegExpNode() {}
495 : virtual void Accept(NodeVisitor* visitor) = 0;
496 :
497 : // Generates a goto to this node or actually generates the code at this point.
498 : virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
499 :
500 : // How many characters must this node consume at a minimum in order to
501 : // succeed. If we have found at least 'still_to_find' characters that
502 : // must be consumed there is no need to ask any following nodes whether
503 : // they are sure to eat any more characters. The not_at_start argument is
504 : // used to indicate that we know we are not at the start of the input. In
505 : // this case anchored branches will always fail and can be ignored when
506 : // determining how many characters are consumed on success.
507 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
508 :
509 : // Emits some quick code that checks whether the preloaded characters match.
510 : // Falls through on certain failure, jumps to the label on possible success.
511 : // If the node cannot make a quick check it does nothing and returns false.
512 : bool EmitQuickCheck(RegExpCompiler* compiler,
513 : Trace* trace,
514 : bool preload_has_checked_bounds,
515 : jit::Label* on_possible_success,
516 : QuickCheckDetails* details_return,
517 : bool fall_through_on_failure);
518 :
519 : // For a given number of characters this returns a mask and a value. The
520 : // next n characters are anded with the mask and compared with the value.
521 : // A comparison failure indicates the node cannot match the next n characters.
522 : // A comparison success indicates the node may match.
523 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
524 : RegExpCompiler* compiler,
525 : int characters_filled_in,
526 : bool not_at_start) = 0;
527 :
528 : static const int kNodeIsTooComplexForGreedyLoops = -1;
529 :
530 18 : virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
531 :
532 : // Only returns the successor for a text node of length 1 that matches any
533 : // character and that has no guards on it.
534 21 : virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(RegExpCompiler* compiler) {
535 21 : return nullptr;
536 : }
537 :
538 : static const int kRecursionBudget = 200;
539 :
540 : // Collects information on the possible code units (mod 128) that can match if
541 : // we look forward. This is used for a Boyer-Moore-like string searching
542 : // implementation. TODO(erikcorry): This should share more code with
543 : // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit
544 : // the number of nodes we are willing to look at in order to create this data.
545 0 : virtual bool FillInBMInfo(int offset,
546 : int budget,
547 : BoyerMooreLookahead* bm,
548 : bool not_at_start) {
549 0 : MOZ_CRASH("Bad call");
550 : }
551 :
552 : // If we know that the input is ASCII then there are some nodes that can
553 : // never match. This method returns a node that can be substituted for
554 : // itself, or nullptr if the node can never match.
555 46 : virtual RegExpNode* FilterLATIN1(int depth, bool ignore_case, bool unicode) { return this; }
556 :
557 : // Helper for FilterLATIN1.
558 130 : RegExpNode* replacement() {
559 130 : MOZ_ASSERT(info()->replacement_calculated);
560 130 : return replacement_;
561 : }
562 401 : RegExpNode* set_replacement(RegExpNode* replacement) {
563 401 : info()->replacement_calculated = true;
564 401 : replacement_ = replacement;
565 401 : return replacement; // For convenience.
566 : }
567 :
568 : // We want to avoid recalculating the lookahead info, so we store it on the
569 : // node. Only info that is for this node is stored. We can tell that the
570 : // info is for this node when offset == 0, so the information is calculated
571 : // relative to this node.
572 32 : void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
573 32 : if (offset == 0) set_bm_info(not_at_start, bm);
574 32 : }
575 :
576 539 : jit::Label* label() { return &label_; }
577 :
578 : // If non-generic code is generated for a node (i.e. the node is not at the
579 : // start of the trace) then it cannot be reused. This variable sets a limit
580 : // on how often we allow that to happen before we insist on starting a new
581 : // trace and generating generic code for a node that can be reused by flushing
582 : // the deferred actions in the current trace and generating a goto.
583 : static const int kMaxCopiesCodeGenerated = 10;
584 :
585 5746 : NodeInfo* info() { return &info_; }
586 :
587 20 : BoyerMooreLookahead* bm_info(bool not_at_start) {
588 20 : return bm_info_[not_at_start ? 1 : 0];
589 : }
590 :
591 1270 : LifoAlloc* alloc() const { return alloc_; }
592 :
593 : protected:
594 : enum LimitResult { DONE, CONTINUE };
595 : RegExpNode* replacement_;
596 :
597 : LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
598 :
599 42 : void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
600 42 : bm_info_[not_at_start ? 1 : 0] = bm;
601 42 : }
602 :
603 : private:
604 : static const int kFirstCharBudget = 10;
605 : jit::Label label_;
606 : NodeInfo info_;
607 :
608 : // This variable keeps track of how many times code has been generated for
609 : // this node (in different traces). We don't keep track of where the
610 : // generated code is located unless the code is generated at the start of
611 : // a trace, in which case it is generic and can be reused by flushing the
612 : // deferred operations in the current trace and generating a goto.
613 : int trace_count_;
614 : BoyerMooreLookahead* bm_info_[2];
615 :
616 : LifoAlloc* alloc_;
617 : };
618 :
619 : // A simple closed interval.
620 : class Interval
621 : {
622 : public:
623 100 : Interval() : from_(kNone), to_(kNone) { }
624 :
625 115 : Interval(int from, int to) : from_(from), to_(to) { }
626 :
627 24 : Interval Union(Interval that) {
628 24 : if (that.from_ == kNone)
629 19 : return *this;
630 5 : else if (from_ == kNone)
631 4 : return that;
632 : else
633 1 : return Interval(Min(from_, that.from_), Max(to_, that.to_));
634 : }
635 :
636 24 : bool Contains(int value) {
637 24 : return (from_ <= value) && (value <= to_);
638 : }
639 :
640 83 : bool is_empty() { return from_ == kNone; }
641 :
642 1923 : int from() const { return from_; }
643 959 : int to() const { return to_; }
644 :
645 100 : static Interval Empty() { return Interval(); }
646 : static const int kNone = -1;
647 :
648 : private:
649 : int from_;
650 : int to_;
651 : };
652 :
653 0 : class SeqRegExpNode : public RegExpNode
654 : {
655 : public:
656 437 : explicit SeqRegExpNode(RegExpNode* on_success)
657 437 : : RegExpNode(on_success->alloc()), on_success_(on_success)
658 437 : {}
659 :
660 1629 : RegExpNode* on_success() { return on_success_; }
661 : void set_on_success(RegExpNode* node) { on_success_ = node; }
662 : virtual RegExpNode* FilterLATIN1(int depth, bool ignore_case, bool unicode);
663 : virtual bool FillInBMInfo(int offset,
664 : int budget,
665 : BoyerMooreLookahead* bm,
666 : bool not_at_start);
667 :
668 : protected:
669 : RegExpNode* FilterSuccessor(int depth, bool ignore_case, bool unicode);
670 :
671 : private:
672 : RegExpNode* on_success_;
673 : };
674 :
675 0 : class ActionNode : public SeqRegExpNode
676 : {
677 : public:
678 : enum ActionType {
679 : SET_REGISTER,
680 : INCREMENT_REGISTER,
681 : STORE_POSITION,
682 : BEGIN_SUBMATCH,
683 : POSITIVE_SUBMATCH_SUCCESS,
684 : EMPTY_MATCH_CHECK,
685 : CLEAR_CAPTURES
686 : };
687 :
688 178 : ActionNode(ActionType action_type, RegExpNode* on_success)
689 178 : : SeqRegExpNode(on_success),
690 178 : action_type_(action_type)
691 178 : {}
692 :
693 : static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
694 : static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
695 : static ActionNode* StorePosition(int reg,
696 : bool is_capture,
697 : RegExpNode* on_success);
698 : static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
699 : static ActionNode* BeginSubmatch(int stack_pointer_reg,
700 : int position_reg,
701 : RegExpNode* on_success);
702 : static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
703 : int restore_reg,
704 : int clear_capture_count,
705 : int clear_capture_from,
706 : RegExpNode* on_success);
707 : static ActionNode* EmptyMatchCheck(int start_register,
708 : int repetition_register,
709 : int repetition_limit,
710 : RegExpNode* on_success);
711 : virtual void Accept(NodeVisitor* visitor);
712 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
713 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
714 119 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
715 : RegExpCompiler* compiler,
716 : int filled_in,
717 : bool not_at_start) {
718 238 : return on_success()->GetQuickCheckDetails(
719 238 : details, compiler, filled_in, not_at_start);
720 : }
721 : virtual bool FillInBMInfo(int offset,
722 : int budget,
723 : BoyerMooreLookahead* bm,
724 : bool not_at_start);
725 : ActionType action_type() { return action_type_; }
726 : // TODO(erikcorry): We should allow some action nodes in greedy loops.
727 49 : virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
728 :
729 : private:
730 : union {
731 : struct {
732 : int reg;
733 : int value;
734 : } u_store_register;
735 : struct {
736 : int reg;
737 : } u_increment_register;
738 : struct {
739 : int reg;
740 : bool is_capture;
741 : } u_position_register;
742 : struct {
743 : int stack_pointer_register;
744 : int current_position_register;
745 : int clear_register_count;
746 : int clear_register_from;
747 : } u_submatch;
748 : struct {
749 : int start_register;
750 : int repetition_register;
751 : int repetition_limit;
752 : } u_empty_match_check;
753 : struct {
754 : int range_from;
755 : int range_to;
756 : } u_clear_captures;
757 : } data_;
758 : ActionType action_type_;
759 : friend class DotPrinter;
760 : };
761 :
762 0 : class TextNode : public SeqRegExpNode
763 : {
764 : public:
765 125 : TextNode(TextElementVector* elements,
766 : RegExpNode* on_success)
767 125 : : SeqRegExpNode(on_success),
768 125 : elements_(elements)
769 125 : {}
770 :
771 90 : TextNode(RegExpCharacterClass* that,
772 : RegExpNode* on_success)
773 90 : : SeqRegExpNode(on_success),
774 90 : elements_(alloc()->newInfallible<TextElementVector>(*alloc()))
775 : {
776 90 : elements_->append(TextElement::CharClass(that));
777 90 : }
778 :
779 : virtual void Accept(NodeVisitor* visitor);
780 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
781 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
782 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
783 : RegExpCompiler* compiler,
784 : int characters_filled_in,
785 : bool not_at_start);
786 4552 : TextElementVector& elements() { return *elements_; }
787 : void MakeCaseIndependent(bool is_latin1, bool unicode);
788 : virtual int GreedyLoopTextLength();
789 : virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
790 : RegExpCompiler* compiler);
791 : virtual bool FillInBMInfo(int offset,
792 : int budget,
793 : BoyerMooreLookahead* bm,
794 : bool not_at_start);
795 : void CalculateOffsets();
796 : virtual RegExpNode* FilterLATIN1(int depth, bool ignore_case, bool unicode);
797 :
798 : private:
799 : enum TextEmitPassType {
800 : NON_LATIN1_MATCH, // Check for characters that can't match.
801 : SIMPLE_CHARACTER_MATCH, // Case-dependent single character check.
802 : CASE_SINGLE_CHARACTER_MATCH, // Case-independent single character check.
803 : CASE_MUTLI_CHARACTER_MATCH, // Case-independent single character with
804 : // multiple variation.
805 : CHARACTER_CLASS_MATCH // Character class.
806 : };
807 : static bool SkipPass(int pass, bool ignore_case);
808 : static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
809 : static const int kLastPass = CHARACTER_CLASS_MATCH;
810 : void TextEmitPass(RegExpCompiler* compiler,
811 : TextEmitPassType pass,
812 : bool preloaded,
813 : Trace* trace,
814 : bool first_element_checked,
815 : int* checked_up_to);
816 : int Length();
817 : TextElementVector* elements_;
818 : };
819 :
820 0 : class AssertionNode : public SeqRegExpNode
821 : {
822 : public:
823 : enum AssertionType {
824 : AT_END,
825 : AT_START,
826 : AT_BOUNDARY,
827 : AT_NON_BOUNDARY,
828 : AFTER_NEWLINE,
829 : NOT_AFTER_LEAD_SURROGATE,
830 : NOT_IN_SURROGATE_PAIR
831 : };
832 44 : AssertionNode(AssertionType t, RegExpNode* on_success)
833 44 : : SeqRegExpNode(on_success), assertion_type_(t)
834 44 : {}
835 :
836 21 : static AssertionNode* AtEnd(RegExpNode* on_success) {
837 21 : return on_success->alloc()->newInfallible<AssertionNode>(AT_END, on_success);
838 : }
839 21 : static AssertionNode* AtStart(RegExpNode* on_success) {
840 21 : return on_success->alloc()->newInfallible<AssertionNode>(AT_START, on_success);
841 : }
842 2 : static AssertionNode* AtBoundary(RegExpNode* on_success) {
843 2 : return on_success->alloc()->newInfallible<AssertionNode>(AT_BOUNDARY, on_success);
844 : }
845 0 : static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
846 0 : return on_success->alloc()->newInfallible<AssertionNode>(AT_NON_BOUNDARY, on_success);
847 : }
848 0 : static AssertionNode* AfterNewline(RegExpNode* on_success) {
849 0 : return on_success->alloc()->newInfallible<AssertionNode>(AFTER_NEWLINE, on_success);
850 : }
851 0 : static AssertionNode* NotAfterLeadSurrogate(RegExpNode* on_success) {
852 0 : return on_success->alloc()->newInfallible<AssertionNode>(NOT_AFTER_LEAD_SURROGATE,
853 0 : on_success);
854 : }
855 0 : static AssertionNode* NotInSurrogatePair(RegExpNode* on_success) {
856 0 : return on_success->alloc()->newInfallible<AssertionNode>(NOT_IN_SURROGATE_PAIR,
857 0 : on_success);
858 : }
859 : virtual void Accept(NodeVisitor* visitor);
860 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
861 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
862 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
863 : RegExpCompiler* compiler,
864 : int filled_in,
865 : bool not_at_start);
866 : virtual bool FillInBMInfo(int offset,
867 : int budget,
868 : BoyerMooreLookahead* bm,
869 : bool not_at_start);
870 50 : AssertionType assertion_type() { return assertion_type_; }
871 :
872 : private:
873 : void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
874 : enum IfPrevious { kIsNonWord, kIsWord };
875 : void BacktrackIfPrevious(RegExpCompiler* compiler,
876 : Trace* trace,
877 : IfPrevious backtrack_if_previous);
878 : AssertionType assertion_type_;
879 : };
880 :
881 0 : class BackReferenceNode : public SeqRegExpNode
882 : {
883 : public:
884 0 : BackReferenceNode(int start_reg,
885 : int end_reg,
886 : RegExpNode* on_success)
887 0 : : SeqRegExpNode(on_success),
888 : start_reg_(start_reg),
889 0 : end_reg_(end_reg)
890 0 : {}
891 :
892 : virtual void Accept(NodeVisitor* visitor);
893 : int start_register() { return start_reg_; }
894 : int end_register() { return end_reg_; }
895 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
896 : virtual int EatsAtLeast(int still_to_find,
897 : int recursion_depth,
898 : bool not_at_start);
899 0 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
900 : RegExpCompiler* compiler,
901 : int characters_filled_in,
902 : bool not_at_start) {
903 0 : return;
904 : }
905 : virtual bool FillInBMInfo(int offset,
906 : int budget,
907 : BoyerMooreLookahead* bm,
908 : bool not_at_start);
909 :
910 : private:
911 : int start_reg_;
912 : int end_reg_;
913 : };
914 :
915 0 : class EndNode : public RegExpNode
916 : {
917 : public:
918 : enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
919 :
920 43 : explicit EndNode(LifoAlloc* alloc, Action action)
921 43 : : RegExpNode(alloc), action_(action)
922 43 : {}
923 :
924 : virtual void Accept(NodeVisitor* visitor);
925 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
926 114 : virtual int EatsAtLeast(int still_to_find,
927 : int recursion_depth,
928 114 : bool not_at_start) { return 0; }
929 0 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
930 : RegExpCompiler* compiler,
931 : int characters_filled_in,
932 : bool not_at_start)
933 : {
934 : // Returning 0 from EatsAtLeast should ensure we never get here.
935 0 : MOZ_CRASH("Bad call");
936 : }
937 0 : virtual bool FillInBMInfo(int offset,
938 : int budget,
939 : BoyerMooreLookahead* bm,
940 : bool not_at_start) {
941 : // Returning 0 from EatsAtLeast should ensure we never get here.
942 0 : MOZ_CRASH("Bad call");
943 : }
944 :
945 : private:
946 : Action action_;
947 : };
948 :
949 0 : class NegativeSubmatchSuccess : public EndNode
950 : {
951 : public:
952 2 : NegativeSubmatchSuccess(LifoAlloc* alloc,
953 : int stack_pointer_reg,
954 : int position_reg,
955 : int clear_capture_count,
956 : int clear_capture_start)
957 2 : : EndNode(alloc, NEGATIVE_SUBMATCH_SUCCESS),
958 : stack_pointer_register_(stack_pointer_reg),
959 : current_position_register_(position_reg),
960 : clear_capture_count_(clear_capture_count),
961 2 : clear_capture_start_(clear_capture_start)
962 2 : {}
963 :
964 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
965 :
966 : private:
967 : int stack_pointer_register_;
968 : int current_position_register_;
969 : int clear_capture_count_;
970 : int clear_capture_start_;
971 : };
972 :
973 : class Guard
974 : {
975 : public:
976 : enum Relation { LT, GEQ };
977 16 : Guard(int reg, Relation op, int value)
978 16 : : reg_(reg),
979 : op_(op),
980 16 : value_(value)
981 16 : {}
982 :
983 56 : int reg() { return reg_; }
984 16 : Relation op() { return op_; }
985 16 : int value() { return value_; }
986 :
987 : private:
988 : int reg_;
989 : Relation op_;
990 : int value_;
991 : };
992 :
993 : typedef InfallibleVector<Guard*, 1> GuardVector;
994 :
995 : class GuardedAlternative
996 : {
997 : public:
998 221 : explicit GuardedAlternative(RegExpNode* node)
999 221 : : node_(node), guards_(nullptr)
1000 221 : {}
1001 :
1002 : void AddGuard(LifoAlloc* alloc, Guard* guard);
1003 1642 : RegExpNode* node() const { return node_; }
1004 169 : void set_node(RegExpNode* node) { node_ = node; }
1005 740 : const GuardVector* guards() const { return guards_; }
1006 :
1007 : private:
1008 : RegExpNode* node_;
1009 : GuardVector* guards_;
1010 : };
1011 :
1012 : typedef InfallibleVector<GuardedAlternative, 0> GuardedAlternativeVector;
1013 :
1014 : class AlternativeGeneration;
1015 :
1016 0 : class ChoiceNode : public RegExpNode
1017 : {
1018 : public:
1019 86 : explicit ChoiceNode(LifoAlloc* alloc, int expected_size)
1020 86 : : RegExpNode(alloc),
1021 : alternatives_(*alloc),
1022 : table_(nullptr),
1023 : not_at_start_(false),
1024 86 : being_calculated_(false)
1025 : {
1026 86 : alternatives_.reserve(expected_size);
1027 86 : }
1028 :
1029 : virtual void Accept(NodeVisitor* visitor);
1030 221 : void AddAlternative(GuardedAlternative node) {
1031 221 : alternatives_.append(node);
1032 221 : }
1033 :
1034 2699 : GuardedAlternativeVector& alternatives() { return alternatives_; }
1035 : DispatchTable* GetTable(bool ignore_case);
1036 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1037 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
1038 : int EatsAtLeastHelper(int still_to_find,
1039 : int budget,
1040 : RegExpNode* ignore_this_node,
1041 : bool not_at_start);
1042 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1043 : RegExpCompiler* compiler,
1044 : int characters_filled_in,
1045 : bool not_at_start);
1046 : virtual bool FillInBMInfo(int offset,
1047 : int budget,
1048 : BoyerMooreLookahead* bm,
1049 : bool not_at_start);
1050 :
1051 : bool being_calculated() { return being_calculated_; }
1052 119 : bool not_at_start() { return not_at_start_; }
1053 22 : void set_not_at_start() { not_at_start_ = true; }
1054 : void set_being_calculated(bool b) { being_calculated_ = b; }
1055 204 : virtual bool try_to_emit_quick_check_for_alternative(int i) { return true; }
1056 : virtual RegExpNode* FilterLATIN1(int depth, bool ignore_case, bool unicode);
1057 :
1058 : protected:
1059 : int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
1060 : GuardedAlternativeVector alternatives_;
1061 :
1062 : private:
1063 : friend class Analysis;
1064 : void GenerateGuard(RegExpMacroAssembler* macro_assembler,
1065 : Guard* guard,
1066 : Trace* trace);
1067 : int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
1068 : void EmitOutOfLineContinuation(RegExpCompiler* compiler,
1069 : Trace* trace,
1070 : GuardedAlternative alternative,
1071 : AlternativeGeneration* alt_gen,
1072 : int preload_characters,
1073 : bool next_expects_preload);
1074 : DispatchTable* table_;
1075 :
1076 : // If true, this node is never checked at the start of the input.
1077 : // Allows a new trace to start with at_start() set to false.
1078 : bool not_at_start_;
1079 : bool being_calculated_;
1080 : };
1081 :
1082 0 : class NegativeLookaheadChoiceNode : public ChoiceNode
1083 : {
1084 : public:
1085 2 : explicit NegativeLookaheadChoiceNode(LifoAlloc* alloc,
1086 : GuardedAlternative this_must_fail,
1087 : GuardedAlternative then_do_this)
1088 2 : : ChoiceNode(alloc, 2)
1089 : {
1090 2 : AddAlternative(this_must_fail);
1091 2 : AddAlternative(then_do_this);
1092 2 : }
1093 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
1094 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1095 : RegExpCompiler* compiler,
1096 : int characters_filled_in,
1097 : bool not_at_start);
1098 : virtual bool FillInBMInfo(int offset,
1099 : int budget,
1100 : BoyerMooreLookahead* bm,
1101 : bool not_at_start);
1102 :
1103 : // For a negative lookahead we don't emit the quick check for the
1104 : // alternative that is expected to fail. This is because quick check code
1105 : // starts by loading enough characters for the alternative that takes fewest
1106 : // characters, but on a negative lookahead the negative branch did not take
1107 : // part in that calculation (EatsAtLeast) so the assumptions don't hold.
1108 4 : virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; }
1109 : virtual RegExpNode* FilterLATIN1(int depth, bool ignore_case, bool unicode);
1110 : };
1111 :
1112 0 : class LoopChoiceNode : public ChoiceNode
1113 : {
1114 : public:
1115 57 : explicit LoopChoiceNode(LifoAlloc* alloc, bool body_can_be_zero_length)
1116 57 : : ChoiceNode(alloc, 2),
1117 : loop_node_(nullptr),
1118 : continue_node_(nullptr),
1119 57 : body_can_be_zero_length_(body_can_be_zero_length)
1120 57 : {}
1121 :
1122 : void AddLoopAlternative(GuardedAlternative alt);
1123 : void AddContinueAlternative(GuardedAlternative alt);
1124 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1125 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
1126 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1127 : RegExpCompiler* compiler,
1128 : int characters_filled_in,
1129 : bool not_at_start);
1130 : virtual bool FillInBMInfo(int offset,
1131 : int budget,
1132 : BoyerMooreLookahead* bm,
1133 : bool not_at_start);
1134 224 : RegExpNode* loop_node() { return loop_node_; }
1135 : RegExpNode* continue_node() { return continue_node_; }
1136 : bool body_can_be_zero_length() { return body_can_be_zero_length_; }
1137 : virtual void Accept(NodeVisitor* visitor);
1138 : virtual RegExpNode* FilterLATIN1(int depth, bool ignore_case, bool unicode);
1139 :
1140 : private:
1141 : // AddAlternative is made private for loop nodes because alternatives
1142 : // should not be added freely, we need to keep track of which node
1143 : // goes back to the node itself.
1144 114 : void AddAlternative(GuardedAlternative node) {
1145 114 : ChoiceNode::AddAlternative(node);
1146 114 : }
1147 :
1148 : RegExpNode* loop_node_;
1149 : RegExpNode* continue_node_;
1150 : bool body_can_be_zero_length_;
1151 : };
1152 :
1153 : // Improve the speed that we scan for an initial point where a non-anchored
1154 : // regexp can match by using a Boyer-Moore-like table. This is done by
1155 : // identifying non-greedy non-capturing loops in the nodes that eat any
1156 : // character one at a time. For example in the middle of the regexp
1157 : // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
1158 : // inserted at the start of any non-anchored regexp.
1159 : //
1160 : // When we have found such a loop we look ahead in the nodes to find the set of
1161 : // characters that can come at given distances. For example for the regexp
1162 : // /.?foo/ we know that there are at least 3 characters ahead of us, and the
1163 : // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1164 : // the lookahead info where the set of characters is reasonably constrained. In
1165 : // our example this is from index 1 to 2 (0 is not constrained). We can now
1166 : // look 3 characters ahead and if we don't find one of [f, o] (the union of
1167 : // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1168 : //
1169 : // For Unicode input strings we do the same, but modulo 128.
1170 : //
1171 : // We also look at the first string fed to the regexp and use that to get a hint
1172 : // of the character frequencies in the inputs. This affects the assessment of
1173 : // whether the set of characters is 'reasonably constrained'.
1174 : //
1175 : // We also have another lookahead mechanism (called quick check in the code),
1176 : // which uses a wide load of multiple characters followed by a mask and compare
1177 : // to determine whether a match is possible at this point.
1178 : enum ContainedInLattice {
1179 : kNotYet = 0,
1180 : kLatticeIn = 1,
1181 : kLatticeOut = 2,
1182 : kLatticeUnknown = 3 // Can also mean both in and out.
1183 : };
1184 :
1185 : inline ContainedInLattice
1186 392 : Combine(ContainedInLattice a, ContainedInLattice b) {
1187 392 : return static_cast<ContainedInLattice>(a | b);
1188 : }
1189 :
1190 : ContainedInLattice
1191 : AddRange(ContainedInLattice a,
1192 : const int* ranges,
1193 : int ranges_size,
1194 : Interval new_range);
1195 :
1196 : class BoyerMoorePositionInfo
1197 : {
1198 : public:
1199 48 : explicit BoyerMoorePositionInfo(LifoAlloc* alloc, bool unicode_ignore_case)
1200 48 : : map_(*alloc),
1201 : map_count_(0),
1202 : w_(kNotYet),
1203 : s_(kNotYet),
1204 : d_(kNotYet),
1205 : surrogate_(kNotYet),
1206 48 : unicode_ignore_case_(unicode_ignore_case)
1207 : {
1208 48 : map_.reserve(kMapSize);
1209 6192 : for (int i = 0; i < kMapSize; i++)
1210 6144 : map_.append(false);
1211 48 : }
1212 :
1213 19435 : bool& at(int i) { return map_[i]; }
1214 :
1215 : static const int kMapSize = 128;
1216 : static const int kMask = kMapSize - 1;
1217 :
1218 213 : int map_count() const { return map_count_; }
1219 :
1220 : void Set(int character);
1221 : void SetInterval(const Interval& interval);
1222 : void SetAll();
1223 2 : bool is_non_word() { return w_ == kLatticeOut; }
1224 2 : bool is_word() { return w_ == kLatticeIn; }
1225 :
1226 : private:
1227 : InfallibleVector<bool, 0> map_;
1228 : int map_count_; // Number of set bits in the map.
1229 : ContainedInLattice w_; // The \w character class.
1230 : ContainedInLattice s_; // The \s character class.
1231 : ContainedInLattice d_; // The \d character class.
1232 : ContainedInLattice surrogate_; // Surrogate UTF-16 code units.
1233 :
1234 : // True if the RegExp has unicode and ignoreCase flags.
1235 : bool unicode_ignore_case_;
1236 : };
1237 :
1238 : typedef InfallibleVector<BoyerMoorePositionInfo*, 1> BoyerMoorePositionInfoVector;
1239 :
1240 : class BoyerMooreLookahead
1241 : {
1242 : public:
1243 : BoyerMooreLookahead(LifoAlloc* alloc, size_t length, RegExpCompiler* compiler);
1244 :
1245 169 : int length() { return length_; }
1246 68 : int max_char() { return max_char_; }
1247 153 : RegExpCompiler* compiler() { return compiler_; }
1248 :
1249 192 : int Count(int map_number) {
1250 192 : return bitmaps_[map_number]->map_count();
1251 : }
1252 :
1253 4 : BoyerMoorePositionInfo* at(int i) { return bitmaps_[i]; }
1254 :
1255 66 : void Set(int map_number, int character) {
1256 66 : if (character > max_char_) return;
1257 66 : BoyerMoorePositionInfo* info = bitmaps_[map_number];
1258 66 : info->Set(character);
1259 : }
1260 :
1261 35 : void SetInterval(int map_number, const Interval& interval) {
1262 35 : if (interval.from() > max_char_) return;
1263 35 : BoyerMoorePositionInfo* info = bitmaps_[map_number];
1264 35 : if (interval.to() > max_char_) {
1265 0 : info->SetInterval(Interval(interval.from(), max_char_));
1266 : } else {
1267 35 : info->SetInterval(interval);
1268 : }
1269 : }
1270 :
1271 2 : void SetAll(int map_number) {
1272 2 : bitmaps_[map_number]->SetAll();
1273 2 : }
1274 :
1275 0 : void SetRest(int from_map) {
1276 0 : for (int i = from_map; i < length_; i++) SetAll(i);
1277 0 : }
1278 : bool EmitSkipInstructions(RegExpMacroAssembler* masm);
1279 :
1280 : bool CheckOverRecursed();
1281 :
1282 : private:
1283 : // This is the value obtained by EatsAtLeast. If we do not have at least this
1284 : // many characters left in the sample string then the match is bound to fail.
1285 : // Therefore it is OK to read a character this far ahead of the current match
1286 : // point.
1287 : int length_;
1288 : RegExpCompiler* compiler_;
1289 :
1290 : // 0xff for LATIN1, 0xffff for UTF-16.
1291 : int max_char_;
1292 : BoyerMoorePositionInfoVector bitmaps_;
1293 :
1294 : int GetSkipTable(int min_lookahead,
1295 : int max_lookahead,
1296 : uint8_t* boolean_skip_table);
1297 : bool FindWorthwhileInterval(int* from, int* to);
1298 : int FindBestInterval(int max_number_of_chars, int old_biggest_points, int* from, int* to);
1299 : };
1300 :
1301 : // There are many ways to generate code for a node. This class encapsulates
1302 : // the current way we should be generating. In other words it encapsulates
1303 : // the current state of the code generator. The effect of this is that we
1304 : // generate code for paths that the matcher can take through the regular
1305 : // expression. A given node in the regexp can be code-generated several times
1306 : // as it can be part of several traces. For example for the regexp:
1307 : // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1308 : // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
1309 : // to match foo is generated only once (the traces have a common prefix). The
1310 : // code to store the capture is deferred and generated (twice) after the places
1311 : // where baz has been matched.
1312 : class Trace
1313 : {
1314 : public:
1315 : // A value for a property that is either known to be true, know to be false,
1316 : // or not known.
1317 : enum TriBool {
1318 : UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1319 : };
1320 :
1321 : class DeferredAction {
1322 : public:
1323 194 : DeferredAction(ActionNode::ActionType action_type, int reg)
1324 194 : : action_type_(action_type), reg_(reg), next_(nullptr)
1325 194 : {}
1326 :
1327 862 : DeferredAction* next() { return next_; }
1328 : bool Mentions(int reg);
1329 1265 : int reg() { return reg_; }
1330 1147 : ActionNode::ActionType action_type() { return action_type_; }
1331 : private:
1332 : ActionNode::ActionType action_type_;
1333 : int reg_;
1334 : DeferredAction* next_;
1335 : friend class Trace;
1336 : };
1337 :
1338 : class DeferredCapture : public DeferredAction {
1339 : public:
1340 164 : DeferredCapture(int reg, bool is_capture, Trace* trace)
1341 164 : : DeferredAction(ActionNode::STORE_POSITION, reg),
1342 164 : cp_offset_(trace->cp_offset()),
1343 328 : is_capture_(is_capture)
1344 164 : {}
1345 :
1346 241 : int cp_offset() { return cp_offset_; }
1347 79 : bool is_capture() { return is_capture_; }
1348 : private:
1349 : int cp_offset_;
1350 : bool is_capture_;
1351 : void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
1352 : };
1353 :
1354 : class DeferredSetRegister : public DeferredAction {
1355 : public:
1356 11 : DeferredSetRegister(int reg, int value)
1357 11 : : DeferredAction(ActionNode::SET_REGISTER, reg),
1358 11 : value_(value)
1359 11 : {}
1360 11 : int value() { return value_; }
1361 : private:
1362 : int value_;
1363 : };
1364 :
1365 : class DeferredClearCaptures : public DeferredAction {
1366 : public:
1367 6 : explicit DeferredClearCaptures(Interval range)
1368 6 : : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
1369 6 : range_(range)
1370 6 : {}
1371 :
1372 32 : Interval range() { return range_; }
1373 : private:
1374 : Interval range_;
1375 : };
1376 :
1377 : class DeferredIncrementRegister : public DeferredAction {
1378 : public:
1379 13 : explicit DeferredIncrementRegister(int reg)
1380 13 : : DeferredAction(ActionNode::INCREMENT_REGISTER, reg)
1381 13 : {}
1382 : };
1383 :
1384 355 : Trace()
1385 355 : : cp_offset_(0),
1386 : actions_(nullptr),
1387 : backtrack_(nullptr),
1388 : stop_node_(nullptr),
1389 : loop_label_(nullptr),
1390 : characters_preloaded_(0),
1391 : bound_checked_up_to_(0),
1392 : flush_budget_(100),
1393 355 : at_start_(UNKNOWN)
1394 355 : {}
1395 :
1396 : // End the trace. This involves flushing the deferred actions in the trace
1397 : // and pushing a backtrack location onto the backtrack stack. Once this is
1398 : // done we can start a new trace or go to one that has already been
1399 : // generated.
1400 : void Flush(RegExpCompiler* compiler, RegExpNode* successor);
1401 :
1402 1586 : int cp_offset() { return cp_offset_; }
1403 249 : DeferredAction* actions() { return actions_; }
1404 :
1405 : // A trivial trace is one that has no deferred actions or other state that
1406 : // affects the assumptions used when generating code. There is no recorded
1407 : // backtrack location in a trivial trace, so with a trivial trace we will
1408 : // generate code that, on a failure to match, gets the backtrack location
1409 : // from the backtrack stack rather than using a direct jump instruction. We
1410 : // always start code generation with a trivial trace and non-trivial traces
1411 : // are created as we emit code for nodes or add to the list of deferred
1412 : // actions in the trace. The location of the code generated for a node using
1413 : // a trivial trace is recorded in a label in the node so that gotos can be
1414 : // generated to that code.
1415 1215 : bool is_trivial() {
1416 1862 : return backtrack_ == nullptr &&
1417 1155 : actions_ == nullptr &&
1418 942 : cp_offset_ == 0 &&
1419 859 : characters_preloaded_ == 0 &&
1420 835 : bound_checked_up_to_ == 0 &&
1421 2035 : quick_check_performed_.characters() == 0 &&
1422 1625 : at_start_ == UNKNOWN;
1423 : }
1424 :
1425 270 : TriBool at_start() { return at_start_; }
1426 319 : void set_at_start(bool at_start) {
1427 319 : at_start_ = at_start ? TRUE_VALUE : FALSE_VALUE;
1428 319 : }
1429 1474 : jit::Label* backtrack() { return backtrack_; }
1430 26 : jit::Label* loop_label() { return loop_label_; }
1431 1060 : RegExpNode* stop_node() { return stop_node_; }
1432 425 : int characters_preloaded() { return characters_preloaded_; }
1433 232 : int bound_checked_up_to() { return bound_checked_up_to_; }
1434 186 : int flush_budget() { return flush_budget_; }
1435 993 : QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
1436 : bool mentions_reg(int reg);
1437 :
1438 : // Returns true if a deferred position store exists to the specified
1439 : // register and stores the offset in the out-parameter. Otherwise
1440 : // returns false.
1441 : bool GetStoredPosition(int reg, int* cp_offset);
1442 :
1443 : // These set methods and AdvanceCurrentPositionInTrace should be used only on
1444 : // new traces - the intention is that traces are immutable after creation.
1445 194 : void add_action(DeferredAction* new_action) {
1446 194 : MOZ_ASSERT(new_action->next_ == nullptr);
1447 194 : new_action->next_ = actions_;
1448 194 : actions_ = new_action;
1449 194 : }
1450 :
1451 234 : void set_backtrack(jit::Label* backtrack) { backtrack_ = backtrack; }
1452 26 : void set_stop_node(RegExpNode* node) { stop_node_ = node; }
1453 26 : void set_loop_label(jit::Label* label) { loop_label_ = label; }
1454 318 : void set_characters_preloaded(int count) { characters_preloaded_ = count; }
1455 186 : void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
1456 101 : void set_flush_budget(int to) { flush_budget_ = to; }
1457 110 : void set_quick_check_performed(QuickCheckDetails* d) {
1458 110 : quick_check_performed_ = *d;
1459 110 : }
1460 : void InvalidateCurrentCharacter();
1461 : void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1462 :
1463 : private:
1464 : int FindAffectedRegisters(LifoAlloc* alloc, OutSet* affected_registers);
1465 : void PerformDeferredActions(LifoAlloc* alloc,
1466 : RegExpMacroAssembler* macro,
1467 : int max_register,
1468 : OutSet& affected_registers,
1469 : OutSet* registers_to_pop,
1470 : OutSet* registers_to_clear);
1471 : void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1472 : int max_register,
1473 : OutSet& registers_to_pop,
1474 : OutSet& registers_to_clear);
1475 : int cp_offset_;
1476 : DeferredAction* actions_;
1477 : jit::Label* backtrack_;
1478 : RegExpNode* stop_node_;
1479 : jit::Label* loop_label_;
1480 : int characters_preloaded_;
1481 : int bound_checked_up_to_;
1482 : QuickCheckDetails quick_check_performed_;
1483 : int flush_budget_;
1484 : TriBool at_start_;
1485 : };
1486 :
1487 40 : class NodeVisitor
1488 : {
1489 : public:
1490 40 : virtual ~NodeVisitor() { }
1491 : #define DECLARE_VISIT(Type) \
1492 : virtual void Visit##Type(Type##Node* that) = 0;
1493 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1494 : #undef DECLARE_VISIT
1495 0 : virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1496 : };
1497 :
1498 : // Assertion propagation moves information about assertions such as
1499 : // \b to the affected nodes. For instance, in /.\b./ information must
1500 : // be propagated to the first '.' that whatever follows needs to know
1501 : // if it matched a word or a non-word, and to the second '.' that it
1502 : // has to check if it succeeds a word or non-word. In this case the
1503 : // result will be something like:
1504 : //
1505 : // +-------+ +------------+
1506 : // | . | | . |
1507 : // +-------+ ---> +------------+
1508 : // | word? | | check word |
1509 : // +-------+ +------------+
1510 40 : class Analysis : public NodeVisitor
1511 : {
1512 : public:
1513 40 : Analysis(JSContext* cx, bool ignore_case, bool is_latin1, bool unicode)
1514 40 : : cx(cx),
1515 : ignore_case_(ignore_case),
1516 : is_latin1_(is_latin1),
1517 : unicode_(unicode),
1518 40 : error_message_(nullptr)
1519 40 : {}
1520 :
1521 : void EnsureAnalyzed(RegExpNode* node);
1522 :
1523 : #define DECLARE_VISIT(Type) \
1524 : virtual void Visit##Type(Type##Node* that);
1525 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1526 : #undef DECLARE_VISIT
1527 : virtual void VisitLoopChoice(LoopChoiceNode* that);
1528 :
1529 643 : bool has_failed() { return error_message_ != nullptr; }
1530 0 : const char* errorMessage() {
1531 0 : MOZ_ASSERT(error_message_ != nullptr);
1532 0 : return error_message_;
1533 : }
1534 0 : void failASCII(const char* error_message) {
1535 0 : error_message_ = error_message;
1536 0 : }
1537 :
1538 : private:
1539 : JSContext* cx;
1540 : bool ignore_case_;
1541 : bool is_latin1_;
1542 : bool unicode_;
1543 : const char* error_message_;
1544 :
1545 : Analysis(Analysis&) = delete;
1546 : void operator=(Analysis&) = delete;
1547 : };
1548 :
1549 : void
1550 : AddClassNegated(const int* elmv, int elmc, CharacterRangeVector* ranges);
1551 :
1552 : } } // namespace js::irregexp
1553 :
1554 : #endif // V8_JSREGEXP_H_
|