Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * vim: set ts=8 sts=4 et sw=4 tw=99: */
3 :
4 : // Copyright 2012 the V8 project authors. All rights reserved.
5 : // Redistribution and use in source and binary forms, with or without
6 : // modification, are permitted provided that the following conditions are
7 : // met:
8 : //
9 : // * Redistributions of source code must retain the above copyright
10 : // notice, this list of conditions and the following disclaimer.
11 : // * Redistributions in binary form must reproduce the above
12 : // copyright notice, this list of conditions and the following
13 : // disclaimer in the documentation and/or other materials provided
14 : // with the distribution.
15 : // * Neither the name of Google Inc. nor the names of its
16 : // contributors may be used to endorse or promote products derived
17 : // from this software without specific prior written permission.
18 : //
19 : // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 : // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 : // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 : // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 : // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 : // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 : // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 : // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 : // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 : // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 : // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 :
31 : #ifndef V8_PARSER_H_
32 : #define V8_PARSER_H_
33 :
34 : #include "mozilla/Range.h"
35 :
36 : #include <stdarg.h>
37 :
38 : #include "irregexp/RegExpAST.h"
39 :
40 : namespace js {
41 :
42 : namespace frontend {
43 : class TokenStream;
44 : }
45 :
46 : namespace irregexp {
47 :
48 : bool
49 : ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
50 : bool multiline, bool match_only, bool unicode, bool ignore_case,
51 : bool global, bool sticky, RegExpCompileData* data);
52 :
53 : bool
54 : ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
55 : bool unicode);
56 :
57 : bool
58 : ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc,
59 : const mozilla::Range<const char16_t> chars, bool unicode);
60 :
61 : // A BufferedVector is an automatically growing list, just like (and backed
62 : // by) a Vector, that is optimized for the case of adding and removing
63 : // a single element. The last element added is stored outside the backing list,
64 : // and if no more than one element is ever added, the ZoneList isn't even
65 : // allocated.
66 : // Elements must not be nullptr pointers.
67 : template <typename T, int initial_size>
68 : class BufferedVector
69 : {
70 : public:
71 : typedef InfallibleVector<T*, 1> VectorType;
72 :
73 3363 : BufferedVector() : list_(nullptr), last_(nullptr) {}
74 :
75 : // Adds element at end of list. This element is buffered and can
76 : // be read using last() or removed using RemoveLast until a new Add or until
77 : // RemoveLast or GetList has been called.
78 6263 : void Add(LifoAlloc* alloc, T* value) {
79 6263 : if (last_ != nullptr) {
80 1919 : if (list_ == nullptr) {
81 950 : list_ = alloc->newInfallible<VectorType>(*alloc);
82 950 : list_->reserve(initial_size);
83 : }
84 1919 : list_->append(last_);
85 : }
86 6263 : last_ = value;
87 6263 : }
88 :
89 3230 : T* last() {
90 3230 : MOZ_ASSERT(last_ != nullptr);
91 3230 : return last_;
92 : }
93 :
94 591 : T* RemoveLast() {
95 591 : MOZ_ASSERT(last_ != nullptr);
96 591 : T* result = last_;
97 591 : if ((list_ != nullptr) && (list_->length() > 0))
98 238 : last_ = list_->popCopy();
99 : else
100 353 : last_ = nullptr;
101 591 : return result;
102 : }
103 :
104 75 : T* Get(int i) {
105 75 : MOZ_ASSERT((0 <= i) && (i < length()));
106 75 : if (list_ == nullptr) {
107 0 : MOZ_ASSERT(0 == i);
108 0 : return last_;
109 : } else {
110 75 : if (size_t(i) == list_->length()) {
111 36 : MOZ_ASSERT(last_ != nullptr);
112 36 : return last_;
113 : } else {
114 39 : return (*list_)[i];
115 : }
116 : }
117 : }
118 :
119 2872 : void Clear() {
120 2872 : list_ = nullptr;
121 2872 : last_ = nullptr;
122 2872 : }
123 :
124 6392 : int length() {
125 6392 : int length = (list_ == nullptr) ? 0 : list_->length();
126 6392 : return length + ((last_ == nullptr) ? 0 : 1);
127 : }
128 :
129 725 : VectorType* GetList(LifoAlloc* alloc) {
130 725 : if (list_ == nullptr)
131 0 : list_ = alloc->newInfallible<VectorType>(*alloc);
132 725 : if (last_ != nullptr) {
133 725 : list_->append(last_);
134 725 : last_ = nullptr;
135 : }
136 725 : return list_;
137 : }
138 :
139 : private:
140 : VectorType* list_;
141 : T* last_;
142 : };
143 :
144 :
145 : // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
146 : class RegExpBuilder
147 : {
148 : public:
149 : explicit RegExpBuilder(LifoAlloc* alloc);
150 : void AddCharacter(char16_t character);
151 : // "Adds" an empty expression. Does nothing except consume a
152 : // following quantifier
153 : void AddEmpty();
154 : void AddAtom(RegExpTree* tree);
155 : void AddAssertion(RegExpTree* tree);
156 : void NewAlternative(); // '|'
157 : void AddQuantifierToAtom(int min, int max, RegExpQuantifier::QuantifierType type);
158 : RegExpTree* ToRegExp();
159 :
160 : private:
161 : void FlushCharacters();
162 : void FlushText();
163 : void FlushTerms();
164 :
165 : LifoAlloc* alloc;
166 : bool pending_empty_;
167 : CharacterVector* characters_;
168 : BufferedVector<RegExpTree, 2> terms_;
169 : BufferedVector<RegExpTree, 2> text_;
170 : BufferedVector<RegExpTree, 2> alternatives_;
171 :
172 : enum LastAdded {
173 : ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM
174 : };
175 : #ifdef DEBUG
176 : LastAdded last_added_;
177 : #endif
178 : };
179 :
180 : // Characters parsed by RegExpParser can be either char16_t or kEndMarker.
181 : typedef uint32_t widechar;
182 :
183 : template <typename CharT>
184 : class RegExpParser
185 : {
186 : public:
187 : RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
188 : const CharT* chars, const CharT* end, bool multiline_mode, bool unicode,
189 : bool ignore_case);
190 :
191 : RegExpTree* ParsePattern();
192 : RegExpTree* ParseDisjunction();
193 : RegExpTree* ParseCharacterClass();
194 :
195 : // Parses a {...,...} quantifier and stores the range in the given
196 : // out parameters.
197 : bool ParseIntervalQuantifier(int* min_out, int* max_out);
198 :
199 : // Tries to parse the input as a single escaped character. If successful
200 : // it stores the result in the output parameter and returns true.
201 : // Otherwise it throws an error and returns false. The character must not
202 : // be 'b' or 'B' since they are usually handled specially.
203 : bool ParseClassCharacterEscape(widechar* code);
204 :
205 : // Checks whether the following is a length-digit hexadecimal number,
206 : // and sets the value if it is.
207 : bool ParseHexEscape(int length, widechar* value);
208 :
209 : bool ParseBracedHexEscape(widechar* value);
210 : bool ParseTrailSurrogate(widechar* value);
211 : bool ParseRawSurrogatePair(char16_t* lead, char16_t* trail);
212 :
213 : widechar ParseOctalLiteral();
214 :
215 : // Tries to parse the input as a back reference. If successful it
216 : // stores the result in the output parameter and returns true. If
217 : // it fails it will push back the characters read so the same characters
218 : // can be reparsed.
219 : bool ParseBackReferenceIndex(int* index_out);
220 :
221 : bool ParseClassAtom(char16_t* char_class, widechar *value);
222 :
223 : private:
224 : void SyntaxError(unsigned errorNumber, ...);
225 :
226 : public:
227 : RegExpTree* ReportError(unsigned errorNumber, const char* param = nullptr);
228 :
229 : void Advance();
230 727 : void Advance(int dist) {
231 727 : next_pos_ += dist - 1;
232 727 : Advance();
233 727 : }
234 :
235 0 : void Reset(const CharT* pos) {
236 0 : next_pos_ = pos;
237 0 : has_more_ = (pos < end_);
238 0 : Advance();
239 0 : }
240 :
241 : // Reports whether the pattern might be used as a literal search string.
242 : // Only use if the result of the parse is a single atom node.
243 40 : bool simple() { return simple_; }
244 40 : bool contains_anchor() { return contains_anchor_; }
245 259 : void set_contains_anchor() { contains_anchor_ = true; }
246 1039 : int captures_started() { return captures_ == nullptr ? 0 : captures_->length(); }
247 261 : const CharT* position() { return next_pos_ - 1; }
248 :
249 : static const int kMaxCaptures = 1 << 16;
250 : static const widechar kEndMarker = (1 << 21);
251 :
252 : private:
253 : enum SubexpressionType {
254 : INITIAL,
255 : CAPTURE, // All positive values represent captures.
256 : POSITIVE_LOOKAHEAD,
257 : NEGATIVE_LOOKAHEAD,
258 : GROUPING
259 : };
260 :
261 : class RegExpParserState {
262 : public:
263 1121 : RegExpParserState(LifoAlloc* alloc,
264 : RegExpParserState* previous_state,
265 : SubexpressionType group_type,
266 : int disjunction_capture_index)
267 : : previous_state_(previous_state),
268 1121 : builder_(alloc->newInfallible<RegExpBuilder>(alloc)),
269 : group_type_(group_type),
270 2242 : disjunction_capture_index_(disjunction_capture_index)
271 1121 : {}
272 : // Parser state of containing expression, if any.
273 364 : RegExpParserState* previous_state() { return previous_state_; }
274 1121 : bool IsSubexpression() { return previous_state_ != nullptr; }
275 : // RegExpBuilder building this regexp's AST.
276 1485 : RegExpBuilder* builder() { return builder_; }
277 : // Type of regexp being parsed (parenthesized group or entire regexp).
278 1485 : SubexpressionType group_type() { return group_type_; }
279 : // Index in captures array of first capture in this sub-expression, if any.
280 : // Also the capture index of this sub-expression itself, if group_type
281 : // is CAPTURE.
282 364 : int capture_index() { return disjunction_capture_index_; }
283 :
284 : private:
285 : // Linked list implementation of stack of states.
286 : RegExpParserState* previous_state_;
287 : // Builder for the stored disjunction.
288 : RegExpBuilder* builder_;
289 : // Stored disjunction type (capture, look-ahead or grouping), if any.
290 : SubexpressionType group_type_;
291 : // Stored disjunction's capture index (if any).
292 : int disjunction_capture_index_;
293 : };
294 :
295 27481 : widechar current() { return current_; }
296 2452 : bool has_more() { return has_more_; }
297 2379 : bool has_next() { return next_pos_ < end_; }
298 2017 : widechar Next() {
299 2017 : if (has_next())
300 2017 : return *next_pos_;
301 0 : return kEndMarker;
302 : }
303 : void ScanForCaptures();
304 :
305 : frontend::TokenStream& ts;
306 : LifoAlloc* alloc;
307 : RegExpCaptureVector* captures_;
308 : const CharT* const start_;
309 : const CharT* next_pos_;
310 : const CharT* end_;
311 : widechar current_;
312 : // The capture count is only valid after we have scanned for captures.
313 : int capture_count_;
314 : bool has_more_;
315 : bool multiline_;
316 : bool unicode_;
317 : bool ignore_case_;
318 : bool simple_;
319 : bool contains_anchor_;
320 : bool is_scanned_for_captures_;
321 : };
322 :
323 : } } // namespace js::irregexp
324 :
325 : #endif // V8_PARSER_H_
|