Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * vim: set ts=8 sts=4 et sw=4 tw=99: */
3 :
4 : // Copyright 2012 the V8 project authors. All rights reserved.
5 : // Redistribution and use in source and binary forms, with or without
6 : // modification, are permitted provided that the following conditions are
7 : // met:
8 : //
9 : // * Redistributions of source code must retain the above copyright
10 : // notice, this list of conditions and the following disclaimer.
11 : // * Redistributions in binary form must reproduce the above
12 : // copyright notice, this list of conditions and the following
13 : // disclaimer in the documentation and/or other materials provided
14 : // with the distribution.
15 : // * Neither the name of Google Inc. nor the names of its
16 : // contributors may be used to endorse or promote products derived
17 : // from this software without specific prior written permission.
18 : //
19 : // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 : // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 : // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 : // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 : // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 : // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 : // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 : // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 : // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 : // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 : // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 :
31 : #ifndef V8_REGEXP_AST_H_
32 : #define V8_REGEXP_AST_H_
33 :
34 : // Prevent msvc build failures as indicated in bug 1205328
35 : #ifdef min
36 : # undef min
37 : #endif
38 : #ifdef max
39 : # undef max
40 : #endif
41 :
42 : #include "irregexp/RegExpEngine.h"
43 :
44 : namespace js {
45 : namespace irregexp {
46 :
47 : class RegExpCompiler;
48 : class RegExpNode;
49 :
50 : class RegExpVisitor
51 : {
52 : public:
53 : virtual ~RegExpVisitor() { }
54 : #define MAKE_CASE(Name) \
55 : virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
56 : FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
57 : #undef MAKE_CASE
58 : };
59 :
60 4167 : class RegExpTree
61 : {
62 : public:
63 : static const int kInfinity = INT32_MAX;
64 0 : virtual ~RegExpTree() {}
65 : virtual RegExpNode* ToNode(RegExpCompiler* compiler,
66 : RegExpNode* on_success) = 0;
67 362 : virtual bool IsTextElement() { return false; }
68 19 : virtual bool IsAnchoredAtStart() { return false; }
69 34 : virtual bool IsAnchoredAtEnd() { return false; }
70 : virtual int min_match() = 0;
71 : virtual int max_match() = 0;
72 : // Returns the interval of registers used for captures within this
73 : // expression.
74 92 : virtual Interval CaptureRegisters() { return Interval::Empty(); }
75 0 : virtual void AppendToText(RegExpText* text) {
76 0 : MOZ_CRASH("Bad call");
77 : }
78 : #define MAKE_ASTYPE(Name) \
79 : virtual RegExp##Name* As##Name(); \
80 : virtual bool Is##Name();
81 : FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
82 : #undef MAKE_ASTYPE
83 : };
84 :
85 : typedef InfallibleVector<RegExpTree*, 1> RegExpTreeVector;
86 :
87 0 : class RegExpDisjunction : public RegExpTree
88 : {
89 : public:
90 : explicit RegExpDisjunction(RegExpTreeVector* alternatives);
91 : virtual void* Accept(RegExpVisitor* visitor, void* data);
92 : virtual RegExpNode* ToNode(RegExpCompiler* compiler,
93 : RegExpNode* on_success);
94 : virtual RegExpDisjunction* AsDisjunction();
95 : virtual Interval CaptureRegisters();
96 : virtual bool IsDisjunction();
97 : virtual bool IsAnchoredAtStart();
98 : virtual bool IsAnchoredAtEnd();
99 110 : virtual int min_match() { return min_match_; }
100 140 : virtual int max_match() { return max_match_; }
101 :
102 34 : const RegExpTreeVector& alternatives() { return *alternatives_; }
103 :
104 : private:
105 : RegExpTreeVector* alternatives_;
106 : int min_match_;
107 : int max_match_;
108 : };
109 :
110 0 : class RegExpAlternative : public RegExpTree
111 : {
112 : public:
113 : explicit RegExpAlternative(RegExpTreeVector* nodes);
114 : virtual void* Accept(RegExpVisitor* visitor, void* data);
115 : virtual RegExpNode* ToNode(RegExpCompiler* compiler,
116 : RegExpNode* on_success);
117 : virtual RegExpAlternative* AsAlternative();
118 : virtual Interval CaptureRegisters();
119 : virtual bool IsAlternative();
120 : virtual bool IsAnchoredAtStart();
121 : virtual bool IsAnchoredAtEnd();
122 158 : virtual int min_match() { return min_match_; }
123 234 : virtual int max_match() { return max_match_; }
124 :
125 112 : const RegExpTreeVector& nodes() { return *nodes_; }
126 :
127 : private:
128 : RegExpTreeVector* nodes_;
129 : int min_match_;
130 : int max_match_;
131 : };
132 :
133 0 : class RegExpAssertion : public RegExpTree {
134 : public:
135 : enum AssertionType {
136 : START_OF_LINE,
137 : START_OF_INPUT,
138 : END_OF_LINE,
139 : END_OF_INPUT,
140 : BOUNDARY,
141 : NON_BOUNDARY,
142 : NOT_AFTER_LEAD_SURROGATE,
143 : NOT_IN_SURROGATE_PAIR
144 : };
145 489 : explicit RegExpAssertion(AssertionType type) : assertion_type_(type) { }
146 : virtual void* Accept(RegExpVisitor* visitor, void* data);
147 : virtual RegExpNode* ToNode(RegExpCompiler* compiler,
148 : RegExpNode* on_success);
149 : virtual RegExpAssertion* AsAssertion();
150 : virtual bool IsAssertion();
151 : virtual bool IsAnchoredAtStart();
152 : virtual bool IsAnchoredAtEnd();
153 487 : virtual int min_match() { return 0; }
154 488 : virtual int max_match() { return 0; }
155 75 : AssertionType assertion_type() { return assertion_type_; }
156 : private:
157 : AssertionType assertion_type_;
158 : };
159 :
160 : class CharacterSet
161 : {
162 : public:
163 19 : explicit CharacterSet(char16_t standard_set_type)
164 19 : : ranges_(nullptr),
165 19 : standard_set_type_(standard_set_type)
166 19 : {}
167 724 : explicit CharacterSet(CharacterRangeVector* ranges)
168 724 : : ranges_(ranges),
169 724 : standard_set_type_(0)
170 724 : {}
171 :
172 : CharacterRangeVector& ranges(LifoAlloc* alloc);
173 16 : char16_t standard_set_type() { return standard_set_type_; }
174 10 : void set_standard_set_type(char16_t special_set_type) {
175 10 : standard_set_type_ = special_set_type;
176 10 : }
177 81 : bool is_standard() { return standard_set_type_ != 0; }
178 : void Canonicalize();
179 :
180 : private:
181 : CharacterRangeVector* ranges_;
182 :
183 : // If non-zero, the value represents a standard set (e.g., all whitespace
184 : // characters) without having to expand the ranges.
185 : char16_t standard_set_type_;
186 : };
187 :
188 0 : class RegExpCharacterClass : public RegExpTree
189 : {
190 : public:
191 724 : RegExpCharacterClass(CharacterRangeVector* ranges, bool is_negated)
192 724 : : set_(ranges),
193 724 : is_negated_(is_negated)
194 724 : {}
195 :
196 19 : explicit RegExpCharacterClass(char16_t type)
197 19 : : set_(type),
198 19 : is_negated_(false)
199 19 : {}
200 :
201 : virtual void* Accept(RegExpVisitor* visitor, void* data);
202 : virtual RegExpNode* ToNode(RegExpCompiler* compiler,
203 : RegExpNode* on_success);
204 : virtual RegExpCharacterClass* AsCharacterClass();
205 : virtual bool IsCharacterClass();
206 724 : virtual bool IsTextElement() { return true; }
207 731 : virtual int min_match() { return 1; }
208 1148 : virtual int max_match() { return 1; }
209 : virtual void AppendToText(RegExpText* text);
210 :
211 : CharacterSet character_set() { return set_; }
212 :
213 : // TODO(lrn): Remove need for complex version if is_standard that
214 : // recognizes a mangled standard set and just do { return set_.is_special(); }
215 : bool is_standard(LifoAlloc* alloc);
216 :
217 : // Returns a value representing the standard character set if is_standard()
218 : // returns true.
219 : // Currently used values are:
220 : // s : unicode whitespace
221 : // S : unicode non-whitespace
222 : // w : ASCII word character (digit, letter, underscore)
223 : // W : non-ASCII word character
224 : // d : ASCII digit
225 : // D : non-ASCII digit
226 : // . : non-unicode non-newline
227 : // * : All characters
228 16 : char16_t standard_type() { return set_.standard_set_type(); }
229 :
230 286 : CharacterRangeVector& ranges(LifoAlloc* alloc) { return set_.ranges(alloc); }
231 295 : bool is_negated() { return is_negated_; }
232 :
233 : private:
234 : CharacterSet set_;
235 : bool is_negated_;
236 : };
237 :
238 : typedef InfallibleVector<char16_t, 10> CharacterVector;
239 :
240 0 : class RegExpAtom : public RegExpTree
241 : {
242 : public:
243 1223 : explicit RegExpAtom(CharacterVector* data)
244 1223 : : data_(data)
245 1223 : {}
246 :
247 : virtual void* Accept(RegExpVisitor* visitor, void* data);
248 : virtual RegExpNode* ToNode(RegExpCompiler* compiler,
249 : RegExpNode* on_success);
250 : virtual RegExpAtom* AsAtom();
251 : virtual bool IsAtom();
252 2 : virtual bool IsTextElement() { return true; }
253 1030 : virtual int min_match() { return data_->length(); }
254 1129 : virtual int max_match() { return data_->length(); }
255 : virtual void AppendToText(RegExpText* text);
256 :
257 745 : const CharacterVector& data() { return *data_; }
258 631 : int length() { return data_->length(); }
259 :
260 : private:
261 : CharacterVector* data_;
262 : };
263 :
264 0 : class RegExpText : public RegExpTree
265 : {
266 : public:
267 36 : explicit RegExpText(LifoAlloc* alloc)
268 36 : : elements_(*alloc), length_(0)
269 36 : {}
270 :
271 : virtual void* Accept(RegExpVisitor* visitor, void* data);
272 : virtual RegExpNode* ToNode(RegExpCompiler* compiler,
273 : RegExpNode* on_success);
274 : virtual RegExpText* AsText();
275 : virtual bool IsText();
276 0 : virtual bool IsTextElement() { return true; }
277 34 : virtual int min_match() { return length_; }
278 34 : virtual int max_match() { return length_; }
279 : virtual void AppendToText(RegExpText* text);
280 :
281 75 : void AddElement(TextElement elm) {
282 75 : elements_.append(elm);
283 75 : length_ += elm.length();
284 75 : }
285 0 : const TextElementVector& elements() { return elements_; }
286 :
287 : private:
288 : TextElementVector elements_;
289 : int length_;
290 : };
291 :
292 0 : class RegExpQuantifier : public RegExpTree
293 : {
294 : public:
295 : enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
296 652 : RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
297 652 : : body_(body),
298 : min_(min),
299 : max_(max),
300 652 : min_match_(min * body->min_match()),
301 1304 : quantifier_type_(type)
302 : {
303 652 : if (max > 0 && body->max_match() > kInfinity / max) {
304 13 : max_match_ = kInfinity;
305 : } else {
306 639 : max_match_ = max * body->max_match();
307 : }
308 652 : }
309 :
310 : virtual void* Accept(RegExpVisitor* visitor, void* data);
311 : virtual RegExpNode* ToNode(RegExpCompiler* compiler,
312 : RegExpNode* on_success);
313 : static RegExpNode* ToNode(int min,
314 : int max,
315 : bool is_greedy,
316 : RegExpTree* body,
317 : RegExpCompiler* compiler,
318 : RegExpNode* on_success,
319 : bool not_at_start = false);
320 : virtual RegExpQuantifier* AsQuantifier();
321 : virtual Interval CaptureRegisters();
322 : virtual bool IsQuantifier();
323 580 : virtual int min_match() { return min_match_; }
324 587 : virtual int max_match() { return max_match_; }
325 42 : int min() { return min_; }
326 42 : int max() { return max_; }
327 0 : bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
328 0 : bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
329 42 : bool is_greedy() { return quantifier_type_ == GREEDY; }
330 48 : RegExpTree* body() { return body_; }
331 :
332 : private:
333 : RegExpTree* body_;
334 : int min_;
335 : int max_;
336 : int min_match_;
337 : int max_match_;
338 : QuantifierType quantifier_type_;
339 : };
340 :
341 0 : class RegExpCapture : public RegExpTree
342 : {
343 : public:
344 271 : explicit RegExpCapture(RegExpTree* body, int index)
345 271 : : body_(body), index_(index)
346 271 : {}
347 :
348 : virtual void* Accept(RegExpVisitor* visitor, void* data);
349 : virtual RegExpNode* ToNode(RegExpCompiler* compiler,
350 : RegExpNode* on_success);
351 : static RegExpNode* ToNode(RegExpTree* body,
352 : int index,
353 : RegExpCompiler* compiler,
354 : RegExpNode* on_success);
355 : virtual RegExpCapture* AsCapture();
356 : virtual bool IsAnchoredAtStart();
357 : virtual bool IsAnchoredAtEnd();
358 : virtual Interval CaptureRegisters();
359 : virtual bool IsCapture();
360 269 : virtual int min_match() { return body_->min_match(); }
361 335 : virtual int max_match() { return body_->max_match(); }
362 36 : RegExpTree* body() { return body_; }
363 38 : int index() { return index_; }
364 71 : static int StartRegister(int index) { return index * 2; }
365 71 : static int EndRegister(int index) { return index * 2 + 1; }
366 :
367 : private:
368 : RegExpTree* body_;
369 : int index_;
370 : };
371 :
372 0 : class RegExpLookahead : public RegExpTree
373 : {
374 : public:
375 27 : RegExpLookahead(RegExpTree* body,
376 : bool is_positive,
377 : int capture_count,
378 : int capture_from)
379 27 : : body_(body),
380 : is_positive_(is_positive),
381 : capture_count_(capture_count),
382 27 : capture_from_(capture_from)
383 27 : {}
384 :
385 : virtual void* Accept(RegExpVisitor* visitor, void* data);
386 : virtual RegExpNode* ToNode(RegExpCompiler* compiler,
387 : RegExpNode* on_success);
388 : virtual RegExpLookahead* AsLookahead();
389 : virtual Interval CaptureRegisters();
390 : virtual bool IsLookahead();
391 : virtual bool IsAnchoredAtStart();
392 27 : virtual int min_match() { return 0; }
393 30 : virtual int max_match() { return 0; }
394 12 : RegExpTree* body() { return body_; }
395 12 : bool is_positive() { return is_positive_; }
396 : int capture_count() { return capture_count_; }
397 : int capture_from() { return capture_from_; }
398 :
399 : private:
400 : RegExpTree* body_;
401 : bool is_positive_;
402 : int capture_count_;
403 : int capture_from_;
404 : };
405 :
406 : typedef InfallibleVector<RegExpCapture*, 1> RegExpCaptureVector;
407 :
408 0 : class RegExpBackReference : public RegExpTree
409 : {
410 : public:
411 0 : explicit RegExpBackReference(RegExpCapture* capture)
412 0 : : capture_(capture)
413 0 : {}
414 :
415 : virtual void* Accept(RegExpVisitor* visitor, void* data);
416 : virtual RegExpNode* ToNode(RegExpCompiler* compiler,
417 : RegExpNode* on_success);
418 : virtual RegExpBackReference* AsBackReference();
419 : virtual bool IsBackReference();
420 0 : virtual int min_match() { return 0; }
421 0 : virtual int max_match() { return capture_->max_match(); }
422 0 : int index() { return capture_->index(); }
423 : RegExpCapture* capture() { return capture_; }
424 : private:
425 : RegExpCapture* capture_;
426 : };
427 :
428 0 : class RegExpEmpty : public RegExpTree
429 : {
430 : public:
431 1 : RegExpEmpty()
432 1 : {}
433 :
434 : virtual void* Accept(RegExpVisitor* visitor, void* data);
435 : virtual RegExpNode* ToNode(RegExpCompiler* compiler,
436 : RegExpNode* on_success);
437 : virtual RegExpEmpty* AsEmpty();
438 : virtual bool IsEmpty();
439 2 : virtual int min_match() { return 0; }
440 2 : virtual int max_match() { return 0; }
441 2 : static RegExpEmpty* GetInstance() {
442 2 : static RegExpEmpty instance;
443 2 : return &instance;
444 : }
445 : };
446 :
447 : } } // namespace js::irregexp
448 :
449 : #endif // V8_REGEXP_AST_H_
|