Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * vim: set ts=8 sts=4 et sw=4 tw=99:
3 : * This Source Code Form is subject to the terms of the Mozilla Public
4 : * License, v. 2.0. If a copy of the MPL was not distributed with this
5 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 :
7 : #ifndef frontend_TokenStream_h
8 : #define frontend_TokenStream_h
9 :
10 : // JS lexical scanner interface.
11 :
12 : #include "mozilla/ArrayUtils.h"
13 : #include "mozilla/Assertions.h"
14 : #include "mozilla/Attributes.h"
15 : #include "mozilla/DebugOnly.h"
16 : #include "mozilla/PodOperations.h"
17 : #include "mozilla/Unused.h"
18 :
19 : #include <stdarg.h>
20 : #include <stddef.h>
21 : #include <stdio.h>
22 :
23 : #include "jscntxt.h"
24 : #include "jspubtd.h"
25 :
26 : #include "frontend/TokenKind.h"
27 : #include "js/UniquePtr.h"
28 : #include "js/Vector.h"
29 : #include "vm/ErrorReporting.h"
30 : #include "vm/RegExpShared.h"
31 : #include "vm/String.h"
32 : #include "vm/Unicode.h"
33 :
34 : struct KeywordInfo;
35 :
36 : namespace js {
37 : namespace frontend {
38 :
39 : struct TokenPos {
40 : uint32_t begin; // Offset of the token's first char.
41 : uint32_t end; // Offset of 1 past the token's last char.
42 :
43 278543 : TokenPos() {}
44 91883 : TokenPos(uint32_t begin, uint32_t end) : begin(begin), end(end) {}
45 :
46 : // Return a TokenPos that covers left, right, and anything in between.
47 12 : static TokenPos box(const TokenPos& left, const TokenPos& right) {
48 12 : MOZ_ASSERT(left.begin <= left.end);
49 12 : MOZ_ASSERT(left.end <= right.begin);
50 12 : MOZ_ASSERT(right.begin <= right.end);
51 12 : return TokenPos(left.begin, right.end);
52 : }
53 :
54 : bool operator==(const TokenPos& bpos) const {
55 : return begin == bpos.begin && end == bpos.end;
56 : }
57 :
58 : bool operator!=(const TokenPos& bpos) const {
59 : return begin != bpos.begin || end != bpos.end;
60 : }
61 :
62 : bool operator <(const TokenPos& bpos) const {
63 : return begin < bpos.begin;
64 : }
65 :
66 : bool operator <=(const TokenPos& bpos) const {
67 : return begin <= bpos.begin;
68 : }
69 :
70 : bool operator >(const TokenPos& bpos) const {
71 : return !(*this <= bpos);
72 : }
73 :
74 : bool operator >=(const TokenPos& bpos) const {
75 : return !(*this < bpos);
76 : }
77 :
78 5675 : bool encloses(const TokenPos& pos) const {
79 5675 : return begin <= pos.begin && pos.end <= end;
80 : }
81 : };
82 :
// Records whether a numeric literal's text contained a '.'. The enumerators
// are defined from |false| and |true| respectively.
enum DecimalPoint { NoDecimal = false, HasDecimal = true };
84 :
// Classifies an invalid character escape. |None| means no invalid escape
// has been recorded; the remaining values identify what was wrong.
enum class InvalidEscapeType {
    // No invalid character escapes.
    None,
    // A malformed \x escape.
    Hexadecimal,
    // A malformed \u escape.
    Unicode,
    // An otherwise well-formed \u escape which represents a
    // codepoint > 10FFFF.
    UnicodeOverflow,
    // An octal escape in a template token.
    Octal
};
98 :
99 : class TokenStreamAnyChars;
100 :
// One scanned token: its kind (|type|), its source range (|pos|), and a
// payload in the union |u| whose active member depends on |type|. The
// payload is written and read through the asserting mutators/accessors below.
struct Token
{
  private:
    // Sometimes the parser needs to inform the tokenizer to interpret
    // subsequent text in a particular manner: for example, to tokenize a
    // keyword as an identifier, not as the actual keyword, on the right-hand
    // side of a dotted property access.  Such information is communicated to
    // the tokenizer as a Modifier when getting the next token.
    //
    // Ideally this definition would reside in TokenStream as that's the real
    // user, but the debugging-use of it here causes a cyclic dependency (and
    // C++ provides no way to forward-declare an enum inside a class).  So
    // define it here, then typedef it into TokenStream with static consts to
    // bring the initializers into scope.
    enum Modifier
    {
        // Normal operation.
        None,

        // Looking for an operand, not an operator.  In practice, this means
        // that when '/' is seen, we look for a regexp instead of just returning
        // TOK_DIV.
        Operand,

        // Treat subsequent characters as the tail of a template literal, after
        // a template substitution, beginning with a "}", continuing with zero
        // or more template literal characters, and ending with either "${" or
        // the end of the template literal.  For example:
        //
        //   var entity = "world";
        //   var s = `Hello ${entity}!`;
        //                          ^ TemplateTail context
        TemplateTail,
    };
    enum ModifierException
    {
        NoException,

        // Used in following 2 cases:
        // a) After |yield| we look for a token on the same line that starts an
        // expression (Operand): |yield <expr>|.  If no token is found, the
        // |yield| stands alone, and the next token on a subsequent line must
        // be: a comma continuing a comma expression, a semicolon terminating
        // the statement that ended with |yield|, or the start of another
        // statement (possibly an expression statement).  The comma/semicolon
        // cases are gotten as operators (None), contrasting with Operand
        // earlier.
        // b) After an arrow function with a block body in an expression
        // statement, the next token must be: a colon in a conditional
        // expression, a comma continuing a comma expression, a semicolon
        // terminating the statement, or the token on a subsequent line that is
        // the start of another statement (possibly an expression statement).
        // Colon is gotten as operator (None), and it should only be gotten in
        // conditional expression and missing it results in SyntaxError.
        // Comma/semicolon cases are also gotten as operators (None), and 4th
        // case is gotten after them.  If no comma/semicolon found but EOL,
        // the next token should be gotten as operand in 4th case (especially if
        // '/' is the first character).  So we should peek the token as
        // operand before try getting colon/comma/semicolon.
        // See also the comment in Parser::assignExpr().
        NoneIsOperand,

        // If a semicolon is inserted automatically, the next token is already
        // gotten with None, but we expect Operand.
        OperandIsNone,
    };
    // TokenStreamAnyChars re-exports the two private enums above and reads
    // the DEBUG-only modifier fields.
    friend class TokenStreamAnyChars;

  public:
    TokenKind           type;           // char value or above enumerator
    TokenPos            pos;            // token position in file
    union {
      private:
        // The union members are private; Token reaches them through this
        // friend declaration so that outside code must use the accessors.
        friend struct Token;
        PropertyName*   name;          // non-numeric atom
        JSAtom*         atom;          // potentially-numeric atom
        struct {
            double      value;          // floating point number
            DecimalPoint decimalPoint;  // literal contains '.'
        } number;
        RegExpFlag      reflags;        // regexp flags; use tokenbuf to access
                                        //   regexp chars
    } u;
#ifdef DEBUG
    Modifier modifier;                  // Modifier used to get this token
    ModifierException modifierException; // Exception for this modifier
#endif

    // Mutators
    //
    // Each mutator asserts that |type| already matches the payload being
    // stored, so |type| has to be set before the payload.

    // Store the name payload of a TOK_NAME token.
    void setName(PropertyName* name) {
        MOZ_ASSERT(type == TOK_NAME);
        u.name = name;
    }

    // Store the atom payload of a string or template token.
    void setAtom(JSAtom* atom) {
        MOZ_ASSERT(type == TOK_STRING ||
                   type == TOK_TEMPLATE_HEAD ||
                   type == TOK_NO_SUBS_TEMPLATE);
        u.atom = atom;
    }

    // Store the flags of a TOK_REGEXP token; asserts that no bits outside
    // AllFlags are set.
    void setRegExpFlags(RegExpFlag flags) {
        MOZ_ASSERT(type == TOK_REGEXP);
        MOZ_ASSERT((flags & AllFlags) == flags);
        u.reflags = flags;
    }

    // Store the value of a TOK_NUMBER token together with its
    // decimal-point marker.
    void setNumber(double n, DecimalPoint decimalPoint) {
        MOZ_ASSERT(type == TOK_NUMBER);
        u.number.value = n;
        u.number.decimalPoint = decimalPoint;
    }

    // Type-safe accessors
    //
    // Each accessor asserts that |type| matches the union member it reads.

    // Name payload of a TOK_NAME token.
    PropertyName* name() const {
        MOZ_ASSERT(type == TOK_NAME);
        return u.name->JSAtom::asPropertyName(); // poor-man's type verification
    }

    // Atom payload of a string or template token.
    JSAtom* atom() const {
        MOZ_ASSERT(type == TOK_STRING ||
                   type == TOK_TEMPLATE_HEAD ||
                   type == TOK_NO_SUBS_TEMPLATE);
        return u.atom;
    }

    // Flags of a TOK_REGEXP token.
    RegExpFlag regExpFlags() const {
        MOZ_ASSERT(type == TOK_REGEXP);
        MOZ_ASSERT((u.reflags & AllFlags) == u.reflags);
        return u.reflags;
    }

    // Numeric value of a TOK_NUMBER token.
    double number() const {
        MOZ_ASSERT(type == TOK_NUMBER);
        return u.number.value;
    }

    // Decimal-point marker of a TOK_NUMBER token.
    DecimalPoint decimalPoint() const {
        MOZ_ASSERT(type == TOK_NUMBER);
        return u.number.decimalPoint;
    }
};
245 :
246 : extern TokenKind
247 : ReservedWordTokenKind(PropertyName* str);
248 :
249 : extern const char*
250 : ReservedWordToCharZ(PropertyName* str);
251 :
252 : extern const char*
253 : ReservedWordToCharZ(TokenKind tt);
254 :
255 : // Ideally, tokenizing would be entirely independent of context. But the
256 : // strict mode flag, which is in SharedContext, affects tokenizing, and
257 : // TokenStream needs to see it.
258 : //
259 : // This class is a tiny back-channel from TokenStream to the strict mode flag
260 : // that avoids exposing the rest of SharedContext to TokenStream.
261 : //
class StrictModeGetter {
  public:
    // Supplied by implementers to report the strict mode flag.
    // TokenStreamAnyChars::strictMode() forwards to this whenever a getter
    // was provided.
    virtual bool strictMode() = 0;
};
266 :
267 2466 : class TokenStreamAnyChars
268 : {
269 : protected:
270 : TokenStreamAnyChars(JSContext* cx, const ReadOnlyCompileOptions& options, StrictModeGetter* smg);
271 :
272 : static const size_t ntokens = 4; // 1 current + 2 lookahead, rounded
273 : // to power of 2 to avoid divmod by 3
274 : static const unsigned maxLookahead = 2;
275 : static const unsigned ntokensMask = ntokens - 1;
276 :
277 : public:
278 : // Accessors.
279 3560868 : const Token& currentToken() const { return tokens[cursor]; }
280 578535 : bool isCurrentTokenType(TokenKind type) const {
281 578535 : return currentToken().type == type;
282 : }
283 :
284 13318 : const char* getFilename() const { return filename; }
285 : bool getMutedErrors() const { return mutedErrors; }
286 2071 : JSVersion versionNumber() const { return VersionNumber(options().version); }
287 : JSVersion versionWithFlags() const { return options().version; }
288 :
289 : MOZ_MUST_USE bool checkOptions();
290 :
291 : protected:
292 : PropertyName* reservedWordToPropertyName(TokenKind tt) const;
293 :
294 : public:
295 155694 : PropertyName* currentName() const {
296 155694 : if (isCurrentTokenType(TOK_NAME))
297 153095 : return currentToken().name();
298 :
299 2599 : MOZ_ASSERT(TokenKindIsPossibleIdentifierName(currentToken().type));
300 2599 : return reservedWordToPropertyName(currentToken().type);
301 : }
302 :
303 106016 : bool currentNameHasEscapes() const {
304 106016 : if (isCurrentTokenType(TOK_NAME)) {
305 105360 : TokenPos pos = currentToken().pos;
306 105360 : return (pos.end - pos.begin) != currentToken().name()->length();
307 : }
308 :
309 656 : MOZ_ASSERT(TokenKindIsPossibleIdentifierName(currentToken().type));
310 656 : return false;
311 : }
312 :
313 : PropertyName* nextName() const {
314 : if (nextToken().type != TOK_NAME)
315 : return nextToken().name();
316 :
317 : MOZ_ASSERT(TokenKindIsPossibleIdentifierName(nextToken().type));
318 : return reservedWordToPropertyName(nextToken().type);
319 : }
320 :
321 63920 : bool isCurrentTokenAssignment() const {
322 63920 : return TokenKindIsAssignment(currentToken().type);
323 : }
324 :
325 : // Flag methods.
326 1676 : bool isEOF() const { return flags.isEOF; }
327 109 : bool sawOctalEscape() const { return flags.sawOctalEscape; }
328 525 : bool hadError() const { return flags.hadError; }
329 7116 : void clearSawOctalEscape() { flags.sawOctalEscape = false; }
330 :
331 3 : bool hasInvalidTemplateEscape() const {
332 3 : return invalidTemplateEscapeType != InvalidEscapeType::None;
333 : }
334 0 : void clearInvalidTemplateEscape() {
335 0 : invalidTemplateEscapeType = InvalidEscapeType::None;
336 0 : }
337 :
338 : static const uint32_t NoOffset = UINT32_MAX;
339 :
340 : protected:
341 : // This is protected because it should only be called by the tokenizer
342 : // while tokenizing not by, for example, BytecodeEmitter.
343 0 : bool strictMode() const { return strictModeGetter && strictModeGetter->strictMode(); }
344 :
345 0 : void setInvalidTemplateEscape(uint32_t offset, InvalidEscapeType type) {
346 0 : MOZ_ASSERT(type != InvalidEscapeType::None);
347 0 : if (invalidTemplateEscapeType != InvalidEscapeType::None)
348 0 : return;
349 0 : invalidTemplateEscapeOffset = offset;
350 0 : invalidTemplateEscapeType = type;
351 : }
352 :
353 : uint32_t invalidTemplateEscapeOffset = 0;
354 : InvalidEscapeType invalidTemplateEscapeType = InvalidEscapeType::None;
355 :
356 : protected:
357 : struct Flags
358 : {
359 : bool isEOF:1; // Hit end of file.
360 : bool isDirtyLine:1; // Non-whitespace since start of line.
361 : bool sawOctalEscape:1; // Saw an octal character escape.
362 : bool hadError:1; // Hit a syntax error, at start or during a
363 : // token.
364 :
365 82458 : Flags()
366 82458 : : isEOF(), isDirtyLine(), sawOctalEscape(), hadError()
367 82458 : {}
368 : };
369 :
370 : public:
371 : typedef Token::Modifier Modifier;
372 : static constexpr Modifier None = Token::None;
373 : static constexpr Modifier Operand = Token::Operand;
374 : static constexpr Modifier TemplateTail = Token::TemplateTail;
375 :
376 : typedef Token::ModifierException ModifierException;
377 : static constexpr ModifierException NoException = Token::NoException;
378 : static constexpr ModifierException NoneIsOperand = Token::NoneIsOperand;
379 : static constexpr ModifierException OperandIsNone = Token::OperandIsNone;
380 :
381 2303 : void addModifierException(ModifierException modifierException) {
382 : #ifdef DEBUG
383 2303 : const Token& next = nextToken();
384 2303 : if (next.modifierException == NoneIsOperand)
385 : {
386 : // Token after yield expression without operand already has
387 : // NoneIsOperand exception.
388 18 : MOZ_ASSERT(modifierException == OperandIsNone);
389 18 : MOZ_ASSERT(next.type != TOK_DIV,
390 : "next token requires contextual specifier to be parsed unambiguously");
391 :
392 : // Do not update modifierException.
393 18 : return;
394 : }
395 :
396 2285 : MOZ_ASSERT(next.modifierException == NoException);
397 2285 : switch (modifierException) {
398 : case NoneIsOperand:
399 1507 : MOZ_ASSERT(next.modifier == Operand);
400 1507 : MOZ_ASSERT(next.type != TOK_DIV,
401 : "next token requires contextual specifier to be parsed unambiguously");
402 1507 : break;
403 : case OperandIsNone:
404 778 : MOZ_ASSERT(next.modifier == None);
405 778 : MOZ_ASSERT(next.type != TOK_DIV && next.type != TOK_REGEXP,
406 : "next token requires contextual specifier to be parsed unambiguously");
407 778 : break;
408 : default:
409 0 : MOZ_CRASH("unexpected modifier exception");
410 : }
411 2285 : tokens[(cursor + 1) & ntokensMask].modifierException = modifierException;
412 : #endif
413 : }
414 :
415 : void
416 836062 : verifyConsistentModifier(Modifier modifier, Token lookaheadToken) {
417 : #ifdef DEBUG
418 : // Easy case: modifiers match.
419 836062 : if (modifier == lookaheadToken.modifier)
420 831894 : return;
421 :
422 4168 : if (lookaheadToken.modifierException == OperandIsNone) {
423 : // getToken(Operand) permissibly following getToken().
424 1187 : if (modifier == Operand && lookaheadToken.modifier == None)
425 1187 : return;
426 : }
427 :
428 2981 : if (lookaheadToken.modifierException == NoneIsOperand) {
429 : // getToken() permissibly following getToken(Operand).
430 2981 : if (modifier == None && lookaheadToken.modifier == Operand)
431 2981 : return;
432 : }
433 :
434 0 : MOZ_ASSERT_UNREACHABLE("this token was previously looked up with a "
435 : "different modifier, potentially making "
436 : "tokenization non-deterministic");
437 : #endif
438 : }
439 :
440 : #ifdef DEBUG
441 64 : inline bool debugHasNoLookahead() const {
442 64 : return lookahead == 0;
443 : }
444 : #endif
445 :
446 284 : bool hasDisplayURL() const {
447 284 : return displayURL_ != nullptr;
448 : }
449 :
450 2 : char16_t* displayURL() {
451 2 : return displayURL_.get();
452 : }
453 :
454 284 : bool hasSourceMapURL() const {
455 284 : return sourceMapURL_ != nullptr;
456 : }
457 :
458 0 : char16_t* sourceMapURL() {
459 0 : return sourceMapURL_.get();
460 : }
461 :
462 : // This class maps a userbuf offset (which is 0-indexed) to a line number
463 : // (which is 1-indexed) and a column index (which is 0-indexed).
464 2466 : class SourceCoords
465 : {
466 : // For a given buffer holding source code, |lineStartOffsets_| has one
467 : // element per line of source code, plus one sentinel element. Each
468 : // non-sentinel element holds the buffer offset for the start of the
469 : // corresponding line of source code. For this example script:
470 : //
471 : // 1 // xyz [line starts at offset 0]
472 : // 2 var x; [line starts at offset 7]
473 : // 3 [line starts at offset 14]
474 : // 4 var y; [line starts at offset 15]
475 : //
476 : // |lineStartOffsets_| is:
477 : //
478 : // [0, 7, 14, 15, MAX_PTR]
479 : //
480 : // To convert a "line number" to a "line index" (i.e. an index into
481 : // |lineStartOffsets_|), subtract |initialLineNum_|. E.g. line 3's
482 : // line index is (3 - initialLineNum_), which is 2. Therefore
483 : // lineStartOffsets_[2] holds the buffer offset for the start of line 3,
484 : // which is 14. (Note that |initialLineNum_| is often 1, but not
485 : // always.)
486 : //
487 : // The first element is always 0, and the last element is always the
488 : // MAX_PTR sentinel.
489 : //
490 : // offset-to-line/column lookups are O(log n) in the worst case (binary
491 : // search), but in practice they're heavily clustered and we do better
492 : // than that by using the previous lookup's result (lastLineIndex_) as
493 : // a starting point.
494 : //
495 : // Checking if an offset lies within a particular line number
496 : // (isOnThisLine()) is O(1).
497 : //
498 : Vector<uint32_t, 128> lineStartOffsets_;
499 : uint32_t initialLineNum_;
500 :
501 : // This is mutable because it's modified on every search, but that fact
502 : // isn't visible outside this class.
503 : mutable uint32_t lastLineIndex_;
504 :
505 : uint32_t lineIndexOf(uint32_t offset) const;
506 :
507 : static const uint32_t MAX_PTR = UINT32_MAX;
508 :
509 107992 : uint32_t lineIndexToNum(uint32_t lineIndex) const { return lineIndex + initialLineNum_; }
510 564064 : uint32_t lineNumToIndex(uint32_t lineNum) const { return lineNum - initialLineNum_; }
511 :
512 : public:
513 : SourceCoords(JSContext* cx, uint32_t ln);
514 :
515 : MOZ_MUST_USE bool add(uint32_t lineNum, uint32_t lineStartOffset);
516 : MOZ_MUST_USE bool fill(const SourceCoords& other);
517 :
518 439935 : bool isOnThisLine(uint32_t offset, uint32_t lineNum, bool* onThisLine) const {
519 439935 : uint32_t lineIndex = lineNumToIndex(lineNum);
520 439935 : if (lineIndex + 1 >= lineStartOffsets_.length()) // +1 due to sentinel
521 0 : return false;
522 871184 : *onThisLine = lineStartOffsets_[lineIndex] <= offset &&
523 431249 : offset < lineStartOffsets_[lineIndex + 1];
524 439935 : return true;
525 : }
526 :
527 : uint32_t lineNum(uint32_t offset) const;
528 : uint32_t columnIndex(uint32_t offset) const;
529 : void lineNumAndColumnIndex(uint32_t offset, uint32_t* lineNum, uint32_t* columnIndex) const;
530 : };
531 :
532 : SourceCoords srcCoords;
533 :
534 : JSAtomState& names() const {
535 : return cx->names();
536 : }
537 :
538 0 : JSContext* context() const {
539 0 : return cx;
540 : }
541 :
542 79801 : const ReadOnlyCompileOptions& options() const {
543 79801 : return options_;
544 : }
545 :
546 : /**
547 : * Fill in |err|, excepting line-of-context-related fields. If the token
548 : * stream has location information, use that and return true. If it does
549 : * not, use the caller's location information and return false.
550 : */
551 : bool fillExcludingContext(ErrorMetadata* err, uint32_t offset);
552 :
553 : void updateFlagsForEOL();
554 :
555 369801 : const Token& nextToken() const {
556 369801 : MOZ_ASSERT(hasLookahead());
557 369801 : return tokens[(cursor + 1) & ntokensMask];
558 : }
559 :
560 392774 : bool hasLookahead() const { return lookahead > 0; }
561 :
562 : public:
563 : MOZ_MUST_USE bool compileWarning(ErrorMetadata&& metadata, UniquePtr<JSErrorNotes> notes,
564 : unsigned flags, unsigned errorNumber, va_list args);
565 :
566 : void reportErrorNoOffset(unsigned errorNumber, ...);
567 :
568 : // Compute error metadata for an error at no offset.
569 : void computeErrorMetadataNoOffset(ErrorMetadata* err);
570 :
571 : protected:
572 : // Options used for parsing/tokenizing.
573 : const ReadOnlyCompileOptions& options_;
574 :
575 : Token tokens[ntokens]; // circular token buffer
576 : unsigned cursor; // index of last parsed token
577 : unsigned lookahead; // count of lookahead tokens
578 : unsigned lineno; // current line number
579 : Flags flags; // flags -- see above
580 : size_t linebase; // start of current line
581 : size_t prevLinebase; // start of previous line; size_t(-1) if on the first line
582 : const char* filename; // input filename or null
583 : UniqueTwoByteChars displayURL_; // the user's requested source URL or null
584 : UniqueTwoByteChars sourceMapURL_; // source map's filename or null
585 : uint8_t isExprEnding[TOK_LIMIT];// which tokens definitely terminate exprs?
586 : JSContext* const cx;
587 : bool mutedErrors;
588 : StrictModeGetter* strictModeGetter; // used to test for strict mode
589 : };
590 :
591 : // TokenStream is the lexical scanner for Javascript source text.
592 : //
593 : // It takes a buffer of char16_t characters and linearly scans it into |Token|s.
594 : // Internally the class uses a four element circular buffer |tokens| of
595 : // |Token|s. As an index for |tokens|, the member |cursor| points to the
596 : // current token.
597 : // Calls to getToken() increase |cursor| by one and return the new current
598 : // token. If a TokenStream was just created, the current token is initialized
599 : // with random data (i.e. not initialized). It is therefore important that
600 : // one of the first four member functions listed below is called first.
601 : // The circular buffer lets us go back up to two tokens from the last
602 : // scanned token. Internally, the relative number of backward steps that were
603 : // taken (via ungetToken()) after the last token was scanned is stored in
604 : // |lookahead|.
605 : //
606 : // The following table lists in which situations it is safe to call each listed
607 : // function. No checks are made by the functions in non-debug builds.
608 : //
609 : // Function Name | Precondition; changes to |lookahead|
610 : // ------------------+---------------------------------------------------------
611 : // getToken | none; if |lookahead > 0| then |lookahead--|
612 : // peekToken | none; if |lookahead == 0| then |lookahead == 1|
613 : // peekTokenSameLine | none; if |lookahead == 0| then |lookahead == 1|
614 : // matchToken | none; if |lookahead > 0| and the match succeeds then
615 : // | |lookahead--|
616 : // consumeKnownToken | none; if |lookahead > 0| then |lookahead--|
617 : // ungetToken | 0 <= |lookahead| <= |maxLookahead - 1|; |lookahead++|
618 : //
619 : // The behavior of the token scanning process (see getTokenInternal()) can be
620 : // modified by calling one of the first four above listed member functions with
621 : // an optional argument of type Modifier. However, the modifier will be
622 : // ignored unless |lookahead == 0| holds. Due to constraints of the grammar,
623 : // this turns out not to be a problem in practice. See the
624 : // mozilla.dev.tech.js-engine.internals thread entitled 'Bug in the scanner?'
625 : // for more details:
626 : // https://groups.google.com/forum/?fromgroups=#!topic/mozilla.dev.tech.js-engine.internals/2JLH5jRcr7E
627 : //
628 : // The methods seek() and tell() allow rescanning from a previously visited
629 : // location of the buffer.
630 : //
631 2466 : class MOZ_STACK_CLASS TokenStream final : public TokenStreamAnyChars
632 : {
633 : public:
634 : using CharT = char16_t;
635 : using CharBuffer = Vector<CharT, 32>;
636 :
637 : TokenStream(JSContext* cx, const ReadOnlyCompileOptions& options,
638 : const CharT* base, size_t length, StrictModeGetter* smg);
639 :
640 420 : const CharBuffer& getTokenbuf() const { return tokenbuf; }
641 :
642 : // If there is an invalid escape in a template, report it and return false,
643 : // otherwise return true.
644 973 : bool checkForInvalidTemplateEscapeError() {
645 973 : if (invalidTemplateEscapeType == InvalidEscapeType::None)
646 973 : return true;
647 :
648 0 : reportInvalidEscapeError(invalidTemplateEscapeOffset, invalidTemplateEscapeType);
649 0 : return false;
650 : }
651 :
652 : // TokenStream-specific error reporters.
653 : void reportError(unsigned errorNumber, ...);
654 :
655 : // Report the given error at the current offset.
656 : void error(unsigned errorNumber, ...);
657 :
658 : // Report the given error at the given offset.
659 : void errorAt(uint32_t offset, unsigned errorNumber, ...);
660 :
661 : // Warn at the current offset.
662 : MOZ_MUST_USE bool warning(unsigned errorNumber, ...);
663 :
664 : private:
665 : // Compute a line of context for an otherwise-filled-in |err| at the given
666 : // offset in this token stream. (This function basically exists to make
667 : // |computeErrorMetadata| more readable and shouldn't be called elsewhere.)
668 : MOZ_MUST_USE bool computeLineOfContext(ErrorMetadata* err, uint32_t offset);
669 :
670 : public:
671 : // Compute error metadata for an error at the given offset.
672 : MOZ_MUST_USE bool computeErrorMetadata(ErrorMetadata* err, uint32_t offset);
673 :
674 : // General-purpose error reporters. You should avoid calling these
675 : // directly, and instead use the more succinct alternatives (error(),
676 : // warning(), &c.) in TokenStream, Parser, and BytecodeEmitter.
677 : bool reportStrictModeErrorNumberVA(UniquePtr<JSErrorNotes> notes, uint32_t offset,
678 : bool strictMode, unsigned errorNumber, va_list args);
679 : bool reportExtraWarningErrorNumberVA(UniquePtr<JSErrorNotes> notes, uint32_t offset,
680 : unsigned errorNumber, va_list args);
681 :
682 3 : JSAtom* getRawTemplateStringAtom() {
683 3 : MOZ_ASSERT(currentToken().type == TOK_TEMPLATE_HEAD ||
684 : currentToken().type == TOK_NO_SUBS_TEMPLATE);
685 3 : const CharT* cur = userbuf.rawCharPtrAt(currentToken().pos.begin + 1);
686 : const CharT* end;
687 3 : if (currentToken().type == TOK_TEMPLATE_HEAD) {
688 : // Of the form |`...${| or |}...${|
689 2 : end = userbuf.rawCharPtrAt(currentToken().pos.end - 2);
690 : } else {
691 : // NO_SUBS_TEMPLATE is of the form |`...`| or |}...`|
692 1 : end = userbuf.rawCharPtrAt(currentToken().pos.end - 1);
693 : }
694 :
695 6 : CharBuffer charbuf(cx);
696 5 : while (cur < end) {
697 1 : CharT ch = *cur;
698 1 : if (ch == '\r') {
699 0 : ch = '\n';
700 0 : if ((cur + 1 < end) && (*(cur + 1) == '\n'))
701 0 : cur++;
702 : }
703 1 : if (!charbuf.append(ch))
704 0 : return nullptr;
705 1 : cur++;
706 : }
707 3 : return AtomizeChars(cx, charbuf.begin(), charbuf.length());
708 : }
709 :
710 : private:
711 : // This is private because it should only be called by the tokenizer while
712 : // tokenizing not by, for example, BytecodeEmitter.
713 : bool reportStrictModeError(unsigned errorNumber, ...);
714 :
715 0 : void reportInvalidEscapeError(uint32_t offset, InvalidEscapeType type) {
716 0 : switch (type) {
717 : case InvalidEscapeType::None:
718 0 : MOZ_ASSERT_UNREACHABLE("unexpected InvalidEscapeType");
719 : return;
720 : case InvalidEscapeType::Hexadecimal:
721 0 : errorAt(offset, JSMSG_MALFORMED_ESCAPE, "hexadecimal");
722 0 : return;
723 : case InvalidEscapeType::Unicode:
724 0 : errorAt(offset, JSMSG_MALFORMED_ESCAPE, "Unicode");
725 0 : return;
726 : case InvalidEscapeType::UnicodeOverflow:
727 0 : errorAt(offset, JSMSG_UNICODE_OVERFLOW, "escape sequence");
728 0 : return;
729 : case InvalidEscapeType::Octal:
730 0 : errorAt(offset, JSMSG_DEPRECATED_OCTAL);
731 0 : return;
732 : }
733 : }
734 :
735 : static JSAtom* atomize(JSContext* cx, CharBuffer& cb);
736 : MOZ_MUST_USE bool putIdentInTokenbuf(const CharT* identStart);
737 :
738 : public:
739 : // Advance to the next token. If the token stream encountered an error,
740 : // return false. Otherwise return true and store the token kind in |*ttp|.
741 997955 : MOZ_MUST_USE bool getToken(TokenKind* ttp, Modifier modifier = None) {
742 : // Check for a pushed-back token resulting from mismatching lookahead.
743 997955 : if (lookahead != 0) {
744 667171 : MOZ_ASSERT(!flags.hadError);
745 667171 : lookahead--;
746 667171 : cursor = (cursor + 1) & ntokensMask;
747 667171 : TokenKind tt = currentToken().type;
748 667171 : MOZ_ASSERT(tt != TOK_EOL);
749 667171 : verifyConsistentModifier(modifier, currentToken());
750 667171 : *ttp = tt;
751 667171 : return true;
752 : }
753 :
754 330784 : return getTokenInternal(ttp, modifier);
755 : }
756 :
757 : // Push the last scanned token back into the stream.
758 667466 : void ungetToken() {
759 667466 : MOZ_ASSERT(lookahead < maxLookahead);
760 667466 : lookahead++;
761 667466 : cursor = (cursor - 1) & ntokensMask;
762 667466 : }
763 :
764 244787 : MOZ_MUST_USE bool peekToken(TokenKind* ttp, Modifier modifier = None) {
765 244787 : if (lookahead > 0) {
766 39589 : MOZ_ASSERT(!flags.hadError);
767 39589 : verifyConsistentModifier(modifier, nextToken());
768 39589 : *ttp = nextToken().type;
769 39589 : return true;
770 : }
771 205198 : if (!getTokenInternal(ttp, modifier))
772 0 : return false;
773 205198 : ungetToken();
774 205198 : return true;
775 : }
776 :
777 22762 : MOZ_MUST_USE bool peekTokenPos(TokenPos* posp, Modifier modifier = None) {
778 22762 : if (lookahead == 0) {
779 : TokenKind tt;
780 19435 : if (!getTokenInternal(&tt, modifier))
781 0 : return false;
782 19435 : ungetToken();
783 19435 : MOZ_ASSERT(hasLookahead());
784 : } else {
785 3327 : MOZ_ASSERT(!flags.hadError);
786 3327 : verifyConsistentModifier(modifier, nextToken());
787 : }
788 22762 : *posp = nextToken().pos;
789 22762 : return true;
790 : }
791 :
792 11100 : MOZ_MUST_USE bool peekOffset(uint32_t* offset, Modifier modifier = None) {
793 11100 : TokenPos pos;
794 11100 : if (!peekTokenPos(&pos, modifier))
795 0 : return false;
796 11100 : *offset = pos.begin;
797 11100 : return true;
798 : }
799 :
800 : // This is like peekToken(), with one exception: if there is an EOL
801 : // between the end of the current token and the start of the next token, it
802 : // returns true and stores TOK_EOL in |*ttp|. In that case, no token with
803 : // TOK_EOL is actually created, just a TOK_EOL TokenKind is returned, and
804 : // currentToken() shouldn't be consulted. (This is the only place TOK_EOL
805 : // is produced.)
806 : MOZ_ALWAYS_INLINE MOZ_MUST_USE bool
807 136654 : peekTokenSameLine(TokenKind* ttp, Modifier modifier = None) {
808 136654 : const Token& curr = currentToken();
809 :
810 : // If lookahead != 0, we have scanned ahead at least one token, and
811 : // |lineno| is the line that the furthest-scanned token ends on. If
812 : // it's the same as the line that the current token ends on, that's a
813 : // stronger condition than what we are looking for, and we don't need
814 : // to return TOK_EOL.
815 136654 : if (lookahead != 0) {
816 : bool onThisLine;
817 127583 : if (!srcCoords.isOnThisLine(curr.pos.end, lineno, &onThisLine)) {
818 0 : reportError(JSMSG_OUT_OF_MEMORY);
819 125975 : return false;
820 : }
821 :
822 127583 : if (onThisLine) {
823 125975 : MOZ_ASSERT(!flags.hadError);
824 125975 : verifyConsistentModifier(modifier, nextToken());
825 125975 : *ttp = nextToken().type;
826 125975 : return true;
827 : }
828 : }
829 :
830 : // The above check misses two cases where we don't have to return
831 : // TOK_EOL.
832 : // - The next token starts on the same line, but is a multi-line token.
833 : // - The next token starts on the same line, but lookahead==2 and there
834 : // is a newline between the next token and the one after that.
835 : // The following test is somewhat expensive but gets these cases (and
836 : // all others) right.
837 : TokenKind tmp;
838 10679 : if (!getToken(&tmp, modifier))
839 0 : return false;
840 10679 : const Token& next = currentToken();
841 10679 : ungetToken();
842 :
843 21358 : *ttp = srcCoords.lineNum(curr.pos.end) == srcCoords.lineNum(next.pos.begin)
844 10679 : ? next.type
845 : : TOK_EOL;
846 10679 : return true;
847 : }
848 :
849 : // Get the next token from the stream if its kind is |tt|.
// |*matchedp| is set to true when the token was of kind |tt| (and stays
// consumed), or to false when it was not (and was put back with
// ungetToken()). Returns false, without writing |*matchedp|, only if
// getToken() fails.
850 275990 : MOZ_MUST_USE bool matchToken(bool* matchedp, TokenKind tt, Modifier modifier = None) {
851 : TokenKind token;
852 275990 : if (!getToken(&token, modifier))
853 0 : return false;
854 275990 : if (token == tt) {
855 96246 : *matchedp = true;
856 : } else {
857 179744 : ungetToken();
858 179744 : *matchedp = false;
859 : }
860 275990 : return true;
861 : }
862 :
// Consume a token that the caller already knows is of kind |tt|. Asserts
// that a lookahead token exists, then asserts that matchToken() both
// succeeds and matches.
863 3538 : void consumeKnownToken(TokenKind tt, Modifier modifier = None) {
864 : bool matched;
865 3538 : MOZ_ASSERT(hasLookahead());
// The matchToken() call sits inside MOZ_ALWAYS_TRUE rather than MOZ_ASSERT;
// per MFBT, MOZ_ALWAYS_TRUE still evaluates its argument when assertions
// are compiled out.
866 3538 : MOZ_ALWAYS_TRUE(matchToken(&matched, tt, modifier));
867 3538 : MOZ_ALWAYS_TRUE(matched);
868 3538 : }
869 :
// Peek at the next token (with peekToken()'s default modifier) and store
// the isExprEnding[] table entry for its kind in |*endsExpr|. Returns
// false, without writing |*endsExpr|, if peekToken() fails.
870 91722 : MOZ_MUST_USE bool nextTokenEndsExpr(bool* endsExpr) {
871 : TokenKind tt;
872 91722 : if (!peekToken(&tt))
873 0 : return false;
874 91722 : *endsExpr = isExprEnding[tt];
875 91722 : return true;
876 : }
877 :
// Stack-only, non-copyable record of scanner state: a buffer pointer, the
// flags, line bookkeeping, the current token and the lookahead tokens. All
// fields are private; TokenStream accesses them as a friend.
// NOTE(review): presumably filled by tell() and restored by seek() -- their
// definitions are not visible here; confirm.
878 : class MOZ_STACK_CLASS Position {
879 : public:
880 : // The Token fields may contain pointers to atoms, so for correct
881 : // rooting we must ensure collection of atoms is disabled while objects
882 : // of this class are live. Do this by requiring a dummy AutoKeepAtoms
883 : // reference in the constructor.
884 : //
885 : // This class is explicitly ignored by the analysis, so don't add any
886 : // more pointers to GC things here!
887 80005 : explicit Position(AutoKeepAtoms&) { }
888 : private:
889 : Position(const Position&) = delete;
890 : friend class TokenStream;
891 : const CharT* buf;
892 : Flags flags;
893 : unsigned lineno;
894 : size_t linebase;
895 : size_t prevLinebase;
896 : Token currentToken;
897 : unsigned lookahead;
898 : Token lookaheadTokens[maxLookahead];
899 : };
900 :
901 : MOZ_MUST_USE bool advance(size_t position);
902 : void tell(Position*);
903 : void seek(const Position& pos);
904 : MOZ_MUST_USE bool seek(const Position& pos, const TokenStream& other);
905 :
// Forward to userbuf.rawCharPtrAt(): return a pointer to the raw char at
// |offset| in the underlying source buffer.
906 0 : const CharT* rawCharPtrAt(size_t offset) const {
907 0 : return userbuf.rawCharPtrAt(offset);
908 : }
909 :
// Forward to userbuf.limit(): return the limit pointer of the underlying
// source buffer.
910 0 : const CharT* rawLimit() const {
911 0 : return userbuf.limit();
912 : }
913 :
914 : private:
915 : // This is the low-level interface to the JS source code buffer. It just
916 : // gets raw chars, basically. TokenStream's functions are layered on top
917 : // and do some extra stuff like converting all EOL sequences to '\n',
918 : // tracking the line number, and setting |flags.isEOF|. (The "raw" in "raw
919 : // chars" refers to the lack of EOL sequence normalization.)
920 : //
921 : // buf[0..length-1] often represents a substring of some larger source,
922 : // where we have only the substring in memory. The |startOffset| argument
923 : // indicates the offset within this larger string at which our string
924 : // begins, the offset of |buf[0]|.
//
// None of the raw accessors below (getRawChar, peekRawChar, matchRawChar,
// matchRawCharBackwards, ungetRawChar) compares |ptr| against |base_| or
// |limit_|; staying in bounds is left to the caller (e.g. by testing
// hasRawChars() first).
925 : class TokenBuf {
926 : public:
// Set up the buffer to read |length| chars starting at |buf|.
// |cx| is not used by this constructor.
927 2449 : TokenBuf(JSContext* cx, const CharT* buf, size_t length, size_t startOffset)
928 2449 : : base_(buf),
929 : startOffset_(startOffset),
930 2449 : limit_(buf + length),
931 4898 : ptr(buf)
932 2449 : { }
933 :
// True while the read pointer is before |limit_|.
934 4605133 : bool hasRawChars() const {
935 4605133 : return ptr < limit_;
936 : }
937 :
// True when offset() is 0. Note that offset() includes |startOffset_|.
938 358540 : bool atStart() const {
939 358540 : return offset() == 0;
940 : }
941 :
// Offset of |base_[0]|, as passed to the constructor.
942 0 : size_t startOffset() const {
943 0 : return startOffset_;
944 : }
945 :
// Offset of the next char to read: |startOffset_| plus the distance
// from |base_| to |ptr|.
946 1603373 : size_t offset() const {
947 1603373 : return startOffset_ + mozilla::PointerRangeSize(base_, ptr);
948 : }
949 :
// Convert |offset| (in the same coordinates as offset()) to a pointer
// into the buffer. Asserts that |offset| lies between |startOffset_| and
// the offset corresponding to |limit_|, inclusive.
950 266 : const CharT* rawCharPtrAt(size_t offset) const {
951 266 : MOZ_ASSERT(startOffset_ <= offset);
952 266 : MOZ_ASSERT(offset - startOffset_ <= mozilla::PointerRangeSize(base_, limit_));
953 266 : return base_ + (offset - startOffset_);
954 : }
955 :
// One-past-the-end pointer of the buffer.
956 0 : const CharT* limit() const {
957 0 : return limit_;
958 : }
959 :
// Return the char at |ptr| and advance past it.
960 4501658 : CharT getRawChar() {
961 4501658 : return *ptr++; // this will nullptr-crash if poisoned
962 : }
963 :
// Return the char at |ptr| without advancing.
964 15555 : CharT peekRawChar() const {
965 15555 : return *ptr; // this will nullptr-crash if poisoned
966 : }
967 :
// Advance past the char at |ptr| and return true if it equals |c|;
// otherwise leave |ptr| unchanged and return false.
968 102722 : bool matchRawChar(CharT c) {
969 102722 : if (*ptr == c) { // this will nullptr-crash if poisoned
970 24860 : ptr++;
971 24860 : return true;
972 : }
973 77862 : return false;
974 : }
975 :
// Step back over the char just before |ptr| and return true if it equals
// |c|; otherwise leave |ptr| unchanged and return false.
976 7478 : bool matchRawCharBackwards(CharT c) {
977 7478 : MOZ_ASSERT(ptr); // make sure it hasn't been poisoned
978 7478 : if (*(ptr - 1) == c) {
979 0 : ptr--;
980 0 : return true;
981 : }
982 7478 : return false;
983 : }
984 :
// Move the read pointer back by one char.
985 351062 : void ungetRawChar() {
986 351062 : MOZ_ASSERT(ptr); // make sure it hasn't been poisoned
987 351062 : ptr--;
988 351062 : }
989 :
// Return the current read pointer. Unless |allowPoisoned| is true, asserts
// that the pointer is non-null.
990 575617 : const CharT* addressOfNextRawChar(bool allowPoisoned = false) const {
991 575617 : MOZ_ASSERT_IF(!allowPoisoned, ptr); // make sure it hasn't been poisoned
992 575617 : return ptr;
993 : }
994 :
995 : // Use this with caution!
// Overwrite the read pointer with |a|. Unless |allowPoisoned| is true,
// asserts that |a| is non-null; no range check against the buffer is made.
996 4116 : void setAddressOfNextRawChar(const CharT* a, bool allowPoisoned = false) {
997 4116 : MOZ_ASSERT_IF(!allowPoisoned, a);
998 4116 : ptr = a;
999 4116 : }
1000 :
1001 : #ifdef DEBUG
1002 : // Poison the TokenBuf so it cannot be accessed again.
1003 0 : void poison() {
1004 0 : ptr = nullptr;
1005 0 : }
1006 : #endif
1007 :
// True if |c| is '\n', '\r', unicode::LINE_SEPARATOR or
// unicode::PARA_SEPARATOR.
1008 445290 : static bool isRawEOLChar(int32_t c) {
1009 437803 : return c == '\n' ||
1010 437803 : c == '\r' ||
1011 883093 : c == unicode::LINE_SEPARATOR ||
1012 445290 : c == unicode::PARA_SEPARATOR;
1013 : }
1014 :
1015 : // Returns the offset of the next EOL, but stops once 'max' characters
1016 : // have been scanned (*including* the char at startOffset_).
// NOTE(review): the parameter is named |start|; "startOffset_" in the
// sentence above may be a stale name -- confirm against the definition.
1017 : size_t findEOLMax(size_t start, size_t max);
1018 :
1019 : private:
1020 : const CharT* base_; // base of buffer
// NOTE(review): the constructor takes |startOffset| as size_t but it is
// stored here as uint32_t, so larger values would be narrowed.
1021 : uint32_t startOffset_; // offset of base_[0]
1022 : const CharT* limit_; // limit for quick bounds check
1023 : const CharT* ptr; // next char to get
1024 : };
1025 :
1026 : MOZ_MUST_USE bool getTokenInternal(TokenKind* ttp, Modifier modifier);
1027 :
1028 : MOZ_MUST_USE bool getStringOrTemplateToken(int untilChar, Token** tp);
1029 :
1030 : // Try to get the next character, normalizing '\r', '\r\n', and '\n' into
1031 : // '\n'. Also updates internal line-counter state. Return true on success
1032 : // and store the character in |*cp|. Return false and leave |*cp| undefined
1033 : // on failure.
1034 : MOZ_MUST_USE bool getChar(int32_t* cp);
1035 : int32_t getCharIgnoreEOL();
1036 :
1037 : void ungetChar(int32_t c);
1038 : void ungetCharIgnoreEOL(int32_t c);
1039 : Token* newToken(ptrdiff_t adjust);
1040 : uint32_t peekUnicodeEscape(uint32_t* codePoint);
1041 : uint32_t peekExtendedUnicodeEscape(uint32_t* codePoint);
1042 : uint32_t matchUnicodeEscapeIdStart(uint32_t* codePoint);
1043 : bool matchUnicodeEscapeIdent(uint32_t* codePoint);
1044 : bool matchTrailForLeadSurrogate(char16_t lead, char16_t* trail, uint32_t* codePoint);
1045 : bool peekChars(int n, CharT* cp);
1046 :
1047 : MOZ_MUST_USE bool getDirectives(bool isMultiline, bool shouldWarnDeprecated);
1048 : MOZ_MUST_USE bool getDirective(bool isMultiline, bool shouldWarnDeprecated,
1049 : const char* directive, uint8_t directiveLength,
1050 : const char* errorMsgPragma,
1051 : UniquePtr<CharT[], JS::FreePolicy>* destination);
1052 : MOZ_MUST_USE bool getDisplayURL(bool isMultiline, bool shouldWarnDeprecated);
1053 : MOZ_MUST_USE bool getSourceMappingURL(bool isMultiline, bool shouldWarnDeprecated);
1054 :
1055 : // |expect| cannot be an EOL char.
// Consume the next raw char and return true if it equals |expect|. Returns
// false, consuming nothing, if no raw chars remain or the next raw char
// differs.
1056 102722 : bool matchChar(int32_t expect) {
1057 102722 : MOZ_ASSERT(!TokenBuf::isRawEOLChar(expect));
1058 205444 : return MOZ_LIKELY(userbuf.hasRawChars()) &&
1059 205444 : userbuf.matchRawChar(expect);
1060 : }
1061 :
// Read one char with getChar(), asserting that the read succeeds and that
// the char read equals |expect|.
1062 384 : void consumeKnownChar(int32_t expect) {
1063 : int32_t c;
1064 384 : MOZ_ALWAYS_TRUE(getChar(&c));
1065 384 : MOZ_ASSERT(c == expect);
1066 384 : }
1067 :
// Read the next char into |*c| with getChar(), then push it straight back
// with ungetChar(). Returns false if getChar() fails.
1068 8116 : MOZ_MUST_USE bool peekChar(int32_t* c) {
1069 8116 : if (!getChar(c))
1070 0 : return false;
1071 8116 : ungetChar(*c);
1072 8116 : return true;
1073 : }
1074 :
// Skip |n| chars by calling getCharIgnoreEOL() |n| times. For each char,
// asserts beforehand that raw chars remain and afterwards that the char
// read was not '\n'.
1075 34 : void skipChars(uint8_t n) {
1076 64 : while (n-- > 0) {
1077 30 : MOZ_ASSERT(userbuf.hasRawChars());
// |c| is only consulted by the assertion below, hence DebugOnly.
1078 60 : mozilla::DebugOnly<int32_t> c = getCharIgnoreEOL();
1079 30 : MOZ_ASSERT(c != '\n');
1080 : }
1081 4 : }
1082 :
// Skip |n| chars by calling getCharIgnoreEOL() |n| times, asserting before
// each call that raw chars remain. Unlike skipChars(), the chars read are
// not checked against '\n'.
1083 0 : void skipCharsIgnoreEOL(uint8_t n) {
1084 0 : while (n-- > 0) {
1085 0 : MOZ_ASSERT(userbuf.hasRawChars());
1086 0 : getCharIgnoreEOL();
1087 : }
1088 0 : }
1089 :
1090 : MOZ_MUST_USE MOZ_ALWAYS_INLINE bool updateLineInfoForEOL();
1091 :
1092 : TokenBuf userbuf; // user input buffer
1093 : CharBuffer tokenbuf; // current token string buffer
1094 : };
1095 :
1096 : extern const char*
1097 : TokenKindToDesc(TokenKind tt);
1098 :
1099 : } // namespace frontend
1100 : } // namespace js
1101 :
1102 : extern JS_FRIEND_API(int)
1103 : js_fgets(char* buf, int size, FILE* file);
1104 :
1105 : #ifdef DEBUG
1106 : extern const char*
1107 : TokenKindToString(js::frontend::TokenKind tt);
1108 : #endif
1109 :
1110 : #endif /* frontend_TokenStream_h */
|