LCOV - code coverage report
Current view: top level - xpcom/ds - Tokenizer.h (source / functions)
Test: output.info
Date: 2017-07-14 16:53:18
Coverage:          Hit   Total   Coverage
  Lines:            15      27     55.6 %
  Functions:         9      28     32.1 %
Legend: Lines: hit / not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
       2             : /* vim: set ts=8 sts=2 et sw=2 tw=80: */
       3             : /* This Source Code Form is subject to the terms of the Mozilla Public
       4             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       5             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
       6             : 
       7             : #ifndef Tokenizer_h__
       8             : #define Tokenizer_h__
       9             : 
      10             : #include "nsString.h"
      11             : #include "mozilla/CheckedInt.h"
      12             : #include "mozilla/UniquePtr.h"
      13             : #include "nsTArray.h"
      14             : 
      15             : namespace mozilla {
      16             : 
      17        7105 : class TokenizerBase
      18             : {
      19             : public:
      20             :   /**
       21             :    * The analyzer cuts the input into a sequence of tokens, where each token
       22             :    * has an elementary type.
      23             :    */
      24             :   enum TokenType : uint32_t
      25             :   {
      26             :     TOKEN_UNKNOWN,
      27             :     TOKEN_RAW,
      28             :     TOKEN_ERROR,
      29             :     TOKEN_INTEGER,
      30             :     TOKEN_WORD,
      31             :     TOKEN_CHAR,
      32             :     TOKEN_WS,
      33             :     TOKEN_EOL,
      34             :     TOKEN_EOF,
      35             :     TOKEN_CUSTOM0 = 1000
      36             :   };
      37             : 
      38             :   enum ECaseSensitivity
      39             :   {
      40             :     CASE_SENSITIVE,
      41             :     CASE_INSENSITIVE
      42             :   };
      43             : 
      44             :   /**
       45             :    * Class holding the type and the value of a token.  A Token can be created
       46             :    * manually to be checked against via the Tokenizer methods, or it can be the
       47             :    * result of some of the Tokenizer's methods.
      48             :    */
      49       45172 :   class Token
      50             :   {
      51             :     TokenType mType;
      52             :     nsDependentCSubstring mWord;
      53             :     nsCString mCustom;
      54             :     char mChar;
      55             :     uint64_t mInteger;
      56             :     ECaseSensitivity mCustomCaseInsensitivity;
      57             :     bool mCustomEnabled;
      58             : 
       59             :     // If this token is a result of the parsing process, this member references
       60             :     // a sub-string of the input buffer.  If this is an externally created Token,
       61             :     // this member is left as an empty string.
      62             :     nsDependentCSubstring mFragment;
      63             : 
      64             :     friend class TokenizerBase;
      65             :     void AssignFragment(nsACString::const_char_iterator begin,
      66             :                         nsACString::const_char_iterator end);
      67             : 
      68             :     static Token Raw();
      69             : 
      70             :   public:
      71             :     Token();
      72             :     Token(const Token& aOther);
      73             :     Token& operator=(const Token& aOther);
      74             : 
      75             :     // Static constructors of tokens by type and value
      76             :     static Token Word(const nsACString& aWord);
      77             :     static Token Char(const char aChar);
      78             :     static Token Number(const uint64_t aNumber);
      79             :     static Token Whitespace();
      80             :     static Token NewLine();
      81             :     static Token EndOfFile();
      82             :     static Token Error();
      83             : 
      84             :     // Compares the two tokens, type must be identical and value
      85             :     // of one of the tokens must be 'any' or equal.
      86             :     bool Equals(const Token& aOther) const;
      87             : 
      88        7001 :     TokenType Type() const { return mType; }
      89             :     char AsChar() const;
      90             :     nsDependentCSubstring AsString() const;
      91             :     uint64_t AsInteger() const;
      92             : 
      93           3 :     nsDependentCSubstring Fragment() const { return mFragment; }
      94             :   };
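                      : 
                      :   // Usage sketch (illustrative only; assumes just the declarations in this
                      :   // header): a Token can be built with the static constructors above and
                      :   // compared against what the tokenizer returns.
                      :   //
                      :   //   Tokenizer p("a,b");
                      :   //   Tokenizer::Token comma = Tokenizer::Token::Char(',');
                      :   //   Tokenizer::Token t;
                      :   //   while (p.Next(t) && !t.Equals(Tokenizer::Token::EndOfFile())) {
                      :   //     if (t.Equals(comma)) {
                      :   //       // a ',' separator was just read
                      :   //     }
                      :   //   }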
      95             : 
      96             :   /**
      97             :    * Consumers may register a custom string that, when found in the input, is considered
      98             :    * a token and returned by Next*() and accepted by Check*() methods.
       99             :    * AddCustomToken() returns a reference to a token that can then be compared using
      100             :    * Token::Equals() against the output from Next*() or be passed to Check*().
     101             :    */
     102             :   Token AddCustomToken(const nsACString& aValue, ECaseSensitivity aCaseInsensitivity, bool aEnabled = true);
     103             :   template <uint32_t N>
     104           0 :   Token AddCustomToken(const char(&aValue)[N], ECaseSensitivity aCaseInsensitivity, bool aEnabled = true)
     105             :   {
     106           0 :     return AddCustomToken(nsDependentCSubstring(aValue, N - 1), aCaseInsensitivity, aEnabled);
     107             :   }
     108             :   void RemoveCustomToken(Token& aToken);
     109             :   /**
      110             :    * Only applies to a custom type of Token (see AddCustomToken above).
      111             :    * This turns recognition of the token on and off.  When a custom token is
      112             :    * disabled, it is ignored as if it had never been added as a custom token.
     113             :    */
     114             :   void EnableCustomToken(Token const& aToken, bool aEnable);
     115             : 
     116             :   /**
     117             :    * Mode of tokenization.
     118             :    * FULL tokenization, the default, recognizes built-in tokens and any custom tokens,
     119             :    * if added.
      120             :    * CUSTOM_ONLY will only recognize custom tokens; the rest is seen as 'raw'.
     121             :    * This mode can be understood as a 'binary' mode.
     122             :    */
     123             :   enum class Mode
     124             :   {
     125             :     FULL,
     126             :     CUSTOM_ONLY
     127             :   };
     128             :   void SetTokenizingMode(Mode aMode);
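                      : 
                      :   // Usage sketch (illustrative only; assumes just the declarations in this
                      :   // header): registering a custom token and switching to custom-only mode.
                      :   //
                      :   //   Tokenizer p("1234\r\n5678");
                      :   //   Tokenizer::Token crlf = p.AddCustomToken("\r\n", Tokenizer::CASE_SENSITIVE);
                      :   //   p.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);
                      :   //
                      :   //   Tokenizer::Token t;
                      :   //   while (p.Next(t)) {
                      :   //     if (t.Equals(crlf)) {
                      :   //       // a CRLF boundary was recognized
                      :   //     } else if (t.Type() == Tokenizer::TOKEN_RAW) {
                      :   //       // t.Fragment() references the raw bytes between custom tokens
                      :   //     }
                      :   //   }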
     129             : 
     130             :   /**
      131             :    * Returns true iff the last Check*() call has returned false or we've read past
      132             :    * the end of the input string.
     133             :    */
     134             :   MOZ_MUST_USE bool HasFailed() const;
     135             : 
     136             : protected:
     137             :   explicit TokenizerBase(const char* aWhitespaces = nullptr,
     138             :                          const char* aAdditionalWordChars = nullptr);
     139             : 
     140             :   // false if we have already read the EOF token.
     141             :   bool HasInput() const;
      142             :   // Main parsing function; it doesn't shift the read cursor, it just returns
      143             :   // the next token position.
     144             :   nsACString::const_char_iterator Parse(Token& aToken) const;
     145             :   // Is read cursor at the end?
     146             :   bool IsEnd(const nsACString::const_char_iterator& caret) const;
      147             :   // True when we are at the end of the input data but it has not been marked
      148             :   // as complete yet.  In that case we cannot yet provide a multi-char token.
     149             :   bool IsPending(const nsACString::const_char_iterator & caret) const;
     150             :   // Is read cursor on a character that is a word start?
     151             :   bool IsWordFirst(const char aInput) const;
     152             :   // Is read cursor on a character that is an in-word letter?
     153             :   bool IsWord(const char aInput) const;
     154             :   // Is read cursor on a character that is a valid number?
     155             :   // TODO - support multiple radix
     156             :   bool IsNumber(const char aInput) const;
     157             :   // Is equal to the given custom token?
     158             :   bool IsCustom(const nsACString::const_char_iterator& caret,
     159             :                 const Token& aCustomToken, uint32_t* aLongest = nullptr) const;
     160             : 
     161             :   // Friendly helper to assign a fragment on a Token
     162             :   static void AssignFragment(Token& aToken,
     163             :                              nsACString::const_char_iterator begin,
     164             :                              nsACString::const_char_iterator end);
     165             : 
     166             :   // true iff we have already read the EOF token
     167             :   bool mPastEof;
     168             :   // true iff the last Check*() call has returned false, reverts to true on Rollback() call
     169             :   bool mHasFailed;
     170             :   // true if the input string is final (finished), false when we expect more data
     171             :   // yet to be fed to the tokenizer (see IncrementalTokenizer derived class).
     172             :   bool mInputFinished;
     173             :   // custom only vs full tokenizing mode, see the Parse() method
     174             :   Mode mMode;
     175             :   // minimal raw data chunked delivery during incremental feed
     176             :   uint32_t mMinRawDelivery;
     177             : 
     178             :   // Customizable list of whitespaces
     179             :   const char* mWhitespaces;
      180             :   // Additional custom word characters
     181             :   const char* mAdditionalWordChars;
     182             : 
     183             :   // All these point to the original buffer passed to the constructor or to the incremental
     184             :   // buffer after FeedInput.
     185             :   nsACString::const_char_iterator mCursor; // Position of the current (actually next to read) token start
     186             :   nsACString::const_char_iterator mEnd; // End of the input position
     187             : 
     188             :   // This is the list of tokens user has registered with AddCustomToken()
     189             :   nsTArray<UniquePtr<Token>> mCustomTokens;
     190             :   uint32_t mNextCustomTokenID;
     191             : 
     192             : private:
     193             :   TokenizerBase() = delete;
     194             :   TokenizerBase(const TokenizerBase&) = delete;
     195             :   TokenizerBase(TokenizerBase&&) = delete;
     196             :   TokenizerBase(const TokenizerBase&&) = delete;
     197             :   TokenizerBase &operator=(const TokenizerBase&) = delete;
     198             : };
     199             : 
     200             : /**
     201             :  * This is a simple implementation of a lexical analyzer or maybe better
     202             :  * called a tokenizer.  It doesn't allow any user dictionaries or
      203             :  * user-defined token types.
     204             :  *
      205             :  * It is limited to ASCII input for now.  UTF-8 or any other input
      206             :  * encoding has yet to be implemented.
     207             :  */
     208        7105 : class Tokenizer : public TokenizerBase
     209             : {
     210             : public:
     211             :   /**
     212             :    * @param aSource
     213             :    *    The string to parse.
     214             :    *    IMPORTANT NOTE: Tokenizer doesn't ensure the input string buffer lifetime.
     215             :    *    It's up to the consumer to make sure the string's buffer outlives the Tokenizer!
     216             :    * @param aWhitespaces
      217             :    *    If non-null, Tokenizer will use this custom set of whitespaces for CheckWhite()
     218             :    *    and SkipWhites() calls.
     219             :    *    By default the list consists of space and tab.
     220             :    * @param aAdditionalWordChars
      221             :    *    If non-null, these characters are added to the list of characters that constitute a word.
      222             :    *    This is useful when you want to accept e.g. '-' in HTTP headers.
      223             :    *    By default a word character is considered to be any character for which upper case
      224             :    *    is different from lower case.
     225             :    *
     226             :    * If there is an overlap between aWhitespaces and aAdditionalWordChars, the check for
     227             :    * word characters is made first.
     228             :    */
     229             :   explicit Tokenizer(const nsACString& aSource,
     230             :                      const char* aWhitespaces = nullptr,
     231             :                      const char* aAdditionalWordChars = nullptr);
     232             :   explicit Tokenizer(const char* aSource,
     233             :                      const char* aWhitespaces = nullptr,
     234             :                      const char* aAdditionalWordChars = nullptr);
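                      : 
                      :   // Usage sketch (illustrative only): the Tokenizer merely borrows the source
                      :   // buffer, so the string passed in must outlive the Tokenizer instance.
                      :   //
                      :   //   nsAutoCString source("a=b");
                      :   //   Tokenizer p(source);   // 'source' must stay alive while 'p' is in use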
     235             : 
     236             :   /**
     237             :    * When there is still anything to read from the input, tokenize it, store the token type
      238             :    * and value in aToken, and shift the cursor past the just-parsed token.  Each call
     239             :    * to Next() reads another token from the input and shifts the cursor.
     240             :    * Returns false if we have passed the end of the input.
     241             :    */
     242             :   MOZ_MUST_USE
     243             :   bool Next(Token& aToken);
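                      : 
                      :   // Usage sketch (illustrative only): iterating over all tokens in the input.
                      :   //
                      :   //   Tokenizer p("a b-c");
                      :   //   Tokenizer::Token t;
                      :   //   while (p.Next(t)) {
                      :   //     switch (t.Type()) {
                      :   //       case Tokenizer::TOKEN_WORD:
                      :   //         // t.AsString() is "a", then "b", then "c"
                      :   //         break;
                      :   //       default:
                      :   //         break;
                      :   //     }
                      :   //   }
                      :   //   // Next() has returned false: we have read past the end of the input.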
     244             : 
     245             :   /**
     246             :    * Parse the token on the input read cursor position, check its type is equal to aTokenType
     247             :    * and if so, put it into aResult, shift the cursor and return true.  Otherwise, leave
     248             :    * the input read cursor position intact and return false.
     249             :    */
     250             :   MOZ_MUST_USE
     251             :   bool Check(const TokenType aTokenType, Token& aResult);
     252             :   /**
      253             :    * Same as the method above, but compares both the token type and the token value passed in aToken.
      254             :    * When both the type and the value are equal, shift the cursor and return true.  Otherwise
     255             :    * return false.
     256             :    */
     257             :   MOZ_MUST_USE
     258             :   bool Check(const Token& aToken);
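                      : 
                      :   // Usage sketch (illustrative only): grabbing a typed value with Check().
                      :   //
                      :   //   Tokenizer p("404 not found");
                      :   //   Tokenizer::Token code;
                      :   //   if (p.Check(Tokenizer::TOKEN_INTEGER, code)) {
                      :   //     // code.AsInteger() == 404
                      :   //   }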
     259             : 
     260             :   /**
     261             :    * SkipWhites method (below) may also skip new line characters automatically.
     262             :    */
     263             :   enum WhiteSkipping {
     264             :     /**
     265             :      * SkipWhites will only skip what is defined as a white space (default).
     266             :      */
     267             :     DONT_INCLUDE_NEW_LINE = 0,
     268             :     /**
      269             :      * SkipWhites will skip the defined white spaces as well as new lines
     270             :      * automatically.
     271             :      */
     272             :     INCLUDE_NEW_LINE = 1
     273             :   };
     274             : 
     275             :   /**
      276             :    * Skips any occurrence of the whitespaces specified in the mWhitespaces member,
     277             :    * optionally skip also new lines.
     278             :    */
     279             :   void SkipWhites(WhiteSkipping aIncludeNewLines = DONT_INCLUDE_NEW_LINE);
     280             : 
     281             :   /**
      282             :    * Skips all tokens until the given one is found or EOF is hit.  The found
      283             :    * token or the EOF is then the next to read.
     284             :    */
     285             :   void SkipUntil(Token const& aToken);
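                      : 
                      :   // Usage sketch (illustrative only):
                      :   //
                      :   //   Tokenizer p("  \r\n  payload");
                      :   //   p.SkipWhites(Tokenizer::INCLUDE_NEW_LINE);  // cursor now at "payload"
                      :   //
                      :   //   Tokenizer q("garbage;payload");
                      :   //   q.SkipUntil(Tokenizer::Token::Char(';'));   // ';' is the next token to read
                      :   //   if (q.CheckChar(';')) {
                      :   //     // cursor now at "payload"
                      :   //   }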
     286             : 
     287             :   // These are mostly shortcuts for the Check() methods above.
     288             : 
     289             :   /**
     290             :    * Check whitespace character is present.
     291             :    */
     292             :   MOZ_MUST_USE
     293        7504 :   bool CheckWhite() { return Check(Token::Whitespace()); }
     294             :   /**
      295             :    * Check that the given single character is at the read cursor position.  If so, shift the
      296             :    * read cursor position and return true.  Otherwise return false.
     297             :    */
     298             :   MOZ_MUST_USE
     299        7495 :   bool CheckChar(const char aChar) { return Check(Token::Char(aChar)); }
     300             :   /**
      301             :    * This is a customizable version of CheckChar.  aClassifier is a function called with
      302             :    * the value of the character at the current input read position.  If this user function
      303             :    * returns true, the read cursor is shifted and true is returned.  Otherwise false.
      304             :    * The user classification function is not called when we are at or past the end;
      305             :    * false is immediately returned in that case.
     306             :    */
     307             :   MOZ_MUST_USE
     308             :   bool CheckChar(bool (*aClassifier)(const char aChar));
     309             :   /**
     310             :    * Check for a whole expected word.
     311             :    */
     312             :   MOZ_MUST_USE
     313             :   bool CheckWord(const nsACString& aWord) { return Check(Token::Word(aWord)); }
     314             :   /**
     315             :    * Shortcut for literal const word check with compile time length calculation.
     316             :    */
     317             :   template <uint32_t N>
     318             :   MOZ_MUST_USE
     319           0 :   bool CheckWord(const char (&aWord)[N]) { return Check(Token::Word(nsDependentCString(aWord, N - 1))); }
     320             :   /**
     321             :    * Checks \r, \n or \r\n.
     322             :    */
     323             :   MOZ_MUST_USE
     324           0 :   bool CheckEOL() { return Check(Token::NewLine()); }
     325             :   /**
      326             :    * Checks whether we are at the end of the input string.  If so, shifts past the end
      327             :    * and returns true.  Otherwise does nothing and returns false.
     328             :    */
     329             :   MOZ_MUST_USE
     330          19 :   bool CheckEOF() { return Check(Token::EndOfFile()); }
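                      : 
                      :   // Usage sketch (illustrative only) of the Check* shortcuts; '-' is passed as
                      :   // an additional word character so "Content-Length" parses as a single word.
                      :   //
                      :   //   Tokenizer p("Content-Length: 42", nullptr, "-");
                      :   //   uint32_t length = 0;
                      :   //   if (p.CheckWord("Content-Length") && p.CheckChar(':')) {
                      :   //     p.SkipWhites();
                      :   //     if (p.ReadInteger(&length) && p.CheckEOF()) {
                      :   //       // length == 42
                      :   //     }
                      :   //   }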
     331             : 
     332             :   /**
     333             :    * These are shortcuts to obtain the value immediately when the token type matches.
     334             :    */
     335             :   MOZ_MUST_USE bool ReadChar(char* aValue);
     336             :   MOZ_MUST_USE bool ReadChar(bool (*aClassifier)(const char aChar),
     337             :                              char* aValue);
     338             :   MOZ_MUST_USE bool ReadWord(nsACString& aValue);
     339             :   MOZ_MUST_USE bool ReadWord(nsDependentCSubstring& aValue);
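                      : 
                      :   // Usage sketch (illustrative only):
                      :   //
                      :   //   Tokenizer p("key='+'");
                      :   //   nsAutoCString key;
                      :   //   char inner;
                      :   //   if (p.ReadWord(key) &&          // key == "key"
                      :   //       p.CheckChar('=') &&
                      :   //       p.CheckChar('\'') &&
                      :   //       p.ReadChar(&inner) &&        // inner == '+'
                      :   //       p.CheckChar('\'')) {
                      :   //     // the whole input has been consumed
                      :   //   }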
     340             : 
     341             :   /**
     342             :    * This is an integer read helper.  It returns false and doesn't move the read
     343             :    * cursor when any of the following happens:
     344             :    *  - the token at the read cursor is not an integer
     345             :    *  - the final number doesn't fit the T type
     346             :    * Otherwise true is returned, aValue is filled with the integral number
     347             :    * and the cursor is moved forward.
     348             :    */
     349             :   template <typename T>
     350          16 :   MOZ_MUST_USE bool ReadInteger(T* aValue)
     351             :   {
     352          16 :     MOZ_RELEASE_ASSERT(aValue);
     353             : 
     354          16 :     nsACString::const_char_iterator rollback = mRollback;
     355          16 :     nsACString::const_char_iterator cursor = mCursor;
     356          32 :     Token t;
     357          16 :     if (!Check(TOKEN_INTEGER, t)) {
     358          16 :       return false;
     359             :     }
     360             : 
     361           0 :     mozilla::CheckedInt<T> checked(t.AsInteger());
     362           0 :     if (!checked.isValid()) {
     363             :       // Move to a state as if Check() call has failed
     364           0 :       mRollback = rollback;
     365           0 :       mCursor = cursor;
     366           0 :       mHasFailed = true;
     367           0 :       return false;
     368             :     }
     369             : 
     370           0 :     *aValue = checked.value();
     371           0 :     return true;
     372             :   }
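                      : 
                      :   // Usage sketch (illustrative only): an out-of-range number leaves the read
                      :   // cursor where it was, so the value can be re-read into a wider type.
                      :   //
                      :   //   Tokenizer p("300");
                      :   //   uint8_t narrow = 0;
                      :   //   if (!p.ReadInteger(&narrow)) {   // 300 does not fit into uint8_t
                      :   //     uint32_t wide = 0;
                      :   //     if (p.ReadInteger(&wide)) {
                      :   //       // wide == 300
                      :   //     }
                      :   //   }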
     373             : 
     374             :   /**
      375             :    * Moves the read cursor back to where it was before the last call to any parsing
      376             :    * method of Tokenizer (Next, Check*, Skip*, Read*) so that the last operation
      377             :    * can be repeated.
      378             :    * Rollback cannot be used multiple times; it only reverts the last successful parse
      379             :    * operation.  It also cannot be used before any parsing operation has been called
     380             :    * on the Tokenizer.
     381             :    */
     382             :   void Rollback();
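                      : 
                      :   // Usage sketch (illustrative only): peeking at the next token.
                      :   //
                      :   //   Tokenizer p("42 items");
                      :   //   Tokenizer::Token t;
                      :   //   if (p.Next(t)) {
                      :   //     // Inspect t here, then put it back so the next parsing call starts
                      :   //     // at the same token again.
                      :   //     p.Rollback();
                      :   //   }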
     383             : 
     384             :   /**
      385             :    * Record() and Claim() collect the input as it is being parsed, to obtain
      386             :    * a substring between particular syntax boundaries defined by whatever recursive
      387             :    * descent parser or simple parser the Tokenizer is used to read the input for.
      388             :    * Inclusion of the token that has just been parsed can be controlled using an argument.
     389             :    */
     390             :   enum ClaimInclusion {
     391             :     /**
     392             :      * Include resulting (or passed) token of the last lexical analyzer operation in the result.
     393             :      */
     394             :     INCLUDE_LAST,
     395             :     /**
     396             :      * Do not include it.
     397             :      */
     398             :     EXCLUDE_LAST
     399             :   };
     400             : 
     401             :   /**
      402             :    * Start the process of recording.  Based on the aInclude value, the beginning of the recorded
      403             :    * sub-string is at the current position (EXCLUDE_LAST) or at the position before the last
      404             :    * parsed token (INCLUDE_LAST).
     405             :    */
     406             :   void Record(ClaimInclusion aInclude = EXCLUDE_LAST);
     407             :   /**
      408             :    * Claim the result of the recording started with a prior Record() call.  Depending on aInclude,
      409             :    * the ending of the resulting sub-string includes or excludes the last parsed or checked
      410             :    * token.
     411             :    */
     412             :   void Claim(nsACString& aResult, ClaimInclusion aInclude = EXCLUDE_LAST);
     413             :   void Claim(nsDependentCSubstring& aResult, ClaimInclusion aInclude = EXCLUDE_LAST);
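                      : 
                      :   // Usage sketch (illustrative only): capturing the text between two syntax
                      :   // boundaries with Record()/Claim().
                      :   //
                      :   //   Tokenizer p("fn(arg)");
                      :   //   nsAutoCString inner, word;
                      :   //   if (p.CheckWord("fn") && p.CheckChar('(')) {
                      :   //     p.Record();                  // recording starts right after '('
                      :   //     if (p.ReadWord(word) && p.CheckChar(')')) {
                      :   //       p.Claim(inner);            // EXCLUDE_LAST: inner == "arg", ')' excluded
                      :   //     }
                      :   //   }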
     414             : 
     415             :   /**
     416             :    * If aToken is found, aResult is set to the substring between the current
     417             :    * position and the position of aToken, potentially including aToken depending
     418             :    * on aInclude.
     419             :    * If aToken isn't found aResult is set to the substring between the current
     420             :    * position and the end of the string.
     421             :    * If aToken is found, the method returns true. Otherwise it returns false.
     422             :    *
     423             :    * Calling Rollback() after ReadUntil() will return the read cursor to the
     424             :    * position it had before ReadUntil was called.
     425             :    */
     426             :   MOZ_MUST_USE bool ReadUntil(Token const& aToken, nsDependentCSubstring& aResult,
     427             :                               ClaimInclusion aInclude = EXCLUDE_LAST);
     428             :   MOZ_MUST_USE bool ReadUntil(Token const& aToken, nsACString& aResult,
     429             :                               ClaimInclusion aInclude = EXCLUDE_LAST);
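                      : 
                      :   // Usage sketch (illustrative only): splitting on a delimiter.
                      :   //
                      :   //   Tokenizer p("one,two");
                      :   //   nsAutoCString first;
                      :   //   if (p.ReadUntil(Tokenizer::Token::Char(','), first)) {
                      :   //     // first == "one"; pass INCLUDE_LAST to also include the ',' itself.
                      :   //   }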
     430             : 
     431             : protected:
     432             :   // All these point to the original buffer passed to the Tokenizer's constructor
     433             :   nsACString::const_char_iterator mRecord; // Position where the recorded sub-string for Claim() is
     434             :   nsACString::const_char_iterator mRollback; // Position of the previous token start
     435             : 
     436             : private:
     437             :   Tokenizer() = delete;
     438             :   Tokenizer(const Tokenizer&) = delete;
     439             :   Tokenizer(Tokenizer&&) = delete;
     440             :   Tokenizer(const Tokenizer&&) = delete;
     441             :   Tokenizer &operator=(const Tokenizer&) = delete;
     442             : };
     443             : 
     444             : } // mozilla
     445             : 
     446             : #endif // Tokenizer_h__

Generated by: LCOV version 1.13