LCOV - code coverage report
Current view: top level - toolkit/components/protobuf/src/google/protobuf/io - tokenizer.h (source / functions)
Test: output.info
Date: 2017-07-14 16:53:18
Coverage:  Lines:     0 hit / 17 total  (0.0 %)
           Functions: 0 hit / 11 total  (0.0 %)

          Line data    Source code
       1             : // Protocol Buffers - Google's data interchange format
       2             : // Copyright 2008 Google Inc.  All rights reserved.
       3             : // https://developers.google.com/protocol-buffers/
       4             : //
       5             : // Redistribution and use in source and binary forms, with or without
       6             : // modification, are permitted provided that the following conditions are
       7             : // met:
       8             : //
       9             : //     * Redistributions of source code must retain the above copyright
      10             : // notice, this list of conditions and the following disclaimer.
      11             : //     * Redistributions in binary form must reproduce the above
      12             : // copyright notice, this list of conditions and the following disclaimer
      13             : // in the documentation and/or other materials provided with the
      14             : // distribution.
      15             : //     * Neither the name of Google Inc. nor the names of its
      16             : // contributors may be used to endorse or promote products derived from
      17             : // this software without specific prior written permission.
      18             : //
      19             : // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
      20             : // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
      21             : // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
      22             : // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
      23             : // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
      24             : // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
      25             : // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
      26             : // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
      27             : // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
      28             : // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
      29             : // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      30             : 
      31             : // Author: kenton@google.com (Kenton Varda)
      32             : //  Based on original Protocol Buffers design by
      33             : //  Sanjay Ghemawat, Jeff Dean, and others.
      34             : //
      35             : // Class for parsing tokenized text from a ZeroCopyInputStream.
      36             : 
      37             : #ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
      38             : #define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
      39             : 
      40             : #include <string>
      41             : #include <vector>
      42             : #include <google/protobuf/stubs/common.h>
      43             : 
      44             : namespace google {
      45             : namespace protobuf {
      46             : namespace io {
      47             : 
      48             : class ZeroCopyInputStream;     // zero_copy_stream.h
      49             : 
      50             : // Defined in this file.
      51             : class ErrorCollector;
      52             : class Tokenizer;
      53             : 
      54             : // Abstract interface for an object which collects the errors that occur
      55             : // during parsing.  A typical implementation might simply print the errors
      56             : // to stdout.
      57             : class LIBPROTOBUF_EXPORT ErrorCollector {
      58             :  public:
      59           0 :   inline ErrorCollector() {}
      60             :   virtual ~ErrorCollector();
      61             : 
      62             :   // Indicates that there was an error in the input at the given line and
      63             :   // column numbers.  The numbers are zero-based, so you may want to add
      64             :   // 1 to each before printing them.
      65             :   virtual void AddError(int line, int column, const string& message) = 0;
      66             : 
      67             :   // Indicates that there was a warning in the input at the given line and
      68             :   // column numbers.  The numbers are zero-based, so you may want to add
      69             :   // 1 to each before printing them.
      70           0 :   virtual void AddWarning(int /* line */, int /* column */,
      71           0 :                           const string& /* message */) { }
      72             : 
      73             :  private:
      74             :   GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
      75             : };
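// A minimal sketch (not part of the original header) of a concrete
// ErrorCollector that prints diagnostics, as the comment above suggests.
// "StderrErrorCollector" is an illustrative name, not a protobuf API, and it
// assumes <cstdio> is available for fprintf.  Note the +1 on line/column:
// the reported numbers are zero-based.
class StderrErrorCollector : public ErrorCollector {
 public:
  virtual void AddError(int line, int column, const string& message) {
    fprintf(stderr, "%d:%d: error: %s\n", line + 1, column + 1,
            message.c_str());
  }
  virtual void AddWarning(int line, int column, const string& message) {
    fprintf(stderr, "%d:%d: warning: %s\n", line + 1, column + 1,
            message.c_str());
  }
};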
      76             : 
      77             : // This class converts a stream of raw text into a stream of tokens for
      78             : // the protocol definition parser to parse.  The tokens recognized are
      79             : // similar to those that make up the C language; see the TokenType enum for
      80             : // precise descriptions.  Whitespace and comments are skipped.  By default,
      81             : // C- and C++-style comments are recognized, but other styles can be used by
      82             : // calling set_comment_style().
      83             : class LIBPROTOBUF_EXPORT Tokenizer {
      84             :  public:
      85             :   // Construct a Tokenizer that reads and tokenizes text from the given
      86             :   // input stream and writes errors to the given error_collector.
      87             :   // The caller keeps ownership of input and error_collector.
      88             :   Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
      89             :   ~Tokenizer();
      90             : 
      91             :   enum TokenType {
      92             :     TYPE_START,       // Next() has not yet been called.
      93             :     TYPE_END,         // End of input reached.  "text" is empty.
      94             : 
      95             :     TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
      96             :                       // starting with a digit.  It is an error for a number
      97             :                       // to be followed by an identifier with no space in
      98             :                       // between.
      99             :     TYPE_INTEGER,     // A sequence of digits representing an integer.  Normally
     100             :                       // the digits are decimal, but a prefix of "0x" indicates
     101             :                       // a hex number and a leading zero indicates octal, just
     102             :                       // like with C numeric literals.  A leading negative sign
     103             :                       // is NOT included in the token; it's up to the parser to
     104             :                       // interpret the unary minus operator on its own.
     105             :     TYPE_FLOAT,       // A floating point literal, with a fractional part and/or
     106             :                       // an exponent.  Always in decimal.  Again, never
     107             :                       // negative.
     108             :     TYPE_STRING,      // A quoted sequence of escaped characters.  Either single
     109             :                       // or double quotes can be used, but they must match.
     110             :                       // A string literal cannot cross a line break.
     111             :     TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
     112             :                       // Symbols are always a single character, so "!+$%" is
     113             :                       // four tokens.
     114             :   };
     115             : 
     116             :   // Structure representing a token read from the token stream.
     117           0 :   struct Token {
     118             :     TokenType type;
     119             :     string text;       // The exact text of the token as it appeared in
     120             :                        // the input.  e.g. tokens of TYPE_STRING will still
     121             :                        // be escaped and in quotes.
     122             : 
     123             :     // "line" and "column" specify the position of the first character of
     124             :     // the token within the input stream.  They are zero-based.
     125             :     int line;
     126             :     int column;
     127             :     int end_column;
     128             :   };
     129             : 
     130             :   // Get the current token.  This is updated when Next() is called.  Before
     131             :   // the first call to Next(), current() has type TYPE_START and no contents.
     132             :   const Token& current();
     133             : 
     134             :   // Return the previous token -- i.e. what current() returned before the
     135             :   // previous call to Next().
     136             :   const Token& previous();
     137             : 
     138             :   // Advance to the next token.  Returns false if the end of the input is
     139             :   // reached.
     140             :   bool Next();
     141             : 
     142             :   // Like Next(), but also collects comments which appear between the previous
     143             :   // and next tokens.
     144             :   //
     145             :   // Comments which appear to be attached to the previous token are stored
      146             :   // in *prev_trailing_comments.  Comments which appear to be attached to the
     147             :   // next token are stored in *next_leading_comments.  Comments appearing in
     148             :   // between which do not appear to be attached to either will be added to
     149             :   // detached_comments.  Any of these parameters can be NULL to simply discard
     150             :   // the comments.
     151             :   //
     152             :   // A series of line comments appearing on consecutive lines, with no other
     153             :   // tokens appearing on those lines, will be treated as a single comment.
     154             :   //
     155             :   // Only the comment content is returned; comment markers (e.g. //) are
     156             :   // stripped out.  For block comments, leading whitespace and an asterisk will
     157             :   // be stripped from the beginning of each line other than the first.  Newlines
     158             :   // are included in the output.
     159             :   //
     160             :   // Examples:
     161             :   //
     162             :   //   optional int32 foo = 1;  // Comment attached to foo.
     163             :   //   // Comment attached to bar.
     164             :   //   optional int32 bar = 2;
     165             :   //
     166             :   //   optional string baz = 3;
     167             :   //   // Comment attached to baz.
     168             :   //   // Another line attached to baz.
     169             :   //
     170             :   //   // Comment attached to qux.
     171             :   //   //
     172             :   //   // Another line attached to qux.
     173             :   //   optional double qux = 4;
     174             :   //
     175             :   //   // Detached comment.  This is not attached to qux or corge
     176             :   //   // because there are blank lines separating it from both.
     177             :   //
     178             :   //   optional string corge = 5;
     179             :   //   /* Block comment attached
     180             :   //    * to corge.  Leading asterisks
     181             :   //    * will be removed. */
     182             :   //   /* Block comment attached to
     183             :   //    * grault. */
     184             :   //   optional int32 grault = 6;
     185             :   bool NextWithComments(string* prev_trailing_comments,
     186             :                         vector<string>* detached_comments,
     187             :                         string* next_leading_comments);
     188             : 
     189             :   // Parse helpers ---------------------------------------------------
     190             : 
     191             :   // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
     192             :   // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
     193             :   // result is undefined (possibly an assert failure).
     194             :   static double ParseFloat(const string& text);
     195             : 
     196             :   // Parses a TYPE_STRING token.  This never fails, so long as the text actually
     197             :   // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
     198             :   // result is undefined (possibly an assert failure).
     199             :   static void ParseString(const string& text, string* output);
     200             : 
     201             :   // Identical to ParseString, but appends to output.
     202             :   static void ParseStringAppend(const string& text, string* output);
     203             : 
     204             :   // Parses a TYPE_INTEGER token.  Returns false if the result would be
     205             :   // greater than max_value.  Otherwise, returns true and sets *output to the
     206             :   // result.  If the text is not from a Token of type TYPE_INTEGER originally
     207             :   // parsed by a Tokenizer, the result is undefined (possibly an assert
     208             :   // failure).
     209             :   static bool ParseInteger(const string& text, uint64 max_value,
     210             :                            uint64* output);
     211             : 
     212             :   // Options ---------------------------------------------------------
     213             : 
     214             :   // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
     215             :   // which would otherwise be integers but which have the 'f' suffix will be
     216             :   // forced to be interpreted as floats.  For all other purposes, the 'f' is
     217             :   // ignored.
     218           0 :   void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
     219             : 
     220             :   // Valid values for set_comment_style().
     221             :   enum CommentStyle {
     222             :     // Line comments begin with "//", block comments are delimited by "/*" and
     223             :     // "*/".
     224             :     CPP_COMMENT_STYLE,
     225             :     // Line comments begin with "#".  No way to write block comments.
     226             :     SH_COMMENT_STYLE
     227             :   };
     228             : 
     229             :   // Sets the comment style.
     230           0 :   void set_comment_style(CommentStyle style) { comment_style_ = style; }
     231             : 
     232             :   // Whether to require whitespace between a number and a field name.
     233             :   // Default is true. Do not use this; for Google-internal cleanup only.
     234           0 :   void set_require_space_after_number(bool require) {
     235           0 :     require_space_after_number_ = require;
     236           0 :   }
     237             : 
     238             :   // Whether to allow string literals to span multiple lines. Default is false.
     239             :   // Do not use this; for Google-internal cleanup only.
     240           0 :   void set_allow_multiline_strings(bool allow) {
     241           0 :     allow_multiline_strings_ = allow;
     242           0 :   }
     243             : 
     244             :   // External helper: validate an identifier.
     245             :   static bool IsIdentifier(const string& text);
     246             : 
     247             :   // -----------------------------------------------------------------
     248             :  private:
     249             :   GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);
     250             : 
     251             :   Token current_;           // Returned by current().
     252             :   Token previous_;          // Returned by previous().
     253             : 
     254             :   ZeroCopyInputStream* input_;
     255             :   ErrorCollector* error_collector_;
     256             : 
     257             :   char current_char_;       // == buffer_[buffer_pos_], updated by NextChar().
     258             :   const char* buffer_;      // Current buffer returned from input_.
     259             :   int buffer_size_;         // Size of buffer_.
     260             :   int buffer_pos_;          // Current position within the buffer.
     261             :   bool read_error_;         // Did we previously encounter a read error?
     262             : 
     263             :   // Line and column number of current_char_ within the whole input stream.
     264             :   int line_;
     265             :   int column_;
     266             : 
     267             :   // String to which text should be appended as we advance through it.
     268             :   // Call RecordTo(&str) to start recording and StopRecording() to stop.
     269             :   // E.g. StartToken() calls RecordTo(&current_.text).  record_start_ is the
     270             :   // position within the current buffer where recording started.
     271             :   string* record_target_;
     272             :   int record_start_;
     273             : 
     274             :   // Options.
     275             :   bool allow_f_after_float_;
     276             :   CommentStyle comment_style_;
     277             :   bool require_space_after_number_;
     278             :   bool allow_multiline_strings_;
     279             : 
     280             :   // Since we count columns we need to interpret tabs somehow.  We'll take
     281             :   // the standard 8-character definition for lack of any way to do better.
     282             :   static const int kTabWidth = 8;
     283             : 
     284             :   // -----------------------------------------------------------------
     285             :   // Helper methods.
     286             : 
     287             :   // Consume this character and advance to the next one.
     288             :   void NextChar();
     289             : 
     290             :   // Read a new buffer from the input.
     291             :   void Refresh();
     292             : 
     293             :   inline void RecordTo(string* target);
     294             :   inline void StopRecording();
     295             : 
     296             :   // Called when the current character is the first character of a new
     297             :   // token (not including whitespace or comments).
     298             :   inline void StartToken();
     299             :   // Called when the current character is the first character after the
     300             :   // end of the last token.  After this returns, current_.text will
     301             :   // contain all text consumed since StartToken() was called.
     302             :   inline void EndToken();
     303             : 
     304             :   // Convenience method to add an error at the current line and column.
     305           0 :   void AddError(const string& message) {
     306           0 :     error_collector_->AddError(line_, column_, message);
     307           0 :   }
     308             : 
     309             :   // -----------------------------------------------------------------
     310             :   // The following four methods are used to consume tokens of specific
     311             :   // types.  They are actually used to consume all characters *after*
     312             :   // the first, since the calling function consumes the first character
     313             :   // in order to decide what kind of token is being read.
     314             : 
     315             :   // Read and consume a string, ending when the given delimiter is
     316             :   // consumed.
     317             :   void ConsumeString(char delimiter);
     318             : 
     319             :   // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
     320             :   // depending on what was read.  This needs to know if the first
     321             :   // character was a zero in order to correctly recognize hex and octal
     322             :   // numbers.
      323             :   // It also needs to know if the first character was a '.' to parse floating
     324             :   // point correctly.
     325             :   TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
     326             : 
     327             :   // Consume the rest of a line.
     328             :   void ConsumeLineComment(string* content);
     329             :   // Consume until "*/".
     330             :   void ConsumeBlockComment(string* content);
     331             : 
     332             :   enum NextCommentStatus {
     333             :     // Started a line comment.
     334             :     LINE_COMMENT,
     335             : 
     336             :     // Started a block comment.
     337             :     BLOCK_COMMENT,
     338             : 
     339             :     // Consumed a slash, then realized it wasn't a comment.  current_ has
     340             :     // been filled in with a slash token.  The caller should return it.
     341             :     SLASH_NOT_COMMENT,
     342             : 
     343             :     // We do not appear to be starting a comment here.
     344             :     NO_COMMENT
     345             :   };
     346             : 
     347             :   // If we're at the start of a new comment, consume it and return what kind
     348             :   // of comment it is.
     349             :   NextCommentStatus TryConsumeCommentStart();
     350             : 
     351             :   // -----------------------------------------------------------------
     352             :   // These helper methods make the parsing code more readable.  The
      353             :   // "character classes" referred to are defined at the top of the .cc file.
     354             :   // Basically it is a C++ class with one method:
     355             :   //   static bool InClass(char c);
     356             :   // The method returns true if c is a member of this "class", like "Letter"
     357             :   // or "Digit".
     358             : 
     359             :   // Returns true if the current character is of the given character
     360             :   // class, but does not consume anything.
     361             :   template<typename CharacterClass>
     362             :   inline bool LookingAt();
     363             : 
     364             :   // If the current character is in the given class, consume it and return
     365             :   // true.  Otherwise return false.
     366             :   // e.g. TryConsumeOne<Letter>()
     367             :   template<typename CharacterClass>
     368             :   inline bool TryConsumeOne();
     369             : 
     370             :   // Like above, but try to consume the specific character indicated.
     371             :   inline bool TryConsume(char c);
     372             : 
     373             :   // Consume zero or more of the given character class.
     374             :   template<typename CharacterClass>
     375             :   inline void ConsumeZeroOrMore();
     376             : 
     377             :   // Consume one or more of the given character class or log the given
     378             :   // error message.
     379             :   // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
     380             :   template<typename CharacterClass>
     381             :   inline void ConsumeOneOrMore(const char* error);
     382             : };
     383             : 
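// A minimal usage sketch (not part of the original header): construct a
// Tokenizer and drive it with Next() until the input is exhausted, using the
// static parse helpers on string and integer tokens.  It assumes some
// concrete ZeroCopyInputStream (e.g. ArrayInputStream from
// zero_copy_stream_impl_lite.h) plus an ErrorCollector subclass such as the
// one sketched earlier; "TokenizeAll" is a hypothetical name.
inline void TokenizeAll(ZeroCopyInputStream* input, ErrorCollector* errors) {
  Tokenizer tokenizer(input, errors);
  while (tokenizer.Next()) {            // returns false at end of input
    const Tokenizer::Token& token = tokenizer.current();
    if (token.type == Tokenizer::TYPE_STRING) {
      // token.text is still quoted and escaped; ParseString unescapes it.
      string unescaped;
      Tokenizer::ParseString(token.text, &unescaped);
    } else if (token.type == Tokenizer::TYPE_INTEGER) {
      uint64 value = 0;
      // Returns false only if the literal exceeds the given maximum.
      Tokenizer::ParseInteger(token.text, static_cast<uint64>(-1), &value);
    }
  }
}
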
     384             : // inline methods ====================================================
     385           0 : inline const Tokenizer::Token& Tokenizer::current() {
     386           0 :   return current_;
     387             : }
     388             : 
     389             : inline const Tokenizer::Token& Tokenizer::previous() {
     390             :   return previous_;
     391             : }
     392             : 
     393             : inline void Tokenizer::ParseString(const string& text, string* output) {
     394             :   output->clear();
     395             :   ParseStringAppend(text, output);
     396             : }
     397             : 
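// A sketch (not part of the original header) of NextWithComments(), which is
// documented in detail inside the class above: advance one token while
// collecting the comments around it.  "AdvanceAndCollectComments" is a
// hypothetical name.
inline bool AdvanceAndCollectComments(Tokenizer* tokenizer) {
  string trailing;            // comments attached to the previous token
  vector<string> detached;    // comments attached to neither token
  string leading;             // comments attached to the upcoming token
  bool has_more =
      tokenizer->NextWithComments(&trailing, &detached, &leading);
  // Comment markers ("//", "/* */") have already been stripped; any of the
  // three outputs may be empty, and NULL may be passed to discard a category.
  return has_more;
}
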
     398             : }  // namespace io
     399             : }  // namespace protobuf
     400             : 
     401             : }  // namespace google
     402             : #endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__

Generated by: LCOV version 1.13