/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef Tokenizer_h__
#define Tokenizer_h__

#include "nsString.h"
#include "mozilla/CheckedInt.h"
#include "mozilla/UniquePtr.h"
#include "nsTArray.h"

namespace mozilla {

class TokenizerBase
{
public:
  /**
   * The analyzer cuts the input into a sequence of tokens, where each token
   * has an elementary type.
   */
  enum TokenType : uint32_t
  {
    TOKEN_UNKNOWN,
    TOKEN_RAW,
    TOKEN_ERROR,
    TOKEN_INTEGER,
    TOKEN_WORD,
    TOKEN_CHAR,
    TOKEN_WS,
    TOKEN_EOL,
    TOKEN_EOF,
    TOKEN_CUSTOM0 = 1000
  };

  enum ECaseSensitivity
  {
    CASE_SENSITIVE,
    CASE_INSENSITIVE
  };

  /**
   * Class holding the type and the value of a token. It can be created
   * manually to allow checks against it via methods of Tokenizer, or it is
   * returned as the result of some of the Tokenizer's methods.
   */
  class Token
  {
    TokenType mType;
    nsDependentCSubstring mWord;
    nsCString mCustom;
    char mChar;
    uint64_t mInteger;
    ECaseSensitivity mCustomCaseInsensitivity;
    bool mCustomEnabled;

    // If this token is a result of the parsing process, this member references
    // a sub-string in the input buffer. If this is an externally created
    // Token, this member is left as an empty string.
    nsDependentCSubstring mFragment;

    friend class TokenizerBase;
    void AssignFragment(nsACString::const_char_iterator begin,
                        nsACString::const_char_iterator end);

    static Token Raw();

  public:
    Token();
    Token(const Token& aOther);
    Token& operator=(const Token& aOther);

    // Static constructors of tokens by type and value
    static Token Word(const nsACString& aWord);
    static Token Char(const char aChar);
    static Token Number(const uint64_t aNumber);
    static Token Whitespace();
    static Token NewLine();
    static Token EndOfFile();
    static Token Error();

    // Compares the two tokens; the type must be identical and the value
    // of one of the tokens must be 'any' or equal.
    bool Equals(const Token& aOther) const;

    TokenType Type() const { return mType; }
    char AsChar() const;
    nsDependentCSubstring AsString() const;
    uint64_t AsInteger() const;

    nsDependentCSubstring Fragment() const { return mFragment; }
  };

  /**
   * Consumers may register a custom string that, when found in the input, is
   * considered a token and returned by Next*() and accepted by Check*()
   * methods. AddCustomToken() returns a reference to a token that can then be
   * compared using Token::Equals() against the output from Next*() or be
   * passed to Check*().
   */
  Token AddCustomToken(const nsACString& aValue, ECaseSensitivity aCaseInsensitivity, bool aEnabled = true);
  template <uint32_t N>
  Token AddCustomToken(const char(&aValue)[N], ECaseSensitivity aCaseInsensitivity, bool aEnabled = true)
  {
    return AddCustomToken(nsDependentCSubstring(aValue, N - 1), aCaseInsensitivity, aEnabled);
  }
  void RemoveCustomToken(Token& aToken);
  /**
   * Only applies to a custom type of a Token (see AddCustomToken above).
   * This turns token recognition on and off. When a custom token is disabled,
   * it is ignored as if it had never been added as a custom token.
   */
  void EnableCustomToken(Token const& aToken, bool aEnable);
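
  // Illustrative usage sketch for custom tokens; the input string and the
  // handling code are hypothetical:
  //
  //   Tokenizer p("Transfer-Encoding: chunked");
  //   Tokenizer::Token chunked = p.AddCustomToken("chunked", Tokenizer::CASE_INSENSITIVE);
  //   Tokenizer::Token t;
  //   while (p.Next(t)) {
  //     if (t.Equals(chunked)) {
  //       // the custom string has been found in the input
  //     }
  //   }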

  /**
   * Mode of tokenization.
   * FULL tokenization, the default, recognizes built-in tokens and any custom
   * tokens, if added.
   * CUSTOM_ONLY will only recognize custom tokens; the rest is seen as 'raw'.
   * This mode can be understood as a 'binary' mode.
   */
  enum class Mode
  {
    FULL,
    CUSTOM_ONLY
  };
  void SetTokenizingMode(Mode aMode);

  /**
   * Returns true iff the last Check*() call has returned false or when we've
   * read past the end of the input string.
   */
  MOZ_MUST_USE bool HasFailed() const;

protected:
  explicit TokenizerBase(const char* aWhitespaces = nullptr,
                         const char* aAdditionalWordChars = nullptr);

  // false if we have already read the EOF token.
  bool HasInput() const;
  // Main parsing function; it doesn't shift the read cursor, it just returns
  // the next token position.
  nsACString::const_char_iterator Parse(Token& aToken) const;
  // Is the read cursor at the end?
  bool IsEnd(const nsACString::const_char_iterator& caret) const;
  // True when we are at the end of the input data but it has not been marked
  // as complete yet. In that case we cannot provide a multi-char token.
  bool IsPending(const nsACString::const_char_iterator& caret) const;
  // Is the read cursor on a character that is a word start?
  bool IsWordFirst(const char aInput) const;
  // Is the read cursor on a character that is an in-word letter?
  bool IsWord(const char aInput) const;
  // Is the read cursor on a character that is a valid number (digit)?
  // TODO - support multiple radixes
  bool IsNumber(const char aInput) const;
  // Is equal to the given custom token?
  bool IsCustom(const nsACString::const_char_iterator& caret,
                const Token& aCustomToken, uint32_t* aLongest = nullptr) const;

  // Friendly helper to assign a fragment on a Token
  static void AssignFragment(Token& aToken,
                             nsACString::const_char_iterator begin,
                             nsACString::const_char_iterator end);

  // true iff we have already read the EOF token
  bool mPastEof;
  // true iff the last Check*() call has returned false, reverts to false on a
  // Rollback() call
  bool mHasFailed;
  // true if the input string is final (finished), false when we expect more
  // data yet to be fed to the tokenizer (see the IncrementalTokenizer derived
  // class).
  bool mInputFinished;
  // custom-only vs. full tokenizing mode, see the Parse() method
  Mode mMode;
  // minimal raw data chunked delivery during incremental feed
  uint32_t mMinRawDelivery;

  // Customizable list of whitespaces
  const char* mWhitespaces;
  // Additional custom word characters
  const char* mAdditionalWordChars;

  // All these point to the original buffer passed to the constructor or to the
  // incremental buffer after FeedInput.
  nsACString::const_char_iterator mCursor; // Position of the current (actually next to read) token start
  nsACString::const_char_iterator mEnd; // End of the input position

  // This is the list of tokens the user has registered with AddCustomToken()
  nsTArray<UniquePtr<Token>> mCustomTokens;
  uint32_t mNextCustomTokenID;

private:
  TokenizerBase() = delete;
  TokenizerBase(const TokenizerBase&) = delete;
  TokenizerBase(TokenizerBase&&) = delete;
  TokenizerBase(const TokenizerBase&&) = delete;
  TokenizerBase &operator=(const TokenizerBase&) = delete;
};

/**
 * This is a simple implementation of a lexical analyzer, perhaps better
 * called a tokenizer. It doesn't allow any user dictionaries or
 * user-defined token types.
 *
 * It is limited to ASCII input for now. UTF-8 or any other input
 * encoding has yet to be implemented.
 */
class Tokenizer : public TokenizerBase
{
public:
  /**
   * @param aSource
   *    The string to parse.
   *    IMPORTANT NOTE: Tokenizer doesn't ensure the input string buffer lifetime.
   *    It's up to the consumer to make sure the string's buffer outlives the Tokenizer!
   * @param aWhitespaces
   *    If non-null, Tokenizer will use this custom set of whitespaces for CheckWhite()
   *    and SkipWhites() calls.
   *    By default the list consists of space and tab.
   * @param aAdditionalWordChars
   *    If non-null, it will be added to the list of characters that constitute a word.
   *    This is useful when you want to accept e.g. '-' in HTTP headers.
   *    By default a word character is considered any character for which upper case
   *    is different from lower case.
   *
   * If there is an overlap between aWhitespaces and aAdditionalWordChars, the check for
   * word characters is made first.
   */
  explicit Tokenizer(const nsACString& aSource,
                     const char* aWhitespaces = nullptr,
                     const char* aAdditionalWordChars = nullptr);
  explicit Tokenizer(const char* aSource,
                     const char* aWhitespaces = nullptr,
                     const char* aAdditionalWordChars = nullptr);
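
  // Illustrative usage sketch of the typical token loop; the input string and
  // the handling code are hypothetical:
  //
  //   Tokenizer p("Lorem ipsum dolor 42");
  //   Tokenizer::Token t;
  //   while (p.Next(t)) {
  //     switch (t.Type()) {
  //       case Tokenizer::TOKEN_WORD:
  //         // t.AsString() holds the word
  //         break;
  //       case Tokenizer::TOKEN_INTEGER:
  //         // t.AsInteger() holds the number
  //         break;
  //       default:
  //         break;
  //     }
  //   }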

  /**
   * When there is still anything to read from the input, tokenize it, store
   * the token type and value to the aToken result and shift the cursor past
   * the just-parsed token. Each call to Next() reads another token from the
   * input and shifts the cursor.
   * Returns false if we have passed the end of the input.
   */
  MOZ_MUST_USE
  bool Next(Token& aToken);

  /**
   * Parse the token on the input read cursor position, check its type is equal to aTokenType
   * and if so, put it into aResult, shift the cursor and return true. Otherwise, leave
   * the input read cursor position intact and return false.
   */
  MOZ_MUST_USE
  bool Check(const TokenType aTokenType, Token& aResult);
  /**
   * Same as the above method, just compares both the token type and the token value
   * passed in aToken. When both the type and the value are equal, shifts the cursor
   * and returns true. Otherwise returns false.
   */
  MOZ_MUST_USE
  bool Check(const Token& aToken);

  /**
   * The SkipWhites method (below) may also skip new line characters automatically.
   */
  enum WhiteSkipping {
    /**
     * SkipWhites will only skip what is defined as a white space (default).
     */
    DONT_INCLUDE_NEW_LINE = 0,
    /**
     * SkipWhites will skip defined white spaces as well as new lines
     * automatically.
     */
    INCLUDE_NEW_LINE = 1
  };

  /**
   * Skips any occurrence of the whitespaces specified in the mWhitespaces member,
   * optionally also skipping new lines.
   */
  void SkipWhites(WhiteSkipping aIncludeNewLines = DONT_INCLUDE_NEW_LINE);

  /**
   * Skips all tokens until the given one is found or EOF is hit. The token
   * or EOF is then next to read.
   */
  void SkipUntil(Token const& aToken);

  // These are mostly shortcuts for the Check() methods above.

  /**
   * Check that a whitespace character is present.
   */
  MOZ_MUST_USE
  bool CheckWhite() { return Check(Token::Whitespace()); }
  /**
   * Check that the given single character is at the read cursor position. If so, shift
   * the read cursor position and return true. Otherwise return false.
   */
  MOZ_MUST_USE
  bool CheckChar(const char aChar) { return Check(Token::Char(aChar)); }
  /**
   * This is a customizable version of CheckChar. aClassifier is a function called with
   * the value of the character at the current input read position. If this user function
   * returns true, the read cursor is shifted and true is returned. Otherwise false.
   * The user classification function is not called when we are at or past the end and
   * false is immediately returned.
   */
  MOZ_MUST_USE
  bool CheckChar(bool (*aClassifier)(const char aChar));
  /**
   * Check for a whole expected word.
   */
  MOZ_MUST_USE
  bool CheckWord(const nsACString& aWord) { return Check(Token::Word(aWord)); }
  /**
   * Shortcut for a literal const word check with compile time length calculation.
   */
  template <uint32_t N>
  MOZ_MUST_USE
  bool CheckWord(const char (&aWord)[N]) { return Check(Token::Word(nsDependentCString(aWord, N - 1))); }
  /**
   * Checks \r, \n or \r\n.
   */
  MOZ_MUST_USE
  bool CheckEOL() { return Check(Token::NewLine()); }
  /**
   * Checks we are at the end of the input string. If so, shifts past the end
   * and returns true. Otherwise does nothing and returns false.
   */
  MOZ_MUST_USE
  bool CheckEOF() { return Check(Token::EndOfFile()); }
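
  // Illustrative sketch chaining the Check*() shortcuts to accept a line such
  // as "Content-Length: 5"; the input is hypothetical and passing "-" as
  // aAdditionalWordChars is assumed so that the header name parses as a single
  // word:
  //
  //   Tokenizer p("Content-Length: 5\r\n", nullptr, "-");
  //   uint32_t length;
  //   if (p.CheckWord("Content-Length") && p.CheckChar(':')) {
  //     p.SkipWhites();
  //     if (p.ReadInteger(&length) && p.CheckEOL()) {
  //       // length == 5
  //     }
  //   }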

  /**
   * These are shortcuts to obtain the value immediately when the token type matches.
   */
  MOZ_MUST_USE bool ReadChar(char* aValue);
  MOZ_MUST_USE bool ReadChar(bool (*aClassifier)(const char aChar),
                             char* aValue);
  MOZ_MUST_USE bool ReadWord(nsACString& aValue);
  MOZ_MUST_USE bool ReadWord(nsDependentCSubstring& aValue);

  /**
   * This is an integer read helper. It returns false and doesn't move the read
   * cursor when any of the following happens:
   *  - the token at the read cursor is not an integer
   *  - the final number doesn't fit the T type
   * Otherwise true is returned, aValue is filled with the integral number
   * and the cursor is moved forward.
   */
  template <typename T>
  MOZ_MUST_USE bool ReadInteger(T* aValue)
  {
    MOZ_RELEASE_ASSERT(aValue);

    nsACString::const_char_iterator rollback = mRollback;
    nsACString::const_char_iterator cursor = mCursor;
    Token t;
    if (!Check(TOKEN_INTEGER, t)) {
      return false;
    }

    mozilla::CheckedInt<T> checked(t.AsInteger());
    if (!checked.isValid()) {
      // Move to a state as if Check() call has failed
      mRollback = rollback;
      mCursor = cursor;
      mHasFailed = true;
      return false;
    }

    *aValue = checked.value();
    return true;
  }
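
  // Illustrative sketch (hypothetical input): a value that doesn't fit the
  // target type fails the read and leaves the cursor untouched.
  //
  //   Tokenizer p("300");
  //   uint8_t small;
  //   if (!p.ReadInteger(&small)) {
  //     // 300 overflows uint8_t; the cursor still points at the number
  //   }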

  /**
   * Returns the read cursor position back as it was before the last call of any parsing
   * method of Tokenizer (Next, Check*, Skip*, Read*) so that the last operation
   * can be repeated.
   * Rollback cannot be used multiple times; it only reverts the last successful parse
   * operation. It also cannot be used before any parsing operation has been called
   * on the Tokenizer.
   */
  void Rollback();
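
  // Illustrative sketch (hypothetical input): Rollback() lets the caller
  // re-read the token that was just consumed.
  //
  //   Tokenizer p("word");
  //   Tokenizer::Token t;
  //   if (p.Next(t)) {  // reads the "word" token and shifts the cursor
  //     p.Rollback();   // the next parsing call will read "word" again
  //   }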

  /**
   * Record() and Claim() collect the input as it is being parsed to obtain
   * a substring between particular syntax boundaries defined by any recursive
   * descent parser or simple parser the Tokenizer is used to read the input for.
   * Inclusion of a token that has just been parsed can be controlled using an argument.
   */
  enum ClaimInclusion {
    /**
     * Include the resulting (or passed) token of the last lexical analyzer operation in the result.
     */
    INCLUDE_LAST,
    /**
     * Do not include it.
     */
    EXCLUDE_LAST
  };

  /**
   * Start the process of recording. Based on the aInclude value, the beginning of the
   * recorded sub-string is at the current position (EXCLUDE_LAST) or at the position
   * before the last parsed token (INCLUDE_LAST).
   */
  void Record(ClaimInclusion aInclude = EXCLUDE_LAST);
  /**
   * Claim the result of the recording started with the Record() call before. Depending
   * on aInclude, the ending of the sub-string result includes or excludes the last
   * parsed or checked token.
   */
  void Claim(nsACString& aResult, ClaimInclusion aInclude = EXCLUDE_LAST);
  void Claim(nsDependentCSubstring& aResult, ClaimInclusion aInclude = EXCLUDE_LAST);
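
  // Illustrative sketch of recording a sub-string; the input and the parsed
  // grammar are hypothetical:
  //
  //   Tokenizer p("key=some value;");
  //   nsAutoCString key, value;
  //   if (p.ReadWord(key) && p.CheckChar('=')) {
  //     p.Record();  // recording starts right after the '='
  //     Tokenizer::Token t;
  //     while (p.Next(t) && !t.Equals(Tokenizer::Token::Char(';'))) {
  //       // walk the tokens that make up the value
  //     }
  //     p.Claim(value, Tokenizer::EXCLUDE_LAST);  // value == "some value"
  //   }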

  /**
   * If aToken is found, aResult is set to the substring between the current
   * position and the position of aToken, potentially including aToken depending
   * on aInclude.
   * If aToken isn't found, aResult is set to the substring between the current
   * position and the end of the string.
   * If aToken is found, the method returns true. Otherwise it returns false.
   *
   * Calling Rollback() after ReadUntil() will return the read cursor to the
   * position it had before ReadUntil was called.
   */
  MOZ_MUST_USE bool ReadUntil(Token const& aToken, nsDependentCSubstring& aResult,
                              ClaimInclusion aInclude = EXCLUDE_LAST);
  MOZ_MUST_USE bool ReadUntil(Token const& aToken, nsACString& aResult,
                              ClaimInclusion aInclude = EXCLUDE_LAST);
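
  // Illustrative sketch (hypothetical input):
  //
  //   Tokenizer p("name=value\r\nrest");
  //   nsAutoCString name, value;
  //   if (p.ReadUntil(Tokenizer::Token::Char('='), name) &&   // name == "name"
  //       p.ReadUntil(Tokenizer::Token::NewLine(), value)) {  // value == "value"
  //     // both delimiters were found in the input
  //   }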

protected:
  // All these point to the original buffer passed to the Tokenizer's constructor
  nsACString::const_char_iterator mRecord; // Position where the recorded sub-string for Claim() is
  nsACString::const_char_iterator mRollback; // Position of the previous token start

private:
  Tokenizer() = delete;
  Tokenizer(const Tokenizer&) = delete;
  Tokenizer(Tokenizer&&) = delete;
  Tokenizer(const Tokenizer&&) = delete;
  Tokenizer &operator=(const Tokenizer&) = delete;
};

} // mozilla

#endif // Tokenizer_h__