LCOV - code coverage report
Current view: top level - parser/html - nsHtml5StreamParser.h (source / functions) Hit Total Coverage
Test: output.info Lines: 32 39 82.1 %
Date: 2017-07-14 16:53:18 Functions: 14 18 77.8 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2             : /* This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
       5             : 
       6             : #ifndef nsHtml5StreamParser_h
       7             : #define nsHtml5StreamParser_h
       8             : 
       9             : #include "nsAutoPtr.h"
      10             : #include "nsCOMPtr.h"
      11             : #include "nsICharsetDetectionObserver.h"
      12             : #include "nsHtml5MetaScanner.h"
      13             : #include "mozilla/Encoding.h"
      14             : #include "nsHtml5TreeOpExecutor.h"
      15             : #include "nsHtml5OwningUTF16Buffer.h"
      16             : #include "nsIInputStream.h"
      17             : #include "mozilla/Mutex.h"
      18             : #include "mozilla/UniquePtr.h"
      19             : #include "nsHtml5AtomTable.h"
      20             : #include "nsHtml5Speculation.h"
      21             : #include "nsISerialEventTarget.h"
      22             : #include "nsITimer.h"
      23             : #include "nsICharsetDetector.h"
      24             : #include "mozilla/dom/DocGroup.h"
      25             : 
      26             : class nsHtml5Parser;
      27             : 
      28             : #define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024
      29             : #define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024
      30             : 
      31             : enum eParserMode {
      32             :   /**
      33             :    * Parse a document normally as HTML.
      34             :    */
      35             :   NORMAL,
      36             : 
      37             :   /**
      38             :    * View document as HTML source.
      39             :    */
      40             :   VIEW_SOURCE_HTML,
      41             : 
      42             :   /**
      43             :    * View document as XML source
      44             :    */
      45             :   VIEW_SOURCE_XML,
      46             : 
      47             :   /**
      48             :    * View document as plain text source
      49             :    */
      50             :   VIEW_SOURCE_PLAIN,
      51             : 
      52             :   /**
      53             :    * View document as plain text
      54             :    */
      55             :   PLAIN_TEXT,
      56             : 
      57             :   /**
      58             :    * Load as data (XHR)
      59             :    */
      60             :   LOAD_AS_DATA
      61             : };
      62             : 
      63             : enum eBomState {
      64             :   /**
      65             :    * BOM sniffing hasn't started.
      66             :    */
      67             :   BOM_SNIFFING_NOT_STARTED = 0,
      68             : 
      69             :   /**
       70             :    * BOM sniffing is ongoing, and the first byte of a UTF-16LE BOM has been
      71             :    * seen.
      72             :    */
      73             :   SEEN_UTF_16_LE_FIRST_BYTE = 1,
      74             : 
      75             :   /**
       76             :    * BOM sniffing is ongoing, and the first byte of a UTF-16BE BOM has been
      77             :    * seen.
      78             :    */
      79             :   SEEN_UTF_16_BE_FIRST_BYTE = 2,
      80             : 
      81             :   /**
       82             :    * BOM sniffing is ongoing, and the first byte of a UTF-8 BOM has been
      83             :    * seen.
      84             :    */
      85             :   SEEN_UTF_8_FIRST_BYTE = 3,
      86             : 
      87             :   /**
       88             :    * BOM sniffing is ongoing, and the first and second bytes of a UTF-8 BOM
      89             :    * have been seen.
      90             :    */
      91             :   SEEN_UTF_8_SECOND_BYTE = 4,
      92             : 
      93             :   /**
      94             :    * BOM sniffing was started but is now over for whatever reason.
      95             :    */
      96             :   BOM_SNIFFING_OVER = 5
      97             : };
      98             : 
      99             : enum eHtml5StreamState {
     100             :   STREAM_NOT_STARTED = 0,
     101             :   STREAM_BEING_READ = 1,
     102             :   STREAM_ENDED = 2
     103             : };
     104             : 
     105             : class nsHtml5StreamParser final : public nsICharsetDetectionObserver {
     106             :   template <typename T> using NotNull = mozilla::NotNull<T>;
     107             :   using Encoding = mozilla::Encoding;
     108             : 
     109             :   friend class nsHtml5RequestStopper;
     110             :   friend class nsHtml5DataAvailable;
     111             :   friend class nsHtml5StreamParserContinuation;
     112             :   friend class nsHtml5TimerKungFu;
     113             :   friend class nsHtml5StreamParserPtr;
     114             : 
     115             : public:
     116             :   NS_DECL_CYCLE_COLLECTING_ISUPPORTS
     117          33 :   NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser,
     118             :                                            nsICharsetDetectionObserver)
     119             : 
     120             :   static void InitializeStatics();
     121             : 
     122             :   nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
     123             :                       nsHtml5Parser* aOwner,
     124             :                       eParserMode aMode);
     125             : 
     126             :   // Methods that nsHtml5StreamListener calls
     127             :   nsresult CheckListenerChain();
     128             : 
     129             :   nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext);
     130             : 
     131             :   nsresult OnDataAvailable(nsIRequest* aRequest,
     132             :                            nsISupports* aContext,
     133             :                            nsIInputStream* aInStream,
     134             :                            uint64_t aSourceOffset,
     135             :                            uint32_t aLength);
     136             : 
     137             :   nsresult OnStopRequest(nsIRequest* aRequest,
     138             :                          nsISupports* aContext,
     139             :                          nsresult status);
     140             : 
     141             :   // nsICharsetDetectionObserver
     142             :   /**
     143             :      * Chardet calls this to report the detection result
     144             :      */
     145             :   NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf) override;
     146             : 
     147             :   // EncodingDeclarationHandler
     148             :   // https://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
     149             :   /**
     150             :      * Tree builder uses this to report a late <meta charset>
     151             :      */
     152             :   bool internalEncodingDeclaration(nsHtml5String aEncoding);
     153             : 
     154             :   // Not from an external interface
     155             : 
     156             :   /**
     157             :      *  Call this method once you've created a parser, and want to instruct it
     158             :      *  about what charset to load
     159             :      *
     160             :      *  @param   aEncoding the charset of a document
      161             :      *  @param   aSource the source of the charset
     162             :      */
     163           2 :     inline void SetDocumentCharset(NotNull<const Encoding*> aEncoding,
     164             :                                    int32_t aSource) {
     165           2 :       NS_PRECONDITION(mStreamState == STREAM_NOT_STARTED,
     166             :                       "SetDocumentCharset called too late.");
     167           2 :       NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
     168           2 :       mEncoding = aEncoding;
     169           2 :       mCharsetSource = aSource;
     170           2 :     }
     171             :     
     172           2 :     inline void SetObserver(nsIRequestObserver* aObserver) {
     173           2 :       NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
     174           2 :       mObserver = aObserver;
     175           2 :     }
     176             : 
     177             :     nsresult GetChannel(nsIChannel** aChannel);
     178             : 
     179             :     /**
     180             :      * The owner parser must call this after script execution
     181             :      * when no scripts are executing and the document.written 
     182             :      * buffer has been exhausted.
     183             :      */
     184             :     void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer, 
     185             :                               nsHtml5TreeBuilder* aTreeBuilder,
     186             :                               bool aLastWasCR);
     187             : 
     188             :     /**
     189             :      * Continues the stream parser if the charset switch failed.
     190             :      */
     191             :     void ContinueAfterFailedCharsetSwitch();
     192             : 
     193           0 :     void Terminate()
     194             :     {
     195           0 :       mozilla::MutexAutoLock autoLock(mTerminatedMutex);
     196           0 :       mTerminated = true;
     197           0 :     }
     198             :     
     199             :     void DropTimer();
     200             : 
     201             :     /**
     202             :      * Sets mEncoding and mCharsetSource appropriately for the XML View Source
     203             :      * case if aEncoding names a supported rough ASCII superset and sets
     204             :      * the mEncoding and mCharsetSource to the UTF-8 default otherwise.
     205             :      */
     206             :     void SetEncodingFromExpat(const char16_t* aEncoding);
     207             : 
     208             :     /**
     209             :      * Sets the URL for View Source title in case this parser ends up being
     210             :      * used for View Source. If aURL is a view-source: URL, takes the inner
     211             :      * URL. data: URLs are shown with an ellipsis instead of the actual data.
     212             :      */
     213             :     void SetViewSourceTitle(nsIURI* aURL);
     214             : 
     215             :   private:
     216             :     virtual ~nsHtml5StreamParser();
     217             : 
     218             : #ifdef DEBUG
     219          28 :     bool IsParserThread() {
     220          28 :       return mEventTarget->IsOnCurrentThread();
     221             :     }
     222             : #endif
     223             : 
     224             :     void MarkAsBroken(nsresult aRv);
     225             : 
     226             :     /**
     227             :      * Marks the stream parser as interrupted. If you ever add calls to this
     228             :      * method, be sure to review Uninterrupt usage very, very carefully to
     229             :      * avoid having a previous in-flight runnable cancel your Interrupt()
     230             :      * call on the other thread too soon.
     231             :      */
     232           1 :     void Interrupt()
     233             :     {
     234           2 :       mozilla::MutexAutoLock autoLock(mTerminatedMutex);
     235           1 :       mInterrupted = true;
     236           1 :     }
     237             : 
     238           1 :     void Uninterrupt()
     239             :     {
     240           1 :       NS_ASSERTION(IsParserThread(), "Wrong thread!");
     241           1 :       mTokenizerMutex.AssertCurrentThreadOwns();
     242             :       // Not acquiring mTerminatedMutex because mTokenizerMutex is already
     243             :       // held at this point and is already stronger.
     244           1 :       mInterrupted = false;      
     245           1 :     }
     246             : 
     247             :     /**
     248             :      * Flushes the tree ops from the tree builder and disarms the flush
     249             :      * timer.
     250             :      */
     251             :     void FlushTreeOpsAndDisarmTimer();
     252             : 
     253             :     void ParseAvailableData();
     254             : 
     255             :     void DoStopRequest();
     256             : 
     257             :     void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength);
     258             : 
     259             :     static nsresult CopySegmentsToParser(nsIInputStream *aInStream,
     260             :                                          void *aClosure,
     261             :                                          const char *aFromSegment,
     262             :                                          uint32_t aToOffset,
     263             :                                          uint32_t aCount,
     264             :                                          uint32_t *aWriteCount);
     265             : 
     266          19 :     bool IsTerminatedOrInterrupted()
     267             :     {
     268          38 :       mozilla::MutexAutoLock autoLock(mTerminatedMutex);
     269          38 :       return mTerminated || mInterrupted;
     270             :     }
     271             : 
     272           4 :     bool IsTerminated()
     273             :     {
     274           8 :       mozilla::MutexAutoLock autoLock(mTerminatedMutex);
     275           8 :       return mTerminated;
     276             :     }
     277             : 
     278             :     /**
     279             :      * True when there is a Unicode decoder already
     280             :      */
     281           2 :     inline bool HasDecoder()
     282             :     {
     283           2 :       return !!mUnicodeDecoder;
     284             :     }
     285             : 
     286             :     /**
     287             :      * Push bytes from network when there is no Unicode decoder yet
     288             :      */
     289             :     nsresult SniffStreamBytes(const uint8_t* aFromSegment,
     290             :                               uint32_t aCount,
     291             :                               uint32_t* aWriteCount);
     292             : 
     293             :     /**
     294             :      * Push bytes from network when there is a Unicode decoder already
     295             :      */
     296             :     nsresult WriteStreamBytes(const uint8_t* aFromSegment,
     297             :                               uint32_t aCount,
     298             :                               uint32_t* aWriteCount);
     299             : 
     300             :     /**
     301             :      * Check whether every other byte in the sniffing buffer is zero.
     302             :      */
     303             :     void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment,
     304             :                                      uint32_t aCountToSniffingLimit);
     305             : 
     306             :     /**
      307             :      * <meta charset> scan failed. Try chardet if applicable. After this, the
      308             :      * parser will have some encoding, even if only a last-resort fallback.
     309             :      *
     310             :      * @param aFromSegment The current network buffer or null if the sniffing
     311             :      *                     buffer is being flushed due to network stream ending.
     312             :      * @param aCount       The number of bytes in aFromSegment (ignored if
     313             :      *                     aFromSegment is null)
     314             :      * @param aWriteCount  Return value for how many bytes got read from the
     315             :      *                     buffer.
     316             :      * @param aCountToSniffingLimit The number of unfilled slots in
     317             :      *                              mSniffingBuffer
     318             :      */
     319             :     nsresult FinalizeSniffing(const uint8_t* aFromSegment,
     320             :                               uint32_t aCount,
     321             :                               uint32_t* aWriteCount,
     322             :                               uint32_t aCountToSniffingLimit);
     323             : 
     324             :     /**
     325             :      * Set up the Unicode decoder and write the sniffing buffer into it
     326             :      * followed by the current network buffer.
     327             :      *
     328             :      * @param aFromSegment The current network buffer or null if the sniffing
     329             :      *                     buffer is being flushed due to network stream ending.
     330             :      * @param aCount       The number of bytes in aFromSegment (ignored if
     331             :      *                     aFromSegment is null)
     332             :      * @param aWriteCount  Return value for how many bytes got read from the
     333             :      *                     buffer.
     334             :      */
     335             :     nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment,
     336             :                                                                   uint32_t aCount,
     337             :                                                                   uint32_t* aWriteCount);
     338             : 
     339             :     /**
     340             :      * Initialize the Unicode decoder, mark the BOM as the source and
     341             :      * drop the sniffer.
     342             :      *
      343             :      * @param aEncoding The encoding to set the decoder up for
      344             :      *                  (UTF-16BE, UTF-16LE or UTF-8; the BOM has
      345             :      *                  been swallowed)
     346             :      */
     347             :     nsresult SetupDecodingFromBom(NotNull<const Encoding*> aEncoding);
     348             : 
     349             :     /**
      350             :      * Become confident or resolve an encoding name to its preferred form.
      351             :      * @param aEncoding the value of an internal encoding decl. (Taken by
      352             :      *                  const reference, so it is not an out param.)
      353             :      * @return the encoding the parser needs to start using; presumably
      354             :      *         null if the parser became confident or if the encoding name
      355             :      *         did not specify a usable encoding (verify in the .cpp)
     356             :      */
     357             :     const Encoding* PreferredForInternalEncodingDecl(const nsACString& aEncoding);
     358             : 
     359             :     /**
     360             :      * Callback for mFlushTimer.
     361             :      */
     362             :     static void TimerCallback(nsITimer* aTimer, void* aClosure);
     363             : 
     364             :     /**
     365             :      * Parser thread entry point for (maybe) flushing the ops and posting
     366             :      * a flush runnable back on the main thread.
     367             :      */
     368             :     void TimerFlush();
     369             : 
     370             :     /**
     371             :      * Called when speculation fails.
     372             :      */
     373           0 :     void MaybeDisableFutureSpeculation()
     374             :     {
     375           0 :         mSpeculationFailureCount++;
     376           0 :     }
     377             : 
     378             :     /**
     379             :      * Used to check whether we're getting too many speculation failures and
     380             :      * should just stop trying.  The 100 is picked pretty randomly to be not too
     381             :      * small (so most pages are not affected) but small enough that we don't end
     382             :      * up with failed speculations over and over in pathological cases.
     383             :      */
     384           1 :     bool IsSpeculationEnabled()
     385             :     {
     386           1 :         return mSpeculationFailureCount < 100;
     387             :     }
     388             : 
     389             :     /**
     390             :      * Dispatch an event to a Quantum DOM main thread-ish thread.
     391             :      * (Not the parser thread.)
     392             :      */
     393             :     nsresult DispatchToMain(const char* aName,
     394             :                             already_AddRefed<nsIRunnable>&& aRunnable);
     395             : 
     396             :     nsCOMPtr<nsIRequest>          mRequest;
     397             :     nsCOMPtr<nsIRequestObserver>  mObserver;
     398             : 
     399             :     /**
     400             :      * The document title to use if this turns out to be a View Source parser.
     401             :      */
     402             :     nsCString                     mViewSourceTitle;
     403             : 
     404             :     /**
     405             :      * The Unicode decoder
     406             :      */
     407             :     mozilla::UniquePtr<mozilla::Decoder> mUnicodeDecoder;
     408             : 
     409             :     /**
     410             :      * The buffer for sniffing the character encoding
     411             :      */
     412             :     mozilla::UniquePtr<uint8_t[]> mSniffingBuffer;
     413             : 
     414             :     /**
     415             :      * The number of meaningful bytes in mSniffingBuffer
     416             :      */
     417             :     uint32_t                      mSniffingLength;
     418             : 
     419             :     /**
     420             :      * BOM sniffing state
     421             :      */
     422             :     eBomState                     mBomState;
     423             : 
     424             :     /**
     425             :      * <meta> prescan implementation
     426             :      */
     427             :     nsAutoPtr<nsHtml5MetaScanner> mMetaScanner;
     428             : 
     429             :     // encoding-related stuff
     430             :     /**
     431             :      * The source (confidence) of the character encoding in use
     432             :      */
     433             :     int32_t                       mCharsetSource;
     434             : 
     435             :     /**
     436             :      * The character encoding in use
     437             :      */
     438             :     NotNull<const Encoding*>      mEncoding;
     439             : 
     440             :     /**
     441             :      * Whether reparse is forbidden
     442             :      */
     443             :     bool                          mReparseForbidden;
     444             : 
     445             :     // Portable parser objects
     446             :     /**
     447             :      * The first buffer in the pending UTF-16 buffer queue
     448             :      */
     449             :     RefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer;
     450             : 
     451             :     /**
     452             :      * The last buffer in the pending UTF-16 buffer queue
     453             :      */
     454             :     nsHtml5OwningUTF16Buffer*     mLastBuffer; // weak ref; always points to
     455             :                       // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
     456             : 
     457             :     /**
     458             :      * The tree operation executor
     459             :      */
     460             :     nsHtml5TreeOpExecutor*        mExecutor;
     461             : 
     462             :     /**
     463             :      * The same as mExecutor->mDocument->mDocGroup.
     464             :      */
     465             :     RefPtr<mozilla::dom::DocGroup> mDocGroup;
     466             : 
     467             :     /**
     468             :      * The HTML5 tree builder
     469             :      */
     470             :     nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder;
     471             : 
     472             :     /**
     473             :      * The HTML5 tokenizer
     474             :      */
     475             :     nsAutoPtr<nsHtml5Tokenizer>   mTokenizer;
     476             : 
     477             :     /**
     478             :      * Makes sure the main thread can't mess the tokenizer state while it's
     479             :      * tokenizing. This mutex also protects the current speculation.
     480             :      */
     481             :     mozilla::Mutex                mTokenizerMutex;
     482             : 
     483             :     /**
     484             :      * The scoped atom table
     485             :      */
     486             :     nsHtml5AtomTable              mAtomTable;
     487             : 
     488             :     /**
     489             :      * The owner parser.
     490             :      */
     491             :     RefPtr<nsHtml5Parser>       mOwner;
     492             : 
     493             :     /**
     494             :      * Whether the last character tokenized was a carriage return (for CRLF)
     495             :      */
     496             :     bool                          mLastWasCR;
     497             : 
     498             :     /**
     499             :      * For tracking stream life cycle
     500             :      */
     501             :     eHtml5StreamState             mStreamState;
     502             :     
     503             :     /**
     504             :      * Whether we are speculating.
     505             :      */
     506             :     bool                          mSpeculating;
     507             : 
     508             :     /**
      509             :      * Whether the tokenizer has reached EOF. (Reset when the stream is rewound.)
     510             :      */
     511             :     bool                          mAtEOF;
     512             : 
     513             :     /**
     514             :      * The speculations. The mutex protects the nsTArray itself.
     515             :      * To access the queue of current speculation, mTokenizerMutex must be 
     516             :      * obtained.
     517             :      * The current speculation is the last element
     518             :      */
     519             :     nsTArray<nsAutoPtr<nsHtml5Speculation> >  mSpeculations;
     520             :     mozilla::Mutex                            mSpeculationMutex;
     521             : 
     522             :     /**
     523             :      * Number of times speculation has failed for this parser.
     524             :      */
     525             :     uint32_t                      mSpeculationFailureCount;
     526             : 
     527             :     /**
     528             :      * True to terminate early; protected by mTerminatedMutex
     529             :      */
     530             :     bool                          mTerminated;
     531             :     bool                          mInterrupted;
     532             :     mozilla::Mutex                mTerminatedMutex;
     533             :     
     534             :     /**
     535             :      * The thread this stream parser runs on.
     536             :      */
     537             :     nsCOMPtr<nsISerialEventTarget> mEventTarget;
     538             :     
     539             :     nsCOMPtr<nsIRunnable>         mExecutorFlusher;
     540             :     
     541             :     nsCOMPtr<nsIRunnable>         mLoadFlusher;
     542             : 
     543             :     /**
     544             :      * The chardet instance if chardet is enabled.
     545             :      */
     546             :     nsCOMPtr<nsICharsetDetector>  mChardet;
     547             : 
     548             :     /**
     549             :      * If false, don't push data to chardet.
     550             :      */
     551             :     bool                          mFeedChardet;
     552             : 
     553             :     /**
     554             :      * Whether the initial charset source was kCharsetFromParentFrame
     555             :      */
     556             :     bool                          mInitialEncodingWasFromParentFrame;
     557             : 
     558             :     /**
     559             :      * Timer for flushing tree ops once in a while when not speculating.
     560             :      */
     561             :     nsCOMPtr<nsITimer>            mFlushTimer;
     562             : 
     563             :     /**
     564             :      * Mutex for protecting access to mFlushTimer (but not for the two
     565             :      * mFlushTimerFoo booleans below).
     566             :      */
     567             :     mozilla::Mutex                mFlushTimerMutex;
     568             : 
     569             :     /**
     570             :      * Keeps track whether mFlushTimer has been armed. Unfortunately,
     571             :      * nsITimer doesn't enable querying this from the timer itself.
     572             :      */
     573             :     bool                          mFlushTimerArmed;
     574             : 
     575             :     /**
     576             :      * False initially and true after the timer has fired at least once.
     577             :      */
     578             :     bool                          mFlushTimerEverFired;
     579             : 
     580             :     /**
     581             :      * Whether the parser is doing a normal parse, view source or plain text.
     582             :      */
     583             :     eParserMode                   mMode;
     584             : 
     585             :     /**
     586             :      * The pref html5.flushtimer.initialdelay: Time in milliseconds between
     587             :      * the time a network buffer is seen and the timer firing when the
     588             :      * timer hasn't fired previously in this parse.
     589             :      */
     590             :     static int32_t                sTimerInitialDelay;
     591             : 
     592             :     /**
     593             :      * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between
     594             :      * the time a network buffer is seen and the timer firing when the
     595             :      * timer has already fired previously in this parse.
     596             :      */
     597             :     static int32_t                sTimerSubsequentDelay;
     598             : };
     599             : 
     600             : #endif // nsHtml5StreamParser_h

Generated by: LCOV version 1.13