Line data Source code
1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 : #ifndef nsHtml5StreamParser_h
7 : #define nsHtml5StreamParser_h
8 :
9 : #include "nsAutoPtr.h"
10 : #include "nsCOMPtr.h"
11 : #include "nsICharsetDetectionObserver.h"
12 : #include "nsHtml5MetaScanner.h"
13 : #include "mozilla/Encoding.h"
14 : #include "nsHtml5TreeOpExecutor.h"
15 : #include "nsHtml5OwningUTF16Buffer.h"
16 : #include "nsIInputStream.h"
17 : #include "mozilla/Mutex.h"
18 : #include "mozilla/UniquePtr.h"
19 : #include "nsHtml5AtomTable.h"
20 : #include "nsHtml5Speculation.h"
21 : #include "nsISerialEventTarget.h"
22 : #include "nsITimer.h"
23 : #include "nsICharsetDetector.h"
24 : #include "mozilla/dom/DocGroup.h"
25 :
26 : class nsHtml5Parser;
27 :
28 : #define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024 // Size of the buffer that nsHtml5StreamParser::mLastBuffer always points to.
29 : #define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024 // NOTE(review): presumably the capacity of mSniffingBuffer -- confirm in the .cpp.
30 :
31 : enum eParserMode { // Kind of parse being done (normal, view source, plain text, ...); stored in nsHtml5StreamParser::mMode.
32 : /**
33 : * Parse a document normally as HTML.
34 : */
35 : NORMAL,
36 :
37 : /**
38 : * View document as HTML source.
39 : */
40 : VIEW_SOURCE_HTML,
41 :
42 : /**
43 : * View document as XML source.
44 : */
45 : VIEW_SOURCE_XML,
46 :
47 : /**
48 : * View document as plain text source.
49 : */
50 : VIEW_SOURCE_PLAIN,
51 :
52 : /**
53 : * View document as plain text.
54 : */
55 : PLAIN_TEXT,
56 :
57 : /**
58 : * Load as data (XHR).
59 : */
60 : LOAD_AS_DATA
61 : };
62 :
63 : enum eBomState { // Progress of byte order mark (BOM) sniffing; stored in nsHtml5StreamParser::mBomState.
64 : /**
65 : * BOM sniffing hasn't started.
66 : */
67 : BOM_SNIFFING_NOT_STARTED = 0,
68 :
69 : /**
70 : * BOM sniffing is ongoing, and the first byte of a UTF-16LE BOM has been
71 : * seen.
72 : */
73 : SEEN_UTF_16_LE_FIRST_BYTE = 1,
74 :
75 : /**
76 : * BOM sniffing is ongoing, and the first byte of a UTF-16BE BOM has been
77 : * seen.
78 : */
79 : SEEN_UTF_16_BE_FIRST_BYTE = 2,
80 :
81 : /**
82 : * BOM sniffing is ongoing, and the first byte of a UTF-8 BOM has been
83 : * seen.
84 : */
85 : SEEN_UTF_8_FIRST_BYTE = 3,
86 :
87 : /**
88 : * BOM sniffing is ongoing, and the first and second bytes of a UTF-8 BOM
89 : * have been seen.
90 : */
91 : SEEN_UTF_8_SECOND_BYTE = 4,
92 :
93 : /**
94 : * BOM sniffing was started but is now over for whatever reason.
95 : */
96 : BOM_SNIFFING_OVER = 5
97 : };
98 :
99 : enum eHtml5StreamState { // Stream life cycle tracking; stored in nsHtml5StreamParser::mStreamState.
100 : STREAM_NOT_STARTED = 0, // SetDocumentCharset() asserts that the stream is still in this state.
101 : STREAM_BEING_READ = 1,
102 : STREAM_ENDED = 2
103 : };
104 :
105 : class nsHtml5StreamParser final : public nsICharsetDetectionObserver { // Receives stream notifications from nsHtml5StreamListener and handles encoding sniffing/decoding for the HTML5 parser.
106 : template <typename T> using NotNull = mozilla::NotNull<T>;
107 : using Encoding = mozilla::Encoding;
108 :
109 : friend class nsHtml5RequestStopper;
110 : friend class nsHtml5DataAvailable;
111 : friend class nsHtml5StreamParserContinuation;
112 : friend class nsHtml5TimerKungFu;
113 : friend class nsHtml5StreamParserPtr;
114 :
115 : public:
116 : NS_DECL_CYCLE_COLLECTING_ISUPPORTS
117 33 : NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser,
118 : nsICharsetDetectionObserver)
119 :
120 : static void InitializeStatics();
121 :
122 : nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
123 : nsHtml5Parser* aOwner,
124 : eParserMode aMode);
125 :
126 : // Methods that nsHtml5StreamListener calls
127 : nsresult CheckListenerChain();
128 :
129 : nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext);
130 :
131 : nsresult OnDataAvailable(nsIRequest* aRequest,
132 : nsISupports* aContext,
133 : nsIInputStream* aInStream,
134 : uint64_t aSourceOffset,
135 : uint32_t aLength);
136 :
137 : nsresult OnStopRequest(nsIRequest* aRequest,
138 : nsISupports* aContext,
139 : nsresult status);
140 :
141 : // nsICharsetDetectionObserver
142 : /**
143 : * Chardet calls this to report the detection result
144 : */
145 : NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf) override;
146 :
147 : // EncodingDeclarationHandler
148 : // https://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
149 : /**
150 : * Tree builder uses this to report a late <meta charset>
151 : */
152 : bool internalEncodingDeclaration(nsHtml5String aEncoding);
153 :
154 : // Not from an external interface
155 :
156 : /**
157 : * Call this method once you've created a parser, and want to instruct it
158 : * about what charset to load
159 : *
160 : * @param aEncoding the charset of a document
161 : * @param aSource the source of the charset
162 : */
163 2 : inline void SetDocumentCharset(NotNull<const Encoding*> aEncoding,
164 : int32_t aSource) {
165 2 : NS_PRECONDITION(mStreamState == STREAM_NOT_STARTED,
166 : "SetDocumentCharset called too late.");
167 2 : NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
168 2 : mEncoding = aEncoding;
169 2 : mCharsetSource = aSource;
170 2 : }
171 :
172 2 : inline void SetObserver(nsIRequestObserver* aObserver) {
173 2 : NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
174 2 : mObserver = aObserver;
175 2 : }
176 :
177 : nsresult GetChannel(nsIChannel** aChannel);
178 :
179 : /**
180 : * The owner parser must call this after script execution
181 : * when no scripts are executing and the document.written
182 : * buffer has been exhausted.
183 : */
184 : void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
185 : nsHtml5TreeBuilder* aTreeBuilder,
186 : bool aLastWasCR);
187 :
188 : /**
189 : * Continues the stream parser if the charset switch failed.
190 : */
191 : void ContinueAfterFailedCharsetSwitch();
192 :
193 0 : void Terminate()
194 : {
195 0 : mozilla::MutexAutoLock autoLock(mTerminatedMutex);
196 0 : mTerminated = true;
197 0 : }
198 :
199 : void DropTimer();
200 :
201 : /**
202 : * Sets mEncoding and mCharsetSource appropriately for the XML View Source
203 : * case if aEncoding names a supported rough ASCII superset and sets
204 : * the mEncoding and mCharsetSource to the UTF-8 default otherwise.
205 : */
206 : void SetEncodingFromExpat(const char16_t* aEncoding);
207 :
208 : /**
209 : * Sets the URL for View Source title in case this parser ends up being
210 : * used for View Source. If aURL is a view-source: URL, takes the inner
211 : * URL. data: URLs are shown with an ellipsis instead of the actual data.
212 : */
213 : void SetViewSourceTitle(nsIURI* aURL);
214 :
215 : private:
216 : virtual ~nsHtml5StreamParser();
217 :
218 : #ifdef DEBUG
219 28 : bool IsParserThread() {
220 28 : return mEventTarget->IsOnCurrentThread();
221 : }
222 : #endif
223 :
224 : void MarkAsBroken(nsresult aRv);
225 :
226 : /**
227 : * Marks the stream parser as interrupted. If you ever add calls to this
228 : * method, be sure to review Uninterrupt usage very, very carefully to
229 : * avoid having a previous in-flight runnable cancel your Interrupt()
230 : * call on the other thread too soon.
231 : */
232 1 : void Interrupt()
233 : {
234 2 : mozilla::MutexAutoLock autoLock(mTerminatedMutex);
235 1 : mInterrupted = true;
236 1 : }
237 :
238 1 : void Uninterrupt()
239 : {
240 1 : NS_ASSERTION(IsParserThread(), "Wrong thread!");
241 1 : mTokenizerMutex.AssertCurrentThreadOwns();
242 : // Not acquiring mTerminatedMutex because mTokenizerMutex is already
243 : // held at this point and is already stronger.
244 1 : mInterrupted = false;
245 1 : }
246 :
247 : /**
248 : * Flushes the tree ops from the tree builder and disarms the flush
249 : * timer.
250 : */
251 : void FlushTreeOpsAndDisarmTimer();
252 :
253 : void ParseAvailableData();
254 :
255 : void DoStopRequest();
256 :
257 : void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength);
258 :
259 : static nsresult CopySegmentsToParser(nsIInputStream *aInStream,
260 : void *aClosure,
261 : const char *aFromSegment,
262 : uint32_t aToOffset,
263 : uint32_t aCount,
264 : uint32_t *aWriteCount);
265 :
266 19 : bool IsTerminatedOrInterrupted()
267 : {
268 38 : mozilla::MutexAutoLock autoLock(mTerminatedMutex);
269 38 : return mTerminated || mInterrupted;
270 : }
271 :
272 4 : bool IsTerminated()
273 : {
274 8 : mozilla::MutexAutoLock autoLock(mTerminatedMutex);
275 8 : return mTerminated;
276 : }
277 :
278 : /**
279 : * True when there is a Unicode decoder already
280 : */
281 2 : inline bool HasDecoder()
282 : {
283 2 : return !!mUnicodeDecoder;
284 : }
285 :
286 : /**
287 : * Push bytes from network when there is no Unicode decoder yet
288 : */
289 : nsresult SniffStreamBytes(const uint8_t* aFromSegment,
290 : uint32_t aCount,
291 : uint32_t* aWriteCount);
292 :
293 : /**
294 : * Push bytes from network when there is a Unicode decoder already
295 : */
296 : nsresult WriteStreamBytes(const uint8_t* aFromSegment,
297 : uint32_t aCount,
298 : uint32_t* aWriteCount);
299 :
300 : /**
301 : * Check whether every other byte in the sniffing buffer is zero.
302 : */
303 : void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment,
304 : uint32_t aCountToSniffingLimit);
305 :
306 : /**
307 : * <meta charset> scan failed. Try chardet if applicable. After this, the
308 : * parser will have some encoding even if a last resort fallback.
309 : *
310 : * @param aFromSegment The current network buffer or null if the sniffing
311 : * buffer is being flushed due to network stream ending.
312 : * @param aCount The number of bytes in aFromSegment (ignored if
313 : * aFromSegment is null)
314 : * @param aWriteCount Return value for how many bytes got read from the
315 : * buffer.
316 : * @param aCountToSniffingLimit The number of unfilled slots in
317 : * mSniffingBuffer
318 : */
319 : nsresult FinalizeSniffing(const uint8_t* aFromSegment,
320 : uint32_t aCount,
321 : uint32_t* aWriteCount,
322 : uint32_t aCountToSniffingLimit);
323 :
324 : /**
325 : * Set up the Unicode decoder and write the sniffing buffer into it
326 : * followed by the current network buffer.
327 : *
328 : * @param aFromSegment The current network buffer or null if the sniffing
329 : * buffer is being flushed due to network stream ending.
330 : * @param aCount The number of bytes in aFromSegment (ignored if
331 : * aFromSegment is null)
332 : * @param aWriteCount Return value for how many bytes got read from the
333 : * buffer.
334 : */
335 : nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment,
336 : uint32_t aCount,
337 : uint32_t* aWriteCount);
338 :
339 : /**
340 : * Initialize the Unicode decoder, mark the BOM as the source and
341 : * drop the sniffer.
342 : *
343 : * @param aEncoding The encoding for the decoder
344 : * (UTF-16BE, UTF-16LE or UTF-8; the BOM has
345 : * been swallowed)
346 : */
347 : nsresult SetupDecodingFromBom(NotNull<const Encoding*> aEncoding);
348 :
349 : /**
350 : * Become confident or resolve an encoding name to its preferred form.
351 : * @param aEncoding the value of an internal encoding decl (read-only;
352 : * it is passed by const reference).
353 : * @return the encoding the parser needs to start using. NOTE(review):
354 : * presumably nullptr if the parser became confident or if the
355 : * encoding name did not specify a usable encoding -- confirm in .cpp
356 : */
357 : const Encoding* PreferredForInternalEncodingDecl(const nsACString& aEncoding);
358 :
359 : /**
360 : * Callback for mFlushTimer.
361 : */
362 : static void TimerCallback(nsITimer* aTimer, void* aClosure);
363 :
364 : /**
365 : * Parser thread entry point for (maybe) flushing the ops and posting
366 : * a flush runnable back on the main thread.
367 : */
368 : void TimerFlush();
369 :
370 : /**
371 : * Called when speculation fails. Increments mSpeculationFailureCount, which IsSpeculationEnabled() checks.
372 : */
373 0 : void MaybeDisableFutureSpeculation()
374 : {
375 0 : mSpeculationFailureCount++;
376 0 : }
377 :
378 : /**
379 : * Used to check whether we're getting too many speculation failures and
380 : * should just stop trying. The 100 is picked pretty randomly to be not too
381 : * small (so most pages are not affected) but small enough that we don't end
382 : * up with failed speculations over and over in pathological cases.
383 : */
384 1 : bool IsSpeculationEnabled()
385 : {
386 1 : return mSpeculationFailureCount < 100;
387 : }
388 :
389 : /**
390 : * Dispatch an event to a Quantum DOM main thread-ish thread.
391 : * (Not the parser thread.)
392 : */
393 : nsresult DispatchToMain(const char* aName,
394 : already_AddRefed<nsIRunnable>&& aRunnable);
395 :
396 : nsCOMPtr<nsIRequest> mRequest;
397 : nsCOMPtr<nsIRequestObserver> mObserver;
398 :
399 : /**
400 : * The document title to use if this turns out to be a View Source parser.
401 : */
402 : nsCString mViewSourceTitle;
403 :
404 : /**
405 : * The Unicode decoder
406 : */
407 : mozilla::UniquePtr<mozilla::Decoder> mUnicodeDecoder;
408 :
409 : /**
410 : * The buffer for sniffing the character encoding
411 : */
412 : mozilla::UniquePtr<uint8_t[]> mSniffingBuffer;
413 :
414 : /**
415 : * The number of meaningful bytes in mSniffingBuffer
416 : */
417 : uint32_t mSniffingLength;
418 :
419 : /**
420 : * BOM sniffing state
421 : */
422 : eBomState mBomState;
423 :
424 : /**
425 : * <meta> prescan implementation
426 : */
427 : nsAutoPtr<nsHtml5MetaScanner> mMetaScanner;
428 :
429 : // encoding-related stuff
430 : /**
431 : * The source (confidence) of the character encoding in use
432 : */
433 : int32_t mCharsetSource;
434 :
435 : /**
436 : * The character encoding in use
437 : */
438 : NotNull<const Encoding*> mEncoding;
439 :
440 : /**
441 : * Whether reparse is forbidden
442 : */
443 : bool mReparseForbidden;
444 :
445 : // Portable parser objects
446 : /**
447 : * The first buffer in the pending UTF-16 buffer queue
448 : */
449 : RefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer;
450 :
451 : /**
452 : * The last buffer in the pending UTF-16 buffer queue
453 : */
454 : nsHtml5OwningUTF16Buffer* mLastBuffer; // weak ref; always points to
455 : // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
456 :
457 : /**
458 : * The tree operation executor
459 : */
460 : nsHtml5TreeOpExecutor* mExecutor;
461 :
462 : /**
463 : * The same as mExecutor->mDocument->mDocGroup.
464 : */
465 : RefPtr<mozilla::dom::DocGroup> mDocGroup;
466 :
467 : /**
468 : * The HTML5 tree builder
469 : */
470 : nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder;
471 :
472 : /**
473 : * The HTML5 tokenizer
474 : */
475 : nsAutoPtr<nsHtml5Tokenizer> mTokenizer;
476 :
477 : /**
478 : * Makes sure the main thread can't mess the tokenizer state while it's
479 : * tokenizing. This mutex also protects the current speculation.
480 : */
481 : mozilla::Mutex mTokenizerMutex;
482 :
483 : /**
484 : * The scoped atom table
485 : */
486 : nsHtml5AtomTable mAtomTable;
487 :
488 : /**
489 : * The owner parser.
490 : */
491 : RefPtr<nsHtml5Parser> mOwner;
492 :
493 : /**
494 : * Whether the last character tokenized was a carriage return (for CRLF)
495 : */
496 : bool mLastWasCR;
497 :
498 : /**
499 : * For tracking stream life cycle
500 : */
501 : eHtml5StreamState mStreamState;
502 :
503 : /**
504 : * Whether we are speculating.
505 : */
506 : bool mSpeculating;
507 :
508 : /**
509 : * Whether the tokenizer has reached EOF. (Reset when stream rewound.)
510 : */
511 : bool mAtEOF;
512 :
513 : /**
514 : * The speculations. The mutex protects the nsTArray itself.
515 : * To access the queue of current speculation, mTokenizerMutex must be
516 : * obtained.
517 : * The current speculation is the last element
518 : */
519 : nsTArray<nsAutoPtr<nsHtml5Speculation> > mSpeculations;
520 : mozilla::Mutex mSpeculationMutex;
521 :
522 : /**
523 : * Number of times speculation has failed for this parser.
524 : */
525 : uint32_t mSpeculationFailureCount;
526 :
527 : /**
528 : * Early-termination and interruption flags; protected by mTerminatedMutex (Uninterrupt() relies on mTokenizerMutex instead)
529 : */
530 : bool mTerminated;
531 : bool mInterrupted;
532 : mozilla::Mutex mTerminatedMutex;
533 :
534 : /**
535 : * The thread this stream parser runs on.
536 : */
537 : nsCOMPtr<nsISerialEventTarget> mEventTarget;
538 :
539 : nsCOMPtr<nsIRunnable> mExecutorFlusher;
540 :
541 : nsCOMPtr<nsIRunnable> mLoadFlusher;
542 :
543 : /**
544 : * The chardet instance if chardet is enabled.
545 : */
546 : nsCOMPtr<nsICharsetDetector> mChardet;
547 :
548 : /**
549 : * If false, don't push data to chardet.
550 : */
551 : bool mFeedChardet;
552 :
553 : /**
554 : * Whether the initial charset source was kCharsetFromParentFrame
555 : */
556 : bool mInitialEncodingWasFromParentFrame;
557 :
558 : /**
559 : * Timer for flushing tree ops once in a while when not speculating.
560 : */
561 : nsCOMPtr<nsITimer> mFlushTimer;
562 :
563 : /**
564 : * Mutex for protecting access to mFlushTimer (but not for the two
565 : * mFlushTimerFoo booleans below).
566 : */
567 : mozilla::Mutex mFlushTimerMutex;
568 :
569 : /**
570 : * Keeps track whether mFlushTimer has been armed. Unfortunately,
571 : * nsITimer doesn't enable querying this from the timer itself.
572 : */
573 : bool mFlushTimerArmed;
574 :
575 : /**
576 : * False initially and true after the timer has fired at least once.
577 : */
578 : bool mFlushTimerEverFired;
579 :
580 : /**
581 : * Whether the parser is doing a normal parse, view source or plain text.
582 : */
583 : eParserMode mMode;
584 :
585 : /**
586 : * The pref html5.flushtimer.initialdelay: Time in milliseconds between
587 : * the time a network buffer is seen and the timer firing when the
588 : * timer hasn't fired previously in this parse.
589 : */
590 : static int32_t sTimerInitialDelay;
591 :
592 : /**
593 : * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between
594 : * the time a network buffer is seen and the timer firing when the
595 : * timer has already fired previously in this parse.
596 : */
597 : static int32_t sTimerSubsequentDelay;
598 : };
599 :
600 : #endif // nsHtml5StreamParser_h
|