Line data Source code
1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* vim: set ts=2 sw=2 et tw=78: */
3 : /* This Source Code Form is subject to the terms of the Mozilla Public
4 : * License, v. 2.0. If a copy of the MPL was not distributed with this
5 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 :
7 : //#define __INCREMENTAL 1
8 :
9 : #include "nsScanner.h"
10 :
11 : #include "mozilla/Attributes.h"
12 : #include "mozilla/DebugOnly.h"
13 : #include "mozilla/Encoding.h"
14 : #include "nsDebug.h"
15 : #include "nsReadableUtils.h"
16 : #include "nsIInputStream.h"
17 : #include "nsIFile.h"
18 : #include "nsUTF8Utils.h" // for LossyConvertEncoding
19 : #include "nsCRT.h"
20 : #include "nsParser.h"
21 : #include "nsCharsetSource.h"
22 :
23 0 : nsReadEndCondition::nsReadEndCondition(const char16_t* aTerminateChars) :
24 0 : mChars(aTerminateChars), mFilter(char16_t(~0)) // All bits set
25 : {
26 : // Build filter that will be used to filter out characters with
27 : // bits that none of the terminal chars have. This works very well
28 : // because terminal chars often have only the last 4-6 bits set and
29 : // normal ascii letters have bit 7 set. Other letters have even higher
30 : // bits set.
31 :
32 : // Calculate filter
33 0 : const char16_t *current = aTerminateChars;
34 0 : char16_t terminalChar = *current;
35 0 : while (terminalChar) {
36 0 : mFilter &= ~terminalChar;
37 0 : ++current;
38 0 : terminalChar = *current;
39 : }
40 0 : }
41 :
42 : /**
43 : * Use this constructor if you want i/o to be based on
44 : * a single string you hand in during construction.
45 : * This short cut was added for Javascript.
46 : *
47 : * @update gess 5/12/98
48 : * @param aMode represents the parser mode (nav, other)
49 : * @return
50 : */
51 0 : nsScanner::nsScanner(const nsAString& anHTMLString)
52 : {
53 0 : MOZ_COUNT_CTOR(nsScanner);
54 :
55 0 : mSlidingBuffer = nullptr;
56 0 : if (AppendToBuffer(anHTMLString)) {
57 0 : mSlidingBuffer->BeginReading(mCurrentPosition);
58 : } else {
59 : /* XXX see hack below, re: bug 182067 */
60 0 : memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
61 0 : mEndPosition = mCurrentPosition;
62 : }
63 0 : mMarkPosition = mCurrentPosition;
64 0 : mIncremental = false;
65 0 : mUnicodeDecoder = nullptr;
66 0 : mCharsetSource = kCharsetUninitialized;
67 0 : }
68 :
69 : /**
70 : * Use this constructor if you want i/o to be based on strings
71 : * the scanner receives. If you pass a null filename, you
72 : * can still provide data to the scanner via append.
73 : */
74 23 : nsScanner::nsScanner(nsString& aFilename, bool aCreateStream)
75 23 : : mFilename(aFilename)
76 : {
77 23 : MOZ_COUNT_CTOR(nsScanner);
78 23 : NS_ASSERTION(!aCreateStream, "This is always true.");
79 :
80 23 : mSlidingBuffer = nullptr;
81 :
82 : // XXX This is a big hack. We need to initialize the iterators to something.
83 : // What matters is that mCurrentPosition == mEndPosition, so that our methods
84 : // believe that we are at EOF (see bug 182067). We null out mCurrentPosition
85 : // so that we have some hope of catching null pointer dereferences associated
86 : // with this hack. --darin
87 23 : memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
88 23 : mMarkPosition = mCurrentPosition;
89 23 : mEndPosition = mCurrentPosition;
90 :
91 23 : mIncremental = true;
92 :
93 23 : mUnicodeDecoder = nullptr;
94 23 : mCharsetSource = kCharsetUninitialized;
95 : // XML defaults to UTF-8 and about:blank is UTF-8, too.
96 23 : SetDocumentCharset(UTF_8_ENCODING, kCharsetFromDocTypeDefault);
97 23 : }
98 :
99 45 : nsresult nsScanner::SetDocumentCharset(NotNull<const Encoding*> aEncoding,
100 : int32_t aSource)
101 : {
102 45 : if (aSource < mCharsetSource) // priority is lower than the current one
103 0 : return NS_OK;
104 :
105 45 : mCharsetSource = aSource;
106 90 : nsCString charsetName;
107 45 : aEncoding->Name(charsetName);
108 45 : if (!mCharset.IsEmpty() && charsetName.Equals(mCharset)) {
109 22 : return NS_OK; // no difference, don't change it
110 : }
111 :
112 : // different, need to change it
113 :
114 23 : mCharset.Assign(charsetName);
115 :
116 23 : mUnicodeDecoder = aEncoding->NewDecoderWithBOMRemoval();
117 :
118 23 : return NS_OK;
119 : }
120 :
121 :
122 : /**
123 : * default destructor
124 : *
125 : * @update gess 3/25/98
126 : * @param
127 : * @return
128 : */
129 44 : nsScanner::~nsScanner() {
130 :
131 22 : delete mSlidingBuffer;
132 :
133 22 : MOZ_COUNT_DTOR(nsScanner);
134 22 : }
135 :
136 : /**
137 : * Resets current offset position of input stream to marked position.
138 : * This allows us to back up to this point if the need should arise,
139 : * such as when tokenization gets interrupted.
140 : * NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST!
141 : *
142 : * @update gess 5/12/98
143 : * @param
144 : * @return
145 : */
146 45 : void nsScanner::RewindToMark(void){
147 45 : if (mSlidingBuffer) {
148 44 : mCurrentPosition = mMarkPosition;
149 : }
150 45 : }
151 :
152 :
153 : /**
154 : * Records current offset position in input stream. This allows us
155 : * to back up to this point if the need should arise, such as when
156 : * tokenization gets interrupted.
157 : *
158 : * @update gess 7/29/98
159 : * @param
160 : * @return
161 : */
162 89 : int32_t nsScanner::Mark() {
163 89 : int32_t distance = 0;
164 89 : if (mSlidingBuffer) {
165 88 : nsScannerIterator oldStart;
166 88 : mSlidingBuffer->BeginReading(oldStart);
167 :
168 88 : distance = Distance(oldStart, mCurrentPosition);
169 :
170 88 : mSlidingBuffer->DiscardPrefix(mCurrentPosition);
171 88 : mSlidingBuffer->BeginReading(mCurrentPosition);
172 88 : mMarkPosition = mCurrentPosition;
173 : }
174 :
175 89 : return distance;
176 : }
177 :
178 : /**
179 : * Insert data to our underlying input buffer as
180 : * if it were read from an input stream.
181 : *
182 : * @update harishd 01/12/99
183 : * @return error code
184 : */
185 0 : bool nsScanner::UngetReadable(const nsAString& aBuffer) {
186 0 : if (!mSlidingBuffer) {
187 0 : return false;
188 : }
189 :
190 0 : mSlidingBuffer->UngetReadable(aBuffer,mCurrentPosition);
191 0 : mSlidingBuffer->BeginReading(mCurrentPosition); // Insertion invalidated our iterators
192 0 : mSlidingBuffer->EndReading(mEndPosition);
193 :
194 0 : return true;
195 : }
196 :
197 : /**
198 : * Append data to our underlying input buffer as
199 : * if it were read from an input stream.
200 : *
201 : * @update gess4/3/98
202 : * @return error code
203 : */
204 0 : nsresult nsScanner::Append(const nsAString& aBuffer) {
205 0 : if (!AppendToBuffer(aBuffer))
206 0 : return NS_ERROR_OUT_OF_MEMORY;
207 0 : return NS_OK;
208 : }
209 :
210 : /**
211 : *
212 : *
213 : * @update gess 5/21/98
214 : * @param
215 : * @return
216 : */
217 22 : nsresult nsScanner::Append(const char* aBuffer, uint32_t aLen)
218 : {
219 22 : nsresult res = NS_OK;
220 22 : if (mUnicodeDecoder) {
221 22 : CheckedInt<size_t> needed = mUnicodeDecoder->MaxUTF16BufferLength(aLen);
222 22 : if (!needed.isValid()) {
223 0 : return NS_ERROR_OUT_OF_MEMORY;
224 : }
225 22 : CheckedInt<uint32_t> allocLen(1); // null terminator due to legacy sadness
226 22 : allocLen += needed.value();
227 22 : if (!allocLen.isValid()) {
228 0 : return NS_ERROR_OUT_OF_MEMORY;
229 : }
230 : nsScannerString::Buffer* buffer =
231 22 : nsScannerString::AllocBuffer(allocLen.value());
232 22 : NS_ENSURE_TRUE(buffer,NS_ERROR_OUT_OF_MEMORY);
233 22 : char16_t *unichars = buffer->DataStart();
234 :
235 : uint32_t result;
236 : size_t read;
237 : size_t written;
238 44 : Tie(result, read, written) =
239 44 : mUnicodeDecoder->DecodeToUTF16WithoutReplacement(
240 : AsBytes(MakeSpan(aBuffer, aLen)),
241 : MakeSpan(unichars, needed.value()),
242 22 : false); // Retain bug about failure to handle EOF
243 22 : MOZ_ASSERT(result != kOutputFull);
244 22 : MOZ_ASSERT(read <= aLen);
245 22 : MOZ_ASSERT(written <= needed.value());
246 22 : if (result != kInputEmpty) {
247 : // Since about:blank is empty, this line runs only for XML. Use a
248 : // character that's illegal in XML instead of U+FFFD in order to make
249 : // expat flag the error. There is no need to loop and convert more, since
250 : // expat will stop here anyway.
251 0 : unichars[written++] = 0xFFFF;
252 : }
253 22 : buffer->SetDataLength(written);
254 : // Don't propagate return code of unicode decoder
255 : // since it doesn't reflect on our success or failure
256 : // - Ref. bug 87110
257 22 : res = NS_OK;
258 22 : if (!AppendToBuffer(buffer))
259 0 : res = NS_ERROR_OUT_OF_MEMORY;
260 : }
261 : else {
262 0 : NS_WARNING("No decoder found.");
263 0 : res = NS_ERROR_FAILURE;
264 : }
265 :
266 22 : return res;
267 : }
268 :
269 : /**
270 : * retrieve next char from scanners internal input stream
271 : *
272 : * @update gess 3/25/98
273 : * @param
274 : * @return error code reflecting read status
275 : */
276 0 : nsresult nsScanner::GetChar(char16_t& aChar) {
277 0 : if (!mSlidingBuffer || mCurrentPosition == mEndPosition) {
278 0 : aChar = 0;
279 0 : return NS_ERROR_HTMLPARSER_EOF;
280 : }
281 :
282 0 : aChar = *mCurrentPosition++;
283 :
284 0 : return NS_OK;
285 : }
286 :
287 0 : void nsScanner::BindSubstring(nsScannerSubstring& aSubstring, const nsScannerIterator& aStart, const nsScannerIterator& aEnd)
288 : {
289 0 : aSubstring.Rebind(*mSlidingBuffer, aStart, aEnd);
290 0 : }
291 :
292 44 : void nsScanner::CurrentPosition(nsScannerIterator& aPosition)
293 : {
294 44 : aPosition = mCurrentPosition;
295 44 : }
296 :
297 88 : void nsScanner::EndReading(nsScannerIterator& aPosition)
298 : {
299 88 : aPosition = mEndPosition;
300 88 : }
301 :
302 44 : void nsScanner::SetPosition(nsScannerIterator& aPosition, bool aTerminate)
303 : {
304 44 : if (mSlidingBuffer) {
305 44 : mCurrentPosition = aPosition;
306 44 : if (aTerminate && (mCurrentPosition == mEndPosition)) {
307 44 : mMarkPosition = mCurrentPosition;
308 44 : mSlidingBuffer->DiscardPrefix(mCurrentPosition);
309 : }
310 : }
311 44 : }
312 :
313 22 : bool nsScanner::AppendToBuffer(nsScannerString::Buffer* aBuf)
314 : {
315 22 : if (!mSlidingBuffer) {
316 22 : mSlidingBuffer = new nsScannerString(aBuf);
317 22 : if (!mSlidingBuffer)
318 0 : return false;
319 22 : mSlidingBuffer->BeginReading(mCurrentPosition);
320 22 : mMarkPosition = mCurrentPosition;
321 22 : mSlidingBuffer->EndReading(mEndPosition);
322 : }
323 : else {
324 0 : mSlidingBuffer->AppendBuffer(aBuf);
325 0 : if (mCurrentPosition == mEndPosition) {
326 0 : mSlidingBuffer->BeginReading(mCurrentPosition);
327 : }
328 0 : mSlidingBuffer->EndReading(mEndPosition);
329 : }
330 :
331 22 : return true;
332 : }
333 :
334 : /**
335 : * call this to copy bytes out of the scanner that have not yet been consumed
336 : * by the tokenization process.
337 : *
338 : * @update gess 5/12/98
339 : * @param aCopyBuffer is where the scanner buffer will be copied to
340 : * @return true if OK or false on OOM
341 : */
342 0 : bool nsScanner::CopyUnusedData(nsString& aCopyBuffer) {
343 0 : if (!mSlidingBuffer) {
344 0 : aCopyBuffer.Truncate();
345 0 : return true;
346 : }
347 :
348 0 : nsScannerIterator start, end;
349 0 : start = mCurrentPosition;
350 0 : end = mEndPosition;
351 :
352 0 : return CopyUnicodeTo(start, end, aCopyBuffer);
353 : }
354 :
355 : /**
356 : * Retrieve the name of the file that the scanner is reading from.
357 : * In some cases, it's just a given name, because the scanner isn't
358 : * really reading from a file.
359 : *
360 : * @update gess 5/12/98
361 : * @return
362 : */
363 67 : nsString& nsScanner::GetFilename(void) {
364 67 : return mFilename;
365 : }
366 :
367 : /**
368 : * Conduct self test. Actually, selftesting for this class
369 : * occurs in the parser selftest.
370 : *
371 : * @update gess 3/25/98
372 : * @param
373 : * @return
374 : */
375 :
376 0 : void nsScanner::SelfTest(void) {
377 : #ifdef _DEBUG
378 : #endif
379 0 : }
|