Line data Source code
1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 : #include "mozInlineSpellWordUtil.h"
7 : #include "nsDebug.h"
8 : #include "nsIAtom.h"
9 : #include "nsComponentManagerUtils.h"
10 : #include "nsIDOMCSSStyleDeclaration.h"
11 : #include "nsIDOMElement.h"
12 : #include "nsIDOMRange.h"
13 : #include "nsIEditor.h"
14 : #include "nsIDOMNode.h"
15 : #include "nsUnicodeProperties.h"
16 : #include "nsServiceManagerUtils.h"
17 : #include "nsIContent.h"
18 : #include "nsTextFragment.h"
19 : #include "mozilla/dom/Element.h"
20 : #include "nsRange.h"
21 : #include "nsContentUtils.h"
22 : #include "nsIFrame.h"
23 : #include <algorithm>
24 : #include "mozilla/BinarySearch.h"
25 :
26 : using namespace mozilla;
27 :
// IsIgnorableCharacter
//
//    Returns true for the code units that this file treats as "ignorable":
//    they are classified as word characters while splitting and are removed
//    when a word is normalized for the spellchecker.

inline bool IsIgnorableCharacter(char16_t ch)
{
  switch (ch) {
    case 0x00AD: // SOFT HYPHEN
    case 0x1806: // MONGOLIAN TODO SOFT HYPHEN
      return true;
    default:
      return false;
  }
}
37 :
// IsConditionalPunctuation
//
//    Returns true for characters (such as apostrophes) that only belong to a
//    word when there is a word character on both sides of them; anywhere else
//    they are treated as punctuation.

inline bool IsConditionalPunctuation(char16_t ch)
{
  switch (ch) {
    case '\'':   // APOSTROPHE
    case 0x2019: // RIGHT SINGLE QUOTATION MARK
    case 0x00B7: // MIDDLE DOT
      return true;
    default:
      return false;
  }
}
49 :
50 : // mozInlineSpellWordUtil::Init
51 :
52 : nsresult
53 0 : mozInlineSpellWordUtil::Init(const nsWeakPtr& aWeakEditor)
54 : {
55 : nsresult rv;
56 :
57 : // getting the editor can fail commonly because the editor was detached, so
58 : // don't assert
59 0 : nsCOMPtr<nsIEditor> editor = do_QueryReferent(aWeakEditor, &rv);
60 0 : if (NS_FAILED(rv))
61 0 : return rv;
62 :
63 0 : nsCOMPtr<nsIDOMDocument> domDoc;
64 0 : rv = editor->GetDocument(getter_AddRefs(domDoc));
65 0 : NS_ENSURE_SUCCESS(rv, rv);
66 0 : NS_ENSURE_TRUE(domDoc, NS_ERROR_NULL_POINTER);
67 :
68 0 : mDOMDocument = domDoc;
69 0 : mDocument = do_QueryInterface(domDoc);
70 :
71 : // Find the root node for the editor. For contenteditable we'll need something
72 : // cleverer here.
73 0 : nsCOMPtr<nsIDOMElement> rootElt;
74 0 : rv = editor->GetRootElement(getter_AddRefs(rootElt));
75 0 : NS_ENSURE_SUCCESS(rv, rv);
76 :
77 0 : nsCOMPtr<nsINode> rootNode = do_QueryInterface(rootElt);
78 0 : mRootNode = rootNode;
79 0 : NS_ASSERTION(mRootNode, "GetRootElement returned null *and* claimed to suceed!");
80 0 : return NS_OK;
81 : }
82 :
83 : static inline bool
84 0 : IsSpellCheckingTextNode(nsINode* aNode)
85 : {
86 0 : nsIContent *parent = aNode->GetParent();
87 0 : if (parent && parent->IsAnyOfHTMLElements(nsGkAtoms::script, nsGkAtoms::style))
88 0 : return false;
89 0 : return aNode->IsNodeOfType(nsINode::eTEXT);
90 : }
91 :
92 : typedef void (* OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure);
93 :
94 : // Find the next node in the DOM tree in preorder.
95 : // Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is
96 : // why we can't just use GetNextNode here, sadly.
97 : static nsINode*
98 0 : FindNextNode(nsINode* aNode, nsINode* aRoot,
99 : OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure)
100 : {
101 0 : NS_PRECONDITION(aNode, "Null starting node?");
102 :
103 0 : nsINode* next = aNode->GetFirstChild();
104 0 : if (next)
105 0 : return next;
106 :
107 : // Don't look at siblings or otherwise outside of aRoot
108 0 : if (aNode == aRoot)
109 0 : return nullptr;
110 :
111 0 : next = aNode->GetNextSibling();
112 0 : if (next)
113 0 : return next;
114 :
115 : // Go up
116 : for (;;) {
117 0 : if (aOnLeaveNode) {
118 0 : aOnLeaveNode(aNode, aClosure);
119 : }
120 :
121 0 : next = aNode->GetParent();
122 0 : if (next == aRoot || ! next)
123 0 : return nullptr;
124 0 : aNode = next;
125 :
126 0 : next = aNode->GetNextSibling();
127 0 : if (next)
128 0 : return next;
129 : }
130 : }
131 :
132 : // aNode is not a text node. Find the first text node starting at aNode/aOffset
133 : // in a preorder DOM traversal.
134 : static nsINode*
135 0 : FindNextTextNode(nsINode* aNode, int32_t aOffset, nsINode* aRoot)
136 : {
137 0 : NS_PRECONDITION(aNode, "Null starting node?");
138 0 : NS_ASSERTION(!IsSpellCheckingTextNode(aNode), "FindNextTextNode should start with a non-text node");
139 :
140 : nsINode* checkNode;
141 : // Need to start at the aOffset'th child
142 0 : nsIContent* child = aNode->GetChildAt(aOffset);
143 :
144 0 : if (child) {
145 0 : checkNode = child;
146 : } else {
147 : // aOffset was beyond the end of the child list.
148 : // goto next node after the last descendant of aNode in
149 : // a preorder DOM traversal.
150 0 : checkNode = aNode->GetNextNonChildNode(aRoot);
151 : }
152 :
153 0 : while (checkNode && !IsSpellCheckingTextNode(checkNode)) {
154 0 : checkNode = checkNode->GetNextNode(aRoot);
155 : }
156 0 : return checkNode;
157 : }
158 :
159 : // mozInlineSpellWordUtil::SetEnd
160 : //
161 : // We have two ranges "hard" and "soft". The hard boundary is simply
162 : // the scope of the root node. The soft boundary is that which is set
163 : // by the caller of this class by calling this function. If this function is
164 : // not called, the soft boundary is the same as the hard boundary.
165 : //
166 : // When we reach the soft boundary (mSoftEnd), we keep
167 : // going until we reach the end of a word. This allows the caller to set the
168 : // end of the range to anything, and we will always check whole multiples of
169 : // words. When we reach the hard boundary we stop no matter what.
170 : //
171 : // There is no beginning soft boundary. This is because we only go to the
172 : // previous node once, when finding the previous word boundary in
173 : // SetPosition(). You might think of the soft boundary as being this initial
174 : // position.
175 :
176 : nsresult
177 0 : mozInlineSpellWordUtil::SetEnd(nsINode* aEndNode, int32_t aEndOffset)
178 : {
179 0 : NS_PRECONDITION(aEndNode, "Null end node?");
180 :
181 0 : NS_ASSERTION(mRootNode, "Not initialized");
182 :
183 0 : InvalidateWords();
184 :
185 0 : if (!IsSpellCheckingTextNode(aEndNode)) {
186 : // End at the start of the first text node after aEndNode/aEndOffset.
187 0 : aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode);
188 0 : aEndOffset = 0;
189 : }
190 0 : mSoftEnd = NodeOffset(aEndNode, aEndOffset);
191 0 : return NS_OK;
192 : }
193 :
194 : nsresult
195 0 : mozInlineSpellWordUtil::SetPosition(nsINode* aNode, int32_t aOffset)
196 : {
197 0 : InvalidateWords();
198 :
199 0 : if (!IsSpellCheckingTextNode(aNode)) {
200 : // Start at the start of the first text node after aNode/aOffset.
201 0 : aNode = FindNextTextNode(aNode, aOffset, mRootNode);
202 0 : aOffset = 0;
203 : }
204 0 : mSoftBegin = NodeOffset(aNode, aOffset);
205 :
206 0 : nsresult rv = EnsureWords();
207 0 : if (NS_FAILED(rv)) {
208 0 : return rv;
209 : }
210 :
211 0 : int32_t textOffset = MapDOMPositionToSoftTextOffset(mSoftBegin);
212 0 : if (textOffset < 0)
213 0 : return NS_OK;
214 0 : mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true);
215 0 : return NS_OK;
216 : }
217 :
218 : nsresult
219 0 : mozInlineSpellWordUtil::EnsureWords()
220 : {
221 0 : if (mSoftTextValid)
222 0 : return NS_OK;
223 0 : BuildSoftText();
224 0 : nsresult rv = BuildRealWords();
225 0 : if (NS_FAILED(rv)) {
226 0 : mRealWords.Clear();
227 0 : return rv;
228 : }
229 0 : mSoftTextValid = true;
230 0 : return NS_OK;
231 : }
232 :
233 : nsresult
234 0 : mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, nsRange** aRange)
235 : {
236 0 : NodeOffset begin = MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
237 0 : NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
238 0 : return MakeRange(begin, end, aRange);
239 : }
240 :
241 : // mozInlineSpellWordUtil::GetRangeForWord
242 :
243 : nsresult
244 0 : mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode* aWordNode,
245 : int32_t aWordOffset,
246 : nsRange** aRange)
247 : {
248 : // Set our soft end and start
249 0 : nsCOMPtr<nsINode> wordNode = do_QueryInterface(aWordNode);
250 0 : NodeOffset pt = NodeOffset(wordNode, aWordOffset);
251 :
252 0 : if (!mSoftTextValid || pt != mSoftBegin || pt != mSoftEnd) {
253 0 : InvalidateWords();
254 0 : mSoftBegin = mSoftEnd = pt;
255 0 : nsresult rv = EnsureWords();
256 0 : if (NS_FAILED(rv)) {
257 0 : return rv;
258 : }
259 : }
260 :
261 0 : int32_t offset = MapDOMPositionToSoftTextOffset(pt);
262 0 : if (offset < 0)
263 0 : return MakeRange(pt, pt, aRange);
264 0 : int32_t wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false);
265 0 : if (wordIndex < 0)
266 0 : return MakeRange(pt, pt, aRange);
267 0 : return MakeRangeForWord(mRealWords[wordIndex], aRange);
268 : }
269 :
270 : // This is to fix characters that the spellchecker may not like
271 : static void
272 0 : NormalizeWord(const nsAString& aInput, int32_t aPos, int32_t aLen, nsAString& aOutput)
273 : {
274 0 : aOutput.Truncate();
275 0 : for (int32_t i = 0; i < aLen; i++) {
276 0 : char16_t ch = aInput.CharAt(i + aPos);
277 :
278 : // remove ignorable characters from the word
279 0 : if (IsIgnorableCharacter(ch))
280 0 : continue;
281 :
282 : // the spellchecker doesn't handle curly apostrophes in all languages
283 0 : if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK
284 0 : ch = '\'';
285 : }
286 :
287 0 : aOutput.Append(ch);
288 : }
289 0 : }
290 :
291 : // mozInlineSpellWordUtil::GetNextWord
292 : //
293 : // FIXME-optimization: we shouldn't have to generate a range every single
294 : // time. It would be better if the inline spellchecker didn't require a
295 : // range unless the word was misspelled. This may or may not be possible.
296 :
297 : nsresult
298 0 : mozInlineSpellWordUtil::GetNextWord(nsAString& aText, nsRange** aRange,
299 : bool* aSkipChecking)
300 : {
301 : #ifdef DEBUG_SPELLCHECK
302 : printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex);
303 : #endif
304 :
305 0 : if (mNextWordIndex < 0 ||
306 0 : mNextWordIndex >= int32_t(mRealWords.Length())) {
307 0 : mNextWordIndex = -1;
308 0 : *aRange = nullptr;
309 0 : *aSkipChecking = true;
310 0 : return NS_OK;
311 : }
312 :
313 0 : const RealWord& word = mRealWords[mNextWordIndex];
314 0 : nsresult rv = MakeRangeForWord(word, aRange);
315 0 : NS_ENSURE_SUCCESS(rv, rv);
316 0 : ++mNextWordIndex;
317 0 : *aSkipChecking = !word.mCheckableWord;
318 0 : ::NormalizeWord(mSoftText, word.mSoftTextOffset, word.mLength, aText);
319 :
320 : #ifdef DEBUG_SPELLCHECK
321 : printf("GetNextWord returning: %s (skip=%d)\n",
322 : NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking);
323 : #endif
324 :
325 0 : return NS_OK;
326 : }
327 :
328 : // mozInlineSpellWordUtil::MakeRange
329 : //
330 : // Convenience function for creating a range over the current document.
331 :
332 : nsresult
333 0 : mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
334 : nsRange** aRange)
335 : {
336 0 : NS_ENSURE_ARG_POINTER(aBegin.mNode);
337 0 : if (!mDOMDocument)
338 0 : return NS_ERROR_NOT_INITIALIZED;
339 :
340 0 : RefPtr<nsRange> range = new nsRange(aBegin.mNode);
341 0 : nsresult rv = range->SetStartAndEnd(aBegin.mNode, aBegin.mOffset,
342 0 : aEnd.mNode, aEnd.mOffset);
343 0 : if (NS_WARN_IF(NS_FAILED(rv))) {
344 0 : return rv;
345 : }
346 0 : range.forget(aRange);
347 :
348 0 : return NS_OK;
349 : }
350 :
351 : /*********** Word Splitting ************/
352 :
// Classification of a single character inside a DOM word.
enum CharClass {
  CHAR_CLASS_WORD,          // part of a word
  CHAR_CLASS_SEPARATOR,     // splits words
  CHAR_CLASS_END_OF_INPUT   // no current character (start state / past end)
};
358 :
359 : // Encapsulates DOM-word to real-word splitting
360 0 : struct MOZ_STACK_CLASS WordSplitState
361 : {
362 : mozInlineSpellWordUtil* mWordUtil;
363 : const nsDependentSubstring mDOMWordText;
364 : int32_t mDOMWordOffset;
365 : CharClass mCurCharClass;
366 :
367 0 : WordSplitState(mozInlineSpellWordUtil* aWordUtil,
368 : const nsString& aString, int32_t aStart, int32_t aLen)
369 0 : : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen),
370 0 : mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
371 :
372 : CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;
373 : void Advance();
374 : void AdvanceThroughSeparators();
375 : void AdvanceThroughWord();
376 :
377 : // Finds special words like email addresses and URLs that may start at the
378 : // current position, and returns their length, or 0 if not found. This allows
379 : // arbitrary word breaking rules to be used for these special entities, as
380 : // long as they can not contain whitespace.
381 : bool IsSpecialWord();
382 :
383 : // Similar to IsSpecialWord except that this takes a split word as
384 : // input. This checks for things that do not require special word-breaking
385 : // rules.
386 : bool ShouldSkipWord(int32_t aStart, int32_t aLength);
387 : };
388 :
389 : // WordSplitState::ClassifyCharacter
390 :
391 : CharClass
392 0 : WordSplitState::ClassifyCharacter(int32_t aIndex, bool aRecurse) const
393 : {
394 0 : NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
395 : "Index out of range");
396 0 : if (aIndex == int32_t(mDOMWordText.Length()))
397 0 : return CHAR_CLASS_SEPARATOR;
398 :
399 : // this will classify the character, we want to treat "ignorable" characters
400 : // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
401 : nsUGenCategory charCategory =
402 0 : mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]);
403 0 : if (charCategory == nsUGenCategory::kLetter ||
404 0 : IsIgnorableCharacter(mDOMWordText[aIndex]) ||
405 0 : mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
406 0 : mDOMWordText[aIndex] == 0x200D /* ZWJ */)
407 0 : return CHAR_CLASS_WORD;
408 :
409 : // If conditional punctuation is surrounded immediately on both sides by word
410 : // characters it also counts as a word character.
411 0 : if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
412 0 : if (!aRecurse) {
413 : // not allowed to look around, this punctuation counts like a separator
414 0 : return CHAR_CLASS_SEPARATOR;
415 : }
416 :
417 : // check the left-hand character
418 0 : if (aIndex == 0)
419 0 : return CHAR_CLASS_SEPARATOR;
420 0 : if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
421 0 : return CHAR_CLASS_SEPARATOR;
422 : // If the previous charatcer is a word-char, make sure that it's not a
423 : // special dot character.
424 0 : if (mDOMWordText[aIndex - 1] == '.')
425 0 : return CHAR_CLASS_SEPARATOR;
426 :
427 : // now we know left char is a word-char, check the right-hand character
428 0 : if (aIndex == int32_t(mDOMWordText.Length()) - 1)
429 0 : return CHAR_CLASS_SEPARATOR;
430 0 : if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
431 0 : return CHAR_CLASS_SEPARATOR;
432 : // If the next charatcer is a word-char, make sure that it's not a
433 : // special dot character.
434 0 : if (mDOMWordText[aIndex + 1] == '.')
435 0 : return CHAR_CLASS_SEPARATOR;
436 :
437 : // char on either side is a word, this counts as a word
438 0 : return CHAR_CLASS_WORD;
439 : }
440 :
441 : // The dot character, if appearing at the end of a word, should
442 : // be considered part of that word. Example: "etc.", or
443 : // abbreviations
444 0 : if (aIndex > 0 &&
445 0 : mDOMWordText[aIndex] == '.' &&
446 0 : mDOMWordText[aIndex - 1] != '.' &&
447 0 : ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {
448 0 : return CHAR_CLASS_WORD;
449 : }
450 :
451 : // all other punctuation
452 0 : if (charCategory == nsUGenCategory::kSeparator ||
453 0 : charCategory == nsUGenCategory::kOther ||
454 0 : charCategory == nsUGenCategory::kPunctuation ||
455 : charCategory == nsUGenCategory::kSymbol) {
456 : // Don't break on hyphens, as hunspell handles them on its own.
457 0 : if (aIndex > 0 &&
458 0 : mDOMWordText[aIndex] == '-' &&
459 0 : mDOMWordText[aIndex - 1] != '-' &&
460 0 : ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
461 : // A hyphen is only meaningful as a separator inside a word
462 : // if the previous and next characters are a word character.
463 0 : if (aIndex == int32_t(mDOMWordText.Length()) - 1)
464 0 : return CHAR_CLASS_SEPARATOR;
465 0 : if (mDOMWordText[aIndex + 1] != '.' &&
466 0 : ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
467 0 : return CHAR_CLASS_WORD;
468 : }
469 0 : return CHAR_CLASS_SEPARATOR;
470 : }
471 :
472 : // any other character counts as a word
473 0 : return CHAR_CLASS_WORD;
474 : }
475 :
476 :
477 : // WordSplitState::Advance
478 :
479 : void
480 0 : WordSplitState::Advance()
481 : {
482 0 : NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");
483 0 : NS_ASSERTION(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
484 : "Length beyond end");
485 :
486 0 : mDOMWordOffset ++;
487 0 : if (mDOMWordOffset >= (int32_t)mDOMWordText.Length())
488 0 : mCurCharClass = CHAR_CLASS_END_OF_INPUT;
489 : else
490 0 : mCurCharClass = ClassifyCharacter(mDOMWordOffset, true);
491 0 : }
492 :
493 :
494 : // WordSplitState::AdvanceThroughSeparators
495 :
496 : void
497 0 : WordSplitState::AdvanceThroughSeparators()
498 : {
499 0 : while (mCurCharClass == CHAR_CLASS_SEPARATOR)
500 0 : Advance();
501 0 : }
502 :
503 : // WordSplitState::AdvanceThroughWord
504 :
505 : void
506 0 : WordSplitState::AdvanceThroughWord()
507 : {
508 0 : while (mCurCharClass == CHAR_CLASS_WORD)
509 0 : Advance();
510 0 : }
511 :
512 :
513 : // WordSplitState::IsSpecialWord
514 :
515 : bool
516 0 : WordSplitState::IsSpecialWord()
517 : {
518 : // Search for email addresses. We simply define these as any sequence of
519 : // characters with an '@' character in the middle. The DOM word is already
520 : // split on whitepace, so we know that everything to the end is the address
521 0 : int32_t firstColon = -1;
522 0 : for (int32_t i = mDOMWordOffset;
523 0 : i < int32_t(mDOMWordText.Length()); i ++) {
524 0 : if (mDOMWordText[i] == '@') {
525 : // only accept this if there are unambiguous word characters (don't bother
526 : // recursing to disambiguate apostrophes) on each side. This prevents
527 : // classifying, e.g. "@home" as an email address
528 :
529 : // Use this condition to only accept words with '@' in the middle of
530 : // them. It works, but the inlinespellcker doesn't like this. The problem
531 : // is that you type "fhsgfh@" that's a misspelled word followed by a
532 : // symbol, but when you type another letter "fhsgfh@g" that first word
533 : // need to be unmarked misspelled. It doesn't do this. it only checks the
534 : // current position for potentially removing a spelling range.
535 0 : if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&
536 0 : i < (int32_t)mDOMWordText.Length() - 1 &&
537 0 : ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
538 0 : return true;
539 : }
540 0 : } else if (mDOMWordText[i] == ':' && firstColon < 0) {
541 0 : firstColon = i;
542 :
543 : // If the first colon is followed by a slash, consider it a URL
544 : // This will catch things like asdf://foo.com
545 0 : if (firstColon < (int32_t)mDOMWordText.Length() - 1 &&
546 0 : mDOMWordText[firstColon + 1] == '/') {
547 0 : return true;
548 : }
549 : }
550 : }
551 :
552 : // Check the text before the first colon against some known protocols. It
553 : // is impossible to check against all protocols, especially since you can
554 : // plug in new protocols. We also don't want to waste time here checking
555 : // against a lot of obscure protocols.
556 0 : if (firstColon > mDOMWordOffset) {
557 0 : nsString protocol(Substring(mDOMWordText, mDOMWordOffset,
558 0 : firstColon - mDOMWordOffset));
559 0 : if (protocol.EqualsIgnoreCase("http") ||
560 0 : protocol.EqualsIgnoreCase("https") ||
561 0 : protocol.EqualsIgnoreCase("news") ||
562 0 : protocol.EqualsIgnoreCase("file") ||
563 0 : protocol.EqualsIgnoreCase("javascript") ||
564 0 : protocol.EqualsIgnoreCase("data") ||
565 0 : protocol.EqualsIgnoreCase("ftp")) {
566 0 : return true;
567 : }
568 : }
569 :
570 : // not anything special
571 0 : return false;
572 : }
573 :
574 : // WordSplitState::ShouldSkipWord
575 :
576 : bool
577 0 : WordSplitState::ShouldSkipWord(int32_t aStart, int32_t aLength)
578 : {
579 0 : int32_t last = aStart + aLength;
580 :
581 : // check to see if the word contains a digit
582 0 : for (int32_t i = aStart; i < last; i ++) {
583 0 : if (unicode::GetGenCategory(mDOMWordText[i]) == nsUGenCategory::kNumber) {
584 0 : return true;
585 : }
586 : }
587 :
588 : // not special
589 0 : return false;
590 : }
591 :
592 : /*********** DOM text extraction ************/
593 :
// IsDOMWordSeparator
//
//    Tells whether the given character separates DOM words. Basically this
//    is whitespace, although it could also include punctuation known to
//    ALWAYS break words. Being strict here matters: nothing that may appear
//    in a URL or an email address can be listed, because those always have
//    to fit into a single DOM word.

static bool
IsDOMWordSeparator(char16_t ch)
{
  switch (ch) {
    // simple spaces
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    // complex (non-ASCII) spaces
    case 0x00A0: // NO-BREAK SPACE
    case 0x2002: // EN SPACE
    case 0x2003: // EM SPACE
    case 0x2009: // THIN SPACE
    case 0x3000: // IDEOGRAPHIC SPACE
      return true;
    default:
      // otherwise not a space
      return false;
  }
}
622 :
623 : static inline bool
624 0 : IsBRElement(nsINode* aNode)
625 : {
626 0 : return aNode->IsHTMLElement(nsGkAtoms::br);
627 : }
628 :
629 : /**
630 : * Given a TextNode, checks to see if there's a DOM word separator before
631 : * aBeforeOffset within it. This function does not modify aSeparatorOffset when
632 : * it returns false.
633 : *
634 : * @param aNode the TextNode to check.
635 : * @param aBeforeOffset the offset in the TextNode before which we will search
636 : * for the DOM separator. You can pass INT32_MAX to search the entire
637 : * length of the string.
638 : * @param aSeparatorOffset will be set to the offset of the first separator it
639 : * encounters. Will not be written to if no separator is found.
640 : * @returns True if it found a separator.
641 : */
642 : static bool
643 0 : TextNodeContainsDOMWordSeparator(nsINode* aNode,
644 : int32_t aBeforeOffset,
645 : int32_t* aSeparatorOffset)
646 : {
647 : // aNode is actually an nsIContent, since it's eTEXT
648 0 : nsIContent* content = static_cast<nsIContent*>(aNode);
649 0 : const nsTextFragment* textFragment = content->GetText();
650 0 : NS_ASSERTION(textFragment, "Where is our text?");
651 0 : nsString text;
652 0 : int32_t end = std::min(aBeforeOffset, int32_t(textFragment->GetLength()));
653 0 : bool ok = textFragment->AppendTo(text, 0, end, mozilla::fallible);
654 0 : if(!ok)
655 0 : return false;
656 :
657 0 : WordSplitState state(nullptr, text, 0, end);
658 0 : for (int32_t i = end - 1; i >= 0; --i) {
659 0 : if (IsDOMWordSeparator(textFragment->CharAt(i)) ||
660 0 : state.ClassifyCharacter(i, true) == CHAR_CLASS_SEPARATOR) {
661 : // Be greedy, find as many separators as we can
662 0 : for (int32_t j = i - 1; j >= 0; --j) {
663 0 : if (IsDOMWordSeparator(textFragment->CharAt(j)) ||
664 0 : state.ClassifyCharacter(j, true) == CHAR_CLASS_SEPARATOR) {
665 0 : i = j;
666 : } else {
667 0 : break;
668 : }
669 : }
670 0 : *aSeparatorOffset = i;
671 0 : return true;
672 : }
673 : }
674 0 : return false;
675 : }
676 :
677 : /**
678 : * Check if there's a DOM word separator before aBeforeOffset in this node.
679 : * Always returns true if it's a BR element.
680 : * aSeparatorOffset is set to the index of the first character in the last
681 : * separator if any is found (0 for BR elements).
682 : *
683 : * This function does not modify aSeparatorOffset when it returns false.
684 : */
685 : static bool
686 0 : ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset,
687 : int32_t* aSeparatorOffset)
688 : {
689 0 : if (IsBRElement(aNode)) {
690 0 : *aSeparatorOffset = 0;
691 0 : return true;
692 : }
693 :
694 0 : if (!IsSpellCheckingTextNode(aNode))
695 0 : return false;
696 :
697 : return TextNodeContainsDOMWordSeparator(aNode, aBeforeOffset,
698 0 : aSeparatorOffset);
699 : }
700 :
701 : static bool
702 0 : IsBreakElement(nsINode* aNode)
703 : {
704 0 : if (!aNode->IsElement()) {
705 0 : return false;
706 : }
707 :
708 0 : dom::Element *element = aNode->AsElement();
709 :
710 0 : if (element->IsHTMLElement(nsGkAtoms::br))
711 0 : return true;
712 :
713 : // If we don't have a frame, we don't consider ourselves a break
714 : // element. In particular, words can span us.
715 0 : if (!element->GetPrimaryFrame())
716 0 : return false;
717 :
718 : // Anything that's not an inline element is a break element.
719 : // XXXbz should replaced inlines be break elements, though?
720 0 : return element->GetPrimaryFrame()->StyleDisplay()->mDisplay !=
721 0 : StyleDisplay::Inline;
722 : }
723 :
// Closure for CheckLeavingBreakElement: remembers whether any node reported
// to the callback was a break element.
struct CheckLeavingBreakElementClosure
{
  bool mLeftBreakElement;
};
727 :
728 : static void
729 0 : CheckLeavingBreakElement(nsINode* aNode, void* aClosure)
730 : {
731 : CheckLeavingBreakElementClosure* cl =
732 0 : static_cast<CheckLeavingBreakElementClosure*>(aClosure);
733 0 : if (!cl->mLeftBreakElement && IsBreakElement(aNode)) {
734 0 : cl->mLeftBreakElement = true;
735 : }
736 0 : }
737 :
738 : void
739 0 : mozInlineSpellWordUtil::NormalizeWord(nsAString& aWord)
740 : {
741 0 : nsAutoString result;
742 0 : ::NormalizeWord(aWord, 0, aWord.Length(), result);
743 0 : aWord = result;
744 0 : }
745 :
746 : void
747 0 : mozInlineSpellWordUtil::BuildSoftText()
748 : {
749 : // First we have to work backwards from mSoftStart to find a text node
750 : // containing a DOM word separator, a non-inline-element
751 : // boundary, or the hard start node. That's where we'll start building the
752 : // soft string from.
753 0 : nsINode* node = mSoftBegin.mNode;
754 0 : int32_t firstOffsetInNode = 0;
755 0 : int32_t checkBeforeOffset = mSoftBegin.mOffset;
756 0 : while (node) {
757 0 : if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {
758 0 : if (node == mSoftBegin.mNode) {
759 : // If we find a word separator on the first node, look at the preceding
760 : // word on the text node as well.
761 0 : int32_t newOffset = 0;
762 0 : if (firstOffsetInNode > 0) {
763 : // Try to find the previous word boundary in the current node. If
764 : // we can't find one, start checking previous sibling nodes (if any
765 : // adjacent ones exist) to see if we can find any text nodes with
766 : // DOM word separators. We bail out as soon as we see a node that is
767 : // not a text node, or we run out of previous sibling nodes. In the
768 : // event that we simply cannot find any preceding word separator, the
769 : // offset is set to 0, and the soft text beginning node is set to the
770 : // "most previous" text node before the original starting node, or
771 : // kept at the original starting node if no previous text nodes exist.
772 0 : if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1,
773 : &newOffset)) {
774 0 : nsINode* prevNode = node->GetPreviousSibling();
775 0 : while (prevNode && IsSpellCheckingTextNode(prevNode)) {
776 0 : mSoftBegin.mNode = prevNode;
777 0 : if (TextNodeContainsDOMWordSeparator(prevNode, INT32_MAX,
778 : &newOffset)) {
779 0 : break;
780 : }
781 0 : prevNode = prevNode->GetPreviousSibling();
782 : }
783 : }
784 : }
785 0 : firstOffsetInNode = newOffset;
786 0 : mSoftBegin.mOffset = newOffset;
787 : }
788 0 : break;
789 : }
790 0 : checkBeforeOffset = INT32_MAX;
791 0 : if (IsBreakElement(node)) {
792 : // Since GetPreviousContent follows tree *preorder*, we're about to traverse
793 : // up out of 'node'. Since node induces breaks (e.g., it's a block),
794 : // don't bother trying to look outside it, just stop now.
795 0 : break;
796 : }
797 : // GetPreviousContent below expects mRootNode to be an ancestor of node.
798 0 : if (!nsContentUtils::ContentIsDescendantOf(node, mRootNode)) {
799 0 : break;
800 : }
801 0 : node = node->GetPreviousContent(mRootNode);
802 : }
803 :
804 : // Now build up the string moving forward through the DOM until we reach
805 : // the soft end and *then* see a DOM word separator, a non-inline-element
806 : // boundary, or the hard end node.
807 0 : mSoftText.Truncate();
808 0 : mSoftTextDOMMapping.Clear();
809 0 : bool seenSoftEnd = false;
810 : // Leave this outside the loop so large heap string allocations can be reused
811 : // across iterations
812 0 : while (node) {
813 0 : if (node == mSoftEnd.mNode) {
814 0 : seenSoftEnd = true;
815 : }
816 :
817 0 : bool exit = false;
818 0 : if (IsSpellCheckingTextNode(node)) {
819 0 : nsIContent* content = static_cast<nsIContent*>(node);
820 0 : NS_ASSERTION(content, "Where is our content?");
821 0 : const nsTextFragment* textFragment = content->GetText();
822 0 : NS_ASSERTION(textFragment, "Where is our text?");
823 0 : int32_t lastOffsetInNode = textFragment->GetLength();
824 :
825 0 : if (seenSoftEnd) {
826 : // check whether we can stop after this
827 0 : for (int32_t i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0;
828 0 : i < int32_t(textFragment->GetLength()); ++i) {
829 0 : if (IsDOMWordSeparator(textFragment->CharAt(i))) {
830 0 : exit = true;
831 : // stop at the first separator after the soft end point
832 0 : lastOffsetInNode = i;
833 0 : break;
834 : }
835 : }
836 : }
837 :
838 0 : if (firstOffsetInNode < lastOffsetInNode) {
839 0 : int32_t len = lastOffsetInNode - firstOffsetInNode;
840 0 : mSoftTextDOMMapping.AppendElement(
841 0 : DOMTextMapping(NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len));
842 :
843 0 : bool ok = textFragment->AppendTo(mSoftText, firstOffsetInNode, len,
844 0 : mozilla::fallible);
845 0 : if (!ok) {
846 : // probably out of memory, remove from mSoftTextDOMMapping
847 0 : mSoftTextDOMMapping.RemoveElementAt(mSoftTextDOMMapping.Length() - 1);
848 0 : exit = true;
849 : }
850 : }
851 :
852 0 : firstOffsetInNode = 0;
853 : }
854 :
855 0 : if (exit)
856 0 : break;
857 :
858 0 : CheckLeavingBreakElementClosure closure = { false };
859 0 : node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure);
860 0 : if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {
861 : // We left, or are entering, a break element (e.g., block). Maybe we can
862 : // stop now.
863 0 : if (seenSoftEnd)
864 0 : break;
865 : // Record the break
866 0 : mSoftText.Append(' ');
867 : }
868 : }
869 :
870 : #ifdef DEBUG_SPELLCHECK
871 : printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText).get());
872 : #endif
873 0 : }
874 :
875 : nsresult
876 0 : mozInlineSpellWordUtil::BuildRealWords()
877 : {
878 : // This is pretty simple. We just have to walk mSoftText, tokenizing it
879 : // into "real words".
880 : // We do an outer traversal of words delimited by IsDOMWordSeparator, calling
881 : // SplitDOMWord on each of those DOM words
882 0 : int32_t wordStart = -1;
883 0 : mRealWords.Clear();
884 0 : for (int32_t i = 0; i < int32_t(mSoftText.Length()); ++i) {
885 0 : if (IsDOMWordSeparator(mSoftText.CharAt(i))) {
886 0 : if (wordStart >= 0) {
887 0 : nsresult rv = SplitDOMWord(wordStart, i);
888 0 : if (NS_FAILED(rv)) {
889 0 : return rv;
890 : }
891 0 : wordStart = -1;
892 : }
893 : } else {
894 0 : if (wordStart < 0) {
895 0 : wordStart = i;
896 : }
897 : }
898 : }
899 0 : if (wordStart >= 0) {
900 0 : nsresult rv = SplitDOMWord(wordStart, mSoftText.Length());
901 0 : if (NS_FAILED(rv)) {
902 0 : return rv;
903 : }
904 : }
905 :
906 0 : return NS_OK;
907 : }
908 :
909 : /*********** DOM/realwords<->mSoftText mapping functions ************/
910 :
911 : int32_t
912 0 : mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset)
913 : {
914 0 : if (!mSoftTextValid) {
915 0 : NS_ERROR("Soft text must be valid if we're to map into it");
916 0 : return -1;
917 : }
918 :
919 0 : for (int32_t i = 0; i < int32_t(mSoftTextDOMMapping.Length()); ++i) {
920 0 : const DOMTextMapping& map = mSoftTextDOMMapping[i];
921 0 : if (map.mNodeOffset.mNode == aNodeOffset.mNode) {
922 : // Allow offsets at either end of the string, in particular, allow the
923 : // offset that's at the end of the contributed string
924 : int32_t offsetInContributedString =
925 0 : aNodeOffset.mOffset - map.mNodeOffset.mOffset;
926 0 : if (offsetInContributedString >= 0 &&
927 0 : offsetInContributedString <= map.mLength)
928 0 : return map.mSoftTextOffset + offsetInContributedString;
929 0 : return -1;
930 : }
931 : }
932 0 : return -1;
933 : }
934 :
935 : namespace {
936 :
// Comparator functor for a binary search over elements that expose an
// |mSoftTextOffset| member.
//
// It yields -1 when the target offset is strictly less than the element's
// offset and 1 otherwise. It never yields 0, so a search driven by it cannot
// stop early on an "equal" element and instead converges on the first
// element whose offset is larger than the target.
template<class T>
class FirstLargerOffset
{
public:
  explicit FirstLargerOffset(int32_t aSoftTextOffset)
    : mSoftTextOffset(aSoftTextOffset)
  {
  }

  int operator()(const T& t) const
  {
    if (mSoftTextOffset < t.mSoftTextOffset) {
      return -1;
    }
    // Equal or smaller element offset: keep searching to the right.
    return 1;
  }

private:
  // The soft-text offset being searched for.
  int32_t mSoftTextOffset;
};
950 :
951 : template<class T>
952 : bool
953 0 : FindLastNongreaterOffset(const nsTArray<T>& aContainer, int32_t aSoftTextOffset, size_t* aIndex)
954 : {
955 0 : if (aContainer.Length() == 0) {
956 0 : return false;
957 : }
958 :
959 0 : BinarySearchIf(aContainer, 0, aContainer.Length(),
960 : FirstLargerOffset<T>(aSoftTextOffset), aIndex);
961 0 : if (*aIndex > 0) {
962 : // There was at least one mapping with offset <= aSoftTextOffset. Step back
963 : // to find the last element with |mSoftTextOffset <= aSoftTextOffset|.
964 0 : *aIndex -= 1;
965 : } else {
966 : // Every mapping had offset greater than aSoftTextOffset.
967 0 : MOZ_ASSERT(aContainer[*aIndex].mSoftTextOffset > aSoftTextOffset);
968 : }
969 0 : return true;
970 : }
971 :
972 : } // namespace
973 :
974 : mozInlineSpellWordUtil::NodeOffset
975 0 : mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(int32_t aSoftTextOffset,
976 : DOMMapHint aHint)
977 : {
978 0 : NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
979 0 : if (!mSoftTextValid)
980 0 : return NodeOffset(nullptr, -1);
981 :
982 : // Find the last mapping, if any, such that mSoftTextOffset <= aSoftTextOffset
983 : size_t index;
984 0 : bool found = FindLastNongreaterOffset(mSoftTextDOMMapping, aSoftTextOffset, &index);
985 0 : if (!found) {
986 0 : return NodeOffset(nullptr, -1);
987 : }
988 :
989 : // 'index' is now the last mapping, if any, such that
990 : // mSoftTextOffset <= aSoftTextOffset.
991 : // If we're doing HINT_END, then we may want to return the end of the
992 : // the previous mapping instead of the start of this mapping
993 0 : if (aHint == HINT_END && index > 0) {
994 0 : const DOMTextMapping& map = mSoftTextDOMMapping[index - 1];
995 0 : if (map.mSoftTextOffset + map.mLength == aSoftTextOffset)
996 0 : return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + map.mLength);
997 : }
998 :
999 : // We allow ourselves to return the end of this mapping even if we're
1000 : // doing HINT_START. This will only happen if there is no mapping which this
1001 : // point is the start of. I'm not 100% sure this is OK...
1002 0 : const DOMTextMapping& map = mSoftTextDOMMapping[index];
1003 0 : int32_t offset = aSoftTextOffset - map.mSoftTextOffset;
1004 0 : if (offset >= 0 && offset <= map.mLength)
1005 0 : return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset);
1006 :
1007 0 : return NodeOffset(nullptr, -1);
1008 : }
1009 :
1010 : int32_t
1011 0 : mozInlineSpellWordUtil::FindRealWordContaining(int32_t aSoftTextOffset,
1012 : DOMMapHint aHint, bool aSearchForward)
1013 : {
1014 0 : NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
1015 0 : if (!mSoftTextValid)
1016 0 : return -1;
1017 :
1018 : // Find the last word, if any, such that mSoftTextOffset <= aSoftTextOffset
1019 : size_t index;
1020 0 : bool found = FindLastNongreaterOffset(mRealWords, aSoftTextOffset, &index);
1021 0 : if (!found) {
1022 0 : return -1;
1023 : }
1024 :
1025 : // 'index' is now the last word, if any, such that
1026 : // mSoftTextOffset <= aSoftTextOffset.
1027 : // If we're doing HINT_END, then we may want to return the end of the
1028 : // the previous word instead of the start of this word
1029 0 : if (aHint == HINT_END && index > 0) {
1030 0 : const RealWord& word = mRealWords[index - 1];
1031 0 : if (word.mSoftTextOffset + word.mLength == aSoftTextOffset)
1032 0 : return index - 1;
1033 : }
1034 :
1035 : // We allow ourselves to return the end of this word even if we're
1036 : // doing HINT_START. This will only happen if there is no word which this
1037 : // point is the start of. I'm not 100% sure this is OK...
1038 0 : const RealWord& word = mRealWords[index];
1039 0 : int32_t offset = aSoftTextOffset - word.mSoftTextOffset;
1040 0 : if (offset >= 0 && offset <= static_cast<int32_t>(word.mLength))
1041 0 : return index;
1042 :
1043 0 : if (aSearchForward) {
1044 0 : if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
1045 : // All words have mSoftTextOffset > aSoftTextOffset
1046 0 : return 0;
1047 : }
1048 : // 'index' is the last word such that mSoftTextOffset <= aSoftTextOffset.
1049 : // Word index+1, if it exists, will be the first with
1050 : // mSoftTextOffset > aSoftTextOffset.
1051 0 : if (index + 1 < mRealWords.Length())
1052 0 : return index + 1;
1053 : }
1054 :
1055 0 : return -1;
1056 : }
1057 :
1058 : // mozInlineSpellWordUtil::SplitDOMWord
1059 :
1060 : nsresult
1061 0 : mozInlineSpellWordUtil::SplitDOMWord(int32_t aStart, int32_t aEnd)
1062 : {
1063 0 : WordSplitState state(this, mSoftText, aStart, aEnd - aStart);
1064 0 : state.mCurCharClass = state.ClassifyCharacter(0, true);
1065 :
1066 0 : state.AdvanceThroughSeparators();
1067 0 : if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT &&
1068 0 : state.IsSpecialWord()) {
1069 0 : int32_t specialWordLength = state.mDOMWordText.Length() - state.mDOMWordOffset;
1070 0 : if (!mRealWords.AppendElement(
1071 0 : RealWord(aStart + state.mDOMWordOffset, specialWordLength, false),
1072 : fallible)) {
1073 0 : return NS_ERROR_OUT_OF_MEMORY;
1074 : }
1075 :
1076 0 : return NS_OK;
1077 : }
1078 :
1079 0 : while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {
1080 0 : state.AdvanceThroughSeparators();
1081 0 : if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT)
1082 0 : break;
1083 :
1084 : // save the beginning of the word
1085 0 : int32_t wordOffset = state.mDOMWordOffset;
1086 :
1087 : // find the end of the word
1088 0 : state.AdvanceThroughWord();
1089 0 : int32_t wordLen = state.mDOMWordOffset - wordOffset;
1090 0 : if (!mRealWords.AppendElement(
1091 0 : RealWord(aStart + wordOffset, wordLen,
1092 0 : !state.ShouldSkipWord(wordOffset, wordLen)), fallible)) {
1093 0 : return NS_ERROR_OUT_OF_MEMORY;
1094 : }
1095 : }
1096 :
1097 0 : return NS_OK;
1098 : }
|