Line data Source code
1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 : #include "mozEnglishWordUtils.h"
7 : #include "nsReadableUtils.h"
8 : #include "nsIServiceManager.h"
9 : #include "nsUnicharUtils.h"
10 : #include "nsUnicodeProperties.h"
11 : #include "nsCRT.h"
12 : #include "mozilla/Likely.h"
13 :
// XPCOM boilerplate for mozEnglishWordUtils, generated entirely by macros:
//  - cycle-collecting AddRef/Release implementations,
//  - the interface map, which exposes mozISpellI18NUtil and resolves
//    nsISupports through mozISpellI18NUtil (the _AMBIGUOUS entry),
//  - the cycle-collection definition, which names mURLDetector as the member
//    handed to NS_IMPL_CYCLE_COLLECTION.
14 0 : NS_IMPL_CYCLE_COLLECTING_ADDREF(mozEnglishWordUtils)
15 0 : NS_IMPL_CYCLE_COLLECTING_RELEASE(mozEnglishWordUtils)
16 :
17 0 : NS_INTERFACE_MAP_BEGIN(mozEnglishWordUtils)
18 0 : NS_INTERFACE_MAP_ENTRY(mozISpellI18NUtil)
19 0 : NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, mozISpellI18NUtil)
20 0 : NS_INTERFACE_MAP_ENTRIES_CYCLE_COLLECTION(mozEnglishWordUtils)
21 0 : NS_INTERFACE_MAP_END
22 :
23 0 : NS_IMPL_CYCLE_COLLECTION(mozEnglishWordUtils,
24 : mURLDetector)
25 :
26 0 : mozEnglishWordUtils::mozEnglishWordUtils()
27 : {
28 0 : mLanguage.AssignLiteral("en");
29 :
30 : nsresult rv;
31 0 : mURLDetector = do_CreateInstance(MOZ_TXTTOHTMLCONV_CONTRACTID, &rv);
32 0 : }
33 :
34 0 : mozEnglishWordUtils::~mozEnglishWordUtils()
35 : {
36 0 : }
37 :
38 0 : NS_IMETHODIMP mozEnglishWordUtils::GetLanguage(char16_t * *aLanguage)
39 : {
40 0 : NS_ENSURE_ARG_POINTER(aLanguage);
41 :
42 0 : *aLanguage = ToNewUnicode(mLanguage);
43 0 : if (!*aLanguage) {
44 0 : return NS_ERROR_OUT_OF_MEMORY;
45 : }
46 0 : return NS_OK;
47 : }
48 :
49 : // return the possible root forms of aWord.
50 0 : NS_IMETHODIMP mozEnglishWordUtils::GetRootForm(const char16_t *aWord, uint32_t type, char16_t ***words, uint32_t *count)
51 : {
52 0 : nsAutoString word(aWord);
53 : char16_t **tmpPtr;
54 0 : int32_t length = word.Length();
55 :
56 0 : *count = 0;
57 :
58 0 : mozEnglishWordUtils::myspCapitalization ct = captype(word);
59 0 : switch (ct)
60 : {
61 : case HuhCap:
62 : case NoCap:
63 0 : tmpPtr = (char16_t **)moz_xmalloc(sizeof(char16_t *));
64 0 : if (!tmpPtr)
65 0 : return NS_ERROR_OUT_OF_MEMORY;
66 0 : tmpPtr[0] = ToNewUnicode(word);
67 0 : if (!tmpPtr[0]) {
68 0 : NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr);
69 0 : return NS_ERROR_OUT_OF_MEMORY;
70 : }
71 0 : *words = tmpPtr;
72 0 : *count = 1;
73 0 : break;
74 :
75 :
76 : case AllCap:
77 0 : tmpPtr = (char16_t **)moz_xmalloc(sizeof(char16_t *) * 3);
78 0 : if (!tmpPtr)
79 0 : return NS_ERROR_OUT_OF_MEMORY;
80 0 : tmpPtr[0] = ToNewUnicode(word);
81 0 : if (!tmpPtr[0]) {
82 0 : NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr);
83 0 : return NS_ERROR_OUT_OF_MEMORY;
84 : }
85 0 : ToLowerCase(tmpPtr[0], tmpPtr[0], length);
86 :
87 0 : tmpPtr[1] = ToNewUnicode(word);
88 0 : if (!tmpPtr[1]) {
89 0 : NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr);
90 0 : return NS_ERROR_OUT_OF_MEMORY;
91 : }
92 0 : ToLowerCase(tmpPtr[1], tmpPtr[1], length);
93 0 : ToUpperCase(tmpPtr[1], tmpPtr[1], 1);
94 :
95 0 : tmpPtr[2] = ToNewUnicode(word);
96 0 : if (!tmpPtr[2]) {
97 0 : NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(2, tmpPtr);
98 0 : return NS_ERROR_OUT_OF_MEMORY;
99 : }
100 :
101 0 : *words = tmpPtr;
102 0 : *count = 3;
103 0 : break;
104 :
105 : case InitCap:
106 0 : tmpPtr = (char16_t **)moz_xmalloc(sizeof(char16_t *) * 2);
107 0 : if (!tmpPtr)
108 0 : return NS_ERROR_OUT_OF_MEMORY;
109 :
110 0 : tmpPtr[0] = ToNewUnicode(word);
111 0 : if (!tmpPtr[0]) {
112 0 : NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr);
113 0 : return NS_ERROR_OUT_OF_MEMORY;
114 : }
115 0 : ToLowerCase(tmpPtr[0], tmpPtr[0], length);
116 :
117 0 : tmpPtr[1] = ToNewUnicode(word);
118 0 : if (!tmpPtr[1]) {
119 0 : NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr);
120 0 : return NS_ERROR_OUT_OF_MEMORY;
121 : }
122 :
123 0 : *words = tmpPtr;
124 0 : *count = 2;
125 0 : break;
126 : default:
127 0 : return NS_ERROR_FAILURE; // should never get here;
128 : }
129 0 : return NS_OK;
130 : }
131 :
132 : // This needs vast improvement
133 0 : bool mozEnglishWordUtils::ucIsAlpha(char16_t aChar)
134 : {
135 : // XXX we have to fix callers to handle the full Unicode range
136 0 : return nsUGenCategory::kLetter == mozilla::unicode::GetGenCategory(aChar);
137 : }
138 :
139 0 : NS_IMETHODIMP mozEnglishWordUtils::FindNextWord(const char16_t *word, uint32_t length, uint32_t offset, int32_t *begin, int32_t *end)
140 : {
141 0 : const char16_t *p = word + offset;
142 0 : const char16_t *endbuf = word + length;
143 0 : const char16_t *startWord=p;
144 0 : if(p<endbuf){
145 : // XXX These loops should be modified to handle non-BMP characters.
146 : // if previous character is a word character, need to advance out of the word
147 0 : if (offset > 0 && ucIsAlpha(*(p-1))) {
148 0 : while (p < endbuf && ucIsAlpha(*p))
149 0 : p++;
150 : }
151 0 : while((p < endbuf) && (!ucIsAlpha(*p)))
152 : {
153 0 : p++;
154 : }
155 0 : startWord=p;
156 0 : while((p < endbuf) && ((ucIsAlpha(*p))||(*p=='\'')))
157 : {
158 0 : p++;
159 : }
160 :
161 : // we could be trying to break down a url, we don't want to break a url into parts,
162 : // instead we want to find out if it really is a url and if so, skip it, advancing startWord
163 : // to a point after the url.
164 :
165 : // before we spend more time looking to see if the word is a url, look for a url identifer
166 : // and make sure that identifer isn't the last character in the word fragment.
167 0 : if ( (*p == ':' || *p == '@' || *p == '.') && p < endbuf - 1) {
168 :
169 : // ok, we have a possible url...do more research to find out if we really have one
170 : // and determine the length of the url so we can skip over it.
171 :
172 0 : if (mURLDetector)
173 : {
174 0 : int32_t startPos = -1;
175 0 : int32_t endPos = -1;
176 :
177 0 : mURLDetector->FindURLInPlaintext(startWord, endbuf - startWord, p - startWord, &startPos, &endPos);
178 :
179 : // ok, if we got a url, adjust the array bounds, skip the current url text and find the next word again
180 0 : if (startPos != -1 && endPos != -1) {
181 0 : startWord = p + endPos + 1; // skip over the url
182 0 : p = startWord; // reset p
183 :
184 : // now recursively call FindNextWord to search for the next word now that we have skipped the url
185 0 : return FindNextWord(word, length, startWord - word, begin, end);
186 : }
187 : }
188 : }
189 :
190 0 : while((p > startWord)&&(*(p-1) == '\'')){ // trim trailing apostrophes
191 0 : p--;
192 : }
193 : }
194 : else{
195 0 : startWord = endbuf;
196 : }
197 0 : if(startWord == endbuf){
198 0 : *begin = -1;
199 0 : *end = -1;
200 : }
201 : else{
202 0 : *begin = startWord-word;
203 0 : *end = p-word;
204 : }
205 0 : return NS_OK;
206 : }
207 :
208 : mozEnglishWordUtils::myspCapitalization
209 0 : mozEnglishWordUtils::captype(const nsString &word)
210 : {
211 0 : char16_t* lword=ToNewUnicode(word);
212 0 : ToUpperCase(lword,lword,word.Length());
213 0 : if(word.Equals(lword)){
214 0 : free(lword);
215 0 : return AllCap;
216 : }
217 :
218 0 : ToLowerCase(lword,lword,word.Length());
219 0 : if(word.Equals(lword)){
220 0 : free(lword);
221 0 : return NoCap;
222 : }
223 0 : int32_t length=word.Length();
224 0 : if(Substring(word,1,length-1).Equals(lword+1)){
225 0 : free(lword);
226 0 : return InitCap;
227 : }
228 0 : free(lword);
229 0 : return HuhCap;
230 : }
231 :
232 : // Convert the list of words in iwords to the same capitalization aWord and
233 : // return them in owords.
234 0 : NS_IMETHODIMP mozEnglishWordUtils::FromRootForm(const char16_t *aWord, const char16_t **iwords, uint32_t icount, char16_t ***owords, uint32_t *ocount)
235 : {
236 0 : nsAutoString word(aWord);
237 0 : nsresult rv = NS_OK;
238 :
239 : int32_t length;
240 0 : char16_t **tmpPtr = (char16_t **)moz_xmalloc(sizeof(char16_t *)*icount);
241 0 : if (!tmpPtr)
242 0 : return NS_ERROR_OUT_OF_MEMORY;
243 :
244 0 : mozEnglishWordUtils::myspCapitalization ct = captype(word);
245 0 : for(uint32_t i = 0; i < icount; ++i) {
246 0 : length = NS_strlen(iwords[i]);
247 0 : tmpPtr[i] = (char16_t *) moz_xmalloc(sizeof(char16_t) * (length + 1));
248 0 : if (MOZ_UNLIKELY(!tmpPtr[i])) {
249 0 : NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(i, tmpPtr);
250 0 : return NS_ERROR_OUT_OF_MEMORY;
251 : }
252 0 : memcpy(tmpPtr[i], iwords[i], (length + 1) * sizeof(char16_t));
253 :
254 0 : nsAutoString capTest(tmpPtr[i]);
255 0 : mozEnglishWordUtils::myspCapitalization newCt=captype(capTest);
256 0 : if(newCt == NoCap){
257 0 : switch(ct)
258 : {
259 : case HuhCap:
260 : case NoCap:
261 0 : break;
262 : case AllCap:
263 0 : ToUpperCase(tmpPtr[i],tmpPtr[i],length);
264 0 : rv = NS_OK;
265 0 : break;
266 : case InitCap:
267 0 : ToUpperCase(tmpPtr[i],tmpPtr[i],1);
268 0 : rv = NS_OK;
269 0 : break;
270 : default:
271 0 : rv = NS_ERROR_FAILURE; // should never get here;
272 0 : break;
273 :
274 : }
275 : }
276 : }
277 0 : if (NS_SUCCEEDED(rv)){
278 0 : *owords = tmpPtr;
279 0 : *ocount = icount;
280 : }
281 0 : return rv;
282 : }
283 :
|