Line data Source code
1 : /* ***** BEGIN LICENSE BLOCK *****
2 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 : *
4 : * Copyright (C) 2002-2017 Németh László
5 : *
6 : * The contents of this file are subject to the Mozilla Public License Version
7 : * 1.1 (the "License"); you may not use this file except in compliance with
8 : * the License. You may obtain a copy of the License at
9 : * http://www.mozilla.org/MPL/
10 : *
11 : * Software distributed under the License is distributed on an "AS IS" basis,
12 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 : * for the specific language governing rights and limitations under the
14 : * License.
15 : *
16 : * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17 : *
18 : * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19 : * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20 : * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21 : * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22 : * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23 : *
24 : * Alternatively, the contents of this file may be used under the terms of
25 : * either the GNU General Public License Version 2 or later (the "GPL"), or
26 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 : * in which case the provisions of the GPL or the LGPL are applicable instead
28 : * of those above. If you wish to allow use of your version of this file only
29 : * under the terms of either the GPL or the LGPL, and not to allow others to
30 : * use your version of this file under the terms of the MPL, indicate your
31 : * decision by deleting the provisions above and replace them with the notice
32 : * and other provisions required by the GPL or the LGPL. If you do not delete
33 : * the provisions above, a recipient may use your version of this file under
34 : * the terms of any one of the MPL, the GPL or the LGPL.
35 : *
36 : * ***** END LICENSE BLOCK ***** */
37 : /*
38 : * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39 : * And Contributors. All rights reserved.
40 : *
41 : * Redistribution and use in source and binary forms, with or without
42 : * modification, are permitted provided that the following conditions
43 : * are met:
44 : *
45 : * 1. Redistributions of source code must retain the above copyright
46 : * notice, this list of conditions and the following disclaimer.
47 : *
48 : * 2. Redistributions in binary form must reproduce the above copyright
49 : * notice, this list of conditions and the following disclaimer in the
50 : * documentation and/or other materials provided with the distribution.
51 : *
52 : * 3. All modifications to the source code must be clearly marked as
53 : * such. Binary redistributions based on modified source code
54 : * must be clearly marked as modified versions in the documentation
55 : * and/or other materials provided with the distribution.
56 : *
57 : * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58 : * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59 : * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60 : * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
61 : * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62 : * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63 : * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64 : * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 : * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 : * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 : * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 : * SUCH DAMAGE.
69 : */
70 :
71 : #include <stdlib.h>
72 : #include <string.h>
73 : #include <stdio.h>
74 : #include <ctype.h>
75 : #include <limits>
76 : #include <sstream>
77 :
78 : #include "hashmgr.hxx"
79 : #include "csutil.hxx"
80 : #include "atypes.hxx"
81 :
82 : // build a hash table from a munched word list
83 :
84 0 : HashMgr::HashMgr(const char* tpath, const char* apath, const char* key)
85 : : tablesize(0),
86 : tableptr(NULL),
87 : flag_mode(FLAG_CHAR),
88 : complexprefixes(0),
89 : utf8(0),
90 : forbiddenword(FORBIDDENWORD) // forbidden word signing flag
91 : ,
92 : numaliasf(0),
93 : aliasf(NULL),
94 : aliasflen(0),
95 : numaliasm(0),
96 0 : aliasm(NULL) {
97 0 : langnum = 0;
98 0 : csconv = 0;
99 0 : load_config(apath, key);
100 0 : int ec = load_tables(tpath, key);
101 0 : if (ec) {
102 : /* error condition - what should we do here */
103 0 : HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n", ec);
104 0 : free(tableptr);
105 : //keep tablesize to 1 to fix possible division with zero
106 0 : tablesize = 1;
107 0 : tableptr = (struct hentry**)calloc(tablesize, sizeof(struct hentry*));
108 0 : if (!tableptr) {
109 0 : tablesize = 0;
110 : }
111 : }
112 0 : }
113 :
114 0 : HashMgr::~HashMgr() {
115 0 : if (tableptr) {
116 : // now pass through hash table freeing up everything
117 : // go through column by column of the table
118 0 : for (int i = 0; i < tablesize; i++) {
119 0 : struct hentry* pt = tableptr[i];
120 0 : struct hentry* nt = NULL;
121 0 : while (pt) {
122 0 : nt = pt->next;
123 0 : if (pt->astr &&
124 0 : (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen)))
125 0 : free(pt->astr);
126 0 : free(pt);
127 0 : pt = nt;
128 : }
129 : }
130 0 : free(tableptr);
131 : }
132 0 : tablesize = 0;
133 :
134 0 : if (aliasf) {
135 0 : for (int j = 0; j < (numaliasf); j++)
136 0 : free(aliasf[j]);
137 0 : free(aliasf);
138 0 : aliasf = NULL;
139 0 : if (aliasflen) {
140 0 : free(aliasflen);
141 0 : aliasflen = NULL;
142 : }
143 : }
144 0 : if (aliasm) {
145 0 : for (int j = 0; j < (numaliasm); j++)
146 0 : free(aliasm[j]);
147 0 : free(aliasm);
148 0 : aliasm = NULL;
149 : }
150 :
151 : #ifndef OPENOFFICEORG
152 : #ifndef MOZILLA_CLIENT
153 : if (utf8)
154 : free_utf_tbl();
155 : #endif
156 : #endif
157 :
158 : #ifdef MOZILLA_CLIENT
159 0 : delete[] csconv;
160 : #endif
161 0 : }
162 :
163 : // lookup a root word in the hashtable
164 :
165 0 : struct hentry* HashMgr::lookup(const char* word) const {
166 : struct hentry* dp;
167 0 : if (tableptr) {
168 0 : dp = tableptr[hash(word)];
169 0 : if (!dp)
170 0 : return NULL;
171 0 : for (; dp != NULL; dp = dp->next) {
172 0 : if (strcmp(word, dp->word) == 0)
173 0 : return dp;
174 : }
175 : }
176 0 : return NULL;
177 : }
178 :
179 : // add a word to the hash table (private)
180 0 : int HashMgr::add_word(const std::string& in_word,
181 : int wcl,
182 : unsigned short* aff,
183 : int al,
184 : const std::string* in_desc,
185 : bool onlyupcase) {
186 0 : const std::string* word = &in_word;
187 0 : const std::string* desc = in_desc;
188 :
189 0 : std::string *word_copy = NULL;
190 0 : std::string *desc_copy = NULL;
191 0 : if (!ignorechars.empty() || complexprefixes) {
192 0 : word_copy = new std::string(in_word);
193 :
194 0 : if (!ignorechars.empty()) {
195 0 : if (utf8) {
196 0 : wcl = remove_ignored_chars_utf(*word_copy, ignorechars_utf16);
197 : } else {
198 0 : remove_ignored_chars(*word_copy, ignorechars);
199 : }
200 : }
201 :
202 0 : if (complexprefixes) {
203 0 : if (utf8)
204 0 : wcl = reverseword_utf(*word_copy);
205 : else
206 0 : reverseword(*word_copy);
207 :
208 0 : if (in_desc && !aliasm) {
209 0 : desc_copy = new std::string(*in_desc);
210 :
211 0 : if (complexprefixes) {
212 0 : if (utf8)
213 0 : reverseword_utf(*desc_copy);
214 : else
215 0 : reverseword(*desc_copy);
216 : }
217 0 : desc = desc_copy;
218 : }
219 : }
220 :
221 0 : word = word_copy;
222 : }
223 :
224 0 : bool upcasehomonym = false;
225 0 : int descl = desc ? (aliasm ? sizeof(char*) : desc->size() + 1) : 0;
226 : // variable-length hash record with word and optional fields
227 : struct hentry* hp =
228 0 : (struct hentry*)malloc(sizeof(struct hentry) + word->size() + descl);
229 0 : if (!hp) {
230 0 : delete desc_copy;
231 0 : delete word_copy;
232 0 : return 1;
233 : }
234 :
235 0 : char* hpw = hp->word;
236 0 : strcpy(hpw, word->c_str());
237 :
238 0 : int i = hash(hpw);
239 :
240 0 : hp->blen = (unsigned char)word->size();
241 0 : hp->clen = (unsigned char)wcl;
242 0 : hp->alen = (short)al;
243 0 : hp->astr = aff;
244 0 : hp->next = NULL;
245 0 : hp->next_homonym = NULL;
246 :
247 : // store the description string or its pointer
248 0 : if (desc) {
249 0 : hp->var = H_OPT;
250 0 : if (aliasm) {
251 0 : hp->var += H_OPT_ALIASM;
252 0 : store_pointer(hpw + word->size() + 1, get_aliasm(atoi(desc->c_str())));
253 : } else {
254 0 : strcpy(hpw + word->size() + 1, desc->c_str());
255 : }
256 0 : if (strstr(HENTRY_DATA(hp), MORPH_PHON))
257 0 : hp->var += H_OPT_PHON;
258 : } else
259 0 : hp->var = 0;
260 :
261 0 : struct hentry* dp = tableptr[i];
262 0 : if (!dp) {
263 0 : tableptr[i] = hp;
264 0 : delete desc_copy;
265 0 : delete word_copy;
266 0 : return 0;
267 : }
268 0 : while (dp->next != NULL) {
269 0 : if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) {
270 : // remove hidden onlyupcase homonym
271 0 : if (!onlyupcase) {
272 0 : if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
273 0 : free(dp->astr);
274 0 : dp->astr = hp->astr;
275 0 : dp->alen = hp->alen;
276 0 : free(hp);
277 0 : delete desc_copy;
278 0 : delete word_copy;
279 0 : return 0;
280 : } else {
281 0 : dp->next_homonym = hp;
282 : }
283 : } else {
284 0 : upcasehomonym = true;
285 : }
286 : }
287 0 : dp = dp->next;
288 : }
289 0 : if (strcmp(hp->word, dp->word) == 0) {
290 : // remove hidden onlyupcase homonym
291 0 : if (!onlyupcase) {
292 0 : if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
293 0 : free(dp->astr);
294 0 : dp->astr = hp->astr;
295 0 : dp->alen = hp->alen;
296 0 : free(hp);
297 0 : delete desc_copy;
298 0 : delete word_copy;
299 0 : return 0;
300 : } else {
301 0 : dp->next_homonym = hp;
302 : }
303 : } else {
304 0 : upcasehomonym = true;
305 : }
306 : }
307 0 : if (!upcasehomonym) {
308 0 : dp->next = hp;
309 : } else {
310 : // remove hidden onlyupcase homonym
311 0 : if (hp->astr)
312 0 : free(hp->astr);
313 0 : free(hp);
314 : }
315 :
316 0 : delete desc_copy;
317 0 : delete word_copy;
318 0 : return 0;
319 : }
320 :
321 0 : int HashMgr::add_hidden_capitalized_word(const std::string& word,
322 : int wcl,
323 : unsigned short* flags,
324 : int flagslen,
325 : const std::string* dp,
326 : int captype) {
327 0 : if (flags == NULL)
328 0 : flagslen = 0;
329 :
330 : // add inner capitalized forms to handle the following allcap forms:
331 : // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG
332 : // Allcaps with suffixes: CIA's -> CIA'S
333 0 : if (((captype == HUHCAP) || (captype == HUHINITCAP) ||
334 0 : ((captype == ALLCAP) && (flagslen != 0))) &&
335 0 : !((flagslen != 0) && TESTAFF(flags, forbiddenword, flagslen))) {
336 : unsigned short* flags2 =
337 0 : (unsigned short*)malloc(sizeof(unsigned short) * (flagslen + 1));
338 0 : if (!flags2)
339 0 : return 1;
340 0 : if (flagslen)
341 0 : memcpy(flags2, flags, flagslen * sizeof(unsigned short));
342 0 : flags2[flagslen] = ONLYUPCASEFLAG;
343 0 : if (utf8) {
344 0 : std::string st;
345 0 : std::vector<w_char> w;
346 0 : u8_u16(w, word);
347 0 : mkallsmall_utf(w, langnum);
348 0 : mkinitcap_utf(w, langnum);
349 0 : u16_u8(st, w);
350 0 : return add_word(st, wcl, flags2, flagslen + 1, dp, true);
351 : } else {
352 0 : std::string new_word(word);
353 0 : mkallsmall(new_word, csconv);
354 0 : mkinitcap(new_word, csconv);
355 0 : int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true);
356 0 : return ret;
357 : }
358 : }
359 0 : return 0;
360 : }
361 :
362 : // detect captype and modify word length for UTF-8 encoding
363 0 : int HashMgr::get_clen_and_captype(const std::string& word, int* captype, std::vector<w_char> &workbuf) {
364 : int len;
365 0 : if (utf8) {
366 0 : len = u8_u16(workbuf, word);
367 0 : *captype = get_captype_utf8(workbuf, langnum);
368 : } else {
369 0 : len = word.size();
370 0 : *captype = get_captype(word, csconv);
371 : }
372 0 : return len;
373 : }
374 :
375 0 : int HashMgr::get_clen_and_captype(const std::string& word, int* captype) {
376 0 : std::vector<w_char> workbuf;
377 0 : return get_clen_and_captype(word, captype, workbuf);
378 : }
379 :
380 : // remove word (personal dictionary function for standalone applications)
381 0 : int HashMgr::remove(const std::string& word) {
382 0 : struct hentry* dp = lookup(word.c_str());
383 0 : while (dp) {
384 0 : if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) {
385 : unsigned short* flags =
386 0 : (unsigned short*)malloc(sizeof(unsigned short) * (dp->alen + 1));
387 0 : if (!flags)
388 0 : return 1;
389 0 : for (int i = 0; i < dp->alen; i++)
390 0 : flags[i] = dp->astr[i];
391 0 : flags[dp->alen] = forbiddenword;
392 0 : free(dp->astr);
393 0 : dp->astr = flags;
394 0 : dp->alen++;
395 0 : std::sort(flags, flags + dp->alen);
396 : }
397 0 : dp = dp->next_homonym;
398 : }
399 0 : return 0;
400 : }
401 :
402 : /* remove forbidden flag to add a personal word to the hash */
403 0 : int HashMgr::remove_forbidden_flag(const std::string& word) {
404 0 : struct hentry* dp = lookup(word.c_str());
405 0 : if (!dp)
406 0 : return 1;
407 0 : while (dp) {
408 0 : if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) {
409 0 : if (dp->alen == 1)
410 0 : dp->alen = 0; // XXX forbidden words of personal dic.
411 : else {
412 : unsigned short* flags2 =
413 0 : (unsigned short*)malloc(sizeof(unsigned short) * (dp->alen - 1));
414 0 : if (!flags2)
415 0 : return 1;
416 0 : int i, j = 0;
417 0 : for (i = 0; i < dp->alen; i++) {
418 0 : if (dp->astr[i] != forbiddenword)
419 0 : flags2[j++] = dp->astr[i];
420 : }
421 0 : dp->alen--;
422 0 : free(dp->astr);
423 0 : dp->astr = flags2; // XXX allowed forbidden words
424 : }
425 : }
426 0 : dp = dp->next_homonym;
427 : }
428 0 : return 0;
429 : }
430 :
431 : // add a custom dic. word to the hash table (public)
432 0 : int HashMgr::add(const std::string& word) {
433 0 : if (remove_forbidden_flag(word)) {
434 : int captype;
435 0 : int al = 0;
436 0 : unsigned short* flags = NULL;
437 0 : int wcl = get_clen_and_captype(word, &captype);
438 0 : add_word(word, wcl, flags, al, NULL, false);
439 0 : return add_hidden_capitalized_word(word, wcl, flags, al, NULL,
440 0 : captype);
441 : }
442 0 : return 0;
443 : }
444 :
445 0 : int HashMgr::add_with_affix(const std::string& word, const std::string& example) {
446 : // detect captype and modify word length for UTF-8 encoding
447 0 : struct hentry* dp = lookup(example.c_str());
448 0 : remove_forbidden_flag(word);
449 0 : if (dp && dp->astr) {
450 : int captype;
451 0 : int wcl = get_clen_and_captype(word, &captype);
452 0 : if (aliasf) {
453 0 : add_word(word, wcl, dp->astr, dp->alen, NULL, false);
454 : } else {
455 : unsigned short* flags =
456 0 : (unsigned short*)malloc(dp->alen * sizeof(unsigned short));
457 0 : if (flags) {
458 0 : memcpy((void*)flags, (void*)dp->astr,
459 0 : dp->alen * sizeof(unsigned short));
460 0 : add_word(word, wcl, flags, dp->alen, NULL, false);
461 : } else
462 0 : return 1;
463 : }
464 0 : return add_hidden_capitalized_word(word, wcl, dp->astr,
465 0 : dp->alen, NULL, captype);
466 : }
467 0 : return 1;
468 : }
469 :
470 : // walk the hash table entry by entry - null at end
471 : // initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp);
472 0 : struct hentry* HashMgr::walk_hashtable(int& col, struct hentry* hp) const {
473 0 : if (hp && hp->next != NULL)
474 0 : return hp->next;
475 0 : for (col++; col < tablesize; col++) {
476 0 : if (tableptr[col])
477 0 : return tableptr[col];
478 : }
479 : // null at end and reset to start
480 0 : col = -1;
481 0 : return NULL;
482 : }
483 :
484 : // load a munched word list and build a hash table on the fly
485 0 : int HashMgr::load_tables(const char* tpath, const char* key) {
486 : // open dictionary file
487 0 : FileMgr* dict = new FileMgr(tpath, key);
488 0 : if (dict == NULL)
489 0 : return 1;
490 :
491 : // first read the first line of file to get hash table size */
492 0 : std::string ts;
493 0 : if (!dict->getline(ts)) {
494 0 : HUNSPELL_WARNING(stderr, "error: empty dic file %s\n", tpath);
495 0 : delete dict;
496 0 : return 2;
497 : }
498 0 : mychomp(ts);
499 :
500 : /* remove byte order mark */
501 0 : if (ts.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
502 0 : ts.erase(0, 3);
503 : }
504 :
505 0 : tablesize = atoi(ts.c_str());
506 :
507 0 : int nExtra = 5 + USERWORD;
508 :
509 0 : if (tablesize <= 0 ||
510 0 : (tablesize >= (std::numeric_limits<int>::max() - 1 - nExtra) /
511 : int(sizeof(struct hentry*)))) {
512 : HUNSPELL_WARNING(
513 0 : stderr, "error: line 1: missing or bad word count in the dic file\n");
514 0 : delete dict;
515 0 : return 4;
516 : }
517 0 : tablesize += nExtra;
518 0 : if ((tablesize % 2) == 0)
519 0 : tablesize++;
520 :
521 : // allocate the hash table
522 0 : tableptr = (struct hentry**)calloc(tablesize, sizeof(struct hentry*));
523 0 : if (!tableptr) {
524 0 : delete dict;
525 0 : return 3;
526 : }
527 :
528 : // loop through all words on much list and add to hash
529 : // table and create word and affix strings
530 :
531 0 : std::vector<w_char> workbuf;
532 :
533 0 : while (dict->getline(ts)) {
534 0 : mychomp(ts);
535 : // split each line into word and morphological description
536 0 : size_t dp_pos = 0;
537 0 : while ((dp_pos = ts.find(':', dp_pos)) != std::string::npos) {
538 0 : if ((dp_pos > 3) && (ts[dp_pos - 3] == ' ' || ts[dp_pos - 3] == '\t')) {
539 0 : for (dp_pos -= 3; dp_pos > 0 && (ts[dp_pos-1] == ' ' || ts[dp_pos-1] == '\t'); --dp_pos)
540 : ;
541 0 : if (dp_pos == 0) { // missing word
542 0 : dp_pos = std::string::npos;
543 : } else {
544 0 : ++dp_pos;
545 : }
546 0 : break;
547 : }
548 0 : ++dp_pos;
549 : }
550 :
551 : // tabulator is the old morphological field separator
552 0 : size_t dp2_pos = ts.find('\t');
553 0 : if (dp2_pos != std::string::npos && (dp_pos == std::string::npos || dp2_pos < dp_pos)) {
554 0 : dp_pos = dp2_pos + 1;
555 : }
556 :
557 0 : std::string dp;
558 0 : if (dp_pos != std::string::npos) {
559 0 : dp.assign(ts.substr(dp_pos));
560 0 : ts.resize(dp_pos - 1);
561 : }
562 :
563 : // split each line into word and affix char strings
564 : // "\/" signs slash in words (not affix separator)
565 : // "/" at beginning of the line is word character (not affix separator)
566 0 : size_t ap_pos = ts.find('/');
567 0 : while (ap_pos != std::string::npos) {
568 0 : if (ap_pos == 0) {
569 0 : ++ap_pos;
570 0 : continue;
571 0 : } else if (ts[ap_pos - 1] != '\\')
572 0 : break;
573 : // replace "\/" with "/"
574 0 : ts.erase(ap_pos - 1, 1);
575 0 : ap_pos = ts.find('/', ap_pos);
576 : }
577 :
578 : unsigned short* flags;
579 : int al;
580 0 : if (ap_pos != std::string::npos && ap_pos != ts.size()) {
581 0 : std::string ap(ts.substr(ap_pos + 1));
582 0 : ts.resize(ap_pos);
583 0 : if (aliasf) {
584 0 : int index = atoi(ap.c_str());
585 0 : al = get_aliasf(index, &flags, dict);
586 0 : if (!al) {
587 0 : HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n",
588 0 : dict->getlinenum());
589 : }
590 : } else {
591 0 : al = decode_flags(&flags, ap.c_str(), dict);
592 0 : if (al == -1) {
593 0 : HUNSPELL_WARNING(stderr, "Can't allocate memory.\n");
594 0 : delete dict;
595 0 : return 6;
596 : }
597 0 : std::sort(flags, flags + al);
598 : }
599 : } else {
600 0 : al = 0;
601 0 : flags = NULL;
602 : }
603 :
604 : int captype;
605 0 : int wcl = get_clen_and_captype(ts, &captype, workbuf);
606 0 : const std::string *dp_str = dp.empty() ? NULL : &dp;
607 : // add the word and its index plus its capitalized form optionally
608 0 : if (add_word(ts, wcl, flags, al, dp_str, false) ||
609 0 : add_hidden_capitalized_word(ts, wcl, flags, al, dp_str, captype)) {
610 0 : delete dict;
611 0 : return 5;
612 : }
613 : }
614 :
615 0 : delete dict;
616 0 : return 0;
617 : }
618 :
619 : // the hash function is a simple load and rotate
620 : // algorithm borrowed
621 0 : int HashMgr::hash(const char* word) const {
622 0 : unsigned long hv = 0;
623 0 : for (int i = 0; i < 4 && *word != 0; i++)
624 0 : hv = (hv << 8) | (*word++);
625 0 : while (*word != 0) {
626 0 : ROTATE(hv, ROTATE_LEN);
627 0 : hv ^= (*word++);
628 : }
629 0 : return (unsigned long)hv % tablesize;
630 : }
631 :
632 0 : int HashMgr::decode_flags(unsigned short** result, const std::string& flags, FileMgr* af) const {
633 : int len;
634 0 : if (flags.empty()) {
635 0 : *result = NULL;
636 0 : return 0;
637 : }
638 0 : switch (flag_mode) {
639 : case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)
640 0 : len = flags.size();
641 0 : if (len % 2 == 1)
642 0 : HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n",
643 0 : af->getlinenum());
644 0 : len /= 2;
645 0 : *result = (unsigned short*)malloc(len * sizeof(unsigned short));
646 0 : if (!*result)
647 0 : return -1;
648 0 : for (int i = 0; i < len; i++) {
649 0 : (*result)[i] = ((unsigned short)((unsigned char)flags[i * 2]) << 8) +
650 0 : (unsigned char)flags[i * 2 + 1];
651 : }
652 0 : break;
653 : }
654 : case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521
655 : // 23 233)
656 0 : len = 1;
657 : unsigned short* dest;
658 0 : for (size_t i = 0; i < flags.size(); ++i) {
659 0 : if (flags[i] == ',')
660 0 : len++;
661 : }
662 0 : *result = (unsigned short*)malloc(len * sizeof(unsigned short));
663 0 : if (!*result)
664 0 : return -1;
665 0 : dest = *result;
666 0 : const char* src = flags.c_str();
667 0 : for (const char* p = src; *p; p++) {
668 0 : if (*p == ',') {
669 0 : int i = atoi(src);
670 0 : if (i >= DEFAULTFLAGS)
671 0 : HUNSPELL_WARNING(
672 : stderr, "error: line %d: flag id %d is too large (max: %d)\n",
673 0 : af->getlinenum(), i, DEFAULTFLAGS - 1);
674 0 : *dest = (unsigned short)i;
675 0 : if (*dest == 0)
676 0 : HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
677 0 : af->getlinenum());
678 0 : src = p + 1;
679 0 : dest++;
680 : }
681 : }
682 0 : int i = atoi(src);
683 0 : if (i >= DEFAULTFLAGS)
684 0 : HUNSPELL_WARNING(stderr,
685 : "error: line %d: flag id %d is too large (max: %d)\n",
686 0 : af->getlinenum(), i, DEFAULTFLAGS - 1);
687 0 : *dest = (unsigned short)i;
688 0 : if (*dest == 0)
689 0 : HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
690 0 : af->getlinenum());
691 0 : break;
692 : }
693 : case FLAG_UNI: { // UTF-8 characters
694 0 : std::vector<w_char> w;
695 0 : u8_u16(w, flags);
696 0 : len = w.size();
697 0 : *result = (unsigned short*)malloc(len * sizeof(unsigned short));
698 0 : if (!*result)
699 0 : return -1;
700 0 : memcpy(*result, &w[0], len * sizeof(short));
701 0 : break;
702 : }
703 : default: { // Ispell's one-character flags (erfg -> e r f g)
704 : unsigned short* dest;
705 0 : len = flags.size();
706 0 : *result = (unsigned short*)malloc(len * sizeof(unsigned short));
707 0 : if (!*result)
708 0 : return -1;
709 0 : dest = *result;
710 0 : for (size_t i = 0; i < flags.size(); ++i) {
711 0 : *dest = (unsigned char)flags[i];
712 0 : dest++;
713 : }
714 : }
715 : }
716 0 : return len;
717 : }
718 :
719 0 : bool HashMgr::decode_flags(std::vector<unsigned short>& result, const std::string& flags, FileMgr* af) const {
720 0 : if (flags.empty()) {
721 0 : return false;
722 : }
723 0 : switch (flag_mode) {
724 : case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)
725 0 : size_t len = flags.size();
726 0 : if (len % 2 == 1)
727 0 : HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n",
728 0 : af->getlinenum());
729 0 : len /= 2;
730 0 : result.reserve(result.size() + len);
731 0 : for (size_t i = 0; i < len; ++i) {
732 0 : result.push_back(((unsigned short)((unsigned char)flags[i * 2]) << 8) +
733 0 : (unsigned char)flags[i * 2 + 1]);
734 : }
735 0 : break;
736 : }
737 : case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521
738 : // 23 233)
739 0 : const char* src = flags.c_str();
740 0 : for (const char* p = src; *p; p++) {
741 0 : if (*p == ',') {
742 0 : int i = atoi(src);
743 0 : if (i >= DEFAULTFLAGS)
744 0 : HUNSPELL_WARNING(
745 : stderr, "error: line %d: flag id %d is too large (max: %d)\n",
746 0 : af->getlinenum(), i, DEFAULTFLAGS - 1);
747 0 : result.push_back((unsigned short)i);
748 0 : if (result.back() == 0)
749 0 : HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
750 0 : af->getlinenum());
751 0 : src = p + 1;
752 : }
753 : }
754 0 : int i = atoi(src);
755 0 : if (i >= DEFAULTFLAGS)
756 0 : HUNSPELL_WARNING(stderr,
757 : "error: line %d: flag id %d is too large (max: %d)\n",
758 0 : af->getlinenum(), i, DEFAULTFLAGS - 1);
759 0 : result.push_back((unsigned short)i);
760 0 : if (result.back() == 0)
761 0 : HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
762 0 : af->getlinenum());
763 0 : break;
764 : }
765 : case FLAG_UNI: { // UTF-8 characters
766 0 : std::vector<w_char> w;
767 0 : u8_u16(w, flags);
768 0 : size_t len = w.size();
769 0 : size_t origsize = result.size();
770 0 : result.resize(origsize + len);
771 0 : memcpy(&result[origsize], &w[0], len * sizeof(short));
772 0 : break;
773 : }
774 : default: { // Ispell's one-character flags (erfg -> e r f g)
775 0 : result.reserve(flags.size());
776 0 : for (size_t i = 0; i < flags.size(); ++i) {
777 0 : result.push_back((unsigned char)flags[i]);
778 : }
779 : }
780 : }
781 0 : return true;
782 : }
783 :
784 0 : unsigned short HashMgr::decode_flag(const char* f) const {
785 0 : unsigned short s = 0;
786 : int i;
787 0 : switch (flag_mode) {
788 : case FLAG_LONG:
789 0 : s = ((unsigned short)((unsigned char)f[0]) << 8) + (unsigned char)f[1];
790 0 : break;
791 : case FLAG_NUM:
792 0 : i = atoi(f);
793 0 : if (i >= DEFAULTFLAGS)
794 : HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n",
795 0 : i, DEFAULTFLAGS - 1);
796 0 : s = (unsigned short)i;
797 0 : break;
798 : case FLAG_UNI: {
799 0 : std::vector<w_char> w;
800 0 : u8_u16(w, f);
801 0 : if (!w.empty())
802 0 : memcpy(&s, &w[0], 1 * sizeof(short));
803 0 : break;
804 : }
805 : default:
806 0 : s = *(unsigned char*)f;
807 : }
808 0 : if (s == 0)
809 0 : HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
810 0 : return s;
811 : }
812 :
813 0 : char* HashMgr::encode_flag(unsigned short f) const {
814 0 : if (f == 0)
815 0 : return mystrdup("(NULL)");
816 0 : std::string ch;
817 0 : if (flag_mode == FLAG_LONG) {
818 0 : ch.push_back((unsigned char)(f >> 8));
819 0 : ch.push_back((unsigned char)(f - ((f >> 8) << 8)));
820 0 : } else if (flag_mode == FLAG_NUM) {
821 0 : std::ostringstream stream;
822 0 : stream << f;
823 0 : ch = stream.str();
824 0 : } else if (flag_mode == FLAG_UNI) {
825 0 : const w_char* w_c = (const w_char*)&f;
826 0 : std::vector<w_char> w(w_c, w_c + 1);
827 0 : u16_u8(ch, w);
828 : } else {
829 0 : ch.push_back((unsigned char)(f));
830 : }
831 0 : return mystrdup(ch.c_str());
832 : }
833 :
834 : // read in aff file and set flag mode
835 0 : int HashMgr::load_config(const char* affpath, const char* key) {
836 0 : int firstline = 1;
837 :
838 : // open the affix file
839 0 : FileMgr* afflst = new FileMgr(affpath, key);
840 0 : if (!afflst) {
841 : HUNSPELL_WARNING(
842 0 : stderr, "Error - could not open affix description file %s\n", affpath);
843 0 : return 1;
844 : }
845 :
846 : // read in each line ignoring any that do not
847 : // start with a known line type indicator
848 :
849 0 : std::string line;
850 0 : while (afflst->getline(line)) {
851 0 : mychomp(line);
852 :
853 : /* remove byte order mark */
854 0 : if (firstline) {
855 0 : firstline = 0;
856 0 : if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
857 0 : line.erase(0, 3);
858 : }
859 : }
860 :
861 : /* parse in the try string */
862 0 : if ((line.compare(0, 4, "FLAG", 4) == 0) && line.size() > 4 && isspace(line[4])) {
863 0 : if (flag_mode != FLAG_CHAR) {
864 0 : HUNSPELL_WARNING(stderr,
865 : "error: line %d: multiple definitions of the FLAG "
866 : "affix file parameter\n",
867 0 : afflst->getlinenum());
868 : }
869 0 : if (line.find("long") != std::string::npos)
870 0 : flag_mode = FLAG_LONG;
871 0 : if (line.find("num") != std::string::npos)
872 0 : flag_mode = FLAG_NUM;
873 0 : if (line.find("UTF-8") != std::string::npos)
874 0 : flag_mode = FLAG_UNI;
875 0 : if (flag_mode == FLAG_CHAR) {
876 0 : HUNSPELL_WARNING(
877 : stderr,
878 : "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n",
879 0 : afflst->getlinenum());
880 : }
881 : }
882 :
883 0 : if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) {
884 0 : std::string st;
885 0 : if (!parse_string(line, st, afflst->getlinenum())) {
886 0 : delete afflst;
887 0 : return 1;
888 : }
889 0 : forbiddenword = decode_flag(st.c_str());
890 : }
891 :
892 0 : if (line.compare(0, 3, "SET", 3) == 0) {
893 0 : if (!parse_string(line, enc, afflst->getlinenum())) {
894 0 : delete afflst;
895 0 : return 1;
896 : }
897 0 : if (enc == "UTF-8") {
898 0 : utf8 = 1;
899 : #ifndef OPENOFFICEORG
900 : #ifndef MOZILLA_CLIENT
901 : initialize_utf_tbl();
902 : #endif
903 : #endif
904 : } else
905 0 : csconv = get_current_cs(enc);
906 : }
907 :
908 0 : if (line.compare(0, 4, "LANG", 4) == 0) {
909 0 : if (!parse_string(line, lang, afflst->getlinenum())) {
910 0 : delete afflst;
911 0 : return 1;
912 : }
913 0 : langnum = get_lang_num(lang);
914 : }
915 :
916 : /* parse in the ignored characters (for example, Arabic optional diacritics
917 : * characters */
918 0 : if (line.compare(0, 6, "IGNORE", 6) == 0) {
919 0 : if (!parse_array(line, ignorechars, ignorechars_utf16,
920 : utf8, afflst->getlinenum())) {
921 0 : delete afflst;
922 0 : return 1;
923 : }
924 : }
925 :
926 0 : if ((line.compare(0, 2, "AF", 2) == 0) && line.size() > 2 && isspace(line[2])) {
927 0 : if (!parse_aliasf(line, afflst)) {
928 0 : delete afflst;
929 0 : return 1;
930 : }
931 : }
932 :
933 0 : if ((line.compare(0, 2, "AM", 2) == 0) && line.size() > 2 && isspace(line[2])) {
934 0 : if (!parse_aliasm(line, afflst)) {
935 0 : delete afflst;
936 0 : return 1;
937 : }
938 : }
939 :
940 0 : if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
941 0 : complexprefixes = 1;
942 :
943 0 : if (((line.compare(0, 3, "SFX", 3) == 0) ||
944 0 : (line.compare(0, 3, "PFX", 3) == 0)) && line.size() > 3 && isspace(line[3]))
945 0 : break;
946 : }
947 :
948 0 : if (csconv == NULL)
949 0 : csconv = get_current_cs(SPELL_ENCODING);
950 0 : delete afflst;
951 0 : return 0;
952 : }
953 :
954 : /* parse in the ALIAS table */
955 0 : bool HashMgr::parse_aliasf(const std::string& line, FileMgr* af) {
956 0 : if (numaliasf != 0) {
957 0 : HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
958 0 : af->getlinenum());
959 0 : return false;
960 : }
961 0 : int i = 0;
962 0 : int np = 0;
963 0 : std::string::const_iterator iter = line.begin();
964 0 : std::string::const_iterator start_piece = mystrsep(line, iter);
965 0 : while (start_piece != line.end()) {
966 0 : switch (i) {
967 : case 0: {
968 0 : np++;
969 0 : break;
970 : }
971 : case 1: {
972 0 : numaliasf = atoi(std::string(start_piece, iter).c_str());
973 0 : if (numaliasf < 1) {
974 0 : numaliasf = 0;
975 0 : aliasf = NULL;
976 0 : aliasflen = NULL;
977 0 : HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
978 0 : af->getlinenum());
979 0 : return false;
980 : }
981 0 : aliasf =
982 0 : (unsigned short**)malloc(numaliasf * sizeof(unsigned short*));
983 0 : aliasflen =
984 0 : (unsigned short*)malloc(numaliasf * sizeof(unsigned short));
985 0 : if (!aliasf || !aliasflen) {
986 0 : numaliasf = 0;
987 0 : if (aliasf)
988 0 : free(aliasf);
989 0 : if (aliasflen)
990 0 : free(aliasflen);
991 0 : aliasf = NULL;
992 0 : aliasflen = NULL;
993 0 : return false;
994 : }
995 0 : np++;
996 0 : break;
997 : }
998 : default:
999 0 : break;
1000 : }
1001 0 : ++i;
1002 0 : start_piece = mystrsep(line, iter);
1003 : }
1004 0 : if (np != 2) {
1005 0 : numaliasf = 0;
1006 0 : free(aliasf);
1007 0 : free(aliasflen);
1008 0 : aliasf = NULL;
1009 0 : aliasflen = NULL;
1010 0 : HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
1011 0 : af->getlinenum());
1012 0 : return false;
1013 : }
1014 :
1015 : /* now parse the numaliasf lines to read in the remainder of the table */
1016 0 : for (int j = 0; j < numaliasf; j++) {
1017 0 : std::string nl;
1018 0 : if (!af->getline(nl))
1019 0 : return false;
1020 0 : mychomp(nl);
1021 0 : i = 0;
1022 0 : aliasf[j] = NULL;
1023 0 : aliasflen[j] = 0;
1024 0 : iter = nl.begin();
1025 0 : start_piece = mystrsep(nl, iter);
1026 0 : while (start_piece != nl.end()) {
1027 0 : switch (i) {
1028 : case 0: {
1029 0 : if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) {
1030 0 : numaliasf = 0;
1031 0 : free(aliasf);
1032 0 : free(aliasflen);
1033 0 : aliasf = NULL;
1034 0 : aliasflen = NULL;
1035 0 : HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1036 0 : af->getlinenum());
1037 0 : return false;
1038 : }
1039 0 : break;
1040 : }
1041 : case 1: {
1042 0 : std::string piece(start_piece, iter);
1043 0 : aliasflen[j] =
1044 0 : (unsigned short)decode_flags(&(aliasf[j]), piece, af);
1045 0 : std::sort(aliasf[j], aliasf[j] + aliasflen[j]);
1046 0 : break;
1047 : }
1048 : default:
1049 0 : break;
1050 : }
1051 0 : ++i;
1052 0 : start_piece = mystrsep(nl, iter);
1053 : }
1054 0 : if (!aliasf[j]) {
1055 0 : free(aliasf);
1056 0 : free(aliasflen);
1057 0 : aliasf = NULL;
1058 0 : aliasflen = NULL;
1059 0 : numaliasf = 0;
1060 0 : HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1061 0 : af->getlinenum());
1062 0 : return false;
1063 : }
1064 : }
1065 0 : return true;
1066 : }
1067 :
1068 0 : int HashMgr::is_aliasf() const {
1069 0 : return (aliasf != NULL);
1070 : }
1071 :
1072 0 : int HashMgr::get_aliasf(int index, unsigned short** fvec, FileMgr* af) const {
1073 0 : if ((index > 0) && (index <= numaliasf)) {
1074 0 : *fvec = aliasf[index - 1];
1075 0 : return aliasflen[index - 1];
1076 : }
1077 0 : HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n",
1078 0 : af->getlinenum(), index);
1079 0 : *fvec = NULL;
1080 0 : return 0;
1081 : }
1082 :
1083 : /* parse morph alias definitions */
1084 0 : bool HashMgr::parse_aliasm(const std::string& line, FileMgr* af) {
1085 0 : if (numaliasm != 0) {
1086 0 : HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
1087 0 : af->getlinenum());
1088 0 : return false;
1089 : }
1090 0 : int i = 0;
1091 0 : int np = 0;
1092 0 : std::string::const_iterator iter = line.begin();
1093 0 : std::string::const_iterator start_piece = mystrsep(line, iter);
1094 0 : while (start_piece != line.end()) {
1095 0 : switch (i) {
1096 : case 0: {
1097 0 : np++;
1098 0 : break;
1099 : }
1100 : case 1: {
1101 0 : numaliasm = atoi(std::string(start_piece, iter).c_str());
1102 0 : if (numaliasm < 1) {
1103 0 : HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
1104 0 : af->getlinenum());
1105 0 : return false;
1106 : }
1107 0 : aliasm = (char**)malloc(numaliasm * sizeof(char*));
1108 0 : if (!aliasm) {
1109 0 : numaliasm = 0;
1110 0 : return false;
1111 : }
1112 0 : np++;
1113 0 : break;
1114 : }
1115 : default:
1116 0 : break;
1117 : }
1118 0 : ++i;
1119 0 : start_piece = mystrsep(line, iter);
1120 : }
1121 0 : if (np != 2) {
1122 0 : numaliasm = 0;
1123 0 : free(aliasm);
1124 0 : aliasm = NULL;
1125 0 : HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
1126 0 : af->getlinenum());
1127 0 : return false;
1128 : }
1129 :
1130 : /* now parse the numaliasm lines to read in the remainder of the table */
1131 0 : for (int j = 0; j < numaliasm; j++) {
1132 0 : std::string nl;
1133 0 : if (!af->getline(nl))
1134 0 : return false;
1135 0 : mychomp(nl);
1136 0 : aliasm[j] = NULL;
1137 0 : iter = nl.begin();
1138 0 : i = 0;
1139 0 : start_piece = mystrsep(nl, iter);
1140 0 : while (start_piece != nl.end()) {
1141 0 : switch (i) {
1142 : case 0: {
1143 0 : if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) {
1144 0 : HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1145 0 : af->getlinenum());
1146 0 : numaliasm = 0;
1147 0 : free(aliasm);
1148 0 : aliasm = NULL;
1149 0 : return false;
1150 : }
1151 0 : break;
1152 : }
1153 : case 1: {
1154 : // add the remaining of the line
1155 0 : std::string::const_iterator end = nl.end();
1156 0 : std::string chunk(start_piece, end);
1157 0 : if (complexprefixes) {
1158 0 : if (utf8)
1159 0 : reverseword_utf(chunk);
1160 : else
1161 0 : reverseword(chunk);
1162 : }
1163 0 : aliasm[j] = mystrdup(chunk.c_str());
1164 0 : break;
1165 : }
1166 : default:
1167 0 : break;
1168 : }
1169 0 : ++i;
1170 0 : start_piece = mystrsep(nl, iter);
1171 : }
1172 0 : if (!aliasm[j]) {
1173 0 : numaliasm = 0;
1174 0 : free(aliasm);
1175 0 : aliasm = NULL;
1176 0 : HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1177 0 : af->getlinenum());
1178 0 : return false;
1179 : }
1180 : }
1181 0 : return true;
1182 : }
1183 :
1184 0 : int HashMgr::is_aliasm() const {
1185 0 : return (aliasm != NULL);
1186 : }
1187 :
1188 0 : char* HashMgr::get_aliasm(int index) const {
1189 0 : if ((index > 0) && (index <= numaliasm))
1190 0 : return aliasm[index - 1];
1191 0 : HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
1192 0 : return NULL;
1193 : }
|