Line data Source code
1 : /* ***** BEGIN LICENSE BLOCK *****
2 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 : *
4 : * Copyright (C) 2002-2017 Németh László
5 : *
6 : * The contents of this file are subject to the Mozilla Public License Version
7 : * 1.1 (the "License"); you may not use this file except in compliance with
8 : * the License. You may obtain a copy of the License at
9 : * http://www.mozilla.org/MPL/
10 : *
11 : * Software distributed under the License is distributed on an "AS IS" basis,
12 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 : * for the specific language governing rights and limitations under the
14 : * License.
15 : *
16 : * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17 : *
18 : * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19 : * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20 : * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21 : * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22 : * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23 : *
24 : * Alternatively, the contents of this file may be used under the terms of
25 : * either the GNU General Public License Version 2 or later (the "GPL"), or
26 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 : * in which case the provisions of the GPL or the LGPL are applicable instead
28 : * of those above. If you wish to allow use of your version of this file only
29 : * under the terms of either the GPL or the LGPL, and not to allow others to
30 : * use your version of this file under the terms of the MPL, indicate your
31 : * decision by deleting the provisions above and replace them with the notice
32 : * and other provisions required by the GPL or the LGPL. If you do not delete
33 : * the provisions above, a recipient may use your version of this file under
34 : * the terms of any one of the MPL, the GPL or the LGPL.
35 : *
36 : * ***** END LICENSE BLOCK ***** */
37 : /*
38 : * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39 : * And Contributors. All rights reserved.
40 : *
41 : * Redistribution and use in source and binary forms, with or without
42 : * modification, are permitted provided that the following conditions
43 : * are met:
44 : *
45 : * 1. Redistributions of source code must retain the above copyright
46 : * notice, this list of conditions and the following disclaimer.
47 : *
48 : * 2. Redistributions in binary form must reproduce the above copyright
49 : * notice, this list of conditions and the following disclaimer in the
50 : * documentation and/or other materials provided with the distribution.
51 : *
52 : * 3. All modifications to the source code must be clearly marked as
53 : * such. Binary redistributions based on modified source code
54 : * must be clearly marked as modified versions in the documentation
55 : * and/or other materials provided with the distribution.
56 : *
57 : * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58 : * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59 : * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60 : * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
61 : * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62 : * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63 : * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64 : * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 : * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 : * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 : * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 : * SUCH DAMAGE.
69 : */
70 :
71 : #include <stdlib.h>
72 : #include <string.h>
73 : #include <stdio.h>
74 : #include <ctype.h>
75 :
76 : #include "affentry.hxx"
77 : #include "csutil.hxx"
78 :
79 0 : AffEntry::~AffEntry() {
80 0 : if (opts & aeLONGCOND)
81 0 : free(c.l.conds2);
82 0 : if (morphcode && !(opts & aeALIASM))
83 0 : free(morphcode);
84 0 : if (contclass && !(opts & aeALIASF))
85 0 : free(contclass);
86 0 : }
87 :
88 0 : PfxEntry::PfxEntry(AffixMgr* pmgr)
89 : // register affix manager
90 : : pmyMgr(pmgr),
91 : next(NULL),
92 : nexteq(NULL),
93 : nextne(NULL),
94 0 : flgnxt(NULL) {
95 0 : }
96 :
97 : // add prefix to this word assuming conditions hold
98 0 : std::string PfxEntry::add(const char* word, size_t len) {
99 0 : std::string result;
100 0 : if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
101 0 : (len >= numconds) && test_condition(word) &&
102 0 : (!strip.size() || (strncmp(word, strip.c_str(), strip.size()) == 0))) {
103 : /* we have a match so add prefix */
104 0 : result.assign(appnd);
105 0 : result.append(word + strip.size());
106 : }
107 0 : return result;
108 : }
109 :
110 0 : inline char* PfxEntry::nextchar(char* p) {
111 0 : if (p) {
112 0 : p++;
113 0 : if (opts & aeLONGCOND) {
114 : // jump to the 2nd part of the condition
115 0 : if (p == c.conds + MAXCONDLEN_1)
116 0 : return c.l.conds2;
117 : // end of the MAXCONDLEN length condition
118 0 : } else if (p == c.conds + MAXCONDLEN)
119 0 : return NULL;
120 0 : return *p ? p : NULL;
121 : }
122 0 : return NULL;
123 : }
124 :
125 0 : inline int PfxEntry::test_condition(const char* st) {
126 0 : const char* pos = NULL; // group with pos input position
127 0 : bool neg = false; // complementer
128 0 : bool ingroup = false; // character in the group
129 0 : if (numconds == 0)
130 0 : return 1;
131 0 : char* p = c.conds;
132 0 : while (1) {
133 0 : switch (*p) {
134 : case '\0':
135 0 : return 1;
136 : case '[': {
137 0 : neg = false;
138 0 : ingroup = false;
139 0 : p = nextchar(p);
140 0 : pos = st;
141 0 : break;
142 : }
143 : case '^': {
144 0 : p = nextchar(p);
145 0 : neg = true;
146 0 : break;
147 : }
148 : case ']': {
149 0 : if ((neg && ingroup) || (!neg && !ingroup))
150 0 : return 0;
151 0 : pos = NULL;
152 0 : p = nextchar(p);
153 : // skip the next character
154 0 : if (!ingroup && *st)
155 0 : for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
156 : ;
157 0 : if (*st == '\0' && p)
158 0 : return 0; // word <= condition
159 0 : break;
160 : }
161 : case '.':
162 0 : if (!pos) { // dots are not metacharacters in groups: [.]
163 0 : p = nextchar(p);
164 : // skip the next character
165 0 : for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
166 : ;
167 0 : if (*st == '\0' && p)
168 0 : return 0; // word <= condition
169 0 : break;
170 : }
171 : /* FALLTHROUGH */
172 : default: {
173 0 : if (*st == *p) {
174 0 : st++;
175 0 : p = nextchar(p);
176 0 : if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
177 0 : while (p && (*p & 0xc0) == 0x80) { // character
178 0 : if (*p != *st) {
179 0 : if (!pos)
180 0 : return 0;
181 0 : st = pos;
182 0 : break;
183 : }
184 0 : p = nextchar(p);
185 0 : st++;
186 : }
187 0 : if (pos && st != pos) {
188 0 : ingroup = true;
189 0 : while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
190 : }
191 : }
192 0 : } else if (pos) {
193 0 : ingroup = true;
194 0 : while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
195 : }
196 : }
197 0 : } else if (pos) { // group
198 0 : p = nextchar(p);
199 : } else
200 0 : return 0;
201 : }
202 : }
203 0 : if (!p)
204 0 : return 1;
205 : }
206 : }
207 :
208 : // check if this prefix entry matches
209 0 : struct hentry* PfxEntry::checkword(const char* word,
210 : int len,
211 : char in_compound,
212 : const FLAG needflag) {
213 : struct hentry* he; // hash entry of root word or NULL
214 :
215 : // on entry prefix is 0 length or already matches the beginning of the word.
216 : // So if the remaining root word has positive length
217 : // and if there are enough chars in root word and added back strip chars
218 : // to meet the number of characters conditions, then test it
219 :
220 0 : int tmpl = len - appnd.size(); // length of tmpword
221 :
222 0 : if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
223 : // generate new root word by removing prefix and adding
224 : // back any characters that would have been stripped
225 :
226 0 : std::string tmpword(strip);
227 0 : tmpword.append(word + appnd.size());
228 :
229 : // now make sure all of the conditions on characters
230 : // are met. Please see the appendix at the end of
231 : // this file for more info on exactly what is being
232 : // tested
233 :
234 : // if all conditions are met then check if resulting
235 : // root word in the dictionary
236 :
237 0 : if (test_condition(tmpword.c_str())) {
238 0 : tmpl += strip.size();
239 0 : if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
240 0 : do {
241 0 : if (TESTAFF(he->astr, aflag, he->alen) &&
242 : // forbid single prefixes with needaffix flag
243 0 : !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
244 : // needflag
245 0 : ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
246 0 : (contclass && TESTAFF(contclass, needflag, contclasslen))))
247 0 : return he;
248 0 : he = he->next_homonym; // check homonyms
249 0 : } while (he);
250 : }
251 :
252 : // prefix matched but no root word was found
253 : // if aeXPRODUCT is allowed, try again but now
254 : // ross checked combined with a suffix
255 :
256 : // if ((opts & aeXPRODUCT) && in_compound) {
257 0 : if ((opts & aeXPRODUCT)) {
258 0 : he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, aeXPRODUCT, this,
259 0 : FLAG_NULL, needflag, in_compound);
260 0 : if (he)
261 0 : return he;
262 : }
263 : }
264 : }
265 0 : return NULL;
266 : }
267 :
268 : // check if this prefix entry matches
269 0 : struct hentry* PfxEntry::check_twosfx(const char* word,
270 : int len,
271 : char in_compound,
272 : const FLAG needflag) {
273 : // on entry prefix is 0 length or already matches the beginning of the word.
274 : // So if the remaining root word has positive length
275 : // and if there are enough chars in root word and added back strip chars
276 : // to meet the number of characters conditions, then test it
277 :
278 0 : int tmpl = len - appnd.size(); // length of tmpword
279 :
280 0 : if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
281 0 : (tmpl + strip.size() >= numconds)) {
282 : // generate new root word by removing prefix and adding
283 : // back any characters that would have been stripped
284 :
285 0 : std::string tmpword(strip);
286 0 : tmpword.append(word + appnd.size());
287 :
288 : // now make sure all of the conditions on characters
289 : // are met. Please see the appendix at the end of
290 : // this file for more info on exactly what is being
291 : // tested
292 :
293 : // if all conditions are met then check if resulting
294 : // root word in the dictionary
295 :
296 0 : if (test_condition(tmpword.c_str())) {
297 0 : tmpl += strip.size();
298 :
299 : // prefix matched but no root word was found
300 : // if aeXPRODUCT is allowed, try again but now
301 : // cross checked combined with a suffix
302 :
303 0 : if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
304 : // hash entry of root word or NULL
305 0 : struct hentry* he = pmyMgr->suffix_check_twosfx(tmpword.c_str(), tmpl, aeXPRODUCT, this,
306 0 : needflag);
307 0 : if (he)
308 0 : return he;
309 : }
310 : }
311 : }
312 0 : return NULL;
313 : }
314 :
315 : // check if this prefix entry matches
316 0 : std::string PfxEntry::check_twosfx_morph(const char* word,
317 : int len,
318 : char in_compound,
319 : const FLAG needflag) {
320 0 : std::string result;
321 : // on entry prefix is 0 length or already matches the beginning of the word.
322 : // So if the remaining root word has positive length
323 : // and if there are enough chars in root word and added back strip chars
324 : // to meet the number of characters conditions, then test it
325 0 : int tmpl = len - appnd.size(); // length of tmpword
326 :
327 0 : if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
328 0 : (tmpl + strip.size() >= numconds)) {
329 : // generate new root word by removing prefix and adding
330 : // back any characters that would have been stripped
331 :
332 0 : std::string tmpword(strip);
333 0 : tmpword.append(word + appnd.size());
334 :
335 : // now make sure all of the conditions on characters
336 : // are met. Please see the appendix at the end of
337 : // this file for more info on exactly what is being
338 : // tested
339 :
340 : // if all conditions are met then check if resulting
341 : // root word in the dictionary
342 :
343 0 : if (test_condition(tmpword.c_str())) {
344 0 : tmpl += strip.size();
345 :
346 : // prefix matched but no root word was found
347 : // if aeXPRODUCT is allowed, try again but now
348 : // ross checked combined with a suffix
349 :
350 0 : if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
351 0 : result = pmyMgr->suffix_check_twosfx_morph(tmpword.c_str(), tmpl,
352 : aeXPRODUCT,
353 0 : this, needflag);
354 : }
355 : }
356 : }
357 0 : return result;
358 : }
359 :
360 : // check if this prefix entry matches
361 0 : std::string PfxEntry::check_morph(const char* word,
362 : int len,
363 : char in_compound,
364 : const FLAG needflag) {
365 0 : std::string result;
366 :
367 : // on entry prefix is 0 length or already matches the beginning of the word.
368 : // So if the remaining root word has positive length
369 : // and if there are enough chars in root word and added back strip chars
370 : // to meet the number of characters conditions, then test it
371 :
372 0 : int tmpl = len - appnd.size(); // length of tmpword
373 :
374 0 : if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
375 0 : (tmpl + strip.size() >= numconds)) {
376 : // generate new root word by removing prefix and adding
377 : // back any characters that would have been stripped
378 :
379 0 : std::string tmpword(strip);
380 0 : tmpword.append(word + appnd.size());
381 :
382 : // now make sure all of the conditions on characters
383 : // are met. Please see the appendix at the end of
384 : // this file for more info on exactly what is being
385 : // tested
386 :
387 : // if all conditions are met then check if resulting
388 : // root word in the dictionary
389 :
390 0 : if (test_condition(tmpword.c_str())) {
391 0 : tmpl += strip.size();
392 : struct hentry* he; // hash entry of root word or NULL
393 0 : if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
394 0 : do {
395 0 : if (TESTAFF(he->astr, aflag, he->alen) &&
396 : // forbid single prefixes with needaffix flag
397 0 : !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
398 : // needflag
399 0 : ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
400 0 : (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
401 0 : if (morphcode) {
402 0 : result.append(" ");
403 0 : result.append(morphcode);
404 : } else
405 0 : result.append(getKey());
406 0 : if (!HENTRY_FIND(he, MORPH_STEM)) {
407 0 : result.append(" ");
408 0 : result.append(MORPH_STEM);
409 0 : result.append(HENTRY_WORD(he));
410 : }
411 : // store the pointer of the hash entry
412 0 : if (HENTRY_DATA(he)) {
413 0 : result.append(" ");
414 0 : result.append(HENTRY_DATA2(he));
415 : } else {
416 : // return with debug information
417 0 : char* flag = pmyMgr->encode_flag(getFlag());
418 0 : result.append(" ");
419 0 : result.append(MORPH_FLAG);
420 0 : result.append(flag);
421 0 : free(flag);
422 : }
423 0 : result.append("\n");
424 : }
425 0 : he = he->next_homonym;
426 0 : } while (he);
427 : }
428 :
429 : // prefix matched but no root word was found
430 : // if aeXPRODUCT is allowed, try again but now
431 : // ross checked combined with a suffix
432 :
433 0 : if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
434 0 : std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, aeXPRODUCT, this,
435 0 : FLAG_NULL, needflag);
436 0 : if (!st.empty()) {
437 0 : result.append(st);
438 : }
439 : }
440 : }
441 : }
442 :
443 0 : return result;
444 : }
445 :
446 0 : SfxEntry::SfxEntry(AffixMgr* pmgr)
447 : : pmyMgr(pmgr) // register affix manager
448 : ,
449 : next(NULL),
450 : nexteq(NULL),
451 : nextne(NULL),
452 : flgnxt(NULL),
453 : l_morph(NULL),
454 : r_morph(NULL),
455 0 : eq_morph(NULL) {
456 0 : }
457 :
458 : // add suffix to this word assuming conditions hold
459 0 : std::string SfxEntry::add(const char* word, size_t len) {
460 0 : std::string result;
461 : /* make sure all conditions match */
462 0 : if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
463 0 : (len >= numconds) && test_condition(word + len, word) &&
464 0 : (!strip.size() ||
465 0 : (strcmp(word + len - strip.size(), strip.c_str()) == 0))) {
466 0 : result.assign(word);
467 : /* we have a match so add suffix */
468 0 : result.replace(len - strip.size(), std::string::npos, appnd);
469 : }
470 0 : return result;
471 : }
472 :
473 0 : inline char* SfxEntry::nextchar(char* p) {
474 0 : if (p) {
475 0 : p++;
476 0 : if (opts & aeLONGCOND) {
477 : // jump to the 2nd part of the condition
478 0 : if (p == c.l.conds1 + MAXCONDLEN_1)
479 0 : return c.l.conds2;
480 : // end of the MAXCONDLEN length condition
481 0 : } else if (p == c.conds + MAXCONDLEN)
482 0 : return NULL;
483 0 : return *p ? p : NULL;
484 : }
485 0 : return NULL;
486 : }
487 :
488 0 : inline int SfxEntry::test_condition(const char* st, const char* beg) {
489 0 : const char* pos = NULL; // group with pos input position
490 0 : bool neg = false; // complementer
491 0 : bool ingroup = false; // character in the group
492 0 : if (numconds == 0)
493 0 : return 1;
494 0 : char* p = c.conds;
495 0 : st--;
496 0 : int i = 1;
497 0 : while (1) {
498 0 : switch (*p) {
499 : case '\0':
500 0 : return 1;
501 : case '[':
502 0 : p = nextchar(p);
503 0 : pos = st;
504 0 : break;
505 : case '^':
506 0 : p = nextchar(p);
507 0 : neg = true;
508 0 : break;
509 : case ']':
510 0 : if (!neg && !ingroup)
511 0 : return 0;
512 0 : i++;
513 : // skip the next character
514 0 : if (!ingroup) {
515 0 : for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--)
516 : ;
517 0 : st--;
518 : }
519 0 : pos = NULL;
520 0 : neg = false;
521 0 : ingroup = false;
522 0 : p = nextchar(p);
523 0 : if (st < beg && p)
524 0 : return 0; // word <= condition
525 0 : break;
526 : case '.':
527 0 : if (!pos) {
528 : // dots are not metacharacters in groups: [.]
529 0 : p = nextchar(p);
530 : // skip the next character
531 0 : for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80;
532 : st--)
533 : ;
534 0 : if (st < beg) { // word <= condition
535 0 : if (p)
536 0 : return 0;
537 : else
538 0 : return 1;
539 : }
540 0 : if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
541 0 : st--;
542 0 : if (st < beg) { // word <= condition
543 0 : if (p)
544 0 : return 0;
545 : else
546 0 : return 1;
547 : }
548 : }
549 0 : break;
550 : }
551 : /* FALLTHROUGH */
552 : default: {
553 0 : if (*st == *p) {
554 0 : p = nextchar(p);
555 0 : if ((opts & aeUTF8) && (*st & 0x80)) {
556 0 : st--;
557 0 : while (p && (st >= beg)) {
558 0 : if (*p != *st) {
559 0 : if (!pos)
560 0 : return 0;
561 0 : st = pos;
562 0 : break;
563 : }
564 : // first byte of the UTF-8 multibyte character
565 0 : if ((*p & 0xc0) != 0x80)
566 0 : break;
567 0 : p = nextchar(p);
568 0 : st--;
569 : }
570 0 : if (pos && st != pos) {
571 0 : if (neg)
572 0 : return 0;
573 0 : else if (i == numconds)
574 0 : return 1;
575 0 : ingroup = true;
576 0 : while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
577 : }
578 0 : st--;
579 : }
580 0 : if (p && *p != ']')
581 0 : p = nextchar(p);
582 0 : } else if (pos) {
583 0 : if (neg)
584 0 : return 0;
585 0 : else if (i == numconds)
586 0 : return 1;
587 0 : ingroup = true;
588 0 : while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
589 : }
590 : // if (p && *p != ']') p = nextchar(p);
591 0 : st--;
592 : }
593 0 : if (!pos) {
594 0 : i++;
595 0 : st--;
596 : }
597 0 : if (st < beg && p && *p != ']')
598 0 : return 0; // word <= condition
599 0 : } else if (pos) { // group
600 0 : p = nextchar(p);
601 : } else
602 0 : return 0;
603 : }
604 : }
605 0 : if (!p)
606 0 : return 1;
607 : }
608 : }
609 :
610 : // see if this suffix is present in the word
611 0 : struct hentry* SfxEntry::checkword(const char* word,
612 : int len,
613 : int optflags,
614 : PfxEntry* ppfx,
615 : const FLAG cclass,
616 : const FLAG needflag,
617 : const FLAG badflag) {
618 : struct hentry* he; // hash entry pointer
619 0 : PfxEntry* ep = ppfx;
620 :
621 : // if this suffix is being cross checked with a prefix
622 : // but it does not support cross products skip it
623 :
624 0 : if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
625 0 : return NULL;
626 :
627 : // upon entry suffix is 0 length or already matches the end of the word.
628 : // So if the remaining root word has positive length
629 : // and if there are enough chars in root word and added back strip chars
630 : // to meet the number of characters conditions, then test it
631 :
632 0 : int tmpl = len - appnd.size(); // length of tmpword
633 : // the second condition is not enough for UTF-8 strings
634 : // it checked in test_condition()
635 :
636 0 : if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
637 0 : (tmpl + strip.size() >= numconds)) {
638 : // generate new root word by removing suffix and adding
639 : // back any characters that would have been stripped or
640 : // or null terminating the shorter string
641 :
642 0 : std::string tmpstring(word, tmpl);
643 0 : if (strip.size()) {
644 0 : tmpstring.append(strip);
645 : }
646 :
647 0 : const char* tmpword = tmpstring.c_str();
648 0 : const char* endword = tmpword + tmpstring.size();
649 :
650 : // now make sure all of the conditions on characters
651 : // are met. Please see the appendix at the end of
652 : // this file for more info on exactly what is being
653 : // tested
654 :
655 : // if all conditions are met then check if resulting
656 : // root word in the dictionary
657 :
658 0 : if (test_condition(endword, tmpword)) {
659 : #ifdef SZOSZABLYA_POSSIBLE_ROOTS
660 : fprintf(stdout, "%s %s %c\n", word, tmpword, aflag);
661 : #endif
662 0 : if ((he = pmyMgr->lookup(tmpword)) != NULL) {
663 0 : do {
664 : // check conditional suffix (enabled by prefix)
665 0 : if ((TESTAFF(he->astr, aflag, he->alen) ||
666 0 : (ep && ep->getCont() &&
667 0 : TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
668 0 : (((optflags & aeXPRODUCT) == 0) ||
669 0 : (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
670 : // enabled by prefix
671 0 : ((contclass) &&
672 0 : (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))) &&
673 : // handle cont. class
674 0 : ((!cclass) ||
675 0 : ((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
676 : // check only in compound homonyms (bad flags)
677 0 : (!badflag || !TESTAFF(he->astr, badflag, he->alen)) &&
678 : // handle required flag
679 0 : ((!needflag) ||
680 0 : (TESTAFF(he->astr, needflag, he->alen) ||
681 0 : ((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
682 0 : return he;
683 0 : he = he->next_homonym; // check homonyms
684 0 : } while (he);
685 : }
686 : }
687 : }
688 0 : return NULL;
689 : }
690 :
691 : // see if two-level suffix is present in the word
692 0 : struct hentry* SfxEntry::check_twosfx(const char* word,
693 : int len,
694 : int optflags,
695 : PfxEntry* ppfx,
696 : const FLAG needflag) {
697 0 : PfxEntry* ep = ppfx;
698 :
699 : // if this suffix is being cross checked with a prefix
700 : // but it does not support cross products skip it
701 :
702 0 : if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
703 0 : return NULL;
704 :
705 : // upon entry suffix is 0 length or already matches the end of the word.
706 : // So if the remaining root word has positive length
707 : // and if there are enough chars in root word and added back strip chars
708 : // to meet the number of characters conditions, then test it
709 :
710 0 : int tmpl = len - appnd.size(); // length of tmpword
711 :
712 0 : if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
713 0 : (tmpl + strip.size() >= numconds)) {
714 : // generate new root word by removing suffix and adding
715 : // back any characters that would have been stripped or
716 : // or null terminating the shorter string
717 :
718 0 : std::string tmpword(word);
719 0 : tmpword.resize(tmpl);
720 0 : tmpword.append(strip);
721 0 : tmpl += strip.size();
722 :
723 0 : const char* beg = tmpword.c_str();
724 0 : const char* end = beg + tmpl;
725 :
726 : // now make sure all of the conditions on characters
727 : // are met. Please see the appendix at the end of
728 : // this file for more info on exactly what is being
729 : // tested
730 :
731 : // if all conditions are met then recall suffix_check
732 :
733 0 : if (test_condition(end, beg)) {
734 : struct hentry* he; // hash entry pointer
735 0 : if (ppfx) {
736 : // handle conditional suffix
737 0 : if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
738 0 : he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL,
739 0 : (FLAG)aflag, needflag, IN_CPD_NOT);
740 : else
741 0 : he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, optflags, ppfx,
742 0 : (FLAG)aflag, needflag, IN_CPD_NOT);
743 : } else {
744 0 : he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL,
745 0 : (FLAG)aflag, needflag, IN_CPD_NOT);
746 : }
747 0 : if (he)
748 0 : return he;
749 : }
750 : }
751 0 : return NULL;
752 : }
753 :
754 : // see if two-level suffix is present in the word
755 0 : std::string SfxEntry::check_twosfx_morph(const char* word,
756 : int len,
757 : int optflags,
758 : PfxEntry* ppfx,
759 : const FLAG needflag) {
760 0 : PfxEntry* ep = ppfx;
761 :
762 0 : std::string result;
763 :
764 : // if this suffix is being cross checked with a prefix
765 : // but it does not support cross products skip it
766 :
767 0 : if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
768 0 : return result;
769 :
770 : // upon entry suffix is 0 length or already matches the end of the word.
771 : // So if the remaining root word has positive length
772 : // and if there are enough chars in root word and added back strip chars
773 : // to meet the number of characters conditions, then test it
774 :
775 0 : int tmpl = len - appnd.size(); // length of tmpword
776 :
777 0 : if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
778 0 : (tmpl + strip.size() >= numconds)) {
779 : // generate new root word by removing suffix and adding
780 : // back any characters that would have been stripped or
781 : // or null terminating the shorter string
782 :
783 0 : std::string tmpword(word);
784 0 : tmpword.resize(tmpl);
785 0 : tmpword.append(strip);
786 0 : tmpl += strip.size();
787 :
788 0 : const char* beg = tmpword.c_str();
789 0 : const char* end = beg + tmpl;
790 :
791 : // now make sure all of the conditions on characters
792 : // are met. Please see the appendix at the end of
793 : // this file for more info on exactly what is being
794 : // tested
795 :
796 : // if all conditions are met then recall suffix_check
797 :
798 0 : if (test_condition(end, beg)) {
799 0 : if (ppfx) {
800 : // handle conditional suffix
801 0 : if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
802 0 : std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag,
803 0 : needflag);
804 0 : if (!st.empty()) {
805 0 : if (ppfx->getMorph()) {
806 0 : result.append(ppfx->getMorph());
807 0 : result.append(" ");
808 : }
809 0 : result.append(st);
810 0 : mychomp(result);
811 : }
812 : } else {
813 0 : std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, optflags, ppfx, aflag,
814 0 : needflag);
815 0 : if (!st.empty()) {
816 0 : result.append(st);
817 0 : mychomp(result);
818 : }
819 : }
820 : } else {
821 0 : std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, needflag);
822 0 : if (!st.empty()) {
823 0 : result.append(st);
824 0 : mychomp(result);
825 : }
826 : }
827 : }
828 : }
829 0 : return result;
830 : }
831 :
832 : // get next homonym with same affix
833 0 : struct hentry* SfxEntry::get_next_homonym(struct hentry* he,
834 : int optflags,
835 : PfxEntry* ppfx,
836 : const FLAG cclass,
837 : const FLAG needflag) {
838 0 : PfxEntry* ep = ppfx;
839 0 : FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
840 :
841 0 : while (he->next_homonym) {
842 0 : he = he->next_homonym;
843 0 : if ((TESTAFF(he->astr, aflag, he->alen) ||
844 0 : (ep && ep->getCont() &&
845 0 : TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
846 0 : ((optflags & aeXPRODUCT) == 0 || TESTAFF(he->astr, eFlag, he->alen) ||
847 : // handle conditional suffix
848 0 : ((contclass) && TESTAFF(contclass, eFlag, contclasslen))) &&
849 : // handle cont. class
850 0 : ((!cclass) ||
851 0 : ((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
852 : // handle required flag
853 0 : ((!needflag) ||
854 0 : (TESTAFF(he->astr, needflag, he->alen) ||
855 0 : ((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
856 0 : return he;
857 : }
858 0 : return NULL;
859 : }
860 :
861 0 : void SfxEntry::initReverseWord() {
862 0 : rappnd = appnd;
863 0 : reverseword(rappnd);
864 0 : }
865 :
866 : #if 0
867 :
868 : Appendix: Understanding Affix Code
869 :
870 :
871 : An affix is either a prefix or a suffix attached to root words to make
872 : other words.
873 :
874 : Basically a Prefix or a Suffix is set of AffEntry objects
875 : which store information about the prefix or suffix along
876 : with supporting routines to check if a word has a particular
877 : prefix or suffix or a combination.
878 :
879 : The structure affentry is defined as follows:
880 :
881 : struct affentry
882 : {
883 : unsigned short aflag; // ID used to represent the affix
884 : std::string strip; // string to strip before adding affix
885 : std::string appnd; // the affix string to add
886 : char numconds; // the number of conditions that must be met
887 : char opts; // flag: aeXPRODUCT- combine both prefix and suffix
888 : char conds[SETSIZE]; // array which encodes the conditions to be met
889 : };
890 :
891 :
892 : Here is a suffix borrowed from the en_US.aff file. This file
893 : is whitespace delimited.
894 :
895 : SFX D Y 4
896 : SFX D 0 e d
897 : SFX D y ied [^aeiou]y
898 : SFX D 0 ed [^ey]
899 : SFX D 0 ed [aeiou]y
900 :
901 : This information can be interpreted as follows:
902 :
903 : The first line has 4 fields
904 :
905 : Field
906 : -----
907 : 1 SFX - indicates this is a suffix
908 : 2 D - is the name of the character flag which represents this suffix
909 : 3 Y - indicates it can be combined with prefixes (cross product)
910 : 4 4 - indicates that sequence of 4 affentry structures are needed to
911 : properly store the affix information
912 :
913 : The remaining lines describe the unique information for the 4 SfxEntry
914 : objects that make up this affix. Each line can be interpreted
915 : as follows: (note fields 1 and 2 are as a check against line 1 info)
916 :
917 : Field
918 : -----
919 : 1 SFX - indicates this is a suffix
920 : 2 D - is the name of the character flag for this affix
921 : 3 y - the string of chars to strip off before adding affix
922 : (a 0 here indicates the NULL string)
923 : 4 ied - the string of affix characters to add
924 : 5 [^aeiou]y - the conditions which must be met before the affix
925 : can be applied
926 :
927 : Field 5 is interesting. Since this is a suffix, field 5 tells us that
928 : there are 2 conditions that must be met. The first condition is that
929 : the next to the last character in the word must *NOT* be any of the
930 : following "a", "e", "i", "o" or "u". The second condition is that
931 : the last character of the word must end in "y".
932 :
933 : So how can we encode this information concisely and be able to
934 : test for both conditions in a fast manner? The answer is found
935 : by studying the wonderful ispell code of Geoff Kuenning, et al.
936 : (now available under a normal BSD license).
937 :
938 : If we set up a conds array of 256 bytes indexed (0 to 255) and access it
939 : using a character (cast to an unsigned char) of a string, we have 8 bits
940 : of information we can store about that character. Specifically we
941 : could use each bit to say if that character is allowed in any of the
942 : last (or first for prefixes) 8 characters of the word.
943 :
944 : Basically, each character at one end of the word (up to the number
945 : of conditions) is used to index into the conds array and the resulting
946 : value found there says whether that character is valid for a
947 : specific character position in the word.
948 :
949 : For prefixes, it does this by setting bit 0 if that char is valid
950 : in the first position, bit 1 if valid in the second position, and so on.
951 :
952 : If a bit is not set, then that char is not valid for that position in the
953 : word.
954 :
955 : If working with suffixes bit 0 is used for the character closest
956 : to the front, bit 1 for the next character towards the end, ...,
957 : with bit numconds-1 representing the last char at the end of the string.
958 :
959 : Note: since entries in the conds[] are 8 bits, only 8 conditions
960 : (read that only 8 character positions) can be examined at one
961 : end of a word (the beginning for prefixes and the end for suffixes).
962 :
963 : So to make this clearer, let's encode the conds array values for the
964 : first two affentries for the suffix D described earlier.
965 :
966 :
967 : For the first affentry:
968 : numconds = 1 (only examine the last character)
969 :
970 : conds['e'] = (1 << 0) (the word must end in an E)
971 : all others are all 0
972 :
973 : For the second affentry:
974 : numconds = 2 (only examine the last two characters)
975 :
976 : conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
977 : where X is all characters *but* a, e, i, o, or u
978 :
979 :
980 : conds['y'] = (1 << 1) (the last char must be a y)
981 : all other bits for all other entries in the conds array are zero
982 :
983 : #endif
|