Line data Source code
1 : /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
2 : * licenses follows.
3 : */
4 :
5 : /* LibHnj - a library for high quality hyphenation and justification
6 : * Copyright (C) 1998 Raph Levien,
7 : * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
8 : * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
9 : * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
10 : *
11 : * This library is free software; you can redistribute it and/or
12 : * modify it under the terms of the GNU Library General Public
13 : * License as published by the Free Software Foundation; either
14 : * version 2 of the License, or (at your option) any later version.
15 : *
16 : * This library is distributed in the hope that it will be useful,
17 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 : * Library General Public License for more details.
20 : *
21 : * You should have received a copy of the GNU Library General Public
22 : * License along with this library; if not, write to the
23 : * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 : * Boston, MA 02111-1307 USA.
25 : */
26 :
27 : /*
28 : * The contents of this file are subject to the Mozilla Public License
29 : * Version 1.0 (the "MPL"); you may not use this file except in
30 : * compliance with the MPL. You may obtain a copy of the MPL at
31 : * http://www.mozilla.org/MPL/
32 : *
33 : * Software distributed under the MPL is distributed on an "AS IS" basis,
34 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
35 : * for the specific language governing rights and limitations under the
36 : * MPL.
37 : *
38 : */
39 : #include <stdlib.h> /* for NULL, malloc */
40 : #include <stdio.h> /* for fprintf */
41 : #include <string.h> /* for strdup */
42 :
43 : #ifdef UNX
44 : #include <unistd.h> /* for exit */
45 : #endif
46 :
47 : #define noVERBOSE
48 :
49 : /* calculate hyphenmin values with long ligature length (2 or 3 characters
50 : * instead of 1 or 2) for comparison with hyphenation without ligatures */
51 : #define noLONG_LIGATURE
52 :
53 : #ifdef LONG_LIGATURE
54 : #define LIG_xx 1
55 : #define LIG_xxx 2
56 : #else
57 : #define LIG_xx 0
58 : #define LIG_xxx 1
59 : #endif
60 :
61 : #include "hnjalloc.h"
62 : #include "hyphen.h"
63 :
64 : static char *
65 0 : hnj_strdup (const char *s)
66 : {
67 : char *newstr;
68 : int l;
69 :
70 0 : l = strlen (s);
71 0 : newstr = (char *) hnj_malloc (l + 1);
72 0 : memcpy (newstr, s, l);
73 0 : newstr[l] = 0;
74 0 : return newstr;
75 : }
76 :
77 : /* remove cross-platform text line end characters */
78 0 : void hnj_strchomp(char * s)
79 : {
80 0 : int k = strlen(s);
81 0 : if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
82 0 : if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
83 0 : }
84 :
85 : /* a little bit of a hash table implementation. This simply maps strings
86 : to state numbers */
87 :
88 : typedef struct _HashTab HashTab;
89 : typedef struct _HashEntry HashEntry;
90 :
91 : /* A cheap, but effective, hack. */
92 : #define HASH_SIZE 31627
93 :
94 : struct _HashTab {
95 : HashEntry *entries[HASH_SIZE];
96 : };
97 :
98 : struct _HashEntry {
99 : HashEntry *next;
100 : char *key;
101 : int val;
102 : };
103 :
104 : /* a char* hash function from ASU - adapted from Gtk+ */
105 : static unsigned int
106 0 : hnj_string_hash (const char *s)
107 : {
108 : const char *p;
109 0 : unsigned int h=0, g;
110 0 : for(p = s; *p != '\0'; p += 1) {
111 0 : h = ( h << 4 ) + *p;
112 0 : if ( ( g = h & 0xf0000000 ) ) {
113 0 : h = h ^ (g >> 24);
114 0 : h = h ^ g;
115 : }
116 : }
117 0 : return h /* % M */;
118 : }
119 :
120 : static HashTab *
121 0 : hnj_hash_new (void)
122 : {
123 : HashTab *hashtab;
124 : int i;
125 :
126 0 : hashtab = (HashTab *) hnj_malloc (sizeof(HashTab));
127 0 : for (i = 0; i < HASH_SIZE; i++)
128 0 : hashtab->entries[i] = NULL;
129 :
130 0 : return hashtab;
131 : }
132 :
133 : static void
134 0 : hnj_hash_free (HashTab *hashtab)
135 : {
136 : int i;
137 : HashEntry *e, *next;
138 :
139 0 : for (i = 0; i < HASH_SIZE; i++)
140 0 : for (e = hashtab->entries[i]; e; e = next)
141 : {
142 0 : next = e->next;
143 0 : hnj_free (e->key);
144 0 : hnj_free (e);
145 : }
146 :
147 0 : hnj_free (hashtab);
148 0 : }
149 :
150 : /* assumes that key is not already present! */
151 : static void
152 0 : hnj_hash_insert (HashTab *hashtab, const char *key, int val)
153 : {
154 : int i;
155 : HashEntry *e;
156 :
157 0 : i = hnj_string_hash (key) % HASH_SIZE;
158 0 : e = (HashEntry *) hnj_malloc (sizeof(HashEntry));
159 0 : e->next = hashtab->entries[i];
160 0 : e->key = hnj_strdup (key);
161 0 : e->val = val;
162 0 : hashtab->entries[i] = e;
163 0 : }
164 :
165 : /* return val if found, otherwise -1 */
166 : static int
167 0 : hnj_hash_lookup (HashTab *hashtab, const char *key)
168 : {
169 : int i;
170 : HashEntry *e;
171 0 : i = hnj_string_hash (key) % HASH_SIZE;
172 0 : for (e = hashtab->entries[i]; e; e = e->next)
173 0 : if (!strcmp (key, e->key))
174 0 : return e->val;
175 0 : return -1;
176 : }
177 :
178 : /* Get the state number, allocating a new state if necessary. */
179 : static int
180 0 : hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
181 : {
182 : int state_num;
183 :
184 0 : state_num = hnj_hash_lookup (hashtab, string);
185 :
186 0 : if (state_num >= 0)
187 0 : return state_num;
188 :
189 0 : hnj_hash_insert (hashtab, string, dict->num_states);
190 : /* predicate is true if dict->num_states is a power of two */
191 0 : if (!(dict->num_states & (dict->num_states - 1)))
192 : {
193 0 : dict->states = (HyphenState *) hnj_realloc (dict->states,
194 : (dict->num_states << 1) *
195 : sizeof(HyphenState));
196 : }
197 0 : dict->states[dict->num_states].match = NULL;
198 0 : dict->states[dict->num_states].repl = NULL;
199 0 : dict->states[dict->num_states].fallback_state = -1;
200 0 : dict->states[dict->num_states].num_trans = 0;
201 0 : dict->states[dict->num_states].trans = NULL;
202 0 : return dict->num_states++;
203 : }
204 :
205 : /* add a transition from state1 to state2 through ch - assumes that the
206 : transition does not already exist */
207 : static void
208 0 : hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
209 : {
210 : int num_trans;
211 :
212 0 : num_trans = dict->states[state1].num_trans;
213 0 : if (num_trans == 0)
214 : {
215 0 : dict->states[state1].trans = (HyphenTrans *) hnj_malloc (sizeof(HyphenTrans));
216 : }
217 0 : else if (!(num_trans & (num_trans - 1)))
218 : {
219 0 : dict->states[state1].trans = (HyphenTrans *) hnj_realloc (dict->states[state1].trans,
220 : (num_trans << 1) *
221 : sizeof(HyphenTrans));
222 : }
223 0 : dict->states[state1].trans[num_trans].ch = ch;
224 0 : dict->states[state1].trans[num_trans].new_state = state2;
225 0 : dict->states[state1].num_trans++;
226 0 : }
227 :
228 : #ifdef VERBOSE
229 : HashTab *global[1];
230 :
231 : static char *
232 : get_state_str (int state, int level)
233 : {
234 : int i;
235 : HashEntry *e;
236 :
237 : for (i = 0; i < HASH_SIZE; i++)
238 : for (e = global[level]->entries[i]; e; e = e->next)
239 : if (e->val == state)
240 : return e->key;
241 : return NULL;
242 : }
243 : #endif
244 :
245 0 : void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {
246 : int i, j;
247 : char word[MAX_CHARS];
248 : char pattern[MAX_CHARS];
249 : char * repl;
250 : signed char replindex;
251 : signed char replcut;
252 0 : int state_num = 0;
253 : int last_state;
254 : char ch;
255 : int found;
256 :
257 0 : if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
258 0 : dict->lhmin = atoi(buf + 13);
259 0 : return;
260 0 : } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
261 0 : dict->rhmin = atoi(buf + 14);
262 0 : return;
263 0 : } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
264 0 : dict->clhmin = atoi(buf + 21);
265 0 : return;
266 0 : } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
267 0 : dict->crhmin = atoi(buf + 22);
268 0 : return;
269 0 : } else if (strncmp(buf, "NOHYPHEN", 8) == 0) {
270 0 : char * space = buf + 8;
271 0 : while (*space != '\0' && (*space == ' ' || *space == '\t')) space++;
272 0 : if (*buf != '\0') dict->nohyphen = hnj_strdup(space);
273 0 : if (dict->nohyphen) {
274 0 : char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1;
275 0 : *nhe = 0;
276 0 : for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) {
277 0 : if (*nhe == ',') {
278 0 : dict->nohyphenl++;
279 0 : *nhe = 0;
280 : }
281 : }
282 : }
283 0 : return;
284 : }
285 0 : j = 0;
286 0 : pattern[j] = '0';
287 0 : repl = strchr(buf, '/');
288 0 : replindex = 0;
289 0 : replcut = 0;
290 0 : if (repl) {
291 0 : char * index = strchr(repl + 1, ',');
292 0 : *repl = '\0';
293 0 : if (index) {
294 0 : char * index2 = strchr(index + 1, ',');
295 0 : *index = '\0';
296 0 : if (index2) {
297 0 : *index2 = '\0';
298 0 : replindex = (signed char) atoi(index + 1) - 1;
299 0 : replcut = (signed char) atoi(index2 + 1);
300 : }
301 : } else {
302 0 : hnj_strchomp(repl + 1);
303 0 : replindex = 0;
304 0 : replcut = (signed char) strlen(buf);
305 : }
306 0 : repl = hnj_strdup(repl + 1);
307 : }
308 0 : for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
309 : {
310 0 : if (buf[i] >= '0' && buf[i] <= '9')
311 0 : pattern[j] = buf[i];
312 : else
313 : {
314 0 : word[j] = buf[i];
315 0 : pattern[++j] = '0';
316 : }
317 : }
318 0 : word[j] = '\0';
319 0 : pattern[j + 1] = '\0';
320 :
321 0 : i = 0;
322 0 : if (!repl) {
323 : /* Optimize away leading zeroes */
324 0 : for (; pattern[i] == '0'; i++);
325 : } else {
326 0 : if (*word == '.') i++;
327 : /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
328 0 : if (dict->utf8) {
329 0 : int pu = -1; /* unicode character position */
330 0 : int ps = -1; /* unicode start position (original replindex) */
331 0 : int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
332 0 : for (; pc < (strlen(word) + 1); pc++) {
333 : /* beginning of an UTF-8 character (not '10' start bits) */
334 0 : if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
335 0 : if ((ps < 0) && (replindex == pu)) {
336 0 : ps = replindex;
337 0 : replindex = (signed char) pc;
338 : }
339 0 : if ((ps >= 0) && ((pu - ps) == replcut)) {
340 0 : replcut = (signed char) (pc - replindex);
341 0 : break;
342 : }
343 : }
344 0 : if (*word == '.') replindex--;
345 : }
346 : }
347 :
348 : #ifdef VERBOSE
349 : printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl);
350 : #endif
351 0 : found = hnj_hash_lookup (hashtab, word);
352 0 : state_num = hnj_get_state (dict, hashtab, word);
353 0 : dict->states[state_num].match = hnj_strdup (pattern + i);
354 0 : dict->states[state_num].repl = repl;
355 0 : dict->states[state_num].replindex = replindex;
356 0 : if (!replcut) {
357 0 : dict->states[state_num].replcut = (signed char) strlen(word);
358 : } else {
359 0 : dict->states[state_num].replcut = replcut;
360 : }
361 :
362 : /* now, put in the prefix transitions */
363 0 : for (; found < 0 && j > 0; --j)
364 : {
365 0 : last_state = state_num;
366 0 : ch = word[j - 1];
367 0 : word[j - 1] = '\0';
368 0 : found = hnj_hash_lookup (hashtab, word);
369 0 : state_num = hnj_get_state (dict, hashtab, word);
370 0 : hnj_add_trans (dict, state_num, last_state, ch);
371 : }
372 : }
373 :
374 : HyphenDict *
375 0 : hnj_hyphen_load (const char *fn)
376 : {
377 : HyphenDict *result;
378 : FILE *f;
379 0 : f = fopen (fn, "r");
380 0 : if (f == NULL)
381 0 : return NULL;
382 :
383 0 : result = hnj_hyphen_load_file(f);
384 :
385 0 : fclose(f);
386 0 : return result;
387 : }
388 :
389 : HyphenDict *
390 0 : hnj_hyphen_load_file (FILE *f)
391 : {
392 : HyphenDict *dict[2];
393 : HashTab *hashtab;
394 : char buf[MAX_CHARS];
395 0 : int nextlevel = 0;
396 : int i, j, k;
397 : HashEntry *e;
398 0 : int state_num = 0;
399 : // loading one or two dictionaries (separated by NEXTLEVEL keyword)
400 0 : for (k = 0; k < 2; k++) {
401 0 : hashtab = hnj_hash_new ();
402 : #ifdef VERBOSE
403 : global[k] = hashtab;
404 : #endif
405 0 : hnj_hash_insert (hashtab, "", 0);
406 0 : dict[k] = (HyphenDict *) hnj_malloc (sizeof(HyphenDict));
407 0 : dict[k]->num_states = 1;
408 0 : dict[k]->states = (HyphenState *) hnj_malloc (sizeof(HyphenState));
409 0 : dict[k]->states[0].match = NULL;
410 0 : dict[k]->states[0].repl = NULL;
411 0 : dict[k]->states[0].fallback_state = -1;
412 0 : dict[k]->states[0].num_trans = 0;
413 0 : dict[k]->states[0].trans = NULL;
414 0 : dict[k]->nextlevel = NULL;
415 0 : dict[k]->lhmin = 0;
416 0 : dict[k]->rhmin = 0;
417 0 : dict[k]->clhmin = 0;
418 0 : dict[k]->crhmin = 0;
419 0 : dict[k]->nohyphen = NULL;
420 0 : dict[k]->nohyphenl = 0;
421 :
422 : /* read in character set info */
423 0 : if (k == 0) {
424 0 : for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
425 0 : if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) {
426 0 : for (i=0;i<MAX_NAME;i++)
427 0 : if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
428 0 : dict[k]->cset[i] = 0;
429 : } else {
430 0 : dict[k]->cset[0] = 0;
431 : }
432 0 : dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
433 : } else {
434 0 : strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1);
435 0 : dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0';
436 0 : dict[k]->utf8 = dict[0]->utf8;
437 : }
438 :
439 0 : if (k == 0 || nextlevel) {
440 0 : while (fgets (buf, sizeof(buf), f) != NULL) {
441 0 : if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
442 0 : nextlevel = 1;
443 0 : break;
444 0 : } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);
445 : }
446 0 : } else if (k == 1) {
447 : /* default first level: hyphen and ASCII apostrophe */
448 0 : if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab);
449 0 : else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab);
450 0 : strncpy(buf, "1-1\n", MAX_CHARS-1); // buf rewritten by hnj_hyphen_load here
451 0 : buf[MAX_CHARS-1] = '\0';
452 0 : hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
453 0 : hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
454 0 : if (dict[0]->utf8) {
455 0 : hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
456 0 : hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
457 : }
458 : }
459 :
460 : /* Could do unioning of matches here (instead of the preprocessor script).
461 : If we did, the pseudocode would look something like this:
462 :
463 : foreach state in the hash table
464 : foreach i = [1..length(state) - 1]
465 : state to check is substr (state, i)
466 : look it up
467 : if found, and if there is a match, union the match in.
468 :
469 : It's also possible to avoid the quadratic blowup by doing the
470 : search in order of increasing state string sizes - then you
471 : can break the loop after finding the first match.
472 :
473 : This step should be optional in any case - if there is a
474 : preprocessed rule table, it's always faster to use that.
475 :
476 : */
477 :
478 : /* put in the fallback states */
479 0 : for (i = 0; i < HASH_SIZE; i++)
480 0 : for (e = hashtab->entries[i]; e; e = e->next)
481 : {
482 0 : if (*(e->key)) for (j = 1; 1; j++)
483 : {
484 0 : state_num = hnj_hash_lookup (hashtab, e->key + j);
485 0 : if (state_num >= 0)
486 0 : break;
487 : }
488 : /* KBH: FIXME state 0 fallback_state should always be -1? */
489 0 : if (e->val)
490 0 : dict[k]->states[e->val].fallback_state = state_num;
491 : }
492 : #ifdef VERBOSE
493 : for (i = 0; i < HASH_SIZE; i++)
494 : for (e = hashtab->entries[i]; e; e = e->next)
495 : {
496 : printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
497 : dict[k]->states[e->val].fallback_state);
498 : for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
499 : printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
500 : dict[k]->states[e->val].trans[j].new_state);
501 : }
502 : #endif
503 :
504 : #ifndef VERBOSE
505 0 : hnj_hash_free (hashtab);
506 : #endif
507 0 : state_num = 0;
508 : }
509 0 : if (nextlevel) dict[0]->nextlevel = dict[1];
510 : else {
511 0 : dict[1] -> nextlevel = dict[0];
512 0 : dict[1]->lhmin = dict[0]->lhmin;
513 0 : dict[1]->rhmin = dict[0]->rhmin;
514 0 : dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
515 0 : dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
516 : #ifdef VERBOSE
517 : HashTab *r = global[0];
518 : global[0] = global[1];
519 : global[1] = r;
520 : #endif
521 0 : return dict[1];
522 : }
523 0 : return dict[0];
524 : }
525 :
526 0 : void hnj_hyphen_free (HyphenDict *dict)
527 : {
528 : int state_num;
529 : HyphenState *hstate;
530 :
531 0 : for (state_num = 0; state_num < dict->num_states; state_num++)
532 : {
533 0 : hstate = &dict->states[state_num];
534 0 : if (hstate->match)
535 0 : hnj_free (hstate->match);
536 0 : if (hstate->repl)
537 0 : hnj_free (hstate->repl);
538 0 : if (hstate->trans)
539 0 : hnj_free (hstate->trans);
540 : }
541 0 : if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
542 :
543 0 : if (dict->nohyphen) hnj_free(dict->nohyphen);
544 :
545 0 : hnj_free (dict->states);
546 :
547 0 : hnj_free (dict);
548 0 : }
549 :
550 : #define MAX_WORD 256
551 :
552 0 : int hnj_hyphen_hyphenate (HyphenDict *dict,
553 : const char *word, int word_size,
554 : char *hyphens)
555 : {
556 : char *prep_word;
557 : int i, j, k;
558 : int state;
559 : char ch;
560 : HyphenState *hstate;
561 : char *match;
562 : int offset;
563 :
564 0 : prep_word = (char*) hnj_malloc (word_size + 3);
565 :
566 0 : j = 0;
567 0 : prep_word[j++] = '.';
568 :
569 0 : for (i = 0; i < word_size; i++) {
570 0 : if (word[i] <= '9' && word[i] >= '0') {
571 0 : prep_word[j++] = '.';
572 : } else {
573 0 : prep_word[j++] = word[i];
574 : }
575 : }
576 :
577 0 : prep_word[j++] = '.';
578 0 : prep_word[j] = '\0';
579 :
580 0 : for (i = 0; i < word_size + 5; i++)
581 0 : hyphens[i] = '0';
582 :
583 : #ifdef VERBOSE
584 : printf ("prep_word = %s\n", prep_word);
585 : #endif
586 :
587 : /* now, run the finite state machine */
588 0 : state = 0;
589 0 : for (i = 0; i < j; i++)
590 : {
591 0 : ch = prep_word[i];
592 : for (;;)
593 : {
594 :
595 0 : if (state == -1) {
596 : /* return 1; */
597 : /* KBH: FIXME shouldn't this be as follows? */
598 0 : state = 0;
599 0 : goto try_next_letter;
600 : }
601 :
602 : #ifdef VERBOSE
603 : char *state_str;
604 : state_str = get_state_str (state, 0);
605 :
606 : for (k = 0; k < i - strlen (state_str); k++)
607 : putchar (' ');
608 : printf ("%s", state_str);
609 : #endif
610 :
611 0 : hstate = &dict->states[state];
612 0 : for (k = 0; k < hstate->num_trans; k++)
613 0 : if (hstate->trans[k].ch == ch)
614 : {
615 0 : state = hstate->trans[k].new_state;
616 0 : goto found_state;
617 : }
618 0 : state = hstate->fallback_state;
619 : #ifdef VERBOSE
620 : printf (" falling back, fallback_state %d\n", state);
621 : #endif
622 : }
623 : found_state:
624 : #ifdef VERBOSE
625 : printf ("found state %d\n",state);
626 : #endif
627 : /* Additional optimization is possible here - especially,
628 : elimination of trailing zeroes from the match. Leading zeroes
629 : have already been optimized. */
630 0 : match = dict->states[state].match;
631 : /* replacing rules not handled by hyphen_hyphenate() */
632 0 : if (match && !dict->states[state].repl)
633 : {
634 0 : offset = i + 1 - strlen (match);
635 : #ifdef VERBOSE
636 : for (k = 0; k < offset; k++)
637 : putchar (' ');
638 : printf ("%s\n", match);
639 : #endif
640 : /* This is a linear search because I tried a binary search and
641 : found it to be just a teeny bit slower. */
642 0 : for (k = 0; match[k]; k++)
643 0 : if (hyphens[offset + k] < match[k])
644 0 : hyphens[offset + k] = match[k];
645 : }
646 :
647 : /* KBH: we need this to make sure we keep looking in a word */
648 : /* for patterns even if the current character is not known in state 0 */
649 : /* since patterns for hyphenation may occur anywhere in the word */
650 : try_next_letter: ;
651 :
652 : }
653 : #ifdef VERBOSE
654 : for (i = 0; i < j; i++)
655 : putchar (hyphens[i]);
656 : putchar ('\n');
657 : #endif
658 :
659 0 : for (i = 0; i < j - 4; i++)
660 : #if 0
661 : if (hyphens[i + 1] & 1)
662 : hyphens[i] = '-';
663 : #else
664 0 : hyphens[i] = hyphens[i + 1];
665 : #endif
666 0 : hyphens[0] = '0';
667 0 : for (; i < word_size; i++)
668 0 : hyphens[i] = '0';
669 0 : hyphens[word_size] = '\0';
670 :
671 0 : hnj_free (prep_word);
672 :
673 0 : return 0;
674 : }
675 :
676 : /* Unicode ligature length */
677 0 : int hnj_ligature(unsigned char c) {
678 0 : switch (c) {
679 : case 0x80: /* ff */
680 : case 0x81: /* fi */
681 0 : case 0x82: return LIG_xx; /* fl */
682 : case 0x83: /* ffi */
683 0 : case 0x84: return LIG_xxx; /* ffl */
684 : case 0x85: /* long st */
685 0 : case 0x86: return LIG_xx; /* st */
686 : }
687 0 : return 0;
688 : }
689 :
690 : /* character length of the first n byte of the input word */
691 0 : int hnj_hyphen_strnlen(const char * word, int n, int utf8)
692 : {
693 0 : int i = 0;
694 0 : int j = 0;
695 0 : while (j < n && word[j] != '\0') {
696 0 : i++;
697 : // Unicode ligature support
698 0 : if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
699 0 : i += hnj_ligature(word[j + 2]);
700 : }
701 0 : for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
702 : }
703 0 : return i;
704 : }
705 :
706 0 : int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
707 : char *** rep, int ** pos, int ** cut, int lhmin)
708 : {
709 0 : int i = 1, j;
710 :
711 : // Unicode ligature support
712 0 : if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) {
713 0 : i += hnj_ligature(word[2]);
714 : }
715 :
716 : // ignore numbers
717 0 : for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--;
718 :
719 0 : for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
720 : // check length of the non-standard part
721 0 : if (*rep && *pos && *cut && (*rep)[j]) {
722 0 : char * rh = strchr((*rep)[j], '=');
723 0 : if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
724 0 : hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
725 0 : free((*rep)[j]);
726 0 : (*rep)[j] = NULL;
727 0 : hyphens[j] = '0';
728 : }
729 : } else {
730 0 : hyphens[j] = '0';
731 : }
732 0 : j++;
733 :
734 : // Unicode ligature support
735 0 : if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
736 0 : i += hnj_ligature(word[j + 2]);
737 : }
738 0 : } while (utf8 && (word[j] & 0xc0) == 0x80);
739 0 : return 0;
740 : }
741 :
742 0 : int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
743 : char *** rep, int ** pos, int ** cut, int rhmin)
744 : {
745 0 : int i = 0;
746 : int j;
747 :
748 : // ignore numbers
749 0 : for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--;
750 :
751 0 : for (j = word_size - 1; i < rhmin && j > 0; j--) {
752 : // check length of the non-standard part
753 0 : if (*rep && *pos && *cut && (*rep)[j]) {
754 0 : char * rh = strchr((*rep)[j], '=');
755 0 : if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
756 0 : hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
757 0 : free((*rep)[j]);
758 0 : (*rep)[j] = NULL;
759 0 : hyphens[j] = '0';
760 : }
761 : } else {
762 0 : hyphens[j] = '0';
763 : }
764 0 : if (!utf8 || (word[j] & 0xc0) == 0xc0 || (word[j] & 0x80) != 0x80) i++;
765 : }
766 0 : return 0;
767 : }
768 :
769 : // recursive function for compound level hyphenation
770 0 : int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
771 : char * hyphens, char *** rep, int ** pos, int ** cut,
772 : int clhmin, int crhmin, int lend, int rend)
773 : {
774 : char *prep_word;
775 : int i, j, k;
776 : int state;
777 : char ch;
778 : HyphenState *hstate;
779 : char *match;
780 : char *repl;
781 : signed char replindex;
782 : signed char replcut;
783 : int offset;
784 : int * matchlen;
785 : int * matchindex;
786 : char ** matchrepl;
787 0 : int isrepl = 0;
788 : int nHyphCount;
789 :
790 0 : size_t prep_word_size = word_size + 3;
791 0 : prep_word = (char*) hnj_malloc (prep_word_size);
792 0 : matchlen = (int*) hnj_malloc ((word_size + 3) * sizeof(int));
793 0 : matchindex = (int*) hnj_malloc ((word_size + 3) * sizeof(int));
794 0 : matchrepl = (char**) hnj_malloc ((word_size + 3) * sizeof(char *));
795 :
796 0 : j = 0;
797 0 : prep_word[j++] = '.';
798 :
799 0 : for (i = 0; i < word_size; i++) {
800 0 : if (word[i] <= '9' && word[i] >= '0') {
801 0 : prep_word[j++] = '.';
802 : } else {
803 0 : prep_word[j++] = word[i];
804 : }
805 : }
806 :
807 :
808 :
809 0 : prep_word[j++] = '.';
810 0 : prep_word[j] = '\0';
811 :
812 0 : for (i = 0; i < j; i++)
813 0 : hyphens[i] = '0';
814 :
815 : #ifdef VERBOSE
816 : printf ("prep_word = %s\n", prep_word);
817 : #endif
818 :
819 : /* now, run the finite state machine */
820 0 : state = 0;
821 0 : for (i = 0; i < j; i++)
822 : {
823 0 : ch = prep_word[i];
824 : for (;;)
825 : {
826 :
827 0 : if (state == -1) {
828 : /* return 1; */
829 : /* KBH: FIXME shouldn't this be as follows? */
830 0 : state = 0;
831 0 : goto try_next_letter;
832 : }
833 :
834 : #ifdef VERBOSE
835 : char *state_str;
836 : state_str = get_state_str (state, 1);
837 :
838 : for (k = 0; k < i - strlen (state_str); k++)
839 : putchar (' ');
840 : printf ("%s", state_str);
841 : #endif
842 :
843 0 : hstate = &dict->states[state];
844 0 : for (k = 0; k < hstate->num_trans; k++)
845 0 : if (hstate->trans[k].ch == ch)
846 : {
847 0 : state = hstate->trans[k].new_state;
848 0 : goto found_state;
849 : }
850 0 : state = hstate->fallback_state;
851 : #ifdef VERBOSE
852 : printf (" falling back, fallback_state %d\n", state);
853 : #endif
854 : }
855 : found_state:
856 : #ifdef VERBOSE
857 : printf ("found state %d\n",state);
858 : #endif
859 : /* Additional optimization is possible here - especially,
860 : elimination of trailing zeroes from the match. Leading zeroes
861 : have already been optimized. */
862 0 : match = dict->states[state].match;
863 0 : repl = dict->states[state].repl;
864 0 : replindex = dict->states[state].replindex;
865 0 : replcut = dict->states[state].replcut;
866 : /* replacing rules not handled by hyphen_hyphenate() */
867 0 : if (match)
868 : {
869 0 : offset = i + 1 - strlen (match);
870 : #ifdef VERBOSE
871 : for (k = 0; k < offset; k++)
872 : putchar (' ');
873 : printf ("%s (%s)\n", match, repl);
874 : #endif
875 0 : if (repl) {
876 0 : if (!isrepl) for(; isrepl < word_size; isrepl++) {
877 0 : matchrepl[isrepl] = NULL;
878 0 : matchindex[isrepl] = -1;
879 : }
880 0 : matchlen[offset + replindex] = replcut;
881 : }
882 : /* This is a linear search because I tried a binary search and
883 : found it to be just a teeny bit slower. */
884 0 : for (k = 0; match[k]; k++) {
885 0 : if ((hyphens[offset + k] < match[k])) {
886 0 : hyphens[offset + k] = match[k];
887 0 : if (match[k]&1) {
888 0 : matchrepl[offset + k] = repl;
889 0 : if (repl && (k >= replindex) && (k <= replindex + replcut)) {
890 0 : matchindex[offset + replindex] = offset + k;
891 : }
892 : }
893 : }
894 : }
895 :
896 : }
897 :
898 : /* KBH: we need this to make sure we keep looking in a word */
899 : /* for patterns even if the current character is not known in state 0 */
900 : /* since patterns for hyphenation may occur anywhere in the word */
901 : try_next_letter: ;
902 :
903 : }
904 : #ifdef VERBOSE
905 : for (i = 0; i < j; i++)
906 : putchar (hyphens[i]);
907 : putchar ('\n');
908 : #endif
909 :
910 0 : for (i = 0; i < j - 3; i++)
911 : #if 0
912 : if (hyphens[i + 1] & 1)
913 : hyphens[i] = '-';
914 : #else
915 0 : hyphens[i] = hyphens[i + 1];
916 : #endif
917 0 : for (; i < word_size; i++)
918 0 : hyphens[i] = '0';
919 0 : hyphens[word_size] = '\0';
920 :
921 : /* now create a new char string showing hyphenation positions */
922 : /* count the hyphens and allocate space for the new hyphenated string */
923 0 : nHyphCount = 0;
924 0 : for (i = 0; i < word_size; i++)
925 0 : if (hyphens[i]&1)
926 0 : nHyphCount++;
927 0 : j = 0;
928 0 : for (i = 0; i < word_size; i++) {
929 0 : if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
930 0 : if (rep && pos && cut) {
931 0 : if (!*rep)
932 0 : *rep = (char **) calloc(word_size, sizeof(char *));
933 0 : if (!*pos)
934 0 : *pos = (int *) calloc(word_size, sizeof(int));
935 0 : if (!*cut) {
936 0 : *cut = (int *) calloc(word_size, sizeof(int));
937 : }
938 0 : (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
939 0 : (*pos)[matchindex[i] - 1] = matchindex[i] - i;
940 0 : (*cut)[matchindex[i] - 1] = matchlen[i];
941 : }
942 0 : j += strlen(matchrepl[matchindex[i]]);
943 0 : i += matchlen[i] - 1;
944 : }
945 : }
946 :
947 0 : hnj_free (matchrepl);
948 0 : hnj_free (matchlen);
949 0 : hnj_free (matchindex);
950 :
951 : // recursive hyphenation of the first (compound) level segments
952 0 : if (dict->nextlevel) {
953 : char ** rep2;
954 : int * pos2;
955 : int * cut2;
956 : char * hyphens2;
957 0 : int begin = 0;
958 :
959 0 : rep2 = (char**) hnj_malloc (word_size * sizeof(char *));
960 0 : pos2 = (int*) hnj_malloc (word_size * sizeof(int));
961 0 : cut2 = (int*) hnj_malloc (word_size * sizeof(int));
962 0 : hyphens2 = (char*) hnj_malloc (word_size + 3);
963 0 : for (i = 0; i < word_size; i++) rep2[i] = NULL;
964 0 : for (i = 0; i < word_size; i++) if
965 0 : (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
966 0 : if (i - begin > 1) {
967 0 : int hyph = 0;
968 0 : prep_word[i + 2] = '\0';
969 : /* non-standard hyphenation at compound boundary (Schiffahrt) */
970 0 : if (rep && *rep && *pos && *cut && (*rep)[i]) {
971 0 : char * l = strchr((*rep)[i], '=');
972 0 : size_t offset = 2 + i - (*pos)[i];
973 0 : strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1);
974 0 : prep_word[prep_word_size - 1] = '\0';
975 0 : if (l) {
976 0 : hyph = (l - (*rep)[i]) - (*pos)[i];
977 0 : prep_word[2 + i + hyph] = '\0';
978 : }
979 : }
980 0 : hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
981 : hyphens2, &rep2, &pos2, &cut2, clhmin,
982 0 : crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
983 0 : for (j = 0; j < i - begin - 1; j++) {
984 0 : hyphens[begin + j] = hyphens2[j];
985 0 : if (rep2[j] && rep && pos && cut) {
986 0 : if (!*rep && !*pos && !*cut) {
987 : int k;
988 0 : *rep = (char **) malloc(sizeof(char *) * word_size);
989 0 : *pos = (int *) malloc(sizeof(int) * word_size);
990 0 : *cut = (int *) malloc(sizeof(int) * word_size);
991 0 : for (k = 0; k < word_size; k++) {
992 0 : (*rep)[k] = NULL;
993 0 : (*pos)[k] = 0;
994 0 : (*cut)[k] = 0;
995 : }
996 : }
997 0 : (*rep)[begin + j] = rep2[j];
998 0 : (*pos)[begin + j] = pos2[j];
999 0 : (*cut)[begin + j] = cut2[j];
1000 : }
1001 : }
1002 0 : prep_word[i + 2] = word[i + 1];
1003 0 : if (*rep && *pos && *cut && (*rep)[i]) {
1004 0 : size_t offset = 1;
1005 0 : strncpy(prep_word + offset, word, prep_word_size - offset - 1);
1006 0 : prep_word[prep_word_size - 1] = '\0';
1007 : }
1008 : }
1009 0 : begin = i + 1;
1010 0 : for (j = 0; j < word_size; j++) rep2[j] = NULL;
1011 : }
1012 :
1013 : // non-compound
1014 0 : if (begin == 0) {
1015 0 : hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
1016 : hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
1017 0 : if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1018 : rep, pos, cut, clhmin);
1019 0 : if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1020 : rep, pos, cut, crhmin);
1021 : }
1022 :
1023 0 : free(rep2);
1024 0 : free(cut2);
1025 0 : free(pos2);
1026 0 : free(hyphens2);
1027 : }
1028 :
1029 0 : hnj_free (prep_word);
1030 0 : return 0;
1031 : }
1032 :
1033 : /* UTF-8 normalization of hyphen and non-standard positions */
1034 0 : int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
1035 : char *** rep, int ** pos, int ** cut)
1036 : {
1037 : int i, j, k;
1038 0 : if ((((unsigned char) word[0]) >> 6) == 2) {
1039 0 : fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
1040 0 : return 1;
1041 : }
1042 :
1043 : /* calculate UTF-8 character positions */
1044 0 : for (i = 0, j = -1; i < word_size; i++) {
1045 : /* beginning of an UTF-8 character (not '10' start bits) */
1046 0 : if ((((unsigned char) word[i]) >> 6) != 2) j++;
1047 0 : hyphens[j] = hyphens[i];
1048 0 : if (rep && pos && cut && *rep && *pos && *cut) {
1049 0 : int l = (*pos)[i];
1050 0 : (*pos)[j] = 0;
1051 0 : for (k = 0; k < l; k++) {
1052 0 : if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
1053 : }
1054 0 : k = i - l + 1;
1055 0 : l = k + (*cut)[i];
1056 0 : (*cut)[j] = 0;
1057 0 : for (; k < l; k++) {
1058 0 : if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
1059 : }
1060 0 : (*rep)[j] = (*rep)[i];
1061 0 : if (j < i) {
1062 0 : (*rep)[i] = NULL;
1063 0 : (*pos)[i] = 0;
1064 0 : (*cut)[i] = 0;
1065 : }
1066 : }
1067 : }
1068 0 : hyphens[j + 1] = '\0';
1069 : #ifdef VERBOSE
1070 : printf ("nums: %s\n", hyphens);
1071 : #endif
1072 0 : return 0;
1073 : }
1074 :
1075 : /* get the word with all possible hyphenations (output: hyphword) */
1076 0 : void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
1077 : char * hyphword, char *** rep, int ** pos, int ** cut)
1078 : {
1079 0 : int hyphenslen = l + 5;
1080 :
1081 : int i, j;
1082 0 : for (i = 0, j = 0; i < l; i++, j++) {
1083 0 : if (hyphens[i]&1) {
1084 0 : hyphword[j] = word[i];
1085 0 : if (*rep && *pos && *cut && (*rep)[i]) {
1086 0 : size_t offset = j - (*pos)[i] + 1;
1087 0 : strncpy(hyphword + offset, (*rep)[i], hyphenslen - offset - 1);
1088 0 : hyphword[hyphenslen-1] = '\0';
1089 0 : j += strlen((*rep)[i]) - (*pos)[i];
1090 0 : i += (*cut)[i] - (*pos)[i];
1091 0 : } else hyphword[++j] = '=';
1092 0 : } else hyphword[j] = word[i];
1093 : }
1094 0 : hyphword[j] = '\0';
1095 0 : }
1096 :
1097 :
1098 : /* main api function with default hyphenmin parameters */
1099 0 : int hnj_hyphen_hyphenate2 (HyphenDict *dict,
1100 : const char *word, int word_size, char * hyphens,
1101 : char *hyphword, char *** rep, int ** pos, int ** cut)
1102 : {
1103 0 : hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1104 0 : dict->clhmin, dict->crhmin, 1, 1);
1105 0 : hnj_hyphen_lhmin(dict->utf8, word, word_size,
1106 0 : hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
1107 0 : hnj_hyphen_rhmin(dict->utf8, word, word_size,
1108 0 : hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
1109 :
1110 : /* nohyphen */
1111 0 : if (dict->nohyphen) {
1112 0 : char * nh = dict->nohyphen;
1113 : int nhi;
1114 0 : for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
1115 0 : char * nhy = (char *) strstr(word, nh);
1116 0 : while (nhy) {
1117 0 : hyphens[nhy - word + strlen(nh) - 1] = '0';
1118 0 : if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = '0';
1119 0 : nhy = (char *) strstr(nhy + 1, nh);
1120 : }
1121 0 : nh = nh + strlen(nh) + 1;
1122 : }
1123 : }
1124 :
1125 0 : if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1126 0 : if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1127 : #ifdef VERBOSE
1128 : printf ("nums: %s\n", hyphens);
1129 : #endif
1130 0 : return 0;
1131 : }
1132 :
1133 : /* previous main api function with hyphenmin parameters */
1134 0 : int hnj_hyphen_hyphenate3 (HyphenDict *dict,
1135 : const char *word, int word_size, char * hyphens,
1136 : char *hyphword, char *** rep, int ** pos, int ** cut,
1137 : int lhmin, int rhmin, int clhmin, int crhmin)
1138 : {
1139 0 : lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin;
1140 0 : rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin;
1141 0 : clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin;
1142 0 : crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin;
1143 0 : hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1144 : clhmin, crhmin, 1, 1);
1145 0 : hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1146 : rep, pos, cut, (lhmin > 0 ? lhmin : 2));
1147 0 : hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1148 : rep, pos, cut, (rhmin > 0 ? rhmin : 2));
1149 0 : if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1150 :
1151 : /* nohyphen */
1152 0 : if (dict->nohyphen) {
1153 0 : char * nh = dict->nohyphen;
1154 : int nhi;
1155 0 : for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
1156 0 : char * nhy = (char *) strstr(word, nh);
1157 0 : while (nhy) {
1158 0 : hyphens[nhy - word + strlen(nh) - 1] = 0;
1159 0 : if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = 0;
1160 0 : nhy = (char *) strstr(nhy + 1, nh);
1161 : }
1162 0 : nh = nh + strlen(nh) + 1;
1163 : }
1164 : }
1165 :
1166 0 : if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1167 0 : return 0;
1168 : }
|