OLD | NEW |
| (Empty) |
1 /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both | |
2 * licenses follows. | |
3 */ | |
4 | |
5 /* LibHnj - a library for high quality hyphenation and justification | |
6 * Copyright (C) 1998 Raph Levien, | |
7 * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), | |
8 * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) | |
9 * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo) | |
10 * | |
11 * This library is free software; you can redistribute it and/or | |
12 * modify it under the terms of the GNU Library General Public | |
13 * License as published by the Free Software Foundation; either | |
14 * version 2 of the License, or (at your option) any later version. | |
15 * | |
16 * This library is distributed in the hope that it will be useful, | |
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
19 * Library General Public License for more details. | |
20 * | |
21 * You should have received a copy of the GNU Library General Public | |
22 * License along with this library; if not, write to the | |
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
24 * Boston, MA 02111-1307 USA. | |
25 */ | |
26 | |
27 /* | |
28 * The contents of this file are subject to the Mozilla Public License | |
29 * Version 1.0 (the "MPL"); you may not use this file except in | |
30 * compliance with the MPL. You may obtain a copy of the MPL at | |
31 * http://www.mozilla.org/MPL/ | |
32 * | |
33 * Software distributed under the MPL is distributed on an "AS IS" basis, | |
34 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL | |
35 * for the specific language governing rights and limitations under the | |
36 * MPL. | |
37 * | |
38 */ | |
39 #include <stdlib.h> /* for NULL, malloc */ | |
40 #include <stdio.h> /* for fprintf */ | |
41 #include <string.h> /* for strdup */ | |
42 | |
43 #ifdef UNX | |
44 #include <unistd.h> /* for exit */ | |
45 #endif | |
46 | |
47 #define noVERBOSE | |
48 | |
49 /* calculate hyphenmin values with long ligature length (2 or 3 characters | |
50 * instead of 1 or 2) for comparison with hyphenation without ligatures */ | |
51 #define noLONG_LIGATURE | |
52 | |
53 #ifdef LONG_LIGATURE | |
54 #define LIG_xx 1 | |
55 #define LIG_xxx 2 | |
56 #else | |
57 #define LIG_xx 0 | |
58 #define LIG_xxx 1 | |
59 #endif | |
60 | |
61 #include "hnjalloc.h" | |
62 #include "hyphen.h" | |
63 | |
64 static char * | |
65 hnj_strdup (const char *s) | |
66 { | |
67 char *new; | |
68 int l; | |
69 | |
70 l = strlen (s); | |
71 new = hnj_malloc (l + 1); | |
72 memcpy (new, s, l); | |
73 new[l] = 0; | |
74 return new; | |
75 } | |
76 | |
77 /* remove cross-platform text line end characters */ | |
78 void hnj_strchomp(char * s) | |
79 { | |
80 int k = strlen(s); | |
81 if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0'; | |
82 if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; | |
83 } | |
84 | |
85 /* a little bit of a hash table implementation. This simply maps strings | |
86 to state numbers */ | |
87 | |
88 typedef struct _HashTab HashTab; | |
89 typedef struct _HashEntry HashEntry; | |
90 | |
91 /* A cheap, but effective, hack. */ | |
92 #define HASH_SIZE 31627 | |
93 | |
94 struct _HashTab { | |
95 HashEntry *entries[HASH_SIZE]; | |
96 }; | |
97 | |
98 struct _HashEntry { | |
99 HashEntry *next; | |
100 char *key; | |
101 int val; | |
102 }; | |
103 | |
104 /* a char* hash function from ASU - adapted from Gtk+ */ | |
105 static unsigned int | |
106 hnj_string_hash (const char *s) | |
107 { | |
108 const char *p; | |
109 unsigned int h=0, g; | |
110 for(p = s; *p != '\0'; p += 1) { | |
111 h = ( h << 4 ) + *p; | |
112 if ( ( g = h & 0xf0000000 ) ) { | |
113 h = h ^ (g >> 24); | |
114 h = h ^ g; | |
115 } | |
116 } | |
117 return h /* % M */; | |
118 } | |
119 | |
120 static HashTab * | |
121 hnj_hash_new (void) | |
122 { | |
123 HashTab *hashtab; | |
124 int i; | |
125 | |
126 hashtab = hnj_malloc (sizeof(HashTab)); | |
127 for (i = 0; i < HASH_SIZE; i++) | |
128 hashtab->entries[i] = NULL; | |
129 | |
130 return hashtab; | |
131 } | |
132 | |
133 static void | |
134 hnj_hash_free (HashTab *hashtab) | |
135 { | |
136 int i; | |
137 HashEntry *e, *next; | |
138 | |
139 for (i = 0; i < HASH_SIZE; i++) | |
140 for (e = hashtab->entries[i]; e; e = next) | |
141 { | |
142 next = e->next; | |
143 hnj_free (e->key); | |
144 hnj_free (e); | |
145 } | |
146 | |
147 hnj_free (hashtab); | |
148 } | |
149 | |
150 /* assumes that key is not already present! */ | |
151 static void | |
152 hnj_hash_insert (HashTab *hashtab, const char *key, int val) | |
153 { | |
154 int i; | |
155 HashEntry *e; | |
156 | |
157 i = hnj_string_hash (key) % HASH_SIZE; | |
158 e = hnj_malloc (sizeof(HashEntry)); | |
159 e->next = hashtab->entries[i]; | |
160 e->key = hnj_strdup (key); | |
161 e->val = val; | |
162 hashtab->entries[i] = e; | |
163 } | |
164 | |
165 /* return val if found, otherwise -1 */ | |
166 static int | |
167 hnj_hash_lookup (HashTab *hashtab, const char *key) | |
168 { | |
169 int i; | |
170 HashEntry *e; | |
171 i = hnj_string_hash (key) % HASH_SIZE; | |
172 for (e = hashtab->entries[i]; e; e = e->next) | |
173 if (!strcmp (key, e->key)) | |
174 return e->val; | |
175 return -1; | |
176 } | |
177 | |
178 /* Get the state number, allocating a new state if necessary. */ | |
179 static int | |
180 hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string) | |
181 { | |
182 int state_num; | |
183 | |
184 state_num = hnj_hash_lookup (hashtab, string); | |
185 | |
186 if (state_num >= 0) | |
187 return state_num; | |
188 | |
189 hnj_hash_insert (hashtab, string, dict->num_states); | |
190 /* predicate is true if dict->num_states is a power of two */ | |
191 if (!(dict->num_states & (dict->num_states - 1))) | |
192 { | |
193 dict->states = hnj_realloc (dict->states, | |
194 (dict->num_states << 1) * | |
195 sizeof(HyphenState)); | |
196 } | |
197 dict->states[dict->num_states].match = NULL; | |
198 dict->states[dict->num_states].repl = NULL; | |
199 dict->states[dict->num_states].fallback_state = -1; | |
200 dict->states[dict->num_states].num_trans = 0; | |
201 dict->states[dict->num_states].trans = NULL; | |
202 return dict->num_states++; | |
203 } | |
204 | |
205 /* add a transition from state1 to state2 through ch - assumes that the | |
206 transition does not already exist */ | |
207 static void | |
208 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch) | |
209 { | |
210 int num_trans; | |
211 | |
212 num_trans = dict->states[state1].num_trans; | |
213 if (num_trans == 0) | |
214 { | |
215 dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans)); | |
216 } | |
217 else if (!(num_trans & (num_trans - 1))) | |
218 { | |
219 dict->states[state1].trans = hnj_realloc (dict->states[state1].trans, | |
220 (num_trans << 1) * | |
221 sizeof(HyphenTrans)); | |
222 } | |
223 dict->states[state1].trans[num_trans].ch = ch; | |
224 dict->states[state1].trans[num_trans].new_state = state2; | |
225 dict->states[state1].num_trans++; | |
226 } | |
227 | |
228 #ifdef VERBOSE | |
229 HashTab *global; | |
230 | |
231 static char * | |
232 get_state_str (int state) | |
233 { | |
234 int i; | |
235 HashEntry *e; | |
236 | |
237 for (i = 0; i < HASH_SIZE; i++) | |
238 for (e = global->entries[i]; e; e = e->next) | |
239 if (e->val == state) | |
240 return e->key; | |
241 return NULL; | |
242 } | |
243 #endif | |
244 | |
245 HyphenDict * | |
246 hnj_hyphen_load (const char *fn) | |
247 { | |
248 HyphenDict *result; | |
249 FILE *f; | |
250 f = fopen (fn, "r"); | |
251 if (f == NULL) | |
252 return NULL; | |
253 | |
254 result = hnj_hyphen_load_file(f); | |
255 | |
256 fclose(f); | |
257 return result; | |
258 } | |
259 | |
260 HyphenDict * | |
261 hnj_hyphen_load_file (FILE *f) | |
262 { | |
263 HyphenDict *dict[2]; | |
264 HashTab *hashtab; | |
265 char buf[MAX_CHARS]; | |
266 char word[MAX_CHARS]; | |
267 char pattern[MAX_CHARS]; | |
268 char * repl; | |
269 signed char replindex; | |
270 signed char replcut; | |
271 int state_num = 0, last_state; | |
272 int i, j, k; | |
273 char ch; | |
274 int found; | |
275 HashEntry *e; | |
276 int nextlevel = 0; | |
277 | |
278 // loading one or two dictionaries (separated by NEXTLEVEL keyword) | |
279 for (k = 0; k == 0 || (k == 1 && nextlevel); k++) { | |
280 hashtab = hnj_hash_new (); | |
281 #ifdef VERBOSE | |
282 global = hashtab; | |
283 #endif | |
284 hnj_hash_insert (hashtab, "", 0); | |
285 dict[k] = hnj_malloc (sizeof(HyphenDict)); | |
286 dict[k]->num_states = 1; | |
287 dict[k]->states = hnj_malloc (sizeof(HyphenState)); | |
288 dict[k]->states[0].match = NULL; | |
289 dict[k]->states[0].repl = NULL; | |
290 dict[k]->states[0].fallback_state = -1; | |
291 dict[k]->states[0].num_trans = 0; | |
292 dict[k]->states[0].trans = NULL; | |
293 dict[k]->nextlevel = NULL; | |
294 dict[k]->lhmin = 0; | |
295 dict[k]->rhmin = 0; | |
296 dict[k]->clhmin = 0; | |
297 dict[k]->crhmin = 0; | |
298 | |
299 /* read in character set info */ | |
300 if (k == 0) { | |
301 for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0; | |
302 if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) { | |
303 for (i=0;i<MAX_NAME;i++) | |
304 if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n')) | |
305 dict[k]->cset[i] = 0; | |
306 } else { | |
307 dict[k]->cset[0] = 0; | |
308 } | |
309 dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0); | |
310 } else { | |
311 strcpy(dict[k]->cset, dict[0]->cset); | |
312 dict[k]->utf8 = dict[0]->utf8; | |
313 } | |
314 | |
315 while (fgets (buf, sizeof(buf), f) != NULL) | |
316 { | |
317 if (buf[0] != '%') | |
318 { | |
319 if (strncmp(buf, "NEXTLEVEL", 9) == 0) { | |
320 nextlevel = 1; | |
321 break; | |
322 } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) { | |
323 dict[k]->lhmin = atoi(buf + 13); | |
324 continue; | |
325 } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) { | |
326 dict[k]->rhmin = atoi(buf + 14); | |
327 continue; | |
328 } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) { | |
329 dict[k]->clhmin = atoi(buf + 21); | |
330 continue; | |
331 } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) { | |
332 dict[k]->crhmin = atoi(buf + 22); | |
333 continue; | |
334 } | |
335 j = 0; | |
336 pattern[j] = '0'; | |
337 repl = strchr(buf, '/'); | |
338 replindex = 0; | |
339 replcut = 0; | |
340 if (repl) { | |
341 char * index = strchr(repl + 1, ','); | |
342 *repl = '\0'; | |
343 if (index) { | |
344 char * index2 = strchr(index + 1, ','); | |
345 *index = '\0'; | |
346 if (index2) { | |
347 *index2 = '\0'; | |
348 replindex = (signed char) atoi(index + 1) - 1; | |
349 replcut = (signed char) atoi(index2 + 1); | |
350 } | |
351 } else { | |
352 hnj_strchomp(repl + 1); | |
353 replindex = 0; | |
354 replcut = (signed char) strlen(buf); | |
355 } | |
356 repl = hnj_strdup(repl + 1); | |
357 } | |
358 for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++) | |
359 { | |
360 if (buf[i] >= '0' && buf[i] <= '9') | |
361 pattern[j] = buf[i]; | |
362 else | |
363 { | |
364 word[j] = buf[i]; | |
365 pattern[++j] = '0'; | |
366 } | |
367 } | |
368 word[j] = '\0'; | |
369 pattern[j + 1] = '\0'; | |
370 | |
371 i = 0; | |
372 if (!repl) { | |
373 /* Optimize away leading zeroes */ | |
374 for (; pattern[i] == '0'; i++); | |
375 } else { | |
376 if (*word == '.') i++; | |
377 /* convert UTF-8 char. positions of discretionary hyph. replacements
to 8-bit */ | |
378 if (dict[k]->utf8) { | |
379 int pu = -1; /* unicode character position */ | |
380 int ps = -1; /* unicode start position (original replinde
x) */ | |
381 int pc = (*word == '.') ? 1: 0; /* 8-bit character position */ | |
382 for (; pc < (strlen(word) + 1); pc++) { | |
383 /* beginning of an UTF-8 character (not '10' start bits) */ | |
384 if ((((unsigned char) word[pc]) >> 6) != 2) pu++; | |
385 if ((ps < 0) && (replindex == pu)) { | |
386 ps = replindex; | |
387 replindex = (signed char) pc; | |
388 } | |
389 if ((ps >= 0) && ((pu - ps) == replcut)) { | |
390 replcut = (signed char) (pc - replindex); | |
391 break; | |
392 } | |
393 } | |
394 if (*word == '.') replindex--; | |
395 } | |
396 } | |
397 | |
398 #ifdef VERBOSE | |
399 printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j
, repl); | |
400 #endif | |
401 found = hnj_hash_lookup (hashtab, word); | |
402 state_num = hnj_get_state (dict[k], hashtab, word); | |
403 dict[k]->states[state_num].match = hnj_strdup (pattern + i); | |
404 dict[k]->states[state_num].repl = repl; | |
405 dict[k]->states[state_num].replindex = replindex; | |
406 if (!replcut) { | |
407 dict[k]->states[state_num].replcut = (signed char) strlen(word); | |
408 } else { | |
409 dict[k]->states[state_num].replcut = replcut; | |
410 } | |
411 | |
412 /* now, put in the prefix transitions */ | |
413 for (; found < 0 ;j--) | |
414 { | |
415 last_state = state_num; | |
416 ch = word[j - 1]; | |
417 word[j - 1] = '\0'; | |
418 found = hnj_hash_lookup (hashtab, word); | |
419 state_num = hnj_get_state (dict[k], hashtab, word); | |
420 hnj_add_trans (dict[k], state_num, last_state, ch); | |
421 } | |
422 } | |
423 } | |
424 | |
425 /* Could do unioning of matches here (instead of the preprocessor script). | |
426 If we did, the pseudocode would look something like this: | |
427 | |
428 foreach state in the hash table | |
429 foreach i = [1..length(state) - 1] | |
430 state to check is substr (state, i) | |
431 look it up | |
432 if found, and if there is a match, union the match in. | |
433 | |
434 It's also possible to avoid the quadratic blowup by doing the | |
435 search in order of increasing state string sizes - then you | |
436 can break the loop after finding the first match. | |
437 | |
438 This step should be optional in any case - if there is a | |
439 preprocessed rule table, it's always faster to use that. | |
440 | |
441 */ | |
442 | |
443 /* put in the fallback states */ | |
444 for (i = 0; i < HASH_SIZE; i++) | |
445 for (e = hashtab->entries[i]; e; e = e->next) | |
446 { | |
447 if (*(e->key)) for (j = 1; 1; j++) | |
448 { | |
449 state_num = hnj_hash_lookup (hashtab, e->key + j); | |
450 if (state_num >= 0) | |
451 break; | |
452 } | |
453 /* KBH: FIXME state 0 fallback_state should always be -1? */ | |
454 if (e->val) | |
455 dict[k]->states[e->val].fallback_state = state_num; | |
456 } | |
457 #ifdef VERBOSE | |
458 for (i = 0; i < HASH_SIZE; i++) | |
459 for (e = hashtab->entries[i]; e; e = e->next) | |
460 { | |
461 printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val, | |
462 dict[k]->states[e->val].fallback_state); | |
463 for (j = 0; j < dict[k]->states[e->val].num_trans; j++) | |
464 printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch, | |
465 dict[k]->states[e->val].trans[j].new_state); | |
466 } | |
467 #endif | |
468 | |
469 #ifndef VERBOSE | |
470 hnj_hash_free (hashtab); | |
471 #endif | |
472 state_num = 0; | |
473 } | |
474 if (k == 2) dict[0]->nextlevel = dict[1]; | |
475 return dict[0]; | |
476 } | |
477 | |
478 void hnj_hyphen_free (HyphenDict *dict) | |
479 { | |
480 int state_num; | |
481 HyphenState *hstate; | |
482 | |
483 for (state_num = 0; state_num < dict->num_states; state_num++) | |
484 { | |
485 hstate = &dict->states[state_num]; | |
486 if (hstate->match) | |
487 hnj_free (hstate->match); | |
488 if (hstate->repl) | |
489 hnj_free (hstate->repl); | |
490 if (hstate->trans) | |
491 hnj_free (hstate->trans); | |
492 } | |
493 if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel); | |
494 | |
495 hnj_free (dict->states); | |
496 | |
497 hnj_free (dict); | |
498 } | |
499 | |
500 #define MAX_WORD 256 | |
501 | |
502 int hnj_hyphen_hyphenate (HyphenDict *dict, | |
503 const char *word, int word_size, | |
504 char *hyphens) | |
505 { | |
506 char prep_word_buf[MAX_WORD]; | |
507 char *prep_word; | |
508 int i, j, k; | |
509 int state; | |
510 char ch; | |
511 HyphenState *hstate; | |
512 char *match; | |
513 int offset; | |
514 | |
515 if (word_size + 3 < MAX_WORD) | |
516 prep_word = prep_word_buf; | |
517 else | |
518 prep_word = hnj_malloc (word_size + 3); | |
519 | |
520 j = 0; | |
521 prep_word[j++] = '.'; | |
522 | |
523 for (i = 0; i < word_size; i++) | |
524 prep_word[j++] = word[i]; | |
525 | |
526 prep_word[j++] = '.'; | |
527 prep_word[j] = '\0'; | |
528 | |
529 for (i = 0; i < word_size + 5; i++) | |
530 hyphens[i] = '0'; | |
531 | |
532 #ifdef VERBOSE | |
533 printf ("prep_word = %s\n", prep_word); | |
534 #endif | |
535 | |
536 /* now, run the finite state machine */ | |
537 state = 0; | |
538 for (i = 0; i < j; i++) | |
539 { | |
540 ch = prep_word[i]; | |
541 for (;;) | |
542 { | |
543 | |
544 if (state == -1) { | |
545 /* return 1; */ | |
546 /* KBH: FIXME shouldn't this be as follows? */ | |
547 state = 0; | |
548 goto try_next_letter; | |
549 } | |
550 | |
551 #ifdef VERBOSE | |
552 char *state_str; | |
553 state_str = get_state_str (state); | |
554 | |
555 for (k = 0; k < i - strlen (state_str); k++) | |
556 putchar (' '); | |
557 printf ("%s", state_str); | |
558 #endif | |
559 | |
560 hstate = &dict->states[state]; | |
561 for (k = 0; k < hstate->num_trans; k++) | |
562 if (hstate->trans[k].ch == ch) | |
563 { | |
564 state = hstate->trans[k].new_state; | |
565 goto found_state; | |
566 } | |
567 state = hstate->fallback_state; | |
568 #ifdef VERBOSE | |
569 printf (" falling back, fallback_state %d\n", state); | |
570 #endif | |
571 } | |
572 found_state: | |
573 #ifdef VERBOSE | |
574 printf ("found state %d\n",state); | |
575 #endif | |
576 /* Additional optimization is possible here - especially, | |
577 elimination of trailing zeroes from the match. Leading zeroes | |
578 have already been optimized. */ | |
579 match = dict->states[state].match; | |
580 /* replacing rules not handled by hyphen_hyphenate() */ | |
581 if (match && !dict->states[state].repl) | |
582 { | |
583 offset = i + 1 - strlen (match); | |
584 #ifdef VERBOSE | |
585 for (k = 0; k < offset; k++) | |
586 putchar (' '); | |
587 printf ("%s\n", match); | |
588 #endif | |
589 /* This is a linear search because I tried a binary search and | |
590 found it to be just a teeny bit slower. */ | |
591 for (k = 0; match[k]; k++) | |
592 if (hyphens[offset + k] < match[k]) | |
593 hyphens[offset + k] = match[k]; | |
594 } | |
595 | |
596 /* KBH: we need this to make sure we keep looking in a word */ | |
597 /* for patterns even if the current character is not known in state 0 */ | |
598 /* since patterns for hyphenation may occur anywhere in the word */ | |
599 try_next_letter: ; | |
600 | |
601 } | |
602 #ifdef VERBOSE | |
603 for (i = 0; i < j; i++) | |
604 putchar (hyphens[i]); | |
605 putchar ('\n'); | |
606 #endif | |
607 | |
608 for (i = 0; i < j - 4; i++) | |
609 #if 0 | |
610 if (hyphens[i + 1] & 1) | |
611 hyphens[i] = '-'; | |
612 #else | |
613 hyphens[i] = hyphens[i + 1]; | |
614 #endif | |
615 hyphens[0] = '0'; | |
616 for (; i < word_size; i++) | |
617 hyphens[i] = '0'; | |
618 hyphens[word_size] = '\0'; | |
619 | |
620 if (prep_word != prep_word_buf) | |
621 hnj_free (prep_word); | |
622 | |
623 return 0; | |
624 } | |
625 | |
626 /* Unicode ligature length */ | |
627 int hnj_ligature(unsigned char c) { | |
628 switch (c) { | |
629 case 0x80: /* ff */ | |
630 case 0x81: /* fi */ | |
631 case 0x82: return LIG_xx; /* fl */ | |
632 case 0x83: /* ffi */ | |
633 case 0x84: return LIG_xxx; /* ffl */ | |
634 case 0x85: /* long st */ | |
635 case 0x86: return LIG_xx; /* st */ | |
636 } | |
637 return 0; | |
638 } | |
639 | |
640 /* character length of the first n byte of the input word */ | |
641 int hnj_hyphen_strnlen(const char * word, int n, int utf8) | |
642 { | |
643 int i = 0; | |
644 int j = 0; | |
645 while (j < n && word[j] != '\0') { | |
646 i++; | |
647 // Unicode ligature support | |
648 if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j +
1] == 0xAC)) { | |
649 i += hnj_ligature(word[j + 2]); | |
650 } | |
651 for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++); | |
652 } | |
653 return i; | |
654 } | |
655 | |
656 int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens, | |
657 char *** rep, int ** pos, int ** cut, int lhmin) | |
658 { | |
659 int i = 1, j; | |
660 | |
661 // Unicode ligature support | |
662 if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] ==
0xAC)) { | |
663 i += hnj_ligature(word[2]); | |
664 } | |
665 | |
666 for (j = 0; i < lhmin && word[j] != '\0'; i++) do { | |
667 // check length of the non-standard part | |
668 if (*rep && *pos && *cut && (*rep)[j]) { | |
669 char * rh = strchr((*rep)[j], '='); | |
670 if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) + | |
671 hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) { | |
672 free((*rep)[j]); | |
673 (*rep)[j] = NULL; | |
674 hyphens[j] = '0'; | |
675 } | |
676 } else { | |
677 hyphens[j] = '0'; | |
678 } | |
679 j++; | |
680 | |
681 // Unicode ligature support | |
682 if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j
+ 1] == 0xAC)) { | |
683 i += hnj_ligature(word[j + 2]); | |
684 } | |
685 } while (utf8 && (word[j] & 0xc0) == 0x80); | |
686 return 0; | |
687 } | |
688 | |
689 int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens, | |
690 char *** rep, int ** pos, int ** cut, int rhmin) | |
691 { | |
692 int i; | |
693 int j = word_size - 2; | |
694 for (i = 1; i < rhmin && j > 0; j--) { | |
695 // check length of the non-standard part | |
696 if (*rep && *pos && *cut && (*rep)[j]) { | |
697 char * rh = strchr((*rep)[j], '='); | |
698 if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100,
utf8) + | |
699 hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) { | |
700 free((*rep)[j]); | |
701 (*rep)[j] = NULL; | |
702 hyphens[j] = '0'; | |
703 } | |
704 } else { | |
705 hyphens[j] = '0'; | |
706 } | |
707 if (!utf8 || (word[j] & 0xc0) != 0xc0) i++; | |
708 } | |
709 return 0; | |
710 } | |
711 | |
712 // recursive function for compound level hyphenation | |
713 int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, | |
714 char * hyphens, char *** rep, int ** pos, int ** cut, | |
715 int clhmin, int crhmin, int lend, int rend) | |
716 { | |
717 char prep_word_buf[MAX_WORD]; | |
718 char *prep_word; | |
719 int i, j, k; | |
720 int state; | |
721 char ch; | |
722 HyphenState *hstate; | |
723 char *match; | |
724 char *repl; | |
725 signed char replindex; | |
726 signed char replcut; | |
727 int offset; | |
728 int matchlen_buf[MAX_CHARS]; | |
729 int matchindex_buf[MAX_CHARS]; | |
730 char * matchrepl_buf[MAX_CHARS]; | |
731 int * matchlen; | |
732 int * matchindex; | |
733 char ** matchrepl; | |
734 int isrepl = 0; | |
735 int nHyphCount; | |
736 | |
737 if (word_size + 3 < MAX_CHARS) { | |
738 prep_word = prep_word_buf; | |
739 matchlen = matchlen_buf; | |
740 matchindex = matchindex_buf; | |
741 matchrepl = matchrepl_buf; | |
742 } else { | |
743 prep_word = hnj_malloc (word_size + 3); | |
744 matchlen = hnj_malloc ((word_size + 3) * sizeof(int)); | |
745 matchindex = hnj_malloc ((word_size + 3) * sizeof(int)); | |
746 matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *)); | |
747 } | |
748 | |
749 j = 0; | |
750 prep_word[j++] = '.'; | |
751 | |
752 for (i = 0; i < word_size; i++) | |
753 prep_word[j++] = word[i]; | |
754 | |
755 prep_word[j++] = '.'; | |
756 prep_word[j] = '\0'; | |
757 | |
758 for (i = 0; i < j; i++) | |
759 hyphens[i] = '0'; | |
760 | |
761 #ifdef VERBOSE | |
762 printf ("prep_word = %s\n", prep_word); | |
763 #endif | |
764 | |
765 /* now, run the finite state machine */ | |
766 state = 0; | |
767 for (i = 0; i < j; i++) | |
768 { | |
769 ch = prep_word[i]; | |
770 for (;;) | |
771 { | |
772 | |
773 if (state == -1) { | |
774 /* return 1; */ | |
775 /* KBH: FIXME shouldn't this be as follows? */ | |
776 state = 0; | |
777 goto try_next_letter; | |
778 } | |
779 | |
780 #ifdef VERBOSE | |
781 char *state_str; | |
782 state_str = get_state_str (state); | |
783 | |
784 for (k = 0; k < i - strlen (state_str); k++) | |
785 putchar (' '); | |
786 printf ("%s", state_str); | |
787 #endif | |
788 | |
789 hstate = &dict->states[state]; | |
790 for (k = 0; k < hstate->num_trans; k++) | |
791 if (hstate->trans[k].ch == ch) | |
792 { | |
793 state = hstate->trans[k].new_state; | |
794 goto found_state; | |
795 } | |
796 state = hstate->fallback_state; | |
797 #ifdef VERBOSE | |
798 printf (" falling back, fallback_state %d\n", state); | |
799 #endif | |
800 } | |
801 found_state: | |
802 #ifdef VERBOSE | |
803 printf ("found state %d\n",state); | |
804 #endif | |
805 /* Additional optimization is possible here - especially, | |
806 elimination of trailing zeroes from the match. Leading zeroes | |
807 have already been optimized. */ | |
808 match = dict->states[state].match; | |
809 repl = dict->states[state].repl; | |
810 replindex = dict->states[state].replindex; | |
811 replcut = dict->states[state].replcut; | |
812 /* replacing rules not handled by hyphen_hyphenate() */ | |
813 if (match) | |
814 { | |
815 offset = i + 1 - strlen (match); | |
816 #ifdef VERBOSE | |
817 for (k = 0; k < offset; k++) | |
818 putchar (' '); | |
819 printf ("%s (%s)\n", match, repl); | |
820 #endif | |
821 if (repl) { | |
822 if (!isrepl) for(; isrepl < word_size; isrepl++) { | |
823 matchrepl[isrepl] = NULL; | |
824 matchindex[isrepl] = -1; | |
825 } | |
826 matchlen[offset + replindex] = replcut; | |
827 } | |
828 /* This is a linear search because I tried a binary search and | |
829 found it to be just a teeny bit slower. */ | |
830 for (k = 0; match[k]; k++) { | |
831 if ((hyphens[offset + k] < match[k])) { | |
832 hyphens[offset + k] = match[k]; | |
833 if (match[k]&1) { | |
834 matchrepl[offset + k] = repl; | |
835 if (repl && (k >= replindex) && (k <= replindex + replcut)) { | |
836 matchindex[offset + replindex] = offset + k; | |
837 } | |
838 } | |
839 } | |
840 } | |
841 | |
842 } | |
843 | |
844 /* KBH: we need this to make sure we keep looking in a word */ | |
845 /* for patterns even if the current character is not known in state 0 */ | |
846 /* since patterns for hyphenation may occur anywhere in the word */ | |
847 try_next_letter: ; | |
848 | |
849 } | |
850 #ifdef VERBOSE | |
851 for (i = 0; i < j; i++) | |
852 putchar (hyphens[i]); | |
853 putchar ('\n'); | |
854 #endif | |
855 | |
856 for (i = 0; i < j - 3; i++) | |
857 #if 0 | |
858 if (hyphens[i + 1] & 1) | |
859 hyphens[i] = '-'; | |
860 #else | |
861 hyphens[i] = hyphens[i + 1]; | |
862 #endif | |
863 for (; i < word_size; i++) | |
864 hyphens[i] = '0'; | |
865 hyphens[word_size] = '\0'; | |
866 | |
867 /* now create a new char string showing hyphenation positions */ | |
868 /* count the hyphens and allocate space for the new hyphenated string */ | |
869 nHyphCount = 0; | |
870 for (i = 0; i < word_size; i++) | |
871 if (hyphens[i]&1) | |
872 nHyphCount++; | |
873 j = 0; | |
874 for (i = 0; i < word_size; i++) { | |
875 if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { | |
876 if (rep && pos && cut) { | |
877 if (!*rep && !*pos && !*cut) { | |
878 int k; | |
879 *rep = (char **) malloc(sizeof(char *) * word_size); | |
880 *pos = (int *) malloc(sizeof(int) * word_size); | |
881 *cut = (int *) malloc(sizeof(int) * word_size); | |
882 for (k = 0; k < word_size; k++) { | |
883 (*rep)[k] = NULL; | |
884 (*pos)[k] = 0; | |
885 (*cut)[k] = 0; | |
886 } | |
887 } | |
888 (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[
i]]); | |
889 (*pos)[matchindex[i] - 1] = matchindex[i] - i; | |
890 (*cut)[matchindex[i] - 1] = matchlen[i]; | |
891 } | |
892 j += strlen(matchrepl[matchindex[i]]); | |
893 i += matchlen[i] - 1; | |
894 } | |
895 } | |
896 | |
897 if (matchrepl != matchrepl_buf) { | |
898 hnj_free (matchrepl); | |
899 hnj_free (matchlen); | |
900 hnj_free (matchindex); | |
901 } | |
902 | |
903 // recursive hyphenation of the first (compound) level segments | |
904 if (dict->nextlevel) { | |
905 char * rep2_buf[MAX_WORD]; | |
906 int pos2_buf[MAX_WORD]; | |
907 int cut2_buf[MAX_WORD]; | |
908 char hyphens2_buf[MAX_WORD]; | |
909 char ** rep2; | |
910 int * pos2; | |
911 int * cut2; | |
912 char * hyphens2; | |
913 int begin = 0; | |
914 if (word_size < MAX_CHARS) { | |
915 rep2 = rep2_buf; | |
916 pos2 = pos2_buf; | |
917 cut2 = cut2_buf; | |
918 hyphens2 = hyphens2_buf; | |
919 } else { | |
920 rep2 = hnj_malloc (word_size * sizeof(char *)); | |
921 pos2 = hnj_malloc (word_size * sizeof(int)); | |
922 cut2 = hnj_malloc (word_size * sizeof(int)); | |
923 hyphens2 = hnj_malloc (word_size); | |
924 } | |
925 for (i = 0; i < word_size; i++) rep2[i] = NULL; | |
926 for (i = 0; i < word_size; i++) if | |
927 (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) { | |
928 if (i - begin > 1) { | |
929 int hyph = 0; | |
930 prep_word[i + 2] = '\0'; | |
931 /* non-standard hyphenation at compound boundary (Schiffahrt) */ | |
932 if (*rep && *pos && *cut && (*rep)[i]) { | |
933 char * l = strchr((*rep)[i], '='); | |
934 strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]); | |
935 if (l) { | |
936 hyph = (l - (*rep)[i]) - (*pos)[i]; | |
937 prep_word[2 + i + hyph] = '\0'; | |
938 } | |
939 } | |
940 hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph, | |
941 hyphens2, &rep2, &pos2, &cut2, clhmin, | |
942 crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend)); | |
943 for (j = 0; j < i - begin - 1; j++) { | |
944 hyphens[begin + j] = hyphens2[j]; | |
945 if (rep2[j] && rep && pos && cut) { | |
946 if (!*rep && !*pos && !*cut) { | |
947 int k; | |
948 *rep = (char **) malloc(sizeof(char *) * word_size); | |
949 *pos = (int *) malloc(sizeof(int) * word_size); | |
950 *cut = (int *) malloc(sizeof(int) * word_size); | |
951 for (k = 0; k < word_size; k++) { | |
952 (*rep)[k] = NULL; | |
953 (*pos)[k] = 0; | |
954 (*cut)[k] = 0; | |
955 } | |
956 } | |
957 (*rep)[begin + j] = rep2[j]; | |
958 (*pos)[begin + j] = pos2[j]; | |
959 (*cut)[begin + j] = cut2[j]; | |
960 } | |
961 } | |
962 prep_word[i + 2] = word[i + 1]; | |
963 if (*rep && *pos && *cut && (*rep)[i]) { | |
964 strcpy(prep_word + 1, word); | |
965 } | |
966 } | |
967 begin = i + 1; | |
968 for (j = 0; j < word_size; j++) rep2[j] = NULL; | |
969 } | |
970 | |
971 // non-compound | |
972 if (begin == 0) { | |
973 hnj_hyphen_hyph_(dict->nextlevel, word, word_size, | |
974 hyphens, rep, pos, cut, clhmin, crhmin, lend, rend); | |
975 if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, | |
976 rep, pos, cut, clhmin); | |
977 if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, | |
978 rep, pos, cut, crhmin); | |
979 } | |
980 | |
981 if (rep2 != rep2_buf) { | |
982 free(rep2); | |
983 free(cut2); | |
984 free(pos2); | |
985 free(hyphens2); | |
986 } | |
987 } | |
988 | |
989 if (prep_word != prep_word_buf) hnj_free (prep_word); | |
990 return 0; | |
991 } | |
992 | |
993 /* UTF-8 normalization of hyphen and non-standard positions */ | |
994 int hnj_hyphen_norm(const char *word, int word_size, char * hyphens, | |
995 char *** rep, int ** pos, int ** cut) | |
996 { | |
997 int i, j, k; | |
998 if ((((unsigned char) word[0]) >> 6) == 2) { | |
999 fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word); | |
1000 return 1; | |
1001 } | |
1002 | |
1003 /* calculate UTF-8 character positions */ | |
1004 for (i = 0, j = -1; i < word_size; i++) { | |
1005 /* beginning of an UTF-8 character (not '10' start bits) */ | |
1006 if ((((unsigned char) word[i]) >> 6) != 2) j++; | |
1007 hyphens[j] = hyphens[i]; | |
1008 if (rep && pos && cut && *rep && *pos && *cut) { | |
1009 int l = (*pos)[i]; | |
1010 (*pos)[j] = 0; | |
1011 for (k = 0; k < l; k++) { | |
1012 if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++; | |
1013 } | |
1014 k = i - l + 1; | |
1015 l = k + (*cut)[i]; | |
1016 (*cut)[j] = 0; | |
1017 for (; k < l; k++) { | |
1018 if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++; | |
1019 } | |
1020 (*rep)[j] = (*rep)[i]; | |
1021 if (j < i) { | |
1022 (*rep)[i] = NULL; | |
1023 (*pos)[i] = 0; | |
1024 (*cut)[i] = 0; | |
1025 } | |
1026 } | |
1027 } | |
1028 hyphens[j + 1] = '\0'; | |
1029 return 0; | |
1030 } | |
1031 | |
1032 /* get the word with all possible hyphenations (output: hyphword) */ | |
1033 void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens, | |
1034 char * hyphword, char *** rep, int ** pos, int ** cut) | |
1035 { | |
1036 int i, j; | |
1037 for (i = 0, j = 0; i < l; i++, j++) { | |
1038 if (hyphens[i]&1) { | |
1039 hyphword[j] = word[i]; | |
1040 if (*rep && *pos && *cut && (*rep)[i]) { | |
1041 strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]); | |
1042 j += strlen((*rep)[i]) - (*pos)[i]; | |
1043 i += (*cut)[i] - (*pos)[i]; | |
1044 } else hyphword[++j] = '='; | |
1045 } else hyphword[j] = word[i]; | |
1046 } | |
1047 hyphword[j] = '\0'; | |
1048 } | |
1049 | |
1050 | |
1051 /* main api function with default hyphenmin parameters */ | |
1052 int hnj_hyphen_hyphenate2 (HyphenDict *dict, | |
1053 const char *word, int word_size, char * hyphens, | |
1054 char *hyphword, char *** rep, int ** pos, int ** cut) | |
1055 { | |
1056 hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, | |
1057 dict->clhmin, dict->crhmin, 1, 1); | |
1058 hnj_hyphen_lhmin(dict->utf8, word, word_size, | |
1059 hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2)); | |
1060 hnj_hyphen_rhmin(dict->utf8, word, word_size, | |
1061 hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2)); | |
1062 if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos
, cut); | |
1063 if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut
); | |
1064 return 0; | |
1065 } | |
1066 | |
1067 /* previous main api function with hyphenmin parameters */ | |
1068 int hnj_hyphen_hyphenate3 (HyphenDict *dict, | |
1069 const char *word, int word_size, char * hyphens, | |
1070 char *hyphword, char *** rep, int ** pos, int ** cut, | |
1071 int lhmin, int rhmin, int clhmin, int crhmin) | |
1072 { | |
1073 lhmin = (lhmin > 0 ? lhmin : dict->lhmin); | |
1074 rhmin = (rhmin > 0 ? rhmin : dict->rhmin); | |
1075 hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, | |
1076 clhmin, crhmin, 1, 1); | |
1077 hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, | |
1078 rep, pos, cut, (lhmin > 0 ? lhmin : 2)); | |
1079 hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, | |
1080 rep, pos, cut, (rhmin > 0 ? rhmin : 2)); | |
1081 if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos
, cut); | |
1082 if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut
); | |
1083 return 0; | |
1084 } | |
OLD | NEW |