Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(268)

Side by Side Diff: third_party/hyphen/hyphen.c

Issue 20860003: Remove hyphenation code from Chromium. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: rebase Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « third_party/hyphen/hyphen.h ('k') | third_party/hyphen/hyphen.gyp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
2 * licenses follows.
3 */
4
5 /* LibHnj - a library for high quality hyphenation and justification
6 * Copyright (C) 1998 Raph Levien,
7 * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
8 * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
9 * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
10 *
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Library General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Library General Public License for more details.
20 *
21 * You should have received a copy of the GNU Library General Public
22 * License along with this library; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 02111-1307 USA.
25 */
26
27 /*
28 * The contents of this file are subject to the Mozilla Public License
29 * Version 1.0 (the "MPL"); you may not use this file except in
30 * compliance with the MPL. You may obtain a copy of the MPL at
31 * http://www.mozilla.org/MPL/
32 *
33 * Software distributed under the MPL is distributed on an "AS IS" basis,
34 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
35 * for the specific language governing rights and limitations under the
36 * MPL.
37 *
38 */
39 #include <stdlib.h> /* for NULL, malloc */
40 #include <stdio.h> /* for fprintf */
41 #include <string.h> /* for strdup */
42
43 #ifdef UNX
44 #include <unistd.h> /* for exit */
45 #endif
46
47 #define noVERBOSE
48
49 /* calculate hyphenmin values with long ligature length (2 or 3 characters
50 * instead of 1 or 2) for comparison with hyphenation without ligatures */
51 #define noLONG_LIGATURE
52
53 #ifdef LONG_LIGATURE
54 #define LIG_xx 1
55 #define LIG_xxx 2
56 #else
57 #define LIG_xx 0
58 #define LIG_xxx 1
59 #endif
60
61 #include "hnjalloc.h"
62 #include "hyphen.h"
63
64 static char *
65 hnj_strdup (const char *s)
66 {
67 char *new;
68 int l;
69
70 l = strlen (s);
71 new = hnj_malloc (l + 1);
72 memcpy (new, s, l);
73 new[l] = 0;
74 return new;
75 }
76
77 /* remove cross-platform text line end characters */
78 void hnj_strchomp(char * s)
79 {
80 int k = strlen(s);
81 if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
82 if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
83 }
84
85 /* a little bit of a hash table implementation. This simply maps strings
86 to state numbers */
87
88 typedef struct _HashTab HashTab;
89 typedef struct _HashEntry HashEntry;
90
91 /* A cheap, but effective, hack. */
92 #define HASH_SIZE 31627
93
94 struct _HashTab {
95 HashEntry *entries[HASH_SIZE];
96 };
97
98 struct _HashEntry {
99 HashEntry *next;
100 char *key;
101 int val;
102 };
103
104 /* a char* hash function from ASU - adapted from Gtk+ */
105 static unsigned int
106 hnj_string_hash (const char *s)
107 {
108 const char *p;
109 unsigned int h=0, g;
110 for(p = s; *p != '\0'; p += 1) {
111 h = ( h << 4 ) + *p;
112 if ( ( g = h & 0xf0000000 ) ) {
113 h = h ^ (g >> 24);
114 h = h ^ g;
115 }
116 }
117 return h /* % M */;
118 }
119
120 static HashTab *
121 hnj_hash_new (void)
122 {
123 HashTab *hashtab;
124 int i;
125
126 hashtab = hnj_malloc (sizeof(HashTab));
127 for (i = 0; i < HASH_SIZE; i++)
128 hashtab->entries[i] = NULL;
129
130 return hashtab;
131 }
132
133 static void
134 hnj_hash_free (HashTab *hashtab)
135 {
136 int i;
137 HashEntry *e, *next;
138
139 for (i = 0; i < HASH_SIZE; i++)
140 for (e = hashtab->entries[i]; e; e = next)
141 {
142 next = e->next;
143 hnj_free (e->key);
144 hnj_free (e);
145 }
146
147 hnj_free (hashtab);
148 }
149
150 /* assumes that key is not already present! */
151 static void
152 hnj_hash_insert (HashTab *hashtab, const char *key, int val)
153 {
154 int i;
155 HashEntry *e;
156
157 i = hnj_string_hash (key) % HASH_SIZE;
158 e = hnj_malloc (sizeof(HashEntry));
159 e->next = hashtab->entries[i];
160 e->key = hnj_strdup (key);
161 e->val = val;
162 hashtab->entries[i] = e;
163 }
164
165 /* return val if found, otherwise -1 */
166 static int
167 hnj_hash_lookup (HashTab *hashtab, const char *key)
168 {
169 int i;
170 HashEntry *e;
171 i = hnj_string_hash (key) % HASH_SIZE;
172 for (e = hashtab->entries[i]; e; e = e->next)
173 if (!strcmp (key, e->key))
174 return e->val;
175 return -1;
176 }
177
178 /* Get the state number, allocating a new state if necessary. */
179 static int
180 hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
181 {
182 int state_num;
183
184 state_num = hnj_hash_lookup (hashtab, string);
185
186 if (state_num >= 0)
187 return state_num;
188
189 hnj_hash_insert (hashtab, string, dict->num_states);
190 /* predicate is true if dict->num_states is a power of two */
191 if (!(dict->num_states & (dict->num_states - 1)))
192 {
193 dict->states = hnj_realloc (dict->states,
194 (dict->num_states << 1) *
195 sizeof(HyphenState));
196 }
197 dict->states[dict->num_states].match = NULL;
198 dict->states[dict->num_states].repl = NULL;
199 dict->states[dict->num_states].fallback_state = -1;
200 dict->states[dict->num_states].num_trans = 0;
201 dict->states[dict->num_states].trans = NULL;
202 return dict->num_states++;
203 }
204
205 /* add a transition from state1 to state2 through ch - assumes that the
206 transition does not already exist */
207 static void
208 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
209 {
210 int num_trans;
211
212 num_trans = dict->states[state1].num_trans;
213 if (num_trans == 0)
214 {
215 dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
216 }
217 else if (!(num_trans & (num_trans - 1)))
218 {
219 dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
220 (num_trans << 1) *
221 sizeof(HyphenTrans));
222 }
223 dict->states[state1].trans[num_trans].ch = ch;
224 dict->states[state1].trans[num_trans].new_state = state2;
225 dict->states[state1].num_trans++;
226 }
227
228 #ifdef VERBOSE
229 HashTab *global;
230
231 static char *
232 get_state_str (int state)
233 {
234 int i;
235 HashEntry *e;
236
237 for (i = 0; i < HASH_SIZE; i++)
238 for (e = global->entries[i]; e; e = e->next)
239 if (e->val == state)
240 return e->key;
241 return NULL;
242 }
243 #endif
244
245 HyphenDict *
246 hnj_hyphen_load (const char *fn)
247 {
248 HyphenDict *result;
249 FILE *f;
250 f = fopen (fn, "r");
251 if (f == NULL)
252 return NULL;
253
254 result = hnj_hyphen_load_file(f);
255
256 fclose(f);
257 return result;
258 }
259
260 HyphenDict *
261 hnj_hyphen_load_file (FILE *f)
262 {
263 HyphenDict *dict[2];
264 HashTab *hashtab;
265 char buf[MAX_CHARS];
266 char word[MAX_CHARS];
267 char pattern[MAX_CHARS];
268 char * repl;
269 signed char replindex;
270 signed char replcut;
271 int state_num = 0, last_state;
272 int i, j, k;
273 char ch;
274 int found;
275 HashEntry *e;
276 int nextlevel = 0;
277
278 // loading one or two dictionaries (separated by NEXTLEVEL keyword)
279 for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
280 hashtab = hnj_hash_new ();
281 #ifdef VERBOSE
282 global = hashtab;
283 #endif
284 hnj_hash_insert (hashtab, "", 0);
285 dict[k] = hnj_malloc (sizeof(HyphenDict));
286 dict[k]->num_states = 1;
287 dict[k]->states = hnj_malloc (sizeof(HyphenState));
288 dict[k]->states[0].match = NULL;
289 dict[k]->states[0].repl = NULL;
290 dict[k]->states[0].fallback_state = -1;
291 dict[k]->states[0].num_trans = 0;
292 dict[k]->states[0].trans = NULL;
293 dict[k]->nextlevel = NULL;
294 dict[k]->lhmin = 0;
295 dict[k]->rhmin = 0;
296 dict[k]->clhmin = 0;
297 dict[k]->crhmin = 0;
298
299 /* read in character set info */
300 if (k == 0) {
301 for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
302 if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) {
303 for (i=0;i<MAX_NAME;i++)
304 if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
305 dict[k]->cset[i] = 0;
306 } else {
307 dict[k]->cset[0] = 0;
308 }
309 dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
310 } else {
311 strcpy(dict[k]->cset, dict[0]->cset);
312 dict[k]->utf8 = dict[0]->utf8;
313 }
314
315 while (fgets (buf, sizeof(buf), f) != NULL)
316 {
317 if (buf[0] != '%')
318 {
319 if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
320 nextlevel = 1;
321 break;
322 } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
323 dict[k]->lhmin = atoi(buf + 13);
324 continue;
325 } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
326 dict[k]->rhmin = atoi(buf + 14);
327 continue;
328 } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
329 dict[k]->clhmin = atoi(buf + 21);
330 continue;
331 } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
332 dict[k]->crhmin = atoi(buf + 22);
333 continue;
334 }
335 j = 0;
336 pattern[j] = '0';
337 repl = strchr(buf, '/');
338 replindex = 0;
339 replcut = 0;
340 if (repl) {
341 char * index = strchr(repl + 1, ',');
342 *repl = '\0';
343 if (index) {
344 char * index2 = strchr(index + 1, ',');
345 *index = '\0';
346 if (index2) {
347 *index2 = '\0';
348 replindex = (signed char) atoi(index + 1) - 1;
349 replcut = (signed char) atoi(index2 + 1);
350 }
351 } else {
352 hnj_strchomp(repl + 1);
353 replindex = 0;
354 replcut = (signed char) strlen(buf);
355 }
356 repl = hnj_strdup(repl + 1);
357 }
358 for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
359 {
360 if (buf[i] >= '0' && buf[i] <= '9')
361 pattern[j] = buf[i];
362 else
363 {
364 word[j] = buf[i];
365 pattern[++j] = '0';
366 }
367 }
368 word[j] = '\0';
369 pattern[j + 1] = '\0';
370
371 i = 0;
372 if (!repl) {
373 /* Optimize away leading zeroes */
374 for (; pattern[i] == '0'; i++);
375 } else {
376 if (*word == '.') i++;
377 /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
378 if (dict[k]->utf8) {
379 int pu = -1; /* unicode character position */
380 int ps = -1; /* unicode start position (original replinde x) */
381 int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
382 for (; pc < (strlen(word) + 1); pc++) {
383 /* beginning of an UTF-8 character (not '10' start bits) */
384 if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
385 if ((ps < 0) && (replindex == pu)) {
386 ps = replindex;
387 replindex = (signed char) pc;
388 }
389 if ((ps >= 0) && ((pu - ps) == replcut)) {
390 replcut = (signed char) (pc - replindex);
391 break;
392 }
393 }
394 if (*word == '.') replindex--;
395 }
396 }
397
398 #ifdef VERBOSE
399 printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j , repl);
400 #endif
401 found = hnj_hash_lookup (hashtab, word);
402 state_num = hnj_get_state (dict[k], hashtab, word);
403 dict[k]->states[state_num].match = hnj_strdup (pattern + i);
404 dict[k]->states[state_num].repl = repl;
405 dict[k]->states[state_num].replindex = replindex;
406 if (!replcut) {
407 dict[k]->states[state_num].replcut = (signed char) strlen(word);
408 } else {
409 dict[k]->states[state_num].replcut = replcut;
410 }
411
412 /* now, put in the prefix transitions */
413 for (; found < 0 ;j--)
414 {
415 last_state = state_num;
416 ch = word[j - 1];
417 word[j - 1] = '\0';
418 found = hnj_hash_lookup (hashtab, word);
419 state_num = hnj_get_state (dict[k], hashtab, word);
420 hnj_add_trans (dict[k], state_num, last_state, ch);
421 }
422 }
423 }
424
425 /* Could do unioning of matches here (instead of the preprocessor script).
426 If we did, the pseudocode would look something like this:
427
428 foreach state in the hash table
429 foreach i = [1..length(state) - 1]
430 state to check is substr (state, i)
431 look it up
432 if found, and if there is a match, union the match in.
433
434 It's also possible to avoid the quadratic blowup by doing the
435 search in order of increasing state string sizes - then you
436 can break the loop after finding the first match.
437
438 This step should be optional in any case - if there is a
439 preprocessed rule table, it's always faster to use that.
440
441 */
442
443 /* put in the fallback states */
444 for (i = 0; i < HASH_SIZE; i++)
445 for (e = hashtab->entries[i]; e; e = e->next)
446 {
447 if (*(e->key)) for (j = 1; 1; j++)
448 {
449 state_num = hnj_hash_lookup (hashtab, e->key + j);
450 if (state_num >= 0)
451 break;
452 }
453 /* KBH: FIXME state 0 fallback_state should always be -1? */
454 if (e->val)
455 dict[k]->states[e->val].fallback_state = state_num;
456 }
457 #ifdef VERBOSE
458 for (i = 0; i < HASH_SIZE; i++)
459 for (e = hashtab->entries[i]; e; e = e->next)
460 {
461 printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
462 dict[k]->states[e->val].fallback_state);
463 for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
464 printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
465 dict[k]->states[e->val].trans[j].new_state);
466 }
467 #endif
468
469 #ifndef VERBOSE
470 hnj_hash_free (hashtab);
471 #endif
472 state_num = 0;
473 }
474 if (k == 2) dict[0]->nextlevel = dict[1];
475 return dict[0];
476 }
477
478 void hnj_hyphen_free (HyphenDict *dict)
479 {
480 int state_num;
481 HyphenState *hstate;
482
483 for (state_num = 0; state_num < dict->num_states; state_num++)
484 {
485 hstate = &dict->states[state_num];
486 if (hstate->match)
487 hnj_free (hstate->match);
488 if (hstate->repl)
489 hnj_free (hstate->repl);
490 if (hstate->trans)
491 hnj_free (hstate->trans);
492 }
493 if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
494
495 hnj_free (dict->states);
496
497 hnj_free (dict);
498 }
499
500 #define MAX_WORD 256
501
502 int hnj_hyphen_hyphenate (HyphenDict *dict,
503 const char *word, int word_size,
504 char *hyphens)
505 {
506 char prep_word_buf[MAX_WORD];
507 char *prep_word;
508 int i, j, k;
509 int state;
510 char ch;
511 HyphenState *hstate;
512 char *match;
513 int offset;
514
515 if (word_size + 3 < MAX_WORD)
516 prep_word = prep_word_buf;
517 else
518 prep_word = hnj_malloc (word_size + 3);
519
520 j = 0;
521 prep_word[j++] = '.';
522
523 for (i = 0; i < word_size; i++)
524 prep_word[j++] = word[i];
525
526 prep_word[j++] = '.';
527 prep_word[j] = '\0';
528
529 for (i = 0; i < word_size + 5; i++)
530 hyphens[i] = '0';
531
532 #ifdef VERBOSE
533 printf ("prep_word = %s\n", prep_word);
534 #endif
535
536 /* now, run the finite state machine */
537 state = 0;
538 for (i = 0; i < j; i++)
539 {
540 ch = prep_word[i];
541 for (;;)
542 {
543
544 if (state == -1) {
545 /* return 1; */
546 /* KBH: FIXME shouldn't this be as follows? */
547 state = 0;
548 goto try_next_letter;
549 }
550
551 #ifdef VERBOSE
552 char *state_str;
553 state_str = get_state_str (state);
554
555 for (k = 0; k < i - strlen (state_str); k++)
556 putchar (' ');
557 printf ("%s", state_str);
558 #endif
559
560 hstate = &dict->states[state];
561 for (k = 0; k < hstate->num_trans; k++)
562 if (hstate->trans[k].ch == ch)
563 {
564 state = hstate->trans[k].new_state;
565 goto found_state;
566 }
567 state = hstate->fallback_state;
568 #ifdef VERBOSE
569 printf (" falling back, fallback_state %d\n", state);
570 #endif
571 }
572 found_state:
573 #ifdef VERBOSE
574 printf ("found state %d\n",state);
575 #endif
576 /* Additional optimization is possible here - especially,
577 elimination of trailing zeroes from the match. Leading zeroes
578 have already been optimized. */
579 match = dict->states[state].match;
580 /* replacing rules not handled by hyphen_hyphenate() */
581 if (match && !dict->states[state].repl)
582 {
583 offset = i + 1 - strlen (match);
584 #ifdef VERBOSE
585 for (k = 0; k < offset; k++)
586 putchar (' ');
587 printf ("%s\n", match);
588 #endif
589 /* This is a linear search because I tried a binary search and
590 found it to be just a teeny bit slower. */
591 for (k = 0; match[k]; k++)
592 if (hyphens[offset + k] < match[k])
593 hyphens[offset + k] = match[k];
594 }
595
596 /* KBH: we need this to make sure we keep looking in a word */
597 /* for patterns even if the current character is not known in state 0 */
598 /* since patterns for hyphenation may occur anywhere in the word */
599 try_next_letter: ;
600
601 }
602 #ifdef VERBOSE
603 for (i = 0; i < j; i++)
604 putchar (hyphens[i]);
605 putchar ('\n');
606 #endif
607
608 for (i = 0; i < j - 4; i++)
609 #if 0
610 if (hyphens[i + 1] & 1)
611 hyphens[i] = '-';
612 #else
613 hyphens[i] = hyphens[i + 1];
614 #endif
615 hyphens[0] = '0';
616 for (; i < word_size; i++)
617 hyphens[i] = '0';
618 hyphens[word_size] = '\0';
619
620 if (prep_word != prep_word_buf)
621 hnj_free (prep_word);
622
623 return 0;
624 }
625
626 /* Unicode ligature length */
627 int hnj_ligature(unsigned char c) {
628 switch (c) {
629 case 0x80: /* ff */
630 case 0x81: /* fi */
631 case 0x82: return LIG_xx; /* fl */
632 case 0x83: /* ffi */
633 case 0x84: return LIG_xxx; /* ffl */
634 case 0x85: /* long st */
635 case 0x86: return LIG_xx; /* st */
636 }
637 return 0;
638 }
639
640 /* character length of the first n byte of the input word */
641 int hnj_hyphen_strnlen(const char * word, int n, int utf8)
642 {
643 int i = 0;
644 int j = 0;
645 while (j < n && word[j] != '\0') {
646 i++;
647 // Unicode ligature support
648 if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
649 i += hnj_ligature(word[j + 2]);
650 }
651 for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
652 }
653 return i;
654 }
655
656 int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
657 char *** rep, int ** pos, int ** cut, int lhmin)
658 {
659 int i = 1, j;
660
661 // Unicode ligature support
662 if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) {
663 i += hnj_ligature(word[2]);
664 }
665
666 for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
667 // check length of the non-standard part
668 if (*rep && *pos && *cut && (*rep)[j]) {
669 char * rh = strchr((*rep)[j], '=');
670 if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
671 hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
672 free((*rep)[j]);
673 (*rep)[j] = NULL;
674 hyphens[j] = '0';
675 }
676 } else {
677 hyphens[j] = '0';
678 }
679 j++;
680
681 // Unicode ligature support
682 if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
683 i += hnj_ligature(word[j + 2]);
684 }
685 } while (utf8 && (word[j] & 0xc0) == 0x80);
686 return 0;
687 }
688
689 int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
690 char *** rep, int ** pos, int ** cut, int rhmin)
691 {
692 int i;
693 int j = word_size - 2;
694 for (i = 1; i < rhmin && j > 0; j--) {
695 // check length of the non-standard part
696 if (*rep && *pos && *cut && (*rep)[j]) {
697 char * rh = strchr((*rep)[j], '=');
698 if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
699 hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
700 free((*rep)[j]);
701 (*rep)[j] = NULL;
702 hyphens[j] = '0';
703 }
704 } else {
705 hyphens[j] = '0';
706 }
707 if (!utf8 || (word[j] & 0xc0) != 0xc0) i++;
708 }
709 return 0;
710 }
711
712 // recursive function for compound level hyphenation
713 int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
714 char * hyphens, char *** rep, int ** pos, int ** cut,
715 int clhmin, int crhmin, int lend, int rend)
716 {
717 char prep_word_buf[MAX_WORD];
718 char *prep_word;
719 int i, j, k;
720 int state;
721 char ch;
722 HyphenState *hstate;
723 char *match;
724 char *repl;
725 signed char replindex;
726 signed char replcut;
727 int offset;
728 int matchlen_buf[MAX_CHARS];
729 int matchindex_buf[MAX_CHARS];
730 char * matchrepl_buf[MAX_CHARS];
731 int * matchlen;
732 int * matchindex;
733 char ** matchrepl;
734 int isrepl = 0;
735 int nHyphCount;
736
737 if (word_size + 3 < MAX_CHARS) {
738 prep_word = prep_word_buf;
739 matchlen = matchlen_buf;
740 matchindex = matchindex_buf;
741 matchrepl = matchrepl_buf;
742 } else {
743 prep_word = hnj_malloc (word_size + 3);
744 matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
745 matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
746 matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
747 }
748
749 j = 0;
750 prep_word[j++] = '.';
751
752 for (i = 0; i < word_size; i++)
753 prep_word[j++] = word[i];
754
755 prep_word[j++] = '.';
756 prep_word[j] = '\0';
757
758 for (i = 0; i < j; i++)
759 hyphens[i] = '0';
760
761 #ifdef VERBOSE
762 printf ("prep_word = %s\n", prep_word);
763 #endif
764
765 /* now, run the finite state machine */
766 state = 0;
767 for (i = 0; i < j; i++)
768 {
769 ch = prep_word[i];
770 for (;;)
771 {
772
773 if (state == -1) {
774 /* return 1; */
775 /* KBH: FIXME shouldn't this be as follows? */
776 state = 0;
777 goto try_next_letter;
778 }
779
780 #ifdef VERBOSE
781 char *state_str;
782 state_str = get_state_str (state);
783
784 for (k = 0; k < i - strlen (state_str); k++)
785 putchar (' ');
786 printf ("%s", state_str);
787 #endif
788
789 hstate = &dict->states[state];
790 for (k = 0; k < hstate->num_trans; k++)
791 if (hstate->trans[k].ch == ch)
792 {
793 state = hstate->trans[k].new_state;
794 goto found_state;
795 }
796 state = hstate->fallback_state;
797 #ifdef VERBOSE
798 printf (" falling back, fallback_state %d\n", state);
799 #endif
800 }
801 found_state:
802 #ifdef VERBOSE
803 printf ("found state %d\n",state);
804 #endif
805 /* Additional optimization is possible here - especially,
806 elimination of trailing zeroes from the match. Leading zeroes
807 have already been optimized. */
808 match = dict->states[state].match;
809 repl = dict->states[state].repl;
810 replindex = dict->states[state].replindex;
811 replcut = dict->states[state].replcut;
812 /* replacing rules not handled by hyphen_hyphenate() */
813 if (match)
814 {
815 offset = i + 1 - strlen (match);
816 #ifdef VERBOSE
817 for (k = 0; k < offset; k++)
818 putchar (' ');
819 printf ("%s (%s)\n", match, repl);
820 #endif
821 if (repl) {
822 if (!isrepl) for(; isrepl < word_size; isrepl++) {
823 matchrepl[isrepl] = NULL;
824 matchindex[isrepl] = -1;
825 }
826 matchlen[offset + replindex] = replcut;
827 }
828 /* This is a linear search because I tried a binary search and
829 found it to be just a teeny bit slower. */
830 for (k = 0; match[k]; k++) {
831 if ((hyphens[offset + k] < match[k])) {
832 hyphens[offset + k] = match[k];
833 if (match[k]&1) {
834 matchrepl[offset + k] = repl;
835 if (repl && (k >= replindex) && (k <= replindex + replcut)) {
836 matchindex[offset + replindex] = offset + k;
837 }
838 }
839 }
840 }
841
842 }
843
844 /* KBH: we need this to make sure we keep looking in a word */
845 /* for patterns even if the current character is not known in state 0 */
846 /* since patterns for hyphenation may occur anywhere in the word */
847 try_next_letter: ;
848
849 }
850 #ifdef VERBOSE
851 for (i = 0; i < j; i++)
852 putchar (hyphens[i]);
853 putchar ('\n');
854 #endif
855
856 for (i = 0; i < j - 3; i++)
857 #if 0
858 if (hyphens[i + 1] & 1)
859 hyphens[i] = '-';
860 #else
861 hyphens[i] = hyphens[i + 1];
862 #endif
863 for (; i < word_size; i++)
864 hyphens[i] = '0';
865 hyphens[word_size] = '\0';
866
867 /* now create a new char string showing hyphenation positions */
868 /* count the hyphens and allocate space for the new hyphenated string */
869 nHyphCount = 0;
870 for (i = 0; i < word_size; i++)
871 if (hyphens[i]&1)
872 nHyphCount++;
873 j = 0;
874 for (i = 0; i < word_size; i++) {
875 if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
876 if (rep && pos && cut) {
877 if (!*rep && !*pos && !*cut) {
878 int k;
879 *rep = (char **) malloc(sizeof(char *) * word_size);
880 *pos = (int *) malloc(sizeof(int) * word_size);
881 *cut = (int *) malloc(sizeof(int) * word_size);
882 for (k = 0; k < word_size; k++) {
883 (*rep)[k] = NULL;
884 (*pos)[k] = 0;
885 (*cut)[k] = 0;
886 }
887 }
888 (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[ i]]);
889 (*pos)[matchindex[i] - 1] = matchindex[i] - i;
890 (*cut)[matchindex[i] - 1] = matchlen[i];
891 }
892 j += strlen(matchrepl[matchindex[i]]);
893 i += matchlen[i] - 1;
894 }
895 }
896
897 if (matchrepl != matchrepl_buf) {
898 hnj_free (matchrepl);
899 hnj_free (matchlen);
900 hnj_free (matchindex);
901 }
902
903 // recursive hyphenation of the first (compound) level segments
904 if (dict->nextlevel) {
905 char * rep2_buf[MAX_WORD];
906 int pos2_buf[MAX_WORD];
907 int cut2_buf[MAX_WORD];
908 char hyphens2_buf[MAX_WORD];
909 char ** rep2;
910 int * pos2;
911 int * cut2;
912 char * hyphens2;
913 int begin = 0;
914 if (word_size < MAX_CHARS) {
915 rep2 = rep2_buf;
916 pos2 = pos2_buf;
917 cut2 = cut2_buf;
918 hyphens2 = hyphens2_buf;
919 } else {
920 rep2 = hnj_malloc (word_size * sizeof(char *));
921 pos2 = hnj_malloc (word_size * sizeof(int));
922 cut2 = hnj_malloc (word_size * sizeof(int));
923 hyphens2 = hnj_malloc (word_size);
924 }
925 for (i = 0; i < word_size; i++) rep2[i] = NULL;
926 for (i = 0; i < word_size; i++) if
927 (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
928 if (i - begin > 1) {
929 int hyph = 0;
930 prep_word[i + 2] = '\0';
931 /* non-standard hyphenation at compound boundary (Schiffahrt) */
932 if (*rep && *pos && *cut && (*rep)[i]) {
933 char * l = strchr((*rep)[i], '=');
934 strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]);
935 if (l) {
936 hyph = (l - (*rep)[i]) - (*pos)[i];
937 prep_word[2 + i + hyph] = '\0';
938 }
939 }
940 hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
941 hyphens2, &rep2, &pos2, &cut2, clhmin,
942 crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
943 for (j = 0; j < i - begin - 1; j++) {
944 hyphens[begin + j] = hyphens2[j];
945 if (rep2[j] && rep && pos && cut) {
946 if (!*rep && !*pos && !*cut) {
947 int k;
948 *rep = (char **) malloc(sizeof(char *) * word_size);
949 *pos = (int *) malloc(sizeof(int) * word_size);
950 *cut = (int *) malloc(sizeof(int) * word_size);
951 for (k = 0; k < word_size; k++) {
952 (*rep)[k] = NULL;
953 (*pos)[k] = 0;
954 (*cut)[k] = 0;
955 }
956 }
957 (*rep)[begin + j] = rep2[j];
958 (*pos)[begin + j] = pos2[j];
959 (*cut)[begin + j] = cut2[j];
960 }
961 }
962 prep_word[i + 2] = word[i + 1];
963 if (*rep && *pos && *cut && (*rep)[i]) {
964 strcpy(prep_word + 1, word);
965 }
966 }
967 begin = i + 1;
968 for (j = 0; j < word_size; j++) rep2[j] = NULL;
969 }
970
971 // non-compound
972 if (begin == 0) {
973 hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
974 hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
975 if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
976 rep, pos, cut, clhmin);
977 if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
978 rep, pos, cut, crhmin);
979 }
980
981 if (rep2 != rep2_buf) {
982 free(rep2);
983 free(cut2);
984 free(pos2);
985 free(hyphens2);
986 }
987 }
988
989 if (prep_word != prep_word_buf) hnj_free (prep_word);
990 return 0;
991 }
992
993 /* UTF-8 normalization of hyphen and non-standard positions */
994 int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
995 char *** rep, int ** pos, int ** cut)
996 {
997 int i, j, k;
998 if ((((unsigned char) word[0]) >> 6) == 2) {
999 fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
1000 return 1;
1001 }
1002
1003 /* calculate UTF-8 character positions */
1004 for (i = 0, j = -1; i < word_size; i++) {
1005 /* beginning of an UTF-8 character (not '10' start bits) */
1006 if ((((unsigned char) word[i]) >> 6) != 2) j++;
1007 hyphens[j] = hyphens[i];
1008 if (rep && pos && cut && *rep && *pos && *cut) {
1009 int l = (*pos)[i];
1010 (*pos)[j] = 0;
1011 for (k = 0; k < l; k++) {
1012 if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
1013 }
1014 k = i - l + 1;
1015 l = k + (*cut)[i];
1016 (*cut)[j] = 0;
1017 for (; k < l; k++) {
1018 if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
1019 }
1020 (*rep)[j] = (*rep)[i];
1021 if (j < i) {
1022 (*rep)[i] = NULL;
1023 (*pos)[i] = 0;
1024 (*cut)[i] = 0;
1025 }
1026 }
1027 }
1028 hyphens[j + 1] = '\0';
1029 return 0;
1030 }
1031
1032 /* get the word with all possible hyphenations (output: hyphword) */
1033 void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
1034 char * hyphword, char *** rep, int ** pos, int ** cut)
1035 {
1036 int i, j;
1037 for (i = 0, j = 0; i < l; i++, j++) {
1038 if (hyphens[i]&1) {
1039 hyphword[j] = word[i];
1040 if (*rep && *pos && *cut && (*rep)[i]) {
1041 strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]);
1042 j += strlen((*rep)[i]) - (*pos)[i];
1043 i += (*cut)[i] - (*pos)[i];
1044 } else hyphword[++j] = '=';
1045 } else hyphword[j] = word[i];
1046 }
1047 hyphword[j] = '\0';
1048 }
1049
1050
1051 /* main api function with default hyphenmin parameters */
1052 int hnj_hyphen_hyphenate2 (HyphenDict *dict,
1053 const char *word, int word_size, char * hyphens,
1054 char *hyphword, char *** rep, int ** pos, int ** cut)
1055 {
1056 hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1057 dict->clhmin, dict->crhmin, 1, 1);
1058 hnj_hyphen_lhmin(dict->utf8, word, word_size,
1059 hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
1060 hnj_hyphen_rhmin(dict->utf8, word, word_size,
1061 hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
1062 if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos , cut);
1063 if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut );
1064 return 0;
1065 }
1066
1067 /* previous main api function with hyphenmin parameters */
1068 int hnj_hyphen_hyphenate3 (HyphenDict *dict,
1069 const char *word, int word_size, char * hyphens,
1070 char *hyphword, char *** rep, int ** pos, int ** cut,
1071 int lhmin, int rhmin, int clhmin, int crhmin)
1072 {
1073 lhmin = (lhmin > 0 ? lhmin : dict->lhmin);
1074 rhmin = (rhmin > 0 ? rhmin : dict->rhmin);
1075 hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1076 clhmin, crhmin, 1, 1);
1077 hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1078 rep, pos, cut, (lhmin > 0 ? lhmin : 2));
1079 hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1080 rep, pos, cut, (rhmin > 0 ? rhmin : 2));
1081 if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos , cut);
1082 if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut );
1083 return 0;
1084 }
OLDNEW
« no previous file with comments | « third_party/hyphen/hyphen.h ('k') | third_party/hyphen/hyphen.gyp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698