Index: third_party/hyphen/hyphen.c |
diff --git a/third_party/hyphen/hyphen.c b/third_party/hyphen/hyphen.c |
deleted file mode 100644 |
index ebae5107ed59425c56bb59e6d6955762a9ad353f..0000000000000000000000000000000000000000 |
--- a/third_party/hyphen/hyphen.c |
+++ /dev/null |
@@ -1,1084 +0,0 @@ |
-/* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both |
- * licenses follows. |
- */ |
- |
-/* LibHnj - a library for high quality hyphenation and justification |
- * Copyright (C) 1998 Raph Levien, |
- * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), |
- * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) |
- * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo) |
- * |
- * This library is free software; you can redistribute it and/or |
- * modify it under the terms of the GNU Library General Public |
- * License as published by the Free Software Foundation; either |
- * version 2 of the License, or (at your option) any later version. |
- * |
- * This library is distributed in the hope that it will be useful, |
- * but WITHOUT ANY WARRANTY; without even the implied warranty of |
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
- * Library General Public License for more details. |
- * |
- * You should have received a copy of the GNU Library General Public |
- * License along with this library; if not, write to the |
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
- * Boston, MA 02111-1307 USA. |
-*/ |
- |
-/* |
- * The contents of this file are subject to the Mozilla Public License |
- * Version 1.0 (the "MPL"); you may not use this file except in |
- * compliance with the MPL. You may obtain a copy of the MPL at |
- * http://www.mozilla.org/MPL/ |
- * |
- * Software distributed under the MPL is distributed on an "AS IS" basis, |
- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL |
- * for the specific language governing rights and limitations under the |
- * MPL. |
- * |
- */ |
-#include <stdlib.h> /* for NULL, malloc */ |
-#include <stdio.h> /* for fprintf */ |
-#include <string.h> /* for strdup */ |
- |
-#ifdef UNX |
-#include <unistd.h> /* for exit */ |
-#endif |
- |
-#define noVERBOSE |
- |
-/* calculate hyphenmin values with long ligature length (2 or 3 characters |
- * instead of 1 or 2) for comparison with hyphenation without ligatures */ |
-#define noLONG_LIGATURE |
- |
-#ifdef LONG_LIGATURE |
-#define LIG_xx 1 |
-#define LIG_xxx 2 |
-#else |
-#define LIG_xx 0 |
-#define LIG_xxx 1 |
-#endif |
- |
-#include "hnjalloc.h" |
-#include "hyphen.h" |
- |
-static char * |
-hnj_strdup (const char *s) |
-{ |
- char *new; |
- int l; |
- |
- l = strlen (s); |
- new = hnj_malloc (l + 1); |
- memcpy (new, s, l); |
- new[l] = 0; |
- return new; |
-} |
- |
-/* remove cross-platform text line end characters */ |
-void hnj_strchomp(char * s) |
-{ |
- int k = strlen(s); |
- if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0'; |
- if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; |
-} |
- |
-/* a little bit of a hash table implementation. This simply maps strings |
- to state numbers */ |
- |
-typedef struct _HashTab HashTab; |
-typedef struct _HashEntry HashEntry; |
- |
-/* A cheap, but effective, hack. */ |
-#define HASH_SIZE 31627 |
- |
-struct _HashTab { |
- HashEntry *entries[HASH_SIZE]; |
-}; |
- |
-struct _HashEntry { |
- HashEntry *next; |
- char *key; |
- int val; |
-}; |
- |
-/* a char* hash function from ASU - adapted from Gtk+ */ |
-static unsigned int |
-hnj_string_hash (const char *s) |
-{ |
- const char *p; |
- unsigned int h=0, g; |
- for(p = s; *p != '\0'; p += 1) { |
- h = ( h << 4 ) + *p; |
- if ( ( g = h & 0xf0000000 ) ) { |
- h = h ^ (g >> 24); |
- h = h ^ g; |
- } |
- } |
- return h /* % M */; |
-} |
- |
-static HashTab * |
-hnj_hash_new (void) |
-{ |
- HashTab *hashtab; |
- int i; |
- |
- hashtab = hnj_malloc (sizeof(HashTab)); |
- for (i = 0; i < HASH_SIZE; i++) |
- hashtab->entries[i] = NULL; |
- |
- return hashtab; |
-} |
- |
-static void |
-hnj_hash_free (HashTab *hashtab) |
-{ |
- int i; |
- HashEntry *e, *next; |
- |
- for (i = 0; i < HASH_SIZE; i++) |
- for (e = hashtab->entries[i]; e; e = next) |
- { |
- next = e->next; |
- hnj_free (e->key); |
- hnj_free (e); |
- } |
- |
- hnj_free (hashtab); |
-} |
- |
-/* assumes that key is not already present! */ |
-static void |
-hnj_hash_insert (HashTab *hashtab, const char *key, int val) |
-{ |
- int i; |
- HashEntry *e; |
- |
- i = hnj_string_hash (key) % HASH_SIZE; |
- e = hnj_malloc (sizeof(HashEntry)); |
- e->next = hashtab->entries[i]; |
- e->key = hnj_strdup (key); |
- e->val = val; |
- hashtab->entries[i] = e; |
-} |
- |
-/* return val if found, otherwise -1 */ |
-static int |
-hnj_hash_lookup (HashTab *hashtab, const char *key) |
-{ |
- int i; |
- HashEntry *e; |
- i = hnj_string_hash (key) % HASH_SIZE; |
- for (e = hashtab->entries[i]; e; e = e->next) |
- if (!strcmp (key, e->key)) |
- return e->val; |
- return -1; |
-} |
- |
-/* Get the state number, allocating a new state if necessary. */ |
-static int |
-hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string) |
-{ |
- int state_num; |
- |
- state_num = hnj_hash_lookup (hashtab, string); |
- |
- if (state_num >= 0) |
- return state_num; |
- |
- hnj_hash_insert (hashtab, string, dict->num_states); |
- /* predicate is true if dict->num_states is a power of two */ |
- if (!(dict->num_states & (dict->num_states - 1))) |
- { |
- dict->states = hnj_realloc (dict->states, |
- (dict->num_states << 1) * |
- sizeof(HyphenState)); |
- } |
- dict->states[dict->num_states].match = NULL; |
- dict->states[dict->num_states].repl = NULL; |
- dict->states[dict->num_states].fallback_state = -1; |
- dict->states[dict->num_states].num_trans = 0; |
- dict->states[dict->num_states].trans = NULL; |
- return dict->num_states++; |
-} |
- |
-/* add a transition from state1 to state2 through ch - assumes that the |
- transition does not already exist */ |
-static void |
-hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch) |
-{ |
- int num_trans; |
- |
- num_trans = dict->states[state1].num_trans; |
- if (num_trans == 0) |
- { |
- dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans)); |
- } |
- else if (!(num_trans & (num_trans - 1))) |
- { |
- dict->states[state1].trans = hnj_realloc (dict->states[state1].trans, |
- (num_trans << 1) * |
- sizeof(HyphenTrans)); |
- } |
- dict->states[state1].trans[num_trans].ch = ch; |
- dict->states[state1].trans[num_trans].new_state = state2; |
- dict->states[state1].num_trans++; |
-} |
- |
-#ifdef VERBOSE |
-HashTab *global; |
- |
-static char * |
-get_state_str (int state) |
-{ |
- int i; |
- HashEntry *e; |
- |
- for (i = 0; i < HASH_SIZE; i++) |
- for (e = global->entries[i]; e; e = e->next) |
- if (e->val == state) |
- return e->key; |
- return NULL; |
-} |
-#endif |
- |
-HyphenDict * |
-hnj_hyphen_load (const char *fn) |
-{ |
- HyphenDict *result; |
- FILE *f; |
- f = fopen (fn, "r"); |
- if (f == NULL) |
- return NULL; |
- |
- result = hnj_hyphen_load_file(f); |
- |
- fclose(f); |
- return result; |
-} |
- |
-HyphenDict * |
-hnj_hyphen_load_file (FILE *f) |
-{ |
- HyphenDict *dict[2]; |
- HashTab *hashtab; |
- char buf[MAX_CHARS]; |
- char word[MAX_CHARS]; |
- char pattern[MAX_CHARS]; |
- char * repl; |
- signed char replindex; |
- signed char replcut; |
- int state_num = 0, last_state; |
- int i, j, k; |
- char ch; |
- int found; |
- HashEntry *e; |
- int nextlevel = 0; |
- |
-// loading one or two dictionaries (separated by NEXTLEVEL keyword) |
-for (k = 0; k == 0 || (k == 1 && nextlevel); k++) { |
- hashtab = hnj_hash_new (); |
-#ifdef VERBOSE |
- global = hashtab; |
-#endif |
- hnj_hash_insert (hashtab, "", 0); |
- dict[k] = hnj_malloc (sizeof(HyphenDict)); |
- dict[k]->num_states = 1; |
- dict[k]->states = hnj_malloc (sizeof(HyphenState)); |
- dict[k]->states[0].match = NULL; |
- dict[k]->states[0].repl = NULL; |
- dict[k]->states[0].fallback_state = -1; |
- dict[k]->states[0].num_trans = 0; |
- dict[k]->states[0].trans = NULL; |
- dict[k]->nextlevel = NULL; |
- dict[k]->lhmin = 0; |
- dict[k]->rhmin = 0; |
- dict[k]->clhmin = 0; |
- dict[k]->crhmin = 0; |
- |
- /* read in character set info */ |
- if (k == 0) { |
- for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0; |
- if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) { |
- for (i=0;i<MAX_NAME;i++) |
- if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n')) |
- dict[k]->cset[i] = 0; |
- } else { |
- dict[k]->cset[0] = 0; |
- } |
- dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0); |
- } else { |
- strcpy(dict[k]->cset, dict[0]->cset); |
- dict[k]->utf8 = dict[0]->utf8; |
- } |
- |
- while (fgets (buf, sizeof(buf), f) != NULL) |
- { |
- if (buf[0] != '%') |
- { |
- if (strncmp(buf, "NEXTLEVEL", 9) == 0) { |
- nextlevel = 1; |
- break; |
- } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) { |
- dict[k]->lhmin = atoi(buf + 13); |
- continue; |
- } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) { |
- dict[k]->rhmin = atoi(buf + 14); |
- continue; |
- } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) { |
- dict[k]->clhmin = atoi(buf + 21); |
- continue; |
- } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) { |
- dict[k]->crhmin = atoi(buf + 22); |
- continue; |
- } |
- j = 0; |
- pattern[j] = '0'; |
- repl = strchr(buf, '/'); |
- replindex = 0; |
- replcut = 0; |
- if (repl) { |
- char * index = strchr(repl + 1, ','); |
- *repl = '\0'; |
- if (index) { |
- char * index2 = strchr(index + 1, ','); |
- *index = '\0'; |
- if (index2) { |
- *index2 = '\0'; |
- replindex = (signed char) atoi(index + 1) - 1; |
- replcut = (signed char) atoi(index2 + 1); |
- } |
- } else { |
- hnj_strchomp(repl + 1); |
- replindex = 0; |
- replcut = (signed char) strlen(buf); |
- } |
- repl = hnj_strdup(repl + 1); |
- } |
- for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++) |
- { |
- if (buf[i] >= '0' && buf[i] <= '9') |
- pattern[j] = buf[i]; |
- else |
- { |
- word[j] = buf[i]; |
- pattern[++j] = '0'; |
- } |
- } |
- word[j] = '\0'; |
- pattern[j + 1] = '\0'; |
- |
- i = 0; |
- if (!repl) { |
- /* Optimize away leading zeroes */ |
- for (; pattern[i] == '0'; i++); |
- } else { |
- if (*word == '.') i++; |
- /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */ |
- if (dict[k]->utf8) { |
- int pu = -1; /* unicode character position */ |
- int ps = -1; /* unicode start position (original replindex) */ |
- int pc = (*word == '.') ? 1: 0; /* 8-bit character position */ |
- for (; pc < (strlen(word) + 1); pc++) { |
- /* beginning of an UTF-8 character (not '10' start bits) */ |
- if ((((unsigned char) word[pc]) >> 6) != 2) pu++; |
- if ((ps < 0) && (replindex == pu)) { |
- ps = replindex; |
- replindex = (signed char) pc; |
- } |
- if ((ps >= 0) && ((pu - ps) == replcut)) { |
- replcut = (signed char) (pc - replindex); |
- break; |
- } |
- } |
- if (*word == '.') replindex--; |
- } |
- } |
- |
-#ifdef VERBOSE |
- printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl); |
-#endif |
- found = hnj_hash_lookup (hashtab, word); |
- state_num = hnj_get_state (dict[k], hashtab, word); |
- dict[k]->states[state_num].match = hnj_strdup (pattern + i); |
- dict[k]->states[state_num].repl = repl; |
- dict[k]->states[state_num].replindex = replindex; |
- if (!replcut) { |
- dict[k]->states[state_num].replcut = (signed char) strlen(word); |
- } else { |
- dict[k]->states[state_num].replcut = replcut; |
- } |
- |
- /* now, put in the prefix transitions */ |
- for (; found < 0 ;j--) |
- { |
- last_state = state_num; |
- ch = word[j - 1]; |
- word[j - 1] = '\0'; |
- found = hnj_hash_lookup (hashtab, word); |
- state_num = hnj_get_state (dict[k], hashtab, word); |
- hnj_add_trans (dict[k], state_num, last_state, ch); |
- } |
- } |
- } |
- |
- /* Could do unioning of matches here (instead of the preprocessor script). |
- If we did, the pseudocode would look something like this: |
- |
- foreach state in the hash table |
- foreach i = [1..length(state) - 1] |
- state to check is substr (state, i) |
- look it up |
- if found, and if there is a match, union the match in. |
- |
- It's also possible to avoid the quadratic blowup by doing the |
- search in order of increasing state string sizes - then you |
- can break the loop after finding the first match. |
- |
- This step should be optional in any case - if there is a |
- preprocessed rule table, it's always faster to use that. |
- |
-*/ |
- |
- /* put in the fallback states */ |
- for (i = 0; i < HASH_SIZE; i++) |
- for (e = hashtab->entries[i]; e; e = e->next) |
- { |
- if (*(e->key)) for (j = 1; 1; j++) |
- { |
- state_num = hnj_hash_lookup (hashtab, e->key + j); |
- if (state_num >= 0) |
- break; |
- } |
- /* KBH: FIXME state 0 fallback_state should always be -1? */ |
- if (e->val) |
- dict[k]->states[e->val].fallback_state = state_num; |
- } |
-#ifdef VERBOSE |
- for (i = 0; i < HASH_SIZE; i++) |
- for (e = hashtab->entries[i]; e; e = e->next) |
- { |
- printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val, |
- dict[k]->states[e->val].fallback_state); |
- for (j = 0; j < dict[k]->states[e->val].num_trans; j++) |
- printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch, |
- dict[k]->states[e->val].trans[j].new_state); |
- } |
-#endif |
- |
-#ifndef VERBOSE |
- hnj_hash_free (hashtab); |
-#endif |
- state_num = 0; |
-} |
- if (k == 2) dict[0]->nextlevel = dict[1]; |
- return dict[0]; |
-} |
- |
-void hnj_hyphen_free (HyphenDict *dict) |
-{ |
- int state_num; |
- HyphenState *hstate; |
- |
- for (state_num = 0; state_num < dict->num_states; state_num++) |
- { |
- hstate = &dict->states[state_num]; |
- if (hstate->match) |
- hnj_free (hstate->match); |
- if (hstate->repl) |
- hnj_free (hstate->repl); |
- if (hstate->trans) |
- hnj_free (hstate->trans); |
- } |
- if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel); |
- |
- hnj_free (dict->states); |
- |
- hnj_free (dict); |
-} |
- |
-#define MAX_WORD 256 |
- |
-int hnj_hyphen_hyphenate (HyphenDict *dict, |
- const char *word, int word_size, |
- char *hyphens) |
-{ |
- char prep_word_buf[MAX_WORD]; |
- char *prep_word; |
- int i, j, k; |
- int state; |
- char ch; |
- HyphenState *hstate; |
- char *match; |
- int offset; |
- |
- if (word_size + 3 < MAX_WORD) |
- prep_word = prep_word_buf; |
- else |
- prep_word = hnj_malloc (word_size + 3); |
- |
- j = 0; |
- prep_word[j++] = '.'; |
- |
- for (i = 0; i < word_size; i++) |
- prep_word[j++] = word[i]; |
- |
- prep_word[j++] = '.'; |
- prep_word[j] = '\0'; |
- |
- for (i = 0; i < word_size + 5; i++) |
- hyphens[i] = '0'; |
- |
-#ifdef VERBOSE |
- printf ("prep_word = %s\n", prep_word); |
-#endif |
- |
- /* now, run the finite state machine */ |
- state = 0; |
- for (i = 0; i < j; i++) |
- { |
- ch = prep_word[i]; |
- for (;;) |
- { |
- |
- if (state == -1) { |
- /* return 1; */ |
- /* KBH: FIXME shouldn't this be as follows? */ |
- state = 0; |
- goto try_next_letter; |
- } |
- |
-#ifdef VERBOSE |
- char *state_str; |
- state_str = get_state_str (state); |
- |
- for (k = 0; k < i - strlen (state_str); k++) |
- putchar (' '); |
- printf ("%s", state_str); |
-#endif |
- |
- hstate = &dict->states[state]; |
- for (k = 0; k < hstate->num_trans; k++) |
- if (hstate->trans[k].ch == ch) |
- { |
- state = hstate->trans[k].new_state; |
- goto found_state; |
- } |
- state = hstate->fallback_state; |
-#ifdef VERBOSE |
- printf (" falling back, fallback_state %d\n", state); |
-#endif |
- } |
- found_state: |
-#ifdef VERBOSE |
- printf ("found state %d\n",state); |
-#endif |
- /* Additional optimization is possible here - especially, |
- elimination of trailing zeroes from the match. Leading zeroes |
- have already been optimized. */ |
- match = dict->states[state].match; |
- /* replacing rules not handled by hyphen_hyphenate() */ |
- if (match && !dict->states[state].repl) |
- { |
- offset = i + 1 - strlen (match); |
-#ifdef VERBOSE |
- for (k = 0; k < offset; k++) |
- putchar (' '); |
- printf ("%s\n", match); |
-#endif |
- /* This is a linear search because I tried a binary search and |
- found it to be just a teeny bit slower. */ |
- for (k = 0; match[k]; k++) |
- if (hyphens[offset + k] < match[k]) |
- hyphens[offset + k] = match[k]; |
- } |
- |
- /* KBH: we need this to make sure we keep looking in a word */ |
- /* for patterns even if the current character is not known in state 0 */ |
- /* since patterns for hyphenation may occur anywhere in the word */ |
- try_next_letter: ; |
- |
- } |
-#ifdef VERBOSE |
- for (i = 0; i < j; i++) |
- putchar (hyphens[i]); |
- putchar ('\n'); |
-#endif |
- |
- for (i = 0; i < j - 4; i++) |
-#if 0 |
- if (hyphens[i + 1] & 1) |
- hyphens[i] = '-'; |
-#else |
- hyphens[i] = hyphens[i + 1]; |
-#endif |
- hyphens[0] = '0'; |
- for (; i < word_size; i++) |
- hyphens[i] = '0'; |
- hyphens[word_size] = '\0'; |
- |
- if (prep_word != prep_word_buf) |
- hnj_free (prep_word); |
- |
- return 0; |
-} |
- |
-/* Unicode ligature length */ |
-int hnj_ligature(unsigned char c) { |
- switch (c) { |
- case 0x80: /* ff */ |
- case 0x81: /* fi */ |
- case 0x82: return LIG_xx; /* fl */ |
- case 0x83: /* ffi */ |
- case 0x84: return LIG_xxx; /* ffl */ |
- case 0x85: /* long st */ |
- case 0x86: return LIG_xx; /* st */ |
- } |
- return 0; |
-} |
- |
-/* character length of the first n byte of the input word */ |
-int hnj_hyphen_strnlen(const char * word, int n, int utf8) |
-{ |
- int i = 0; |
- int j = 0; |
- while (j < n && word[j] != '\0') { |
- i++; |
- // Unicode ligature support |
- if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) { |
- i += hnj_ligature(word[j + 2]); |
- } |
- for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++); |
- } |
- return i; |
-} |
- |
-int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens, |
- char *** rep, int ** pos, int ** cut, int lhmin) |
-{ |
- int i = 1, j; |
- |
- // Unicode ligature support |
- if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) { |
- i += hnj_ligature(word[2]); |
- } |
- |
- for (j = 0; i < lhmin && word[j] != '\0'; i++) do { |
- // check length of the non-standard part |
- if (*rep && *pos && *cut && (*rep)[j]) { |
- char * rh = strchr((*rep)[j], '='); |
- if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) + |
- hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) { |
- free((*rep)[j]); |
- (*rep)[j] = NULL; |
- hyphens[j] = '0'; |
- } |
- } else { |
- hyphens[j] = '0'; |
- } |
- j++; |
- |
- // Unicode ligature support |
- if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) { |
- i += hnj_ligature(word[j + 2]); |
- } |
- } while (utf8 && (word[j] & 0xc0) == 0x80); |
- return 0; |
-} |
- |
-int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens, |
- char *** rep, int ** pos, int ** cut, int rhmin) |
-{ |
- int i; |
- int j = word_size - 2; |
- for (i = 1; i < rhmin && j > 0; j--) { |
- // check length of the non-standard part |
- if (*rep && *pos && *cut && (*rep)[j]) { |
- char * rh = strchr((*rep)[j], '='); |
- if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) + |
- hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) { |
- free((*rep)[j]); |
- (*rep)[j] = NULL; |
- hyphens[j] = '0'; |
- } |
- } else { |
- hyphens[j] = '0'; |
- } |
- if (!utf8 || (word[j] & 0xc0) != 0xc0) i++; |
- } |
- return 0; |
-} |
- |
-// recursive function for compound level hyphenation |
-int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, |
- char * hyphens, char *** rep, int ** pos, int ** cut, |
- int clhmin, int crhmin, int lend, int rend) |
-{ |
- char prep_word_buf[MAX_WORD]; |
- char *prep_word; |
- int i, j, k; |
- int state; |
- char ch; |
- HyphenState *hstate; |
- char *match; |
- char *repl; |
- signed char replindex; |
- signed char replcut; |
- int offset; |
- int matchlen_buf[MAX_CHARS]; |
- int matchindex_buf[MAX_CHARS]; |
- char * matchrepl_buf[MAX_CHARS]; |
- int * matchlen; |
- int * matchindex; |
- char ** matchrepl; |
- int isrepl = 0; |
- int nHyphCount; |
- |
- if (word_size + 3 < MAX_CHARS) { |
- prep_word = prep_word_buf; |
- matchlen = matchlen_buf; |
- matchindex = matchindex_buf; |
- matchrepl = matchrepl_buf; |
- } else { |
- prep_word = hnj_malloc (word_size + 3); |
- matchlen = hnj_malloc ((word_size + 3) * sizeof(int)); |
- matchindex = hnj_malloc ((word_size + 3) * sizeof(int)); |
- matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *)); |
- } |
- |
- j = 0; |
- prep_word[j++] = '.'; |
- |
- for (i = 0; i < word_size; i++) |
- prep_word[j++] = word[i]; |
- |
- prep_word[j++] = '.'; |
- prep_word[j] = '\0'; |
- |
- for (i = 0; i < j; i++) |
- hyphens[i] = '0'; |
- |
-#ifdef VERBOSE |
- printf ("prep_word = %s\n", prep_word); |
-#endif |
- |
- /* now, run the finite state machine */ |
- state = 0; |
- for (i = 0; i < j; i++) |
- { |
- ch = prep_word[i]; |
- for (;;) |
- { |
- |
- if (state == -1) { |
- /* return 1; */ |
- /* KBH: FIXME shouldn't this be as follows? */ |
- state = 0; |
- goto try_next_letter; |
- } |
- |
-#ifdef VERBOSE |
- char *state_str; |
- state_str = get_state_str (state); |
- |
- for (k = 0; k < i - strlen (state_str); k++) |
- putchar (' '); |
- printf ("%s", state_str); |
-#endif |
- |
- hstate = &dict->states[state]; |
- for (k = 0; k < hstate->num_trans; k++) |
- if (hstate->trans[k].ch == ch) |
- { |
- state = hstate->trans[k].new_state; |
- goto found_state; |
- } |
- state = hstate->fallback_state; |
-#ifdef VERBOSE |
- printf (" falling back, fallback_state %d\n", state); |
-#endif |
- } |
- found_state: |
-#ifdef VERBOSE |
- printf ("found state %d\n",state); |
-#endif |
- /* Additional optimization is possible here - especially, |
- elimination of trailing zeroes from the match. Leading zeroes |
- have already been optimized. */ |
- match = dict->states[state].match; |
- repl = dict->states[state].repl; |
- replindex = dict->states[state].replindex; |
- replcut = dict->states[state].replcut; |
- /* replacing rules not handled by hyphen_hyphenate() */ |
- if (match) |
- { |
- offset = i + 1 - strlen (match); |
-#ifdef VERBOSE |
- for (k = 0; k < offset; k++) |
- putchar (' '); |
- printf ("%s (%s)\n", match, repl); |
-#endif |
- if (repl) { |
- if (!isrepl) for(; isrepl < word_size; isrepl++) { |
- matchrepl[isrepl] = NULL; |
- matchindex[isrepl] = -1; |
- } |
- matchlen[offset + replindex] = replcut; |
- } |
- /* This is a linear search because I tried a binary search and |
- found it to be just a teeny bit slower. */ |
- for (k = 0; match[k]; k++) { |
- if ((hyphens[offset + k] < match[k])) { |
- hyphens[offset + k] = match[k]; |
- if (match[k]&1) { |
- matchrepl[offset + k] = repl; |
- if (repl && (k >= replindex) && (k <= replindex + replcut)) { |
- matchindex[offset + replindex] = offset + k; |
- } |
- } |
- } |
- } |
- |
- } |
- |
- /* KBH: we need this to make sure we keep looking in a word */ |
- /* for patterns even if the current character is not known in state 0 */ |
- /* since patterns for hyphenation may occur anywhere in the word */ |
- try_next_letter: ; |
- |
- } |
-#ifdef VERBOSE |
- for (i = 0; i < j; i++) |
- putchar (hyphens[i]); |
- putchar ('\n'); |
-#endif |
- |
- for (i = 0; i < j - 3; i++) |
-#if 0 |
- if (hyphens[i + 1] & 1) |
- hyphens[i] = '-'; |
-#else |
- hyphens[i] = hyphens[i + 1]; |
-#endif |
- for (; i < word_size; i++) |
- hyphens[i] = '0'; |
- hyphens[word_size] = '\0'; |
- |
- /* now create a new char string showing hyphenation positions */ |
- /* count the hyphens and allocate space for the new hyphenated string */ |
- nHyphCount = 0; |
- for (i = 0; i < word_size; i++) |
- if (hyphens[i]&1) |
- nHyphCount++; |
- j = 0; |
- for (i = 0; i < word_size; i++) { |
- if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { |
- if (rep && pos && cut) { |
- if (!*rep && !*pos && !*cut) { |
- int k; |
- *rep = (char **) malloc(sizeof(char *) * word_size); |
- *pos = (int *) malloc(sizeof(int) * word_size); |
- *cut = (int *) malloc(sizeof(int) * word_size); |
- for (k = 0; k < word_size; k++) { |
- (*rep)[k] = NULL; |
- (*pos)[k] = 0; |
- (*cut)[k] = 0; |
- } |
- } |
- (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]); |
- (*pos)[matchindex[i] - 1] = matchindex[i] - i; |
- (*cut)[matchindex[i] - 1] = matchlen[i]; |
- } |
- j += strlen(matchrepl[matchindex[i]]); |
- i += matchlen[i] - 1; |
- } |
- } |
- |
- if (matchrepl != matchrepl_buf) { |
- hnj_free (matchrepl); |
- hnj_free (matchlen); |
- hnj_free (matchindex); |
- } |
- |
- // recursive hyphenation of the first (compound) level segments |
- if (dict->nextlevel) { |
- char * rep2_buf[MAX_WORD]; |
- int pos2_buf[MAX_WORD]; |
- int cut2_buf[MAX_WORD]; |
- char hyphens2_buf[MAX_WORD]; |
- char ** rep2; |
- int * pos2; |
- int * cut2; |
- char * hyphens2; |
- int begin = 0; |
- if (word_size < MAX_CHARS) { |
- rep2 = rep2_buf; |
- pos2 = pos2_buf; |
- cut2 = cut2_buf; |
- hyphens2 = hyphens2_buf; |
- } else { |
- rep2 = hnj_malloc (word_size * sizeof(char *)); |
- pos2 = hnj_malloc (word_size * sizeof(int)); |
- cut2 = hnj_malloc (word_size * sizeof(int)); |
- hyphens2 = hnj_malloc (word_size); |
- } |
- for (i = 0; i < word_size; i++) rep2[i] = NULL; |
- for (i = 0; i < word_size; i++) if |
- (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) { |
- if (i - begin > 1) { |
- int hyph = 0; |
- prep_word[i + 2] = '\0'; |
- /* non-standard hyphenation at compound boundary (Schiffahrt) */ |
- if (*rep && *pos && *cut && (*rep)[i]) { |
- char * l = strchr((*rep)[i], '='); |
- strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]); |
- if (l) { |
- hyph = (l - (*rep)[i]) - (*pos)[i]; |
- prep_word[2 + i + hyph] = '\0'; |
- } |
- } |
- hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph, |
- hyphens2, &rep2, &pos2, &cut2, clhmin, |
- crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend)); |
- for (j = 0; j < i - begin - 1; j++) { |
- hyphens[begin + j] = hyphens2[j]; |
- if (rep2[j] && rep && pos && cut) { |
- if (!*rep && !*pos && !*cut) { |
- int k; |
- *rep = (char **) malloc(sizeof(char *) * word_size); |
- *pos = (int *) malloc(sizeof(int) * word_size); |
- *cut = (int *) malloc(sizeof(int) * word_size); |
- for (k = 0; k < word_size; k++) { |
- (*rep)[k] = NULL; |
- (*pos)[k] = 0; |
- (*cut)[k] = 0; |
- } |
- } |
- (*rep)[begin + j] = rep2[j]; |
- (*pos)[begin + j] = pos2[j]; |
- (*cut)[begin + j] = cut2[j]; |
- } |
- } |
- prep_word[i + 2] = word[i + 1]; |
- if (*rep && *pos && *cut && (*rep)[i]) { |
- strcpy(prep_word + 1, word); |
- } |
- } |
- begin = i + 1; |
- for (j = 0; j < word_size; j++) rep2[j] = NULL; |
- } |
- |
- // non-compound |
- if (begin == 0) { |
- hnj_hyphen_hyph_(dict->nextlevel, word, word_size, |
- hyphens, rep, pos, cut, clhmin, crhmin, lend, rend); |
- if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, |
- rep, pos, cut, clhmin); |
- if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, |
- rep, pos, cut, crhmin); |
- } |
- |
- if (rep2 != rep2_buf) { |
- free(rep2); |
- free(cut2); |
- free(pos2); |
- free(hyphens2); |
- } |
- } |
- |
- if (prep_word != prep_word_buf) hnj_free (prep_word); |
- return 0; |
-} |
- |
-/* UTF-8 normalization of hyphen and non-standard positions */ |
-int hnj_hyphen_norm(const char *word, int word_size, char * hyphens, |
- char *** rep, int ** pos, int ** cut) |
-{ |
- int i, j, k; |
- if ((((unsigned char) word[0]) >> 6) == 2) { |
- fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word); |
- return 1; |
- } |
- |
- /* calculate UTF-8 character positions */ |
- for (i = 0, j = -1; i < word_size; i++) { |
- /* beginning of an UTF-8 character (not '10' start bits) */ |
- if ((((unsigned char) word[i]) >> 6) != 2) j++; |
- hyphens[j] = hyphens[i]; |
- if (rep && pos && cut && *rep && *pos && *cut) { |
- int l = (*pos)[i]; |
- (*pos)[j] = 0; |
- for (k = 0; k < l; k++) { |
- if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++; |
- } |
- k = i - l + 1; |
- l = k + (*cut)[i]; |
- (*cut)[j] = 0; |
- for (; k < l; k++) { |
- if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++; |
- } |
- (*rep)[j] = (*rep)[i]; |
- if (j < i) { |
- (*rep)[i] = NULL; |
- (*pos)[i] = 0; |
- (*cut)[i] = 0; |
- } |
- } |
- } |
- hyphens[j + 1] = '\0'; |
- return 0; |
-} |
- |
-/* get the word with all possible hyphenations (output: hyphword) */ |
-void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens, |
- char * hyphword, char *** rep, int ** pos, int ** cut) |
-{ |
- int i, j; |
- for (i = 0, j = 0; i < l; i++, j++) { |
- if (hyphens[i]&1) { |
- hyphword[j] = word[i]; |
- if (*rep && *pos && *cut && (*rep)[i]) { |
- strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]); |
- j += strlen((*rep)[i]) - (*pos)[i]; |
- i += (*cut)[i] - (*pos)[i]; |
- } else hyphword[++j] = '='; |
- } else hyphword[j] = word[i]; |
- } |
- hyphword[j] = '\0'; |
-} |
- |
- |
-/* main api function with default hyphenmin parameters */ |
-int hnj_hyphen_hyphenate2 (HyphenDict *dict, |
- const char *word, int word_size, char * hyphens, |
- char *hyphword, char *** rep, int ** pos, int ** cut) |
-{ |
- hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, |
- dict->clhmin, dict->crhmin, 1, 1); |
- hnj_hyphen_lhmin(dict->utf8, word, word_size, |
- hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2)); |
- hnj_hyphen_rhmin(dict->utf8, word, word_size, |
- hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2)); |
- if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); |
- if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); |
- return 0; |
-} |
- |
-/* previous main api function with hyphenmin parameters */ |
-int hnj_hyphen_hyphenate3 (HyphenDict *dict, |
- const char *word, int word_size, char * hyphens, |
- char *hyphword, char *** rep, int ** pos, int ** cut, |
- int lhmin, int rhmin, int clhmin, int crhmin) |
-{ |
- lhmin = (lhmin > 0 ? lhmin : dict->lhmin); |
- rhmin = (rhmin > 0 ? rhmin : dict->rhmin); |
- hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, |
- clhmin, crhmin, 1, 1); |
- hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, |
- rep, pos, cut, (lhmin > 0 ? lhmin : 2)); |
- hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, |
- rep, pos, cut, (rhmin > 0 ? rhmin : 2)); |
- if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); |
- if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); |
- return 0; |
-} |