| Index: third_party/hyphen/hyphen.c
|
| diff --git a/third_party/hyphen/hyphen.c b/third_party/hyphen/hyphen.c
|
| deleted file mode 100644
|
| index ebae5107ed59425c56bb59e6d6955762a9ad353f..0000000000000000000000000000000000000000
|
| --- a/third_party/hyphen/hyphen.c
|
| +++ /dev/null
|
| @@ -1,1084 +0,0 @@
|
| -/* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
|
| - * licenses follows.
|
| - */
|
| -
|
| -/* LibHnj - a library for high quality hyphenation and justification
|
| - * Copyright (C) 1998 Raph Levien,
|
| - * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
|
| - * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
|
| - * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
|
| - *
|
| - * This library is free software; you can redistribute it and/or
|
| - * modify it under the terms of the GNU Library General Public
|
| - * License as published by the Free Software Foundation; either
|
| - * version 2 of the License, or (at your option) any later version.
|
| - *
|
| - * This library is distributed in the hope that it will be useful,
|
| - * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| - * Library General Public License for more details.
|
| - *
|
| - * You should have received a copy of the GNU Library General Public
|
| - * License along with this library; if not, write to the
|
| - * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
| - * Boston, MA 02111-1307 USA.
|
| -*/
|
| -
|
| -/*
|
| - * The contents of this file are subject to the Mozilla Public License
|
| - * Version 1.0 (the "MPL"); you may not use this file except in
|
| - * compliance with the MPL. You may obtain a copy of the MPL at
|
| - * http://www.mozilla.org/MPL/
|
| - *
|
| - * Software distributed under the MPL is distributed on an "AS IS" basis,
|
| - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
|
| - * for the specific language governing rights and limitations under the
|
| - * MPL.
|
| - *
|
| - */
|
| -#include <stdlib.h> /* for NULL, malloc */
|
| -#include <stdio.h> /* for fprintf */
|
| -#include <string.h> /* for strdup */
|
| -
|
| -#ifdef UNX
|
| -#include <unistd.h> /* for exit */
|
| -#endif
|
| -
|
| -#define noVERBOSE
|
| -
|
| -/* calculate hyphenmin values with long ligature length (2 or 3 characters
|
| - * instead of 1 or 2) for comparison with hyphenation without ligatures */
|
| -#define noLONG_LIGATURE
|
| -
|
| -#ifdef LONG_LIGATURE
|
| -#define LIG_xx 1
|
| -#define LIG_xxx 2
|
| -#else
|
| -#define LIG_xx 0
|
| -#define LIG_xxx 1
|
| -#endif
|
| -
|
| -#include "hnjalloc.h"
|
| -#include "hyphen.h"
|
| -
|
| -static char *
|
| -hnj_strdup (const char *s)
|
| -{
|
| - char *new;
|
| - int l;
|
| -
|
| - l = strlen (s);
|
| - new = hnj_malloc (l + 1);
|
| - memcpy (new, s, l);
|
| - new[l] = 0;
|
| - return new;
|
| -}
|
| -
|
| -/* remove cross-platform text line end characters */
|
| -void hnj_strchomp(char * s)
|
| -{
|
| - int k = strlen(s);
|
| - if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
|
| - if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
|
| -}
|
| -
|
| -/* a little bit of a hash table implementation. This simply maps strings
|
| - to state numbers */
|
| -
|
| -typedef struct _HashTab HashTab;
|
| -typedef struct _HashEntry HashEntry;
|
| -
|
| -/* A cheap, but effective, hack. */
|
| -#define HASH_SIZE 31627
|
| -
|
| -struct _HashTab {
|
| - HashEntry *entries[HASH_SIZE];
|
| -};
|
| -
|
| -struct _HashEntry {
|
| - HashEntry *next;
|
| - char *key;
|
| - int val;
|
| -};
|
| -
|
| -/* a char* hash function from ASU - adapted from Gtk+ */
|
| -static unsigned int
|
| -hnj_string_hash (const char *s)
|
| -{
|
| - const char *p;
|
| - unsigned int h=0, g;
|
| - for(p = s; *p != '\0'; p += 1) {
|
| - h = ( h << 4 ) + *p;
|
| - if ( ( g = h & 0xf0000000 ) ) {
|
| - h = h ^ (g >> 24);
|
| - h = h ^ g;
|
| - }
|
| - }
|
| - return h /* % M */;
|
| -}
|
| -
|
| -static HashTab *
|
| -hnj_hash_new (void)
|
| -{
|
| - HashTab *hashtab;
|
| - int i;
|
| -
|
| - hashtab = hnj_malloc (sizeof(HashTab));
|
| - for (i = 0; i < HASH_SIZE; i++)
|
| - hashtab->entries[i] = NULL;
|
| -
|
| - return hashtab;
|
| -}
|
| -
|
| -static void
|
| -hnj_hash_free (HashTab *hashtab)
|
| -{
|
| - int i;
|
| - HashEntry *e, *next;
|
| -
|
| - for (i = 0; i < HASH_SIZE; i++)
|
| - for (e = hashtab->entries[i]; e; e = next)
|
| - {
|
| - next = e->next;
|
| - hnj_free (e->key);
|
| - hnj_free (e);
|
| - }
|
| -
|
| - hnj_free (hashtab);
|
| -}
|
| -
|
| -/* assumes that key is not already present! */
|
| -static void
|
| -hnj_hash_insert (HashTab *hashtab, const char *key, int val)
|
| -{
|
| - int i;
|
| - HashEntry *e;
|
| -
|
| - i = hnj_string_hash (key) % HASH_SIZE;
|
| - e = hnj_malloc (sizeof(HashEntry));
|
| - e->next = hashtab->entries[i];
|
| - e->key = hnj_strdup (key);
|
| - e->val = val;
|
| - hashtab->entries[i] = e;
|
| -}
|
| -
|
| -/* return val if found, otherwise -1 */
|
| -static int
|
| -hnj_hash_lookup (HashTab *hashtab, const char *key)
|
| -{
|
| - int i;
|
| - HashEntry *e;
|
| - i = hnj_string_hash (key) % HASH_SIZE;
|
| - for (e = hashtab->entries[i]; e; e = e->next)
|
| - if (!strcmp (key, e->key))
|
| - return e->val;
|
| - return -1;
|
| -}
|
| -
|
| -/* Get the state number, allocating a new state if necessary. */
|
| -static int
|
| -hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
|
| -{
|
| - int state_num;
|
| -
|
| - state_num = hnj_hash_lookup (hashtab, string);
|
| -
|
| - if (state_num >= 0)
|
| - return state_num;
|
| -
|
| - hnj_hash_insert (hashtab, string, dict->num_states);
|
| - /* predicate is true if dict->num_states is a power of two */
|
| - if (!(dict->num_states & (dict->num_states - 1)))
|
| - {
|
| - dict->states = hnj_realloc (dict->states,
|
| - (dict->num_states << 1) *
|
| - sizeof(HyphenState));
|
| - }
|
| - dict->states[dict->num_states].match = NULL;
|
| - dict->states[dict->num_states].repl = NULL;
|
| - dict->states[dict->num_states].fallback_state = -1;
|
| - dict->states[dict->num_states].num_trans = 0;
|
| - dict->states[dict->num_states].trans = NULL;
|
| - return dict->num_states++;
|
| -}
|
| -
|
| -/* add a transition from state1 to state2 through ch - assumes that the
|
| - transition does not already exist */
|
| -static void
|
| -hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
|
| -{
|
| - int num_trans;
|
| -
|
| - num_trans = dict->states[state1].num_trans;
|
| - if (num_trans == 0)
|
| - {
|
| - dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
|
| - }
|
| - else if (!(num_trans & (num_trans - 1)))
|
| - {
|
| - dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
|
| - (num_trans << 1) *
|
| - sizeof(HyphenTrans));
|
| - }
|
| - dict->states[state1].trans[num_trans].ch = ch;
|
| - dict->states[state1].trans[num_trans].new_state = state2;
|
| - dict->states[state1].num_trans++;
|
| -}
|
| -
|
| -#ifdef VERBOSE
|
| -HashTab *global;
|
| -
|
| -static char *
|
| -get_state_str (int state)
|
| -{
|
| - int i;
|
| - HashEntry *e;
|
| -
|
| - for (i = 0; i < HASH_SIZE; i++)
|
| - for (e = global->entries[i]; e; e = e->next)
|
| - if (e->val == state)
|
| - return e->key;
|
| - return NULL;
|
| -}
|
| -#endif
|
| -
|
| -HyphenDict *
|
| -hnj_hyphen_load (const char *fn)
|
| -{
|
| - HyphenDict *result;
|
| - FILE *f;
|
| - f = fopen (fn, "r");
|
| - if (f == NULL)
|
| - return NULL;
|
| -
|
| - result = hnj_hyphen_load_file(f);
|
| -
|
| - fclose(f);
|
| - return result;
|
| -}
|
| -
|
| -HyphenDict *
|
| -hnj_hyphen_load_file (FILE *f)
|
| -{
|
| - HyphenDict *dict[2];
|
| - HashTab *hashtab;
|
| - char buf[MAX_CHARS];
|
| - char word[MAX_CHARS];
|
| - char pattern[MAX_CHARS];
|
| - char * repl;
|
| - signed char replindex;
|
| - signed char replcut;
|
| - int state_num = 0, last_state;
|
| - int i, j, k;
|
| - char ch;
|
| - int found;
|
| - HashEntry *e;
|
| - int nextlevel = 0;
|
| -
|
| -// loading one or two dictionaries (separated by NEXTLEVEL keyword)
|
| -for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
|
| - hashtab = hnj_hash_new ();
|
| -#ifdef VERBOSE
|
| - global = hashtab;
|
| -#endif
|
| - hnj_hash_insert (hashtab, "", 0);
|
| - dict[k] = hnj_malloc (sizeof(HyphenDict));
|
| - dict[k]->num_states = 1;
|
| - dict[k]->states = hnj_malloc (sizeof(HyphenState));
|
| - dict[k]->states[0].match = NULL;
|
| - dict[k]->states[0].repl = NULL;
|
| - dict[k]->states[0].fallback_state = -1;
|
| - dict[k]->states[0].num_trans = 0;
|
| - dict[k]->states[0].trans = NULL;
|
| - dict[k]->nextlevel = NULL;
|
| - dict[k]->lhmin = 0;
|
| - dict[k]->rhmin = 0;
|
| - dict[k]->clhmin = 0;
|
| - dict[k]->crhmin = 0;
|
| -
|
| - /* read in character set info */
|
| - if (k == 0) {
|
| - for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
|
| - if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) {
|
| - for (i=0;i<MAX_NAME;i++)
|
| - if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
|
| - dict[k]->cset[i] = 0;
|
| - } else {
|
| - dict[k]->cset[0] = 0;
|
| - }
|
| - dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
|
| - } else {
|
| - strcpy(dict[k]->cset, dict[0]->cset);
|
| - dict[k]->utf8 = dict[0]->utf8;
|
| - }
|
| -
|
| - while (fgets (buf, sizeof(buf), f) != NULL)
|
| - {
|
| - if (buf[0] != '%')
|
| - {
|
| - if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
|
| - nextlevel = 1;
|
| - break;
|
| - } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
|
| - dict[k]->lhmin = atoi(buf + 13);
|
| - continue;
|
| - } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
|
| - dict[k]->rhmin = atoi(buf + 14);
|
| - continue;
|
| - } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
|
| - dict[k]->clhmin = atoi(buf + 21);
|
| - continue;
|
| - } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
|
| - dict[k]->crhmin = atoi(buf + 22);
|
| - continue;
|
| - }
|
| - j = 0;
|
| - pattern[j] = '0';
|
| - repl = strchr(buf, '/');
|
| - replindex = 0;
|
| - replcut = 0;
|
| - if (repl) {
|
| - char * index = strchr(repl + 1, ',');
|
| - *repl = '\0';
|
| - if (index) {
|
| - char * index2 = strchr(index + 1, ',');
|
| - *index = '\0';
|
| - if (index2) {
|
| - *index2 = '\0';
|
| - replindex = (signed char) atoi(index + 1) - 1;
|
| - replcut = (signed char) atoi(index2 + 1);
|
| - }
|
| - } else {
|
| - hnj_strchomp(repl + 1);
|
| - replindex = 0;
|
| - replcut = (signed char) strlen(buf);
|
| - }
|
| - repl = hnj_strdup(repl + 1);
|
| - }
|
| - for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
|
| - {
|
| - if (buf[i] >= '0' && buf[i] <= '9')
|
| - pattern[j] = buf[i];
|
| - else
|
| - {
|
| - word[j] = buf[i];
|
| - pattern[++j] = '0';
|
| - }
|
| - }
|
| - word[j] = '\0';
|
| - pattern[j + 1] = '\0';
|
| -
|
| - i = 0;
|
| - if (!repl) {
|
| - /* Optimize away leading zeroes */
|
| - for (; pattern[i] == '0'; i++);
|
| - } else {
|
| - if (*word == '.') i++;
|
| - /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
|
| - if (dict[k]->utf8) {
|
| - int pu = -1; /* unicode character position */
|
| - int ps = -1; /* unicode start position (original replindex) */
|
| - int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
|
| - for (; pc < (strlen(word) + 1); pc++) {
|
| - /* beginning of an UTF-8 character (not '10' start bits) */
|
| - if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
|
| - if ((ps < 0) && (replindex == pu)) {
|
| - ps = replindex;
|
| - replindex = (signed char) pc;
|
| - }
|
| - if ((ps >= 0) && ((pu - ps) == replcut)) {
|
| - replcut = (signed char) (pc - replindex);
|
| - break;
|
| - }
|
| - }
|
| - if (*word == '.') replindex--;
|
| - }
|
| - }
|
| -
|
| -#ifdef VERBOSE
|
| - printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl);
|
| -#endif
|
| - found = hnj_hash_lookup (hashtab, word);
|
| - state_num = hnj_get_state (dict[k], hashtab, word);
|
| - dict[k]->states[state_num].match = hnj_strdup (pattern + i);
|
| - dict[k]->states[state_num].repl = repl;
|
| - dict[k]->states[state_num].replindex = replindex;
|
| - if (!replcut) {
|
| - dict[k]->states[state_num].replcut = (signed char) strlen(word);
|
| - } else {
|
| - dict[k]->states[state_num].replcut = replcut;
|
| - }
|
| -
|
| - /* now, put in the prefix transitions */
|
| - for (; found < 0 ;j--)
|
| - {
|
| - last_state = state_num;
|
| - ch = word[j - 1];
|
| - word[j - 1] = '\0';
|
| - found = hnj_hash_lookup (hashtab, word);
|
| - state_num = hnj_get_state (dict[k], hashtab, word);
|
| - hnj_add_trans (dict[k], state_num, last_state, ch);
|
| - }
|
| - }
|
| - }
|
| -
|
| - /* Could do unioning of matches here (instead of the preprocessor script).
|
| - If we did, the pseudocode would look something like this:
|
| -
|
| - foreach state in the hash table
|
| - foreach i = [1..length(state) - 1]
|
| - state to check is substr (state, i)
|
| - look it up
|
| - if found, and if there is a match, union the match in.
|
| -
|
| - It's also possible to avoid the quadratic blowup by doing the
|
| - search in order of increasing state string sizes - then you
|
| - can break the loop after finding the first match.
|
| -
|
| - This step should be optional in any case - if there is a
|
| - preprocessed rule table, it's always faster to use that.
|
| -
|
| -*/
|
| -
|
| - /* put in the fallback states */
|
| - for (i = 0; i < HASH_SIZE; i++)
|
| - for (e = hashtab->entries[i]; e; e = e->next)
|
| - {
|
| - if (*(e->key)) for (j = 1; 1; j++)
|
| - {
|
| - state_num = hnj_hash_lookup (hashtab, e->key + j);
|
| - if (state_num >= 0)
|
| - break;
|
| - }
|
| - /* KBH: FIXME state 0 fallback_state should always be -1? */
|
| - if (e->val)
|
| - dict[k]->states[e->val].fallback_state = state_num;
|
| - }
|
| -#ifdef VERBOSE
|
| - for (i = 0; i < HASH_SIZE; i++)
|
| - for (e = hashtab->entries[i]; e; e = e->next)
|
| - {
|
| - printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
|
| - dict[k]->states[e->val].fallback_state);
|
| - for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
|
| - printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
|
| - dict[k]->states[e->val].trans[j].new_state);
|
| - }
|
| -#endif
|
| -
|
| -#ifndef VERBOSE
|
| - hnj_hash_free (hashtab);
|
| -#endif
|
| - state_num = 0;
|
| -}
|
| - if (k == 2) dict[0]->nextlevel = dict[1];
|
| - return dict[0];
|
| -}
|
| -
|
| -void hnj_hyphen_free (HyphenDict *dict)
|
| -{
|
| - int state_num;
|
| - HyphenState *hstate;
|
| -
|
| - for (state_num = 0; state_num < dict->num_states; state_num++)
|
| - {
|
| - hstate = &dict->states[state_num];
|
| - if (hstate->match)
|
| - hnj_free (hstate->match);
|
| - if (hstate->repl)
|
| - hnj_free (hstate->repl);
|
| - if (hstate->trans)
|
| - hnj_free (hstate->trans);
|
| - }
|
| - if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
|
| -
|
| - hnj_free (dict->states);
|
| -
|
| - hnj_free (dict);
|
| -}
|
| -
|
| -#define MAX_WORD 256
|
| -
|
| -int hnj_hyphen_hyphenate (HyphenDict *dict,
|
| - const char *word, int word_size,
|
| - char *hyphens)
|
| -{
|
| - char prep_word_buf[MAX_WORD];
|
| - char *prep_word;
|
| - int i, j, k;
|
| - int state;
|
| - char ch;
|
| - HyphenState *hstate;
|
| - char *match;
|
| - int offset;
|
| -
|
| - if (word_size + 3 < MAX_WORD)
|
| - prep_word = prep_word_buf;
|
| - else
|
| - prep_word = hnj_malloc (word_size + 3);
|
| -
|
| - j = 0;
|
| - prep_word[j++] = '.';
|
| -
|
| - for (i = 0; i < word_size; i++)
|
| - prep_word[j++] = word[i];
|
| -
|
| - prep_word[j++] = '.';
|
| - prep_word[j] = '\0';
|
| -
|
| - for (i = 0; i < word_size + 5; i++)
|
| - hyphens[i] = '0';
|
| -
|
| -#ifdef VERBOSE
|
| - printf ("prep_word = %s\n", prep_word);
|
| -#endif
|
| -
|
| - /* now, run the finite state machine */
|
| - state = 0;
|
| - for (i = 0; i < j; i++)
|
| - {
|
| - ch = prep_word[i];
|
| - for (;;)
|
| - {
|
| -
|
| - if (state == -1) {
|
| - /* return 1; */
|
| - /* KBH: FIXME shouldn't this be as follows? */
|
| - state = 0;
|
| - goto try_next_letter;
|
| - }
|
| -
|
| -#ifdef VERBOSE
|
| - char *state_str;
|
| - state_str = get_state_str (state);
|
| -
|
| - for (k = 0; k < i - strlen (state_str); k++)
|
| - putchar (' ');
|
| - printf ("%s", state_str);
|
| -#endif
|
| -
|
| - hstate = &dict->states[state];
|
| - for (k = 0; k < hstate->num_trans; k++)
|
| - if (hstate->trans[k].ch == ch)
|
| - {
|
| - state = hstate->trans[k].new_state;
|
| - goto found_state;
|
| - }
|
| - state = hstate->fallback_state;
|
| -#ifdef VERBOSE
|
| - printf (" falling back, fallback_state %d\n", state);
|
| -#endif
|
| - }
|
| - found_state:
|
| -#ifdef VERBOSE
|
| - printf ("found state %d\n",state);
|
| -#endif
|
| - /* Additional optimization is possible here - especially,
|
| - elimination of trailing zeroes from the match. Leading zeroes
|
| - have already been optimized. */
|
| - match = dict->states[state].match;
|
| - /* replacing rules not handled by hyphen_hyphenate() */
|
| - if (match && !dict->states[state].repl)
|
| - {
|
| - offset = i + 1 - strlen (match);
|
| -#ifdef VERBOSE
|
| - for (k = 0; k < offset; k++)
|
| - putchar (' ');
|
| - printf ("%s\n", match);
|
| -#endif
|
| - /* This is a linear search because I tried a binary search and
|
| - found it to be just a teeny bit slower. */
|
| - for (k = 0; match[k]; k++)
|
| - if (hyphens[offset + k] < match[k])
|
| - hyphens[offset + k] = match[k];
|
| - }
|
| -
|
| - /* KBH: we need this to make sure we keep looking in a word */
|
| - /* for patterns even if the current character is not known in state 0 */
|
| - /* since patterns for hyphenation may occur anywhere in the word */
|
| - try_next_letter: ;
|
| -
|
| - }
|
| -#ifdef VERBOSE
|
| - for (i = 0; i < j; i++)
|
| - putchar (hyphens[i]);
|
| - putchar ('\n');
|
| -#endif
|
| -
|
| - for (i = 0; i < j - 4; i++)
|
| -#if 0
|
| - if (hyphens[i + 1] & 1)
|
| - hyphens[i] = '-';
|
| -#else
|
| - hyphens[i] = hyphens[i + 1];
|
| -#endif
|
| - hyphens[0] = '0';
|
| - for (; i < word_size; i++)
|
| - hyphens[i] = '0';
|
| - hyphens[word_size] = '\0';
|
| -
|
| - if (prep_word != prep_word_buf)
|
| - hnj_free (prep_word);
|
| -
|
| - return 0;
|
| -}
|
| -
|
| -/* Unicode ligature length */
|
| -int hnj_ligature(unsigned char c) {
|
| - switch (c) {
|
| - case 0x80: /* ff */
|
| - case 0x81: /* fi */
|
| - case 0x82: return LIG_xx; /* fl */
|
| - case 0x83: /* ffi */
|
| - case 0x84: return LIG_xxx; /* ffl */
|
| - case 0x85: /* long st */
|
| - case 0x86: return LIG_xx; /* st */
|
| - }
|
| - return 0;
|
| -}
|
| -
|
| -/* character length of the first n byte of the input word */
|
| -int hnj_hyphen_strnlen(const char * word, int n, int utf8)
|
| -{
|
| - int i = 0;
|
| - int j = 0;
|
| - while (j < n && word[j] != '\0') {
|
| - i++;
|
| - // Unicode ligature support
|
| - if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
|
| - i += hnj_ligature(word[j + 2]);
|
| - }
|
| - for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
|
| - }
|
| - return i;
|
| -}
|
| -
|
| -int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
|
| - char *** rep, int ** pos, int ** cut, int lhmin)
|
| -{
|
| - int i = 1, j;
|
| -
|
| - // Unicode ligature support
|
| - if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) {
|
| - i += hnj_ligature(word[2]);
|
| - }
|
| -
|
| - for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
|
| - // check length of the non-standard part
|
| - if (*rep && *pos && *cut && (*rep)[j]) {
|
| - char * rh = strchr((*rep)[j], '=');
|
| - if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
|
| - hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
|
| - free((*rep)[j]);
|
| - (*rep)[j] = NULL;
|
| - hyphens[j] = '0';
|
| - }
|
| - } else {
|
| - hyphens[j] = '0';
|
| - }
|
| - j++;
|
| -
|
| - // Unicode ligature support
|
| - if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
|
| - i += hnj_ligature(word[j + 2]);
|
| - }
|
| - } while (utf8 && (word[j] & 0xc0) == 0x80);
|
| - return 0;
|
| -}
|
| -
|
| -int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
|
| - char *** rep, int ** pos, int ** cut, int rhmin)
|
| -{
|
| - int i;
|
| - int j = word_size - 2;
|
| - for (i = 1; i < rhmin && j > 0; j--) {
|
| - // check length of the non-standard part
|
| - if (*rep && *pos && *cut && (*rep)[j]) {
|
| - char * rh = strchr((*rep)[j], '=');
|
| - if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
|
| - hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
|
| - free((*rep)[j]);
|
| - (*rep)[j] = NULL;
|
| - hyphens[j] = '0';
|
| - }
|
| - } else {
|
| - hyphens[j] = '0';
|
| - }
|
| - if (!utf8 || (word[j] & 0xc0) != 0xc0) i++;
|
| - }
|
| - return 0;
|
| -}
|
| -
|
| -// recursive function for compound level hyphenation
|
| -int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
|
| - char * hyphens, char *** rep, int ** pos, int ** cut,
|
| - int clhmin, int crhmin, int lend, int rend)
|
| -{
|
| - char prep_word_buf[MAX_WORD];
|
| - char *prep_word;
|
| - int i, j, k;
|
| - int state;
|
| - char ch;
|
| - HyphenState *hstate;
|
| - char *match;
|
| - char *repl;
|
| - signed char replindex;
|
| - signed char replcut;
|
| - int offset;
|
| - int matchlen_buf[MAX_CHARS];
|
| - int matchindex_buf[MAX_CHARS];
|
| - char * matchrepl_buf[MAX_CHARS];
|
| - int * matchlen;
|
| - int * matchindex;
|
| - char ** matchrepl;
|
| - int isrepl = 0;
|
| - int nHyphCount;
|
| -
|
| - if (word_size + 3 < MAX_CHARS) {
|
| - prep_word = prep_word_buf;
|
| - matchlen = matchlen_buf;
|
| - matchindex = matchindex_buf;
|
| - matchrepl = matchrepl_buf;
|
| - } else {
|
| - prep_word = hnj_malloc (word_size + 3);
|
| - matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
|
| - matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
|
| - matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
|
| - }
|
| -
|
| - j = 0;
|
| - prep_word[j++] = '.';
|
| -
|
| - for (i = 0; i < word_size; i++)
|
| - prep_word[j++] = word[i];
|
| -
|
| - prep_word[j++] = '.';
|
| - prep_word[j] = '\0';
|
| -
|
| - for (i = 0; i < j; i++)
|
| - hyphens[i] = '0';
|
| -
|
| -#ifdef VERBOSE
|
| - printf ("prep_word = %s\n", prep_word);
|
| -#endif
|
| -
|
| - /* now, run the finite state machine */
|
| - state = 0;
|
| - for (i = 0; i < j; i++)
|
| - {
|
| - ch = prep_word[i];
|
| - for (;;)
|
| - {
|
| -
|
| - if (state == -1) {
|
| - /* return 1; */
|
| - /* KBH: FIXME shouldn't this be as follows? */
|
| - state = 0;
|
| - goto try_next_letter;
|
| - }
|
| -
|
| -#ifdef VERBOSE
|
| - char *state_str;
|
| - state_str = get_state_str (state);
|
| -
|
| - for (k = 0; k < i - strlen (state_str); k++)
|
| - putchar (' ');
|
| - printf ("%s", state_str);
|
| -#endif
|
| -
|
| - hstate = &dict->states[state];
|
| - for (k = 0; k < hstate->num_trans; k++)
|
| - if (hstate->trans[k].ch == ch)
|
| - {
|
| - state = hstate->trans[k].new_state;
|
| - goto found_state;
|
| - }
|
| - state = hstate->fallback_state;
|
| -#ifdef VERBOSE
|
| - printf (" falling back, fallback_state %d\n", state);
|
| -#endif
|
| - }
|
| - found_state:
|
| -#ifdef VERBOSE
|
| - printf ("found state %d\n",state);
|
| -#endif
|
| - /* Additional optimization is possible here - especially,
|
| - elimination of trailing zeroes from the match. Leading zeroes
|
| - have already been optimized. */
|
| - match = dict->states[state].match;
|
| - repl = dict->states[state].repl;
|
| - replindex = dict->states[state].replindex;
|
| - replcut = dict->states[state].replcut;
|
| - /* replacing rules not handled by hyphen_hyphenate() */
|
| - if (match)
|
| - {
|
| - offset = i + 1 - strlen (match);
|
| -#ifdef VERBOSE
|
| - for (k = 0; k < offset; k++)
|
| - putchar (' ');
|
| - printf ("%s (%s)\n", match, repl);
|
| -#endif
|
| - if (repl) {
|
| - if (!isrepl) for(; isrepl < word_size; isrepl++) {
|
| - matchrepl[isrepl] = NULL;
|
| - matchindex[isrepl] = -1;
|
| - }
|
| - matchlen[offset + replindex] = replcut;
|
| - }
|
| - /* This is a linear search because I tried a binary search and
|
| - found it to be just a teeny bit slower. */
|
| - for (k = 0; match[k]; k++) {
|
| - if ((hyphens[offset + k] < match[k])) {
|
| - hyphens[offset + k] = match[k];
|
| - if (match[k]&1) {
|
| - matchrepl[offset + k] = repl;
|
| - if (repl && (k >= replindex) && (k <= replindex + replcut)) {
|
| - matchindex[offset + replindex] = offset + k;
|
| - }
|
| - }
|
| - }
|
| - }
|
| -
|
| - }
|
| -
|
| - /* KBH: we need this to make sure we keep looking in a word */
|
| - /* for patterns even if the current character is not known in state 0 */
|
| - /* since patterns for hyphenation may occur anywhere in the word */
|
| - try_next_letter: ;
|
| -
|
| - }
|
| -#ifdef VERBOSE
|
| - for (i = 0; i < j; i++)
|
| - putchar (hyphens[i]);
|
| - putchar ('\n');
|
| -#endif
|
| -
|
| - for (i = 0; i < j - 3; i++)
|
| -#if 0
|
| - if (hyphens[i + 1] & 1)
|
| - hyphens[i] = '-';
|
| -#else
|
| - hyphens[i] = hyphens[i + 1];
|
| -#endif
|
| - for (; i < word_size; i++)
|
| - hyphens[i] = '0';
|
| - hyphens[word_size] = '\0';
|
| -
|
| - /* now create a new char string showing hyphenation positions */
|
| - /* count the hyphens and allocate space for the new hyphenated string */
|
| - nHyphCount = 0;
|
| - for (i = 0; i < word_size; i++)
|
| - if (hyphens[i]&1)
|
| - nHyphCount++;
|
| - j = 0;
|
| - for (i = 0; i < word_size; i++) {
|
| - if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
|
| - if (rep && pos && cut) {
|
| - if (!*rep && !*pos && !*cut) {
|
| - int k;
|
| - *rep = (char **) malloc(sizeof(char *) * word_size);
|
| - *pos = (int *) malloc(sizeof(int) * word_size);
|
| - *cut = (int *) malloc(sizeof(int) * word_size);
|
| - for (k = 0; k < word_size; k++) {
|
| - (*rep)[k] = NULL;
|
| - (*pos)[k] = 0;
|
| - (*cut)[k] = 0;
|
| - }
|
| - }
|
| - (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
|
| - (*pos)[matchindex[i] - 1] = matchindex[i] - i;
|
| - (*cut)[matchindex[i] - 1] = matchlen[i];
|
| - }
|
| - j += strlen(matchrepl[matchindex[i]]);
|
| - i += matchlen[i] - 1;
|
| - }
|
| - }
|
| -
|
| - if (matchrepl != matchrepl_buf) {
|
| - hnj_free (matchrepl);
|
| - hnj_free (matchlen);
|
| - hnj_free (matchindex);
|
| - }
|
| -
|
| - // recursive hyphenation of the first (compound) level segments
|
| - if (dict->nextlevel) {
|
| - char * rep2_buf[MAX_WORD];
|
| - int pos2_buf[MAX_WORD];
|
| - int cut2_buf[MAX_WORD];
|
| - char hyphens2_buf[MAX_WORD];
|
| - char ** rep2;
|
| - int * pos2;
|
| - int * cut2;
|
| - char * hyphens2;
|
| - int begin = 0;
|
| - if (word_size < MAX_CHARS) {
|
| - rep2 = rep2_buf;
|
| - pos2 = pos2_buf;
|
| - cut2 = cut2_buf;
|
| - hyphens2 = hyphens2_buf;
|
| - } else {
|
| - rep2 = hnj_malloc (word_size * sizeof(char *));
|
| - pos2 = hnj_malloc (word_size * sizeof(int));
|
| - cut2 = hnj_malloc (word_size * sizeof(int));
|
| - hyphens2 = hnj_malloc (word_size);
|
| - }
|
| - for (i = 0; i < word_size; i++) rep2[i] = NULL;
|
| - for (i = 0; i < word_size; i++) if
|
| - (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
|
| - if (i - begin > 1) {
|
| - int hyph = 0;
|
| - prep_word[i + 2] = '\0';
|
| - /* non-standard hyphenation at compound boundary (Schiffahrt) */
|
| - if (*rep && *pos && *cut && (*rep)[i]) {
|
| - char * l = strchr((*rep)[i], '=');
|
| - strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]);
|
| - if (l) {
|
| - hyph = (l - (*rep)[i]) - (*pos)[i];
|
| - prep_word[2 + i + hyph] = '\0';
|
| - }
|
| - }
|
| - hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
|
| - hyphens2, &rep2, &pos2, &cut2, clhmin,
|
| - crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
|
| - for (j = 0; j < i - begin - 1; j++) {
|
| - hyphens[begin + j] = hyphens2[j];
|
| - if (rep2[j] && rep && pos && cut) {
|
| - if (!*rep && !*pos && !*cut) {
|
| - int k;
|
| - *rep = (char **) malloc(sizeof(char *) * word_size);
|
| - *pos = (int *) malloc(sizeof(int) * word_size);
|
| - *cut = (int *) malloc(sizeof(int) * word_size);
|
| - for (k = 0; k < word_size; k++) {
|
| - (*rep)[k] = NULL;
|
| - (*pos)[k] = 0;
|
| - (*cut)[k] = 0;
|
| - }
|
| - }
|
| - (*rep)[begin + j] = rep2[j];
|
| - (*pos)[begin + j] = pos2[j];
|
| - (*cut)[begin + j] = cut2[j];
|
| - }
|
| - }
|
| - prep_word[i + 2] = word[i + 1];
|
| - if (*rep && *pos && *cut && (*rep)[i]) {
|
| - strcpy(prep_word + 1, word);
|
| - }
|
| - }
|
| - begin = i + 1;
|
| - for (j = 0; j < word_size; j++) rep2[j] = NULL;
|
| - }
|
| -
|
| - // non-compound
|
| - if (begin == 0) {
|
| - hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
|
| - hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
|
| - if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
|
| - rep, pos, cut, clhmin);
|
| - if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
|
| - rep, pos, cut, crhmin);
|
| - }
|
| -
|
| - if (rep2 != rep2_buf) {
|
| - free(rep2);
|
| - free(cut2);
|
| - free(pos2);
|
| - free(hyphens2);
|
| - }
|
| - }
|
| -
|
| - if (prep_word != prep_word_buf) hnj_free (prep_word);
|
| - return 0;
|
| -}
|
| -
|
| -/* UTF-8 normalization of hyphen and non-standard positions */
|
| -int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
|
| - char *** rep, int ** pos, int ** cut)
|
| -{
|
| - int i, j, k;
|
| - if ((((unsigned char) word[0]) >> 6) == 2) {
|
| - fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
|
| - return 1;
|
| - }
|
| -
|
| - /* calculate UTF-8 character positions */
|
| - for (i = 0, j = -1; i < word_size; i++) {
|
| - /* beginning of an UTF-8 character (not '10' start bits) */
|
| - if ((((unsigned char) word[i]) >> 6) != 2) j++;
|
| - hyphens[j] = hyphens[i];
|
| - if (rep && pos && cut && *rep && *pos && *cut) {
|
| - int l = (*pos)[i];
|
| - (*pos)[j] = 0;
|
| - for (k = 0; k < l; k++) {
|
| - if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
|
| - }
|
| - k = i - l + 1;
|
| - l = k + (*cut)[i];
|
| - (*cut)[j] = 0;
|
| - for (; k < l; k++) {
|
| - if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
|
| - }
|
| - (*rep)[j] = (*rep)[i];
|
| - if (j < i) {
|
| - (*rep)[i] = NULL;
|
| - (*pos)[i] = 0;
|
| - (*cut)[i] = 0;
|
| - }
|
| - }
|
| - }
|
| - hyphens[j + 1] = '\0';
|
| - return 0;
|
| -}
|
| -
|
| -/* get the word with all possible hyphenations (output: hyphword) */
|
| -void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
|
| - char * hyphword, char *** rep, int ** pos, int ** cut)
|
| -{
|
| - int i, j;
|
| - for (i = 0, j = 0; i < l; i++, j++) {
|
| - if (hyphens[i]&1) {
|
| - hyphword[j] = word[i];
|
| - if (*rep && *pos && *cut && (*rep)[i]) {
|
| - strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]);
|
| - j += strlen((*rep)[i]) - (*pos)[i];
|
| - i += (*cut)[i] - (*pos)[i];
|
| - } else hyphword[++j] = '=';
|
| - } else hyphword[j] = word[i];
|
| - }
|
| - hyphword[j] = '\0';
|
| -}
|
| -
|
| -
|
| -/* main api function with default hyphenmin parameters */
|
| -int hnj_hyphen_hyphenate2 (HyphenDict *dict,
|
| - const char *word, int word_size, char * hyphens,
|
| - char *hyphword, char *** rep, int ** pos, int ** cut)
|
| -{
|
| - hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
|
| - dict->clhmin, dict->crhmin, 1, 1);
|
| - hnj_hyphen_lhmin(dict->utf8, word, word_size,
|
| - hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
|
| - hnj_hyphen_rhmin(dict->utf8, word, word_size,
|
| - hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
|
| - if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
|
| - if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
|
| - return 0;
|
| -}
|
| -
|
| -/* previous main api function with hyphenmin parameters */
|
| -int hnj_hyphen_hyphenate3 (HyphenDict *dict,
|
| - const char *word, int word_size, char * hyphens,
|
| - char *hyphword, char *** rep, int ** pos, int ** cut,
|
| - int lhmin, int rhmin, int clhmin, int crhmin)
|
| -{
|
| - lhmin = (lhmin > 0 ? lhmin : dict->lhmin);
|
| - rhmin = (rhmin > 0 ? rhmin : dict->rhmin);
|
| - hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
|
| - clhmin, crhmin, 1, 1);
|
| - hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
|
| - rep, pos, cut, (lhmin > 0 ? lhmin : 2));
|
| - hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
|
| - rep, pos, cut, (rhmin > 0 ? rhmin : 2));
|
| - if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
|
| - if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
|
| - return 0;
|
| -}
|
|
|