third_party/hyphen/hyphen.c - Issue 20860003: Remove hyphenation code from Chromium.

Unified Diff: third_party/hyphen/hyphen.c

Issue 20860003: Remove hyphenation code from Chromium. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: rebase Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: third_party/hyphen/hyphen.c

diff --git a/third_party/hyphen/hyphen.c b/third_party/hyphen/hyphen.c

deleted file mode 100644

index ebae5107ed59425c56bb59e6d6955762a9ad353f..0000000000000000000000000000000000000000

--- a/third_party/hyphen/hyphen.c

+++ /dev/null

@@ -1,1084 +0,0 @@

-/* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both

- * licenses follows.

- */

-/* LibHnj - a library for high quality hyphenation and justification

- *

- * This library is free software; you can redistribute it and/or

- * modify it under the terms of the GNU Library General Public

- * License as published by the Free Software Foundation; either

- * version 2 of the License, or (at your option) any later version.

- *

- * This library is distributed in the hope that it will be useful,

- * but WITHOUT ANY WARRANTY; without even the implied warranty of

- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

- * Library General Public License for more details.

- *

- * You should have received a copy of the GNU Library General Public

- * License along with this library; if not, write to the

- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,

- * Boston, MA 02111-1307 USA.

-*/

-/*

- * The contents of this file are subject to the Mozilla Public License

- * Version 1.0 (the "MPL"); you may not use this file except in

- * compliance with the MPL. You may obtain a copy of the MPL at

- * http://www.mozilla.org/MPL/

- *

- * Software distributed under the MPL is distributed on an "AS IS" basis,

- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL

- * for the specific language governing rights and limitations under the

- * MPL.

- *

- */

-#include <stdlib.h> /* for NULL, malloc */

-#include <stdio.h> /* for fprintf */

-#include <string.h> /* for strdup */

-#ifdef UNX

-#include <unistd.h> /* for exit */

-#endif

-#define noVERBOSE

-/* calculate hyphenmin values with long ligature length (2 or 3 characters

- * instead of 1 or 2) for comparison with hyphenation without ligatures */

-#define noLONG_LIGATURE

-#ifdef LONG_LIGATURE

-#define LIG_xx 1

-#define LIG_xxx 2

-#else

-#define LIG_xx 0

-#define LIG_xxx 1

-#endif

-#include "hnjalloc.h"

-#include "hyphen.h"

-static char *

-hnj_strdup (const char *s)

- char *new;

- int l;

- l = strlen (s);

- new = hnj_malloc (l + 1);

- memcpy (new, s, l);

- new[l] = 0;

- return new;

-/* remove cross-platform text line end characters */

-void hnj_strchomp(char * s)

- int k = strlen(s);

- if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';

- if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';

-/* a little bit of a hash table implementation. This simply maps strings

- to state numbers */

-typedef struct _HashTab HashTab;

-typedef struct _HashEntry HashEntry;

-/* A cheap, but effective, hack. */

-#define HASH_SIZE 31627

-struct _HashTab {

- HashEntry *entries[HASH_SIZE];

-};

-struct _HashEntry {

- HashEntry *next;

- char *key;

- int val;

-};

-/* a char* hash function from ASU - adapted from Gtk+ */

-static unsigned int

-hnj_string_hash (const char *s)

- const char *p;

- unsigned int h=0, g;

- for(p = s; *p != '\0'; p += 1) {

- h = ( h << 4 ) + *p;

- if ( ( g = h & 0xf0000000 ) ) {

- h = h ^ (g >> 24);

- h = h ^ g;

- }

- return h /* % M */;

-static HashTab *

-hnj_hash_new (void)

- HashTab *hashtab;

- int i;

- hashtab = hnj_malloc (sizeof(HashTab));

- for (i = 0; i < HASH_SIZE; i++)

- hashtab->entries[i] = NULL;

- return hashtab;

-static void

-hnj_hash_free (HashTab *hashtab)

- int i;

- HashEntry *e, *next;

- for (i = 0; i < HASH_SIZE; i++)

- for (e = hashtab->entries[i]; e; e = next)

- {

- next = e->next;

- hnj_free (e->key);

- hnj_free (e);

- }

- hnj_free (hashtab);

-/* assumes that key is not already present! */

-static void

-hnj_hash_insert (HashTab *hashtab, const char *key, int val)

- int i;

- HashEntry *e;

- i = hnj_string_hash (key) % HASH_SIZE;

- e = hnj_malloc (sizeof(HashEntry));

- e->next = hashtab->entries[i];

- e->key = hnj_strdup (key);

- e->val = val;

- hashtab->entries[i] = e;

-/* return val if found, otherwise -1 */

-static int

-hnj_hash_lookup (HashTab *hashtab, const char *key)

- int i;

- HashEntry *e;

- i = hnj_string_hash (key) % HASH_SIZE;

- for (e = hashtab->entries[i]; e; e = e->next)

- if (!strcmp (key, e->key))

- return e->val;

- return -1;

-/* Get the state number, allocating a new state if necessary. */

-static int

-hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)

- int state_num;

- state_num = hnj_hash_lookup (hashtab, string);

- if (state_num >= 0)

- return state_num;

- hnj_hash_insert (hashtab, string, dict->num_states);

- /* predicate is true if dict->num_states is a power of two */

- if (!(dict->num_states & (dict->num_states - 1)))

- {

- dict->states = hnj_realloc (dict->states,

- (dict->num_states << 1) *

- sizeof(HyphenState));

- }

- dict->states[dict->num_states].match = NULL;

- dict->states[dict->num_states].repl = NULL;

- dict->states[dict->num_states].fallback_state = -1;

- dict->states[dict->num_states].num_trans = 0;

- dict->states[dict->num_states].trans = NULL;

- return dict->num_states++;

-/* add a transition from state1 to state2 through ch - assumes that the

- transition does not already exist */

-static void

-hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)

- int num_trans;

- num_trans = dict->states[state1].num_trans;

- if (num_trans == 0)

- {

- dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));

- }

- else if (!(num_trans & (num_trans - 1)))

- {

- dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,

- (num_trans << 1) *

- sizeof(HyphenTrans));

- }

- dict->states[state1].trans[num_trans].ch = ch;

- dict->states[state1].trans[num_trans].new_state = state2;

- dict->states[state1].num_trans++;

-#ifdef VERBOSE

-HashTab *global;

-static char *

-get_state_str (int state)

- int i;

- HashEntry *e;

- for (i = 0; i < HASH_SIZE; i++)

- for (e = global->entries[i]; e; e = e->next)

- if (e->val == state)

- return e->key;

- return NULL;

-#endif

-HyphenDict *

-hnj_hyphen_load (const char *fn)

- HyphenDict *result;

- FILE *f;

- f = fopen (fn, "r");

- if (f == NULL)

- return NULL;

- result = hnj_hyphen_load_file(f);

- fclose(f);

- return result;

-HyphenDict *

-hnj_hyphen_load_file (FILE *f)

- HyphenDict *dict[2];

- HashTab *hashtab;

- char buf[MAX_CHARS];

- char word[MAX_CHARS];

- char pattern[MAX_CHARS];

- char * repl;

- signed char replindex;

- signed char replcut;

- int state_num = 0, last_state;

- int i, j, k;

- char ch;

- int found;

- HashEntry *e;

- int nextlevel = 0;

-// loading one or two dictionaries (separated by NEXTLEVEL keyword)

-for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {

- hashtab = hnj_hash_new ();

-#ifdef VERBOSE

- global = hashtab;

-#endif

- hnj_hash_insert (hashtab, "", 0);

- dict[k] = hnj_malloc (sizeof(HyphenDict));

- dict[k]->num_states = 1;

- dict[k]->states = hnj_malloc (sizeof(HyphenState));

- dict[k]->states[0].match = NULL;

- dict[k]->states[0].repl = NULL;

- dict[k]->states[0].fallback_state = -1;

- dict[k]->states[0].num_trans = 0;

- dict[k]->states[0].trans = NULL;

- dict[k]->nextlevel = NULL;

- dict[k]->lhmin = 0;

- dict[k]->rhmin = 0;

- dict[k]->clhmin = 0;

- dict[k]->crhmin = 0;

- /* read in character set info */

- if (k == 0) {

- for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;

- if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) {

- for (i=0;i<MAX_NAME;i++)

- if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))

- dict[k]->cset[i] = 0;

- } else {

- dict[k]->cset[0] = 0;

- }

- dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);

- } else {

- strcpy(dict[k]->cset, dict[0]->cset);

- dict[k]->utf8 = dict[0]->utf8;

- }

- while (fgets (buf, sizeof(buf), f) != NULL)

- {

- if (buf[0] != '%')

- {

- if (strncmp(buf, "NEXTLEVEL", 9) == 0) {

- nextlevel = 1;

- break;

- } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {

- dict[k]->lhmin = atoi(buf + 13);

- continue;

- } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {

- dict[k]->rhmin = atoi(buf + 14);

- continue;

- } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {

- dict[k]->clhmin = atoi(buf + 21);

- continue;

- } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {

- dict[k]->crhmin = atoi(buf + 22);

- continue;

- }

- j = 0;

- pattern[j] = '0';

- repl = strchr(buf, '/');

- replindex = 0;

- replcut = 0;

- if (repl) {

- char * index = strchr(repl + 1, ',');

- *repl = '\0';

- if (index) {

- char * index2 = strchr(index + 1, ',');

- *index = '\0';

- if (index2) {

- *index2 = '\0';

- replindex = (signed char) atoi(index + 1) - 1;

- replcut = (signed char) atoi(index2 + 1);

- }

- } else {

- hnj_strchomp(repl + 1);

- replindex = 0;

- replcut = (signed char) strlen(buf);

- }

- repl = hnj_strdup(repl + 1);

- }

- for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)

- {

- if (buf[i] >= '0' && buf[i] <= '9')

- pattern[j] = buf[i];

- else

- {

- word[j] = buf[i];

- pattern[++j] = '0';

- }

- word[j] = '\0';

- pattern[j + 1] = '\0';

- i = 0;

- if (!repl) {

- /* Optimize away leading zeroes */

- for (; pattern[i] == '0'; i++);

- } else {

- if (*word == '.') i++;

- /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */

- if (dict[k]->utf8) {

- int pu = -1; /* unicode character position */

- int ps = -1; /* unicode start position (original replindex) */

- int pc = (*word == '.') ? 1: 0; /* 8-bit character position */

- for (; pc < (strlen(word) + 1); pc++) {

- /* beginning of an UTF-8 character (not '10' start bits) */

- if ((((unsigned char) word[pc]) >> 6) != 2) pu++;

- if ((ps < 0) && (replindex == pu)) {

- ps = replindex;

- replindex = (signed char) pc;

- }

- if ((ps >= 0) && ((pu - ps) == replcut)) {

- replcut = (signed char) (pc - replindex);

- break;

- }

- if (*word == '.') replindex--;

- }

-#ifdef VERBOSE

- printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl);

-#endif

- found = hnj_hash_lookup (hashtab, word);

- state_num = hnj_get_state (dict[k], hashtab, word);

- dict[k]->states[state_num].match = hnj_strdup (pattern + i);

- dict[k]->states[state_num].repl = repl;

- dict[k]->states[state_num].replindex = replindex;

- if (!replcut) {

- dict[k]->states[state_num].replcut = (signed char) strlen(word);

- } else {

- dict[k]->states[state_num].replcut = replcut;

- }

- /* now, put in the prefix transitions */

- for (; found < 0 ;j--)

- {

- last_state = state_num;

- ch = word[j - 1];

- word[j - 1] = '\0';

- found = hnj_hash_lookup (hashtab, word);

- state_num = hnj_get_state (dict[k], hashtab, word);

- hnj_add_trans (dict[k], state_num, last_state, ch);

- }

- /* Could do unioning of matches here (instead of the preprocessor script).

- If we did, the pseudocode would look something like this:

- foreach state in the hash table

- foreach i = [1..length(state) - 1]

- state to check is substr (state, i)

- look it up

- if found, and if there is a match, union the match in.

- It's also possible to avoid the quadratic blowup by doing the

- search in order of increasing state string sizes - then you

- can break the loop after finding the first match.

- This step should be optional in any case - if there is a

- preprocessed rule table, it's always faster to use that.

-*/

- /* put in the fallback states */

- for (i = 0; i < HASH_SIZE; i++)

- for (e = hashtab->entries[i]; e; e = e->next)

- {

- if (*(e->key)) for (j = 1; 1; j++)

- {

- state_num = hnj_hash_lookup (hashtab, e->key + j);

- if (state_num >= 0)

- break;

- }

- /* KBH: FIXME state 0 fallback_state should always be -1? */

- if (e->val)

- dict[k]->states[e->val].fallback_state = state_num;

- }

-#ifdef VERBOSE

- for (i = 0; i < HASH_SIZE; i++)

- for (e = hashtab->entries[i]; e; e = e->next)

- {

- printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,

- dict[k]->states[e->val].fallback_state);

- for (j = 0; j < dict[k]->states[e->val].num_trans; j++)

- printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,

- dict[k]->states[e->val].trans[j].new_state);

- }

-#endif

-#ifndef VERBOSE

- hnj_hash_free (hashtab);

-#endif

- state_num = 0;

- if (k == 2) dict[0]->nextlevel = dict[1];

- return dict[0];

-void hnj_hyphen_free (HyphenDict *dict)

- int state_num;

- HyphenState *hstate;

- for (state_num = 0; state_num < dict->num_states; state_num++)

- {

- hstate = &dict->states[state_num];

- if (hstate->match)

- hnj_free (hstate->match);

- if (hstate->repl)

- hnj_free (hstate->repl);

- if (hstate->trans)

- hnj_free (hstate->trans);

- }

- if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);

- hnj_free (dict->states);

- hnj_free (dict);

-#define MAX_WORD 256

-int hnj_hyphen_hyphenate (HyphenDict *dict,

- const char *word, int word_size,

- char *hyphens)

- char prep_word_buf[MAX_WORD];

- char *prep_word;

- int i, j, k;

- int state;

- char ch;

- HyphenState *hstate;

- char *match;

- int offset;

- if (word_size + 3 < MAX_WORD)

- prep_word = prep_word_buf;

- else

- prep_word = hnj_malloc (word_size + 3);

- j = 0;

- prep_word[j++] = '.';

- for (i = 0; i < word_size; i++)

- prep_word[j++] = word[i];

- prep_word[j++] = '.';

- prep_word[j] = '\0';

- for (i = 0; i < word_size + 5; i++)

- hyphens[i] = '0';

-#ifdef VERBOSE

- printf ("prep_word = %s\n", prep_word);

-#endif

- /* now, run the finite state machine */

- state = 0;

- for (i = 0; i < j; i++)

- {

- ch = prep_word[i];

- for (;;)

- {

- if (state == -1) {

- /* return 1; */

- /* KBH: FIXME shouldn't this be as follows? */

- state = 0;

- goto try_next_letter;

- }

-#ifdef VERBOSE

- char *state_str;

- state_str = get_state_str (state);

- for (k = 0; k < i - strlen (state_str); k++)

- putchar (' ');

- printf ("%s", state_str);

-#endif

- hstate = &dict->states[state];

- for (k = 0; k < hstate->num_trans; k++)

- if (hstate->trans[k].ch == ch)

- {

- state = hstate->trans[k].new_state;

- goto found_state;

- }

- state = hstate->fallback_state;

-#ifdef VERBOSE

- printf (" falling back, fallback_state %d\n", state);

-#endif

- }

- found_state:

-#ifdef VERBOSE

- printf ("found state %d\n",state);

-#endif

- /* Additional optimization is possible here - especially,

- elimination of trailing zeroes from the match. Leading zeroes

- have already been optimized. */

- match = dict->states[state].match;

- /* replacing rules not handled by hyphen_hyphenate() */

- if (match && !dict->states[state].repl)

- {

- offset = i + 1 - strlen (match);

-#ifdef VERBOSE

- for (k = 0; k < offset; k++)

- putchar (' ');

- printf ("%s\n", match);

-#endif

- /* This is a linear search because I tried a binary search and

- found it to be just a teeny bit slower. */

- for (k = 0; match[k]; k++)

- if (hyphens[offset + k] < match[k])

- hyphens[offset + k] = match[k];

- }

- /* KBH: we need this to make sure we keep looking in a word */

- /* for patterns even if the current character is not known in state 0 */

- /* since patterns for hyphenation may occur anywhere in the word */

- try_next_letter: ;

- }

-#ifdef VERBOSE

- for (i = 0; i < j; i++)

- putchar (hyphens[i]);

- putchar ('\n');

-#endif

- for (i = 0; i < j - 4; i++)

-#if 0

- if (hyphens[i + 1] & 1)

- hyphens[i] = '-';

-#else

- hyphens[i] = hyphens[i + 1];

-#endif

- hyphens[0] = '0';

- for (; i < word_size; i++)

- hyphens[i] = '0';

- hyphens[word_size] = '\0';

- if (prep_word != prep_word_buf)

- hnj_free (prep_word);

- return 0;

-/* Unicode ligature length */

-int hnj_ligature(unsigned char c) {

- switch (c) {

- case 0x80: /* ff */

- case 0x81: /* fi */

- case 0x82: return LIG_xx; /* fl */

- case 0x83: /* ffi */

- case 0x84: return LIG_xxx; /* ffl */

- case 0x85: /* long st */

- case 0x86: return LIG_xx; /* st */

- }

- return 0;

-/* character length of the first n byte of the input word */

-int hnj_hyphen_strnlen(const char * word, int n, int utf8)

- int i = 0;

- int j = 0;

- while (j < n && word[j] != '\0') {

- i++;

- // Unicode ligature support

- if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {

- i += hnj_ligature(word[j + 2]);

- }

- for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);

- }

- return i;

-int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,

- char *** rep, int ** pos, int ** cut, int lhmin)

- int i = 1, j;

- // Unicode ligature support

- if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) {

- i += hnj_ligature(word[2]);

- }

- for (j = 0; i < lhmin && word[j] != '\0'; i++) do {

- // check length of the non-standard part

- if (*rep && *pos && *cut && (*rep)[j]) {

- char * rh = strchr((*rep)[j], '=');

- if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +

- hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {

- free((*rep)[j]);

- (*rep)[j] = NULL;

- hyphens[j] = '0';

- }

- } else {

- hyphens[j] = '0';

- }

- j++;

- // Unicode ligature support

- if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {

- i += hnj_ligature(word[j + 2]);

- }

- } while (utf8 && (word[j] & 0xc0) == 0x80);

- return 0;

-int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,

- char *** rep, int ** pos, int ** cut, int rhmin)

- int i;

- int j = word_size - 2;

- for (i = 1; i < rhmin && j > 0; j--) {

- // check length of the non-standard part

- if (*rep && *pos && *cut && (*rep)[j]) {

- char * rh = strchr((*rep)[j], '=');

- if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +

- hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {

- free((*rep)[j]);

- (*rep)[j] = NULL;

- hyphens[j] = '0';

- }

- } else {

- hyphens[j] = '0';

- }

- if (!utf8 || (word[j] & 0xc0) != 0xc0) i++;

- }

- return 0;

-// recursive function for compound level hyphenation

-int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,

- char * hyphens, char *** rep, int ** pos, int ** cut,

- int clhmin, int crhmin, int lend, int rend)

- char prep_word_buf[MAX_WORD];

- char *prep_word;

- int i, j, k;

- int state;

- char ch;

- HyphenState *hstate;

- char *match;

- char *repl;

- signed char replindex;

- signed char replcut;

- int offset;

- int matchlen_buf[MAX_CHARS];

- int matchindex_buf[MAX_CHARS];

- char * matchrepl_buf[MAX_CHARS];

- int * matchlen;

- int * matchindex;

- char ** matchrepl;

- int isrepl = 0;

- int nHyphCount;

- if (word_size + 3 < MAX_CHARS) {

- prep_word = prep_word_buf;

- matchlen = matchlen_buf;

- matchindex = matchindex_buf;

- matchrepl = matchrepl_buf;

- } else {

- prep_word = hnj_malloc (word_size + 3);

- matchlen = hnj_malloc ((word_size + 3) * sizeof(int));

- matchindex = hnj_malloc ((word_size + 3) * sizeof(int));

- matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));

- }

- j = 0;

- prep_word[j++] = '.';

- for (i = 0; i < word_size; i++)

- prep_word[j++] = word[i];

- prep_word[j++] = '.';

- prep_word[j] = '\0';

- for (i = 0; i < j; i++)

- hyphens[i] = '0';

-#ifdef VERBOSE

- printf ("prep_word = %s\n", prep_word);

-#endif

- /* now, run the finite state machine */

- state = 0;

- for (i = 0; i < j; i++)

- {

- ch = prep_word[i];

- for (;;)

- {

- if (state == -1) {

- /* return 1; */

- /* KBH: FIXME shouldn't this be as follows? */

- state = 0;

- goto try_next_letter;

- }

-#ifdef VERBOSE

- char *state_str;

- state_str = get_state_str (state);

- for (k = 0; k < i - strlen (state_str); k++)

- putchar (' ');

- printf ("%s", state_str);

-#endif

- hstate = &dict->states[state];

- for (k = 0; k < hstate->num_trans; k++)

- if (hstate->trans[k].ch == ch)

- {

- state = hstate->trans[k].new_state;

- goto found_state;

- }

- state = hstate->fallback_state;

-#ifdef VERBOSE

- printf (" falling back, fallback_state %d\n", state);

-#endif

- }

- found_state:

-#ifdef VERBOSE

- printf ("found state %d\n",state);

-#endif

- /* Additional optimization is possible here - especially,

- elimination of trailing zeroes from the match. Leading zeroes

- have already been optimized. */

- match = dict->states[state].match;

- repl = dict->states[state].repl;

- replindex = dict->states[state].replindex;

- replcut = dict->states[state].replcut;

- /* replacing rules not handled by hyphen_hyphenate() */

- if (match)

- {

- offset = i + 1 - strlen (match);

-#ifdef VERBOSE

- for (k = 0; k < offset; k++)

- putchar (' ');

- printf ("%s (%s)\n", match, repl);

-#endif

- if (repl) {

- if (!isrepl) for(; isrepl < word_size; isrepl++) {

- matchrepl[isrepl] = NULL;

- matchindex[isrepl] = -1;

- }

- matchlen[offset + replindex] = replcut;

- }

- /* This is a linear search because I tried a binary search and

- found it to be just a teeny bit slower. */

- for (k = 0; match[k]; k++) {

- if ((hyphens[offset + k] < match[k])) {

- hyphens[offset + k] = match[k];

- if (match[k]&1) {

- matchrepl[offset + k] = repl;

- if (repl && (k >= replindex) && (k <= replindex + replcut)) {

- matchindex[offset + replindex] = offset + k;

- }

- /* KBH: we need this to make sure we keep looking in a word */

- /* for patterns even if the current character is not known in state 0 */

- /* since patterns for hyphenation may occur anywhere in the word */

- try_next_letter: ;

- }

-#ifdef VERBOSE

- for (i = 0; i < j; i++)

- putchar (hyphens[i]);

- putchar ('\n');

-#endif

- for (i = 0; i < j - 3; i++)

-#if 0

- if (hyphens[i + 1] & 1)

- hyphens[i] = '-';

-#else

- hyphens[i] = hyphens[i + 1];

-#endif

- for (; i < word_size; i++)

- hyphens[i] = '0';

- hyphens[word_size] = '\0';

- /* now create a new char string showing hyphenation positions */

- /* count the hyphens and allocate space for the new hyphenated string */

- nHyphCount = 0;

- for (i = 0; i < word_size; i++)

- if (hyphens[i]&1)

- nHyphCount++;

- j = 0;

- for (i = 0; i < word_size; i++) {

- if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {

- if (rep && pos && cut) {

- if (!*rep && !*pos && !*cut) {

- int k;

- *rep = (char **) malloc(sizeof(char *) * word_size);

- *pos = (int *) malloc(sizeof(int) * word_size);

- *cut = (int *) malloc(sizeof(int) * word_size);

- for (k = 0; k < word_size; k++) {

- (*rep)[k] = NULL;

- (*pos)[k] = 0;

- (*cut)[k] = 0;

- }

- (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);

- (*pos)[matchindex[i] - 1] = matchindex[i] - i;

- (*cut)[matchindex[i] - 1] = matchlen[i];

- }

- j += strlen(matchrepl[matchindex[i]]);

- i += matchlen[i] - 1;

- }

- if (matchrepl != matchrepl_buf) {

- hnj_free (matchrepl);

- hnj_free (matchlen);

- hnj_free (matchindex);

- }

- // recursive hyphenation of the first (compound) level segments

- if (dict->nextlevel) {

- char * rep2_buf[MAX_WORD];

- int pos2_buf[MAX_WORD];

- int cut2_buf[MAX_WORD];

- char hyphens2_buf[MAX_WORD];

- char ** rep2;

- int * pos2;

- int * cut2;

- char * hyphens2;

- int begin = 0;

- if (word_size < MAX_CHARS) {

- rep2 = rep2_buf;

- pos2 = pos2_buf;

- cut2 = cut2_buf;

- hyphens2 = hyphens2_buf;

- } else {

- rep2 = hnj_malloc (word_size * sizeof(char *));

- pos2 = hnj_malloc (word_size * sizeof(int));

- cut2 = hnj_malloc (word_size * sizeof(int));

- hyphens2 = hnj_malloc (word_size);

- }

- for (i = 0; i < word_size; i++) rep2[i] = NULL;

- for (i = 0; i < word_size; i++) if

- (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {

- if (i - begin > 1) {

- int hyph = 0;

- prep_word[i + 2] = '\0';

- /* non-standard hyphenation at compound boundary (Schiffahrt) */

- if (*rep && *pos && *cut && (*rep)[i]) {

- char * l = strchr((*rep)[i], '=');

- strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]);

- if (l) {

- hyph = (l - (*rep)[i]) - (*pos)[i];

- prep_word[2 + i + hyph] = '\0';

- }

- hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,

- hyphens2, &rep2, &pos2, &cut2, clhmin,

- crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));

- for (j = 0; j < i - begin - 1; j++) {

- hyphens[begin + j] = hyphens2[j];

- if (rep2[j] && rep && pos && cut) {

- if (!*rep && !*pos && !*cut) {

- int k;

- *rep = (char **) malloc(sizeof(char *) * word_size);

- *pos = (int *) malloc(sizeof(int) * word_size);

- *cut = (int *) malloc(sizeof(int) * word_size);

- for (k = 0; k < word_size; k++) {

- (*rep)[k] = NULL;

- (*pos)[k] = 0;

- (*cut)[k] = 0;

- }

- (*rep)[begin + j] = rep2[j];

- (*pos)[begin + j] = pos2[j];

- (*cut)[begin + j] = cut2[j];

- }

- prep_word[i + 2] = word[i + 1];

- if (*rep && *pos && *cut && (*rep)[i]) {

- strcpy(prep_word + 1, word);

- }

- begin = i + 1;

- for (j = 0; j < word_size; j++) rep2[j] = NULL;

- }

- // non-compound

- if (begin == 0) {

- hnj_hyphen_hyph_(dict->nextlevel, word, word_size,

- hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);

- if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,

- rep, pos, cut, clhmin);

- if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,

- rep, pos, cut, crhmin);

- }

- if (rep2 != rep2_buf) {

- free(rep2);

- free(cut2);

- free(pos2);

- free(hyphens2);

- }

- if (prep_word != prep_word_buf) hnj_free (prep_word);

- return 0;

-/* UTF-8 normalization of hyphen and non-standard positions */

-int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,

- char *** rep, int ** pos, int ** cut)

- int i, j, k;

- if ((((unsigned char) word[0]) >> 6) == 2) {

- fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);

- return 1;

- }

- /* calculate UTF-8 character positions */

- for (i = 0, j = -1; i < word_size; i++) {

- /* beginning of an UTF-8 character (not '10' start bits) */

- if ((((unsigned char) word[i]) >> 6) != 2) j++;

- hyphens[j] = hyphens[i];

- if (rep && pos && cut && *rep && *pos && *cut) {

- int l = (*pos)[i];

- (*pos)[j] = 0;

- for (k = 0; k < l; k++) {

- if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;

- }

- k = i - l + 1;

- l = k + (*cut)[i];

- (*cut)[j] = 0;

- for (; k < l; k++) {

- if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;

- }

- (*rep)[j] = (*rep)[i];

- if (j < i) {

- (*rep)[i] = NULL;

- (*pos)[i] = 0;

- (*cut)[i] = 0;

- }

- hyphens[j + 1] = '\0';

- return 0;

-/* get the word with all possible hyphenations (output: hyphword) */

-void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,

- char * hyphword, char *** rep, int ** pos, int ** cut)

- int i, j;

- for (i = 0, j = 0; i < l; i++, j++) {

- if (hyphens[i]&1) {

- hyphword[j] = word[i];

- if (*rep && *pos && *cut && (*rep)[i]) {

- strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]);

- j += strlen((*rep)[i]) - (*pos)[i];

- i += (*cut)[i] - (*pos)[i];

- } else hyphword[++j] = '=';

- } else hyphword[j] = word[i];

- }

- hyphword[j] = '\0';

-/* main api function with default hyphenmin parameters */

-int hnj_hyphen_hyphenate2 (HyphenDict *dict,

- const char *word, int word_size, char * hyphens,

- char *hyphword, char *** rep, int ** pos, int ** cut)

- hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,

- dict->clhmin, dict->crhmin, 1, 1);

- hnj_hyphen_lhmin(dict->utf8, word, word_size,

- hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));

- hnj_hyphen_rhmin(dict->utf8, word, word_size,

- hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));

- if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);

- if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);

- return 0;

-/* previous main api function with hyphenmin parameters */

-int hnj_hyphen_hyphenate3 (HyphenDict *dict,

- const char *word, int word_size, char * hyphens,

- char *hyphword, char *** rep, int ** pos, int ** cut,

- int lhmin, int rhmin, int clhmin, int crhmin)

- lhmin = (lhmin > 0 ? lhmin : dict->lhmin);

- rhmin = (rhmin > 0 ? rhmin : dict->rhmin);

- hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,

- clhmin, crhmin, 1, 1);

- hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,

- rep, pos, cut, (lhmin > 0 ? lhmin : 2));

- hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,

- rep, pos, cut, (rhmin > 0 ? rhmin : 2));

- if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);

- if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);

- return 0;

« no previous file with comments | « third_party/hyphen/hyphen.h ('k') | third_party/hyphen/hyphen.gyp » ('j') | no next file with comments »