Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(23)

Unified Diff: third_party/hyphen/hyphen.c

Issue 20860003: Remove hyphenation code from Chromium. (Closed) | Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: rebase | Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/hyphen/hyphen.h ('k') | third_party/hyphen/hyphen.gyp » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
Index: third_party/hyphen/hyphen.c
diff --git a/third_party/hyphen/hyphen.c b/third_party/hyphen/hyphen.c
deleted file mode 100644
index ebae5107ed59425c56bb59e6d6955762a9ad353f..0000000000000000000000000000000000000000
--- a/third_party/hyphen/hyphen.c
+++ /dev/null
@@ -1,1084 +0,0 @@
-/* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
- * licenses follows.
- */
-
-/* LibHnj - a library for high quality hyphenation and justification
- * Copyright (C) 1998 Raph Levien,
- * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
- * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
- * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with this library; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307 USA.
-*/
-
-/*
- * The contents of this file are subject to the Mozilla Public License
- * Version 1.0 (the "MPL"); you may not use this file except in
- * compliance with the MPL. You may obtain a copy of the MPL at
- * http://www.mozilla.org/MPL/
- *
- * Software distributed under the MPL is distributed on an "AS IS" basis,
- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
- * for the specific language governing rights and limitations under the
- * MPL.
- *
- */
-#include <stdlib.h> /* for NULL, malloc */
-#include <stdio.h> /* for fprintf */
-#include <string.h> /* for strdup */
-
-#ifdef UNX
-#include <unistd.h> /* for exit */
-#endif
-
-#define noVERBOSE
-
-/* calculate hyphenmin values with long ligature length (2 or 3 characters
- * instead of 1 or 2) for comparison with hyphenation without ligatures */
-#define noLONG_LIGATURE
-
-#ifdef LONG_LIGATURE
-#define LIG_xx 1
-#define LIG_xxx 2
-#else
-#define LIG_xx 0
-#define LIG_xxx 1
-#endif
-
-#include "hnjalloc.h"
-#include "hyphen.h"
-
-static char *
-hnj_strdup (const char *s)
-{
- char *new;
- int l;
-
- l = strlen (s);
- new = hnj_malloc (l + 1);
- memcpy (new, s, l);
- new[l] = 0;
- return new;
-}
-
-/* remove cross-platform text line end characters */
-void hnj_strchomp(char * s)
-{
- int k = strlen(s);
- if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
- if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
-}
-
-/* a little bit of a hash table implementation. This simply maps strings
- to state numbers */
-
-typedef struct _HashTab HashTab;
-typedef struct _HashEntry HashEntry;
-
-/* A cheap, but effective, hack. */
-#define HASH_SIZE 31627
-
-struct _HashTab {
- HashEntry *entries[HASH_SIZE];
-};
-
-struct _HashEntry {
- HashEntry *next;
- char *key;
- int val;
-};
-
-/* a char* hash function from ASU - adapted from Gtk+ */
-static unsigned int
-hnj_string_hash (const char *s)
-{
- const char *p;
- unsigned int h=0, g;
- for(p = s; *p != '\0'; p += 1) {
- h = ( h << 4 ) + *p;
- if ( ( g = h & 0xf0000000 ) ) {
- h = h ^ (g >> 24);
- h = h ^ g;
- }
- }
- return h /* % M */;
-}
-
-static HashTab *
-hnj_hash_new (void)
-{
- HashTab *hashtab;
- int i;
-
- hashtab = hnj_malloc (sizeof(HashTab));
- for (i = 0; i < HASH_SIZE; i++)
- hashtab->entries[i] = NULL;
-
- return hashtab;
-}
-
-static void
-hnj_hash_free (HashTab *hashtab)
-{
- int i;
- HashEntry *e, *next;
-
- for (i = 0; i < HASH_SIZE; i++)
- for (e = hashtab->entries[i]; e; e = next)
- {
- next = e->next;
- hnj_free (e->key);
- hnj_free (e);
- }
-
- hnj_free (hashtab);
-}
-
-/* assumes that key is not already present! */
-static void
-hnj_hash_insert (HashTab *hashtab, const char *key, int val)
-{
- int i;
- HashEntry *e;
-
- i = hnj_string_hash (key) % HASH_SIZE;
- e = hnj_malloc (sizeof(HashEntry));
- e->next = hashtab->entries[i];
- e->key = hnj_strdup (key);
- e->val = val;
- hashtab->entries[i] = e;
-}
-
-/* return val if found, otherwise -1 */
-static int
-hnj_hash_lookup (HashTab *hashtab, const char *key)
-{
- int i;
- HashEntry *e;
- i = hnj_string_hash (key) % HASH_SIZE;
- for (e = hashtab->entries[i]; e; e = e->next)
- if (!strcmp (key, e->key))
- return e->val;
- return -1;
-}
-
-/* Get the state number, allocating a new state if necessary. */
-static int
-hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
-{
- int state_num;
-
- state_num = hnj_hash_lookup (hashtab, string);
-
- if (state_num >= 0)
- return state_num;
-
- hnj_hash_insert (hashtab, string, dict->num_states);
- /* predicate is true if dict->num_states is a power of two */
- if (!(dict->num_states & (dict->num_states - 1)))
- {
- dict->states = hnj_realloc (dict->states,
- (dict->num_states << 1) *
- sizeof(HyphenState));
- }
- dict->states[dict->num_states].match = NULL;
- dict->states[dict->num_states].repl = NULL;
- dict->states[dict->num_states].fallback_state = -1;
- dict->states[dict->num_states].num_trans = 0;
- dict->states[dict->num_states].trans = NULL;
- return dict->num_states++;
-}
-
-/* add a transition from state1 to state2 through ch - assumes that the
- transition does not already exist */
-static void
-hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
-{
- int num_trans;
-
- num_trans = dict->states[state1].num_trans;
- if (num_trans == 0)
- {
- dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
- }
- else if (!(num_trans & (num_trans - 1)))
- {
- dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
- (num_trans << 1) *
- sizeof(HyphenTrans));
- }
- dict->states[state1].trans[num_trans].ch = ch;
- dict->states[state1].trans[num_trans].new_state = state2;
- dict->states[state1].num_trans++;
-}
-
-#ifdef VERBOSE
-HashTab *global;
-
-static char *
-get_state_str (int state)
-{
- int i;
- HashEntry *e;
-
- for (i = 0; i < HASH_SIZE; i++)
- for (e = global->entries[i]; e; e = e->next)
- if (e->val == state)
- return e->key;
- return NULL;
-}
-#endif
-
-HyphenDict *
-hnj_hyphen_load (const char *fn)
-{
- HyphenDict *result;
- FILE *f;
- f = fopen (fn, "r");
- if (f == NULL)
- return NULL;
-
- result = hnj_hyphen_load_file(f);
-
- fclose(f);
- return result;
-}
-
-HyphenDict *
-hnj_hyphen_load_file (FILE *f)
-{
- HyphenDict *dict[2];
- HashTab *hashtab;
- char buf[MAX_CHARS];
- char word[MAX_CHARS];
- char pattern[MAX_CHARS];
- char * repl;
- signed char replindex;
- signed char replcut;
- int state_num = 0, last_state;
- int i, j, k;
- char ch;
- int found;
- HashEntry *e;
- int nextlevel = 0;
-
-// loading one or two dictionaries (separated by NEXTLEVEL keyword)
-for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
- hashtab = hnj_hash_new ();
-#ifdef VERBOSE
- global = hashtab;
-#endif
- hnj_hash_insert (hashtab, "", 0);
- dict[k] = hnj_malloc (sizeof(HyphenDict));
- dict[k]->num_states = 1;
- dict[k]->states = hnj_malloc (sizeof(HyphenState));
- dict[k]->states[0].match = NULL;
- dict[k]->states[0].repl = NULL;
- dict[k]->states[0].fallback_state = -1;
- dict[k]->states[0].num_trans = 0;
- dict[k]->states[0].trans = NULL;
- dict[k]->nextlevel = NULL;
- dict[k]->lhmin = 0;
- dict[k]->rhmin = 0;
- dict[k]->clhmin = 0;
- dict[k]->crhmin = 0;
-
- /* read in character set info */
- if (k == 0) {
- for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
- if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) {
- for (i=0;i<MAX_NAME;i++)
- if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
- dict[k]->cset[i] = 0;
- } else {
- dict[k]->cset[0] = 0;
- }
- dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
- } else {
- strcpy(dict[k]->cset, dict[0]->cset);
- dict[k]->utf8 = dict[0]->utf8;
- }
-
- while (fgets (buf, sizeof(buf), f) != NULL)
- {
- if (buf[0] != '%')
- {
- if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
- nextlevel = 1;
- break;
- } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
- dict[k]->lhmin = atoi(buf + 13);
- continue;
- } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
- dict[k]->rhmin = atoi(buf + 14);
- continue;
- } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
- dict[k]->clhmin = atoi(buf + 21);
- continue;
- } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
- dict[k]->crhmin = atoi(buf + 22);
- continue;
- }
- j = 0;
- pattern[j] = '0';
- repl = strchr(buf, '/');
- replindex = 0;
- replcut = 0;
- if (repl) {
- char * index = strchr(repl + 1, ',');
- *repl = '\0';
- if (index) {
- char * index2 = strchr(index + 1, ',');
- *index = '\0';
- if (index2) {
- *index2 = '\0';
- replindex = (signed char) atoi(index + 1) - 1;
- replcut = (signed char) atoi(index2 + 1);
- }
- } else {
- hnj_strchomp(repl + 1);
- replindex = 0;
- replcut = (signed char) strlen(buf);
- }
- repl = hnj_strdup(repl + 1);
- }
- for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
- {
- if (buf[i] >= '0' && buf[i] <= '9')
- pattern[j] = buf[i];
- else
- {
- word[j] = buf[i];
- pattern[++j] = '0';
- }
- }
- word[j] = '\0';
- pattern[j + 1] = '\0';
-
- i = 0;
- if (!repl) {
- /* Optimize away leading zeroes */
- for (; pattern[i] == '0'; i++);
- } else {
- if (*word == '.') i++;
- /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
- if (dict[k]->utf8) {
- int pu = -1; /* unicode character position */
- int ps = -1; /* unicode start position (original replindex) */
- int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
- for (; pc < (strlen(word) + 1); pc++) {
- /* beginning of an UTF-8 character (not '10' start bits) */
- if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
- if ((ps < 0) && (replindex == pu)) {
- ps = replindex;
- replindex = (signed char) pc;
- }
- if ((ps >= 0) && ((pu - ps) == replcut)) {
- replcut = (signed char) (pc - replindex);
- break;
- }
- }
- if (*word == '.') replindex--;
- }
- }
-
-#ifdef VERBOSE
- printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl);
-#endif
- found = hnj_hash_lookup (hashtab, word);
- state_num = hnj_get_state (dict[k], hashtab, word);
- dict[k]->states[state_num].match = hnj_strdup (pattern + i);
- dict[k]->states[state_num].repl = repl;
- dict[k]->states[state_num].replindex = replindex;
- if (!replcut) {
- dict[k]->states[state_num].replcut = (signed char) strlen(word);
- } else {
- dict[k]->states[state_num].replcut = replcut;
- }
-
- /* now, put in the prefix transitions */
- for (; found < 0 ;j--)
- {
- last_state = state_num;
- ch = word[j - 1];
- word[j - 1] = '\0';
- found = hnj_hash_lookup (hashtab, word);
- state_num = hnj_get_state (dict[k], hashtab, word);
- hnj_add_trans (dict[k], state_num, last_state, ch);
- }
- }
- }
-
- /* Could do unioning of matches here (instead of the preprocessor script).
- If we did, the pseudocode would look something like this:
-
- foreach state in the hash table
- foreach i = [1..length(state) - 1]
- state to check is substr (state, i)
- look it up
- if found, and if there is a match, union the match in.
-
- It's also possible to avoid the quadratic blowup by doing the
- search in order of increasing state string sizes - then you
- can break the loop after finding the first match.
-
- This step should be optional in any case - if there is a
- preprocessed rule table, it's always faster to use that.
-
-*/
-
- /* put in the fallback states */
- for (i = 0; i < HASH_SIZE; i++)
- for (e = hashtab->entries[i]; e; e = e->next)
- {
- if (*(e->key)) for (j = 1; 1; j++)
- {
- state_num = hnj_hash_lookup (hashtab, e->key + j);
- if (state_num >= 0)
- break;
- }
- /* KBH: FIXME state 0 fallback_state should always be -1? */
- if (e->val)
- dict[k]->states[e->val].fallback_state = state_num;
- }
-#ifdef VERBOSE
- for (i = 0; i < HASH_SIZE; i++)
- for (e = hashtab->entries[i]; e; e = e->next)
- {
- printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
- dict[k]->states[e->val].fallback_state);
- for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
- printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
- dict[k]->states[e->val].trans[j].new_state);
- }
-#endif
-
-#ifndef VERBOSE
- hnj_hash_free (hashtab);
-#endif
- state_num = 0;
-}
- if (k == 2) dict[0]->nextlevel = dict[1];
- return dict[0];
-}
-
-void hnj_hyphen_free (HyphenDict *dict)
-{
- int state_num;
- HyphenState *hstate;
-
- for (state_num = 0; state_num < dict->num_states; state_num++)
- {
- hstate = &dict->states[state_num];
- if (hstate->match)
- hnj_free (hstate->match);
- if (hstate->repl)
- hnj_free (hstate->repl);
- if (hstate->trans)
- hnj_free (hstate->trans);
- }
- if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
-
- hnj_free (dict->states);
-
- hnj_free (dict);
-}
-
-#define MAX_WORD 256
-
-int hnj_hyphen_hyphenate (HyphenDict *dict,
- const char *word, int word_size,
- char *hyphens)
-{
- char prep_word_buf[MAX_WORD];
- char *prep_word;
- int i, j, k;
- int state;
- char ch;
- HyphenState *hstate;
- char *match;
- int offset;
-
- if (word_size + 3 < MAX_WORD)
- prep_word = prep_word_buf;
- else
- prep_word = hnj_malloc (word_size + 3);
-
- j = 0;
- prep_word[j++] = '.';
-
- for (i = 0; i < word_size; i++)
- prep_word[j++] = word[i];
-
- prep_word[j++] = '.';
- prep_word[j] = '\0';
-
- for (i = 0; i < word_size + 5; i++)
- hyphens[i] = '0';
-
-#ifdef VERBOSE
- printf ("prep_word = %s\n", prep_word);
-#endif
-
- /* now, run the finite state machine */
- state = 0;
- for (i = 0; i < j; i++)
- {
- ch = prep_word[i];
- for (;;)
- {
-
- if (state == -1) {
- /* return 1; */
- /* KBH: FIXME shouldn't this be as follows? */
- state = 0;
- goto try_next_letter;
- }
-
-#ifdef VERBOSE
- char *state_str;
- state_str = get_state_str (state);
-
- for (k = 0; k < i - strlen (state_str); k++)
- putchar (' ');
- printf ("%s", state_str);
-#endif
-
- hstate = &dict->states[state];
- for (k = 0; k < hstate->num_trans; k++)
- if (hstate->trans[k].ch == ch)
- {
- state = hstate->trans[k].new_state;
- goto found_state;
- }
- state = hstate->fallback_state;
-#ifdef VERBOSE
- printf (" falling back, fallback_state %d\n", state);
-#endif
- }
- found_state:
-#ifdef VERBOSE
- printf ("found state %d\n",state);
-#endif
- /* Additional optimization is possible here - especially,
- elimination of trailing zeroes from the match. Leading zeroes
- have already been optimized. */
- match = dict->states[state].match;
- /* replacing rules not handled by hyphen_hyphenate() */
- if (match && !dict->states[state].repl)
- {
- offset = i + 1 - strlen (match);
-#ifdef VERBOSE
- for (k = 0; k < offset; k++)
- putchar (' ');
- printf ("%s\n", match);
-#endif
- /* This is a linear search because I tried a binary search and
- found it to be just a teeny bit slower. */
- for (k = 0; match[k]; k++)
- if (hyphens[offset + k] < match[k])
- hyphens[offset + k] = match[k];
- }
-
- /* KBH: we need this to make sure we keep looking in a word */
- /* for patterns even if the current character is not known in state 0 */
- /* since patterns for hyphenation may occur anywhere in the word */
- try_next_letter: ;
-
- }
-#ifdef VERBOSE
- for (i = 0; i < j; i++)
- putchar (hyphens[i]);
- putchar ('\n');
-#endif
-
- for (i = 0; i < j - 4; i++)
-#if 0
- if (hyphens[i + 1] & 1)
- hyphens[i] = '-';
-#else
- hyphens[i] = hyphens[i + 1];
-#endif
- hyphens[0] = '0';
- for (; i < word_size; i++)
- hyphens[i] = '0';
- hyphens[word_size] = '\0';
-
- if (prep_word != prep_word_buf)
- hnj_free (prep_word);
-
- return 0;
-}
-
-/* Unicode ligature length */
-int hnj_ligature(unsigned char c) {
- switch (c) {
- case 0x80: /* ff */
- case 0x81: /* fi */
- case 0x82: return LIG_xx; /* fl */
- case 0x83: /* ffi */
- case 0x84: return LIG_xxx; /* ffl */
- case 0x85: /* long st */
- case 0x86: return LIG_xx; /* st */
- }
- return 0;
-}
-
-/* character length of the first n byte of the input word */
-int hnj_hyphen_strnlen(const char * word, int n, int utf8)
-{
- int i = 0;
- int j = 0;
- while (j < n && word[j] != '\0') {
- i++;
- // Unicode ligature support
- if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
- i += hnj_ligature(word[j + 2]);
- }
- for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
- }
- return i;
-}
-
-int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
- char *** rep, int ** pos, int ** cut, int lhmin)
-{
- int i = 1, j;
-
- // Unicode ligature support
- if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) {
- i += hnj_ligature(word[2]);
- }
-
- for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
- // check length of the non-standard part
- if (*rep && *pos && *cut && (*rep)[j]) {
- char * rh = strchr((*rep)[j], '=');
- if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
- hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
- free((*rep)[j]);
- (*rep)[j] = NULL;
- hyphens[j] = '0';
- }
- } else {
- hyphens[j] = '0';
- }
- j++;
-
- // Unicode ligature support
- if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
- i += hnj_ligature(word[j + 2]);
- }
- } while (utf8 && (word[j] & 0xc0) == 0x80);
- return 0;
-}
-
-int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
- char *** rep, int ** pos, int ** cut, int rhmin)
-{
- int i;
- int j = word_size - 2;
- for (i = 1; i < rhmin && j > 0; j--) {
- // check length of the non-standard part
- if (*rep && *pos && *cut && (*rep)[j]) {
- char * rh = strchr((*rep)[j], '=');
- if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
- hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
- free((*rep)[j]);
- (*rep)[j] = NULL;
- hyphens[j] = '0';
- }
- } else {
- hyphens[j] = '0';
- }
- if (!utf8 || (word[j] & 0xc0) != 0xc0) i++;
- }
- return 0;
-}
-
-// recursive function for compound level hyphenation
-int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
- char * hyphens, char *** rep, int ** pos, int ** cut,
- int clhmin, int crhmin, int lend, int rend)
-{
- char prep_word_buf[MAX_WORD];
- char *prep_word;
- int i, j, k;
- int state;
- char ch;
- HyphenState *hstate;
- char *match;
- char *repl;
- signed char replindex;
- signed char replcut;
- int offset;
- int matchlen_buf[MAX_CHARS];
- int matchindex_buf[MAX_CHARS];
- char * matchrepl_buf[MAX_CHARS];
- int * matchlen;
- int * matchindex;
- char ** matchrepl;
- int isrepl = 0;
- int nHyphCount;
-
- if (word_size + 3 < MAX_CHARS) {
- prep_word = prep_word_buf;
- matchlen = matchlen_buf;
- matchindex = matchindex_buf;
- matchrepl = matchrepl_buf;
- } else {
- prep_word = hnj_malloc (word_size + 3);
- matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
- matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
- matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
- }
-
- j = 0;
- prep_word[j++] = '.';
-
- for (i = 0; i < word_size; i++)
- prep_word[j++] = word[i];
-
- prep_word[j++] = '.';
- prep_word[j] = '\0';
-
- for (i = 0; i < j; i++)
- hyphens[i] = '0';
-
-#ifdef VERBOSE
- printf ("prep_word = %s\n", prep_word);
-#endif
-
- /* now, run the finite state machine */
- state = 0;
- for (i = 0; i < j; i++)
- {
- ch = prep_word[i];
- for (;;)
- {
-
- if (state == -1) {
- /* return 1; */
- /* KBH: FIXME shouldn't this be as follows? */
- state = 0;
- goto try_next_letter;
- }
-
-#ifdef VERBOSE
- char *state_str;
- state_str = get_state_str (state);
-
- for (k = 0; k < i - strlen (state_str); k++)
- putchar (' ');
- printf ("%s", state_str);
-#endif
-
- hstate = &dict->states[state];
- for (k = 0; k < hstate->num_trans; k++)
- if (hstate->trans[k].ch == ch)
- {
- state = hstate->trans[k].new_state;
- goto found_state;
- }
- state = hstate->fallback_state;
-#ifdef VERBOSE
- printf (" falling back, fallback_state %d\n", state);
-#endif
- }
- found_state:
-#ifdef VERBOSE
- printf ("found state %d\n",state);
-#endif
- /* Additional optimization is possible here - especially,
- elimination of trailing zeroes from the match. Leading zeroes
- have already been optimized. */
- match = dict->states[state].match;
- repl = dict->states[state].repl;
- replindex = dict->states[state].replindex;
- replcut = dict->states[state].replcut;
- /* replacing rules not handled by hyphen_hyphenate() */
- if (match)
- {
- offset = i + 1 - strlen (match);
-#ifdef VERBOSE
- for (k = 0; k < offset; k++)
- putchar (' ');
- printf ("%s (%s)\n", match, repl);
-#endif
- if (repl) {
- if (!isrepl) for(; isrepl < word_size; isrepl++) {
- matchrepl[isrepl] = NULL;
- matchindex[isrepl] = -1;
- }
- matchlen[offset + replindex] = replcut;
- }
- /* This is a linear search because I tried a binary search and
- found it to be just a teeny bit slower. */
- for (k = 0; match[k]; k++) {
- if ((hyphens[offset + k] < match[k])) {
- hyphens[offset + k] = match[k];
- if (match[k]&1) {
- matchrepl[offset + k] = repl;
- if (repl && (k >= replindex) && (k <= replindex + replcut)) {
- matchindex[offset + replindex] = offset + k;
- }
- }
- }
- }
-
- }
-
- /* KBH: we need this to make sure we keep looking in a word */
- /* for patterns even if the current character is not known in state 0 */
- /* since patterns for hyphenation may occur anywhere in the word */
- try_next_letter: ;
-
- }
-#ifdef VERBOSE
- for (i = 0; i < j; i++)
- putchar (hyphens[i]);
- putchar ('\n');
-#endif
-
- for (i = 0; i < j - 3; i++)
-#if 0
- if (hyphens[i + 1] & 1)
- hyphens[i] = '-';
-#else
- hyphens[i] = hyphens[i + 1];
-#endif
- for (; i < word_size; i++)
- hyphens[i] = '0';
- hyphens[word_size] = '\0';
-
- /* now create a new char string showing hyphenation positions */
- /* count the hyphens and allocate space for the new hyphenated string */
- nHyphCount = 0;
- for (i = 0; i < word_size; i++)
- if (hyphens[i]&1)
- nHyphCount++;
- j = 0;
- for (i = 0; i < word_size; i++) {
- if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
- if (rep && pos && cut) {
- if (!*rep && !*pos && !*cut) {
- int k;
- *rep = (char **) malloc(sizeof(char *) * word_size);
- *pos = (int *) malloc(sizeof(int) * word_size);
- *cut = (int *) malloc(sizeof(int) * word_size);
- for (k = 0; k < word_size; k++) {
- (*rep)[k] = NULL;
- (*pos)[k] = 0;
- (*cut)[k] = 0;
- }
- }
- (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
- (*pos)[matchindex[i] - 1] = matchindex[i] - i;
- (*cut)[matchindex[i] - 1] = matchlen[i];
- }
- j += strlen(matchrepl[matchindex[i]]);
- i += matchlen[i] - 1;
- }
- }
-
- if (matchrepl != matchrepl_buf) {
- hnj_free (matchrepl);
- hnj_free (matchlen);
- hnj_free (matchindex);
- }
-
- // recursive hyphenation of the first (compound) level segments
- if (dict->nextlevel) {
- char * rep2_buf[MAX_WORD];
- int pos2_buf[MAX_WORD];
- int cut2_buf[MAX_WORD];
- char hyphens2_buf[MAX_WORD];
- char ** rep2;
- int * pos2;
- int * cut2;
- char * hyphens2;
- int begin = 0;
- if (word_size < MAX_CHARS) {
- rep2 = rep2_buf;
- pos2 = pos2_buf;
- cut2 = cut2_buf;
- hyphens2 = hyphens2_buf;
- } else {
- rep2 = hnj_malloc (word_size * sizeof(char *));
- pos2 = hnj_malloc (word_size * sizeof(int));
- cut2 = hnj_malloc (word_size * sizeof(int));
- hyphens2 = hnj_malloc (word_size);
- }
- for (i = 0; i < word_size; i++) rep2[i] = NULL;
- for (i = 0; i < word_size; i++) if
- (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
- if (i - begin > 1) {
- int hyph = 0;
- prep_word[i + 2] = '\0';
- /* non-standard hyphenation at compound boundary (Schiffahrt) */
- if (*rep && *pos && *cut && (*rep)[i]) {
- char * l = strchr((*rep)[i], '=');
- strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]);
- if (l) {
- hyph = (l - (*rep)[i]) - (*pos)[i];
- prep_word[2 + i + hyph] = '\0';
- }
- }
- hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
- hyphens2, &rep2, &pos2, &cut2, clhmin,
- crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
- for (j = 0; j < i - begin - 1; j++) {
- hyphens[begin + j] = hyphens2[j];
- if (rep2[j] && rep && pos && cut) {
- if (!*rep && !*pos && !*cut) {
- int k;
- *rep = (char **) malloc(sizeof(char *) * word_size);
- *pos = (int *) malloc(sizeof(int) * word_size);
- *cut = (int *) malloc(sizeof(int) * word_size);
- for (k = 0; k < word_size; k++) {
- (*rep)[k] = NULL;
- (*pos)[k] = 0;
- (*cut)[k] = 0;
- }
- }
- (*rep)[begin + j] = rep2[j];
- (*pos)[begin + j] = pos2[j];
- (*cut)[begin + j] = cut2[j];
- }
- }
- prep_word[i + 2] = word[i + 1];
- if (*rep && *pos && *cut && (*rep)[i]) {
- strcpy(prep_word + 1, word);
- }
- }
- begin = i + 1;
- for (j = 0; j < word_size; j++) rep2[j] = NULL;
- }
-
- // non-compound
- if (begin == 0) {
- hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
- hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
- if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
- rep, pos, cut, clhmin);
- if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
- rep, pos, cut, crhmin);
- }
-
- if (rep2 != rep2_buf) {
- free(rep2);
- free(cut2);
- free(pos2);
- free(hyphens2);
- }
- }
-
- if (prep_word != prep_word_buf) hnj_free (prep_word);
- return 0;
-}
-
-/* UTF-8 normalization of hyphen and non-standard positions */
-int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
- char *** rep, int ** pos, int ** cut)
-{
- int i, j, k;
- if ((((unsigned char) word[0]) >> 6) == 2) {
- fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
- return 1;
- }
-
- /* calculate UTF-8 character positions */
- for (i = 0, j = -1; i < word_size; i++) {
- /* beginning of an UTF-8 character (not '10' start bits) */
- if ((((unsigned char) word[i]) >> 6) != 2) j++;
- hyphens[j] = hyphens[i];
- if (rep && pos && cut && *rep && *pos && *cut) {
- int l = (*pos)[i];
- (*pos)[j] = 0;
- for (k = 0; k < l; k++) {
- if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
- }
- k = i - l + 1;
- l = k + (*cut)[i];
- (*cut)[j] = 0;
- for (; k < l; k++) {
- if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
- }
- (*rep)[j] = (*rep)[i];
- if (j < i) {
- (*rep)[i] = NULL;
- (*pos)[i] = 0;
- (*cut)[i] = 0;
- }
- }
- }
- hyphens[j + 1] = '\0';
- return 0;
-}
-
-/* get the word with all possible hyphenations (output: hyphword) */
-void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
- char * hyphword, char *** rep, int ** pos, int ** cut)
-{
- int i, j;
- for (i = 0, j = 0; i < l; i++, j++) {
- if (hyphens[i]&1) {
- hyphword[j] = word[i];
- if (*rep && *pos && *cut && (*rep)[i]) {
- strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]);
- j += strlen((*rep)[i]) - (*pos)[i];
- i += (*cut)[i] - (*pos)[i];
- } else hyphword[++j] = '=';
- } else hyphword[j] = word[i];
- }
- hyphword[j] = '\0';
-}
-
-
-/* main api function with default hyphenmin parameters */
-int hnj_hyphen_hyphenate2 (HyphenDict *dict,
- const char *word, int word_size, char * hyphens,
- char *hyphword, char *** rep, int ** pos, int ** cut)
-{
- hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
- dict->clhmin, dict->crhmin, 1, 1);
- hnj_hyphen_lhmin(dict->utf8, word, word_size,
- hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
- hnj_hyphen_rhmin(dict->utf8, word, word_size,
- hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
- if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
- if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
- return 0;
-}
-
-/* previous main api function with hyphenmin parameters */
-int hnj_hyphen_hyphenate3 (HyphenDict *dict,
- const char *word, int word_size, char * hyphens,
- char *hyphword, char *** rep, int ** pos, int ** cut,
- int lhmin, int rhmin, int clhmin, int crhmin)
-{
- lhmin = (lhmin > 0 ? lhmin : dict->lhmin);
- rhmin = (rhmin > 0 ? rhmin : dict->rhmin);
- hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
- clhmin, crhmin, 1, 1);
- hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
- rep, pos, cut, (lhmin > 0 ? lhmin : 2));
- hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
- rep, pos, cut, (rhmin > 0 ? rhmin : 2));
- if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
- if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
- return 0;
-}
« no previous file with comments | « third_party/hyphen/hyphen.h ('k') | third_party/hyphen/hyphen.gyp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698