third_party/hyphen/hyphen.c - Issue 20860003: Remove hyphenation code from Chromium.

Side by Side Diff: third_party/hyphen/hyphen.c

Issue 20860003: Remove hyphenation code from Chromium. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: rebase Created 7 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both

2 * licenses follows.

3 */

4

5 /* LibHnj - a library for high quality hyphenation and justification

6 * Copyright (C) 1998 Raph Levien,

7 * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),

8 * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)

9 * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)

10 *

11 * This library is free software; you can redistribute it and/or

12 * modify it under the terms of the GNU Library General Public

13 * License as published by the Free Software Foundation; either

14 * version 2 of the License, or (at your option) any later version.

15 *

16 * This library is distributed in the hope that it will be useful,

17 * but WITHOUT ANY WARRANTY; without even the implied warranty of

18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

19 * Library General Public License for more details.

20 *

21 * You should have received a copy of the GNU Library General Public

22 * License along with this library; if not, write to the

23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,

24 * Boston, MA 02111-1307 USA.

25 */

26

27 /*

28 * The contents of this file are subject to the Mozilla Public License

29 * Version 1.0 (the "MPL"); you may not use this file except in

30 * compliance with the MPL. You may obtain a copy of the MPL at

31 * http://www.mozilla.org/MPL/

32 *

33 * Software distributed under the MPL is distributed on an "AS IS" basis,

34 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL

35 * for the specific language governing rights and limitations under the

36 * MPL.

37 *

38 */

39 #include <stdlib.h> /* for NULL, malloc */

40 #include <stdio.h> /* for fprintf */

41 #include <string.h> /* for strdup */

42

43 #ifdef UNX

44 #include <unistd.h> /* for exit */

45 #endif

46

47 #define noVERBOSE

48

49 /* calculate hyphenmin values with long ligature length (2 or 3 characters

50 * instead of 1 or 2) for comparison with hyphenation without ligatures */

51 #define noLONG_LIGATURE

52

53 #ifdef LONG_LIGATURE

54 #define LIG_xx 1

55 #define LIG_xxx 2

56 #else

57 #define LIG_xx 0

58 #define LIG_xxx 1

59 #endif

60

61 #include "hnjalloc.h"

62 #include "hyphen.h"

63

64 static char *

65 hnj_strdup (const char *s)

66 {

67 char *new;

68 int l;

69

70 l = strlen (s);

71 new = hnj_malloc (l + 1);

72 memcpy (new, s, l);

73 new[l] = 0;

74 return new;

75 }

76

77 /* remove cross-platform text line end characters */

78 void hnj_strchomp(char * s)

79 {

80 int k = strlen(s);

81 if ((k > 0) && (((s+k-1)=='\r') \|\| ((s+k-1)=='\n'))) *(s+k-1) = '\0';

82 if ((k > 1) && ((s+k-2) == '\r')) (s+k-2) = '\0';

83 }

84

85 /* a little bit of a hash table implementation. This simply maps strings

86 to state numbers */

87

88 typedef struct _HashTab HashTab;

89 typedef struct _HashEntry HashEntry;

90

91 /* A cheap, but effective, hack. */

92 #define HASH_SIZE 31627

93

94 struct _HashTab {

95 HashEntry *entries[HASH_SIZE];

96 };

97

98 struct _HashEntry {

99 HashEntry *next;

100 char *key;

101 int val;

102 };

103

104 /* a char* hash function from ASU - adapted from Gtk+ */

105 static unsigned int

106 hnj_string_hash (const char *s)

107 {

108 const char *p;

109 unsigned int h=0, g;

110 for(p = s; *p != '\0'; p += 1) {

111 h = ( h << 4 ) + *p;

112 if ( ( g = h & 0xf0000000 ) ) {

113 h = h ^ (g >> 24);

114 h = h ^ g;

115 }

116 }

117 return h /* % M */;

118 }

119

120 static HashTab *

121 hnj_hash_new (void)

122 {

123 HashTab *hashtab;

124 int i;

125

126 hashtab = hnj_malloc (sizeof(HashTab));

127 for (i = 0; i < HASH_SIZE; i++)

128 hashtab->entries[i] = NULL;

129

130 return hashtab;

131 }

132

133 static void

134 hnj_hash_free (HashTab *hashtab)

135 {

136 int i;

137 HashEntry e, next;

138

139 for (i = 0; i < HASH_SIZE; i++)

140 for (e = hashtab->entries[i]; e; e = next)

141 {

142 next = e->next;

143 hnj_free (e->key);

144 hnj_free (e);

145 }

146

147 hnj_free (hashtab);

148 }

149

150 /* assumes that key is not already present! */

151 static void

152 hnj_hash_insert (HashTab hashtab, const char key, int val)

153 {

154 int i;

155 HashEntry *e;

156

157 i = hnj_string_hash (key) % HASH_SIZE;

158 e = hnj_malloc (sizeof(HashEntry));

159 e->next = hashtab->entries[i];

160 e->key = hnj_strdup (key);

161 e->val = val;

162 hashtab->entries[i] = e;

163 }

164

165 /* return val if found, otherwise -1 */

166 static int

167 hnj_hash_lookup (HashTab hashtab, const char key)

168 {

169 int i;

170 HashEntry *e;

171 i = hnj_string_hash (key) % HASH_SIZE;

172 for (e = hashtab->entries[i]; e; e = e->next)

173 if (!strcmp (key, e->key))

174 return e->val;

175 return -1;

176 }

177

178 /* Get the state number, allocating a new state if necessary. */

179 static int

180 hnj_get_state (HyphenDict dict, HashTab hashtab, const char *string)

181 {

182 int state_num;

183

184 state_num = hnj_hash_lookup (hashtab, string);

185

186 if (state_num >= 0)

187 return state_num;

188

189 hnj_hash_insert (hashtab, string, dict->num_states);

190 /* predicate is true if dict->num_states is a power of two */

191 if (!(dict->num_states & (dict->num_states - 1)))

192 {

193 dict->states = hnj_realloc (dict->states,

194 (dict->num_states << 1) *

195 sizeof(HyphenState));

196 }

197 dict->states[dict->num_states].match = NULL;

198 dict->states[dict->num_states].repl = NULL;

199 dict->states[dict->num_states].fallback_state = -1;

200 dict->states[dict->num_states].num_trans = 0;

201 dict->states[dict->num_states].trans = NULL;

202 return dict->num_states++;

203 }

204

205 /* add a transition from state1 to state2 through ch - assumes that the

206 transition does not already exist */

207 static void

208 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)

209 {

210 int num_trans;

211

212 num_trans = dict->states[state1].num_trans;

213 if (num_trans == 0)

214 {

215 dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));

216 }

217 else if (!(num_trans & (num_trans - 1)))

218 {

219 dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,

220 (num_trans << 1) *

221 sizeof(HyphenTrans));

222 }

223 dict->states[state1].trans[num_trans].ch = ch;

224 dict->states[state1].trans[num_trans].new_state = state2;

225 dict->states[state1].num_trans++;

226 }

227

228 #ifdef VERBOSE

229 HashTab *global;

230

231 static char *

232 get_state_str (int state)

233 {

234 int i;

235 HashEntry *e;

236

237 for (i = 0; i < HASH_SIZE; i++)

238 for (e = global->entries[i]; e; e = e->next)

239 if (e->val == state)

240 return e->key;

241 return NULL;

242 }

243 #endif

244

245 HyphenDict *

246 hnj_hyphen_load (const char *fn)

247 {

248 HyphenDict *result;

249 FILE *f;

250 f = fopen (fn, "r");

251 if (f == NULL)

252 return NULL;

253

254 result = hnj_hyphen_load_file(f);

255

256 fclose(f);

257 return result;

258 }

259

260 HyphenDict *

261 hnj_hyphen_load_file (FILE *f)

262 {

263 HyphenDict *dict[2];

264 HashTab *hashtab;

265 char buf[MAX_CHARS];

266 char word[MAX_CHARS];

267 char pattern[MAX_CHARS];

268 char * repl;

269 signed char replindex;

270 signed char replcut;

271 int state_num = 0, last_state;

272 int i, j, k;

273 char ch;

274 int found;

275 HashEntry *e;

276 int nextlevel = 0;

277

278 // loading one or two dictionaries (separated by NEXTLEVEL keyword)

279 for (k = 0; k == 0 \|\| (k == 1 && nextlevel); k++) {

280 hashtab = hnj_hash_new ();

281 #ifdef VERBOSE

282 global = hashtab;

283 #endif

284 hnj_hash_insert (hashtab, "", 0);

285 dict[k] = hnj_malloc (sizeof(HyphenDict));

286 dict[k]->num_states = 1;

287 dict[k]->states = hnj_malloc (sizeof(HyphenState));

288 dict[k]->states[0].match = NULL;

289 dict[k]->states[0].repl = NULL;

290 dict[k]->states[0].fallback_state = -1;

291 dict[k]->states[0].num_trans = 0;

292 dict[k]->states[0].trans = NULL;

293 dict[k]->nextlevel = NULL;

294 dict[k]->lhmin = 0;

295 dict[k]->rhmin = 0;

296 dict[k]->clhmin = 0;

297 dict[k]->crhmin = 0;

298

299 /* read in character set info */

300 if (k == 0) {

301 for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;

302 if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) {

303 for (i=0;i<MAX_NAME;i++)

304 if ((dict[k]->cset[i] == '\r') \|\| (dict[k]->cset[i] == '\n'))

305 dict[k]->cset[i] = 0;

306 } else {

307 dict[k]->cset[0] = 0;

308 }

309 dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);

310 } else {

311 strcpy(dict[k]->cset, dict[0]->cset);

312 dict[k]->utf8 = dict[0]->utf8;

313 }

314

315 while (fgets (buf, sizeof(buf), f) != NULL)

316 {

317 if (buf[0] != '%')

318 {

319 if (strncmp(buf, "NEXTLEVEL", 9) == 0) {

320 nextlevel = 1;

321 break;

322 } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {

323 dict[k]->lhmin = atoi(buf + 13);

324 continue;

325 } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {

326 dict[k]->rhmin = atoi(buf + 14);

327 continue;

328 } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {

329 dict[k]->clhmin = atoi(buf + 21);

330 continue;

331 } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {

332 dict[k]->crhmin = atoi(buf + 22);

333 continue;

334 }

335 j = 0;

336 pattern[j] = '0';

337 repl = strchr(buf, '/');

338 replindex = 0;

339 replcut = 0;

340 if (repl) {

341 char * index = strchr(repl + 1, ',');

342 *repl = '\0';

343 if (index) {

344 char * index2 = strchr(index + 1, ',');

345 *index = '\0';

346 if (index2) {

347 *index2 = '\0';

348 replindex = (signed char) atoi(index + 1) - 1;

349 replcut = (signed char) atoi(index2 + 1);

350 }

351 } else {

352 hnj_strchomp(repl + 1);

353 replindex = 0;

354 replcut = (signed char) strlen(buf);

355 }

356 repl = hnj_strdup(repl + 1);

357 }

358 for (i = 0; ((buf[i] > ' ') \|\| (buf[i] < 0)); i++)

359 {

360 if (buf[i] >= '0' && buf[i] <= '9')

361 pattern[j] = buf[i];

362 else

363 {

364 word[j] = buf[i];

365 pattern[++j] = '0';

366 }

367 }

368 word[j] = '\0';

369 pattern[j + 1] = '\0';

370

371 i = 0;

372 if (!repl) {

373 /* Optimize away leading zeroes */

374 for (; pattern[i] == '0'; i++);

375 } else {

376 if (*word == '.') i++;

377 /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */

378 if (dict[k]->utf8) {

379 int pu = -1; /* unicode character position */

380 int ps = -1; /* unicode start position (original replinde x) */

381 int pc = (word == '.') ? 1: 0; / 8-bit character position */

382 for (; pc < (strlen(word) + 1); pc++) {

383 /* beginning of an UTF-8 character (not '10' start bits) */

384 if ((((unsigned char) word[pc]) >> 6) != 2) pu++;

385 if ((ps < 0) && (replindex == pu)) {

386 ps = replindex;

387 replindex = (signed char) pc;

388 }

389 if ((ps >= 0) && ((pu - ps) == replcut)) {

390 replcut = (signed char) (pc - replindex);

391 break;

392 }

393 }

394 if (*word == '.') replindex--;

395 }

396 }

397

398 #ifdef VERBOSE

399 printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j , repl);

400 #endif

401 found = hnj_hash_lookup (hashtab, word);

402 state_num = hnj_get_state (dict[k], hashtab, word);

403 dict[k]->states[state_num].match = hnj_strdup (pattern + i);

404 dict[k]->states[state_num].repl = repl;

405 dict[k]->states[state_num].replindex = replindex;

406 if (!replcut) {

407 dict[k]->states[state_num].replcut = (signed char) strlen(word);

408 } else {

409 dict[k]->states[state_num].replcut = replcut;

410 }

411

412 /* now, put in the prefix transitions */

413 for (; found < 0 ;j--)

414 {

415 last_state = state_num;

416 ch = word[j - 1];

417 word[j - 1] = '\0';

418 found = hnj_hash_lookup (hashtab, word);

419 state_num = hnj_get_state (dict[k], hashtab, word);

420 hnj_add_trans (dict[k], state_num, last_state, ch);

421 }

422 }

423 }

424

425 /* Could do unioning of matches here (instead of the preprocessor script).

426 If we did, the pseudocode would look something like this:

427

428 foreach state in the hash table

429 foreach i = [1..length(state) - 1]

430 state to check is substr (state, i)

431 look it up

432 if found, and if there is a match, union the match in.

433

434 It's also possible to avoid the quadratic blowup by doing the

435 search in order of increasing state string sizes - then you

436 can break the loop after finding the first match.

437

438 This step should be optional in any case - if there is a

439 preprocessed rule table, it's always faster to use that.

440

441 */

442

443 /* put in the fallback states */

444 for (i = 0; i < HASH_SIZE; i++)

445 for (e = hashtab->entries[i]; e; e = e->next)

446 {

447 if (*(e->key)) for (j = 1; 1; j++)

448 {

449 state_num = hnj_hash_lookup (hashtab, e->key + j);

450 if (state_num >= 0)

451 break;

452 }

453 /* KBH: FIXME state 0 fallback_state should always be -1? */

454 if (e->val)

455 dict[k]->states[e->val].fallback_state = state_num;

456 }

457 #ifdef VERBOSE

458 for (i = 0; i < HASH_SIZE; i++)

459 for (e = hashtab->entries[i]; e; e = e->next)

460 {

461 printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,

462 dict[k]->states[e->val].fallback_state);

463 for (j = 0; j < dict[k]->states[e->val].num_trans; j++)

464 printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,

465 dict[k]->states[e->val].trans[j].new_state);

466 }

467 #endif

468

469 #ifndef VERBOSE

470 hnj_hash_free (hashtab);

471 #endif

472 state_num = 0;

473 }

474 if (k == 2) dict[0]->nextlevel = dict[1];

475 return dict[0];

476 }

477

478 void hnj_hyphen_free (HyphenDict *dict)

479 {

480 int state_num;

481 HyphenState *hstate;

482

483 for (state_num = 0; state_num < dict->num_states; state_num++)

484 {

485 hstate = &dict->states[state_num];

486 if (hstate->match)

487 hnj_free (hstate->match);

488 if (hstate->repl)

489 hnj_free (hstate->repl);

490 if (hstate->trans)

491 hnj_free (hstate->trans);

492 }

493 if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);

494

495 hnj_free (dict->states);

496

497 hnj_free (dict);

498 }

499

500 #define MAX_WORD 256

501

502 int hnj_hyphen_hyphenate (HyphenDict *dict,

503 const char *word, int word_size,

504 char *hyphens)

505 {

506 char prep_word_buf[MAX_WORD];

507 char *prep_word;

508 int i, j, k;

509 int state;

510 char ch;

511 HyphenState *hstate;

512 char *match;

513 int offset;

514

515 if (word_size + 3 < MAX_WORD)

516 prep_word = prep_word_buf;

517 else

518 prep_word = hnj_malloc (word_size + 3);

519

520 j = 0;

521 prep_word[j++] = '.';

522

523 for (i = 0; i < word_size; i++)

524 prep_word[j++] = word[i];

525

526 prep_word[j++] = '.';

527 prep_word[j] = '\0';

528

529 for (i = 0; i < word_size + 5; i++)

530 hyphens[i] = '0';

531

532 #ifdef VERBOSE

533 printf ("prep_word = %s\n", prep_word);

534 #endif

535

536 /* now, run the finite state machine */

537 state = 0;

538 for (i = 0; i < j; i++)

539 {

540 ch = prep_word[i];

541 for (;;)

542 {

543

544 if (state == -1) {

545 /* return 1; */

546 /* KBH: FIXME shouldn't this be as follows? */

547 state = 0;

548 goto try_next_letter;

549 }

550

551 #ifdef VERBOSE

552 char *state_str;

553 state_str = get_state_str (state);

554

555 for (k = 0; k < i - strlen (state_str); k++)

556 putchar (' ');

557 printf ("%s", state_str);

558 #endif

559

560 hstate = &dict->states[state];

561 for (k = 0; k < hstate->num_trans; k++)

562 if (hstate->trans[k].ch == ch)

563 {

564 state = hstate->trans[k].new_state;

565 goto found_state;

566 }

567 state = hstate->fallback_state;

568 #ifdef VERBOSE

569 printf (" falling back, fallback_state %d\n", state);

570 #endif

571 }

572 found_state:

573 #ifdef VERBOSE

574 printf ("found state %d\n",state);

575 #endif

576 /* Additional optimization is possible here - especially,

577 elimination of trailing zeroes from the match. Leading zeroes

578 have already been optimized. */

579 match = dict->states[state].match;

580 /* replacing rules not handled by hyphen_hyphenate() */

581 if (match && !dict->states[state].repl)

582 {

583 offset = i + 1 - strlen (match);

584 #ifdef VERBOSE

585 for (k = 0; k < offset; k++)

586 putchar (' ');

587 printf ("%s\n", match);

588 #endif

589 /* This is a linear search because I tried a binary search and

590 found it to be just a teeny bit slower. */

591 for (k = 0; match[k]; k++)

592 if (hyphens[offset + k] < match[k])

593 hyphens[offset + k] = match[k];

594 }

595

596 /* KBH: we need this to make sure we keep looking in a word */

597 /* for patterns even if the current character is not known in state 0 */

598 /* since patterns for hyphenation may occur anywhere in the word */

599 try_next_letter: ;

600

601 }

602 #ifdef VERBOSE

603 for (i = 0; i < j; i++)

604 putchar (hyphens[i]);

605 putchar ('\n');

606 #endif

607

608 for (i = 0; i < j - 4; i++)

609 #if 0

610 if (hyphens[i + 1] & 1)

611 hyphens[i] = '-';

612 #else

613 hyphens[i] = hyphens[i + 1];

614 #endif

615 hyphens[0] = '0';

616 for (; i < word_size; i++)

617 hyphens[i] = '0';

618 hyphens[word_size] = '\0';

619

620 if (prep_word != prep_word_buf)

621 hnj_free (prep_word);

622

623 return 0;

624 }

625

626 /* Unicode ligature length */

627 int hnj_ligature(unsigned char c) {

628 switch (c) {

629 case 0x80: /* ff */

630 case 0x81: /* fi */

631 case 0x82: return LIG_xx; /* fl */

632 case 0x83: /* ffi */

633 case 0x84: return LIG_xxx; /* ffl */

634 case 0x85: /* long st */

635 case 0x86: return LIG_xx; /* st */

636 }

637 return 0;

638 }

639

640 /* character length of the first n byte of the input word */

641 int hnj_hyphen_strnlen(const char * word, int n, int utf8)

642 {

643 int i = 0;

644 int j = 0;

645 while (j < n && word[j] != '\0') {

646 i++;

647 // Unicode ligature support

648 if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {

649 i += hnj_ligature(word[j + 2]);

650 }

651 for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);

652 }

653 return i;

654 }

655

656 int hnj_hyphen_lhmin(int utf8, const char word, int word_size, char hyphens,

657 char * rep, int pos, int ** cut, int lhmin)

658 {

659 int i = 1, j;

660

661 // Unicode ligature support

662 if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) {

663 i += hnj_ligature(word[2]);

664 }

665

666 for (j = 0; i < lhmin && word[j] != '\0'; i++) do {

667 // check length of the non-standard part

668 if (rep && pos && cut && (rep)[j]) {

669 char * rh = strchr((*rep)[j], '=');

670 if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +

671 hnj_hyphen_strnlen((rep)[j], rh - (rep)[j], utf8)) < lhmin) {

672 free((*rep)[j]);

673 (*rep)[j] = NULL;

674 hyphens[j] = '0';

675 }

676 } else {

677 hyphens[j] = '0';

678 }

679 j++;

680

681 // Unicode ligature support

682 if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {

683 i += hnj_ligature(word[j + 2]);

684 }

685 } while (utf8 && (word[j] & 0xc0) == 0x80);

686 return 0;

687 }

688

689 int hnj_hyphen_rhmin(int utf8, const char word, int word_size, char hyphens,

690 char * rep, int pos, int ** cut, int rhmin)

691 {

692 int i;

693 int j = word_size - 2;

694 for (i = 1; i < rhmin && j > 0; j--) {

695 // check length of the non-standard part

696 if (rep && pos && cut && (rep)[j]) {

697 char * rh = strchr((*rep)[j], '=');

698 if (rh && (hnj_hyphen_strnlen(word + j - (pos)[j] + (cut)[j] + 1, 100, utf8) +

699 hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {

700 free((*rep)[j]);

701 (*rep)[j] = NULL;

702 hyphens[j] = '0';

703 }

704 } else {

705 hyphens[j] = '0';

706 }

707 if (!utf8 \|\| (word[j] & 0xc0) != 0xc0) i++;

708 }

709 return 0;

710 }

711

712 // recursive function for compound level hyphenation

713 int hnj_hyphen_hyph_(HyphenDict dict, const char word, int word_size,

714 char * hyphens, char * rep, int pos, int ** cut,

715 int clhmin, int crhmin, int lend, int rend)

716 {

717 char prep_word_buf[MAX_WORD];

718 char *prep_word;

719 int i, j, k;

720 int state;

721 char ch;

722 HyphenState *hstate;

723 char *match;

724 char *repl;

725 signed char replindex;

726 signed char replcut;

727 int offset;

728 int matchlen_buf[MAX_CHARS];

729 int matchindex_buf[MAX_CHARS];

730 char * matchrepl_buf[MAX_CHARS];

731 int * matchlen;

732 int * matchindex;

733 char ** matchrepl;

734 int isrepl = 0;

735 int nHyphCount;

736

737 if (word_size + 3 < MAX_CHARS) {

738 prep_word = prep_word_buf;

739 matchlen = matchlen_buf;

740 matchindex = matchindex_buf;

741 matchrepl = matchrepl_buf;

742 } else {

743 prep_word = hnj_malloc (word_size + 3);

744 matchlen = hnj_malloc ((word_size + 3) * sizeof(int));

745 matchindex = hnj_malloc ((word_size + 3) * sizeof(int));

746 matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));

747 }

748

749 j = 0;

750 prep_word[j++] = '.';

751

752 for (i = 0; i < word_size; i++)

753 prep_word[j++] = word[i];

754

755 prep_word[j++] = '.';

756 prep_word[j] = '\0';

757

758 for (i = 0; i < j; i++)

759 hyphens[i] = '0';

760

761 #ifdef VERBOSE

762 printf ("prep_word = %s\n", prep_word);

763 #endif

764

765 /* now, run the finite state machine */

766 state = 0;

767 for (i = 0; i < j; i++)

768 {

769 ch = prep_word[i];

770 for (;;)

771 {

772

773 if (state == -1) {

774 /* return 1; */

775 /* KBH: FIXME shouldn't this be as follows? */

776 state = 0;

777 goto try_next_letter;

778 }

779

780 #ifdef VERBOSE

781 char *state_str;

782 state_str = get_state_str (state);

783

784 for (k = 0; k < i - strlen (state_str); k++)

785 putchar (' ');

786 printf ("%s", state_str);

787 #endif

788

789 hstate = &dict->states[state];

790 for (k = 0; k < hstate->num_trans; k++)

791 if (hstate->trans[k].ch == ch)

792 {

793 state = hstate->trans[k].new_state;

794 goto found_state;

795 }

796 state = hstate->fallback_state;

797 #ifdef VERBOSE

798 printf (" falling back, fallback_state %d\n", state);

799 #endif

800 }

801 found_state:

802 #ifdef VERBOSE

803 printf ("found state %d\n",state);

804 #endif

805 /* Additional optimization is possible here - especially,

806 elimination of trailing zeroes from the match. Leading zeroes

807 have already been optimized. */

808 match = dict->states[state].match;

809 repl = dict->states[state].repl;

810 replindex = dict->states[state].replindex;

811 replcut = dict->states[state].replcut;

812 /* replacing rules not handled by hyphen_hyphenate() */

813 if (match)

814 {

815 offset = i + 1 - strlen (match);

816 #ifdef VERBOSE

817 for (k = 0; k < offset; k++)

818 putchar (' ');

819 printf ("%s (%s)\n", match, repl);

820 #endif

821 if (repl) {

822 if (!isrepl) for(; isrepl < word_size; isrepl++) {

823 matchrepl[isrepl] = NULL;

824 matchindex[isrepl] = -1;

825 }

826 matchlen[offset + replindex] = replcut;

827 }

828 /* This is a linear search because I tried a binary search and

829 found it to be just a teeny bit slower. */

830 for (k = 0; match[k]; k++) {

831 if ((hyphens[offset + k] < match[k])) {

832 hyphens[offset + k] = match[k];

833 if (match[k]&1) {

834 matchrepl[offset + k] = repl;

835 if (repl && (k >= replindex) && (k <= replindex + replcut)) {

836 matchindex[offset + replindex] = offset + k;

837 }

838 }

839 }

840 }

841

842 }

843

844 /* KBH: we need this to make sure we keep looking in a word */

845 /* for patterns even if the current character is not known in state 0 */

846 /* since patterns for hyphenation may occur anywhere in the word */

847 try_next_letter: ;

848

849 }

850 #ifdef VERBOSE

851 for (i = 0; i < j; i++)

852 putchar (hyphens[i]);

853 putchar ('\n');

854 #endif

855

856 for (i = 0; i < j - 3; i++)

857 #if 0

858 if (hyphens[i + 1] & 1)

859 hyphens[i] = '-';

860 #else

861 hyphens[i] = hyphens[i + 1];

862 #endif

863 for (; i < word_size; i++)

864 hyphens[i] = '0';

865 hyphens[word_size] = '\0';

866

867 /* now create a new char string showing hyphenation positions */

868 /* count the hyphens and allocate space for the new hyphenated string */

869 nHyphCount = 0;

870 for (i = 0; i < word_size; i++)

871 if (hyphens[i]&1)

872 nHyphCount++;

873 j = 0;

874 for (i = 0; i < word_size; i++) {

875 if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {

876 if (rep && pos && cut) {

877 if (!rep && !pos && !*cut) {

878 int k;

879 rep = (char ) malloc(sizeof(char ) * word_size);

880 pos = (int ) malloc(sizeof(int) * word_size);

881 cut = (int ) malloc(sizeof(int) * word_size);

882 for (k = 0; k < word_size; k++) {

883 (*rep)[k] = NULL;

884 (*pos)[k] = 0;

885 (*cut)[k] = 0;

886 }

887 }

888 (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[ i]]);

889 (*pos)[matchindex[i] - 1] = matchindex[i] - i;

890 (*cut)[matchindex[i] - 1] = matchlen[i];

891 }

892 j += strlen(matchrepl[matchindex[i]]);

893 i += matchlen[i] - 1;

894 }

895 }

896

897 if (matchrepl != matchrepl_buf) {

898 hnj_free (matchrepl);

899 hnj_free (matchlen);

900 hnj_free (matchindex);

901 }

902

903 // recursive hyphenation of the first (compound) level segments

904 if (dict->nextlevel) {

905 char * rep2_buf[MAX_WORD];

906 int pos2_buf[MAX_WORD];

907 int cut2_buf[MAX_WORD];

908 char hyphens2_buf[MAX_WORD];

909 char ** rep2;

910 int * pos2;

911 int * cut2;

912 char * hyphens2;

913 int begin = 0;

914 if (word_size < MAX_CHARS) {

915 rep2 = rep2_buf;

916 pos2 = pos2_buf;

917 cut2 = cut2_buf;

918 hyphens2 = hyphens2_buf;

919 } else {

920 rep2 = hnj_malloc (word_size * sizeof(char *));

921 pos2 = hnj_malloc (word_size * sizeof(int));

922 cut2 = hnj_malloc (word_size * sizeof(int));

923 hyphens2 = hnj_malloc (word_size);

924 }

925 for (i = 0; i < word_size; i++) rep2[i] = NULL;

926 for (i = 0; i < word_size; i++) if

927 (hyphens[i]&1 \|\| (begin > 0 && i + 1 == word_size)) {

928 if (i - begin > 1) {

929 int hyph = 0;

930 prep_word[i + 2] = '\0';

931 /* non-standard hyphenation at compound boundary (Schiffahrt) */

932 if (rep && pos && cut && (rep)[i]) {

933 char * l = strchr((*rep)[i], '=');

934 strcpy(prep_word + 2 + i - (pos)[i], (rep)[i]);

935 if (l) {

936 hyph = (l - (rep)[i]) - (pos)[i];

937 prep_word[2 + i + hyph] = '\0';

938 }

939 }

940 hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,

941 hyphens2, &rep2, &pos2, &cut2, clhmin,

942 crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));

943 for (j = 0; j < i - begin - 1; j++) {

944 hyphens[begin + j] = hyphens2[j];

945 if (rep2[j] && rep && pos && cut) {

946 if (!rep && !pos && !*cut) {

947 int k;

948 rep = (char ) malloc(sizeof(char ) * word_size);

949 pos = (int ) malloc(sizeof(int) * word_size);

950 cut = (int ) malloc(sizeof(int) * word_size);

951 for (k = 0; k < word_size; k++) {

952 (*rep)[k] = NULL;

953 (*pos)[k] = 0;

954 (*cut)[k] = 0;

955 }

956 }

957 (*rep)[begin + j] = rep2[j];

958 (*pos)[begin + j] = pos2[j];

959 (*cut)[begin + j] = cut2[j];

960 }

961 }

962 prep_word[i + 2] = word[i + 1];

963 if (rep && pos && cut && (rep)[i]) {

964 strcpy(prep_word + 1, word);

965 }

966 }

967 begin = i + 1;

968 for (j = 0; j < word_size; j++) rep2[j] = NULL;

969 }

970

971 // non-compound

972 if (begin == 0) {

973 hnj_hyphen_hyph_(dict->nextlevel, word, word_size,

974 hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);

975 if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,

976 rep, pos, cut, clhmin);

977 if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,

978 rep, pos, cut, crhmin);

979 }

980

981 if (rep2 != rep2_buf) {

982 free(rep2);

983 free(cut2);

984 free(pos2);

985 free(hyphens2);

986 }

987 }

988

989 if (prep_word != prep_word_buf) hnj_free (prep_word);

990 return 0;

991 }

992

993 /* UTF-8 normalization of hyphen and non-standard positions */

994 int hnj_hyphen_norm(const char word, int word_size, char hyphens,

995 char * rep, int pos, int ** cut)

996 {

997 int i, j, k;

998 if ((((unsigned char) word[0]) >> 6) == 2) {

999 fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);

1000 return 1;

1001 }

1002

1003 /* calculate UTF-8 character positions */

1004 for (i = 0, j = -1; i < word_size; i++) {

1005 /* beginning of an UTF-8 character (not '10' start bits) */

1006 if ((((unsigned char) word[i]) >> 6) != 2) j++;

1007 hyphens[j] = hyphens[i];

1008 if (rep && pos && cut && rep && pos && *cut) {

1009 int l = (*pos)[i];

1010 (*pos)[j] = 0;

1011 for (k = 0; k < l; k++) {

1012 if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;

1013 }

1014 k = i - l + 1;

1015 l = k + (*cut)[i];

1016 (*cut)[j] = 0;

1017 for (; k < l; k++) {

1018 if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;

1019 }

1020 (rep)[j] = (rep)[i];

1021 if (j < i) {

1022 (*rep)[i] = NULL;

1023 (*pos)[i] = 0;

1024 (*cut)[i] = 0;

1025 }

1026 }

1027 }

1028 hyphens[j + 1] = '\0';

1029 return 0;

1030 }

1031

1032 /* get the word with all possible hyphenations (output: hyphword) */

1033 void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,

1034 char * hyphword, char * rep, int pos, int ** cut)

1035 {

1036 int i, j;

1037 for (i = 0, j = 0; i < l; i++, j++) {

1038 if (hyphens[i]&1) {

1039 hyphword[j] = word[i];

1040 if (rep && pos && cut && (rep)[i]) {

1041 strcpy(hyphword + j - (pos)[i] + 1, (rep)[i]);

1042 j += strlen((rep)[i]) - (pos)[i];

1043 i += (cut)[i] - (pos)[i];

1044 } else hyphword[++j] = '=';

1045 } else hyphword[j] = word[i];

1046 }

1047 hyphword[j] = '\0';

1048 }

1049

1050

1051 /* main api function with default hyphenmin parameters */

1052 int hnj_hyphen_hyphenate2 (HyphenDict *dict,

1053 const char word, int word_size, char hyphens,

1054 char hyphword, char rep, int pos, int ** cut)

1055 {

1056 hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,

1057 dict->clhmin, dict->crhmin, 1, 1);

1058 hnj_hyphen_lhmin(dict->utf8, word, word_size,

1059 hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));

1060 hnj_hyphen_rhmin(dict->utf8, word, word_size,

1061 hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));

1062 if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos , cut);

1063 if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut );

1064 return 0;

1065 }

1066

1067 /* previous main api function with hyphenmin parameters */

1068 int hnj_hyphen_hyphenate3 (HyphenDict *dict,

1069 const char word, int word_size, char hyphens,

1070 char hyphword, char rep, int pos, int ** cut,

1071 int lhmin, int rhmin, int clhmin, int crhmin)

1072 {

1073 lhmin = (lhmin > 0 ? lhmin : dict->lhmin);

1074 rhmin = (rhmin > 0 ? rhmin : dict->rhmin);

1075 hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,

1076 clhmin, crhmin, 1, 1);

1077 hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,

1078 rep, pos, cut, (lhmin > 0 ? lhmin : 2));

1079 hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,

1080 rep, pos, cut, (rhmin > 0 ? rhmin : 2));

1081 if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos , cut);

1082 if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut );

1083 return 0;

1084 }

OLD	NEW

« no previous file with comments | « third_party/hyphen/hyphen.h ('k') | third_party/hyphen/hyphen.gyp » ('j') | no next file with comments »