| Index: third_party/hyphen/substrings.pl
|
| diff --git a/third_party/hyphen/substrings.pl b/third_party/hyphen/substrings.pl
|
| deleted file mode 100755
|
| index 1a4f94fbb971976e98a91cf3af1baecd648e777e..0000000000000000000000000000000000000000
|
| --- a/third_party/hyphen/substrings.pl
|
| +++ /dev/null
|
| @@ -1,203 +0,0 @@
|
| -#!/usr/bin/perl
|
| -# convert TeX (Patgen) hyphenation patterns to Libhnj format
|
| -# (A utility for finding substring embeddings in patterns)
|
| -# usage: substrings.pl inputfile outputfile [encoding]
|
| -
|
| -# Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
|
| -# licenses follows.
|
| -
|
| -# LibHnj - a library for high quality hyphenation and justification
|
| -# Copyright (C) 1998 Raph Levien,
|
| -# (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
|
| -# (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
|
| -# (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
|
| -#
|
| -# This library is free software; you can redistribute it and/or
|
| -# modify it under the terms of the GNU Library General Public
|
| -# License as published by the Free Software Foundation; either
|
| -# version 2 of the License, or (at your option) any later version.
|
| -#
|
| -# This library is distributed in the hope that it will be useful,
|
| -# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| -# Library General Public License for more details.
|
| -#
|
| -# You should have received a copy of the GNU Library General Public
|
| -# License along with this library; if not, write to the
|
| -# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
| -# Boston, MA 02111-1307 USA.
|
| -
|
| -# The contents of this file are subject to the Mozilla Public License
|
| -# Version 1.0 (the "MPL"); you may not use this file except in
|
| -# compliance with the MPL. You may obtain a copy of the MPL at
|
| -# http://www.mozilla.org/MPL/
|
| -#
|
| -# Software distributed under the MPL is distributed on an "AS IS" basis,
|
| -# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
|
| -# for the specific language governing rights and limitations under the
|
| -# MPL.
|
| -
|
| -if (!defined $ARGV[1]) {
|
| - print "" .
|
| -"substrings.pl - convert TeX (Patgen) hyphenation patterns to Libhnj format\n" .
|
| -"(A utility for finding substring embeddings in patterns)\n" .
|
| -"usage: substrings.pl infile outfile [encoding [lefthyphenmin [righthyphenmin]]]\n";
|
| - exit 1;
|
| -}
|
| -$fn = $ARGV[0];
|
| -if (!-e $fn) { $fn = "hyphen.us"; }
|
| -open HYPH, $fn;
|
| -open OUT, ">$ARGV[1]";
|
| -$encoding = $ARGV[2];
|
| -$lhmin = $ARGV[3];
|
| -$rhmin = $ARGV[4];
|
| -if (defined $encoding) { print OUT "$encoding\n"; }
|
| -if (defined $lhmin) { print OUT "LEFTHYPHENMIN $lhmin\n"; }
|
| -if (defined $rhmin) { print OUT "RIGHTHYPHENMIN $rhmin\n"; }
|
| -
|
| -while (<HYPH>)
|
| -{
|
| - $pat =~ s/%.*$//g;
|
| - if (/^\%/) {
|
| - #comment, ignore
|
| - } elsif (/^(.+)\/([^,]+),([0-9]+),([0-9]+)$/) {
|
| - $origpat = $1;
|
| - $pat = $1;
|
| - $repl = $2;
|
| - $beg = $3;
|
| - $len = $4;
|
| - $pat =~ s/\d//g;
|
| - if ($origpat eq $pat) {
|
| - print "error - missing hyphenation point: $_";
|
| - exit 1;
|
| - }
|
| - push @patlist, $pat;
|
| - $pattab{$pat} = $origpat;
|
| - $repltab{$pat} = $repl;
|
| - $replbeg{$pat} = $beg - 1;
|
| - $repllen{$pat} = $len;
|
| - } elsif (/^(.+)\/(.+)$/) {
|
| - $origpat = $1;
|
| - $pat = $1;
|
| - $repl = $2;
|
| - $pat =~ s/\d//g;
|
| - if ($origpat eq $pat) {
|
| - print "error - missing hyphenation point: $_";
|
| - exit 1;
|
| - }
|
| - push @patlist, $pat;
|
| - $pattab{$pat} = $origpat;
|
| - $repltab{$pat} = $repl;
|
| - $replbeg{$pat} = 0;
|
| - $repllen{$pat} = enclen($pat);
|
| - } elsif (/^(.+)$/) {
|
| - $origpat = $1;
|
| - $pat = $1;
|
| - $pat =~ s/\d//g;
|
| - push @patlist, $pat;
|
| - $pattab{$pat} = $origpat;
|
| - }
|
| -}
|
| -
|
| -foreach $pat (@patlist) {
|
| - $patsize = length $pat;
|
| - for $i (0..$patsize - 1) {
|
| - for $j (1..$patsize - $i) {
|
| - $subpat = substr ($pat, $i, $j);
|
| - if (defined $pattab{$subpat}) {
|
| - print "$pattab{$subpat} is embedded in $pattab{$pat}\n";
|
| - $newpat = substr $pat, 0, $i + $j;
|
| - if (!defined $newpattab{$newpat}) {
|
| - $newpattab{$newpat} =
|
| - substr ($pat, 0, $i).$pattab{$subpat};
|
| - $ss = substr $pat, 0, $i;
|
| - print "$ss+$pattab{$subpat}\n";
|
| - push @newpatlist, $newpat;
|
| - if (defined $repltab{$subpat}) {
|
| - $begcorr = (($pat =~ /^[.]/) && !($subpat =~ /^[.]/)) ? 1 : 0;
|
| - $newrepltab{$newpat} = $repltab{$subpat};
|
| - $newreplbeg{$newpat} = $replbeg{$subpat} + enclen($ss) - $begcorr;
|
| - $newrepllen{$newpat} = $repllen{$subpat};
|
| - }
|
| - } else {
|
| - $tmp = $newpattab{$newpat};
|
| - $newpattab{$newpat} =
|
| - combine ($newpattab{$newpat}, $pattab{$subpat});
|
| - print "$tmp + $pattab{$subpat} -> $newpattab{$newpat}\n";
|
| - }
|
| - }
|
| - }
|
| - }
|
| -}
|
| -
|
| -foreach $pat (@newpatlist) {
|
| - if (defined $newrepltab{$pat}) {
|
| - print OUT $newpattab{$pat}."/".$newrepltab{$pat}.",".($newreplbeg{$pat}+1).",".$newrepllen{$pat}."\n";
|
| - } else {
|
| - print OUT $newpattab{$pat}."\n";
|
| - }
|
| -}
|
| -
|
| -#convert 'n1im' to 0n1i0m0 expresed as a list
|
| -sub expand {
|
| - my ($pat) = @_;
|
| - my $last = '.';
|
| - my @exp = ();
|
| -
|
| - foreach $c (split (//, $pat)) {
|
| - if ($last =~ /[\D]/ && $c =~ /[\D]/) {
|
| - push @exp, 0;
|
| - }
|
| - push @exp, $c;
|
| - $last = $c;
|
| - }
|
| - if ($last =~ /[\D]/) {
|
| - push @exp, 0;
|
| - }
|
| - return @exp;
|
| -}
|
| -
|
| -# Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
|
| -# The second pattern needs to be a substring of the first (modulo digits)
|
| -sub combine {
|
| - my @exp = expand shift;
|
| - my @subexp = expand shift;
|
| - my $pat1, $pat2;
|
| - my $i;
|
| -
|
| - $pat1 = join ('', map { $_ =~ /\d/ ? () : $_ } @exp);
|
| - $pat2 = join ('', map { $_ =~ /\d/ ? () : $_ } @subexp);
|
| -
|
| - $begcorr = ($pat1 =~ /^[.]/) ? 1 : 0;
|
| -
|
| - for $i (0..length ($pat1) - length ($pat2)) {
|
| - if (substr ($pat1, $i, length $pat2) eq $subpat) {
|
| - for ($j = 0; $j < @subexp; $j += 2) {
|
| - if ($subexp[$j] > $exp[2 * $i + $j]) {
|
| - $exp[2 * $i + $j] = $subexp[$j];
|
| - if (defined $newrepltab{$pat2} && !defined $newrepltab{$pat1}) {
|
| - $ss = substr ($pat1, 0, $i);
|
| - $newrepltab{$pat1} = $newrepltab{$pat2};
|
| - $newreplbeg{$pat1} = $newreplbeg{$pat2} + enclen($ss) - $begcorr;
|
| - $newrepllen{$pat1} = $newrepllen{$pat2};
|
| - }
|
| - }
|
| - }
|
| - print ("$pat1 includes $pat2 at pos $i\n");
|
| - }
|
| - }
|
| - return join ('', map { $_ eq '0' ? () : $_ } @exp);
|
| -}
|
| -
|
| -# 8 bit or UTF-8 character length (calculating right start position for discretionary hyphenation)
|
| -sub enclen {
|
| - my $nonchar = 0;
|
| - my $len = length($_[0]);
|
| - if ($encoding eq "UTF-8") {
|
| - # length of an UTF-8 string equals to the count of the characters not started with '10' bits
|
| - for ($i = 0; $i < $len; $i++) {
|
| - if ((ord(substr($_[0], $i, 1)) >> 6) == 2) { $nonchar++; }
|
| - }
|
| - }
|
| - return $len - $nonchar;
|
| -}
|
|
|