| OLD | NEW |
| (Empty) |
| 1 #!/usr/bin/perl | |
| 2 # convert TeX (Patgen) hyphenation patterns to Libhnj format | |
| 3 # (A utility for finding substring embeddings in patterns) | |
| 4 # usage: substrings.pl inputfile outputfile [encoding] | |
| 5 | |
| 6 # Libhnj is dual licensed under LGPL and MPL. Boilerplate for both | |
| 7 # licenses follows. | |
| 8 | |
| 9 # LibHnj - a library for high quality hyphenation and justification | |
| 10 # Copyright (C) 1998 Raph Levien, | |
| 11 # (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), | |
| 12 # (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) | |
| 13 # (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo) | |
| 14 # | |
| 15 # This library is free software; you can redistribute it and/or | |
| 16 # modify it under the terms of the GNU Library General Public | |
| 17 # License as published by the Free Software Foundation; either | |
| 18 # version 2 of the License, or (at your option) any later version. | |
| 19 # | |
| 20 # This library is distributed in the hope that it will be useful, | |
| 21 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 23 # Library General Public License for more details. | |
| 24 # | |
| 25 # You should have received a copy of the GNU Library General Public | |
| 26 # License along with this library; if not, write to the | |
| 27 # Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
| 28 # Boston, MA 02111-1307 USA. | |
| 29 | |
| 30 # The contents of this file are subject to the Mozilla Public License | |
| 31 # Version 1.0 (the "MPL"); you may not use this file except in | |
| 32 # compliance with the MPL. You may obtain a copy of the MPL at | |
| 33 # http://www.mozilla.org/MPL/ | |
| 34 # | |
| 35 # Software distributed under the MPL is distributed on an "AS IS" basis, | |
| 36 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL | |
| 37 # for the specific language governing rights and limitations under the | |
| 38 # MPL. | |
| 39 | |
# Both an input and an output file name are required; otherwise print the
# usage text and stop.
if (!defined $ARGV[1]) {
    print "" .
        "substrings.pl - convert TeX (Patgen) hyphenation patterns to Libhnj format\n" .
        "(A utility for finding substring embeddings in patterns)\n" .
        "usage: substrings.pl infile outfile [encoding [lefthyphenmin [righthyphenmin]]]\n";
    exit 1;
}

# Fall back to "hyphen.us" when the named input file does not exist.
$fn = $ARGV[0];
if (!-e $fn) { $fn = "hyphen.us"; }

# Three-argument open keeps characters in the file names from being read as
# open modes; failures are reported instead of silently yielding no output.
open(my $hyph, '<', $fn)
    or die "substrings.pl: cannot open '$fn' for reading: $!\n";
open(OUT, '>', $ARGV[1])
    or die "substrings.pl: cannot open '$ARGV[1]' for writing: $!\n";

# Optional header lines of the output file.  $encoding stays a package
# variable because enclen() consults it.
$encoding = $ARGV[2];
$lhmin = $ARGV[3];
$rhmin = $ARGV[4];
if (defined $encoding) { print OUT "$encoding\n"; }
if (defined $lhmin) { print OUT "LEFTHYPHENMIN $lhmin\n"; }
if (defined $rhmin) { print OUT "RIGHTHYPHENMIN $rhmin\n"; }

# Read the patterns, one per line.  Accepted forms:
#   pattern/replacement,begin,length   non-standard hyphenation, explicit span
#   pattern/replacement                non-standard hyphenation, whole pattern
#   pattern                            plain pattern
# Fills @patlist (digit-free patterns in input order), %pattab (digit-free
# pattern -> original pattern) and, for replacement patterns, %repltab,
# %replbeg (zero-based start) and %repllen.
while (my $line = <$hyph>) {
    # Strip a "%" comment (and the blanks before it) from the current line.
    # A comment-only line is left empty and matches none of the forms below.
    $line =~ s/\s*%.*$//;

    my ($origpat, $repl, $beg, $len);
    if ($line =~ m{^(.+)/([^,]+),([0-9]+),([0-9]+)$}) {
        ($origpat, $repl, $beg, $len) = ($1, $2, $3, $4);
    } elsif ($line =~ m{^(.+)/(.+)$}) {
        ($origpat, $repl) = ($1, $2);
    } elsif ($line =~ /^(.+)$/) {
        $origpat = $1;
    } else {
        next;
    }

    # The lookup key is the pattern with all hyphenation digits removed.
    (my $pat = $origpat) =~ s/\d//g;

    if (defined $repl) {
        # A replacement rule without any digit has no hyphenation point.
        if ($origpat eq $pat) {
            print "error - missing hyphenation point: $line";
            exit 1;
        }
        $repltab{$pat} = $repl;
        # The input gives a one-based start; without an explicit span the
        # replacement covers the whole pattern.
        $replbeg{$pat} = defined $beg ? $beg - 1 : 0;
        $repllen{$pat} = defined $len ? $len : enclen($pat);
    }

    push @patlist, $pat;
    $pattab{$pat} = $origpat;
}
close($hyph);
| 101 | |
# For every pattern, look at each of its substrings.  Whenever a substring is
# itself a known pattern, the prefix of the outer pattern that ends where the
# substring ends gets an entry in %newpattab: created from the prefix text
# plus the embedded pattern on first sight, merged via combine() afterwards.
# @newpatlist records the prefixes in the order they were first created.
#
# The loop variables are lexical on purpose: the helper subs called inside
# the loop may assign to package variables named $i and $j, which must not
# disturb the positions used here.
foreach my $pat (@patlist) {
    my $patsize = length $pat;
    for my $i (0 .. $patsize - 1) {
        # Text of the outer pattern that precedes the substring.
        my $ss = substr($pat, 0, $i);
        for my $j (1 .. $patsize - $i) {
            # $subpat is deliberately left a package variable, since it is
            # visible to code outside this loop.
            $subpat = substr($pat, $i, $j);
            next unless defined $pattab{$subpat};

            print "$pattab{$subpat} is embedded in $pattab{$pat}\n";
            my $newpat = substr($pat, 0, $i + $j);

            if (!defined $newpattab{$newpat}) {
                $newpattab{$newpat} = $ss . $pattab{$subpat};
                print "$ss+$pattab{$subpat}\n";
                push @newpatlist, $newpat;

                if (defined $repltab{$subpat}) {
                    # Carry the replacement over, shifting its start by the
                    # length of the prefix; one less when the outer pattern
                    # begins with "." and the embedded one does not.
                    my $begcorr =
                        (($pat =~ /^[.]/) && !($subpat =~ /^[.]/)) ? 1 : 0;
                    $newrepltab{$newpat} = $repltab{$subpat};
                    $newreplbeg{$newpat} =
                        $replbeg{$subpat} + enclen($ss) - $begcorr;
                    $newrepllen{$newpat} = $repllen{$subpat};
                }
            } else {
                my $tmp = $newpattab{$newpat};
                $newpattab{$newpat} =
                    combine($newpattab{$newpat}, $pattab{$subpat});
                print "$tmp + $pattab{$subpat} -> $newpattab{$newpat}\n";
            }
        }
    }
}
| 132 | |
# Write the resulting patterns in the order they were created.  An entry
# with a replacement is written as "pattern/replacement,begin,length", with
# the stored zero-based start converted back to one-based.
foreach my $pat (@newpatlist) {
    my $line = $newpattab{$pat};
    if (defined $newrepltab{$pat}) {
        $line .= "/" . $newrepltab{$pat}
               . "," . ($newreplbeg{$pat} + 1)
               . "," . $newrepllen{$pat};
    }
    print OUT $line . "\n";
}

# Buffered write errors only surface when the handle is closed, so check it.
close(OUT) or die "substrings.pl: error closing output file: $!\n";
| 140 | |
# Convert a pattern such as 'n1im' to 0n1i0m0, expressed as a list: a 0 is
# inserted wherever two non-digit characters are adjacent, before a leading
# non-digit and after a trailing one.
sub expand {
    my ($pattern) = @_;
    my @expanded;

    # Start as if a non-digit character preceded the pattern.
    my $prev_is_letter = 1;

    for my $char (split(//, $pattern)) {
        my $is_letter = ($char =~ /[\D]/) ? 1 : 0;
        push @expanded, 0 if $prev_is_letter && $is_letter;
        push @expanded, $char;
        $prev_is_letter = $is_letter;
    }
    push @expanded, 0 if $prev_is_letter;

    return @expanded;
}
| 159 | |
# Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
# The second pattern needs to be a substring of the first (modulo digits).
#
# Arguments: the outer pattern and the embedded pattern, both with digits.
# Returns:   the outer pattern in which every digit position covered by an
#            occurrence of the embedded pattern holds the larger of the two
#            values (zeros are omitted from the result).
# Side effects: prints a trace line for each occurrence found and may add
#            entries to %newrepltab, %newreplbeg and %newrepllen.
sub combine {
    my ($pattern, $subpattern) = @_;

    # Expanded form: digits sit at even indices, characters at odd ones.
    my @exp    = expand($pattern);
    my @subexp = expand($subpattern);

    # Digit-free text of both patterns.
    my $pat1 = join('', grep { !/\d/ } @exp);
    my $pat2 = join('', grep { !/\d/ } @subexp);

    my $begcorr = ($pat1 =~ /^[.]/) ? 1 : 0;
    my $sublen  = length $pat2;

    for my $i (0 .. length($pat1) - $sublen) {
        # Compare against this call's own sub-pattern text rather than any
        # variable of the caller.
        next unless substr($pat1, $i, $sublen) eq $pat2;

        for (my $j = 0; $j < @subexp; $j += 2) {
            next unless $subexp[$j] > $exp[2 * $i + $j];
            $exp[2 * $i + $j] = $subexp[$j];

            # NOTE(review): the replacement tables are looked up here by the
            # digit-free texts of both patterns; confirm these are the
            # intended keys.
            if (defined $newrepltab{$pat2} && !defined $newrepltab{$pat1}) {
                my $ss = substr($pat1, 0, $i);
                $newrepltab{$pat1} = $newrepltab{$pat2};
                $newreplbeg{$pat1} = $newreplbeg{$pat2} + enclen($ss) - $begcorr;
                $newrepllen{$pat1} = $newrepllen{$pat2};
            }
        }
        print("$pat1 includes $pat2 at pos $i\n");
    }

    return join('', grep { $_ ne '0' } @exp);
}
| 191 | |
# 8 bit or UTF-8 character length (calculating the right start position for
# discretionary hyphenation).
#
# Argument: a string.
# Returns:  its length; when the package variable $encoding is "UTF-8",
#           bytes whose two top bits are '10' (0x80-0xBF) are not counted.
# Touches no package variable, so it is safe to call from inside loops that
# use $i as their index.
sub enclen {
    my ($str) = @_;
    my $len = length($str);
    if (defined $encoding && $encoding eq "UTF-8") {
        # With an empty replacement list tr/// only counts the matches.
        $len -= ($str =~ tr/\x80-\xBF//);
    }
    return $len;
}
| OLD | NEW |