Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(333)

Side by Side Diff: third_party/hyphen/substrings.pl

Issue 20860003: Remove hyphenation code from Chromium. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: rebase Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « third_party/hyphen/substrings.c ('k') | third_party/hyphen/tbhyphext.sh » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/perl
2 # convert TeX (Patgen) hyphenation patterns to Libhnj format
3 # (A utility for finding substring embeddings in patterns)
4 # usage: substrings.pl inputfile outputfile [encoding]
5
6 # Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
7 # licenses follows.
8
9 # LibHnj - a library for high quality hyphenation and justification
10 # Copyright (C) 1998 Raph Levien,
11 # (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
12 # (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
13 # (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
14 #
15 # This library is free software; you can redistribute it and/or
16 # modify it under the terms of the GNU Library General Public
17 # License as published by the Free Software Foundation; either
18 # version 2 of the License, or (at your option) any later version.
19 #
20 # This library is distributed in the hope that it will be useful,
21 # but WITHOUT ANY WARRANTY; without even the implied warranty of
22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 # Library General Public License for more details.
24 #
25 # You should have received a copy of the GNU Library General Public
26 # License along with this library; if not, write to the
27 # Free Software Foundation, Inc., 59 Temple Place - Suite 330,
28 # Boston, MA 02111-1307 USA.
29
30 # The contents of this file are subject to the Mozilla Public License
31 # Version 1.0 (the "MPL"); you may not use this file except in
32 # compliance with the MPL. You may obtain a copy of the MPL at
33 # http://www.mozilla.org/MPL/
34 #
35 # Software distributed under the MPL is distributed on an "AS IS" basis,
36 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
37 # for the specific language governing rights and limitations under the
38 # MPL.
39
# Command-line handling and output header.
#
# Expected arguments:
#   infile outfile [encoding [lefthyphenmin [righthyphenmin]]]
# When fewer than two arguments are given, a usage message is printed and the
# script exits with status 1.
if (!defined $ARGV[1]) {
    print "" .
"substrings.pl - convert TeX (Patgen) hyphenation patterns to Libhnj format\n" .
"(A utility for finding substring embeddings in patterns)\n" .
"usage: substrings.pl infile outfile [encoding [lefthyphenmin [righthyphenmin]]] \n";
    exit 1;
}

# Use the default file name "hyphen.us" when the named input does not exist.
$fn = $ARGV[0];
if (!-e $fn) { $fn = "hyphen.us"; }

# Three-argument open keeps characters in the file names from being read as
# open modes, and failing loudly avoids silently producing an empty result
# when either file cannot be opened.
open(HYPH, '<', $fn)
    or die "substrings.pl: cannot open '$fn' for reading: $!\n";
open(OUT, '>', $ARGV[1])
    or die "substrings.pl: cannot open '$ARGV[1]' for writing: $!\n";

# Optional settings; each one that was supplied is echoed to the top of the
# output file. $encoding is also consulted by enclen().
$encoding = $ARGV[2];
$lhmin = $ARGV[3];
$rhmin = $ARGV[4];
if (defined $encoding) { print OUT "$encoding\n"; }
if (defined $lhmin) { print OUT "LEFTHYPHENMIN $lhmin\n"; }
if (defined $rhmin) { print OUT "RIGHTHYPHENMIN $rhmin\n"; }
57
# Read the pattern file line by line and index every pattern.
#
# For each pattern, the digit-free form is appended to @patlist and mapped to
# its original (digit-bearing) text in %pattab. Lines of the form
# "pat/repl,beg,len" or "pat/repl" additionally record replacement data in
# %repltab, %replbeg (stored zero-based) and %repllen.
while (<HYPH>)
{
    # Strip a '%' comment (and the blanks in front of it) from the line that
    # was just read. The earlier code applied this substitution to $pat, the
    # previous iteration's pattern, so it never affected the current line.
    # A comment-only line becomes empty and matches none of the branches.
    s/[ \t]*%.*$//;

    if (/^(.+)\/([^,]+),([0-9]+),([0-9]+)$/) {
        # pattern/replacement,begin,length
        my ($origpat, $repl, $beg, $len) = ($1, $2, $3, $4);
        (my $pat = $origpat) =~ s/\d//g;
        if ($origpat eq $pat) {
            # No digit was removed, so the pattern has no hyphenation point.
            print "error - missing hyphenation point: $_";
            exit 1;
        }
        push @patlist, $pat;
        $pattab{$pat} = $origpat;
        $repltab{$pat} = $repl;
        $replbeg{$pat} = $beg - 1;
        $repllen{$pat} = $len;
    } elsif (/^(.+)\/(.+)$/) {
        # pattern/replacement without explicit position: the replacement
        # starts at offset 0 and spans the whole pattern.
        my ($origpat, $repl) = ($1, $2);
        (my $pat = $origpat) =~ s/\d//g;
        if ($origpat eq $pat) {
            print "error - missing hyphenation point: $_";
            exit 1;
        }
        push @patlist, $pat;
        $pattab{$pat} = $origpat;
        $repltab{$pat} = $repl;
        $replbeg{$pat} = 0;
        $repllen{$pat} = enclen($pat);
    } elsif (/^(.+)$/) {
        # plain pattern
        my $origpat = $1;
        (my $pat = $origpat) =~ s/\d//g;
        push @patlist, $pat;
        $pattab{$pat} = $origpat;
    }
}
101
# For every pattern, look at each of its substrings; whenever a substring is
# itself a known pattern, build (or extend) the entry for the prefix of the
# pattern that ends with that substring.
#
# Results go to @newpatlist (prefix keys in first-seen order), %newpattab
# (merged pattern text) and, when the embedded pattern has replacement data,
# %newrepltab / %newreplbeg / %newrepllen. Progress is printed to STDOUT.
#
# The loop variables are lexical on purpose: as package globals they were
# visible to the helper subs, which assign to $i and $j themselves and could
# thereby change the index in the middle of an iteration.
foreach my $pat (@patlist) {
    my $patsize = length $pat;
    for my $i (0..$patsize - 1) {
        for my $j (1..$patsize - $i) {
            # Left as a package variable rather than a lexical so that helper
            # subs relying on it keep working.
            $subpat = substr ($pat, $i, $j);
            next unless defined $pattab{$subpat};

            print "$pattab{$subpat} is embedded in $pattab{$pat}\n";
            my $newpat = substr $pat, 0, $i + $j;
            if (!defined $newpattab{$newpat}) {
                # First pattern found for this prefix: digit-free lead-in
                # followed by the embedded pattern with its digits.
                my $ss = substr $pat, 0, $i;
                $newpattab{$newpat} = $ss.$pattab{$subpat};
                print "$ss+$pattab{$subpat}\n";
                push @newpatlist, $newpat;
                if (defined $repltab{$subpat}) {
                    # Shift the replacement start by the lead-in length; one
                    # less when the pattern starts with '.' but the embedded
                    # pattern does not.
                    my $begcorr = (($pat =~ /^[.]/) && !($subpat =~ /^[.]/)) ? 1 : 0;
                    $newrepltab{$newpat} = $repltab{$subpat};
                    $newreplbeg{$newpat} = $replbeg{$subpat} + enclen($ss) - $begcorr;
                    $newrepllen{$newpat} = $repllen{$subpat};
                }
            } else {
                # The prefix already has an entry: merge the embedded pattern
                # into it.
                my $tmp = $newpattab{$newpat};
                $newpattab{$newpat} = combine ($tmp, $pattab{$subpat});
                print "$tmp + $pattab{$subpat} -> $newpattab{$newpat}\n";
            }
        }
    }
}
132
# Write one line per entry of @newpatlist to OUT, in list order. Entries that
# have replacement data are written as "pattern/replacement,begin,length",
# where begin is the stored offset plus one; all others are written as the
# bare pattern.
for my $key (@newpatlist) {
    my $line = $newpattab{$key};
    if (defined $newrepltab{$key}) {
        $line .= "/" . $newrepltab{$key}
               . "," . ($newreplbeg{$key} + 1)
               . "," . $newrepllen{$key};
    }
    print OUT $line . "\n";
}
140
# Convert a pattern such as 'n1im' into the list (0, n, 1, i, 0, m, 0).
#
# A 0 is inserted wherever two non-digit characters are adjacent, before a
# leading non-digit and after a trailing non-digit; every character of the
# pattern is otherwise copied through unchanged.
#
# Argument: the pattern string.
# Returns:  the expanded list.
sub expand {
    my ($pattern) = @_;
    my @cells;

    # The scan starts as if a non-digit had just been seen, so a pattern that
    # begins with a non-digit gets a leading 0.
    my $prev_nondigit = 1;

    for my $ch (split //, $pattern) {
        my $is_nondigit = ($ch =~ /[\D]/) ? 1 : 0;
        push @cells, 0 if $prev_nondigit && $is_nondigit;
        push @cells, $ch;
        $prev_nondigit = $is_nondigit;
    }
    push @cells, 0 if $prev_nondigit;

    return @cells;
}
159
# Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
# The second pattern needs to be a substring of the first (modulo digits).
#
# Arguments: the enclosing pattern and the embedded pattern, both with their
#            hyphenation digits.
# Returns:   the merged pattern; at every position where the embedded pattern
#            occurs, each value slot keeps the larger of the two values.
# Side effects: prints one line to STDOUT per occurrence found, and may copy
#            replacement data in %newrepltab / %newreplbeg / %newrepllen from
#            the embedded pattern's key to the enclosing pattern's key.
sub combine {
    my ($outer, $inner) = @_;
    my @exp = expand($outer);
    my @subexp = expand($inner);

    # Digit-free forms of both patterns. (These used to be declared with
    # "my $pat1, $pat2;", which makes only $pat1 lexical.)
    my $pat1 = join ('', grep { $_ !~ /\d/ } @exp);
    my $pat2 = join ('', grep { $_ !~ /\d/ } @subexp);

    # NOTE(review): unlike the top-level embedding loop, this correction is
    # applied even when the embedded pattern itself starts with '.' -
    # confirm whether that is intended.
    my $begcorr = ($pat1 =~ /^[.]/) ? 1 : 0;

    my $sublen = length $pat2;
    for my $i (0 .. length($pat1) - $sublen) {
        # Compare against this call's own embedded pattern instead of the
        # caller's package variable $subpat, so the result depends only on
        # the arguments.
        next unless substr($pat1, $i, $sublen) eq $pat2;

        # Even indices of the expanded lists hold the values; character k of
        # the digit-free text sits at index 2*k+1.
        for (my $j = 0; $j < @subexp; $j += 2) {
            if ($subexp[$j] > $exp[2 * $i + $j]) {
                $exp[2 * $i + $j] = $subexp[$j];
                if (defined $newrepltab{$pat2} && !defined $newrepltab{$pat1}) {
                    my $ss = substr ($pat1, 0, $i);
                    $newrepltab{$pat1} = $newrepltab{$pat2};
                    $newreplbeg{$pat1} = $newreplbeg{$pat2} + enclen($ss) - $begcorr;
                    $newrepllen{$pat1} = $newrepllen{$pat2};
                }
            }
        }
        print "$pat1 includes $pat2 at pos $i\n";
    }

    # Drop the 0 placeholders again to get back to pattern notation.
    return join ('', grep { $_ ne '0' } @exp);
}
191
# 8 bit or UTF-8 character length (calculating the right start position for
# discretionary hyphenation).
#
# Argument: a string.
# Returns:  length() of the string, minus - when the global $encoding is
#           "UTF-8" - the number of characters whose value lies in
#           0x80..0xBF, i.e. bytes whose two top bits are '10'.
sub enclen {
    my ($text) = @_;
    my $len = length($text);
    my $nonchar = 0;
    if ($encoding eq "UTF-8") {
        # The length of a UTF-8 string equals the count of the bytes that do
        # not start with the bits '10'. A counting tr/// needs no loop index;
        # the former loop used the package variable $i and thereby overwrote
        # the caller's loop index of the same name.
        $nonchar = ($text =~ tr/\x80-\xBF//);
    }
    return $len - $nonchar;
}
OLDNEW
« no previous file with comments | « third_party/hyphen/substrings.c ('k') | third_party/hyphen/tbhyphext.sh » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698