OLD | NEW |
| (Empty) |
1 /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both | |
2 * licenses follows. | |
3 */ | |
4 | |
5 /* LibHnj - a library for high quality hyphenation and justification | |
6 * Copyright (C) 1998 Raph Levien, | |
7 * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), | |
8 * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) | |
9 * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo) | |
10 * | |
11 * This library is free software; you can redistribute it and/or | |
12 * modify it under the terms of the GNU Library General Public | |
13 * License as published by the Free Software Foundation; either | |
14 * version 2 of the License, or (at your option) any later version. | |
15 * | |
16 * This library is distributed in the hope that it will be useful, | |
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
19 * Library General Public License for more details. | |
20 * | |
21 * You should have received a copy of the GNU Library General Public | |
22 * License along with this library; if not, write to the | |
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
24 * Boston, MA 02111-1307 USA. | |
25 */ | |
26 | |
27 /* | |
28 * The contents of this file are subject to the Mozilla Public License | |
29 * Version 1.0 (the "MPL"); you may not use this file except in | |
30 * compliance with the MPL. You may obtain a copy of the MPL at | |
31 * http://www.mozilla.org/MPL/ | |
32 * | |
33 * Software distributed under the MPL is distributed on an "AS IS" basis, | |
34 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL | |
35 * for the specific language governing rights and limitations under the | |
36 * MPL. | |
37 * | |
38 */ | |
39 | |
40 // | |
41 // A utility for finding substring embeddings in patterns | |
42 | |
43 #include <stdio.h> | |
44 #include <string.h> | |
45 #include <stdlib.h> | |
46 | |
47 #define MAXPATHS (256*1024) | |
48 | |
49 // | |
50 // | |
51 static void die( | |
52 const char*msg | |
53 ) { | |
54 fprintf(stderr,"%s\n",msg); | |
55 exit(1); | |
56 } | |
57 | |
58 | |
59 // Finds the index of an entry, only used on xxx_key arrays | |
60 // Caveat: the table has to be sorted | |
61 static int find_in( | |
62 char *tab[], | |
63 int max, | |
64 const char *pat | |
65 ) { | |
66 int left=0, right=max-1; | |
67 while (left <= right) { | |
68 int mid = ((right-left)/2)+left; | |
69 int v = strcmp(pat,tab[mid]); | |
70 if (v>0) { | |
71 left = mid + 1; | |
72 } else if (v<0) { | |
73 right = mid -1; | |
74 } else { | |
75 return mid; | |
76 } | |
77 } | |
78 return -1; | |
79 } | |
80 | |
81 | |
82 // used by partition (which is used by qsort_arr) | |
83 // | |
84 static void swap2( | |
85 char *a[], | |
86 char *b[], | |
87 int i, | |
88 int j | |
89 ) { | |
90 if (i==j) return; | |
91 char*v; | |
92 v=a[i]; a[i]=a[j]; a[j]=v; | |
93 v=b[i]; b[i]=b[j]; b[j]=v; | |
94 } | |
95 | |
96 | |
97 // used by qsort_arr | |
98 // | |
99 static int partition( | |
100 char *a[], | |
101 char *b[], | |
102 int left, | |
103 int right, | |
104 int p | |
105 ) { | |
106 const char *pivotValue = a[p]; | |
107 int i; | |
108 swap2(a,b,p,right); // Move pivot to end | |
109 p = left; | |
110 for (i=left; i<right; i++) { | |
111 if (strcmp(a[i],pivotValue)<=0) { | |
112 swap2(a,b,p,i); | |
113 p++; | |
114 } | |
115 } | |
116 swap2(a,b,right,p); // Move pivot to its final place | |
117 return p; | |
118 } | |
119 | |
120 | |
121 // | |
122 // | |
123 static void qsort_arr( | |
124 char *a[], | |
125 char *b[], | |
126 int left, | |
127 int right | |
128 ) { | |
129 while (right > left) { | |
130 int p = left + (right-left)/2; //select a pivot | |
131 p = partition(a,b, left, right, p); | |
132 if ((p-1) - left < right - (p+1)) { | |
133 qsort_arr(a,b, left, p-1); | |
134 left = p+1; | |
135 } else { | |
136 qsort_arr(a,b, p+1, right); | |
137 right = p-1; | |
138 } | |
139 } | |
140 } | |
141 | |
142 | |
143 // Removes extra '0' entries from the string | |
144 // | |
145 static char* compact( | |
146 char *expr | |
147 ) { | |
148 int l=strlen(expr); | |
149 int i,j; | |
150 for (i=0,j=0; i<l; i++) { | |
151 if (expr[i]!='0') expr[j++] = expr[i]; | |
152 } | |
153 expr[j]=0; | |
154 return expr; | |
155 } | |
156 | |
157 | |
158 // convert 'n1im' to 0n1i0m0 expressed as a string | |
159 // | |
160 static void expand( | |
161 char *expr, | |
162 const char *pat, | |
163 int l | |
164 ) { | |
165 int el = 0; | |
166 char last = '.'; | |
167 int i; | |
168 for (i=0; i<l; i++) { | |
169 char c = pat[i]; | |
170 if ( (last<'0' || last>'9') | |
171 && (c <'0' || c >'9') | |
172 ) { | |
173 expr[el++] = '0'; | |
174 } | |
175 expr[el++] = c; | |
176 last = c; | |
177 } | |
178 if (last<'0' || last>'9') expr[el++] = '0'; | |
179 expr[el]=0; | |
180 } | |
181 | |
182 | |
183 // Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der | |
184 // The second pattern needs to be a right side match of the first | |
185 // (modulo digits) | |
186 static char *combine( | |
187 char *expr, | |
188 const char *subexpr | |
189 ) { | |
190 int l1 = strlen(expr); | |
191 int l2 = strlen(subexpr); | |
192 int off = l1-l2; | |
193 int j; | |
194 // this works also for utf8 sequences because the substring is identical | |
195 // to the last substring-length bytes of expr except for the (single byte) | |
196 // hyphenation encoders | |
197 for (j=0; j<l2; j++) { | |
198 if (subexpr[j]>expr[off+j]) { | |
199 expr[off+j] = subexpr[j]; | |
200 } | |
201 } | |
202 return expr; | |
203 } | |
204 | |
205 | |
206 // | |
207 // | |
208 int main(int argc, const char* argv[]) { | |
209 FILE *in, *out; | |
210 char *pattab_key[MAXPATHS]; | |
211 char *pattab_val[MAXPATHS]; | |
212 int patterns = 0; | |
213 char *newpattab_key[MAXPATHS]; | |
214 char *newpattab_val[MAXPATHS]; | |
215 int newpatterns = 0; | |
216 char format[132]; // 64+65+newline+zero+spare | |
217 int p; | |
218 if (argc!=3) die("Usage: <orig-file> <new-file>\n"); | |
219 if ((in = fopen(argv[1],"r"))==NULL) die("Could not read input"); | |
220 if ((out = fopen(argv[2],"w"))==NULL) die("Could not create output"); | |
221 // read all patterns and split in pure text (_key) & expanded patterns (_val) | |
222 while(fgets(format,132,in) != NULL) { | |
223 int l = strlen(format); | |
224 if (format[l-1]=='\n') { l--; format[l]=0; } // Chomp | |
225 if (format[0]=='%' || format[0]==0) { | |
226 // skip | |
227 } else { | |
228 if (format[l-1]=='%') { | |
229 l--; | |
230 format[l] = 0; // remove '%' | |
231 } | |
232 int i,j; | |
233 char *pat = (char*) malloc(l+1); | |
234 char *org = (char*) malloc(l*2+1); | |
235 if (pat==NULL || org==NULL) die("not enough memory"); | |
236 expand(org,format,l); | |
237 // remove hyphenation encoders (digits) from pat | |
238 for (i=0,j=0; i<l; i++) { | |
239 // odd, but utf-8 proof | |
240 char c = format[i]; | |
241 if (c<'0' || c>'9') pat[j++]=c; | |
242 } | |
243 pat[j]=0; | |
244 p = patterns; | |
245 pattab_key[patterns] = pat; | |
246 pattab_val[patterns++] = org; | |
247 if (patterns>MAXPATHS) die("to many base patterns"); | |
248 } | |
249 } | |
250 fclose(in); | |
251 // As we use binairy search, make sure it is sorted | |
252 qsort_arr(pattab_key,pattab_val,0,patterns-1); | |
253 | |
254 for (p=0; p<patterns; p++) { | |
255 char *pat = pattab_key[p]; | |
256 int patsize = strlen(pat); | |
257 int j,l; | |
258 for (l=1; l<=patsize; l++) { | |
259 for (j=1; j<=l; j++) { | |
260 int i = l-j; | |
261 int subpat_ndx; | |
262 char subpat[132]; | |
263 strncpy(subpat,pat+i,j); subpat[j]=0; | |
264 if ((subpat_ndx = find_in(pattab_key,patterns,subpat))>=0) { | |
265 int newpat_ndx; | |
266 char *newpat=malloc(l+1); | |
267 if (newpat==NULL) die("not enough memory"); | |
268 //printf("%s is embedded in %s\n",pattab_val[subpat_ndx],pattab_val[p]); | |
269 strncpy(newpat, pat+0,l); newpat[l]=0; | |
270 if ((newpat_ndx = find_in(newpattab_key,newpatterns,newpat))<0) { | |
271 char *neworg = malloc(132); // TODO: compute exact length | |
272 if (neworg==NULL) die("not enough memory"); | |
273 expand(neworg,newpat,l); | |
274 newpattab_key[newpatterns] = newpat; | |
275 newpattab_val[newpatterns++] = combine(neworg,pattab_val[subpat_ndx]
); | |
276 if (newpatterns>MAXPATHS) die("to many new patterns"); | |
277 //printf("%*.*s|%*.*s[%s] (%s|%s) = %s\n",i,i,pat,j,j,pat+i,pat+i+j,pattab_v
al[p],pattab_val[subpat_ndx],neworg); | |
278 } else { | |
279 free(newpat); | |
280 newpattab_val[newpat_ndx] = combine( | |
281 newpattab_val[newpat_ndx], pattab_val[subpat_ndx] ); | |
282 } | |
283 } | |
284 } | |
285 } | |
286 } | |
287 | |
288 /* for some tiny extra speed, one could forget the free()s | |
289 * as the memory is freed anyway on exit(). | |
290 * However, the gain is minimal and now the code can be cleanly | |
291 * incorporated into other code */ | |
292 for (p=0; p<newpatterns; p++) { | |
293 fprintf(out,"%s\n",compact(newpattab_val[p])); | |
294 free(newpattab_key[p]); | |
295 free(newpattab_val[p]); | |
296 } | |
297 fclose(out); | |
298 | |
299 for (p=0; p<patterns; p++) { | |
300 free(pattab_key[p]); | |
301 free(pattab_val[p]); | |
302 } | |
303 return 0; | |
304 } | |
OLD | NEW |