| OLD | NEW |
| (Empty) |
| 1 /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both | |
| 2 * licenses follows. | |
| 3 */ | |
| 4 | |
| 5 /* LibHnj - a library for high quality hyphenation and justification | |
| 6 * Copyright (C) 1998 Raph Levien, | |
| 7 * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), | |
| 8 * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) | |
| 9 * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo) | |
| 10 * | |
| 11 * This library is free software; you can redistribute it and/or | |
| 12 * modify it under the terms of the GNU Library General Public | |
| 13 * License as published by the Free Software Foundation; either | |
| 14 * version 2 of the License, or (at your option) any later version. | |
| 15 * | |
| 16 * This library is distributed in the hope that it will be useful, | |
| 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 19 * Library General Public License for more details. | |
| 20 * | |
| 21 * You should have received a copy of the GNU Library General Public | |
| 22 * License along with this library; if not, write to the | |
| 23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
| 24 * Boston, MA 02111-1307 USA. | |
| 25 */ | |
| 26 | |
| 27 /* | |
| 28 * The contents of this file are subject to the Mozilla Public License | |
| 29 * Version 1.0 (the "MPL"); you may not use this file except in | |
| 30 * compliance with the MPL. You may obtain a copy of the MPL at | |
| 31 * http://www.mozilla.org/MPL/ | |
| 32 * | |
| 33 * Software distributed under the MPL is distributed on an "AS IS" basis, | |
| 34 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL | |
| 35 * for the specific language governing rights and limitations under the | |
| 36 * MPL. | |
| 37 * | |
| 38 */ | |
| 39 | |
| 40 // | |
| 41 // A utility for finding substring embeddings in patterns | |
| 42 | |
| 43 #include <stdio.h> | |
| 44 #include <string.h> | |
| 45 #include <stdlib.h> | |
| 46 | |
| 47 #define MAXPATHS (256*1024) | |
| 48 | |
| 49 // | |
| 50 // | |
| 51 static void die( | |
| 52 const char*msg | |
| 53 ) { | |
| 54 fprintf(stderr,"%s\n",msg); | |
| 55 exit(1); | |
| 56 } | |
| 57 | |
| 58 | |
| 59 // Finds the index of an entry, only used on xxx_key arrays | |
| 60 // Caveat: the table has to be sorted | |
| 61 static int find_in( | |
| 62 char *tab[], | |
| 63 int max, | |
| 64 const char *pat | |
| 65 ) { | |
| 66 int left=0, right=max-1; | |
| 67 while (left <= right) { | |
| 68 int mid = ((right-left)/2)+left; | |
| 69 int v = strcmp(pat,tab[mid]); | |
| 70 if (v>0) { | |
| 71 left = mid + 1; | |
| 72 } else if (v<0) { | |
| 73 right = mid -1; | |
| 74 } else { | |
| 75 return mid; | |
| 76 } | |
| 77 } | |
| 78 return -1; | |
| 79 } | |
| 80 | |
| 81 | |
| 82 // used by partition (which is used by qsort_arr) | |
| 83 // | |
| 84 static void swap2( | |
| 85 char *a[], | |
| 86 char *b[], | |
| 87 int i, | |
| 88 int j | |
| 89 ) { | |
| 90 if (i==j) return; | |
| 91 char*v; | |
| 92 v=a[i]; a[i]=a[j]; a[j]=v; | |
| 93 v=b[i]; b[i]=b[j]; b[j]=v; | |
| 94 } | |
| 95 | |
| 96 | |
| 97 // used by qsort_arr | |
| 98 // | |
| 99 static int partition( | |
| 100 char *a[], | |
| 101 char *b[], | |
| 102 int left, | |
| 103 int right, | |
| 104 int p | |
| 105 ) { | |
| 106 const char *pivotValue = a[p]; | |
| 107 int i; | |
| 108 swap2(a,b,p,right); // Move pivot to end | |
| 109 p = left; | |
| 110 for (i=left; i<right; i++) { | |
| 111 if (strcmp(a[i],pivotValue)<=0) { | |
| 112 swap2(a,b,p,i); | |
| 113 p++; | |
| 114 } | |
| 115 } | |
| 116 swap2(a,b,right,p); // Move pivot to its final place | |
| 117 return p; | |
| 118 } | |
| 119 | |
| 120 | |
| 121 // | |
| 122 // | |
| 123 static void qsort_arr( | |
| 124 char *a[], | |
| 125 char *b[], | |
| 126 int left, | |
| 127 int right | |
| 128 ) { | |
| 129 while (right > left) { | |
| 130 int p = left + (right-left)/2; //select a pivot | |
| 131 p = partition(a,b, left, right, p); | |
| 132 if ((p-1) - left < right - (p+1)) { | |
| 133 qsort_arr(a,b, left, p-1); | |
| 134 left = p+1; | |
| 135 } else { | |
| 136 qsort_arr(a,b, p+1, right); | |
| 137 right = p-1; | |
| 138 } | |
| 139 } | |
| 140 } | |
| 141 | |
| 142 | |
| 143 // Removes extra '0' entries from the string | |
| 144 // | |
| 145 static char* compact( | |
| 146 char *expr | |
| 147 ) { | |
| 148 int l=strlen(expr); | |
| 149 int i,j; | |
| 150 for (i=0,j=0; i<l; i++) { | |
| 151 if (expr[i]!='0') expr[j++] = expr[i]; | |
| 152 } | |
| 153 expr[j]=0; | |
| 154 return expr; | |
| 155 } | |
| 156 | |
| 157 | |
| 158 // convert 'n1im' to 0n1i0m0 expressed as a string | |
| 159 // | |
| 160 static void expand( | |
| 161 char *expr, | |
| 162 const char *pat, | |
| 163 int l | |
| 164 ) { | |
| 165 int el = 0; | |
| 166 char last = '.'; | |
| 167 int i; | |
| 168 for (i=0; i<l; i++) { | |
| 169 char c = pat[i]; | |
| 170 if ( (last<'0' || last>'9') | |
| 171 && (c <'0' || c >'9') | |
| 172 ) { | |
| 173 expr[el++] = '0'; | |
| 174 } | |
| 175 expr[el++] = c; | |
| 176 last = c; | |
| 177 } | |
| 178 if (last<'0' || last>'9') expr[el++] = '0'; | |
| 179 expr[el]=0; | |
| 180 } | |
| 181 | |
| 182 | |
| 183 // Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der | |
| 184 // The second pattern needs to be a right side match of the first | |
| 185 // (modulo digits) | |
| 186 static char *combine( | |
| 187 char *expr, | |
| 188 const char *subexpr | |
| 189 ) { | |
| 190 int l1 = strlen(expr); | |
| 191 int l2 = strlen(subexpr); | |
| 192 int off = l1-l2; | |
| 193 int j; | |
| 194 // this works also for utf8 sequences because the substring is identical | |
| 195 // to the last substring-length bytes of expr except for the (single byte) | |
| 196 // hyphenation encoders | |
| 197 for (j=0; j<l2; j++) { | |
| 198 if (subexpr[j]>expr[off+j]) { | |
| 199 expr[off+j] = subexpr[j]; | |
| 200 } | |
| 201 } | |
| 202 return expr; | |
| 203 } | |
| 204 | |
| 205 | |
| 206 // | |
| 207 // | |
| 208 int main(int argc, const char* argv[]) { | |
| 209 FILE *in, *out; | |
| 210 char *pattab_key[MAXPATHS]; | |
| 211 char *pattab_val[MAXPATHS]; | |
| 212 int patterns = 0; | |
| 213 char *newpattab_key[MAXPATHS]; | |
| 214 char *newpattab_val[MAXPATHS]; | |
| 215 int newpatterns = 0; | |
| 216 char format[132]; // 64+65+newline+zero+spare | |
| 217 int p; | |
| 218 if (argc!=3) die("Usage: <orig-file> <new-file>\n"); | |
| 219 if ((in = fopen(argv[1],"r"))==NULL) die("Could not read input"); | |
| 220 if ((out = fopen(argv[2],"w"))==NULL) die("Could not create output"); | |
| 221 // read all patterns and split in pure text (_key) & expanded patterns (_val) | |
| 222 while(fgets(format,132,in) != NULL) { | |
| 223 int l = strlen(format); | |
| 224 if (format[l-1]=='\n') { l--; format[l]=0; } // Chomp | |
| 225 if (format[0]=='%' || format[0]==0) { | |
| 226 // skip | |
| 227 } else { | |
| 228 if (format[l-1]=='%') { | |
| 229 l--; | |
| 230 format[l] = 0; // remove '%' | |
| 231 } | |
| 232 int i,j; | |
| 233 char *pat = (char*) malloc(l+1); | |
| 234 char *org = (char*) malloc(l*2+1); | |
| 235 if (pat==NULL || org==NULL) die("not enough memory"); | |
| 236 expand(org,format,l); | |
| 237 // remove hyphenation encoders (digits) from pat | |
| 238 for (i=0,j=0; i<l; i++) { | |
| 239 // odd, but utf-8 proof | |
| 240 char c = format[i]; | |
| 241 if (c<'0' || c>'9') pat[j++]=c; | |
| 242 } | |
| 243 pat[j]=0; | |
| 244 p = patterns; | |
| 245 pattab_key[patterns] = pat; | |
| 246 pattab_val[patterns++] = org; | |
| 247 if (patterns>MAXPATHS) die("to many base patterns"); | |
| 248 } | |
| 249 } | |
| 250 fclose(in); | |
| 251 // As we use binairy search, make sure it is sorted | |
| 252 qsort_arr(pattab_key,pattab_val,0,patterns-1); | |
| 253 | |
| 254 for (p=0; p<patterns; p++) { | |
| 255 char *pat = pattab_key[p]; | |
| 256 int patsize = strlen(pat); | |
| 257 int j,l; | |
| 258 for (l=1; l<=patsize; l++) { | |
| 259 for (j=1; j<=l; j++) { | |
| 260 int i = l-j; | |
| 261 int subpat_ndx; | |
| 262 char subpat[132]; | |
| 263 strncpy(subpat,pat+i,j); subpat[j]=0; | |
| 264 if ((subpat_ndx = find_in(pattab_key,patterns,subpat))>=0) { | |
| 265 int newpat_ndx; | |
| 266 char *newpat=malloc(l+1); | |
| 267 if (newpat==NULL) die("not enough memory"); | |
| 268 //printf("%s is embedded in %s\n",pattab_val[subpat_ndx],pattab_val[p]); | |
| 269 strncpy(newpat, pat+0,l); newpat[l]=0; | |
| 270 if ((newpat_ndx = find_in(newpattab_key,newpatterns,newpat))<0) { | |
| 271 char *neworg = malloc(132); // TODO: compute exact length | |
| 272 if (neworg==NULL) die("not enough memory"); | |
| 273 expand(neworg,newpat,l); | |
| 274 newpattab_key[newpatterns] = newpat; | |
| 275 newpattab_val[newpatterns++] = combine(neworg,pattab_val[subpat_ndx]
); | |
| 276 if (newpatterns>MAXPATHS) die("to many new patterns"); | |
| 277 //printf("%*.*s|%*.*s[%s] (%s|%s) = %s\n",i,i,pat,j,j,pat+i,pat+i+j,pattab_v
al[p],pattab_val[subpat_ndx],neworg); | |
| 278 } else { | |
| 279 free(newpat); | |
| 280 newpattab_val[newpat_ndx] = combine( | |
| 281 newpattab_val[newpat_ndx], pattab_val[subpat_ndx] ); | |
| 282 } | |
| 283 } | |
| 284 } | |
| 285 } | |
| 286 } | |
| 287 | |
| 288 /* for some tiny extra speed, one could forget the free()s | |
| 289 * as the memory is freed anyway on exit(). | |
| 290 * However, the gain is minimal and now the code can be cleanly | |
| 291 * incorporated into other code */ | |
| 292 for (p=0; p<newpatterns; p++) { | |
| 293 fprintf(out,"%s\n",compact(newpattab_val[p])); | |
| 294 free(newpattab_key[p]); | |
| 295 free(newpattab_val[p]); | |
| 296 } | |
| 297 fclose(out); | |
| 298 | |
| 299 for (p=0; p<patterns; p++) { | |
| 300 free(pattab_key[p]); | |
| 301 free(pattab_val[p]); | |
| 302 } | |
| 303 return 0; | |
| 304 } | |
| OLD | NEW |