OLD | NEW |
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/tools/convert_dict/aff_reader.h" | 5 #include "chrome/tools/convert_dict/aff_reader.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 | 8 |
9 #include "base/file_util.h" | 9 #include "base/file_util.h" |
10 #include "base/i18n/icu_string_conversions.h" | 10 #include "base/i18n/icu_string_conversions.h" |
(...skipping 104 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
115 } | 115 } |
116 } else if (StringBeginsWith(line, "TRY ") || | 116 } else if (StringBeginsWith(line, "TRY ") || |
117 StringBeginsWith(line, "MAP ")) { | 117 StringBeginsWith(line, "MAP ")) { |
118 HandleEncodedCommand(line); | 118 HandleEncodedCommand(line); |
119 } else if (StringBeginsWith(line, "IGNORE ")) { | 119 } else if (StringBeginsWith(line, "IGNORE ")) { |
120 printf("We don't support the IGNORE command yet. This would change how " | 120 printf("We don't support the IGNORE command yet. This would change how " |
121 "we would insert things in our lookup table.\n"); | 121 "we would insert things in our lookup table.\n"); |
122 exit(1); | 122 exit(1); |
123 } else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) { | 123 } else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) { |
124 printf("We don't support the COMPLEXPREFIXES command yet. This would " | 124 printf("We don't support the COMPLEXPREFIXES command yet. This would " |
125 "mean we have to insert words backwords as well (I think)\n"); | 125 "mean we have to insert words backwards as well (I think)\n"); |
126 exit(1); | 126 exit(1); |
127 } else { | 127 } else { |
128 // All other commands get stored in the other commands list. | 128 // All other commands get stored in the other commands list. |
129 HandleRawCommand(line); | 129 HandleRawCommand(line); |
130 } | 130 } |
131 } | 131 } |
132 | 132 |
133 return true; | 133 return true; |
134 } | 134 } |
135 | 135 |
(...skipping 98 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
234 size_t slash_index = part.find('/'); | 234 size_t slash_index = part.find('/'); |
235 if (slash_index != std::string::npos && !has_indexed_affixes()) { | 235 if (slash_index != std::string::npos && !has_indexed_affixes()) { |
236 // This can also have a rule string associated with it following a | 236 // This can also have a rule string associated with it following a |
237 // slash. For example: | 237 // slash. For example: |
238 // PFX P 0 foo/Y . | 238 // PFX P 0 foo/Y . |
239 // The "Y" is a flag. For example, the aff file might have a line: | 239 // The "Y" is a flag. For example, the aff file might have a line: |
240 // COMPOUNDFLAG Y | 240 // COMPOUNDFLAG Y |
241 // so that means that this prefix would be a compound one. | 241 // so that means that this prefix would be a compound one. |
242 // | 242 // |
243 // It expects these rules to use the same alias rules as the .dic | 243 // It expects these rules to use the same alias rules as the .dic |
244 // file. We've forced it to use aliases, which is a numberical index | 244 // file. We've forced it to use aliases, which is a numerical index |
245 // instead of these character flags, and this needs to be consistent. | 245 // instead of these character flags, and this needs to be consistent. |
246 | 246 |
247 std::string before_flags = part.substr(0, slash_index + 1); | 247 std::string before_flags = part.substr(0, slash_index + 1); |
248 | 248 |
249 // After the slash are both the flags, then whitespace, then the part | 249 // After the slash are both the flags, then whitespace, then the part |
250 // that tells us what to strip. | 250 // that tells us what to strip. |
251 std::vector<std::string> after_slash; | 251 std::vector<std::string> after_slash; |
252 base::SplitString(part.substr(slash_index + 1), ' ', &after_slash); | 252 base::SplitString(part.substr(slash_index + 1), ' ', &after_slash); |
253 if (after_slash.size() < 2) { | 253 if (after_slash.size() == 0) { |
254 // Note that we may get a third term here which is the | 254 printf("ERROR: Found 0 terms after slash in affix rule '%s', " |
255 // morphological description of this rule. This happens in the tests | 255 "but need at least 2.\n", |
256 // only, so we can just ignore it. | 256 part.c_str()); |
257 printf("ERROR: Didn't get enough after the slash\n"); | |
258 return; | 257 return; |
259 } | 258 } |
| 259 if (after_slash.size() == 1) { |
| 260 printf("WARNING: Found 1 term after slash in affix rule '%s', " |
| 261 "but expected at least 2. Adding '.'.\n", |
| 262 part.c_str()); |
| 263 after_slash.push_back("."); |
| 264 } |
| 265 // Note that we may get a third term here which is the morphological |
| 266 // description of this rule. This happens in the tests only, so we can |
| 267 // just ignore it. |
260 | 268 |
261 part = base::StringPrintf("%s%d %s", | 269 part = base::StringPrintf("%s%d %s", |
262 before_flags.c_str(), | 270 before_flags.c_str(), |
263 GetAFIndexForAFString(after_slash[0]), | 271 GetAFIndexForAFString(after_slash[0]), |
264 after_slash[1].c_str()); | 272 after_slash[1].c_str()); |
265 } | 273 } |
266 | 274 |
267 // Reencode from here | 275 // Reencode from here |
268 std::string reencoded; | 276 std::string reencoded; |
269 if (!EncodingToUTF8(part, &reencoded)) | 277 if (!EncodingToUTF8(part, &reencoded)) { |
| 278 printf("ERROR: Cannot encode affix rule part '%s' to utf8.\n", |
| 279 part.c_str()); |
270 break; | 280 break; |
| 281 } |
271 | 282 |
272 *rule = rule->substr(0, part_start) + reencoded; | 283 *rule = rule->substr(0, part_start) + reencoded; |
273 break; | 284 break; |
274 } | 285 } |
275 token.clear(); | 286 token.clear(); |
276 } else { | 287 } else { |
277 token.push_back((*rule)[i]); | 288 token.push_back((*rule)[i]); |
278 } | 289 } |
279 } | 290 } |
280 | 291 |
281 affix_rules_.push_back(*rule); | 292 affix_rules_.push_back(*rule); |
282 } | 293 } |
283 | 294 |
// AffReader::AddReplacement, shown as a side-by-side diff (left = OLD
// revision, right = NEW revision).
//
// Both revisions: trim |rule| in place with TrimLine, convert it with
// EncodingToUTF8, split the converted text into two strings, turn every '_'
// in each string into ' ', and append the resulting pair to replacements_.
// The function returns early, without appending, when conversion fails or
// the text cannot be split into two parts.
//
// Differences visible in this diff:
//  - NEW additionally calls CollapseDuplicateSpaces(rule) right after
//    trimming.
//  - OLD splits on every ' ' via base::SplitString and returns silently
//    unless exactly two parts result. NEW splits only at the first ' '
//    (everything after it, further spaces included, becomes the second
//    string) and only rejects text that contains no space at all.
//  - NEW prints an "ERROR: ..." message with printf before each early
//    return; OLD returns without any output.
284 void AffReader::AddReplacement(std::string* rule) { | 295 void AffReader::AddReplacement(std::string* rule) { |
285 TrimLine(rule); | 296 TrimLine(rule); |
| 297 CollapseDuplicateSpaces(rule); |
286 | 298 |
287 std::string utf8rule; | 299 std::string utf8rule; |
288 if (!EncodingToUTF8(*rule, &utf8rule)) | 300 if (!EncodingToUTF8(*rule, &utf8rule)) { |
| 301 printf("ERROR: Cannot encode replacement rule '%s' to utf8.\n", |
| 302 rule->c_str()); |
289 return; | 303 return; |
| 304 } |
290 | 305 |
| 306 // The first space separates key and value. |
| 307 size_t space_index = utf8rule.find(' '); |
| 308 if (space_index == std::string::npos) { |
| 309 printf("ERROR: Did not find a space in '%s'.\n", utf8rule.c_str()); |
| 310 return; |
| 311 } |
291 std::vector<std::string> split; | 312 std::vector<std::string> split; |
292 base::SplitString(utf8rule, ' ', &split); | 313 split.push_back(utf8rule.substr(0, space_index)); |
| 314 split.push_back(utf8rule.substr(space_index + 1)); |
293 | 315 |
294 // There should be two parts. | 316 // Underscores are used to represent spaces in most aff files |
295 if (split.size() != 2) | |
296 return; | |
297 | |
298 // Underscores are used to represent spaces | |
299 // (since the line is parsed on spaces). | 317 // (since the line is parsed on spaces). |
300 std::replace(split[0].begin(), split[0].end(), '_', ' '); | 318 std::replace(split[0].begin(), split[0].end(), '_', ' '); |
301 std::replace(split[1].begin(), split[1].end(), '_', ' '); | 319 std::replace(split[1].begin(), split[1].end(), '_', ' '); |
302 | 320 |
303 replacements_.push_back(std::make_pair(split[0], split[1])); | 321 replacements_.push_back(std::make_pair(split[0], split[1])); |
304 } | 322 } |
305 | 323 |
// AffReader::HandleRawCommand (identical in the OLD and NEW columns).
// Appends |line| to other_commands_ exactly as received; no encoding
// conversion or validation is applied here.
306 void AffReader::HandleRawCommand(const std::string& line) { | 324 void AffReader::HandleRawCommand(const std::string& line) { |
307 other_commands_.push_back(line); | 325 other_commands_.push_back(line); |
308 } | 326 } |
309 | 327 |
// AffReader::HandleEncodedCommand, shown as a side-by-side diff (left = OLD
// revision, right = NEW revision).
//
// Both revisions convert |line| with EncodingToUTF8 and, on success, append
// the converted string to other_commands_. When conversion fails nothing is
// appended: OLD skips the line silently, while NEW first prints an
// "ERROR: Cannot encode command ..." message with printf and then returns.
310 void AffReader::HandleEncodedCommand(const std::string& line) { | 328 void AffReader::HandleEncodedCommand(const std::string& line) { |
311 std::string utf8; | 329 std::string utf8; |
312 if (EncodingToUTF8(line, &utf8)) | 330 if (!EncodingToUTF8(line, &utf8)) { |
313 other_commands_.push_back(utf8); | 331 printf("ERROR: Cannot encode command '%s' to utf8.\n", line.c_str()); |
| 332 return; |
| 333 } |
| 334 other_commands_.push_back(utf8); |
314 } | 335 } |
315 | 336 |
316 } // namespace convert_dict | 337 } // namespace convert_dict |
OLD | NEW |