Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(34)

Side by Side Diff: chrome/tools/convert_dict/aff_reader.cc

Issue 11566003: Bump dictionary versions to 3-0 (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Fix android compile (Created 7 years, 11 months ago)
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/tools/convert_dict/aff_reader.h" 5 #include "chrome/tools/convert_dict/aff_reader.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 8
9 #include "base/file_util.h" 9 #include "base/file_util.h"
10 #include "base/i18n/icu_string_conversions.h" 10 #include "base/i18n/icu_string_conversions.h"
(...skipping 104 matching lines...) Expand 10 before | Expand all | Expand 10 after
115 } 115 }
116 } else if (StringBeginsWith(line, "TRY ") || 116 } else if (StringBeginsWith(line, "TRY ") ||
117 StringBeginsWith(line, "MAP ")) { 117 StringBeginsWith(line, "MAP ")) {
118 HandleEncodedCommand(line); 118 HandleEncodedCommand(line);
119 } else if (StringBeginsWith(line, "IGNORE ")) { 119 } else if (StringBeginsWith(line, "IGNORE ")) {
120 printf("We don't support the IGNORE command yet. This would change how " 120 printf("We don't support the IGNORE command yet. This would change how "
121 "we would insert things in our lookup table.\n"); 121 "we would insert things in our lookup table.\n");
122 exit(1); 122 exit(1);
123 } else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) { 123 } else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) {
124 printf("We don't support the COMPLEXPREFIXES command yet. This would " 124 printf("We don't support the COMPLEXPREFIXES command yet. This would "
125 "mean we have to insert words backwords as well (I think)\n"); 125 "mean we have to insert words backwards as well (I think)\n");
126 exit(1); 126 exit(1);
127 } else { 127 } else {
128 // All other commands get stored in the other commands list. 128 // All other commands get stored in the other commands list.
129 HandleRawCommand(line); 129 HandleRawCommand(line);
130 } 130 }
131 } 131 }
132 132
133 return true; 133 return true;
134 } 134 }
135 135
(...skipping 98 matching lines...) Expand 10 before | Expand all | Expand 10 after
234 size_t slash_index = part.find('/'); 234 size_t slash_index = part.find('/');
235 if (slash_index != std::string::npos && !has_indexed_affixes()) { 235 if (slash_index != std::string::npos && !has_indexed_affixes()) {
236 // This can also have a rule string associated with it following a 236 // This can also have a rule string associated with it following a
237 // slash. For example: 237 // slash. For example:
238 // PFX P 0 foo/Y . 238 // PFX P 0 foo/Y .
239 // The "Y" is a flag. For example, the aff file might have a line: 239 // The "Y" is a flag. For example, the aff file might have a line:
240 // COMPOUNDFLAG Y 240 // COMPOUNDFLAG Y
241 // so that means that this prefix would be a compound one. 241 // so that means that this prefix would be a compound one.
242 // 242 //
243 // It expects these rules to use the same alias rules as the .dic 243 // It expects these rules to use the same alias rules as the .dic
244 // file. We've forced it to use aliases, which is a numberical index 244 // file. We've forced it to use aliases, which is a numerical index
245 // instead of these character flags, and this needs to be consistent. 245 // instead of these character flags, and this needs to be consistent.
246 246
247 std::string before_flags = part.substr(0, slash_index + 1); 247 std::string before_flags = part.substr(0, slash_index + 1);
248 248
249 // After the slash are both the flags, then whitespace, then the part 249 // After the slash are both the flags, then whitespace, then the part
250 // that tells us what to strip. 250 // that tells us what to strip.
251 std::vector<std::string> after_slash; 251 std::vector<std::string> after_slash;
252 base::SplitString(part.substr(slash_index + 1), ' ', &after_slash); 252 base::SplitString(part.substr(slash_index + 1), ' ', &after_slash);
253 if (after_slash.size() < 2) { 253 if (after_slash.size() == 0) {
254 // Note that we may get a third term here which is the 254 printf("ERROR: Found 0 terms after slash in affix rule '%s', "
255 // morphological description of this rule. This happens in the tests 255 "but need at least 2.\n",
256 // only, so we can just ignore it. 256 part.c_str());
257 printf("ERROR: Didn't get enough after the slash\n");
258 return; 257 return;
259 } 258 }
259 if (after_slash.size() == 1) {
260 printf("WARNING: Found 1 term after slash in affix rule '%s', "
261 "but expected at least 2. Adding '.'.\n",
262 part.c_str());
263 after_slash.push_back(".");
264 }
265 // Note that we may get a third term here which is the morphological
266 // description of this rule. This happens in the tests only, so we can
267 // just ignore it.
260 268
261 part = base::StringPrintf("%s%d %s", 269 part = base::StringPrintf("%s%d %s",
262 before_flags.c_str(), 270 before_flags.c_str(),
263 GetAFIndexForAFString(after_slash[0]), 271 GetAFIndexForAFString(after_slash[0]),
264 after_slash[1].c_str()); 272 after_slash[1].c_str());
265 } 273 }
266 274
267 // Reencode from here 275 // Reencode from here
268 std::string reencoded; 276 std::string reencoded;
269 if (!EncodingToUTF8(part, &reencoded)) 277 if (!EncodingToUTF8(part, &reencoded)) {
278 printf("ERROR: Cannot encode affix rule part '%s' to utf8.\n",
279 part.c_str());
270 break; 280 break;
281 }
271 282
272 *rule = rule->substr(0, part_start) + reencoded; 283 *rule = rule->substr(0, part_start) + reencoded;
273 break; 284 break;
274 } 285 }
275 token.clear(); 286 token.clear();
276 } else { 287 } else {
277 token.push_back((*rule)[i]); 288 token.push_back((*rule)[i]);
278 } 289 }
279 } 290 }
280 291
281 affix_rules_.push_back(*rule); 292 affix_rules_.push_back(*rule);
282 } 293 }
283 294
284 void AffReader::AddReplacement(std::string* rule) { 295 void AffReader::AddReplacement(std::string* rule) {
285 TrimLine(rule); 296 TrimLine(rule);
297 CollapseDuplicateSpaces(rule);
286 298
287 std::string utf8rule; 299 std::string utf8rule;
288 if (!EncodingToUTF8(*rule, &utf8rule)) 300 if (!EncodingToUTF8(*rule, &utf8rule)) {
301 printf("ERROR: Cannot encode replacement rule '%s' to utf8.\n",
302 rule->c_str());
289 return; 303 return;
304 }
290 305
306 // The first space separates key and value.
307 size_t space_index = utf8rule.find(' ');
308 if (space_index == std::string::npos) {
309 printf("ERROR: Did not find a space in '%s'.\n", utf8rule.c_str());
310 return;
311 }
291 std::vector<std::string> split; 312 std::vector<std::string> split;
292 base::SplitString(utf8rule, ' ', &split); 313 split.push_back(utf8rule.substr(0, space_index));
314 split.push_back(utf8rule.substr(space_index + 1));
293 315
294 // There should be two parts. 316 // Underscores are used to represent spaces in most aff files
295 if (split.size() != 2)
296 return;
297
298 // Underscores are used to represent spaces
299 // (since the line is parsed on spaces). 317 // (since the line is parsed on spaces).
300 std::replace(split[0].begin(), split[0].end(), '_', ' '); 318 std::replace(split[0].begin(), split[0].end(), '_', ' ');
301 std::replace(split[1].begin(), split[1].end(), '_', ' '); 319 std::replace(split[1].begin(), split[1].end(), '_', ' ');
302 320
303 replacements_.push_back(std::make_pair(split[0], split[1])); 321 replacements_.push_back(std::make_pair(split[0], split[1]));
304 } 322 }
305 323
306 void AffReader::HandleRawCommand(const std::string& line) { 324 void AffReader::HandleRawCommand(const std::string& line) {
307 other_commands_.push_back(line); 325 other_commands_.push_back(line);
308 } 326 }
309 327
310 void AffReader::HandleEncodedCommand(const std::string& line) { 328 void AffReader::HandleEncodedCommand(const std::string& line) {
311 std::string utf8; 329 std::string utf8;
312 if (EncodingToUTF8(line, &utf8)) 330 if (!EncodingToUTF8(line, &utf8)) {
313 other_commands_.push_back(utf8); 331 printf("ERROR: Cannot encode command '%s' to utf8.\n", line.c_str());
332 return;
333 }
334 other_commands_.push_back(utf8);
314 } 335 }
315 336
316 } // namespace convert_dict 337 } // namespace convert_dict
OLD | NEW
« no previous file with comments | « chrome/renderer/spellchecker/spellcheck_unittest.cc ('k') | chrome/tools/convert_dict/dic_reader.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698