Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/autofill/core/browser/address_field.h" | 5 #include "components/autofill/core/browser/address_field.h" |
| 6 | 6 |
| 7 #include <stddef.h> | 7 #include <stddef.h> |
| 8 | 8 |
| 9 #include "base/logging.h" | 9 #include "base/logging.h" |
| 10 #include "base/memory/scoped_ptr.h" | 10 #include "base/memory/scoped_ptr.h" |
| 11 #include "base/strings/string16.h" | |
| 12 #include "base/strings/string_util.h" | 11 #include "base/strings/string_util.h" |
| 13 #include "base/strings/utf_string_conversions.h" | |
| 14 #include "components/autofill/core/browser/autofill_field.h" | 12 #include "components/autofill/core/browser/autofill_field.h" |
| 15 #include "components/autofill/core/browser/autofill_regex_constants.h" | 13 #include "components/autofill/core/browser/autofill_regex_constants.h" |
| 16 #include "components/autofill/core/browser/autofill_scanner.h" | 14 #include "components/autofill/core/browser/autofill_scanner.h" |
| 17 #include "components/autofill/core/browser/field_types.h" | 15 #include "components/autofill/core/browser/field_types.h" |
| 18 | 16 |
| 19 using base::UTF8ToUTF16; | |
| 20 | |
| 21 namespace autofill { | 17 namespace autofill { |
| 22 | 18 |
| 23 namespace { | 19 namespace { |
| 24 | 20 |
| 25 bool SetFieldAndAdvanceCursor(AutofillScanner* scanner, AutofillField** field) { | 21 bool SetFieldAndAdvanceCursor(AutofillScanner* scanner, AutofillField** field) { |
| 26 *field = scanner->Cursor(); | 22 *field = scanner->Cursor(); |
| 27 scanner->Advance(); | 23 scanner->Advance(); |
| 28 return true; | 24 return true; |
| 29 } | 25 } |
| 30 | 26 |
| (...skipping 10 matching lines...) | |
| 41 const int AddressField::kStateMatchType = MATCH_DEFAULT | MATCH_SELECT; | 37 const int AddressField::kStateMatchType = MATCH_DEFAULT | MATCH_SELECT; |
| 42 | 38 |
| 43 scoped_ptr<FormField> AddressField::Parse(AutofillScanner* scanner) { | 39 scoped_ptr<FormField> AddressField::Parse(AutofillScanner* scanner) { |
| 44 if (scanner->IsEnd()) | 40 if (scanner->IsEnd()) |
| 45 return NULL; | 41 return NULL; |
| 46 | 42 |
| 47 scoped_ptr<AddressField> address_field(new AddressField); | 43 scoped_ptr<AddressField> address_field(new AddressField); |
| 48 const AutofillField* const initial_field = scanner->Cursor(); | 44 const AutofillField* const initial_field = scanner->Cursor(); |
| 49 size_t saved_cursor = scanner->SaveCursor(); | 45 size_t saved_cursor = scanner->SaveCursor(); |
| 50 | 46 |
| 51 base::string16 attention_ignored = UTF8ToUTF16(kAttentionIgnoredRe); | |
| 52 base::string16 region_ignored = UTF8ToUTF16(kRegionIgnoredRe); | |
| 53 | |
| 54 // Allow address fields to appear in any order. | 47 // Allow address fields to appear in any order. |
| 55 size_t begin_trailing_non_labeled_fields = 0; | 48 size_t begin_trailing_non_labeled_fields = 0; |
| 56 bool has_trailing_non_labeled_fields = false; | 49 bool has_trailing_non_labeled_fields = false; |
| 57 while (!scanner->IsEnd()) { | 50 while (!scanner->IsEnd()) { |
| 58 const size_t cursor = scanner->SaveCursor(); | 51 const size_t cursor = scanner->SaveCursor(); |
| 59 if (address_field->ParseAddressLines(scanner) || | 52 if (address_field->ParseAddressLines(scanner) || |
| 60 address_field->ParseCityStateZipCode(scanner) || | 53 address_field->ParseCityStateZipCode(scanner) || |
| 61 address_field->ParseCountry(scanner) || | 54 address_field->ParseCountry(scanner) || |
| 62 address_field->ParseCompany(scanner)) { | 55 address_field->ParseCompany(scanner)) { |
| 63 has_trailing_non_labeled_fields = false; | 56 has_trailing_non_labeled_fields = false; |
| 64 continue; | 57 continue; |
| 65 } else if (ParseField(scanner, attention_ignored, NULL) || | 58 } else if (ParseField(scanner, kAttentionIgnoredRe, NULL) || |
| 66 ParseField(scanner, region_ignored, NULL)) { | 59 ParseField(scanner, kRegionIgnoredRe, NULL)) { |
| 67 // We ignore the following: | 60 // We ignore the following: |
| 68 // * Attention. | 61 // * Attention. |
| 69 // * Province/Region/Other. | 62 // * Province/Region/Other. |
| 70 continue; | 63 continue; |
| 71 } else if (scanner->Cursor() != initial_field && | 64 } else if (scanner->Cursor() != initial_field && |
| 72 ParseEmptyLabel(scanner, NULL)) { | 65 ParseEmptyLabel(scanner, NULL)) { |
| 73 // Ignore non-labeled fields within an address; the page | 66 // Ignore non-labeled fields within an address; the page |
| 74 // MapQuest Driving Directions North America.html contains such a field. | 67 // MapQuest Driving Directions North America.html contains such a field. |
| 75 // We only ignore such fields after we've parsed at least one other field; | 68 // We only ignore such fields after we've parsed at least one other field; |
| 76 // otherwise we'd effectively parse address fields before other field | 69 // otherwise we'd effectively parse address fields before other field |
| (...skipping 64 matching lines...) | |
| 141 AddClassification(city_, ADDRESS_HOME_CITY, map) && | 134 AddClassification(city_, ADDRESS_HOME_CITY, map) && |
| 142 AddClassification(state_, ADDRESS_HOME_STATE, map) && | 135 AddClassification(state_, ADDRESS_HOME_STATE, map) && |
| 143 AddClassification(zip_, ADDRESS_HOME_ZIP, map) && | 136 AddClassification(zip_, ADDRESS_HOME_ZIP, map) && |
| 144 AddClassification(country_, ADDRESS_HOME_COUNTRY, map); | 137 AddClassification(country_, ADDRESS_HOME_COUNTRY, map); |
| 145 } | 138 } |
| 146 | 139 |
| 147 bool AddressField::ParseCompany(AutofillScanner* scanner) { | 140 bool AddressField::ParseCompany(AutofillScanner* scanner) { |
| 148 if (company_ && !company_->IsEmpty()) | 141 if (company_ && !company_->IsEmpty()) |
| 149 return false; | 142 return false; |
| 150 | 143 |
| 151 return ParseField(scanner, UTF8ToUTF16(kCompanyRe), &company_); | 144 return ParseField(scanner, kCompanyRe, &company_); |
| 152 } | 145 } |
| 153 | 146 |
| 154 bool AddressField::ParseAddressLines(AutofillScanner* scanner) { | 147 bool AddressField::ParseAddressLines(AutofillScanner* scanner) { |
| 155 // We only match the string "address" in page text, not in element names, | 148 // We only match the string "address" in page text, not in element names, |
| 156 // because sometimes every element in a group of address fields will have | 149 // because sometimes every element in a group of address fields will have |
| 157 // a name containing the string "address"; for example, on the page | 150 // a name containing the string "address"; for example, on the page |
| 158 // Kohl's - Register Billing Address.html the text element labeled "city" | 151 // Kohl's - Register Billing Address.html the text element labeled "city" |
| 159 // has the name "BILL_TO_ADDRESS<>city". We do match address labels | 152 // has the name "BILL_TO_ADDRESS<>city". We do match address labels |
| 160 // such as "address1", which appear as element names on various pages (eg | 153 // such as "address1", which appear as element names on various pages (eg |
| 161 // AmericanGirl-Registration.html, BloomingdalesBilling.html, | 154 // AmericanGirl-Registration.html, BloomingdalesBilling.html, |
| 162 // EBay Registration Enter Information.html). | 155 // EBay Registration Enter Information.html). |
| 163 if (address1_ || street_address_) | 156 if (address1_ || street_address_) |
| 164 return false; | 157 return false; |
| 165 | 158 |
| 166 // Ignore "Address Lookup" field. http://crbug.com/427622 | 159 // Ignore "Address Lookup" field. http://crbug.com/427622 |
| 167 if (ParseField(scanner, base::UTF8ToUTF16(kAddressLookupRe), NULL)) | 160 if (ParseField(scanner, kAddressLookupRe, NULL)) |
| 168 return false; | 161 return false; |
| 169 | 162 |
| 170 base::string16 pattern = UTF8ToUTF16(kAddressLine1Re); | 163 if (!ParseFieldSpecifics(scanner, kAddressLine1Re, MATCH_DEFAULT, |
| 171 base::string16 label_pattern = UTF8ToUTF16(kAddressLine1LabelRe); | |
| 172 if (!ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT, &address1_) && | |
| 173 !ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT, | |
| 174 &address1_) && | 164 &address1_) && |
| 175 !ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT | MATCH_TEXT_AREA, | 165 !ParseFieldSpecifics(scanner, kAddressLine1LabelRe, |
| 176 &street_address_) && | 166 MATCH_LABEL | MATCH_TEXT, &address1_) && |
| 177 !ParseFieldSpecifics(scanner, label_pattern, | 167 !ParseFieldSpecifics(scanner, kAddressLine1Re, |
| 178 MATCH_LABEL | MATCH_TEXT_AREA, | 168 MATCH_DEFAULT | MATCH_TEXT_AREA, &street_address_) && |
| 179 &street_address_)) | 169 !ParseFieldSpecifics(scanner, kAddressLine1LabelRe, |
| 170 MATCH_LABEL | MATCH_TEXT_AREA, &street_address_)) | |
| 180 return false; | 171 return false; |
| 181 | 172 |
| 182 if (street_address_) | 173 if (street_address_) |
| 183 return true; | 174 return true; |
| 184 | 175 |
| 185 // This code may not pick up pages that have an address field consisting of a | 176 // This code may not pick up pages that have an address field consisting of a |
| 186 // sequence of unlabeled address fields. If we need to add this, see | 177 // sequence of unlabeled address fields. If we need to add this, see |
| 187 // discussion on https://codereview.chromium.org/741493003/ | 178 // discussion on https://codereview.chromium.org/741493003/ |
| 188 pattern = UTF8ToUTF16(kAddressLine2Re); | 179 if (!ParseField(scanner, kAddressLine2Re, &address2_) && |
| 189 label_pattern = UTF8ToUTF16(kAddressLine2LabelRe); | 180 !ParseFieldSpecifics(scanner, kAddressLine2LabelRe, |
| 190 if (!ParseField(scanner, pattern, &address2_) && | 181 MATCH_LABEL | MATCH_TEXT, &address2_)) |
| 191 !ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT, | |
| 192 &address2_)) | |
| 193 return true; | 182 return true; |
| 194 | 183 |
| 195 // Optionally parse address line 3. This uses the same label regexp as | 184 // Optionally parse address line 3. This uses the same label regexp as |
| 196 // address 2 above. | 185 // address 2 above. |
| 197 pattern = UTF8ToUTF16(kAddressLinesExtraRe); | 186 if (!ParseField(scanner, kAddressLinesExtraRe, &address3_) && |
| 198 if (!ParseField(scanner, pattern, &address3_) && | 187 !ParseFieldSpecifics(scanner, kAddressLine2LabelRe, |
| 199 !ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT, | 188 MATCH_LABEL | MATCH_TEXT, &address3_)) |
| 200 &address3_)) | |
| 201 return true; | 189 return true; |
| 202 | 190 |
| 203 // Try for surplus lines, which we will promptly discard. Some pages have 4 | 191 // Try for surplus lines, which we will promptly discard. Some pages have 4 |
| 204 // address lines (e.g. uk/ShoesDirect2.html)! | 192 // address lines (e.g. uk/ShoesDirect2.html)! |
| 205 // | 193 // |
| 206 // Since these are rare, don't bother considering unlabeled lines as extra | 194 // Since these are rare, don't bother considering unlabeled lines as extra |
| 207 // address lines. | 195 // address lines. |
| 208 pattern = UTF8ToUTF16(kAddressLinesExtraRe); | 196 while (ParseField(scanner, kAddressLinesExtraRe, NULL)) { |
| 209 while (ParseField(scanner, pattern, NULL)) { | |
| 210 // Consumed a surplus line, try for another. | 197 // Consumed a surplus line, try for another. |
| 211 } | 198 } |
| 212 return true; | 199 return true; |
| 213 } | 200 } |
| 214 | 201 |
| 215 bool AddressField::ParseCountry(AutofillScanner* scanner) { | 202 bool AddressField::ParseCountry(AutofillScanner* scanner) { |
| 216 if (country_ && !country_->IsEmpty()) | 203 if (country_ && !country_->IsEmpty()) |
| 217 return false; | 204 return false; |
| 218 | 205 |
| 219 scanner->SaveCursor(); | 206 scanner->SaveCursor(); |
| 220 if (ParseFieldSpecifics(scanner, | 207 if (ParseFieldSpecifics(scanner, kCountryRe, MATCH_DEFAULT | MATCH_SELECT, |
| 221 UTF8ToUTF16(kCountryRe), | |
| 222 MATCH_DEFAULT | MATCH_SELECT, | |
| 223 &country_)) { | 208 &country_)) { |
| 224 return true; | 209 return true; |
| 225 } | 210 } |
| 226 | 211 |
| 227 // The occasional page (e.g. google account registration page) calls this a | 212 // The occasional page (e.g. google account registration page) calls this a |
| 228 // "location". However, this only makes sense for select tags. | 213 // "location". However, this only makes sense for select tags. |
| 229 scanner->Rewind(); | 214 scanner->Rewind(); |
| 230 return ParseFieldSpecifics(scanner, | 215 return ParseFieldSpecifics(scanner, kCountryLocationRe, |
| 231 UTF8ToUTF16(kCountryLocationRe), | |
| 232 MATCH_LABEL | MATCH_NAME | MATCH_SELECT, | 216 MATCH_LABEL | MATCH_NAME | MATCH_SELECT, |
| 233 &country_); | 217 &country_); |
| 234 } | 218 } |
| 235 | 219 |
| 236 bool AddressField::ParseZipCode(AutofillScanner* scanner) { | 220 bool AddressField::ParseZipCode(AutofillScanner* scanner) { |
| 237 if (zip_) | 221 if (zip_) |
| 238 return false; | 222 return false; |
| 239 | 223 |
| 240 if (!ParseFieldSpecifics(scanner, | 224 if (!ParseFieldSpecifics(scanner, kZipCodeRe, kZipCodeMatchType, &zip_)) { |
| 241 UTF8ToUTF16(kZipCodeRe), | |
| 242 kZipCodeMatchType, | |
| 243 &zip_)) { | |
| 244 return false; | 225 return false; |
| 245 } | 226 } |
| 246 | 227 |
| 247 // Look for a zip+4, whose field name will also often contain | 228 // Look for a zip+4, whose field name will also often contain |
| 248 // the substring "zip". | 229 // the substring "zip". |
| 249 ParseFieldSpecifics(scanner, UTF8ToUTF16(kZip4Re), kZipCodeMatchType, &zip4_); | 230 ParseFieldSpecifics(scanner, kZip4Re, kZipCodeMatchType, &zip4_); |
| 250 return true; | 231 return true; |
| 251 } | 232 } |
| 252 | 233 |
| 253 bool AddressField::ParseCity(AutofillScanner* scanner) { | 234 bool AddressField::ParseCity(AutofillScanner* scanner) { |
| 254 if (city_) | 235 if (city_) |
| 255 return false; | 236 return false; |
| 256 | 237 |
| 257 return ParseFieldSpecifics(scanner, | 238 return ParseFieldSpecifics(scanner, kCityRe, kCityMatchType, &city_); |
| 258 UTF8ToUTF16(kCityRe), | |
| 259 kCityMatchType, | |
| 260 &city_); | |
| 261 } | 239 } |
| 262 | 240 |
| 263 bool AddressField::ParseState(AutofillScanner* scanner) { | 241 bool AddressField::ParseState(AutofillScanner* scanner) { |
| 264 if (state_) | 242 if (state_) |
| 265 return false; | 243 return false; |
| 266 | 244 |
| 267 return ParseFieldSpecifics(scanner, | 245 // Ignore spurious matches for "United States". |
|
Evan Stade
2015/12/08 19:34:56
where is this coming from? is this a new addition?
Ilya Sherman
2015/12/08 20:02:28
It's coming from a change to one of the regexes, w
| |
| 268 UTF8ToUTF16(kStateRe), | 246 size_t saved_cursor = scanner->SaveCursor(); |
| 269 kStateMatchType, | 247 if (ParseFieldSpecifics(scanner, "United States", kStateMatchType, nullptr)) { |
| 270 &state_); | 248 scanner->RewindTo(saved_cursor); |
| 249 return false; | |
| 250 } | |
| 251 | |
| 252 return ParseFieldSpecifics(scanner, kStateRe, kStateMatchType, &state_); | |
| 271 } | 253 } |
| 272 | 254 |
| 273 bool AddressField::ParseCityStateZipCode(AutofillScanner* scanner) { | 255 bool AddressField::ParseCityStateZipCode(AutofillScanner* scanner) { |
| 274 // Simple cases. | 256 // Simple cases. |
| 275 if (scanner->IsEnd()) | 257 if (scanner->IsEnd()) |
| 276 return false; | 258 return false; |
| 277 if (city_ && state_ && zip_) | 259 if (city_ && state_ && zip_) |
| 278 return false; | 260 return false; |
| 279 if (state_ && zip_) | 261 if (state_ && zip_) |
| 280 return ParseCity(scanner); | 262 return ParseCity(scanner); |
| (...skipping 41 matching lines...) | |
| 322 | 304 |
| 323 return false; | 305 return false; |
| 324 } | 306 } |
| 325 | 307 |
| 326 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForZipCode( | 308 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForZipCode( |
| 327 AutofillScanner* scanner) { | 309 AutofillScanner* scanner) { |
| 328 if (zip_) | 310 if (zip_) |
| 329 return RESULT_MATCH_NONE; | 311 return RESULT_MATCH_NONE; |
| 330 | 312 |
| 331 ParseNameLabelResult result = ParseNameAndLabelSeparately( | 313 ParseNameLabelResult result = ParseNameAndLabelSeparately( |
| 332 scanner, UTF8ToUTF16(kZipCodeRe), kZipCodeMatchType, &zip_); | 314 scanner, kZipCodeRe, kZipCodeMatchType, &zip_); |
| 333 | 315 |
| 334 if (result != RESULT_MATCH_NAME_LABEL || scanner->IsEnd()) | 316 if (result != RESULT_MATCH_NAME_LABEL || scanner->IsEnd()) |
| 335 return result; | 317 return result; |
| 336 | 318 |
| 337 size_t saved_cursor = scanner->SaveCursor(); | 319 size_t saved_cursor = scanner->SaveCursor(); |
| 338 bool found_non_zip4 = ParseCity(scanner); | 320 bool found_non_zip4 = ParseCity(scanner); |
| 339 if (found_non_zip4) | 321 if (found_non_zip4) |
| 340 city_ = nullptr; | 322 city_ = nullptr; |
| 341 scanner->RewindTo(saved_cursor); | 323 scanner->RewindTo(saved_cursor); |
| 342 if (!found_non_zip4) { | 324 if (!found_non_zip4) { |
| 343 found_non_zip4 = ParseState(scanner); | 325 found_non_zip4 = ParseState(scanner); |
| 344 if (found_non_zip4) | 326 if (found_non_zip4) |
| 345 state_ = nullptr; | 327 state_ = nullptr; |
| 346 scanner->RewindTo(saved_cursor); | 328 scanner->RewindTo(saved_cursor); |
| 347 } | 329 } |
| 348 | 330 |
| 349 if (!found_non_zip4) { | 331 if (!found_non_zip4) { |
| 350 // Look for a zip+4, whose field name will also often contain | 332 // Look for a zip+4, whose field name will also often contain |
| 351 // the substring "zip". | 333 // the substring "zip". |
| 352 ParseFieldSpecifics(scanner, | 334 ParseFieldSpecifics(scanner, kZip4Re, kZipCodeMatchType, &zip4_); |
| 353 UTF8ToUTF16(kZip4Re), | |
| 354 kZipCodeMatchType, | |
| 355 &zip4_); | |
| 356 } | 335 } |
| 357 return result; | 336 return result; |
| 358 } | 337 } |
| 359 | 338 |
| 360 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForCity( | 339 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForCity( |
| 361 AutofillScanner* scanner) { | 340 AutofillScanner* scanner) { |
| 362 if (city_) | 341 if (city_) |
| 363 return RESULT_MATCH_NONE; | 342 return RESULT_MATCH_NONE; |
| 364 | 343 |
| 365 return ParseNameAndLabelSeparately( | 344 return ParseNameAndLabelSeparately(scanner, kCityRe, kCityMatchType, &city_); |
| 366 scanner, UTF8ToUTF16(kCityRe), kCityMatchType, &city_); | |
| 367 } | 345 } |
| 368 | 346 |
| 369 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForState( | 347 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForState( |
| 370 AutofillScanner* scanner) { | 348 AutofillScanner* scanner) { |
| 371 if (state_) | 349 if (state_) |
| 372 return RESULT_MATCH_NONE; | 350 return RESULT_MATCH_NONE; |
| 373 | 351 |
| 374 return ParseNameAndLabelSeparately( | 352 size_t saved_cursor = scanner->SaveCursor(); |
| 375 scanner, UTF8ToUTF16(kStateRe), kStateMatchType, &state_); | 353 if (ParseFieldSpecifics(scanner, "United States", kStateMatchType, nullptr)) { |
| 354 scanner->RewindTo(saved_cursor); | |
| 355 return RESULT_MATCH_NONE; | |
| 356 } | |
| 357 | |
| 358 return ParseNameAndLabelSeparately(scanner, kStateRe, kStateMatchType, | |
| 359 &state_); | |
| 376 } | 360 } |
| 377 | 361 |
| 378 } // namespace autofill | 362 } // namespace autofill |
| OLD | NEW |