chrome/browser/extensions/api/web_request/form_data_parser.cc - Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest

Unified Diff: chrome/browser/extensions/api/web_request/form_data_parser.cc

Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest (Closed) Base URL: http://git.chromium.org/chromium/src.git@master

Patch Set: No change in code, but with a patch generated without copy detection Created 8 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « chrome/browser/extensions/api/web_request/form_data_parser.h ('k') | chrome/browser/extensions/api/web_request/form_data_parser_unittest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: chrome/browser/extensions/api/web_request/form_data_parser.cc

diff --git a/chrome/browser/extensions/api/web_request/form_data_parser.cc b/chrome/browser/extensions/api/web_request/form_data_parser.cc

new file mode 100644

index 0000000000000000000000000000000000000000..ad830cc2f09f1de32cdd9f8e474d0d796faac26b

--- /dev/null

+++ b/chrome/browser/extensions/api/web_request/form_data_parser.cc

@@ -0,0 +1,591 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "chrome/browser/extensions/api/web_request/form_data_parser.h"

+#include <vector>

+#include "base/lazy_instance.h"

+#include "base/string_util.h"

+#include "base/values.h"

+#include "net/base/escape.h"

+#include "net/url_request/url_request.h"

+#include "third_party/re2/re2/re2.h"

+using base::DictionaryValue;

+using base::ListValue;

+using base::StringPiece;

+using re2::RE2;

+namespace extensions {

+namespace {

+#define CONTENT_DISPOSITION "content-disposition:"

+static const char g_escape_closing_quote[] = "\\\\E";

+static const size_t g_content_disposition_length =

+ sizeof(CONTENT_DISPOSITION) - 1;

+// A wrapper struct for static RE2 objects to be held as LazyInstance.

+struct Patterns {

+ Patterns();

+ ~Patterns();

+ const RE2 transfer_padding_pattern;

+ const RE2 crlf_pattern;

+ const RE2 closing_pattern;

+ const RE2 epilogue_pattern;

+ const RE2 crlf_free_pattern;

+ const RE2 preamble_pattern;

+ const RE2 header_pattern;

+ const RE2 content_disposition_pattern;

+ const RE2 name_pattern;

+ const RE2 value_pattern;

+ const RE2 unquote_pattern;

+ const RE2 url_encoded_pattern;

+};

+Patterns::Patterns()

+ : transfer_padding_pattern("[ \\t]*\\r\\n"),

+ crlf_pattern("\\r\\n"),

+ closing_pattern("--[ \\t]*"),

+ epilogue_pattern("|\\r\\n(?s:.)*"),

+ crlf_free_pattern("(?:[^\\r]|\\r+[^\\r\\n])*"),

+ preamble_pattern(".*?"),

+ header_pattern("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"),

+ content_disposition_pattern("(?i:" CONTENT_DISPOSITION ")"),

+ name_pattern("\\bname=\"([^\"]*)\""),

+ value_pattern("\\bfilename=\"([^\"]*)\""),

+ unquote_pattern(g_escape_closing_quote),

+ url_encoded_pattern("([^=]*)=([^&]*)&?") {}

+#undef CONTENT_DISPOSITION

+Patterns::~Patterns() {}

+static base::LazyInstance<Patterns>::Leaky g_patterns =

+ LAZY_INSTANCE_INITIALIZER;

+} // namespace

+// Parses URLencoded forms, see

+// http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .

+class FormDataParserUrlEncoded : public FormDataParser {

+ public:

+ FormDataParserUrlEncoded();

+ virtual ~FormDataParserUrlEncoded();

+ // Implementation of FormDataParser.

+ virtual bool AllDataReadOK() OVERRIDE;

+ virtual bool GetNextNameValue(Result* result) OVERRIDE;

+ virtual bool SetSource(const base::StringPiece& source) OVERRIDE;

+ private:

+ // The pattern to match a single name-value pair. This could be even static,

+ // but then we would have to spend more code on initializing the cached

+ // pointer to g_patterns.Get().

+ const RE2& pattern() const {

+ return patterns_->url_encoded_pattern;

+ }

+ // Auxiliary constant for using RE2. Number of arguments for parsing

+ // name-value pairs (one for name, one for value).

+ static const size_t args_size_ = 2u;

+ static const net::UnescapeRule::Type unescape_rules_;

+ re2::StringPiece source_;

+ bool source_set_;

+ // Auxiliary store for using RE2.

+ std::string name_;

+ std::string value_;

+ const RE2::Arg arg_name_;

+ const RE2::Arg arg_value_;

+ const RE2::Arg* args_[args_size_];

+ // Caching the pointer to g_patterns.Get().

+ const Patterns* patterns_;

+ DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);

+};

+// The following class, FormDataParserMultipart, parses forms encoded as

+// multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart

+// encoding) and 5322 (MIME-headers).

+//

+// Implementation details

+//

+// The original grammar from RFC 2046 is this, "multipart-body" being the root

+// non-terminal:

+//

+// boundary := 0*69<bchars> bcharsnospace

+// bchars := bcharsnospace / " "

+// bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","

+// / "-" / "." / "/" / ":" / "=" / "?"

+// dash-boundary := "--" boundary

+// multipart-body := [preamble CRLF]

+// dash-boundary transport-padding CRLF

+// body-part *encapsulation

+// close-delimiter transport-padding

+// [CRLF epilogue]

+// transport-padding := *LWSP-char

+// encapsulation := delimiter transport-padding CRLF body-part

+// delimiter := CRLF dash-boundary

+// close-delimiter := delimiter "--"

+// preamble := discard-text

+// epilogue := discard-text

+// discard-text := *(*text CRLF) *text

+// body-part := MIME-part-headers [CRLF *OCTET]

+// OCTET := <any 0-255 octet value>

+//

+// Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,

+// DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the

+// English alphabet, respectively.

+// The non-terminal "text" is presumably just any text, excluding line breaks.

+// The non-terminal "LWSP-char" is not directly defined in the original grammar

+// but it means "linear whitespace", which is a space or a horizontal tab.

+// The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use

+// the syntax for "optional fields" from Section 3.6.8 of RFC 5322:

+//

+// MIME-part-headers := field-name ":" unstructured CRLF

+// field-name := 1*ftext

+// ftext := %d33-57 / ; Printable US-ASCII

+// %d59-126 ; characters not including ":".

+// Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which

+// does not contain a CRLF sub-string, except for substrings "CRLF<space>" and

+// "CRLF<horizontal tab>", which serve for "folding".

+//

+// The FormDataParserMultipart class reads the input source and tries to parse it

+// according to the grammar above, rooted at the "multipart-body" non-terminal.

+// This happens in stages:

+//

+// 1. The optional preamble and the initial dash-boundary with transport padding

+// and a CRLF are read and ignored.

+//

+// 2. Repeatedly each body part is read. The body parts can either serve to

+// upload a file, or just a string of bytes.

+// 2.a. The headers of that part are searched for the "content-disposition"

+// header, which contains the name of the value represented by that body

+// part. If the body-part is for file upload, that header also contains a

+// filename.

+// 2.b. The "*OCTET" part of the body part is then read and passed as the value

+// of the name-value pair for body parts representing a string of bytes.

+// For body parts for uploading a file the "*OCTET" part is just ignored

+// and the filename is used for value instead.

+//

+// 3. The final close-delimiter and epilogue are read and ignored.

+//

+// IMPORTANT NOTE

+// This parser supports multiple sources, i.e., SetSource can be called multiple

+// times if the input is spread over several byte blocks. However, the split

+// may only occur inside a body part, right after the trailing CRLF of headers.

+class FormDataParserMultipart : public FormDataParser {

+ public:

+ explicit FormDataParserMultipart(const std::string& boundary_separator);

+ virtual ~FormDataParserMultipart();

+ // Implementation of FormDataParser.

+ virtual bool AllDataReadOK() OVERRIDE;

+ virtual bool GetNextNameValue(Result* result) OVERRIDE;

+ virtual bool SetSource(const base::StringPiece& source) OVERRIDE;

+ private:

+ enum State {

+ STATE_INIT, // No input read yet.

+ STATE_READY, // Ready to call GetNextNameValue.

+ STATE_FINISHED, // Read the input until the end.

+ STATE_SUSPEND, // Waiting until a new |source_| is set.

+ STATE_ERROR

+ };

+ // Produces a regexp to match the string "--" + |literal|. The idea is to

+ // represent "--" + |literal| as a "quoted pattern", a verbatim copy enclosed

+ // in "\\Q" and "\\E". The only catch is to watch out for occurrences of "\\E"

+ // inside |literal|. Those must be excluded from the quote and the backslash

+ // doubly escaped. For example, for literal == "abc\\Edef" the result is

+ // "\\Q--abc\\E\\\\E\\Qdef\\E".

+ static std::string CreateBoundaryPatternFromLiteral(

+ const std::string& literal);

+ // Tests whether |input| has a prefix matching |pattern|.

+ static bool StartsWithPattern(const re2::StringPiece& input,

+ const RE2& pattern);

+ // If |source_| starts with a header, seeks |source_| beyond the header. If

+ // the header is Content-Disposition, extracts |name| from "name=" and

+ // possibly |value| from "filename=" fields of that header. Only if the

+ // "name" or "filename" fields are found, then |name| or |value| are touched.

+ // Returns true iff |source_| is seeked forward. Sets |value_assigned|

+ // to true iff |value| has been assigned to.

+ bool TryReadHeader(base::StringPiece* name,

+ base::StringPiece* value,

+ bool* value_assigned);

+ // Helper to GetNextNameValue. Expects that the input starts with a data

+ // portion of a body part. An attempt is made to read the input until the end

+ // of that body part. If |data| is not NULL, it is set to contain the data

+ // portion. Returns true iff the reading was successful.

+ bool FinishReadingPart(base::StringPiece* data);

+ // These methods could be even static, but then we would have to spend more

+ // code on initializing the cached pointer to g_patterns.Get().

+ const RE2& transfer_padding_pattern() const {

+ return patterns_->transfer_padding_pattern;

+ }

+ const RE2& crlf_pattern() const {

+ return patterns_->crlf_pattern;

+ }

+ const RE2& closing_pattern() const {

+ return patterns_->closing_pattern;

+ }

+ const RE2& epilogue_pattern() const {

+ return patterns_->epilogue_pattern;

+ }

+ const RE2& crlf_free_pattern() const {

+ return patterns_->crlf_free_pattern;

+ }

+ const RE2& preamble_pattern() const {

+ return patterns_->preamble_pattern;

+ }

+ const RE2& header_pattern() const {

+ return patterns_->header_pattern;

+ }

+ const RE2& content_disposition_pattern() const {

+ return patterns_->content_disposition_pattern;

+ }

+ const RE2& name_pattern() const {

+ return patterns_->name_pattern;

+ }

+ const RE2& value_pattern() const {

+ return patterns_->value_pattern;

+ }

+ // However, this is used in a static method so it needs to be static.

+ static const RE2& unquote_pattern() {

+ return g_patterns.Get().unquote_pattern; // No caching g_patterns here.

+ }

+ const RE2 dash_boundary_pattern_;

+ // Because of initialisation dependency, |state_| needs to be declared after

+ // |dash_boundary_pattern_|.

+ State state_;

+ // The parsed message can be split into multiple sources which we read

+ // sequentially.

+ re2::StringPiece source_;

+ // Caching the pointer to g_patterns.Get().

+ const Patterns* patterns_;

+ DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);

+};

+// Implementation of FormDataParser and FormDataParser::Result.

+FormDataParser::Result::Result() {}

+FormDataParser::Result::~Result() {}

+void FormDataParser::Result::Reset() {

+ name_.erase();

+ value_.erase();

+FormDataParser::~FormDataParser() {}

+// static

+scoped_ptr<FormDataParser> FormDataParser::Create(

+ const net::URLRequest* request) {

+ std::string value;

+ const bool found = request->extra_request_headers().GetHeader(

+ net::HttpRequestHeaders::kContentType, &value);

+ return Create(found ? &value : NULL);

+// static

+scoped_ptr<FormDataParser> FormDataParser::Create(

+ const std::string* content_type_header) {

+ enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE};

+ ParserChoice choice = ERROR_CHOICE;

+ std::string boundary;

+ if (content_type_header == NULL) {

+ choice = URL_ENCODED;

+ } else {

+ const std::string content_type(

+ content_type_header->substr(0, content_type_header->find(';')));

+ if (base::strcasecmp(

+ content_type.c_str(), "application/x-www-form-urlencoded") == 0) {

+ choice = URL_ENCODED;

+ } else if (base::strcasecmp(

+ content_type.c_str(), "multipart/form-data") == 0) {

+ static const char kBoundaryString[] = "boundary=";

+ size_t offset = content_type_header->find(kBoundaryString);

+ if (offset == std::string::npos) {

+ // Malformed header.

+ return scoped_ptr<FormDataParser>();

+ }

+ offset += sizeof(kBoundaryString) - 1;

+ boundary = content_type_header->substr(

+ offset, content_type_header->find(';', offset));

+ if (!boundary.empty())

+ choice = MULTIPART;

+ }

+ // Other cases are unparseable, including when |content_type| is "text/plain".

+ switch (choice) {

+ case URL_ENCODED:

+ return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded());

+ case MULTIPART:

+ return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary));

+ default: // In other words, case ERROR_CHOICE:

+ return scoped_ptr<FormDataParser>();

+ }

+FormDataParser::FormDataParser() {}

+// Implementation of FormDataParserUrlEncoded.

+const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ =

+ net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS |

+ net::UnescapeRule::SPACES | net::UnescapeRule::REPLACE_PLUS_WITH_SPACE;

+FormDataParserUrlEncoded::FormDataParserUrlEncoded()

+ : source_(NULL),

+ source_set_(false),

+ arg_name_(&name_),

+ arg_value_(&value_),

+ patterns_(&(g_patterns.Get())) {

+ args_[0] = &arg_name_;

+ args_[1] = &arg_value_;

+FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {}

+bool FormDataParserUrlEncoded::AllDataReadOK() {

+ // All OK means we read the whole source.

+ return source_set_ && source_.size() == 0;

+bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) {

+ if (!source_set_)

+ return false;

+ bool success = RE2::ConsumeN(&source_, pattern(), args_, args_size_);

+ if (success) {

+ result->set_name(net::UnescapeURLComponent(name_, unescape_rules_));

+ result->set_value(net::UnescapeURLComponent(value_, unescape_rules_));

+ }

+ return success;

+bool FormDataParserUrlEncoded::SetSource(const base::StringPiece& source) {

+ if (source_set_)

+ return false; // We do not allow multiple sources for this parser.

+ source_.set(source.data(), source.size());

+ source_set_ = true;

+ return true;

+// Implementation of FormDataParserMultipart.

+// static

+std::string FormDataParserMultipart::CreateBoundaryPatternFromLiteral(

+ const std::string& literal) {

+#define OPEN_QUOTE "\\Q"

+ static const char quote[] = OPEN_QUOTE;

+ static const char unquote[] = "\\E";

+ // The result always starts with opening the quote and then "--".

+ std::string result(OPEN_QUOTE "--");

+#undef OPEN_QUOTE

+ // This StringPiece is used below to record the next occurrence of "\\E" in

+ // |literal|.

+ re2::StringPiece seek_unquote(literal);

+ const char* copy_start = literal.data();

+ size_t copy_length = literal.size();

+ // Find all "\\E" in |literal| and exclude them from the \Q...\E quote.

+ while (RE2::FindAndConsume(&seek_unquote, unquote_pattern())) {

+ copy_length = seek_unquote.data() - copy_start;

+ result.append(copy_start, copy_length);

+ result.append(g_escape_closing_quote);

+ result.append(quote);

+ copy_start = seek_unquote.data();

+ }

+ // Finish the last \Q...\E quote.

+ copy_length = (literal.data() + literal.size()) - copy_start;

+ result.append(copy_start, copy_length);

+ result.append(unquote);

+ return result;

+// static

+bool FormDataParserMultipart::StartsWithPattern(const re2::StringPiece& input,

+ const RE2& pattern) {

+ return pattern.Match(input, 0, input.size(), RE2::ANCHOR_START, NULL, 0);

+FormDataParserMultipart::FormDataParserMultipart(

+ const std::string& boundary_separator)

+ : dash_boundary_pattern_(

+ CreateBoundaryPatternFromLiteral(boundary_separator)),

+ state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR),

+ patterns_(&(g_patterns.Get())) {}

+FormDataParserMultipart::~FormDataParserMultipart() {}

+bool FormDataParserMultipart::AllDataReadOK() {

+ return state_ == STATE_FINISHED;

+bool FormDataParserMultipart::FinishReadingPart(base::StringPiece* data) {

+ const char* data_start = source_.data();

+ while (!StartsWithPattern(source_, dash_boundary_pattern_)) {

+ if (!RE2::Consume(&source_, crlf_free_pattern()) ||

+ !RE2::Consume(&source_, crlf_pattern())) {

+ state_ = STATE_ERROR;

+ return false;

+ }

+ if (data != NULL) {

+ if (source_.data() == data_start) {

+ // No data in this body part.

+ state_ = STATE_ERROR;

+ return false;

+ }

+ // Subtract 2u for the trailing "\r\n".

+ data->set(data_start, source_.data() - data_start - 2u);

+ }

+ // Finally, read the dash-boundary and either skip to the next body part, or

+ // finish reading the source.

+ CHECK(RE2::Consume(&source_, dash_boundary_pattern_));

+ if (StartsWithPattern(source_, closing_pattern())) {

+ CHECK(RE2::Consume(&source_, closing_pattern()));

+ if (RE2::Consume(&source_, epilogue_pattern()))

+ state_ = STATE_FINISHED;

+ else

+ state_ = STATE_ERROR;

+ } else { // Next body part ahead.

+ if (!RE2::Consume(&source_, transfer_padding_pattern()))

+ state_ = STATE_ERROR;

+ }

+ return state_ != STATE_ERROR;

+bool FormDataParserMultipart::GetNextNameValue(Result* result) {

+ if (source_.size() == 0 || state_ != STATE_READY)

+ return false;

+ // 1. Read body-part headers.

+ base::StringPiece name;

+ base::StringPiece value;

+ bool value_assigned = false;

+ bool value_assigned_temp;

+ while (TryReadHeader(&name, &value, &value_assigned_temp))

+ value_assigned |= value_assigned_temp;

+ if (name.size() == 0 || state_ == STATE_ERROR) {

+ state_ = STATE_ERROR;

+ return false;

+ }

+ // 2. Read the trailing CRLF after headers.

+ if (!RE2::Consume(&source_, crlf_pattern())) {

+ state_ = STATE_ERROR;

+ return false;

+ }

+ // 3. Read the data of this body part, i.e., everything until the first

+ // dash-boundary.

+ bool return_value;

+ if (value_assigned && source_.size() == 0) { // Wait for a new source?

+ return_value = true;

+ state_ = STATE_SUSPEND;

+ } else {

+ return_value = FinishReadingPart(value_assigned ? NULL : &value);

+ }

+ std::string unescaped_name = net::UnescapeURLComponent(

+ name.as_string(),

+ net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS);

+ result->set_name(unescaped_name);

+ result->set_value(value);

+ return return_value;

+bool FormDataParserMultipart::SetSource(const base::StringPiece& source) {

+ if (source.data() == NULL || source_.size() != 0)

+ return false;

+ source_.set(source.data(), source.size());

+ switch (state_) {

+ case STATE_INIT:

+ // Seek behind the preamble.

+ while (!StartsWithPattern(source_, dash_boundary_pattern_)) {

+ if (!RE2::Consume(&source_, preamble_pattern())) {

+ state_ = STATE_ERROR;

+ break;

+ }

+ // Read dash-boundary, transport padding, and CRLF.

+ if (state_ != STATE_ERROR) {

+ if (!RE2::Consume(&source_, dash_boundary_pattern_) ||

+ !RE2::Consume(&source_, transfer_padding_pattern()))

+ state_ = STATE_ERROR;

+ else

+ state_ = STATE_READY;

+ }

+ break;

+ case STATE_READY: // Nothing to do.

+ break;

+ case STATE_SUSPEND:

+ state_ = FinishReadingPart(NULL) ? STATE_READY : STATE_ERROR;

+ break;

+ default:

+ state_ = STATE_ERROR;

+ }

+ return state_ != STATE_ERROR;

+bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name,

+ base::StringPiece* value,

+ bool* value_assigned) {

+ *value_assigned = false;

+ const char* header_start = source_.data();

+ if (!RE2::Consume(&source_, header_pattern()))

+ return false;

+ // (*) After this point we must return true, because we consumed one header.

+ // Subtract 2u for the trailing "\r\n".

+ re2::StringPiece header(header_start, source_.data() - header_start - 2u);

+ if (!StartsWithPattern(header, content_disposition_pattern()))

+ return true; // Skip headers that don't describe the content-disposition.

+ re2::StringPiece groups[2u];

+ if (!name_pattern().Match(header,

+ g_content_disposition_length, header.size(),

+ RE2::UNANCHORED, groups, 2)) {

+ state_ = STATE_ERROR;

+ return true; // See (*) for why true.

+ }

+ name->set(groups[1].data(), groups[1].size());

+ if (value_pattern().Match(header,

+ g_content_disposition_length, header.size(),

+ RE2::UNANCHORED, groups, 2)) {

+ value->set(groups[1].data(), groups[1].size());

+ *value_assigned = true;

+ }

+ return true;

+} // namespace extensions