net/http/http_content_disposition.cc - Issue 11478034: Add UMA for measuring Content-Disposition header use and abuse.

Side by Side Diff: net/http/http_content_disposition.cc

Issue 11478034: Add UMA for measuring Content-Disposition header use and abuse. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Suppress accidental trigraph Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "net/http/http_content_disposition.h"	5 #include "net/http/http_content_disposition.h"

6	6

7 #include "base/base64.h"	7 #include "base/base64.h"

8 #include "base/i18n/icu_string_conversions.h"	8 #include "base/i18n/icu_string_conversions.h"

9 #include "base/logging.h"	9 #include "base/logging.h"

10 #include "base/string_util.h"	10 #include "base/string_util.h"

(...skipping 77 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
88 ucnv_close(converter);	88 ucnv_close(converter);

89 if (U_FAILURE(err))	89 if (U_FAILURE(err))

90 return false;	90 return false;

91 output->resize(output_length);	91 output->resize(output_length);

92 return true;	92 return true;

93 }	93 }

94	94

95 bool DecodeWord(const std::string& encoded_word,	95 bool DecodeWord(const std::string& encoded_word,

96 const std::string& referrer_charset,	96 const std::string& referrer_charset,

97 bool* is_rfc2047,	97 bool* is_rfc2047,

98 std::string* output) {	98 std::string* output,

	99 int* parse_result_flags) {

99 *is_rfc2047 = false;	100 *is_rfc2047 = false;

100 output->clear();	101 output->clear();

101 if (encoded_word.empty())	102 if (encoded_word.empty())

102 return true;	103 return true;

103	104

104 if (!IsStringASCII(encoded_word)) {	105 if (!IsStringASCII(encoded_word)) {

105 // Try UTF-8, referrer_charset and the native OS default charset in turn.	106 // Try UTF-8, referrer_charset and the native OS default charset in turn.

106 if (IsStringUTF8(encoded_word)) {	107 if (IsStringUTF8(encoded_word)) {

107 *output = encoded_word;	108 *output = encoded_word;

108 } else {	109 } else {

109 string16 utf16_output;	110 string16 utf16_output;

110 if (!referrer_charset.empty() &&	111 if (!referrer_charset.empty() &&

111 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),	112 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),

112 base::OnStringConversionError::FAIL,	113 base::OnStringConversionError::FAIL,

113 &utf16_output)) {	114 &utf16_output)) {

114 *output = UTF16ToUTF8(utf16_output);	115 *output = UTF16ToUTF8(utf16_output);

115 } else {	116 } else {

116 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));	117 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));

117 }	118 }

118 }	119 }

119	120

	121 *parse_result_flags \|= net::HttpContentDisposition::HAS_NON_ASCII_STRINGS;

120 return true;	122 return true;

121 }	123 }

122	124

123 // RFC 2047 : one of encoding methods supported by Firefox and relatively	125 // RFC 2047 : one of encoding methods supported by Firefox and relatively

124 // widely used by web servers.	126 // widely used by web servers.

125 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.	127 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.

126 // We don't care about the length restriction (72 bytes) because	128 // We don't care about the length restriction (72 bytes) because

127 // many web servers generate encoded words longer than the limit.	129 // many web servers generate encoded words longer than the limit.

128 std::string tmp;	130 std::string decoded_word;

129 *is_rfc2047 = true;	131 *is_rfc2047 = true;

130 int part_index = 0;	132 int part_index = 0;

131 std::string charset;	133 std::string charset;

132 StringTokenizer t(encoded_word, "?");	134 StringTokenizer t(encoded_word, "?");

133 RFC2047EncodingType enc_type = Q_ENCODING;	135 RFC2047EncodingType enc_type = Q_ENCODING;

134 while (*is_rfc2047 && t.GetNext()) {	136 while (*is_rfc2047 && t.GetNext()) {

135 std::string part = t.token();	137 std::string part = t.token();

136 switch (part_index) {	138 switch (part_index) {

137 case 0:	139 case 0:

138 if (part != "=") {	140 if (part != "=") {

(...skipping 12 matching lines...) Expand all Loading...
151 part.find_first_of("bBqQ") == std::string::npos) {	153 part.find_first_of("bBqQ") == std::string::npos) {

152 *is_rfc2047 = false;	154 *is_rfc2047 = false;

153 break;	155 break;

154 }	156 }

155 if (part[0] == 'b' \|\| part[0] == 'B') {	157 if (part[0] == 'b' \|\| part[0] == 'B') {

156 enc_type = B_ENCODING;	158 enc_type = B_ENCODING;

157 }	159 }

158 ++part_index;	160 ++part_index;

159 break;	161 break;

160 case 3:	162 case 3:

161 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp);	163 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);

162 if (!*is_rfc2047) {	164 if (!*is_rfc2047) {

163 // Last minute failure. Invalid B/Q encoding. Rather than	165 // Last minute failure. Invalid B/Q encoding. Rather than

164 // passing it through, return now.	166 // passing it through, return now.

165 return false;	167 return false;

166 }	168 }

167 ++part_index;	169 ++part_index;

168 break;	170 break;

169 case 4:	171 case 4:

170 if (part != "=") {	172 if (part != "=") {

171 // Another last minute failure !	173 // Another last minute failure !

172 // Likely to be a case of two encoded-words in a row or	174 // Likely to be a case of two encoded-words in a row or

173 // an encoded word followed by a non-encoded word. We can be	175 // an encoded word followed by a non-encoded word. We can be

174 // generous, but it does not help much in terms of compatibility,	176 // generous, but it does not help much in terms of compatibility,

175 // I believe. Return immediately.	177 // I believe. Return immediately.

176 *is_rfc2047 = false;	178 *is_rfc2047 = false;

177 return false;	179 return false;

178 }	180 }

179 ++part_index;	181 ++part_index;

180 break;	182 break;

181 default:	183 default:

182 *is_rfc2047 = false;	184 *is_rfc2047 = false;

183 return false;	185 return false;

184 }	186 }

185 }	187 }

186	188

187 if (*is_rfc2047) {	189 if (*is_rfc2047) {

188 if (*(encoded_word.end() - 1) == '=') {	190 if (*(encoded_word.end() - 1) == '=') {

189 output->swap(tmp);	191 output->swap(decoded_word);

	192 *parse_result_flags \|=

	193 net::HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS;

190 return true;	194 return true;

191 }	195 }

192 // encoded_word ending prematurely with '?' or extra '?'	196 // encoded_word ending prematurely with '?' or extra '?'

193 *is_rfc2047 = false;	197 *is_rfc2047 = false;

194 return false;	198 return false;

195 }	199 }

196	200

197 // We're not handling 'especial' characters quoted with '\', but	201 // We're not handling 'especial' characters quoted with '\', but

198 // it should be Ok because we're not an email client but a	202 // it should be Ok because we're not an email client but a

199 // web browser.	203 // web browser.

200	204

201 // What IE6/7 does: %-escaped UTF-8.	205 // What IE6/7 does: %-escaped UTF-8.

202 tmp = net::UnescapeURLComponent(encoded_word, net::UnescapeRule::SPACES);	206 decoded_word = net::UnescapeURLComponent(encoded_word,

203 if (IsStringUTF8(tmp)) {	207 net::UnescapeRule::SPACES);

204 output->swap(tmp);	208 if (decoded_word != encoded_word)

	209 *parse_result_flags \|=

	210 net::HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS;

	211 if (IsStringUTF8(decoded_word)) {

	212 output->swap(decoded_word);

205 return true;	213 return true;

206 // We can try either the OS default charset or 'origin charset' here,	214 // We can try either the OS default charset or 'origin charset' here,

207 // As far as I can tell, IE does not support it. However, I've seen	215 // As far as I can tell, IE does not support it. However, I've seen

208 // web servers emit %-escaped string in a legacy encoding (usually	216 // web servers emit %-escaped string in a legacy encoding (usually

209 // origin charset).	217 // origin charset).

210 // TODO(jungshik) : Test IE further and consider adding a fallback here.	218 // TODO(jungshik) : Test IE further and consider adding a fallback here.

211 }	219 }

212 return false;	220 return false;

213 }	221 }

214	222

215 // Decodes the value of a 'filename' or 'name' parameter given as \|input\|. The	223 // Decodes the value of a 'filename' or 'name' parameter given as \|input\|. The

216 // value is supposed to be of the form:	224 // value is supposed to be of the form:

217 //	225 //

218 // value = token \| quoted-string	226 // value = token \| quoted-string

219 //	227 //

220 // However we currently also allow RFC 2047 encoding and non-ASCII	228 // However we currently also allow RFC 2047 encoding and non-ASCII

221 // strings. Non-ASCII strings are interpreted based on \|referrer_charset\|.	229 // strings. Non-ASCII strings are interpreted based on \|referrer_charset\|.

222 bool DecodeFilenameValue(const std::string& input,	230 bool DecodeFilenameValue(const std::string& input,

223 const std::string& referrer_charset,	231 const std::string& referrer_charset,

224 std::string* output) {	232 std::string* output,

225 std::string tmp;	233 int* parse_result_flags) {

	234 int current_parse_result_flags = 0;

	235 std::string decoded_value;

	236 bool is_previous_token_rfc2047 = true;

	237

226 // Tokenize with whitespace characters.	238 // Tokenize with whitespace characters.

227 StringTokenizer t(input, " \t\n\r");	239 StringTokenizer t(input, " \t\n\r");

228 t.set_options(StringTokenizer::RETURN_DELIMS);	240 t.set_options(StringTokenizer::RETURN_DELIMS);

229 bool is_previous_token_rfc2047 = true;

230 while (t.GetNext()) {	241 while (t.GetNext()) {

231 if (t.token_is_delim()) {	242 if (t.token_is_delim()) {

232 // If the previous non-delimiter token is not RFC2047-encoded,	243 // If the previous non-delimiter token is not RFC2047-encoded,

233 // put in a space in its place. Otherwise, skip over it.	244 // put in a space in its place. Otherwise, skip over it.

234 if (!is_previous_token_rfc2047) {	245 if (!is_previous_token_rfc2047)

235 tmp.push_back(' ');	246 decoded_value.push_back(' ');

236 }

237 continue;	247 continue;

238 }	248 }

239 // We don't support a single multibyte character split into	249 // We don't support a single multibyte character split into

240 // adjacent encoded words. Some broken mail clients emit headers	250 // adjacent encoded words. Some broken mail clients emit headers

241 // with that problem, but most web servers usually encode a filename	251 // with that problem, but most web servers usually encode a filename

242 // in a single encoded-word. Firefox/Thunderbird do not support	252 // in a single encoded-word. Firefox/Thunderbird do not support

243 // it, either.	253 // it, either.

244 std::string decoded;	254 std::string decoded;

245 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,	255 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,

246 &decoded))	256 &decoded, &current_parse_result_flags))

247 return false;	257 return false;

248 tmp.append(decoded);	258 decoded_value.append(decoded);

249 }	259 }

250 output->swap(tmp);	260 output->swap(decoded_value);

	261 if (parse_result_flags && !output->empty())

	262 *parse_result_flags \|= current_parse_result_flags;

251 return true;	263 return true;

252 }	264 }

253	265

254 // Parses the charset and value-chars out of an ext-value string.	266 // Parses the charset and value-chars out of an ext-value string.

255 //	267 //

256 // ext-value = charset "'" [ language ] "'" value-chars	268 // ext-value = charset "'" [ language ] "'" value-chars

257 bool ParseExtValueComponents(const std::string& input,	269 bool ParseExtValueComponents(const std::string& input,

258 std::string* charset,	270 std::string* charset,

259 std::string* value_chars) {	271 std::string* value_chars) {

260 StringTokenizer t(input, "'");	272 StringTokenizer t(input, "'");

(...skipping 71 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
332	344

333 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);	345 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);

334 }	346 }

335	347

336 } // namespace	348 } // namespace

337	349

338 namespace net {	350 namespace net {

339	351

340 HttpContentDisposition::HttpContentDisposition(	352 HttpContentDisposition::HttpContentDisposition(

341 const std::string& header, const std::string& referrer_charset)	353 const std::string& header, const std::string& referrer_charset)

342 : type_(INLINE) {	354 : type_(INLINE),

	355 parse_result_flags_(INVALID) {

343 Parse(header, referrer_charset);	356 Parse(header, referrer_charset);

344 }	357 }

345	358

346 HttpContentDisposition::~HttpContentDisposition() {	359 HttpContentDisposition::~HttpContentDisposition() {

347 }	360 }

348	361

349 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(	362 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(

350 std::string::const_iterator begin, std::string::const_iterator end) {	363 std::string::const_iterator begin, std::string::const_iterator end) {

351 DCHECK(type_ == INLINE);	364 DCHECK(type_ == INLINE);

352 std::string::const_iterator delimiter = std::find(begin, end, ';');	365 std::string::const_iterator delimiter = std::find(begin, end, ';');

353	366

354 std::string::const_iterator type_begin = begin;	367 std::string::const_iterator type_begin = begin;

355 std::string::const_iterator type_end = delimiter;	368 std::string::const_iterator type_end = delimiter;

356 HttpUtil::TrimLWS(&type_begin, &type_end);	369 HttpUtil::TrimLWS(&type_begin, &type_end);

357	370

358 // If the disposition-type isn't a valid token then the	371 // If the disposition-type isn't a valid token then the

359 // Content-Disposition header is malformed, and we treat the first bytes as	372 // Content-Disposition header is malformed, and we treat the first bytes as

360 // a parameter rather than a disposition-type.	373 // a parameter rather than a disposition-type.

361 if (!HttpUtil::IsToken(type_begin, type_end))	374 if (!HttpUtil::IsToken(type_begin, type_end))

362 return begin;	375 return begin;

363	376

	377 parse_result_flags_ \|= HAS_DISPOSITION_TYPE;

	378

364 DCHECK(std::find(type_begin, type_end, '=') == type_end);	379 DCHECK(std::find(type_begin, type_end, '=') == type_end);

365	380

366 if (!LowerCaseEqualsASCII(type_begin, type_end, "inline"))	381 if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) {

	382 type_ = INLINE;

	383 } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) {

367 type_ = ATTACHMENT;	384 type_ = ATTACHMENT;

	385 } else {

	386 parse_result_flags_ \|= HAS_UNKNOWN_DISPOSITION_TYPE;

	387 type_ = ATTACHMENT;

	388 }

368 return delimiter;	389 return delimiter;

369 }	390 }

370	391

371 // http://tools.ietf.org/html/rfc6266	392 // http://tools.ietf.org/html/rfc6266

372 //	393 //

373 // content-disposition = "Content-Disposition" ":"	394 // content-disposition = "Content-Disposition" ":"

374 // disposition-type *( ";" disposition-parm )	395 // disposition-type *( ";" disposition-parm )

375 //	396 //

376 // disposition-type = "inline" \| "attachment" \| disp-ext-type	397 // disposition-type = "inline" \| "attachment" \| disp-ext-type

377 // ; case-insensitive	398 // ; case-insensitive

(...skipping 19 matching lines...) Expand all Loading...
397	418

398 std::string name;	419 std::string name;

399 std::string filename;	420 std::string filename;

400 std::string ext_filename;	421 std::string ext_filename;

401	422

402 HttpUtil::NameValuePairsIterator iter(pos, end, ';');	423 HttpUtil::NameValuePairsIterator iter(pos, end, ';');

403 while (iter.GetNext()) {	424 while (iter.GetNext()) {

404 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),	425 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),

405 iter.name_end(),	426 iter.name_end(),

406 "filename")) {	427 "filename")) {

407 DecodeFilenameValue(iter.value(), referrer_charset, &filename);	428 DecodeFilenameValue(iter.value(), referrer_charset, &filename,

	429 &parse_result_flags_);

	430 if (!filename.empty())

	431 parse_result_flags_ \|= HAS_FILENAME;

408 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(),	432 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(),

409 iter.name_end(),	433 iter.name_end(),

410 "name")) {	434 "name")) {

411 DecodeFilenameValue(iter.value(), referrer_charset, &name);	435 DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL);

	436 if (!name.empty())

	437 parse_result_flags_ \|= HAS_NAME;

412 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),	438 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),

413 iter.name_end(),	439 iter.name_end(),

414 "filename*")) {	440 "filename*")) {

415 DecodeExtValue(iter.raw_value(), &ext_filename);	441 DecodeExtValue(iter.raw_value(), &ext_filename);

	442 if (!ext_filename.empty())

	443 parse_result_flags_ \|= HAS_EXT_FILENAME;

416 }	444 }

417 }	445 }

418	446

419 if (!ext_filename.empty())	447 if (!ext_filename.empty())

420 filename_ = ext_filename;	448 filename_ = ext_filename;

421 else if (!filename.empty())	449 else if (!filename.empty())

422 filename_ = filename;	450 filename_ = filename;

423 else	451 else

424 filename_ = name;	452 filename_ = name;

425 }	453 }

426	454

427 } // namespace net	455 } // namespace net

OLD	NEW

« no previous file with comments | « net/http/http_content_disposition.h ('k') | net/http/http_content_disposition_unittest.cc » ('j') | no next file with comments »