OLD | NEW |
1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #ifndef URL_URL_CANON_H_ | 5 #ifndef URL_URL_CANON_H_ |
6 #define URL_URL_CANON_H_ | 6 #define URL_URL_CANON_H_ |
7 | 7 |
8 #include <stdlib.h> | 8 #include <stdlib.h> |
9 #include <string.h> | 9 #include <string.h> |
10 | 10 |
11 #include "base/string16.h" | 11 #include "base/string16.h" |
| 12 #include "url/url_export.h" |
12 #include "url/url_parse.h" | 13 #include "url/url_parse.h" |
13 | 14 |
14 namespace url_canon { | 15 namespace url_canon { |
15 | 16 |
16 // Canonicalizer output ------------------------------------------------------- | 17 // Canonicalizer output ------------------------------------------------------- |
17 | 18 |
18 // Base class for the canonicalizer output. This maintains a buffer and | 19 // Base class for the canonicalizer output. This maintains a buffer and |
19 // supports simple resizing and append operations on it. | 20 // supports simple resizing and append operations on it. |
20 // | 21 // |
21 // It is VERY IMPORTANT that no virtual function calls be made on the common | 22 // It is VERY IMPORTANT that no virtual function calls be made on the common |
(...skipping 157 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
179 class RawCanonOutputW : public RawCanonOutputT<char16, fixed_capacity> {}; | 180 class RawCanonOutputW : public RawCanonOutputT<char16, fixed_capacity> {}; |
180 | 181 |
181 // Character set converter ---------------------------------------------------- | 182 // Character set converter ---------------------------------------------------- |
182 // | 183 // |
183 // Converts query strings into a custom encoding. The embedder can supply an | 184 // Converts query strings into a custom encoding. The embedder can supply an |
184 // implementation of this class to interface with their own character set | 185 // implementation of this class to interface with their own character set |
185 // conversion libraries. | 186 // conversion libraries. |
186 // | 187 // |
187 // Embedders will want to see the unit test for the ICU version. | 188 // Embedders will want to see the unit test for the ICU version. |
188 | 189 |
189 class CharsetConverter { | 190 class URL_EXPORT CharsetConverter { |
190 public: | 191 public: |
191 CharsetConverter() {} | 192 CharsetConverter() {} |
192 virtual ~CharsetConverter() {} | 193 virtual ~CharsetConverter() {} |
193 | 194 |
194 // Converts the given input string from UTF-16 to whatever output format the | 195 // Converts the given input string from UTF-16 to whatever output format the |
195 // converter supports. This is used only for the query encoding conversion, | 196 // converter supports. This is used only for the query encoding conversion, |
196 // which does not fail. Instead, the converter should insert "invalid | 197 // which does not fail. Instead, the converter should insert "invalid |
197 // character" characters in the output for invalid sequences, and do the | 198 // character" characters in the output for invalid sequences, and do the |
198 // best it can. | 199 // best it can. |
199 // | 200 // |
(...skipping 17 matching lines...) Expand all Loading... |
217 // This should be called before parsing if whitespace removal is desired (which | 218 // This should be called before parsing if whitespace removal is desired (which |
218 // it normally is when you are canonicalizing). | 219 // it normally is when you are canonicalizing). |
219 // | 220 // |
220 // If no whitespace is removed, this function will not use the buffer and will | 221 // If no whitespace is removed, this function will not use the buffer and will |
221 // return a pointer to the input, to avoid the extra copy. If modification is | 222 // return a pointer to the input, to avoid the extra copy. If modification is |
222 // required, the given |buffer| will be used and the returned pointer will | 223 // required, the given |buffer| will be used and the returned pointer will |
223 // point to the beginning of the buffer. | 224 // point to the beginning of the buffer. |
224 // | 225 // |
225 // Therefore, callers should not use the buffer, since it may actually be empty, | 226 // Therefore, callers should not use the buffer, since it may actually be empty, |
226 // use the computed pointer and |*output_len| instead. | 227 // use the computed pointer and |*output_len| instead. |
227 const char* RemoveURLWhitespace(const char* input, int input_len, | 228 URL_EXPORT const char* RemoveURLWhitespace(const char* input, int input_len, |
228 CanonOutputT<char>* buffer, | 229 CanonOutputT<char>* buffer, |
229 int* output_len); | 230 int* output_len); |
230 const char16* RemoveURLWhitespace(const char16* input, int input_len, | 231 URL_EXPORT const char16* RemoveURLWhitespace(const char16* input, int input_len, |
231 CanonOutputT<char16>* buffer, | 232 CanonOutputT<char16>* buffer, |
232 int* output_len); | 233 int* output_len); |
233 | 234 |
234 // IDN ------------------------------------------------------------------------ | 235 // IDN ------------------------------------------------------------------------ |
235 | 236 |
236 // Converts the Unicode input representing a hostname to ASCII using IDN rules. | 237 // Converts the Unicode input representing a hostname to ASCII using IDN rules. |
237 // The output must fall in the ASCII range, but will be encoded in UTF-16. | 238 // The output must fall in the ASCII range, but will be encoded in UTF-16. |
238 // | 239 // |
239 // On success, the output will be filled with the ASCII host name and it will | 240 // On success, the output will be filled with the ASCII host name and it will |
240 // return true. Unlike most other canonicalization functions, this assumes that | 241 // return true. Unlike most other canonicalization functions, this assumes that |
241 // the output is empty. The beginning of the host will be at offset 0, and | 242 // the output is empty. The beginning of the host will be at offset 0, and |
242 // the length of the output will be set to the length of the new host name. | 243 // the length of the output will be set to the length of the new host name. |
243 // | 244 // |
244 // On error, returns false. The output in this case is undefined. | 245 // On error, returns false. The output in this case is undefined. |
245 bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output); | 246 URL_EXPORT bool IDNToASCII(const char16* src, |
| 247 int src_len, |
| 248 CanonOutputW* output); |
246 | 249 |
247 // Piece-by-piece canonicalizers ---------------------------------------------- | 250 // Piece-by-piece canonicalizers ---------------------------------------------- |
248 // | 251 // |
249 // These individual canonicalizers append the canonicalized versions of the | 252 // These individual canonicalizers append the canonicalized versions of the |
250 // corresponding URL component to the given std::string. The spec and the | 253 // corresponding URL component to the given std::string. The spec and the |
251 // previously-identified range of that component are the input. The range of | 254 // previously-identified range of that component are the input. The range of |
252 // the canonicalized component will be written to the output component. | 255 // the canonicalized component will be written to the output component. |
253 // | 256 // |
254 // These functions all append to the output so they can be chained. Make sure | 257 // These functions all append to the output so they can be chained. Make sure |
255 // the output is empty when you start. | 258 // the output is empty when you start. |
256 // | 259 // |
257 // These functions return boolean values indicating success. On failure, they | 260 // These functions return boolean values indicating success. On failure, they |
258 // will attempt to write something reasonable to the output so that, if | 261 // will attempt to write something reasonable to the output so that, if |
259 // displayed to the user, they will recognise it as something that's messed up. | 262 // displayed to the user, they will recognise it as something that's messed up. |
260 // Nothing more should ever be done with these invalid URLs, however. | 263 // Nothing more should ever be done with these invalid URLs, however. |
261 | 264 |
262 // Scheme: Appends the scheme and colon to the URL. The output component will | 265 // Scheme: Appends the scheme and colon to the URL. The output component will |
263 // indicate the range of characters up to but not including the colon. | 266 // indicate the range of characters up to but not including the colon. |
264 // | 267 // |
265 // Canonical URLs always have a scheme. If the scheme is not present in the | 268 // Canonical URLs always have a scheme. If the scheme is not present in the |
266 // input, this will just write the colon to indicate an empty scheme. Does not | 269 // input, this will just write the colon to indicate an empty scheme. Does not |
267 // append slashes which will be needed before any authority components for most | 270 // append slashes which will be needed before any authority components for most |
268 // URLs. | 271 // URLs. |
269 // | 272 // |
270 // The 8-bit version requires UTF-8 encoding. | 273 // The 8-bit version requires UTF-8 encoding. |
271 bool CanonicalizeScheme(const char* spec, | 274 URL_EXPORT bool CanonicalizeScheme(const char* spec, |
272 const url_parse::Component& scheme, | 275 const url_parse::Component& scheme, |
273 CanonOutput* output, | 276 CanonOutput* output, |
274 url_parse::Component* out_scheme); | 277 url_parse::Component* out_scheme); |
275 bool CanonicalizeScheme(const char16* spec, | 278 URL_EXPORT bool CanonicalizeScheme(const char16* spec, |
276 const url_parse::Component& scheme, | 279 const url_parse::Component& scheme, |
277 CanonOutput* output, | 280 CanonOutput* output, |
278 url_parse::Component* out_scheme); | 281 url_parse::Component* out_scheme); |
279 | 282 |
280 // User info: username/password. If present, this will add the delimiters so | 283 // User info: username/password. If present, this will add the delimiters so |
281 // the output will be "<username>:<password>@" or "<username>@". Empty | 284 // the output will be "<username>:<password>@" or "<username>@". Empty |
282 // username/password pairs, or empty passwords, will get converted to | 285 // username/password pairs, or empty passwords, will get converted to |
283 // nonexistent in the canonical version. | 286 // nonexistent in the canonical version. |
284 // | 287 // |
285 // The components for the username and password refer to ranges in the | 288 // The components for the username and password refer to ranges in the |
286 // respective source strings. Usually, these will be the same string, which | 289 // respective source strings. Usually, these will be the same string, which |
287 // is legal as long as the two components don't overlap. | 290 // is legal as long as the two components don't overlap. |
288 // | 291 // |
289 // The 8-bit version requires UTF-8 encoding. | 292 // The 8-bit version requires UTF-8 encoding. |
290 bool CanonicalizeUserInfo(const char* username_source, | 293 URL_EXPORT bool CanonicalizeUserInfo(const char* username_source, |
291 const url_parse::Component& username, | 294 const url_parse::Component& username, |
292 const char* password_source, | 295 const char* password_source, |
293 const url_parse::Component& password, | 296 const url_parse::Component& password, |
294 CanonOutput* output, | 297 CanonOutput* output, |
295 url_parse::Component* out_username, | 298 url_parse::Component* out_username, |
296 url_parse::Component* out_password); | 299 url_parse::Component* out_password); |
297 bool CanonicalizeUserInfo(const char16* username_source, | 300 URL_EXPORT bool CanonicalizeUserInfo(const char16* username_source, |
298 const url_parse::Component& username, | 301 const url_parse::Component& username, |
299 const char16* password_source, | 302 const char16* password_source, |
300 const url_parse::Component& password, | 303 const url_parse::Component& password, |
301 CanonOutput* output, | 304 CanonOutput* output, |
302 url_parse::Component* out_username, | 305 url_parse::Component* out_username, |
303 url_parse::Component* out_password); | 306 url_parse::Component* out_password); |
304 | 307 |
305 | 308 |
306 // This structure holds detailed state exported from the IP/Host canonicalizers. | 309 // This structure holds detailed state exported from the IP/Host canonicalizers. |
307 // Additional fields may be added as callers require them. | 310 // Additional fields may be added as callers require them. |
308 struct CanonHostInfo { | 311 struct CanonHostInfo { |
309 CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {} | 312 CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {} |
310 | 313 |
311 // Convenience function to test if family is an IP address. | 314 // Convenience function to test if family is an IP address. |
312 bool IsIPAddress() const { return family == IPV4 || family == IPV6; } | 315 bool IsIPAddress() const { return family == IPV4 || family == IPV6; } |
313 | 316 |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
346 int AddressLength() const { | 349 int AddressLength() const { |
347 return family == IPV4 ? 4 : (family == IPV6 ? 16 : 0); | 350 return family == IPV4 ? 4 : (family == IPV6 ? 16 : 0); |
348 } | 351 } |
349 }; | 352 }; |
350 | 353 |
351 | 354 |
352 // Host. | 355 // Host. |
353 // | 356 // |
354 // The 8-bit version requires UTF-8 encoding. Use this version when you only | 357 // The 8-bit version requires UTF-8 encoding. Use this version when you only |
355 // need to know whether canonicalization succeeded. | 358 // need to know whether canonicalization succeeded. |
356 bool CanonicalizeHost(const char* spec, | 359 URL_EXPORT bool CanonicalizeHost(const char* spec, |
357 const url_parse::Component& host, | 360 const url_parse::Component& host, |
358 CanonOutput* output, | 361 CanonOutput* output, |
359 url_parse::Component* out_host); | 362 url_parse::Component* out_host); |
360 bool CanonicalizeHost(const char16* spec, | 363 URL_EXPORT bool CanonicalizeHost(const char16* spec, |
361 const url_parse::Component& host, | 364 const url_parse::Component& host, |
362 CanonOutput* output, | 365 CanonOutput* output, |
363 url_parse::Component* out_host); | 366 url_parse::Component* out_host); |
364 | 367 |
365 // Extended version of CanonicalizeHost, which returns additional information. | 368 // Extended version of CanonicalizeHost, which returns additional information. |
366 // Use this when you need to know whether the hostname was an IP address. | 369 // Use this when you need to know whether the hostname was an IP address. |
367 // A successful return is indicated by host_info->family != BROKEN. See the | 370 // A successful return is indicated by host_info->family != BROKEN. See the |
368 // definition of CanonHostInfo above for details. | 371 // definition of CanonHostInfo above for details. |
369 void CanonicalizeHostVerbose(const char* spec, | 372 URL_EXPORT void CanonicalizeHostVerbose(const char* spec, |
370 const url_parse::Component& host, | 373 const url_parse::Component& host, |
371 CanonOutput* output, | 374 CanonOutput* output, |
372 CanonHostInfo* host_info); | 375 CanonHostInfo* host_info); |
373 void CanonicalizeHostVerbose(const char16* spec, | 376 URL_EXPORT void CanonicalizeHostVerbose(const char16* spec, |
374 const url_parse::Component& host, | 377 const url_parse::Component& host, |
375 CanonOutput* output, | 378 CanonOutput* output, |
376 CanonHostInfo* host_info); | 379 CanonHostInfo* host_info); |
377 | 380 |
378 | 381 |
379 // IP addresses. | 382 // IP addresses. |
380 // | 383 // |
381 // Tries to interpret the given host name as an IPv4 or IPv6 address. If it is | 384 // Tries to interpret the given host name as an IPv4 or IPv6 address. If it is |
382 // an IP address, it will canonicalize it as such, appending it to |output|. | 385 // an IP address, it will canonicalize it as such, appending it to |output|. |
383 // Additional status information is returned via the |*host_info| parameter. | 386 // Additional status information is returned via the |*host_info| parameter. |
384 // See the definition of CanonHostInfo above for details. | 387 // See the definition of CanonHostInfo above for details. |
385 // | 388 // |
386 // This is called AUTOMATICALLY from the host canonicalizer, which ensures that | 389 // This is called AUTOMATICALLY from the host canonicalizer, which ensures that |
387 // the input is unescaped and name-prepped, etc. It should not normally be | 390 // the input is unescaped and name-prepped, etc. It should not normally be |
388 // necessary or wise to call this directly. | 391 // necessary or wise to call this directly. |
389 void CanonicalizeIPAddress(const char* spec, | 392 URL_EXPORT void CanonicalizeIPAddress(const char* spec, |
390 const url_parse::Component& host, | 393 const url_parse::Component& host, |
391 CanonOutput* output, | 394 CanonOutput* output, |
392 CanonHostInfo* host_info); | 395 CanonHostInfo* host_info); |
393 void CanonicalizeIPAddress(const char16* spec, | 396 URL_EXPORT void CanonicalizeIPAddress(const char16* spec, |
394 const url_parse::Component& host, | 397 const url_parse::Component& host, |
395 CanonOutput* output, | 398 CanonOutput* output, |
396 CanonHostInfo* host_info); | 399 CanonHostInfo* host_info); |
397 | 400 |
398 // Port: this function will add the colon for the port if a port is present. | 401 // Port: this function will add the colon for the port if a port is present. |
399 // The caller can pass url_parse::PORT_UNSPECIFIED as the | 402 // The caller can pass url_parse::PORT_UNSPECIFIED as the |
400 // default_port_for_scheme argument if there is no default port. | 403 // default_port_for_scheme argument if there is no default port. |
401 // | 404 // |
402 // The 8-bit version requires UTF-8 encoding. | 405 // The 8-bit version requires UTF-8 encoding. |
403 bool CanonicalizePort(const char* spec, | 406 URL_EXPORT bool CanonicalizePort(const char* spec, |
404 const url_parse::Component& port, | 407 const url_parse::Component& port, |
405 int default_port_for_scheme, | 408 int default_port_for_scheme, |
406 CanonOutput* output, | 409 CanonOutput* output, |
407 url_parse::Component* out_port); | 410 url_parse::Component* out_port); |
408 bool CanonicalizePort(const char16* spec, | 411 URL_EXPORT bool CanonicalizePort(const char16* spec, |
409 const url_parse::Component& port, | 412 const url_parse::Component& port, |
410 int default_port_for_scheme, | 413 int default_port_for_scheme, |
411 CanonOutput* output, | 414 CanonOutput* output, |
412 url_parse::Component* out_port); | 415 url_parse::Component* out_port); |
413 | 416 |
414 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED | 417 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED |
415 // if the scheme is unknown. | 418 // if the scheme is unknown. |
416 int DefaultPortForScheme(const char* scheme, int scheme_len); | 419 URL_EXPORT int DefaultPortForScheme(const char* scheme, int scheme_len); |
417 | 420 |
418 // Path. If the input does not begin in a slash (including if the input is | 421 // Path. If the input does not begin in a slash (including if the input is |
419 // empty), we'll prepend a slash to the path to make it canonical. | 422 // empty), we'll prepend a slash to the path to make it canonical. |
420 // | 423 // |
421 // The 8-bit version assumes UTF-8 encoding, but does not verify the validity | 424 // The 8-bit version assumes UTF-8 encoding, but does not verify the validity |
422 // of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid | 425 // of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid |
423 // characters, etc.). Normally, URLs will come in as UTF-16, so this isn't | 426 // characters, etc.). Normally, URLs will come in as UTF-16, so this isn't |
424 // an issue. Somebody giving us an 8-bit path is responsible for generating | 427 // an issue. Somebody giving us an 8-bit path is responsible for generating |
425 // the path that the server expects (we'll escape high-bit characters), so | 428 // the path that the server expects (we'll escape high-bit characters), so |
426 // if something is invalid, it's their problem. | 429 // if something is invalid, it's their problem. |
427 bool CanonicalizePath(const char* spec, | 430 URL_EXPORT bool CanonicalizePath(const char* spec, |
428 const url_parse::Component& path, | 431 const url_parse::Component& path, |
429 CanonOutput* output, | 432 CanonOutput* output, |
430 url_parse::Component* out_path); | 433 url_parse::Component* out_path); |
431 bool CanonicalizePath(const char16* spec, | 434 URL_EXPORT bool CanonicalizePath(const char16* spec, |
432 const url_parse::Component& path, | 435 const url_parse::Component& path, |
433 CanonOutput* output, | 436 CanonOutput* output, |
434 url_parse::Component* out_path); | 437 url_parse::Component* out_path); |
435 | 438 |
436 // Canonicalizes the input as a file path. This is like CanonicalizePath except | 439 // Canonicalizes the input as a file path. This is like CanonicalizePath except |
437 // that it also handles Windows drive specs. For example, the path can begin | 440 // that it also handles Windows drive specs. For example, the path can begin |
438 // with "c|\" and it will get properly canonicalized to "C:/". | 441 // with "c|\" and it will get properly canonicalized to "C:/". |
439 // The string will be appended to |*output| and |*out_path| will be updated. | 442 // The string will be appended to |*output| and |*out_path| will be updated. |
440 // | 443 // |
441 // The 8-bit version requires UTF-8 encoding. | 444 // The 8-bit version requires UTF-8 encoding. |
442 bool FileCanonicalizePath(const char* spec, | 445 URL_EXPORT bool FileCanonicalizePath(const char* spec, |
443 const url_parse::Component& path, | 446 const url_parse::Component& path, |
444 CanonOutput* output, | 447 CanonOutput* output, |
445 url_parse::Component* out_path); | 448 url_parse::Component* out_path); |
446 bool FileCanonicalizePath(const char16* spec, | 449 URL_EXPORT bool FileCanonicalizePath(const char16* spec, |
447 const url_parse::Component& path, | 450 const url_parse::Component& path, |
448 CanonOutput* output, | 451 CanonOutput* output, |
449 url_parse::Component* out_path); | 452 url_parse::Component* out_path); |
450 | 453 |
451 // Query: Prepends the ? if needed. | 454 // Query: Prepends the ? if needed. |
452 // | 455 // |
453 // The 8-bit version requires the input to be UTF-8 encoded. Incorrectly | 456 // The 8-bit version requires the input to be UTF-8 encoded. Incorrectly |
454 // encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode | 457 // encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode |
455 // "invalid character." This function cannot fail; we always just try to do | 458 // "invalid character." This function cannot fail; we always just try to do |
456 // our best for crazy input here since web pages can set it themselves. | 459 // our best for crazy input here since web pages can set it themselves. |
457 // | 460 // |
458 // This will convert the given input into the output encoding that the given | 461 // This will convert the given input into the output encoding that the given |
459 // character set converter object provides. The converter will only be called | 462 // character set converter object provides. The converter will only be called |
460 // if necessary; for ASCII input, no conversions are necessary. | 463 // if necessary; for ASCII input, no conversions are necessary. |
461 // | 464 // |
462 // The converter can be NULL. In this case, the output encoding will be UTF-8. | 465 // The converter can be NULL. In this case, the output encoding will be UTF-8. |
463 void CanonicalizeQuery(const char* spec, | 466 URL_EXPORT void CanonicalizeQuery(const char* spec, |
464 const url_parse::Component& query, | 467 const url_parse::Component& query, |
465 CharsetConverter* converter, | 468 CharsetConverter* converter, |
466 CanonOutput* output, | 469 CanonOutput* output, |
467 url_parse::Component* out_query); | 470 url_parse::Component* out_query); |
468 void CanonicalizeQuery(const char16* spec, | 471 URL_EXPORT void CanonicalizeQuery(const char16* spec, |
469 const url_parse::Component& query, | 472 const url_parse::Component& query, |
470 CharsetConverter* converter, | 473 CharsetConverter* converter, |
471 CanonOutput* output, | 474 CanonOutput* output, |
472 url_parse::Component* out_query); | 475 url_parse::Component* out_query); |
473 | 476 |
474 // Ref: Prepends the # if needed. The output will be UTF-8 (this is the only | 477 // Ref: Prepends the # if needed. The output will be UTF-8 (this is the only |
475 // canonicalizer that does not produce ASCII output). The output is | 478 // canonicalizer that does not produce ASCII output). The output is |
476 // guaranteed to be valid UTF-8. | 479 // guaranteed to be valid UTF-8. |
477 // | 480 // |
478 // This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use | 481 // This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use |
479 // the "Unicode replacement character" for the confusing bits and copy the rest. | 482 // the "Unicode replacement character" for the confusing bits and copy the rest. |
480 void CanonicalizeRef(const char* spec, | 483 URL_EXPORT void CanonicalizeRef(const char* spec, |
481 const url_parse::Component& path, | 484 const url_parse::Component& path, |
482 CanonOutput* output, | 485 CanonOutput* output, |
483 url_parse::Component* out_path); | 486 url_parse::Component* out_path); |
484 void CanonicalizeRef(const char16* spec, | 487 URL_EXPORT void CanonicalizeRef(const char16* spec, |
485 const url_parse::Component& path, | 488 const url_parse::Component& path, |
486 CanonOutput* output, | 489 CanonOutput* output, |
487 url_parse::Component* out_path); | 490 url_parse::Component* out_path); |
488 | 491 |
489 // Full canonicalizer --------------------------------------------------------- | 492 // Full canonicalizer --------------------------------------------------------- |
490 // | 493 // |
491 // These functions replace any string contents, rather than append as above. | 494 // These functions replace any string contents, rather than append as above. |
492 // See the above piece-by-piece functions for information specific to | 495 // See the above piece-by-piece functions for information specific to |
493 // canonicalizing individual components. | 496 // canonicalizing individual components. |
494 // | 497 // |
495 // The output will be ASCII except the reference fragment, which may be UTF-8. | 498 // The output will be ASCII except the reference fragment, which may be UTF-8. |
496 // | 499 // |
497 // The 8-bit versions require UTF-8 encoding. | 500 // The 8-bit versions require UTF-8 encoding. |
498 | 501 |
499 // Use for standard URLs with authorities and paths. | 502 // Use for standard URLs with authorities and paths. |
500 bool CanonicalizeStandardURL(const char* spec, | 503 URL_EXPORT bool CanonicalizeStandardURL(const char* spec, |
501 int spec_len, | 504 int spec_len, |
502 const url_parse::Parsed& parsed, | 505 const url_parse::Parsed& parsed, |
503 CharsetConverter* query_converter, | 506 CharsetConverter* query_converter, |
504 CanonOutput* output, | 507 CanonOutput* output, |
505 url_parse::Parsed* new_parsed); | 508 url_parse::Parsed* new_parsed); |
506 bool CanonicalizeStandardURL(const char16* spec, | 509 URL_EXPORT bool CanonicalizeStandardURL(const char16* spec, |
507 int spec_len, | 510 int spec_len, |
508 const url_parse::Parsed& parsed, | 511 const url_parse::Parsed& parsed, |
509 CharsetConverter* query_converter, | 512 CharsetConverter* query_converter, |
510 CanonOutput* output, | 513 CanonOutput* output, |
511 url_parse::Parsed* new_parsed); | 514 url_parse::Parsed* new_parsed); |
512 | 515 |
513 // Use for file URLs. | 516 // Use for file URLs. |
514 bool CanonicalizeFileURL(const char* spec, | 517 URL_EXPORT bool CanonicalizeFileURL(const char* spec, |
515 int spec_len, | 518 int spec_len, |
516 const url_parse::Parsed& parsed, | 519 const url_parse::Parsed& parsed, |
517 CharsetConverter* query_converter, | 520 CharsetConverter* query_converter, |
518 CanonOutput* output, | 521 CanonOutput* output, |
519 url_parse::Parsed* new_parsed); | 522 url_parse::Parsed* new_parsed); |
520 bool CanonicalizeFileURL(const char16* spec, | 523 URL_EXPORT bool CanonicalizeFileURL(const char16* spec, |
521 int spec_len, | 524 int spec_len, |
522 const url_parse::Parsed& parsed, | 525 const url_parse::Parsed& parsed, |
523 CharsetConverter* query_converter, | 526 CharsetConverter* query_converter, |
524 CanonOutput* output, | 527 CanonOutput* output, |
525 url_parse::Parsed* new_parsed); | 528 url_parse::Parsed* new_parsed); |
526 | 529 |
527 // Use for filesystem URLs. | 530 // Use for filesystem URLs. |
528 bool CanonicalizeFileSystemURL(const char* spec, | 531 URL_EXPORT bool CanonicalizeFileSystemURL(const char* spec, |
529 int spec_len, | 532 int spec_len, |
530 const url_parse::Parsed& parsed, | 533 const url_parse::Parsed& parsed, |
531 CharsetConverter* query_converter, | 534 CharsetConverter* query_converter, |
532 CanonOutput* output, | 535 CanonOutput* output, |
533 url_parse::Parsed* new_parsed); | 536 url_parse::Parsed* new_parsed); |
534 bool CanonicalizeFileSystemURL(const char16* spec, | 537 URL_EXPORT bool CanonicalizeFileSystemURL(const char16* spec, |
535 int spec_len, | 538 int spec_len, |
536 const url_parse::Parsed& parsed, | 539 const url_parse::Parsed& parsed, |
537 CharsetConverter* query_converter, | 540 CharsetConverter* query_converter, |
538 CanonOutput* output, | 541 CanonOutput* output, |
539 url_parse::Parsed* new_parsed); | 542 url_parse::Parsed* new_parsed); |
540 | 543 |
541 // Use for path URLs such as javascript. This does not modify the path in any | 544 // Use for path URLs such as javascript. This does not modify the path in any |
542 // way, for example, by escaping it. | 545 // way, for example, by escaping it. |
543 bool CanonicalizePathURL(const char* spec, | 546 URL_EXPORT bool CanonicalizePathURL(const char* spec, |
544 int spec_len, | 547 int spec_len, |
545 const url_parse::Parsed& parsed, | 548 const url_parse::Parsed& parsed, |
546 CanonOutput* output, | 549 CanonOutput* output, |
547 url_parse::Parsed* new_parsed); | 550 url_parse::Parsed* new_parsed); |
548 bool CanonicalizePathURL(const char16* spec, | 551 URL_EXPORT bool CanonicalizePathURL(const char16* spec, |
549 int spec_len, | 552 int spec_len, |
550 const url_parse::Parsed& parsed, | 553 const url_parse::Parsed& parsed, |
551 CanonOutput* output, | 554 CanonOutput* output, |
552 url_parse::Parsed* new_parsed); | 555 url_parse::Parsed* new_parsed); |
553 | 556 |
554 // Use for mailto URLs. This "canonicalizes" the url into a path and query | 557 // Use for mailto URLs. This "canonicalizes" the url into a path and query |
555 // component. It does not attempt to merge "to" fields. It uses UTF-8 for | 558 // component. It does not attempt to merge "to" fields. It uses UTF-8 for |
556 // the query encoding if there is a query. This is because a mailto URL is | 559 // the query encoding if there is a query. This is because a mailto URL is |
557 // really intended for an external mail program, and the encoding of a page, | 560 // really intended for an external mail program, and the encoding of a page, |
558 // etc. which would influence a query encoding normally are irrelevant. | 561 // etc. which would influence a query encoding normally are irrelevant. |
559 bool CanonicalizeMailtoURL(const char* spec, | 562 URL_EXPORT bool CanonicalizeMailtoURL(const char* spec, |
560 int spec_len, | 563 int spec_len, |
561 const url_parse::Parsed& parsed, | 564 const url_parse::Parsed& parsed, |
562 CanonOutput* output, | 565 CanonOutput* output, |
563 url_parse::Parsed* new_parsed); | 566 url_parse::Parsed* new_parsed); |
564 bool CanonicalizeMailtoURL(const char16* spec, | 567 URL_EXPORT bool CanonicalizeMailtoURL(const char16* spec, |
565 int spec_len, | 568 int spec_len, |
566 const url_parse::Parsed& parsed, | 569 const url_parse::Parsed& parsed, |
567 CanonOutput* output, | 570 CanonOutput* output, |
568 url_parse::Parsed* new_parsed); | 571 url_parse::Parsed* new_parsed); |
569 | 572 |
570 // Part replacer -------------------------------------------------------------- | 573 // Part replacer -------------------------------------------------------------- |
571 | 574 |
572 // Internal structure used for storing separate strings for each component. | 575 // Internal structure used for storing separate strings for each component. |
573 // The basic canonicalization functions use this structure internally so that | 576 // The basic canonicalization functions use this structure internally so that |
574 // component replacement (different strings for different components) can be | 577 // component replacement (different strings for different components) can be |
575 // treated on the same code path as regular canonicalization (the same string | 578 // treated on the same code path as regular canonicalization (the same string |
576 // for each component). | 579 // for each component). |
577 // | 580 // |
578 // A url_parse::Parsed structure usually goes along with this. Those | 581 // A url_parse::Parsed structure usually goes along with this. Those |
(...skipping 164 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
743 // Replace component | (replacement string) (replacement component) | 746 // Replace component | (replacement string) (replacement component) |
744 // Delete component | (non-NULL) (invalid component: (0,-1)) | 747 // Delete component | (non-NULL) (invalid component: (0,-1)) |
745 // | 748 // |
746 // We use a pointer to the empty string for the source when the component | 749 // We use a pointer to the empty string for the source when the component |
747 // should be deleted. | 750 // should be deleted. |
748 URLComponentSource<CHAR> sources_; | 751 URLComponentSource<CHAR> sources_; |
749 url_parse::Parsed components_; | 752 url_parse::Parsed components_; |
750 }; | 753 }; |
751 | 754 |
752 // The base must be an 8-bit canonical URL. | 755 // The base must be an 8-bit canonical URL. |
753 bool ReplaceStandardURL(const char* base, | 756 URL_EXPORT bool ReplaceStandardURL(const char* base, |
754 const url_parse::Parsed& base_parsed, | 757 const url_parse::Parsed& base_parsed, |
755 const Replacements<char>& replacements, | 758 const Replacements<char>& replacements, |
756 CharsetConverter* query_converter, | 759 CharsetConverter* query_converter, |
757 CanonOutput* output, | 760 CanonOutput* output, |
758 url_parse::Parsed* new_parsed); | 761 url_parse::Parsed* new_parsed); |
759 bool ReplaceStandardURL(const char* base, | 762 URL_EXPORT bool ReplaceStandardURL(const char* base, |
760 const url_parse::Parsed& base_parsed, | 763 const url_parse::Parsed& base_parsed, |
761 const Replacements<char16>& replacements, | 764 const Replacements<char16>& replacements, |
762 CharsetConverter* query_converter, | 765 CharsetConverter* query_converter, |
763 CanonOutput* output, | 766 CanonOutput* output, |
764 url_parse::Parsed* new_parsed); | 767 url_parse::Parsed* new_parsed); |
765 | 768 |
766 // Filesystem URLs can only have the path, query, or ref replaced. | 769 // Filesystem URLs can only have the path, query, or ref replaced. |
767 // All other components will be ignored. | 770 // All other components will be ignored. |
768 bool ReplaceFileSystemURL(const char* base, | 771 URL_EXPORT bool ReplaceFileSystemURL(const char* base, |
769 const url_parse::Parsed& base_parsed, | 772 const url_parse::Parsed& base_parsed, |
770 const Replacements<char>& replacements, | 773 const Replacements<char>& replacements, |
771 CharsetConverter* query_converter, | 774 CharsetConverter* query_converter, |
772 CanonOutput* output, | 775 CanonOutput* output, |
773 url_parse::Parsed* new_parsed); | 776 url_parse::Parsed* new_parsed); |
774 bool ReplaceFileSystemURL(const char* base, | 777 URL_EXPORT bool ReplaceFileSystemURL(const char* base, |
775 const url_parse::Parsed& base_parsed, | 778 const url_parse::Parsed& base_parsed, |
776 const Replacements<char16>& replacements, | 779 const Replacements<char16>& replacements, |
777 CharsetConverter* query_converter, | 780 CharsetConverter* query_converter, |
778 CanonOutput* output, | 781 CanonOutput* output, |
779 url_parse::Parsed* new_parsed); | 782 url_parse::Parsed* new_parsed); |
780 | 783 |
781 // Replacing some parts of a file URL is not permitted. Everything except | 784 // Replacing some parts of a file URL is not permitted. Everything except |
782 // the host, path, query, and ref will be ignored. | 785 // the host, path, query, and ref will be ignored. |
783 bool ReplaceFileURL(const char* base, | 786 URL_EXPORT bool ReplaceFileURL(const char* base, |
784 const url_parse::Parsed& base_parsed, | 787 const url_parse::Parsed& base_parsed, |
785 const Replacements<char>& replacements, | 788 const Replacements<char>& replacements, |
786 CharsetConverter* query_converter, | 789 CharsetConverter* query_converter, |
787 CanonOutput* output, | 790 CanonOutput* output, |
788 url_parse::Parsed* new_parsed); | 791 url_parse::Parsed* new_parsed); |
789 bool ReplaceFileURL(const char* base, | 792 URL_EXPORT bool ReplaceFileURL(const char* base, |
790 const url_parse::Parsed& base_parsed, | 793 const url_parse::Parsed& base_parsed, |
791 const Replacements<char16>& replacements, | 794 const Replacements<char16>& replacements, |
792 CharsetConverter* query_converter, | 795 CharsetConverter* query_converter, |
793 CanonOutput* output, | 796 CanonOutput* output, |
794 url_parse::Parsed* new_parsed); | 797 url_parse::Parsed* new_parsed); |
795 | 798 |
796 // Path URLs can only have the scheme and path replaced. All other components | 799 // Path URLs can only have the scheme and path replaced. All other components |
797 // will be ignored. | 800 // will be ignored. |
798 bool ReplacePathURL(const char* base, | 801 URL_EXPORT bool ReplacePathURL(const char* base, |
799 const url_parse::Parsed& base_parsed, | 802 const url_parse::Parsed& base_parsed, |
800 const Replacements<char>& replacements, | 803 const Replacements<char>& replacements, |
801 CanonOutput* output, | 804 CanonOutput* output, |
802 url_parse::Parsed* new_parsed); | 805 url_parse::Parsed* new_parsed); |
803 bool ReplacePathURL(const char* base, | 806 URL_EXPORT bool ReplacePathURL(const char* base, |
804 const url_parse::Parsed& base_parsed, | 807 const url_parse::Parsed& base_parsed, |
805 const Replacements<char16>& replacements, | 808 const Replacements<char16>& replacements, |
806 CanonOutput* output, | 809 CanonOutput* output, |
807 url_parse::Parsed* new_parsed); | 810 url_parse::Parsed* new_parsed); |
808 | 811 |
809 // Mailto URLs can only have the scheme, path, and query replaced. | 812 // Mailto URLs can only have the scheme, path, and query replaced. |
810 // All other components will be ignored. | 813 // All other components will be ignored. |
811 bool ReplaceMailtoURL(const char* base, | 814 URL_EXPORT bool ReplaceMailtoURL(const char* base, |
812 const url_parse::Parsed& base_parsed, | 815 const url_parse::Parsed& base_parsed, |
813 const Replacements<char>& replacements, | 816 const Replacements<char>& replacements, |
814 CanonOutput* output, | 817 CanonOutput* output, |
815 url_parse::Parsed* new_parsed); | 818 url_parse::Parsed* new_parsed); |
816 bool ReplaceMailtoURL(const char* base, | 819 URL_EXPORT bool ReplaceMailtoURL(const char* base, |
817 const url_parse::Parsed& base_parsed, | 820 const url_parse::Parsed& base_parsed, |
818 const Replacements<char16>& replacements, | 821 const Replacements<char16>& replacements, |
819 CanonOutput* output, | 822 CanonOutput* output, |
820 url_parse::Parsed* new_parsed); | 823 url_parse::Parsed* new_parsed); |
821 | 824 |
822 // Relative URL --------------------------------------------------------------- | 825 // Relative URL --------------------------------------------------------------- |
823 | 826 |
824 // Given an input URL or URL fragment |fragment|, determines if it is a | 827 // Given an input URL or URL fragment |fragment|, determines if it is a |
825 // relative or absolute URL and places the result into |*is_relative|. If it is | 828 // relative or absolute URL and places the result into |*is_relative|. If it is |
826 // relative, the relevant portion of the URL will be placed into | 829 // relative, the relevant portion of the URL will be placed into |
827 // |*relative_component| (there may have been trimmed whitespace, for example). | 830 // |*relative_component| (there may have been trimmed whitespace, for example). |
828 // This value is passed to ResolveRelativeURL. If the input is not relative, | 831 // This value is passed to ResolveRelativeURL. If the input is not relative, |
829 // this value is UNDEFINED (it may be changed by the function). | 832 // this value is UNDEFINED (it may be changed by the function). |
830 // | 833 // |
831 // Returns true on success (we successfully determined the URL is relative or | 834 // Returns true on success (we successfully determined the URL is relative or |
832 // not). Failure means that the combination of URLs doesn't make any sense. | 835 // not). Failure means that the combination of URLs doesn't make any sense. |
833 // | 836 // |
834 // The base URL should always be canonical, therefore is ASCII. | 837 // The base URL should always be canonical, therefore is ASCII. |
835 bool IsRelativeURL(const char* base, | 838 URL_EXPORT bool IsRelativeURL(const char* base, |
836 const url_parse::Parsed& base_parsed, | 839 const url_parse::Parsed& base_parsed, |
837 const char* fragment, | 840 const char* fragment, |
838 int fragment_len, | 841 int fragment_len, |
839 bool is_base_hierarchical, | 842 bool is_base_hierarchical, |
840 bool* is_relative, | 843 bool* is_relative, |
841 url_parse::Component* relative_component); | 844 url_parse::Component* relative_component); |
842 bool IsRelativeURL(const char* base, | 845 URL_EXPORT bool IsRelativeURL(const char* base, |
843 const url_parse::Parsed& base_parsed, | 846 const url_parse::Parsed& base_parsed, |
844 const char16* fragment, | 847 const char16* fragment, |
845 int fragment_len, | 848 int fragment_len, |
846 bool is_base_hierarchical, | 849 bool is_base_hierarchical, |
847 bool* is_relative, | 850 bool* is_relative, |
848 url_parse::Component* relative_component); | 851 url_parse::Component* relative_component); |
849 | 852 |
850 // Given a canonical parsed source URL, a URL fragment known to be relative, | 853 // Given a canonical parsed source URL, a URL fragment known to be relative, |
851 // and the identified relevant portion of the relative URL (computed by | 854 // and the identified relevant portion of the relative URL (computed by |
852 // IsRelativeURL), this produces a new parsed canonical URL in |output| and | 855 // IsRelativeURL), this produces a new parsed canonical URL in |output| and |
853 // |out_parsed|. | 856 // |out_parsed|. |
854 // | 857 // |
855 // It also requires a flag indicating whether the base URL is a file: URL | 858 // It also requires a flag indicating whether the base URL is a file: URL |
856 // which triggers additional logic. | 859 // which triggers additional logic. |
857 // | 860 // |
858 // The base URL should be canonical and have a host (may be empty for file | 861 // The base URL should be canonical and have a host (may be empty for file |
859 // URLs) and a path. If it doesn't have these, we can't resolve relative | 862 // URLs) and a path. If it doesn't have these, we can't resolve relative |
860 // URLs off of it and will return the base as the output with an error flag. | 863 // URLs off of it and will return the base as the output with an error flag. |
861 // Because it is canonical, it should also be ASCII. | 864 // Because it is canonical, it should also be ASCII. |
862 // | 865 // |
863 // The query charset converter follows the same rules as CanonicalizeQuery. | 866 // The query charset converter follows the same rules as CanonicalizeQuery. |
864 // | 867 // |
865 // Returns true on success. On failure, the output will be "something | 868 // Returns true on success. On failure, the output will be "something |
866 // reasonable" that will be consistent and valid, just probably not what | 869 // reasonable" that will be consistent and valid, just probably not what |
867 // was intended by the web page author or caller. | 870 // was intended by the web page author or caller. |
868 bool ResolveRelativeURL(const char* base_url, | 871 URL_EXPORT bool ResolveRelativeURL( |
869 const url_parse::Parsed& base_parsed, | 872 const char* base_url, |
870 bool base_is_file, | 873 const url_parse::Parsed& base_parsed, |
871 const char* relative_url, | 874 bool base_is_file, |
872 const url_parse::Component& relative_component, | 875 const char* relative_url, |
873 CharsetConverter* query_converter, | 876 const url_parse::Component& relative_component, |
874 CanonOutput* output, | 877 CharsetConverter* query_converter, |
875 url_parse::Parsed* out_parsed); | 878 CanonOutput* output, |
876 bool ResolveRelativeURL(const char* base_url, | 879 url_parse::Parsed* out_parsed); |
877 const url_parse::Parsed& base_parsed, | 880 URL_EXPORT bool ResolveRelativeURL( |
878 bool base_is_file, | 881 const char* base_url, |
879 const char16* relative_url, | 882 const url_parse::Parsed& base_parsed, |
880 const url_parse::Component& relative_component, | 883 bool base_is_file, |
881 CharsetConverter* query_converter, | 884 const char16* relative_url, |
882 CanonOutput* output, | 885 const url_parse::Component& relative_component, |
883 url_parse::Parsed* out_parsed); | 886 CharsetConverter* query_converter, |
| 887 CanonOutput* output, |
| 888 url_parse::Parsed* out_parsed); |
884 | 889 |
885 } // namespace url_canon | 890 } // namespace url_canon |
886 | 891 |
887 #endif // URL_URL_CANON_H_ | 892 #endif // URL_URL_CANON_H_ |
OLD | NEW |