| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 * Copyright (C) 2005, 2007 Apple Inc. All rights reserved. | |
| 3 * | |
| 4 * Redistribution and use in source and binary forms, with or without | |
| 5 * modification, are permitted provided that the following conditions | |
| 6 * are met: | |
| 7 * | |
| 8 * 1. Redistributions of source code must retain the above copyright | |
| 9 * notice, this list of conditions and the following disclaimer. | |
| 10 * 2. Redistributions in binary form must reproduce the above copyright | |
| 11 * notice, this list of conditions and the following disclaimer in the | |
| 12 * documentation and/or other materials provided with the distribution. | |
| 13 * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of | |
| 14 * its contributors may be used to endorse or promote products derived | |
| 15 * from this software without specific prior written permission. | |
| 16 * | |
| 17 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY | |
| 18 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| 20 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY | |
| 21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
| 22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| 23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
| 24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF | |
| 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 27 */ | |
| 28 | |
| 29 #import "config.h" | |
| 30 #import "WebCoreObjCExtras.h" | |
| 31 #import "WebCoreNSStringExtras.h" | |
| 32 #import "WebCoreNSURLExtras.h" | |
| 33 #import "WebCoreSystemInterface.h" | |
| 34 #import <wtf/RetainPtr.h> | |
| 35 #import <wtf/Vector.h> | |
| 36 #import <unicode/uchar.h> | |
| 37 #import <unicode/uidna.h> | |
| 38 #import <unicode/uscript.h> | |
| 39 | |
// Needs to be big enough to hold an IDN-encoded name.
// For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
#define HOST_NAME_BUFFER_LENGTH 2048
// Initial size, in bytes, of the buffer originalURLData() allocates before asking CFURL for the exact size.
#define URL_BYTES_BUFFER_LENGTH 2048

// Callback invoked once per host name range found in a URL string; context is passed through untouched.
typedef void (* StringRangeApplierFunction)(NSString *string, NSRange range, void *context);

// pthread_once guard used by allCharactersInIDNScriptWhiteList() to load the white list a single time.
static pthread_once_t IDNScriptWhiteListFileRead = PTHREAD_ONCE_INIT;
// Bit set indexed by script code: bit (script % 32) of word (script / 32) is set for each white-listed script.
static uint32_t IDNScriptWhiteList[(USCRIPT_CODE_LIMIT + 31) / 32];
| 49 | |
| 50 namespace WebCore { | |
| 51 | |
static inline BOOL isLookalikeCharacter(int charCode)
{
    // Returns YES for code points that must not be shown unescaped because they can be
    // mistaken for something else: non-printable characters, characters ICU classifies as
    // white space, default-ignorable code points, lock-related emoji, and the entries of
    // Mozilla's blacklist (http://kb.mozillazine.org/Network.IDN.blacklist_chars) that ICU
    // can encode.

    if (!u_isprint(charCode))
        return YES;
    if (u_isUWhiteSpace(charCode))
        return YES;
    if (u_hasBinaryProperty(charCode, UCHAR_DEFAULT_IGNORABLE_CODE_POINT))
        return YES;

    BOOL isBlacklisted = NO;
    switch (charCode) {
    case 0x00ED: // LATIN SMALL LETTER I WITH ACUTE
    case 0x01C3: // LATIN LETTER RETROFLEX CLICK
    case 0x0251: // LATIN SMALL LETTER ALPHA
    case 0x0261: // LATIN SMALL LETTER SCRIPT G
    case 0x0335: // COMBINING SHORT STROKE OVERLAY
    case 0x0337: // COMBINING SHORT SOLIDUS OVERLAY
    case 0x0338: // COMBINING LONG SOLIDUS OVERLAY
    case 0x05B4: // HEBREW POINT HIRIQ
    case 0x05BC: // HEBREW POINT DAGESH OR MAPIQ
    case 0x05C3: // HEBREW PUNCTUATION SOF PASUQ
    case 0x05F4: // HEBREW PUNCTUATION GERSHAYIM
    case 0x0660: // ARABIC INDIC DIGIT ZERO
    case 0x06D4: // ARABIC FULL STOP
    case 0x06F0: // EXTENDED ARABIC INDIC DIGIT ZERO
    case 0x2027: // HYPHENATION POINT
    case 0x2039: // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    case 0x203A: // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    case 0x2044: // FRACTION SLASH
    case 0x2215: // DIVISION SLASH
    case 0x2216: // SET MINUS
    case 0x233F: // APL FUNCTIONAL SYMBOL SLASH BAR
    case 0x23AE: // INTEGRAL EXTENSION
    case 0x244A: // OCR DOUBLE BACKSLASH
    case 0x2571: // BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT
    case 0x2572: // BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT
    case 0x29F8: // BIG SOLIDUS
    case 0x29F6: // SOLIDUS WITH OVERBAR
    case 0x2AFB: // TRIPLE SOLIDUS BINARY RELATION
    case 0x2AFD: // DOUBLE SOLIDUS OPERATOR
    case 0x3008: // LEFT ANGLE BRACKET
    case 0x3014: // LEFT TORTOISE SHELL BRACKET
    case 0x3015: // RIGHT TORTOISE SHELL BRACKET
    case 0x3033: // VERTICAL KANA REPEAT MARK UPPER HALF
    case 0x3035: // VERTICAL KANA REPEAT MARK LOWER HALF
    case 0x321D: // PARENTHESIZED KOREAN CHARACTER OJEON
    case 0x321E: // PARENTHESIZED KOREAN CHARACTER O HU
    case 0x33DF: // SQUARE A OVER M
    case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
    case 0xFE15: // PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK
    case 0xFE3F: // PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET
    case 0xFE5D: // SMALL LEFT TORTOISE SHELL BRACKET
    case 0xFE5E: // SMALL RIGHT TORTOISE SHELL BRACKET
    case 0x1F50F: // LOCK WITH INK PEN
    case 0x1F510: // CLOSED LOCK WITH KEY
    case 0x1F511: // KEY
    case 0x1F512: // LOCK
    case 0x1F513: // OPEN LOCK
        isBlacklisted = YES;
        break;
    default:
        break;
    }
    return isBlacklisted;
}
| 117 | |
// Loads script names from the white list file at `filename` and sets the matching bits in
// IDNScriptWhiteList. Returns NO when filename is nil or the file cannot be opened, YES once
// the file has been read (even if it named no known script).
static BOOL readIDNScriptWhiteListFile(NSString *filename)
{
    if (!filename)
        return NO;

    FILE *file = fopen([filename fileSystemRepresentation], "r");
    if (!file)
        return NO;

    // Read a word at a time.
    // Allow comments, starting with # character to the end of the line.
    while (1) {
        // Skip a comment if present.
        if (fscanf(file, " #%*[^\n\r]%*[\n\r]") == EOF)
            break;

        // Read a script name if present. At most 32 characters are stored; the rest of an
        // overlong word is consumed and discarded by the second conversion.
        char word[33];
        int result = fscanf(file, " %32[^# \t\n\r]%*[^# \t\n\r] ", word);
        if (result == EOF)
            break;

        if (result == 1) {
            // Got a word, map to script code and put it into the array.
            int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
            if (script >= 0 && script < USCRIPT_CODE_LIMIT) {
                size_t index = script / 32;
                // Unsigned literal: shifting a signed 1 into bit 31 overflows int.
                uint32_t mask = 1U << (script % 32);
                IDNScriptWhiteList[index] |= mask;
            }
        }
    }
    fclose(file);
    return YES;
}
| 153 | |
// Populates IDNScriptWhiteList from the first IDNScriptWhiteList.txt that can be opened in a
// Library directory, falling back to the copy in the com.apple.WebKit bundle. Crashes if
// neither can be read.
static void readIDNScriptWhiteList(void)
{
    // Read white list from library.
    NSArray *libraryDirectories = NSSearchPathForDirectoriesInDomains(NSLibraryDirectory, NSAllDomainsMask, YES);
    int directoryCount = [libraryDirectories count];
    int directoryIndex = 0;
    while (directoryIndex < directoryCount) {
        NSString *directory = [libraryDirectories objectAtIndex:directoryIndex];
        NSString *candidate = [directory stringByAppendingPathComponent:@"IDNScriptWhiteList.txt"];
        if (readIDNScriptWhiteListFile(candidate))
            return;
        directoryIndex++;
    }

    // Fall back on white list inside bundle.
    NSBundle *bundle = [NSBundle bundleWithIdentifier:@"com.apple.WebKit"];
    NSString *bundledPath = [bundle pathForResource:@"IDNScriptWhiteList" ofType:@"txt"];
    if (readIDNScriptWhiteListFile(bundledPath))
        return;

    CRASH();
}
| 170 | |
// Returns YES only if every code point in the UTF-16 buffer belongs to a white-listed script
// and is not a lookalike character. The white list is loaded on first use via pthread_once.
static BOOL allCharactersInIDNScriptWhiteList(const UChar *buffer, int32_t length)
{
    pthread_once(&IDNScriptWhiteListFileRead, readIDNScriptWhiteList);

    int32_t i = 0;
    while (i < length) {
        UChar32 c;
        U16_NEXT(buffer, i, length, c)
        UErrorCode error = U_ZERO_ERROR;
        UScriptCode script = uscript_getScript(c, &error);
        if (error != U_ZERO_ERROR) {
            LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
            return NO;
        }
        if (script < 0) {
            LOG_ERROR("got negative number for script code from ICU: %d", script);
            return NO;
        }
        // A script code beyond the table cannot have been white-listed.
        if (script >= USCRIPT_CODE_LIMIT)
            return NO;

        size_t index = script / 32;
        // Unsigned literal: shifting a signed 1 into bit 31 overflows int.
        uint32_t mask = 1U << (script % 32);
        if (!(IDNScriptWhiteList[index] & mask))
            return NO;

        if (isLookalikeCharacter(c))
            return NO;
    }
    return YES;
}
| 202 | |
// Returns YES if the host name ends in a top level domain with known presentation rules and
// the label directly below that TLD satisfies them. Returns NO for every other TLD and for
// an empty buffer.
static BOOL allCharactersAllowedByTLDRules(const UChar* buffer, int32_t length)
{
    // Nothing to inspect; also keeps buffer[length - 1] below in bounds.
    if (length <= 0)
        return NO;

    // Skip trailing dot for root domain.
    if (buffer[length - 1] == '.')
        length--;

    if (length > 3 && buffer[length - 3] == '.'
        && buffer[length - 2] == 0x0440 // CYRILLIC SMALL LETTER ER
        && buffer[length - 1] == 0x0444) // CYRILLIC SMALL LETTER EF
    {
        // Rules defined by <http://www.cctld.ru/ru/docs/rulesrf.php>. This code only checks requirements that matter for presentation purposes.
        // Walk backwards from the character before ".рф" down to and including index 0, so
        // the first character of the host name is validated too.
        for (int32_t i = length - 4; i >= 0; --i) {
            UChar ch = buffer[i];

            // Only modern Russian letters, digits and dashes are allowed.
            if ((ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || (ch >= '0' && ch <= '9') || ch == '-')
                continue;

            // Only check top level domain. Lower level registrars may have different rules.
            if (ch == '.')
                break;

            return NO;
        }
        return YES;
    }

    // Not a known top level domain with special rules.
    return NO;
}
| 233 | |
// Converts the host name in `range` of `string` to ASCII (encode == YES) or to Unicode
// (encode == NO) using ICU's IDNA functions.
// A nil result means no mapping is necessary (or the mapping was refused or failed).
// With makeString == NO a non-nil result only signals that mapping is necessary.
// With makeString == YES a non-nil result is the mapped string.
static NSString *mapHostNameWithRange(NSString *string, NSRange range, BOOL encode, BOOL makeString)
{
    // Host names that do not fit the fixed buffers, and empty strings, are left alone.
    if (range.length > HOST_NAME_BUFFER_LENGTH || ![string length])
        return nil;

    UChar sourceBuffer[HOST_NAME_BUFFER_LENGTH];
    UChar destinationBuffer[HOST_NAME_BUFFER_LENGTH];

    // When encoding, work on the percent-unescaped host name if it contains any escapes.
    if (encode) {
        NSRange percentRange = [string rangeOfString:@"%" options:NSLiteralSearch range:range];
        if (percentRange.location != NSNotFound) {
            NSString *escapedHost = [string substringWithRange:range];
            NSString *unescapedHost = WebCoreCFAutorelease(CFURLCreateStringByReplacingPercentEscapes(NULL, (CFStringRef)escapedHost, CFSTR("")));
            if (unescapedHost) {
                string = unescapedHost;
                range = NSMakeRange(0, [string length]);
            }
        }
    }

    int length = range.length;
    [string getCharacters:sourceBuffer range:range];

    UErrorCode error = U_ZERO_ERROR;
    int32_t numCharactersConverted;
    if (encode)
        numCharactersConverted = uidna_IDNToASCII(sourceBuffer, length, destinationBuffer, HOST_NAME_BUFFER_LENGTH, UIDNA_ALLOW_UNASSIGNED, NULL, &error);
    else
        numCharactersConverted = uidna_IDNToUnicode(sourceBuffer, length, destinationBuffer, HOST_NAME_BUFFER_LENGTH, UIDNA_ALLOW_UNASSIGNED, NULL, &error);
    if (error != U_ZERO_ERROR)
        return nil;

    // Identical output means there is nothing to map.
    BOOL unchanged = numCharactersConverted == length && !memcmp(sourceBuffer, destinationBuffer, length * sizeof(UChar));
    if (unchanged)
        return nil;

    // Only decode to Unicode when the result passes the script white list or the TLD rules.
    if (!encode) {
        BOOL safeToShow = allCharactersInIDNScriptWhiteList(destinationBuffer, numCharactersConverted)
            || allCharactersAllowedByTLDRules(destinationBuffer, numCharactersConverted);
        if (!safeToShow)
            return nil;
    }

    if (!makeString)
        return string;
    return (NSString *)[NSString stringWithCharacters:destinationBuffer length:numCharactersConverted];
}
| 273 | |
// YES when decoding the host name in `range` would produce a different, displayable string.
BOOL hostNameNeedsDecodingWithRange(NSString *string, NSRange range)
{
    NSString *marker = mapHostNameWithRange(string, range, NO, NO);
    return marker ? YES : NO;
}
| 278 | |
// YES when encoding the host name in `range` to ASCII would produce a different string.
BOOL hostNameNeedsEncodingWithRange(NSString *string, NSRange range)
{
    NSString *marker = mapHostNameWithRange(string, range, YES, NO);
    return marker ? YES : NO;
}
| 283 | |
// Returns the decoded (Unicode) form of the host name in `range`, or nil if no mapping applies.
NSString *decodeHostNameWithRange(NSString *string, NSRange range)
{
    NSString *decoded = mapHostNameWithRange(string, range, NO, YES);
    return decoded;
}
| 288 | |
// Returns the encoded (ASCII) form of the host name in `range`, or nil if no mapping applies.
NSString *encodeHostNameWithRange(NSString *string, NSRange range)
{
    NSString *encoded = mapHostNameWithRange(string, range, YES, YES);
    return encoded;
}
| 293 | |
// Decodes a whole string as a host name; returns the input itself when no mapping applies.
NSString *decodeHostName(NSString *string)
{
    NSRange wholeString = NSMakeRange(0, [string length]);
    NSString *decoded = mapHostNameWithRange(string, wholeString, NO, YES);
    if (decoded)
        return decoded;
    return string;
}
| 299 | |
// Encodes a whole string as a host name; returns the input itself when no mapping applies.
NSString *encodeHostName(NSString *string)
{
    NSRange wholeString = NSMakeRange(0, [string length]);
    NSString *encoded = mapHostNameWithRange(string, wholeString, YES, YES);
    if (encoded)
        return encoded;
    return string;
}
| 305 | |
// Appends `range` (boxed in an NSValue) to the array that `context` points at when the host
// name in that range needs mapping. `context` is an NSMutableArray **; the array is created
// on first use and is owned by whoever reads it back.
static void collectRangesThatNeedMapping(NSString *string, NSRange range, void *context, BOOL encode)
{
    BOOL needsMapping;
    if (encode)
        needsMapping = hostNameNeedsEncodingWithRange(string, range);
    else
        needsMapping = hostNameNeedsDecodingWithRange(string, range);

    if (needsMapping) {
        NSMutableArray **rangesSlot = (NSMutableArray **)context;
        if (*rangesSlot == nil)
            *rangesSlot = [[NSMutableArray alloc] init];

        [*rangesSlot addObject:[NSValue valueWithRange:range]];
    }
}
| 318 | |
// StringRangeApplierFunction adapter: collects ranges whose host name needs encoding.
static void collectRangesThatNeedEncoding(NSString *string, NSRange range, void *context)
{
    collectRangesThatNeedMapping(string, range, context, YES);
}
| 323 | |
// StringRangeApplierFunction adapter: collects ranges whose host name needs decoding.
static void collectRangesThatNeedDecoding(NSString *string, NSRange range, void *context)
{
    collectRangesThatNeedMapping(string, range, context, NO);
}
| 328 | |
// Adds a CF retain to charset and returns the same pointer, so the call can wrap the
// initializer of a function-local static character set.
static inline NSCharacterSet *retain(NSCharacterSet *charset)
{
    CFRetain(charset);
    return charset;
}
| 334 | |
// Calls f(string, range, context) once for every host name range found in a mailto: URL string.
static void applyHostNameFunctionToMailToURLString(NSString *string, StringRangeApplierFunction f, void *context)
{
    // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' character.
    // Skip quoted strings so that characters in them don't confuse us.
    // When we find a '?' character, we are past the part of the URL that contains host names.

    static NSCharacterSet *hostNameOrStringStartCharacters = retain([NSCharacterSet characterSetWithCharactersInString:@"\"@?"]);
    static NSCharacterSet *hostNameEndCharacters = retain([NSCharacterSet characterSetWithCharactersInString:@">,?"]);
    static NSCharacterSet *quotedStringCharacters = retain([NSCharacterSet characterSetWithCharactersInString:@"\"\\"]);

    unsigned stringLength = [string length];
    // The part of the string that has not been scanned yet; it always runs to the end.
    NSRange remaining = NSMakeRange(0, stringLength);

    while (1) {
        // Find start of host name or of quoted string.
        NSRange hostNameOrStringStart = [string rangeOfCharacterFromSet:hostNameOrStringStartCharacters options:0 range:remaining];
        if (hostNameOrStringStart.location == NSNotFound)
            return;

        unichar c = [string characterAtIndex:hostNameOrStringStart.location];
        // Continue scanning just past the character that was found.
        remaining.location = NSMaxRange(hostNameOrStringStart);
        remaining.length = stringLength - remaining.location;

        if (c == '?')
            return;

        if (c == '@') {
            // Find end of host name.
            unsigned hostNameStart = remaining.location;
            NSRange hostNameEnd = [string rangeOfCharacterFromSet:hostNameEndCharacters options:0 range:remaining];
            BOOL done;
            if (hostNameEnd.location == NSNotFound) {
                // No terminator: the host name runs to the end of the string.
                hostNameEnd.location = stringLength;
                done = YES;
            } else {
                // Resume at the terminator itself, so a '?' terminator ends the outer loop on the next pass.
                remaining.location = hostNameEnd.location;
                remaining.length = stringLength - remaining.location;
                done = NO;
            }

            // Process host name range.
            f(string, NSMakeRange(hostNameStart, hostNameEnd.location - hostNameStart), context);

            if (done)
                return;
        } else {
            // Skip quoted string.
            ASSERT(c == '"');
            while (1) {
                NSRange escapedCharacterOrStringEnd = [string rangeOfCharacterFromSet:quotedStringCharacters options:0 range:remaining];
                if (escapedCharacterOrStringEnd.location == NSNotFound)
                    return;

                c = [string characterAtIndex:escapedCharacterOrStringEnd.location];
                remaining.location = NSMaxRange(escapedCharacterOrStringEnd);
                remaining.length = stringLength - remaining.location;

                // If we are at the end of the quoted string, then break from the string loop back to the host name loop.
                if (c == '"')
                    break;

                // Skip escaped character.
                ASSERT(c == '\\');
                // A backslash at the very end of the string has nothing to escape.
                if (!remaining.length)
                    return;

                remaining.location += 1;
                remaining.length -= 1;
            }
        }
    }
}
| 407 | |
// Locates the host name(s) in a URL string and calls f(string, range, context) for each.
// mailto: URLs are delegated to applyHostNameFunctionToMailToURLString; for hierarchical
// URLs f is called once with the host range, and not at all when no valid "scheme://" prefix
// is found.
static void applyHostNameFunctionToURLString(NSString *string, StringRangeApplierFunction f, void *context)
{
    // No real URL parser is used here: this has to run before any %-escaping is applied, and
    // this is the only code we have that parses mailto URLs anyway.

    // Maybe we should implement this using a character buffer instead?

    if (hasCaseInsensitivePrefix(string, @"mailto:")) {
        applyHostNameFunctionToMailToURLString(string, f, context);
        return;
    }

    // In a hierarchical URL the host name follows a "://" sequence preceded only by scheme
    // characters, and ends at the end of the string or at a ":", "/", "?" or "#".
    // If the authority contains a "@", the host is just the part after it.
    NSRange schemeSeparator = [string rangeOfString:@"://"];
    if (schemeSeparator.location == NSNotFound)
        return;

    // Check that all characters before the :// are valid scheme characters.
    static NSCharacterSet *nonSchemeCharacters = retain([[NSCharacterSet characterSetWithCharactersInString:@"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-."] invertedSet]);
    NSRange schemeRange = NSMakeRange(0, schemeSeparator.location);
    NSRange invalidSchemeCharacter = [string rangeOfCharacterFromSet:nonSchemeCharacters options:0 range:schemeRange];
    if (invalidSchemeCharacter.location != NSNotFound)
        return;

    unsigned stringLength = [string length];

    static NSCharacterSet *hostTerminators = retain([NSCharacterSet characterSetWithCharactersInString:@":/?#"]);

    // The authority starts right after the separator.
    unsigned authorityStart = NSMaxRange(schemeSeparator);

    // The host name ends at the first terminating character, or at the end of the string.
    NSRange afterSeparator = NSMakeRange(authorityStart, stringLength - authorityStart);
    NSRange terminator = [string rangeOfCharacterFromSet:hostTerminators options:0 range:afterSeparator];
    unsigned hostNameEnd = stringLength;
    if (terminator.location != NSNotFound)
        hostNameEnd = terminator.location;

    // A "@" inside the authority marks the end of the user info and the start of the host name.
    NSRange authorityRange = NSMakeRange(authorityStart, hostNameEnd - authorityStart);
    NSRange userInfoTerminator = [string rangeOfString:@"@" options:0 range:authorityRange];
    unsigned hostNameStart = authorityStart;
    if (userInfoTerminator.location != NSNotFound)
        hostNameStart = NSMaxRange(userInfoTerminator);

    f(string, NSMakeRange(hostNameStart, hostNameEnd - hostNameStart), context);
}
| 451 | |
// Returns `string` with every host name that needs it encoded (encode == YES) or decoded
// (encode == NO). Returns the input object itself when nothing has to change.
static NSString *mapHostNames(NSString *string, BOOL encode)
{
    // The common case is a single host name that does not need mapping, so bail out early.
    // A pure-ASCII string never needs encoding.
    if (encode && [string canBeConvertedToEncoding:NSASCIIStringEncoding])
        return string;

    // Gather the ranges that actually need mapping; the array stays nil when there are none.
    NSMutableArray *rangesToMap = nil;
    StringRangeApplierFunction collector;
    if (encode)
        collector = collectRangesThatNeedEncoding;
    else
        collector = collectRangesThatNeedDecoding;
    applyHostNameFunctionToURLString(string, collector, &rangesToMap);
    if (!rangesToMap)
        return string;

    // Replace from the last range to the first so earlier ranges stay valid in the copy.
    NSMutableString *mapped = [string mutableCopy];
    for (unsigned remainingCount = [rangesToMap count]; remainingCount > 0; --remainingCount) {
        NSRange hostNameRange = [[rangesToMap objectAtIndex:remainingCount - 1] rangeValue];
        NSString *mappedHostName;
        if (encode)
            mappedHostName = encodeHostNameWithRange(string, hostNameRange);
        else
            mappedHostName = decodeHostNameWithRange(string, hostNameRange);
        [mapped replaceCharactersInRange:hostNameRange withString:mappedHostName];
    }
    [rangesToMap release];
    return [mapped autorelease];
}
| 477 | |
// YES for the ASCII characters 0-9, A-F and a-f.
static BOOL isHexDigit(char c)
{
    if ('0' <= c && c <= '9')
        return YES;
    if ('A' <= c && c <= 'F')
        return YES;
    if ('a' <= c && c <= 'f')
        return YES;
    return NO;
}
| 482 | |
// Returns the uppercase hexadecimal digit for a value in 0..15.
// Any value outside that range yields '0'.
static char hexDigit(int i)
{
    // 15 is the largest value a single hex digit can represent; 16 used to slip through as 'G'.
    if (i < 0 || i > 15)
        return '0';

    return (i >= 10) ? i - 10 + 'A' : i + '0';
}
| 490 | |
// Returns the numeric value (0..15) of a hexadecimal digit character.
// Logs an error and returns 0 for any other character.
static int hexDigitValue(char c)
{
    if ('a' <= c && c <= 'f')
        return 10 + (c - 'a');

    if ('A' <= c && c <= 'F')
        return 10 + (c - 'A');

    if ('0' <= c && c <= '9')
        return c - '0';

    LOG_ERROR("illegal hex digit");
    return 0;
}
| 505 | |
// Returns an autoreleased mutable copy of `string` with leading and trailing whitespace
// removed by CFStringTrimWhitespace.
static NSString *stringByTrimmingWhitespace(NSString *string)
{
    NSMutableString *result = [string mutableCopy];
    CFStringTrimWhitespace((CFMutableStringRef)result);
    return [result autorelease];
}
| 512 | |
// Returns a URL built from the bytes of `URL` that precede the given component, minus the one
// character directly before the component. Returns nil for a nil URL, and returns `URL`
// unchanged when the component is absent, when nothing precedes it, or when no truncated URL
// can be created.
NSURL *URLByTruncatingOneCharacterBeforeComponent(NSURL *URL, CFIndex component)
{
    if (!URL)
        return nil;

    CFRange fragRg = CFURLGetByteRangeForComponent((CFURLRef)URL, static_cast<CFURLComponentType>(component), NULL);
    if (fragRg.location == kCFNotFound)
        return URL;

    // A component at the very start has no preceding character to drop; location - 1 would
    // otherwise become a negative byte count.
    if (fragRg.location < 1)
        return URL;

    UInt8 buffer[2048];
    UInt8 *urlBytes = buffer;
    CFIndex numBytes = CFURLGetBytes((CFURLRef)URL, buffer, sizeof(buffer));
    if (numBytes == -1) {
        // The URL does not fit the stack buffer; ask for the exact size and use the heap.
        numBytes = CFURLGetBytes((CFURLRef)URL, NULL, 0);
        urlBytes = static_cast<UInt8*>(malloc(numBytes));
        if (!urlBytes)
            return URL;
        CFURLGetBytes((CFURLRef)URL, urlBytes, numBytes);
    }

    // Try UTF-8 first and fall back to ISO Latin 1 when the bytes are not valid UTF-8.
    NSURL *result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, fragRg.location - 1, kCFStringEncodingUTF8, NULL));
    if (!result)
        result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, fragRg.location - 1, kCFStringEncodingISOLatin1, NULL));

    if (urlBytes != buffer)
        free(urlBytes);
    return result ? [result autorelease] : URL;
}
| 539 | |
// Truncates `URL` one character before its resource specifier component, if it has one.
static NSURL *URLByRemovingResourceSpecifier(NSURL *URL)
{
    NSURL *truncated = URLByTruncatingOneCharacterBeforeComponent(URL, kCFURLComponentResourceSpecifier);
    return truncated;
}
| 544 | |
// Builds an NSURL from raw URL bytes, resolved against `baseURL`.
// Returns nil for nil data and for a leading-'/' path without a base URL; empty data yields
// the URL for the empty string.
NSURL *URLWithData(NSData *data, NSURL *baseURL)
{
    if (!data)
        return nil;

    size_t length = [data length];
    if (!length)
        return [NSURL URLWithString:@""];

    // work around <rdar://4470771>: CFURLCreateAbsoluteURLWithBytes(.., TRUE) doesn't remove non-path components.
    baseURL = URLByRemovingResourceSpecifier(baseURL);

    const UInt8 *bytes = static_cast<const UInt8*>([data bytes]);

    // CFURLCreateAbsoluteURLWithBytes would complain to console if we passed a path to it.
    if (bytes[0] == '/' && !baseURL)
        return nil;

    // NOTE: UTF-8 is tried first because that encoding is used when strings are computed for
    // URL components (e.g. calls to NSURL -path). The UTF-8 attempt does not tolerate illegal
    // sequences, which may be a malformed string or bytes in another encoding such as
    // shift-jis, so ISO Latin 1 is the fallback.
    NSURL *result = WebCoreCFAutorelease(CFURLCreateAbsoluteURLWithBytes(NULL, bytes, length, kCFStringEncodingUTF8, (CFURLRef)baseURL, YES));
    if (result)
        return result;

    result = WebCoreCFAutorelease(CFURLCreateAbsoluteURLWithBytes(NULL, bytes, length, kCFStringEncodingISOLatin1, (CFURLRef)baseURL, YES));
    return result;
}
| 574 | |
// Turns text typed by the user into an NSURL relative to `URL`: trims whitespace, encodes
// host names, converts to UTF-8 and %-escapes every byte that is <= 0x20 or >= 0x7f.
// Returns nil for a nil string and the empty-string URL when nothing is left after trimming.
NSURL *URLWithUserTypedString(NSString *string, NSURL *URL)
{
    if (!string)
        return nil;

    string = mapHostNames(stringByTrimmingWhitespace(string), YES);

    NSData *userTypedData = [string dataUsingEncoding:NSUTF8StringEncoding];
    ASSERT(userTypedData);

    const UInt8* source = static_cast<const UInt8 *>([userTypedData bytes]);
    int sourceLength = [userTypedData length];
    if (sourceLength == 0)
        return [NSURL URLWithString:@""];

    // Three output bytes per input byte is enough to %-escape every character.
    char* escaped = static_cast<char *>(malloc(sourceLength * 3));
    char* cursor = escaped;
    const UInt8* sourceEnd = source + sourceLength;
    for (const UInt8* in = source; in != sourceEnd; ++in) {
        UInt8 byte = *in;
        BOOL needsEscape = byte <= 0x20 || byte >= 0x7f;
        if (!needsEscape) {
            *cursor++ = byte;
            continue;
        }
        *cursor++ = '%';
        *cursor++ = hexDigit(byte >> 4);
        *cursor++ = hexDigit(byte & 0xf);
    }
    int escapedLength = cursor - escaped;

    // The NSData adopts the malloc'ed buffer.
    NSData *data = [NSData dataWithBytesNoCopy:escaped length:escapedLength];
    return URLWithData(data, URL);
}
| 609 | |
// YES when the query component's range including separators was located and is exactly one
// byte long, i.e. the query consists of nothing but its "?" separator.
static BOOL hasQuestionMarkOnlyQueryString(NSURL *URL)
{
    CFRange queryWithSeparators;
    CFURLGetByteRangeForComponent((CFURLRef)URL, kCFURLComponentQuery, &queryWithSeparators);

    if (queryWithSeparators.location == kCFNotFound)
        return NO;
    return queryWithSeparators.length == 1 ? YES : NO;
}
| 619 | |
| 620 #define completeURL (CFURLComponentType)-1 | |
| 621 | |
// Returns the bytes of one component of `URL` (or of the whole URL when componentType is
// completeURL) with every byte <= 0x20 or >= 0x7f %-escaped. Query components get a leading
// "?" when they are non-empty or consist of the question mark alone. Returns nil when the
// component is not present, or when a buffer for an oversized URL cannot be allocated.
NSData *dataForURLComponentType(NSURL *URL, CFIndex componentType)
{
    const BOOL wantsCompleteURL = componentType == completeURL;

    // Look the component up before fetching any bytes, so the "not found" exit cannot leak
    // the heap buffer that oversized URLs need.
    CFRange range = CFRangeMake(0, 0);
    if (!wantsCompleteURL) {
        range = CFURLGetByteRangeForComponent((CFURLRef)URL, static_cast<CFURLComponentType>(componentType), NULL);
        if (range.location == kCFNotFound)
            return nil;
    }

    UInt8 stackBuffer[2048];
    UInt8 *allBytesBuffer = stackBuffer;

    CFIndex bytesFilled = CFURLGetBytes((CFURLRef)URL, allBytesBuffer, sizeof(stackBuffer));
    if (bytesFilled == -1) {
        // The URL does not fit the stack buffer; ask for the exact size and use the heap.
        CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)URL, NULL, 0);
        allBytesBuffer = static_cast<UInt8 *>(malloc(bytesToAllocate));
        if (!allBytesBuffer)
            return nil;
        bytesFilled = CFURLGetBytes((CFURLRef)URL, allBytesBuffer, bytesToAllocate);
    }

    if (wantsCompleteURL)
        range = CFRangeMake(0, bytesFilled);

    const unsigned char *bytes = allBytesBuffer + range.location;
    NSMutableData *resultData = [NSMutableData data];
    // NOTE: add a leading '?' to query strings of non-zero length.
    // NOTE: retain question-mark only query strings.
    if (componentType == kCFURLComponentQuery) {
        if (range.length > 0 || hasQuestionMarkOnlyQueryString(URL))
            [resultData appendBytes:"?" length:1];
    }
    for (CFIndex i = 0; i < range.length; i++) {
        unsigned char c = bytes[i];
        if (c <= 0x20 || c >= 0x7f) {
            char escaped[3];
            escaped[0] = '%';
            escaped[1] = hexDigit(c >> 4);
            escaped[2] = hexDigit(c & 0xf);
            [resultData appendBytes:escaped length:3];
        } else {
            char b[1];
            b[0] = c;
            [resultData appendBytes:b length:1];
        }
    }

    if (allBytesBuffer != stackBuffer)
        free(allBytesBuffer);

    return resultData;
}
| 676 | |
// Returns a copy of `URL` with the given component and the single character following it
// removed (e.g. user info plus its "@"). Returns `URL` unchanged when the component is absent
// or when no shortened URL can be created.
static NSURL *URLByRemovingComponentAndSubsequentCharacter(NSURL *URL, CFURLComponentType component)
{
    CFRange range = CFURLGetByteRangeForComponent((CFURLRef)URL, component, 0);
    if (range.location == kCFNotFound)
        return URL;

    // Remove one subsequent character.
    range.length++;

    UInt8 buffer[2048];
    UInt8 *urlBytes = buffer;
    CFIndex numBytes = CFURLGetBytes((CFURLRef)URL, buffer, sizeof(buffer));
    if (numBytes == -1) {
        // The URL does not fit the stack buffer; ask for the exact size and use the heap.
        numBytes = CFURLGetBytes((CFURLRef)URL, NULL, 0);
        urlBytes = static_cast<UInt8*>(malloc(numBytes));
        if (!urlBytes)
            return URL;
        CFURLGetBytes((CFURLRef)URL, urlBytes, numBytes);
    }

    NSURL *result = nil;
    if (numBytes >= range.location) {
        // Clamp the removed span to the bytes that actually exist.
        if (numBytes < range.location + range.length)
            range.length = numBytes - range.location;

        // Slide the tail over the removed span. Only the bytes after the span are moved;
        // anything more would read and write past the end of the buffer.
        CFIndex tailStart = range.location + range.length;
        memmove(urlBytes + range.location, urlBytes + tailStart, numBytes - tailStart);

        // Try UTF-8 first and fall back to ISO Latin 1 when the bytes are not valid UTF-8.
        result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, numBytes - range.length, kCFStringEncodingUTF8, NULL));
        if (!result)
            result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, numBytes - range.length, kCFStringEncodingISOLatin1, NULL));
    }

    // Single exit, so a heap buffer is released on every path.
    if (urlBytes != buffer)
        free(urlBytes);

    return result ? [result autorelease] : URL;
}
| 712 | |
// Returns `URL` without its user info component and the character that follows it.
NSURL *URLByRemovingUserInfo(NSURL *URL)
{
    NSURL *withoutUserInfo = URLByRemovingComponentAndSubsequentCharacter(URL, kCFURLComponentUserInfo);
    return withoutUserInfo;
}
| 717 | |
// Returns the bytes of `URL` as NSData. When the URL has a base URL, the bytes are first
// resolved against it with URLWithData and the function recurses on the resulting URL.
NSData *originalURLData(NSURL *URL)
{
    UInt8 *bytes = (UInt8 *)malloc(URL_BYTES_BUFFER_LENGTH);
    CFIndex byteCount = CFURLGetBytes((CFURLRef)URL, bytes, URL_BYTES_BUFFER_LENGTH);
    if (byteCount == -1) {
        // The initial buffer was too small; grow it to the exact size and fetch again.
        CFIndex neededBytes = CFURLGetBytes((CFURLRef)URL, NULL, 0);
        bytes = (UInt8 *)realloc(bytes, neededBytes);
        byteCount = CFURLGetBytes((CFURLRef)URL, bytes, neededBytes);
        ASSERT(byteCount == neededBytes);
    }

    // The NSData adopts the malloc'ed buffer and frees it when done.
    NSData *data = [NSData dataWithBytesNoCopy:bytes length:byteCount freeWhenDone:YES];

    NSURL *baseURL = (NSURL *)CFURLGetBaseURL((CFURLRef)URL);
    if (!baseURL)
        return data;

    return originalURLData(URLWithData(data, baseURL));
}
| 737 | |
// Builds a new CFString from |string| in which every code point that
// isLookalikeCharacter() accepts is replaced by the %-escaped bytes of its UTF-8
// encoding; every other code point is copied through as UTF-16.
// The result comes from CFStringCreateWithCharacters, so the caller owns it.
static CFStringRef createStringWithEscapedUnsafeCharacters(CFStringRef string)
{
    CFIndex sourceLength = CFStringGetLength(string);
    Vector<UChar, 2048> source(sourceLength);
    CFStringGetCharacters(string, CFRangeMake(0, sourceLength), source.data());

    Vector<UChar, 2048> escaped;

    // U16_NEXT advances |position| past one whole code point per iteration.
    for (CFIndex position = 0; position < sourceLength; ) {
        UChar32 codePoint;
        U16_NEXT(source, position, sourceLength, codePoint)

        if (!isLookalikeCharacter(codePoint)) {
            // Not a lookalike: re-encode as UTF-16 and copy it over untouched.
            UChar units[2];
            CFIndex unitCount = 0;
            UBool appendFailed = false;
            U16_APPEND(units, unitCount, 2, codePoint, appendFailed)
            ASSERT(!appendFailed);
            for (CFIndex k = 0; k < unitCount; ++k)
                escaped.append(units[k]);
            continue;
        }

        // Lookalike: encode as UTF-8 and emit "%XX" for each byte.
        uint8_t utf8Bytes[4];
        CFIndex byteCount = 0;
        UBool appendFailed = false;
        U8_APPEND(utf8Bytes, byteCount, 4, codePoint, appendFailed)
        ASSERT(!appendFailed);

        for (CFIndex k = 0; k < byteCount; ++k) {
            escaped.append('%');
            escaped.append(hexDigit(utf8Bytes[k] >> 4));
            escaped.append(hexDigit(utf8Bytes[k] & 0xf));
        }
    }

    return CFStringCreateWithCharacters(NULL, escaped.data(), escaped.size());
}
| 776 | |
// Returns a string form of URL meant for display.
//
// Starting from the bytes returned by originalURLData(), %-escape sequences that
// encode a byte above 0x7f are unescaped. If the result is not valid UTF-8, every
// byte above 0x7f is %-escaped again instead. The string is then passed through
// mapHostNames(), precomposed with canonical mapping, and finally run through
// createStringWithEscapedUnsafeCharacters(). The returned string is autoreleased
// via WebCoreCFAutorelease.
NSString *userVisibleString(NSURL *URL)
{
    NSData *data = originalURLData(URL);
    const unsigned char *before = static_cast<const unsigned char*>([data bytes]);
    int length = [data length];

    bool needsHostNameDecoding = false;

    int bufferLength = (length * 3) + 1;
    char *after = static_cast<char *>(malloc(bufferLength)); // large enough to %-escape every character
    char *q = after;
    for (int i = 0; i < length; i++) {
        unsigned char c = before[i];
        // unescape escape sequences that indicate bytes greater than 0x7f
        if (c == '%' && (i + 1 < length && isHexDigit(before[i + 1])) && i + 2 < length && isHexDigit(before[i + 2])) {
            unsigned char u = (hexDigitValue(before[i + 1]) << 4) | hexDigitValue(before[i + 2]);
            if (u > 0x7f) {
                // unescape
                *q++ = u;
            } else {
                // do not unescape
                *q++ = before[i];
                *q++ = before[i + 1];
                *q++ = before[i + 2];
            }
            i += 2;
        } else {
            *q++ = c;

            // Check for "xn--" in an efficient, non-case-sensitive, way.
            // The look-behind reads the OUTPUT buffer, which can be shorter than the
            // input consumed so far (each unescaped %XX turns 3 bytes into 1). The
            // guard must therefore be on the number of bytes written, not on i;
            // otherwise input such as "%C3-" reads before the start of |after|.
            if (c == '-' && q - after >= 4 && !needsHostNameDecoding && (q[-4] | 0x20) == 'x' && (q[-3] | 0x20) == 'n' && q[-2] == '-')
                needsHostNameDecoding = true;
        }
    }
    *q = '\0';

    // Check string to see if it can be converted to display using UTF-8
    NSString *result = [NSString stringWithUTF8String:after];
    if (!result) {
        // Could not convert to UTF-8.
        // Convert characters greater than 0x7f to escape sequences.
        // Shift current string to the end of the buffer
        // then we will copy back bytes to the start of the buffer
        // as we convert.
        int afterLength = q - after;
        char *source = after + bufferLength - afterLength - 1;
        memmove(source, after, afterLength + 1); // copies trailing '\0'
        char *destination = after;
        while (*source) {
            unsigned char c = *source;
            if (c > 0x7f) {
                *destination++ = '%';
                *destination++ = hexDigit(c >> 4);
                *destination++ = hexDigit(c & 0xf);
            } else
                *destination++ = *source;
            source++;
        }
        *destination = '\0';
        result = [NSString stringWithUTF8String:after];
    }

    free(after);

    // NOTE(review): the second argument is true when no "xn--" was seen above;
    // confirm its exact meaning against mapHostNames.
    result = mapHostNames(result, !needsHostNameDecoding);
    result = [result precomposedStringWithCanonicalMapping];
    return WebCoreCFAutorelease(createStringWithEscapedUnsafeCharacters((CFStringRef)result));
}
| 846 | |
// Scans the UTF-8 bytes of |string| and returns NO as soon as it finds any of:
//   - a byte <= 0x20 or equal to 0x7f (control characters, space, delete),
//   - a %XX escape sequence whose value is greater than 0x7f,
//   - the sequence "xn--" (the "xn" part matched case-insensitively).
// Returns YES when none of these occur.
BOOL isUserVisibleURL(NSString *string)
{
    // Use a stack buffer when the UTF-8 form fits; otherwise fall back to -UTF8String.
    char stackBuffer[1024];
    const char *bytes;
    if (CFStringGetCString((CFStringRef)string, stackBuffer, 1023, kCFStringEncodingUTF8))
        bytes = stackBuffer;
    else
        bytes = [string UTF8String];

    int byteCount = strlen(bytes);

    for (int index = 0; index < byteCount; index++) {
        unsigned char current = bytes[index];

        // control characters, space, and delete
        if (current <= 0x20 || current == 0x7f)
            return NO;

        if (current == '%' && (index + 1 < byteCount && isHexDigit(bytes[index + 1])) && index + 2 < byteCount && isHexDigit(bytes[index + 2])) {
            unsigned char decoded = (hexDigitValue(bytes[index + 1]) << 4) | hexDigitValue(bytes[index + 2]);
            if (decoded > 0x7f)
                return NO;
            // Skip the two hex digits that were just examined.
            index += 2;
            continue;
        }

        // Check for "xn--" in an efficient, non-case-sensitive, way.
        if (current == '-' && index >= 3 && (bytes[index - 3] | 0x20) == 'x' && (bytes[index - 2] | 0x20) == 'n' && bytes[index - 1] == '-')
            return NO;
    }

    return YES;
}
| 885 | |
| 886 } // namespace WebCore | |
| OLD | NEW |