OLD | NEW |
(Empty) | |
| 1 const fs = require('fs'); |
| 2 |
| 3 /* |
| 4 How to use: |
| 5 1) Get a dump of the data in CSV format and save it as 3pas.csv in the same directory as this script. |
| 6 2) Header fields in the CSV are used as keys when converting rows into JSON objects [i.e. the header row should not contain spaces or special characters]. |
| 7 3) The two important column names are: 'name_legal_product' and 'domain'. |
| 8 4) There may not be a header named 'prefix'. |
| 9 5) 'name_legal_product' will have its data cleaned up a bit, so be prepared for it to change. |
| 10 6) This script tries to de-duplicate any data, so be prepared for many entries t
o go away if it finds a shorter one. |
| 11 7) This script will output a javascript file in the product_registry's data form
at. |
| 12 */ |
| 13 |
/*
 * Settings read by the SHA-1 helpers at the bottom of this file. Adjust them
 * only if the consumer of the hashes expects a different encoding.
 */

/* Hex digit case used by binb2hex: 0 - lowercase; 1 - uppercase. */
const hexcase = 0;

/* Padding character appended by binb2b64 ("=" for strict RFC compliance). */
const b64pad = '=';

/* Bits taken from each input character: 8 - ASCII; 16 - Unicode. */
const chrsz = 8;
| 21 |
// Load the CSV dump. Its first line holds the column names.
var data = fs.readFileSync('3pas.csv', 'utf8');
var headerLine = data.split('\n', 1)[0];
// Drop the header together with its terminating newline so that parsing
// starts on the first data row. The parser is told that row is line 2, which
// keeps the line numbers in its error messages aligned with the file.
data = data.slice(headerLine.length + 1);
var headerLineOrigLength = headerLine.length;

var columnNames = Array.from(csvUnmarshaller(headerLine)).map(v => v[0]);
var lineObjs = [];

// Assemble one object per CSV row, keyed by column name.
var marshaller = csvUnmarshaller(data, 2);
var lineObj = {};
var colIndex = 0;
for (const [colData, isEnding] of marshaller) {
  // When a header name occurs more than once, the first column wins.
  if (!(columnNames[colIndex] in lineObj))
    lineObj[columnNames[colIndex]] = colData;
  colIndex++;
  if (isEnding) {
    // The row is complete: store it and start a fresh one.
    lineObjs.push(lineObj);
    lineObj = {};
    colIndex = 0;
  }
}
// A final row that is not terminated by a newline still has to be stored.
if (colIndex > 0)
  lineObjs.push(lineObj);
| 43 |
// First pass: drop rows without a domain or with an unknown status, normalise
// the domain and the product name, and keep the first row seen per domain.
var map = new Map();
for (const lineObj of lineObjs) {
  const hasDomain = lineObj.domain !== null && lineObj.domain !== undefined;
  const hasKnownStatus = lineObj.status_allowed === 'allowed' || lineObj.status_allowed === 'disallowed';
  if (!hasDomain || !hasKnownStatus)
    continue;

  // Lower-case, strip every character outside [a-z0-9_-*.], then drop a
  // leading "www." when exactly two labels follow it.
  lineObj.domain =
      lineObj.domain.trim().toLowerCase().replace(/[^a-z0-9_\-*.]/g, '').replace(/^www\.(?=[^.]+\.[^.]+$)/, '');

  lineObj.name_legal_product = lineObj.name_legal_product.trim()
                                   .replace(/\s\s/g, ' ')
                                   .replace(/[\x00-\x1F]/g, '')
                                   // NOTE(review): this pattern and the '&' one below were
                                   // written as a character replaced by itself (a no-op).
                                   // They are restored here as HTML entity decoding, which
                                   // is presumably what was intended - confirm against the
                                   // data dump.
                                   .replace(/&quot;/g, '"')
                                   // The following two lines are to keep input data from
                                   // corrupting output data.
                                   .replace(/","/g, '')
                                   .replace(/},{/g, '')
                                   .replace(/“|”/g, '"')
                                   .replace(/,$/g, '')
                                   .replace(/&amp;/g, '&')
                                   // This is how csv escapes double quotes.
                                   .replace(/""/g, '"');

  // First occurrence of a domain wins.
  if (!map.has(lineObj.domain))
    map.set(lineObj.domain, lineObj);
}
| 67 |
lineObjs = Array.from(map.values());

// Second pass: index the surviving rows as
//   base domain (last two labels) -> domain -> prefix -> row
// and rewrite each row's `domain` / `prefix` to the split-up form.
var map = new Map();
for (const lineObj of lineObjs) {
  if (!lineObj)
    continue;
  const domain = lineObj.domain.trim();
  if (!domain.length)
    continue;

  // At most one '*' is allowed; it separates the prefix from the domain.
  const prefixSuffix = domain.split('*');
  if (prefixSuffix.length > 2)
    throw new Error('We do not support multiple * in domains: ' + domain);
  let prefix = '';
  let suffixDomain = '';
  if (prefixSuffix.length === 1) {
    suffixDomain = prefixSuffix[0];
  } else {
    // A bare leading '*' is recorded as the prefix '*'.
    prefix = prefixSuffix[0] === '' ? '*' : prefixSuffix[0];
    suffixDomain = prefixSuffix[1];
  }

  const domainParts = suffixDomain.split('.');
  if (domainParts.length < 2)
    throw new Error('Invalid domain: ' + domain);
  const baseDomain = domainParts[domainParts.length - 2] + '.' + domainParts[domainParts.length - 1];
  // Remove the empty labels left by a leading '.', e.g. from "*.foo.bar".
  while (domainParts[0] === '')
    domainParts.shift();
  lineObj.domain = domainParts.join('.');
  lineObj.prefix = prefix;

  let mapOfSubdomains = map.get(baseDomain);
  if (!mapOfSubdomains) {
    mapOfSubdomains = new Map();
    map.set(baseDomain, mapOfSubdomains);
  }

  let prefixMap = mapOfSubdomains.get(lineObj.domain);
  if (!prefixMap) {
    prefixMap = new Map();
    mapOfSubdomains.set(lineObj.domain, prefixMap);
  }
  // The generated file is written to stdout, so the diagnostic goes to stderr
  // to keep it out of the output. The later row replaces the earlier one.
  if (prefixMap.has(prefix))
    console.error('Problem with: ', domain, lineObj.domain);
  prefixMap.set(prefix, lineObj);
}
| 115 |
var outputProducts = [];
var outputObj = new Map();

// Tells whether `entry` is already covered by a row registered on one of its
// parent domains (within the same base domain) for the same product: either a
// '*' row, or a prefixed row whose prefix matches the label directly below
// that parent.
// Per chat with 3pas team. We need to check prefix on subdomain not top level domain.
// ie: f*.foo.bar -> [b.f00.foo.bar, true], [f00.foo.bar, true], [f00.b.foo.bar, false]
const isCoveredByParentDomain = (entry, subdomains) => {
  const labels = entry.domain.split('.');
  // The first label belongs to the entry itself, so the walk starts above it.
  let childLabel = labels.shift();
  while (labels.length > 1) {
    const parentPrefixes = subdomains.get(labels.join('.'));
    if (parentPrefixes) {
      for (const parent of parentPrefixes.values()) {
        const sameProduct = parent.name_legal_product === entry.name_legal_product;
        if (parent.prefix === '' || !sameProduct)
          continue;
        if (parent.prefix === '*')
          return true;
        if (childLabel.substr(0, parent.prefix.length) === parent.prefix)
          return true;
      }
    }
    childLabel = labels.shift();
  }
  return false;
};

for (const subdomains of map.values()) {
  for (const prefixes of subdomains.values()) {
    const wildcardEntry = prefixes.get('*');
    for (const entry of prefixes.values()) {
      // A prefixed row naming the same product as the wildcard row of the
      // same domain adds nothing: the wildcard is already in the table.
      const repeatsWildcard = wildcardEntry && entry.prefix !== '*' &&
          wildcardEntry.name_legal_product === entry.name_legal_product;
      if (repeatsWildcard || isCoveredByParentDomain(entry, subdomains))
        continue;

      let outputPart = outputObj.get(entry.domain);
      if (!outputPart) {
        // Only the first 16 hex digits of the domain's SHA-1 are kept.
        outputPart = {hash: hex_sha1(entry.domain).substr(0, 16), prefixes: {}};
        outputObj.set(entry.domain, outputPart);
      }
      outputPart.prefixes[entry.prefix] = registerOutputProduct(entry.name_legal_product);
    }
  }
}
| 161 |
// Write the generated registry file to stdout: the license header, the list
// of product names, then one line per domain with the product names it refers
// to appended as a trailing comment.
const generatedFileHeader = [
  '// Copyright 2017 The Chromium Authors. All rights reserved.',
  '// Use of this source code is governed by a BSD-style license that can be',
  '// found in the LICENSE file.',
  '// clang-format off',
  '/* eslint-disable */',
  'ProductRegistry.register([',
].join('\n');
console.log(generatedFileHeader);

// One product name per line; the surrounding [ ] of the JSON array is cut off.
const productNamesJson = JSON.stringify(outputProducts).replace(/","/g, '",\n "');
console.log(' ' + productNamesJson.substring(1, productNamesJson.length - 1));
console.log('],');
console.log('[');

const domainEntries = Array.from(outputObj.values());
domainEntries.forEach((entry, position) => {
  const isLast = position === domainEntries.length - 1;
  const lineEnding = isLast ? '' : ',';
  const comments = Object.values(entry.prefixes).map(productIndex => '[' + outputProducts[productIndex] + ']');
  console.log(' ' + JSON.stringify(entry) + lineEnding + ' // ' + comments.join(' '));
});
console.log(']);');
| 183 |
| 184 |
| 185 // items.forEach(lineObj => console.log(lineObj.name_legal_product.padStart(50),
lineObj.domain.padStart(30))); |
| 186 // console.log("With *: ", items.filter(v => v.domain.indexOf('*') !== -1).lengt
h); |
| 187 // console.log("Total: ", items.length); |
| 188 |
| 189 |
| 190 |
/**
 * Returns the position of `name` in `outputProducts`, appending it first when
 * it is not listed yet. The lookup is a linear scan.
 * @param {string} name product name
 * @return {number} index of the name in `outputProducts`
 */
function registerOutputProduct(name) {
  const knownIndex = outputProducts.indexOf(name);
  if (knownIndex !== -1)
    return knownIndex;
  // push() returns the new length, so the appended name sits one below it.
  return outputProducts.push(name) - 1;
}
| 200 |
/**
 * Lazily splits CSV text into columns.
 *
 * A column is either unquoted (everything up to the next ',' or newline; the
 * literal text NULL is reported as null) or wrapped in double or single
 * quotes, where a doubled quote character stands for the quote itself. Quoted
 * text is yielded as written, i.e. doubled quotes are NOT collapsed here.
 *
 * @param {string} data CSV text to parse
 * @param {number=} lineOffset line number of the first line of `data`, used
 *     only in error messages (defaults to 1)
 * @yields {Array} a pair [columnText, endsLine], where endsLine is true when
 *     the column was terminated by a newline
 * @throws {Error} when a quoted column is malformed
 */
function* csvUnmarshaller(data, lineOffset) {
  let colLength = 0;
  let lineNo = lineOffset || 1;
  while (data.length) {
    const isQuoted = data[0] === '"' || data[0] === '\'';
    let pattern;
    if (data[0] === '"')
      pattern = /^"((?:[^"]|"")*)"(,|\n|$)/m;
    else if (data[0] === '\'')
      pattern = /^'((?:[^']|'')*)'(,|\n|$)/m;
    else
      pattern = /^([^,\n]*)(,|\n|$)/;

    const match = data.match(pattern);
    // With the `m` flag, `^` also matches after every newline, so a broken
    // quoted column could otherwise "match" at the start of a later line and
    // the wrong characters would be consumed. Only a match at offset 0 counts.
    if (!match || match.index !== 0)
      throw new Error('Bad data at line ' + lineNo + ' col: ' + colLength + ' ' + data.substr(0, 15));

    // Only an unquoted NULL means "no value"; a quoted 'NULL' stays text.
    const colData = (!isQuoted && match[1] === 'NULL') ? null : match[1];
    const endsLine = match[2] === '\n';

    colLength += match[0].length;
    if (endsLine) {
      lineNo++;
      colLength = 0;
    }
    yield [colData, endsLine];
    data = data.substr(match[0].length);
  }
}
| 231 |
| 232 |
| 233 // All sha1 helpers from here down. |
| 234 |
| 235 |
| 236 /* |
| 237 * A JavaScript implementation of the Secure Hash Algorithm, SHA-1, as defined |
| 238 * in FIPS PUB 180-1 |
| 239 * Version 2.1a Copyright Paul Johnston 2000 - 2002. |
| 240 * Other contributors: Greg Holt, Andrew Kepert, Ydnar, Lostinet |
| 241 * Distributed under the BSD License |
| 242 * See http://pajhome.org.uk/crypt/md5 for details. |
| 243 */ |
| 244 |
/*
 * Public entry points. Each takes string arguments and returns the SHA-1
 * digest (or the HMAC-SHA1 of key and data) encoded as hex, base-64 or a raw
 * string, depending on the function's prefix.
 */
function hex_sha1(s) {
  const digest = core_sha1(str2binb(s), s.length * chrsz);
  return binb2hex(digest);
}
function b64_sha1(s) {
  const digest = core_sha1(str2binb(s), s.length * chrsz);
  return binb2b64(digest);
}
function str_sha1(s) {
  const digest = core_sha1(str2binb(s), s.length * chrsz);
  return binb2str(digest);
}
function hex_hmac_sha1(key, data) {
  const mac = core_hmac_sha1(key, data);
  return binb2hex(mac);
}
function b64_hmac_sha1(key, data) {
  const mac = core_hmac_sha1(key, data);
  return binb2b64(mac);
}
function str_hmac_sha1(key, data) {
  const mac = core_hmac_sha1(key, data);
  return binb2str(mac);
}
| 267 |
/*
 * Self-test: hashes 'abc' and compares the result with the known digest.
 * Returns true when they agree.
 */
function sha1_vm_test() {
  const expectedDigest = 'a9993e364706816aba3e25717850c26c9cd0d89d';
  return hex_sha1('abc') === expectedDigest;
}
| 274 |
/*
 * Computes the SHA-1 of a message given as an array of big-endian 32-bit
 * words plus its length in bits. The padding is written into `x` itself.
 * Returns the digest as an array of five 32-bit words.
 */
function core_sha1(x, len) {
  /* Padding: a single 1 bit after the message, bit length in the last word. */
  x[len >> 5] |= 0x80 << (24 - len % 32);
  x[((len + 64 >> 9) << 4) + 15] = len;

  const schedule = new Array(80);
  let h0 = 1732584193;
  let h1 = -271733879;
  let h2 = -1732584194;
  let h3 = 271733878;
  let h4 = -1009589776;

  /* Process the message one 16-word block at a time. */
  for (let blockStart = 0; blockStart < x.length; blockStart += 16) {
    let a = h0;
    let b = h1;
    let c = h2;
    let d = h3;
    let e = h4;

    for (let round = 0; round < 80; round++) {
      /* The first 16 schedule words are the block; the rest are derived. */
      schedule[round] = round < 16 ?
          x[blockStart + round] :
          rol(schedule[round - 3] ^ schedule[round - 8] ^ schedule[round - 14] ^ schedule[round - 16], 1);
      const mixed = safe_add(
          safe_add(rol(a, 5), sha1_ft(round, b, c, d)), safe_add(safe_add(e, schedule[round]), sha1_kt(round)));
      e = d;
      d = c;
      c = rol(b, 30);
      b = a;
      a = mixed;
    }

    /* Fold this block's result into the running hash. */
    h0 = safe_add(a, h0);
    h1 = safe_add(b, h1);
    h2 = safe_add(c, h2);
    h3 = safe_add(d, h3);
    h4 = safe_add(e, h4);
  }
  return [h0, h1, h2, h3, h4];
}
| 318 |
/*
 * Combines the three words b, c and d with the bitwise function that belongs
 * to iteration t: a bitwise select for t < 20, a bitwise majority for
 * 40 <= t < 60, and a three-way XOR otherwise.
 */
function sha1_ft(t, b, c, d) {
  const usesSelect = t < 20;
  const usesMajority = t >= 40 && t < 60;
  if (usesSelect)
    return (b & c) | ((~b) & d);
  if (usesMajority)
    return (b & c) | (b & d) | (c & d);
  return b ^ c ^ d;
}
| 332 |
/*
 * Returns the additive constant for iteration t; the constant changes every
 * 20 iterations.
 */
function sha1_kt(t) {
  if (t < 20)
    return 1518500249;
  if (t < 40)
    return 1859775393;
  if (t < 60)
    return -1894007588;
  return -899497514;
}
| 339 |
/*
 * Computes the HMAC-SHA1 of a key and some data (both strings) and returns
 * it as an array of 32-bit words.
 */
function core_hmac_sha1(key, data) {
  /* A key longer than 16 words is replaced by its own hash. */
  let keyWords = str2binb(key);
  if (keyWords.length > 16)
    keyWords = core_sha1(keyWords, key.length * chrsz);

  /* XOR the key into the inner and outer pads, 16 words each. */
  const innerPad = [];
  const outerPad = [];
  for (let word = 0; word < 16; word++) {
    innerPad.push(keyWords[word] ^ 0x36363636);
    outerPad.push(keyWords[word] ^ 0x5C5C5C5C);
  }

  const innerMessage = innerPad.concat(str2binb(data));
  const innerHash = core_sha1(innerMessage, 512 + data.length * chrsz);
  return core_sha1(outerPad.concat(innerHash), 512 + 160);
}
| 357 |
/*
 * Adds two integers with wrap-around at 2^32. The sum is built from 16-bit
 * halves to work around bugs in some JS interpreters.
 */
function safe_add(x, y) {
  const lowSum = (x & 0xFFFF) + (y & 0xFFFF);
  const carry = lowSum >> 16;
  const highSum = (x >> 16) + (y >> 16) + carry;
  return (highSum << 16) | (lowSum & 0xFFFF);
}
| 367 |
/*
 * Rotates the 32-bit number `num` to the left by `cnt` bits.
 */
function rol(num, cnt) {
  const shiftedOut = num >>> (32 - cnt);
  const shiftedUp = num << cnt;
  return shiftedUp | shiftedOut;
}
| 374 |
/*
 * Packs a string into an array of big-endian 32-bit words, taking `chrsz`
 * bits from each character. Bits of a character code above that width are
 * silently dropped.
 */
function str2binb(str) {
  const words = [];
  const charMask = (1 << chrsz) - 1;
  for (let charIndex = 0; charIndex < str.length; charIndex++) {
    const bitOffset = charIndex * chrsz;
    const charBits = str.charCodeAt(charIndex) & charMask;
    words[bitOffset >> 5] |= charBits << (32 - chrsz - bitOffset % 32);
  }
  return words;
}
| 386 |
/*
 * Unpacks an array of big-endian 32-bit words into a string, one character
 * per `chrsz` bits.
 */
function binb2str(bin) {
  const charMask = (1 << chrsz) - 1;
  const totalBits = bin.length * 32;
  const chars = [];
  for (let bitOffset = 0; bitOffset < totalBits; bitOffset += chrsz) {
    const charCode = (bin[bitOffset >> 5] >>> (32 - chrsz - bitOffset % 32)) & charMask;
    chars.push(String.fromCharCode(charCode));
  }
  return chars.join('');
}
| 397 |
/*
 * Renders an array of big-endian 32-bit words as a hex string, two digits
 * per byte. The digit case is selected by `hexcase`.
 */
function binb2hex(binarray) {
  const digits = hexcase ? '0123456789ABCDEF' : '0123456789abcdef';
  const byteCount = binarray.length * 4;
  let hex = '';
  for (let byteIndex = 0; byteIndex < byteCount; byteIndex++) {
    const word = binarray[byteIndex >> 2];
    /* Byte 0 of a word is its most significant byte. */
    const shift = (3 - byteIndex % 4) * 8;
    const highNibble = (word >> (shift + 4)) & 0xF;
    const lowNibble = (word >> shift) & 0xF;
    hex += digits.charAt(highNibble) + digits.charAt(lowNibble);
  }
  return hex;
}
| 410 |
/*
 * Renders an array of big-endian 32-bit words as a base-64 string. Positions
 * that lie past the end of the input are filled with `b64pad`.
 */
function binb2b64(binarray) {
  const alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
  const totalBits = binarray.length * 32;
  /* Reads one byte of the word array; byte 0 of a word is the top byte. */
  const byteAt = (byteIndex) => (binarray[byteIndex >> 2] >> 8 * (3 - byteIndex % 4)) & 0xFF;

  let encoded = '';
  for (let firstByte = 0; firstByte < binarray.length * 4; firstByte += 3) {
    /* Three input bytes yield four 6-bit output characters. */
    const triplet = (byteAt(firstByte) << 16) | (byteAt(firstByte + 1) << 8) | byteAt(firstByte + 2);
    for (let sextet = 0; sextet < 4; sextet++) {
      const isPastInput = firstByte * 8 + sextet * 6 > totalBits;
      encoded += isPastInput ? b64pad : alphabet.charAt((triplet >> 6 * (3 - sextet)) & 0x3F);
    }
  }
  return encoded;
}
OLD | NEW |