diff --git a/lib/url.js b/lib/url.js index cbea2f0fc3a..99a0e67422b 100644 --- a/lib/url.js +++ b/lib/url.js @@ -24,25 +24,40 @@ exports.resolve = urlResolve; exports.resolveObject = urlResolveObject; exports.format = urlFormat; +// Reference: RFC 3986, RFC 1808, RFC 2396 + // define these here so at least they only have to be // compiled once on the first module load. -var protocolPattern = /^([a-z0-9]+:)/, +var protocolPattern = /^([a-z0-9]+:)/i, portPattern = /:[0-9]+$/, - delims = ['<', '>', '"', '\'', '`', /\s/], + // RFC 2396: characters reserved for delimiting URLs. + delims = ['<', '>', '"', '`', ' ', '\r', '\n', '\t'], + // RFC 2396: characters not allowed for various reasons. unwise = ['{', '}', '|', '\\', '^', '~', '[', ']', '`'].concat(delims), - nonHostChars = ['/', '?', ';', '#'].concat(unwise), + // Allowed by RFCs, but cause of XSS attacks. Always escape these. + autoEscape = ['\''], + // Characters that are never ever allowed in a hostname. + // Note that any invalid chars are also handled, but these + // are the ones that are *expected* to be seen, so we fast-path + // them. + nonHostChars = ['%', '/', '?', ';', '#'] + .concat(unwise).concat(autoEscape), hostnameMaxLen = 255, - hostnamePartPattern = /^[a-z0-9][a-z0-9A-Z-]{0,62}$/, + hostnamePartPattern = /^[a-zA-Z0-9][a-z0-9A-Z-]{0,62}$/, + hostnamePartStart = /^([a-zA-Z0-9][a-z0-9A-Z-]{0,62})(.*)$/, + // protocols that can allow "unsafe" and "unwise" chars. unsafeProtocol = { 'javascript': true, 'javascript:': true }, + // protocols that never have a hostname. hostlessProtocol = { 'javascript': true, 'javascript:': true, 'file': true, 'file:': true }, + // protocols that always have a path component. pathedProtocol = { 'http': true, 'https': true, @@ -54,6 +69,7 @@ var protocolPattern = /^([a-z0-9]+:)/, 'gopher:': true, 'file:': true }, + // protocols that always contain a // bit. slashedProtocol = { 'http': true, 'https': true, @@ -74,10 +90,19 @@ function urlParse(url, parseQueryString, slashesDenoteHost) { var out = {}, rest = url; + // cut off any delimiters. + // This is to support parse stuff like "" + for (var i = 0, l = rest.length; i < l; i++) { + if (delims.indexOf(rest.charAt(i)) === -1) break; + } + if (i !== 0) rest = rest.substr(i); + + var proto = protocolPattern.exec(rest); if (proto) { proto = proto[0]; - out.protocol = proto; + var lowerProto = proto.toLowerCase(); + out.protocol = lowerProto; rest = rest.substr(proto.length); } @@ -119,6 +144,7 @@ function urlParse(url, parseQueryString, slashesDenoteHost) { var key = keys[i]; out[key] = p[key]; } + // we've indicated that there is a hostname, // so even if it's empty, it has to be present. out.hostname = out.hostname || ''; @@ -130,17 +156,49 @@ function urlParse(url, parseQueryString, slashesDenoteHost) { var hostparts = out.hostname.split(/\./); for (var i = 0, l = hostparts.length; i < l; i++) { var part = hostparts[i]; + if (!part) continue; if (!part.match(hostnamePartPattern)) { - out.hostname = ''; + var validParts = hostparts.slice(0, i); + var notHost = hostparts.slice(i + 1); + var bit = part.match(hostnamePartStart); + if (bit) { + validParts.push(bit[1]); + notHost.unshift(bit[2]); + } + if (notHost.length) { + rest = '/' + notHost.join('.') + rest + } + out.hostname = validParts.join('.'); break; } } } + // hostnames are always lower case. + out.hostname = out.hostname.toLowerCase(); + + out.host = ((out.auth) ? out.auth + '@' : '') + + (out.hostname || '') + + ((out.port) ? ':' + out.port : ''); + out.href += out.host; } // now rest is set to the post-host stuff. // chop off any delim chars. - if (!unsafeProtocol[proto]) { + if (!unsafeProtocol[lowerProto]) { + + // First, make 100% sure that any "autoEscape" chars get + // escaped, even if encodeURIComponent doesn't think they + // need to be. + for (var i = 0, l = autoEscape.length; i < l; i++) { + var ae = autoEscape[i]; + var esc = encodeURIComponent(ae); + if (esc === ae) { + esc = escape(ae); + } + rest = rest.split(ae).join(esc); + } + + // Now make sure that delims never appear in a url. var chop = rest.length; for (var i = 0, l = delims.length; i < l; i++) { var c = rest.indexOf(delims[i]); diff --git a/test/simple/test-url.js b/test/simple/test-url.js index 4f3d139ca9a..e52dacd8bd0 100644 --- a/test/simple/test-url.js +++ b/test/simple/test-url.js @@ -32,6 +32,99 @@ var parseTests = { 'href': '//some_path', 'pathname': '//some_path' }, + 'HTTP://www.example.com/' : { + 'href': 'http://www.example.com/', + 'protocol': 'http:', + 'host': 'www.example.com', + 'hostname': 'www.example.com', + 'pathname': '/' + }, + 'http://www.ExAmPlE.com/' : { + 'href': 'http://www.example.com/', + 'protocol': 'http:', + 'host': 'www.example.com', + 'hostname': 'www.example.com', + 'pathname': '/' + + }, + 'http://user:pw@www.ExAmPlE.com/' : { + 'href': 'http://user:pw@www.example.com/', + 'protocol': 'http:', + 'auth': 'user:pw', + 'host': 'user:pw@www.example.com', + 'hostname': 'www.example.com', + 'pathname': '/' + + }, + 'http://USER:PW@www.ExAmPlE.com/' : { + 'href': 'http://USER:PW@www.example.com/', + 'protocol': 'http:', + 'auth': 'USER:PW', + 'host': 'USER:PW@www.example.com', + 'hostname': 'www.example.com', + 'pathname': '/' + }, + 'http://x.com/path?that\'s#all, folks' : { + 'href': 'http://x.com/path?that%27s#all,', + 'protocol': 'http:', + 'host': 'x.com', + 'hostname': 'x.com', + 'search': '?that%27s', + 'query': 'that%27s', + 'pathname': '/path', + 'hash': '#all,' + }, + 'HTTP://X.COM/Y' : { + 'href': 'http://x.com/Y', + 'protocol': 'http:', + 'host': 'x.com', + 'hostname': 'x.com', + 'pathname': '/Y', + }, + // an unexpected invalid char in the hostname. + 'HtTp://x.y.cOm*a/b/c?d=e#f gi' : { + 'href': 'http://x.y.com/*a/b/c?d=e#f', + 'protocol': 'http:', + 'host': 'x.y.com', + 'hostname': 'x.y.com', + 'pathname': '/*a/b/c', + 'search': '?d=e', + 'query': 'd=e', + 'hash': '#f' + }, + // make sure that we don't accidentally lcast the path parts. + 'HtTp://x.y.cOm*A/b/c?d=e#f gi' : { + 'href': 'http://x.y.com/*A/b/c?d=e#f', + 'protocol': 'http:', + 'host': 'x.y.com', + 'hostname': 'x.y.com', + 'pathname': '/*A/b/c', + 'search': '?d=e', + 'query': 'd=e', + 'hash': '#f' + }, + 'http://x...y...#p': { + 'href': 'http://x...y.../#p', + 'protocol': 'http:', + 'host': 'x...y...', + 'hostname': 'x...y...', + 'hash': '#p', + 'pathname': '/' + }, + 'http://x/p/"quoted"': { + 'href': 'http://x/p/', + 'protocol':'http:', + 'host': 'x', + 'hostname': 'x', + 'pathname': '/p/' + }, + ' Is a URL!': { + 'href': 'http://goo.corn/bread', + 'protocol': 'http:', + 'host': 'goo.corn', + 'hostname': 'goo.corn', + 'pathname': '/bread' + }, 'http://www.narwhaljs.org/blog/categories?id=news' : { 'href': 'http://www.narwhaljs.org/blog/categories?id=news', 'protocol': 'http:', @@ -58,17 +151,18 @@ var parseTests = { 'query': '??&hl=en&src=api&x=2&y=2&z=3&s=', 'pathname': '/vt/lyrs=m@114' }, - 'http://user:pass@mt0.google.com/vt/lyrs=m@114???&hl=en&src=api&x=2&y=2&z=3&s=' : { - 'href': 'http://user:pass@mt0.google.com/vt/lyrs=m@114???' + - '&hl=en&src=api&x=2&y=2&z=3&s=', - 'protocol': 'http:', - 'host': 'user:pass@mt0.google.com', - 'auth': 'user:pass', - 'hostname': 'mt0.google.com', - 'search': '???&hl=en&src=api&x=2&y=2&z=3&s=', - 'query': '??&hl=en&src=api&x=2&y=2&z=3&s=', - 'pathname': '/vt/lyrs=m@114' - }, + 'http://user:pass@mt0.google.com/vt/lyrs=m@114???&hl=en&src=api&x=2&y=2&z=3&s=': + { + 'href': 'http://user:pass@mt0.google.com/vt/lyrs=m@114???' + + '&hl=en&src=api&x=2&y=2&z=3&s=', + 'protocol': 'http:', + 'host': 'user:pass@mt0.google.com', + 'auth': 'user:pass', + 'hostname': 'mt0.google.com', + 'search': '???&hl=en&src=api&x=2&y=2&z=3&s=', + 'query': '??&hl=en&src=api&x=2&y=2&z=3&s=', + 'pathname': '/vt/lyrs=m@114' + }, 'file:///etc/passwd' : { 'href': 'file:///etc/passwd', 'protocol': 'file:', @@ -154,7 +248,7 @@ for (var u in parseTests) { 'parse(' + u + ').' + i + ' == ' + e + '\nactual: ' + a); } - var expected = u, + var expected = parseTests[u].href, actual = url.format(parseTests[u]); assert.equal(expected, actual,