Skip to content
This repository has been archived by the owner on Apr 22, 2023. It is now read-only.

Commit

Permalink
Close #954 URL parsing/formatting corrections
Browse files Browse the repository at this point in the history
1. Allow single-quotes in urls, but escape them.
2. Add comments about which RFCs we're following for guidance.
3. Handle any invalid character in the hostname portion.
4. Lowercase the protocol and hostname portions, since they are
case-insensitive.
  • Loading branch information
isaacs committed Apr 20, 2011
1 parent d3d35ec commit 90802d6
Show file tree
Hide file tree
Showing 2 changed files with 171 additions and 19 deletions.
72 changes: 65 additions & 7 deletions lib/url.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,25 +24,40 @@ exports.resolve = urlResolve;
exports.resolveObject = urlResolveObject;
exports.format = urlFormat;

// Reference: RFC 3986, RFC 1808, RFC 2396

// define these here so at least they only have to be
// compiled once on the first module load.
var protocolPattern = /^([a-z0-9]+:)/,
var protocolPattern = /^([a-z0-9]+:)/i,
portPattern = /:[0-9]+$/,
delims = ['<', '>', '"', '\'', '`', /\s/],
// RFC 2396: characters reserved for delimiting URLs.
delims = ['<', '>', '"', '`', ' ', '\r', '\n', '\t'],
// RFC 2396: characters not allowed for various reasons.
unwise = ['{', '}', '|', '\\', '^', '~', '[', ']', '`'].concat(delims),
nonHostChars = ['/', '?', ';', '#'].concat(unwise),
// Allowed by RFCs, but cause of XSS attacks. Always escape these.
autoEscape = ['\''],
// Characters that are never ever allowed in a hostname.
// Note that any invalid chars are also handled, but these
// are the ones that are *expected* to be seen, so we fast-path
// them.
nonHostChars = ['%', '/', '?', ';', '#']
.concat(unwise).concat(autoEscape),
hostnameMaxLen = 255,
hostnamePartPattern = /^[a-z0-9][a-z0-9A-Z-]{0,62}$/,
hostnamePartPattern = /^[a-zA-Z0-9][a-z0-9A-Z-]{0,62}$/,
hostnamePartStart = /^([a-zA-Z0-9][a-z0-9A-Z-]{0,62})(.*)$/,
// protocols that can allow "unsafe" and "unwise" chars.
unsafeProtocol = {
'javascript': true,
'javascript:': true
},
// protocols that never have a hostname.
hostlessProtocol = {
'javascript': true,
'javascript:': true,
'file': true,
'file:': true
},
// protocols that always have a path component.
pathedProtocol = {
'http': true,
'https': true,
Expand All @@ -54,6 +69,7 @@ var protocolPattern = /^([a-z0-9]+:)/,
'gopher:': true,
'file:': true
},
// protocols that always contain a // bit.
slashedProtocol = {
'http': true,
'https': true,
Expand All @@ -74,10 +90,19 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
var out = {},
rest = url;

// cut off any delimiters.
// This is to support parsing stuff like "<http://foo.com>"
for (var i = 0, l = rest.length; i < l; i++) {
if (delims.indexOf(rest.charAt(i)) === -1) break;
}
if (i !== 0) rest = rest.substr(i);


var proto = protocolPattern.exec(rest);
if (proto) {
proto = proto[0];
out.protocol = proto;
var lowerProto = proto.toLowerCase();
out.protocol = lowerProto;
rest = rest.substr(proto.length);
}

Expand Down Expand Up @@ -119,6 +144,7 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
var key = keys[i];
out[key] = p[key];
}

// we've indicated that there is a hostname,
// so even if it's empty, it has to be present.
out.hostname = out.hostname || '';
Expand All @@ -130,17 +156,49 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
var hostparts = out.hostname.split(/\./);
for (var i = 0, l = hostparts.length; i < l; i++) {
var part = hostparts[i];
if (!part) continue;
if (!part.match(hostnamePartPattern)) {
out.hostname = '';
var validParts = hostparts.slice(0, i);
var notHost = hostparts.slice(i + 1);
var bit = part.match(hostnamePartStart);
if (bit) {
validParts.push(bit[1]);
notHost.unshift(bit[2]);
}
if (notHost.length) {
rest = '/' + notHost.join('.') + rest
}
out.hostname = validParts.join('.');
break;
}
}
}
// hostnames are always lower case.
out.hostname = out.hostname.toLowerCase();

out.host = ((out.auth) ? out.auth + '@' : '') +
(out.hostname || '') +
((out.port) ? ':' + out.port : '');
out.href += out.host;
}

// now rest is set to the post-host stuff.
// chop off any delim chars.
if (!unsafeProtocol[proto]) {
if (!unsafeProtocol[lowerProto]) {

// First, make 100% sure that any "autoEscape" chars get
// escaped, even if encodeURIComponent doesn't think they
// need to be.
for (var i = 0, l = autoEscape.length; i < l; i++) {
var ae = autoEscape[i];
var esc = encodeURIComponent(ae);
if (esc === ae) {
esc = escape(ae);
}
rest = rest.split(ae).join(esc);
}

// Now make sure that delims never appear in a url.
var chop = rest.length;
for (var i = 0, l = delims.length; i < l; i++) {
var c = rest.indexOf(delims[i]);
Expand Down
118 changes: 106 additions & 12 deletions test/simple/test-url.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,99 @@ var parseTests = {
'href': '//some_path',
'pathname': '//some_path'
},
'HTTP://www.example.com/' : {
'href': 'http://www.example.com/',
'protocol': 'http:',
'host': 'www.example.com',
'hostname': 'www.example.com',
'pathname': '/'
},
'http://www.ExAmPlE.com/' : {
'href': 'http://www.example.com/',
'protocol': 'http:',
'host': 'www.example.com',
'hostname': 'www.example.com',
'pathname': '/'

},
'http://user:pw@www.ExAmPlE.com/' : {
'href': 'http://user:pw@www.example.com/',
'protocol': 'http:',
'auth': 'user:pw',
'host': 'user:pw@www.example.com',
'hostname': 'www.example.com',
'pathname': '/'

},
'http://USER:PW@www.ExAmPlE.com/' : {
'href': 'http://USER:PW@www.example.com/',
'protocol': 'http:',
'auth': 'USER:PW',
'host': 'USER:PW@www.example.com',
'hostname': 'www.example.com',
'pathname': '/'
},
'http://x.com/path?that\'s#all, folks' : {
'href': 'http://x.com/path?that%27s#all,',
'protocol': 'http:',
'host': 'x.com',
'hostname': 'x.com',
'search': '?that%27s',
'query': 'that%27s',
'pathname': '/path',
'hash': '#all,'
},
'HTTP://X.COM/Y' : {
'href': 'http://x.com/Y',
'protocol': 'http:',
'host': 'x.com',
'hostname': 'x.com',
'pathname': '/Y',
},
// an unexpected invalid char in the hostname.
'HtTp://x.y.cOm*a/b/c?d=e#f g<h>i' : {
'href': 'http://x.y.com/*a/b/c?d=e#f',
'protocol': 'http:',
'host': 'x.y.com',
'hostname': 'x.y.com',
'pathname': '/*a/b/c',
'search': '?d=e',
'query': 'd=e',
'hash': '#f'
},
// make sure that we don't accidentally lowercase the path parts.
'HtTp://x.y.cOm*A/b/c?d=e#f g<h>i' : {
'href': 'http://x.y.com/*A/b/c?d=e#f',
'protocol': 'http:',
'host': 'x.y.com',
'hostname': 'x.y.com',
'pathname': '/*A/b/c',
'search': '?d=e',
'query': 'd=e',
'hash': '#f'
},
'http://x...y...#p': {
'href': 'http://x...y.../#p',
'protocol': 'http:',
'host': 'x...y...',
'hostname': 'x...y...',
'hash': '#p',
'pathname': '/'
},
'http://x/p/"quoted"': {
'href': 'http://x/p/',
'protocol':'http:',
'host': 'x',
'hostname': 'x',
'pathname': '/p/'
},
'<http://goo.corn/bread> Is a URL!': {
'href': 'http://goo.corn/bread',
'protocol': 'http:',
'host': 'goo.corn',
'hostname': 'goo.corn',
'pathname': '/bread'
},
'http://www.narwhaljs.org/blog/categories?id=news' : {
'href': 'http://www.narwhaljs.org/blog/categories?id=news',
'protocol': 'http:',
Expand All @@ -58,17 +151,18 @@ var parseTests = {
'query': '??&hl=en&src=api&x=2&y=2&z=3&s=',
'pathname': '/vt/lyrs=m@114'
},
'http://user:pass@mt0.google.com/vt/lyrs=m@114???&hl=en&src=api&x=2&y=2&z=3&s=' : {
'href': 'http://user:pass@mt0.google.com/vt/lyrs=m@114???' +
'&hl=en&src=api&x=2&y=2&z=3&s=',
'protocol': 'http:',
'host': 'user:pass@mt0.google.com',
'auth': 'user:pass',
'hostname': 'mt0.google.com',
'search': '???&hl=en&src=api&x=2&y=2&z=3&s=',
'query': '??&hl=en&src=api&x=2&y=2&z=3&s=',
'pathname': '/vt/lyrs=m@114'
},
'http://user:pass@mt0.google.com/vt/lyrs=m@114???&hl=en&src=api&x=2&y=2&z=3&s=':
{
'href': 'http://user:pass@mt0.google.com/vt/lyrs=m@114???' +
'&hl=en&src=api&x=2&y=2&z=3&s=',
'protocol': 'http:',
'host': 'user:pass@mt0.google.com',
'auth': 'user:pass',
'hostname': 'mt0.google.com',
'search': '???&hl=en&src=api&x=2&y=2&z=3&s=',
'query': '??&hl=en&src=api&x=2&y=2&z=3&s=',
'pathname': '/vt/lyrs=m@114'
},
'file:///etc/passwd' : {
'href': 'file:///etc/passwd',
'protocol': 'file:',
Expand Down Expand Up @@ -154,7 +248,7 @@ for (var u in parseTests) {
'parse(' + u + ').' + i + ' == ' + e + '\nactual: ' + a);
}

var expected = u,
var expected = parseTests[u].href,
actual = url.format(parseTests[u]);

assert.equal(expected, actual,
Expand Down

0 comments on commit 90802d6

Please sign in to comment.