Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(tools.uri) normalization decodes as much as possible #8140

Merged
merged 11 commits into from
Jul 26, 2022
97 changes: 55 additions & 42 deletions kong/tools/uri.lua
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
local _M = {}

local table_new = require "table.new"

local string_char = string.char
local string_upper = string.upper
Expand All @@ -12,52 +11,71 @@ local table_concat = table.concat
local ngx_re_find = ngx.re.find
local ngx_re_gsub = ngx.re.gsub

-- Charset:
-- reserved = "!" / "*" / "'" / "(" / ")" / ";" / ":" /
-- "@" / "&" / "=" / "+" / "$" / "," / "/" / "?" / "%" / "#" / "[" / "]"
-- unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
-- other: * (meaning any char that is not mentioned above)

-- Reserved characters have special meaning in URI. Encoding/decoding it affects the semantics of the URI;
-- Unreserved characters are safe to use as part of HTTP message without encoding;
-- Other characters has not special meaning but may be not safe to use as part of HTTP message without encoding;

-- We should not unescape or escape reserved characters;
-- We should unescape but not escape unreserved characters;
-- We choose to unescape when processing and escape when forwarding for other characters

local RESERVED_CHARS = "!*'();:@&=+$,/?%#[]"

local chars_to_decode = table_new(256, 0)
do
-- reserved
for i = 1, #RESERVED_CHARS do
chars_to_decode[string_byte(RESERVED_CHARS, i)] = true
end

-- unreserved and others default to nil
end

local RESERVED_CHARACTERS = {
[0x21] = true, -- !
[0x23] = true, -- #
[0x24] = true, -- $
[0x25] = true, -- %
[0x26] = true, -- &
[0x27] = true, -- '
[0x28] = true, -- (
[0x29] = true, -- )
[0x2A] = true, -- *
[0x2B] = true, -- +
[0x2C] = true, -- ,
[0x2F] = true, -- /
[0x3A] = true, -- :
[0x3B] = true, -- ;
[0x3D] = true, -- =
[0x3F] = true, -- ?
[0x40] = true, -- @
[0x5B] = true, -- [
[0x5D] = true, -- ]
}

local ESCAPE_PATTERN = "[^!#$&'()*+,/:;=?@[\\]A-Z\\d-_.~%]"

local TMP_OUTPUT = require("table.new")(16, 0)
local DOT = string_byte(".")
local SLASH = string_byte("/")

local function percent_decode(m)
local hex = m[1]
local num = tonumber(hex, 16)
if RESERVED_CHARACTERS[num] then
return string_upper(m[0])
end

local function normalize_decode(m)
local hex = m[1]
local num = tonumber(hex, 16)

-- from rfc3986 we should decode unreserved character
-- and we choose to decode "others"
if not chars_to_decode[num] then -- is not reserved(false or nil)
return string_char(num)
end

return string_upper(m[0])
end


local function escape(m)
local function percent_escape(m)
return string_format("%%%02X", string_byte(m[0]))
end

-- This function does slightly different things from its name.
-- It ensures the output to be safe to a part of HTTP message (headers or path)
-- and preserve origin semantics
local function escape(uri)
if ngx_re_find(uri, ESCAPE_PATTERN, "joi") then
return (ngx_re_gsub(uri, ESCAPE_PATTERN, percent_escape, "joi"))
end

return uri
end

function _M.normalize(uri, merge_slashes)

local function normalize(uri, merge_slashes)
Copy link
Member Author

@bungle bungle Jan 31, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function does not percent encode anymore.

Copy link
Member Author

@bungle bungle Jan 31, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This issue can be illustrated with curl:

~ ❯ curl http://localhost:8080/uri/ö      
<html><body>/uri/ö</html>

where I have this location block:

location /uri {
  content_by_lua '
    ngx.say("<html><body>", ngx.var.request_uri, "</html>")
  ';
}

See, the ö is unescaped, but normalization function has to escape it properly. If we don't do it then:

/uri/%C3%B6

and

/uri/ö

Are different paths and you need to create both in Route paths. They should be treated equally.

-- check for simple cases and early exit
if uri == "" or uri == "/" then
return uri
Expand All @@ -67,11 +85,12 @@ function _M.normalize(uri, merge_slashes)
-- (this can in some cases lead to unnecessary percent-decoding)
if string_find(uri, "%", 1, true) then
-- decoding percent-encoded triplets of unreserved characters
uri = ngx_re_gsub(uri, "%([\\dA-F]{2})", percent_decode, "joi")
uri = ngx_re_gsub(uri, "%([\\dA-F]{2})", normalize_decode, "joi")
end

-- check if the uri contains a dot
-- (this can in some cases lead to unnecessary dot removal processing)
-- notice: it's expected that /%2e./ is considered the same of /../
if string_find(uri, ".", 1, true) == nil then
if not merge_slashes then
return uri
Expand Down Expand Up @@ -148,13 +167,7 @@ function _M.normalize(uri, merge_slashes)
end


function _M.escape(uri)
if ngx_re_find(uri, ESCAPE_PATTERN, "joi") then
return ngx_re_gsub(uri, ESCAPE_PATTERN, escape, "joi")
end

return uri
end


return _M
return {
escape = escape,
normalize = normalize,
}
40 changes: 40 additions & 0 deletions spec/01-unit/08-router_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -2789,6 +2789,46 @@ for _, flavor in ipairs({ "traditional", "traditional_compatible" }) do
assert.equal(2, #match_t.matches.uri_captures)
end)

it("returns uri_captures normalized, fix #7913", function()
local use_case = {
{
service = service,
route = {
id = "e8fb37f1-102d-461e-9c51-6608a6bb8101",
paths = { [[~/users/(?P<fullname>[a-zA-Z\s\d%]+)/profile/?(?P<scope>[a-z]*)]] },
},
},
}


local router = assert(Router.new(use_case))
local _ngx = mock_ngx("GET", "/users/%6aohn%20doe/profile", { host = "domain.org" })

router._set_ngx(_ngx)
local match_t = router:exec()
assert.equal("john doe", match_t.matches.uri_captures[1])
assert.equal("john doe", match_t.matches.uri_captures.fullname)
assert.equal("", match_t.matches.uri_captures[2])
assert.equal("", match_t.matches.uri_captures.scope)
-- returns the full match as well
assert.equal("/users/john doe/profile", match_t.matches.uri_captures[0])
-- no stripped_uri capture
assert.is_nil(match_t.matches.uri_captures.stripped_uri)
assert.equal(2, #match_t.matches.uri_captures)

-- again, this time from the LRU cache
local match_t = router:exec()
assert.equal("john doe", match_t.matches.uri_captures[1])
assert.equal("john doe", match_t.matches.uri_captures.fullname)
assert.equal("", match_t.matches.uri_captures[2])
assert.equal("", match_t.matches.uri_captures.scope)
-- returns the full match as well
assert.equal("/users/john doe/profile", match_t.matches.uri_captures[0])
-- no stripped_uri capture
assert.is_nil(match_t.matches.uri_captures.stripped_uri)
assert.equal(2, #match_t.matches.uri_captures)
end)

it("returns no uri_captures from a [uri prefix] match", function()
local use_case = {
{
Expand Down
6 changes: 5 additions & 1 deletion spec/01-unit/18-tools_uri_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ describe("kong.tools.uri", function()

it("no normalization necessary (reserved characters)", function()
assert.equal("/a%2Fb%2Fc/", uri.normalize("/a%2Fb%2Fc/"))
assert.equal("/%21%23%24%25%26%27%28%29%2A%2B%2C%2F%3A%3B%3D%3F%40%5B%5D", uri.normalize("/%21%23%24%25%26%27%28%29%2A%2B%2C%2F%3A%3B%3D%3F%40%5B%5D"))
assert.equal("/ %21%23%24%25%26%27%28%29%2A%2B%2C%2F%3A%3B%3D%3F%40%5B%5D", uri.normalize("/%20%21%23%24%25%26%27%28%29%2A%2B%2C%2F%3A%3B%3D%3F%40%5B%5D"))
end)

it("converting percent-encoded triplets to uppercase", function()
Expand Down Expand Up @@ -50,6 +50,10 @@ describe("kong.tools.uri", function()
it("decodes non-ASCII characters that are unreserved, issue #2366", function()
assert.equal("/endeløst", uri.normalize("/endel%C3%B8st"))
end)

it("does normalize complex uri that has characters outside of normal uri charset", function()
assert.equal("/ä/a./a/_\x99\xaf%2F%2F" , uri.normalize("/%C3%A4/a/%2e./a%2E//a/%2e/./a/../a/%2e%2E/%5f%99%af%2f%2F", true))
end)
end)

describe("escape()", function()
Expand Down