From 135ef8acf3510682a240cbc9fd73fdc244f3b8eb Mon Sep 17 00:00:00 2001 From: Keery Nie Date: Fri, 23 Aug 2024 11:13:03 +0800 Subject: [PATCH] fix(vault): let shdict secret vault cache presist enough time during resurrect_ttl (#13471) This PR fixes an issue that rotate_secret may flush a secret value with NEGATIVE_CACHED_VALUE when vault backend is down and a secret value stored in the shared dict has passed its ttl and hasn't finished consuming its resurrect_ttl. TLDR; this issue happens easily when a reference is being used via the vault PDK function in custom codes(serverless functions, custom plugins, etc.), and some of the worker processes may not be triggered via the service/routes that use these custom codes, and these worker processes do not hold a valid LRU cache for the secret value The issue was first reported in FTI-6137. --------- Signed-off-by: Aapo Talvensaari Co-authored-by: Aapo Talvensaari (cherry picked from commit 9269195330e2a1ef7f6ae231d3063aa28e94403b) --- .../fix-vault-resurrect-ttl-multi-worker.yml | 3 + kong/pdk/vault.lua | 24 ++- .../13-vaults/07-resurrect_spec.lua | 139 ++++++++++++++++++ .../custom_vaults/kong/vaults/test/init.lua | 39 +++++ 4 files changed, 202 insertions(+), 3 deletions(-) create mode 100644 changelog/unreleased/kong/fix-vault-resurrect-ttl-multi-worker.yml diff --git a/changelog/unreleased/kong/fix-vault-resurrect-ttl-multi-worker.yml b/changelog/unreleased/kong/fix-vault-resurrect-ttl-multi-worker.yml new file mode 100644 index 000000000000..e563b3ee422e --- /dev/null +++ b/changelog/unreleased/kong/fix-vault-resurrect-ttl-multi-worker.yml @@ -0,0 +1,3 @@ +message: Fixed an issue where the Vault secret cache got refreshed during `resurrect_ttl` time and could not be fetched by other workers. +type: bugfix +scope: Core diff --git a/kong/pdk/vault.lua b/kong/pdk/vault.lua index 2aad41c8a639..14bf4f8bbbab 100644 --- a/kong/pdk/vault.lua +++ b/kong/pdk/vault.lua @@ -822,9 +822,20 @@ local function new(self) -- @tparam table parsed_reference the parsed reference -- @treturn string|nil the retrieved value from the vault, of `nil` -- @treturn string|nil a string describing an error if there was one + -- @treturn boolean|nil whether to resurrect value in case vault errors or doesn't return value -- @usage local value, err = get_from_vault(reference, strategy, config, cache_key, parsed_reference) - local function get_from_vault(reference, strategy, config, cache_key, parsed_reference) + local function get_from_vault(reference, strategy, config, cache_key, parsed_reference, resurrect) local value, err, ttl = invoke_strategy(strategy, config, parsed_reference) + if resurrect and value == nil then + local resurrected_value = SECRETS_CACHE:get(cache_key) + if resurrected_value then + return resurrected_value + + else + return nil, fmt("could not get value from external vault (%s)", err) + end + end + local cache_value, shdict_ttl, lru_ttl = get_cache_value_and_ttl(value, config, ttl) local ok, cache_err = SECRETS_CACHE:safe_set(cache_key, cache_value, shdict_ttl) if not ok then @@ -1267,18 +1278,25 @@ local function new(self) -- If the TTL is still greater than the resurrect time -- we don't have to rotate the secret, except it if it -- negatively cached. + local resurrect local ttl = SECRETS_CACHE:ttl(new_cache_key) if ttl and SECRETS_CACHE:get(new_cache_key) ~= NEGATIVELY_CACHED_VALUE then local resurrect_ttl = max(config.resurrect_ttl or DAO_MAX_TTL, SECRETS_CACHE_MIN_TTL) + -- the secret is still within ttl, no need to refresh if ttl > resurrect_ttl then return true end + + -- the secret is still within resurrect ttl time, so when we try to refresh the secret + -- we do not forciblly override it with a negative value, so that the cached value + -- can be resurrected + resurrect = ttl > SECRETS_CACHE_MIN_TTL end strategy = caching_strategy(strategy, config_hash) - -- we should refresh the secret at this point - local ok, err = get_from_vault(reference, strategy, config, new_cache_key, parsed_reference) + -- try to refresh the secret, according to the remaining time the cached value may or may not be refreshed. + local ok, err = get_from_vault(reference, strategy, config, new_cache_key, parsed_reference, resurrect) if not ok then return nil, fmt("could not retrieve value for reference %s (%s)", reference, err) end diff --git a/spec/02-integration/13-vaults/07-resurrect_spec.lua b/spec/02-integration/13-vaults/07-resurrect_spec.lua index 0f4ca99422de..eded8676359a 100644 --- a/spec/02-integration/13-vaults/07-resurrect_spec.lua +++ b/spec/02-integration/13-vaults/07-resurrect_spec.lua @@ -15,6 +15,8 @@ local LUA_PATH = CUSTOM_VAULTS .. "/?.lua;" .. local DUMMY_HEADER = "Dummy-Plugin" local fmt = string.format +local json = require "cjson" + --- A vault test harness is a driver for vault backends, which implements @@ -48,6 +50,9 @@ local fmt = string.format --- fixtures() output is passed directly to `helpers.start_kong()` ---@field fixtures fun(self: vault_test_harness):table|nil --- +--- pause() is exactly what you'd expect +---@field pause fun(self: vault_test_harness) +--- --- ---@field prefix string # generated by the test suite ---@field host string # generated by the test suite @@ -88,6 +93,10 @@ local VAULTS = { } } end, + + pause = function(_) + return test_vault.client.pause() + end, }, } @@ -103,6 +112,7 @@ for _, vault in ipairs(VAULTS) do vault.setup = vault.setup or noop vault.teardown = vault.teardown or noop vault.fixtures = vault.fixtures or noop + vault.pause = vault.pause or noop end @@ -232,5 +242,134 @@ describe("vault resurrect_ttl and rotation (#" .. strategy .. ") #" .. vault.nam end) +describe("#multiworker vault resurrect_ttl and rotation (#" .. strategy .. ") #" .. vault.name, function() + local client, admin_client + local secret = "my-secret" + + lazy_setup(function() + helpers.setenv("KONG_LUA_PATH_OVERRIDE", LUA_PATH) + helpers.setenv("KONG_VAULT_ROTATION_INTERVAL", "1") + + vault:setup() + vault:create_secret(secret, "init") + + local bp = helpers.get_db_utils(strategy, + { "vaults", "routes", "services", "plugins" }, + { "dummy" }, + { vault.name }) + + + assert(bp.vaults:insert({ + name = vault.name, + prefix = vault.prefix, + config = vault.config, + })) + + local route = assert(bp.routes:insert({ + name = vault.host, + hosts = { vault.host }, + paths = { "/" }, + service = assert(bp.services:insert()), + })) + + + assert(bp.plugins:insert({ + name = "post-function", + config = { + access = {fmt([[ + local value, err = kong.vault.get("{vault://%s/%s?ttl=%d&resurrect_ttl=%d}") + if value then + kong.response.exit(200, {["value"]=value, ["pid"]=ngx.worker.pid()}, {["Content-Type"]="application/json"}) + end + ]], vault.prefix, secret, 2, 5),} + }, + route = { id = route.id }, + })) + + assert(helpers.start_kong({ + database = strategy, + nginx_conf = "spec/fixtures/custom_nginx.template", + vaults = vault.name, + plugins = "post-function", + log_level = "debug", + dedicated_config_processing = false, + -- nginx_worker_processes = 2, + nginx_main_worker_processes = 2, + }, nil, nil, vault:fixtures() )) + + client = helpers.proxy_client() + admin_client = helpers.admin_client() + end) + + + lazy_teardown(function() + if client then + client:close() + end + + if admin_client then + admin_client:close() + end + + helpers.stop_kong() + vault:teardown() + + helpers.unsetenv("KONG_LUA_PATH_OVERRIDE") + end) + + + it("resurrects secret value from shared dict when secret is deleted (backend: #" .. vault.name .. ")", function() + -- fetch all worker pids + local status_ret = admin_client:get("/") + local body = assert.res_status(200, status_ret) + local json_body = json.decode(body) + assert.truthy(json_body) + local worker_pids = json_body.pids.workers + assert.truthy(#worker_pids == 2) + + local worker_secret_hits = {} + for _, worker_pid in ipairs(worker_pids) do + worker_secret_hits[tostring(worker_pid)] = false + end + + vault:update_secret(secret, "old", { ttl = 2, resurrect_ttl = 5 }) + + -- trigger post-function in one of the workers + local res = client:get("/", {headers = {host = assert(vault.host)}}) + local body = assert.res_status(200, res) + local json_body = json.decode(body) + assert.same("old", json_body.value) + worker_secret_hits[tostring(json_body.pid)] = true + + vault:pause() + + -- let ttl pass and try to trigger post-function in all workers + -- check all of them can resurrect the secret from shared dict + ngx.sleep(3) + + assert.with_timeout(5).with_step(0.1).eventually( + function() + -- avoid connection reuse so that we can hit all workers + local new_client = helpers.proxy_client() + local res = new_client:get("/", {headers = {host = assert(vault.host)}}) + local body = assert.res_status(200, res) + local json_body = json.decode(body) + new_client:close() + assert.same("old", json_body.value) + worker_secret_hits[tostring(json_body.pid)] = true + + for k, v in pairs(worker_secret_hits) do + if not v then + return false, "worker pid " .. k .. " did not hit the secret" + end + end + + return true + end + ).is_truthy("expected all workers to resurrect the secret from shared dict") + end) +end) + + end -- each vault backend end -- each strategy diff --git a/spec/fixtures/custom_vaults/kong/vaults/test/init.lua b/spec/fixtures/custom_vaults/kong/vaults/test/init.lua index 8e4ef04e31c8..6ef4627c384c 100644 --- a/spec/fixtures/custom_vaults/kong/vaults/test/init.lua +++ b/spec/fixtures/custom_vaults/kong/vaults/test/init.lua @@ -49,7 +49,24 @@ function test.init() end +function test.pause() + local shm = ngx.shared[test.SHM_NAME] + shm:set("paused", true) + return kong.response.exit(200, { message = "succeed" }) +end + + +function test.is_running() + local shm = ngx.shared[test.SHM_NAME] + return shm:get("paused") ~= true +end + + function test.get(conf, resource, version) + if not test.is_running() then + return nil, "Vault server paused" + end + local secret = get_from_shm(resource, version) kong.log.inspect({ @@ -92,6 +109,10 @@ end function test.api() + if not test.is_running() then + return kong.response.exit(503, { message = "Vault server paused" }) + end + local shm = assert(ngx.shared[test.SHM_NAME]) local secret = assert(ngx.var.secret) local args = assert(kong.request.get_query()) @@ -200,6 +221,18 @@ function test.client.get(secret, version) end +function test.client.pause() + local client = assert(http.new()) + + local uri = fmt("http://127.0.0.1:%d/pause", test.PORT) + + local res, err = client:request_uri(uri, { method = "GET" }) + assert(err == nil, "failed GET " .. uri .. ": " .. tostring(err)) + + return cjson.decode(res.body) +end + + test.http_mock = [[ lua_shared_dict ]] .. test.SHM_NAME .. [[ 5m; @@ -212,6 +245,12 @@ test.http_mock = [[ require("kong.vaults.test").api() } } + + location ~^/pause { + content_by_lua_block { + require("kong.vaults.test").pause() + } + } } ]]