diff --git a/.changesets/maint_garypen_3533_istio_warn.md b/.changesets/maint_garypen_3533_istio_warn.md
new file mode 100644
index 0000000000..916457ce2d
--- /dev/null
+++ b/.changesets/maint_garypen_3533_istio_warn.md
@@ -0,0 +1,9 @@
+### Add a warning if we think istio-proxy injection is causing problems ([Issue #3533](https://github.com/apollographql/router/issues/3533))
+
+We have encountered situations where the injection of istio-proxy in a router pod (executing in Kubernetes) causes networking errors during uplink retrieval.
+
+The root cause is that the router is executing and attempting to retrieve uplink schemas while the istio-proxy is simultaneously modifying network configuration.
+
+This new warning message directs users to information which should help them to configure their Kubernetes cluster or pod to avoid this problem.
+
+By [@garypen](https://github.com/garypen) in https://github.com/apollographql/router/pull/3545
\ No newline at end of file
diff --git a/apollo-router/src/uplink/mod.rs b/apollo-router/src/uplink/mod.rs
index f9afb7ce49..8f10a44c28 100644
--- a/apollo-router/src/uplink/mod.rs
+++ b/apollo-router/src/uplink/mod.rs
@@ -1,3 +1,4 @@
+use std::error::Error as stdError;
 use std::fmt::Debug;
 use std::time::Duration;
 use std::time::Instant;
@@ -359,7 +360,27 @@ where
     Query: graphql_client::GraphQLQuery,
 {
     let client = reqwest::Client::builder().timeout(timeout).build()?;
-    let res = client.post(url).json(request_body).send().await?;
+    // It is possible that istio-proxy is re-configuring networking beneath us. If it is, we'll see an error something like this:
+    //   level: "ERROR"
+    //   message: "fetch failed from all endpoints"
+    //   target: "apollo_router::router::event::schema"
+    //   timestamp: "2023-08-01T10:40:28.831196Z"
+    // That's deeply confusing and very hard to debug. Let's try to help by printing out a helpful error message here
+    let res = client
+        .post(url)
+        .json(request_body)
+        .send()
+        .await
+        .map_err(|e| {
+            if let Some(hyper_err) = e.source() {
+                if let Some(os_err) = hyper_err.source() {
+                    if os_err.to_string().contains("tcp connect error: Cannot assign requested address (os error 99)") {
+                        tracing::warn!("If your router is executing within a kubernetes pod, this failure may be caused by istio-proxy injection. See https://github.com/apollographql/router/issues/3533 for more details about how to solve this");
+                    }
+                }
+            }
+            e
+        })?;
     tracing::debug!("uplink response {:?}", res);
     let response_body: graphql_client::Response<Query::ResponseData> = res.json().await?;
     Ok(response_body)
diff --git a/docs/source/containerization/kubernetes.mdx b/docs/source/containerization/kubernetes.mdx
index b0481dbb8d..46c4b30187 100644
--- a/docs/source/containerization/kubernetes.mdx
+++ b/docs/source/containerization/kubernetes.mdx
@@ -254,3 +254,8 @@ If you had a router running on your localhost, with default health-check configu
 
 curl "http://localhost:8088/health"
 
+## Using `istio` with the router
+
+The [istio service mesh](https://istio.io/) is a very popular choice for enhanced traffic routing within Kubernetes.
+
+`istio-proxy` pod injection can cause an [issue](https://github.com/apollographql/router/issues/3533) in the router. The router may start executing at the same time that istio is reconfiguring networking for the router pod. This is an issue with `istio`, not the router, and you can resolve it by following the advice in [istio's injection documentation](https://istio.io/latest/docs/ops/common-problems/injection/#pod-or-containers-start-with-network-issues-if-istio-proxy-is-not-ready).
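
For illustration, here is a minimal, self-contained sketch of the technique the `mod.rs` change relies on: walking an error's `source()` chain to find a specific low-level cause and logging a human-readable hint. The `UplinkFetchError`, `IoWrapper`, and `chain_contains` names are hypothetical stand-ins for the nested reqwest/hyper errors, not part of the router's code, and the sketch generalises the two-level check in the patch into a loop over the whole chain.

```rust
// Standalone sketch (assumed names, not the router's actual types) of the
// technique used in the diff above: walk an error's `source()` chain looking
// for a specific low-level cause, then emit a human-readable hint.
use std::error::Error;
use std::fmt;

// Hypothetical stand-in for a connect error that wraps the underlying OS error.
#[derive(Debug)]
struct IoWrapper(std::io::Error);

impl fmt::Display for IoWrapper {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "tcp connect error: {}", self.0)
    }
}

impl Error for IoWrapper {
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        Some(&self.0)
    }
}

// Hypothetical stand-in for the top-level error returned by the HTTP client.
#[derive(Debug)]
struct UplinkFetchError(IoWrapper);

impl fmt::Display for UplinkFetchError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "error sending request")
    }
}

impl Error for UplinkFetchError {
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        Some(&self.0)
    }
}

/// Returns true if any error in the `source()` chain mentions `needle`.
/// The router's patch checks exactly two levels (reqwest -> hyper -> OS error);
/// looping over the chain is a more general form of the same idea.
fn chain_contains(err: &(dyn Error + 'static), needle: &str) -> bool {
    let mut current = Some(err);
    while let Some(e) = current {
        if e.to_string().contains(needle) {
            return true;
        }
        current = e.source();
    }
    false
}

fn main() {
    // os error 99 is EADDRNOTAVAIL on Linux; its message is what the router matches on.
    let err = UplinkFetchError(IoWrapper(std::io::Error::from_raw_os_error(99)));

    if chain_contains(&err, "Cannot assign requested address") {
        eprintln!(
            "warning: this failure may be caused by istio-proxy injection; \
             see https://github.com/apollographql/router/issues/3533"
        );
    }
}
```

Matching on an error's display string is brittle (the text is platform-dependent), which is presumably why the router's check targets only the one Linux message observed in the istio-proxy failure mode rather than attempting broader detection.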