Skip to content

Commit

Permalink
Don't reload the router if the schema/license hasn't changed (#3478)
Browse files Browse the repository at this point in the history
The router is performing frequent schema reloads due to notifications
from uplink. In the majority of cases a schema reload is not required,
because the schema hasn't actually changed.

We won't reload the router if the schema/license hasn't changed.

Fixes #3180

**Checklist**

Complete the checklist (and note appropriate exceptions) before a final
PR is raised.

- [x] Changes are compatible[^1]
- [ ] Documentation[^2] completed
- [ ] Performance impact assessed and acceptable
- Tests added and passing[^3]
    - [x] Unit Tests
    - [x] Integration Tests
    - [x] Manual Tests

**Exceptions**

*Note any exceptions here*

**Notes**

[^1]. It may be appropriate to bring upcoming changes to the attention
of other (impacted) groups. Please endeavour to do this before seeking
PR approval. The mechanism for doing this will vary considerably, so use
your judgement as to how and when to do this.
[^2]. Configuration is an important part of many changes. Where
applicable please try to document configuration examples.
[^3]. Tick whichever testing boxes are applicable. If you are adding
Manual Tests:
- please document the manual testing (extensively) in the Exceptions.
- please raise a separate issue to automate the test and label it (or
ask for it to be labeled) as `manual test`

---------

Signed-off-by: Benjamin Coenen <5719034+bnjjj@users.noreply.github.com>
Co-authored-by: Gary Pennington <gary@apollographql.com>
Co-authored-by: bryn <bryn@apollographql.com>
  • Loading branch information
3 people authored Jul 25, 2023
1 parent f529b37 commit 2a9349d
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 51 deletions.
7 changes: 7 additions & 0 deletions .changesets/maint_bnjjj_fix_3180.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
### Don't reload the router if the schema/license hasn't changed ([Issue #3180](https://github.com/apollographql/router/issues/3180))

The router is performing frequent schema reloads due to notifications from uplink. In the majority of cases a schema reload is not required, because the schema hasn't actually changed.

We won't reload the router if the schema/license hasn't changed.

By [@bnjjj](https://github.com/bnjjj) in https://github.com/apollographql/router/pull/3478
154 changes: 109 additions & 45 deletions apollo-router/src/state_machine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ use crate::uplink::license_enforcement::LicenseState;
use crate::uplink::license_enforcement::LICENSE_EXPIRED_URL;
use crate::ApolloRouterError::NoLicense;

const STATE_CHANGE: &str = "state change";

#[derive(Default, Clone)]
pub(crate) struct ListenAddresses {
pub(crate) graphql_listen_address: Option<ListenAddr>,
Expand Down Expand Up @@ -162,65 +164,102 @@ impl<FA: RouterSuperServiceFactory> State<FA> {
router_service_factory,
all_connections_stopped_signals,
} => {
tracing::info!(
new_schema = new_schema.is_some(),
new_license = new_license.is_some(),
new_configuration = new_configuration.is_some(),
"reloading"
);

// When we get an unlicensed event, if we were licensed before then just carry on.
// This means that users can delete and then undelete their graphs in studio while having their routers continue to run.
if new_license == Some(LicenseState::Unlicensed)
&& *license != LicenseState::Unlicensed
{
// When we get an unlicensed event, if we were licensed before then just carry on.
// This means that users can delete and then undelete their graphs in studio while having their routers continue to run.
tracing::info!("loss of license detected, ignoring reload");
tracing::info!(
event = STATE_CHANGE,
"ignoring reload because of loss of license"
);
return self;
}

// We update the running config. This is OK even in the case that the router could not reload as we always want to retain the latest information for when we try to reload next.
// In the case of a failed reload the server handle is retained, which has the old config/schema/license in.
// Have things actually changed?
let (mut license_reload, mut schema_reload, mut configuration_reload) =
(false, false, false);
if let Some(new_configuration) = new_configuration {
*configuration = new_configuration;
configuration_reload = true;
}
if let Some(new_schema) = new_schema {
*schema = new_schema;
if schema.as_ref() != new_schema.as_ref() {
*schema = new_schema;
schema_reload = true;
}
}
if let Some(new_license) = new_license {
*license = new_license;
if *license != new_license {
*license = new_license;
license_reload = true;
}
}

let mut guard = state_machine.listen_addresses.clone().write_owned().await;
let signals = std::mem::take(all_connections_stopped_signals);
new_state = match Self::try_start(
state_machine,
server_handle,
Some(router_service_factory),
configuration.clone(),
schema.clone(),
*license,
&mut guard,
signals,
)
.await
{
Ok(new_state) => {
tracing::info!("reload complete");
Some(new_state)
}
Err(e) => {
// If we encountered an error it may be fatal depending on if we consumed the server handle or not.
match server_handle {
None => {
tracing::error!("fatal error while trying to reload; {}", e);
Some(Errored(e))
}
Some(_) => {
tracing::info!("error while reloading, continuing with previous configuration; {}", e);
None
// Let users know we are about to process a state reload event
tracing::info!(
new_schema = schema_reload,
new_license = license_reload,
new_configuration = configuration_reload,
event = STATE_CHANGE,
"processing event"
);

let need_reload = schema_reload || license_reload || configuration_reload;

if need_reload {
// We update the running config. This is OK even in the case that the router could not reload as we always want to retain the latest information for when we try to reload next.
// In the case of a failed reload the server handle is retained, which has the old config/schema/license in.
let mut guard = state_machine.listen_addresses.clone().write_owned().await;
let signals = std::mem::take(all_connections_stopped_signals);
new_state = match Self::try_start(
state_machine,
server_handle,
Some(router_service_factory),
configuration.clone(),
schema.clone(),
*license,
&mut guard,
signals,
)
.await
{
Ok(new_state) => {
tracing::info!(
new_schema = schema_reload,
new_license = license_reload,
new_configuration = configuration_reload,
event = STATE_CHANGE,
"reload complete"
);
Some(new_state)
}
Err(e) => {
// If we encountered an error it may be fatal depending on if we consumed the server handle or not.
match server_handle {
None => {
tracing::error!(
error = %e,
event = STATE_CHANGE,
"fatal error while trying to reload"
);
Some(Errored(e))
}
Some(_) => {
tracing::error!(error = %e, event = STATE_CHANGE, "error while reloading, continuing with previous configuration");
None
}
}
}
}
} else {
tracing::info!(
new_schema = schema_reload,
new_license = license_reload,
new_configuration = configuration_reload,
event = STATE_CHANGE,
"no reload necessary"
);
}
}
_ => {}
Expand Down Expand Up @@ -821,6 +860,29 @@ mod tests {
assert_eq!(shutdown_receivers.lock().unwrap().len(), 2);
}

#[test(tokio::test)]
async fn startup_no_reload_schema() {
let router_factory = create_mock_router_configurator(1);
let (server_factory, shutdown_receivers) = create_mock_server_factory(1, 1, 1, 1, 1);
let minimal_schema = include_str!("testdata/minimal_supergraph.graphql");
assert_matches!(
execute(
server_factory,
router_factory,
stream::iter(vec![
UpdateConfiguration(Configuration::builder().build().unwrap()),
UpdateSchema(minimal_schema.to_owned()),
UpdateLicense(LicenseState::default()),
UpdateSchema(minimal_schema.to_owned()),
Shutdown
])
)
.await,
Ok(())
);
assert_eq!(shutdown_receivers.lock().unwrap().len(), 1);
}

#[test(tokio::test)]
async fn startup_reload_license() {
let router_factory = create_mock_router_configurator(2);
Expand All @@ -834,7 +896,7 @@ mod tests {
UpdateConfiguration(Configuration::builder().build().unwrap()),
UpdateSchema(minimal_schema.to_owned()),
UpdateLicense(LicenseState::default()),
UpdateLicense(LicenseState::default()),
UpdateLicense(LicenseState::Licensed),
Shutdown
])
)
Expand Down Expand Up @@ -945,6 +1007,7 @@ mod tests {
.returning(|_, _, _, _| Err(BoxError::from("error")));

let (server_factory, shutdown_receivers) = create_mock_server_factory(1, 1, 1, 1, 1);
let minimal_schema = include_str!("testdata/minimal_supergraph.graphql");

assert_matches!(
execute(
Expand All @@ -954,7 +1017,7 @@ mod tests {
UpdateConfiguration(Configuration::builder().build().unwrap()),
UpdateSchema(example_schema()),
UpdateLicense(LicenseState::default()),
UpdateSchema(example_schema()),
UpdateSchema(minimal_schema.to_owned()),
Shutdown
])
)
Expand Down Expand Up @@ -996,6 +1059,7 @@ mod tests {
});

let (server_factory, shutdown_receivers) = create_mock_server_factory(2, 1, 1, 1, 1);
let minimal_schema = include_str!("testdata/minimal_supergraph.graphql");

assert_matches!(
execute(
Expand All @@ -1011,7 +1075,7 @@ mod tests {
.build()
.unwrap()
),
UpdateSchema(example_schema()),
UpdateSchema(minimal_schema.to_owned()),
Shutdown
]),
)
Expand Down
5 changes: 5 additions & 0 deletions apollo-router/tests/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,11 @@ impl IntegrationTest {
self.assert_log_contains("reload complete").await;
}

#[allow(dead_code)]
pub async fn assert_no_reload_necessary(&mut self) {
self.assert_log_contains("no reload necessary").await;
}

#[allow(dead_code)]
pub async fn assert_not_reloaded(&mut self) {
self.assert_log_contains("continuing with previous configuration")
Expand Down
12 changes: 6 additions & 6 deletions apollo-router/tests/lifecycle_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,12 @@ async fn test_reload_config_with_broken_plugin() -> Result<(), BoxError> {

#[tokio::test(flavor = "multi_thread")]
async fn test_reload_config_with_broken_plugin_recovery() -> Result<(), BoxError> {
let mut router = IntegrationTest::builder()
.config(HAPPY_CONFIG)
.build()
.await;
for i in 0..3 {
println!("iteration {i}");
let mut router = IntegrationTest::builder()
.config(HAPPY_CONFIG)
.build()
.await;
router.start().await;
router.assert_started().await;
router.execute_default_query().await;
Expand Down Expand Up @@ -151,7 +151,7 @@ async fn test_force_reload() -> Result<(), BoxError> {
router.start().await;
router.assert_started().await;
tokio::time::sleep(Duration::from_secs(2)).await;
router.assert_reloaded().await;
router.assert_no_reload_necessary().await;
router.graceful_shutdown().await;
Ok(())
}
Expand All @@ -166,7 +166,7 @@ async fn test_reload_via_sighup() -> Result<(), BoxError> {
router.start().await;
router.assert_started().await;
router.send_sighup().await;
router.assert_reloaded().await;
router.assert_no_reload_necessary().await;
router.graceful_shutdown().await;
Ok(())
}
Expand Down

0 comments on commit 2a9349d

Please sign in to comment.