From 5b64da6792b1482b79eec515ddd7e03a435a47ca Mon Sep 17 00:00:00 2001 From: Minutis Date: Fri, 4 Oct 2024 20:15:54 +0300 Subject: [PATCH] Add LIGHTER_ZOMBIE_TIMEOUT to configure zombie timeout --- docs/configuration.md | 39 ++++++++++--------- .../application/ApplicationStatusHandler.java | 9 +++-- .../configuration/AppConfiguration.java | 27 +++++++++---- server/src/main/resources/application.yml | 1 + .../ApplicationStatusHandlerTest.groovy | 3 +- .../exacaster/lighter/test/Factories.groovy | 1 + 6 files changed, 49 insertions(+), 31 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 16a6556d..ca20c013 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -4,25 +4,26 @@ Lighter can be configured by using environment variables. Currently, Lighter sup ## Global properties -| Property | Description | Default | -|----------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------| -| LIGHTER_MAX_RUNNING_JOBS | Max running Batch jobs in parallel | 5 | -| LIGHTER_MAX_STARTING_JOBS | Max starting Batch jobs in parallel | 5 | -| LIGHTER_SPARK_HISTORY_SERVER_URL | Spark history server URL used on the Lighter UI | http://localhost/spark-history/ | -| LIGHTER_EXTERNAL_LOGS_URL_TEMPLATE | Template for link to external logs (Grafana, Graylog, etc.) used on the Lighter UI. Allowed placeholders: `{{id}}`, `{{appId}}`, `{{createdTs}}` | | -| LIGHTER_PY_GATEWAY_PORT | Port for live Spark session communication | 25333 | -| LIGHTER_URL | URL which can be used to access Lighter form Spark Job | http://lighter.spark:8080 | -| LIGHTER_SESSION_TIMEOUT_INTERVAL | `java.time.Duration` representing session lifetime (from last statement creation). Use `0m` value to disable | 90m | -| LIGHTER_SESSION_TIMEOUT_ACTIVE | Should Lighter kill sessions with waiting statements (obsolete when `LIGHTER_SESSION_TIMEOUT_INTERVAL` is `0m`) | false | -| LIGHTER_SESSION_SCHEDULE_INTERVAL | `java.time.Duration` representing the interval at which a task is triggered to initiate scheduled sessions | 1m | -| LIGHTER_SESSION_TRACK_RUNNING_INTERVAL | `java.time.Duration` representing the interval at which a task is triggered to process and update running session state | 2m | -| LIGHTER_STORAGE_JDBC_URL | JDBC url for lighter storage | jdbc:h2:mem:lighter | -| LIGHTER_STORAGE_JDBC_USERNAME | JDBC username | sa | -| LIGHTER_STORAGE_JDBC_PASSWORD | JDBC password | | -| LIGHTER_STORAGE_JDBC_DRIVER_CLASS_NAME | JDBC driver class name | org.h2.Driver | -| LIGHTER_BATCH_DEFAULT_CONF | Default `conf` props for batch applications (JSON)* | | -| LIGHTER_SESSION_DEFAULT_CONF | Default `conf` props for session applications (JSON) | | -| LIGHTER_SESSION_PERMANENT_SESSIONS | List of configurations for [permanent sessions](./permanent_sessions.md) | "[]" | +| Property | Description | Default | +|----------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------| +| LIGHTER_MAX_RUNNING_JOBS | Max running Batch jobs in parallel | 5 | +| LIGHTER_MAX_STARTING_JOBS | Max starting Batch jobs in parallel | 5 | +| LIGHTER_SPARK_HISTORY_SERVER_URL | Spark history server URL used on the Lighter UI | http://localhost/spark-history/ | +| LIGHTER_EXTERNAL_LOGS_URL_TEMPLATE | Template for link to external logs (Grafana, Graylog, etc.) used on the Lighter UI. Allowed placeholders: `{{id}}`, `{{appId}}`, `{{createdTs}}` | | +| LIGHTER_PY_GATEWAY_PORT | Port for live Spark session communication | 25333 | +| LIGHTER_URL | URL which can be used to access Lighter form Spark Job | http://lighter.spark:8080 | +| LIGHTER_ZOMBIE_INTERVAL | How long for Lighter to try to fetch the status of the job before marking it as a zombie. (For jobs that "disappear" before Lighter could determine their final status) | 30m | +| LIGHTER_SESSION_TIMEOUT_INTERVAL | `java.time.Duration` representing session lifetime (from last statement creation). Use `0m` value to disable | 90m | +| LIGHTER_SESSION_TIMEOUT_ACTIVE | Should Lighter kill sessions with waiting statements (obsolete when `LIGHTER_SESSION_TIMEOUT_INTERVAL` is `0m`) | false | +| LIGHTER_SESSION_SCHEDULE_INTERVAL | `java.time.Duration` representing the interval at which a task is triggered to initiate scheduled sessions | 1m | +| LIGHTER_SESSION_TRACK_RUNNING_INTERVAL | `java.time.Duration` representing the interval at which a task is triggered to process and update running session state | 2m | +| LIGHTER_STORAGE_JDBC_URL | JDBC url for lighter storage | jdbc:h2:mem:lighter | +| LIGHTER_STORAGE_JDBC_USERNAME | JDBC username | sa | +| LIGHTER_STORAGE_JDBC_PASSWORD | JDBC password | | +| LIGHTER_STORAGE_JDBC_DRIVER_CLASS_NAME | JDBC driver class name | org.h2.Driver | +| LIGHTER_BATCH_DEFAULT_CONF | Default `conf` props for batch applications (JSON)* | | +| LIGHTER_SESSION_DEFAULT_CONF | Default `conf` props for session applications (JSON) | | +| LIGHTER_SESSION_PERMANENT_SESSIONS | List of configurations for [permanent sessions](./permanent_sessions.md) | "[]" | * default configs will be merged with configss provided in submit request, if property is defined in submit request, default will be ignored. Example of `LIGHTER_BATCH_DEFAULT_CONF`: `{"spark.kubernetes.driverEnv.TEST1":"test1"}`. diff --git a/server/src/main/java/com/exacaster/lighter/application/ApplicationStatusHandler.java b/server/src/main/java/com/exacaster/lighter/application/ApplicationStatusHandler.java index d41bdfa0..8195a882 100644 --- a/server/src/main/java/com/exacaster/lighter/application/ApplicationStatusHandler.java +++ b/server/src/main/java/com/exacaster/lighter/application/ApplicationStatusHandler.java @@ -4,6 +4,7 @@ import static org.slf4j.LoggerFactory.getLogger; import com.exacaster.lighter.backend.Backend; +import com.exacaster.lighter.configuration.AppConfiguration; import com.exacaster.lighter.log.Log; import com.exacaster.lighter.log.LogService; import com.exacaster.lighter.storage.ApplicationStorage; @@ -20,11 +21,13 @@ public class ApplicationStatusHandler { private final ApplicationStorage applicationStorage; private final Backend backend; private final LogService logService; + private final AppConfiguration conf; - public ApplicationStatusHandler(ApplicationStorage applicationStorage, Backend backend, LogService logService) { + public ApplicationStatusHandler(ApplicationStorage applicationStorage, Backend backend, LogService logService, AppConfiguration conf) { this.applicationStorage = applicationStorage; this.backend = backend; this.logService = logService; + this.conf = conf; } public Application processApplicationStarting(Application application) { @@ -85,13 +88,13 @@ private ApplicationState trackStatus(Application app, ApplicationInfo info) { private ApplicationState checkZombie(Application app) { LOG.info("No info for {}", app); - if (app.getContactedAt() != null && app.getContactedAt().isBefore(LocalDateTime.now().minusMinutes(30))) { + if (app.getContactedAt() != null && app.getContactedAt().isBefore(LocalDateTime.now().minus(conf.getZombieInterval()))) { LOG.info("Assuming zombie ({})", app.getId()); applicationStorage.saveApplication(ApplicationBuilder.builder(app) .setState(ApplicationState.ERROR) .build()); logService.save(new Log(app.getId(), - "Application was not reachable for 10 minutes, so we assume something went wrong")); + "Application was not reachable for " + conf.getZombieInterval().toMinutes() + " minutes, so we assume something went wrong")); return ApplicationState.ERROR; } return app.getState(); diff --git a/server/src/main/java/com/exacaster/lighter/configuration/AppConfiguration.java b/server/src/main/java/com/exacaster/lighter/configuration/AppConfiguration.java index 018cf47f..00d1c25a 100644 --- a/server/src/main/java/com/exacaster/lighter/configuration/AppConfiguration.java +++ b/server/src/main/java/com/exacaster/lighter/configuration/AppConfiguration.java @@ -34,6 +34,7 @@ public class AppConfiguration { private final Integer pyGatewayPort; @JsonProperty(access = Access.WRITE_ONLY) private final String url; + private final Duration zombieInterval; private final SessionConfiguration sessionConfiguration; private final Map batchDefaultConf; private final Map sessionDefaultConf; @@ -45,6 +46,7 @@ public AppConfiguration(Integer maxRunningJobs, @Nullable String externalLogsUrlTemplate, Integer pyGatewayPort, String url, + Duration zombieInterval, SessionConfiguration sessionConfiguration, @MapFormat(transformation = FLAT, keyFormat = RAW) @Nullable Map batchDefaultConf, @@ -55,6 +57,7 @@ public AppConfiguration(Integer maxRunningJobs, this.externalLogsUrlTemplate = externalLogsUrlTemplate; this.pyGatewayPort = pyGatewayPort; this.url = url; + this.zombieInterval = zombieInterval; this.sessionConfiguration = sessionConfiguration; this.batchDefaultConf = ofNullable(batchDefaultConf).orElse(Map.of()); this.sessionDefaultConf = ofNullable(sessionDefaultConf).orElse(Map.of()); @@ -84,6 +87,10 @@ public String getUrl() { return url; } + public Duration getZombieInterval() { + return zombieInterval; + } + public SessionConfiguration getSessionConfiguration() { return sessionConfiguration; } @@ -98,14 +105,18 @@ public Map getSessionDefaultConf() { @Override public String toString() { - return new StringJoiner(", ", AppConfiguration.class.getSimpleName() + "[", "]") - .add("maxRunningJobs=" + maxRunningJobs) - .add("maxStartingJobs=" + maxStartingJobs) - .add("sparkHistoryServerUrl=" + sparkHistoryServerUrl) - .add("sessionConfiguration=" + sessionConfiguration) - .add("batchDefaultConf=" + batchDefaultConf) - .add("sessionDefaultConf=" + sessionDefaultConf) - .toString(); + return "AppConfiguration{" + + "maxRunningJobs=" + maxRunningJobs + + ", maxStartingJobs=" + maxStartingJobs + + ", sparkHistoryServerUrl='" + sparkHistoryServerUrl + '\'' + + ", externalLogsUrlTemplate='" + externalLogsUrlTemplate + '\'' + + ", pyGatewayPort=" + pyGatewayPort + + ", url='" + url + '\'' + + ", zombieInterval=" + zombieInterval + + ", sessionConfiguration=" + sessionConfiguration + + ", batchDefaultConf=" + batchDefaultConf + + ", sessionDefaultConf=" + sessionDefaultConf + + '}'; } @Introspected diff --git a/server/src/main/resources/application.yml b/server/src/main/resources/application.yml index 32b5d490..effcb546 100644 --- a/server/src/main/resources/application.yml +++ b/server/src/main/resources/application.yml @@ -5,6 +5,7 @@ lighter: frontend-path: file:${FRONTEND_PATH:../frontend/build} py-gateway-port: 25333 url: http://lighter.spark:8080 + zombie-interval: 30m session: timeout-interval: 90m timeout-active: false diff --git a/server/src/test/groovy/com/exacaster/lighter/application/ApplicationStatusHandlerTest.groovy b/server/src/test/groovy/com/exacaster/lighter/application/ApplicationStatusHandlerTest.groovy index 2370bf49..515928b2 100644 --- a/server/src/test/groovy/com/exacaster/lighter/application/ApplicationStatusHandlerTest.groovy +++ b/server/src/test/groovy/com/exacaster/lighter/application/ApplicationStatusHandlerTest.groovy @@ -9,6 +9,7 @@ import spock.lang.Subject import java.time.LocalDateTime +import static com.exacaster.lighter.test.Factories.appConfiguration import static com.exacaster.lighter.test.Factories.newSession class ApplicationStatusHandlerTest extends Specification { @@ -21,7 +22,7 @@ class ApplicationStatusHandlerTest extends Specification { LogService logService = Mock() @Subject - ApplicationStatusHandler handler = new ApplicationStatusHandler(storage, backend, logService) + ApplicationStatusHandler handler = new ApplicationStatusHandler(storage, backend, logService, appConfiguration()) def cleanup() { storage.cleanup() diff --git a/server/src/test/groovy/com/exacaster/lighter/test/Factories.groovy b/server/src/test/groovy/com/exacaster/lighter/test/Factories.groovy index 757829d6..c8cfc172 100644 --- a/server/src/test/groovy/com/exacaster/lighter/test/Factories.groovy +++ b/server/src/test/groovy/com/exacaster/lighter/test/Factories.groovy @@ -90,6 +90,7 @@ class Factories { null, 5432, "http://lighter:8080", + Duration.ofMinutes(30), new AppConfiguration.SessionConfiguration(Duration.ofMinutes(20), false, [new AppConfiguration.PermanentSession("permanentSessionId", submitParams())] , Duration.ofMinutes(1), Duration.ofMinutes(2)),