diff --git a/config.sample.toml b/config.sample.toml index c3af1f5..b81de5b 100644 --- a/config.sample.toml +++ b/config.sample.toml @@ -3,8 +3,13 @@ source_collector_dir = "path/to/source_collector_dir/" host_modifier_dir = "path/to/host_modifier_dir/" db_uri = "dbname='zac' user='zabbix' host='localhost' password='secret' port=5432 connect_timeout=2" log_level = "DEBUG" +# Health status for each ZAC process. health_file = "/tmp/zac_health.json" +# File containing hostnames of hosts to add/remove when failsafe is reached. failsafe_file = "/tmp/zac_failsafe.json" +# File to signal manual approval of adding/removing hosts when failsafe is reached. +# The file is automatically removed after changes are made. +failsafe_ok_file = "/tmp/zac_failsafe_ok" [zabbix] map_dir = "path/to/map_dir/" diff --git a/zabbix_auto_config/models.py b/zabbix_auto_config/models.py index c5fa4ac..75cfa25 100644 --- a/zabbix_auto_config/models.py +++ b/zabbix_auto_config/models.py @@ -82,11 +82,12 @@ class ZacSettings(ConfigBaseModel): source_collector_dir: str host_modifier_dir: str db_uri: str - health_file: Optional[Path] = None log_level: int = Field(logging.DEBUG, description="The log level to use.") + health_file: Optional[Path] = None failsafe_file: Optional[Path] = None + failsafe_ok_file: Optional[Path] = None - @field_validator("failsafe_file", "health_file", mode="after") + @field_validator("health_file", "failsafe_file", "failsafe_ok_file", mode="after") @classmethod def _validate_file_path(cls, v: Optional[Path], info: ValidationInfo) -> Optional[Path]: if v is None: @@ -283,4 +284,4 @@ class HostActions(BaseModel): def write_json(self, path: Path) -> None: """Writes a JSON serialized representation of self to a file.""" - utils.write_file(path, self.model_dump_json(indent=2)) \ No newline at end of file + utils.write_file(path, self.model_dump_json(indent=2)) diff --git a/zabbix_auto_config/processing.py b/zabbix_auto_config/processing.py index bb91e61..b0696da 100644 --- 
def handle_failsafe_limit(self, to_add: List[str], to_remove: List[str]) -> None:
    """React to a host add/remove batch that exceeds the failsafe limit.

    The batch is let through only when ``_check_failsafe_ok_file()`` reports
    an approval; in that case this method simply returns and the caller
    carries on.  Without approval, both host lists are handed to
    ``write_failsafe_hosts``, a warning with the limit and the two counts is
    logged, and the update is aborted.

    Args:
        to_add: Hostnames that would be added.
        to_remove: Hostnames that would be removed.

    Raises:
        exceptions.ZACException: When no approval was found.
    """
    approved = self._check_failsafe_ok_file()
    if not approved:
        # Record the pending changes before logging and aborting.
        self.write_failsafe_hosts(to_add, to_remove)
        remove_count, add_count = len(to_remove), len(to_add)
        logging.warning(
            "Too many hosts to change (failsafe=%d). Remove: %d, Add: %d. Aborting",
            self.config.failsafe,
            remove_count,
            add_count,
        )
        raise exceptions.ZACException("Failsafe triggered")
def _check_failsafe_ok_file(self) -> bool:
    """Consume the failsafe OK file and report whether changes are approved.

    Approval is granted only when ``settings.zac.failsafe_ok_file`` is
    configured and the file could be deleted.  Deleting the file is what
    consumes the approval, so the same file cannot approve a later run.

    Returns:
        True if the OK file existed and was removed; False if no OK file is
        configured, the file is missing, or it could not be deleted.
    """
    ok_file = self.settings.zac.failsafe_ok_file
    if not ok_file:
        return False
    # Attempt the delete directly rather than testing exists() first: this
    # avoids a window between check and delete, and keeps every filesystem
    # error (including permission errors) inside the handlers below.
    try:
        ok_file.unlink()
    except FileNotFoundError:
        logging.info(
            "Failsafe OK file %s does not exist. Create it to approve changes.",
            ok_file,
        )
        return False
    except OSError as e:
        logging.error(
            "Failsafe cannot be approved. Unable to delete failsafe OK file: %s", e
        )
        return False
    logging.info("Failsafe OK file exists. Proceeding with changes.")
    return True