
Small tweaks to test-prepro-v2 #2973

Merged
merged 10 commits into from
Oct 25, 2024
2 changes: 1 addition & 1 deletion .github/workflows/preprocessing-tests.yaml
@@ -28,6 +28,6 @@ jobs:
       - name: Run tests
         run: |
           pip install -e '.[test]'
-          python3 tests/test.py
+          pytest
         shell: micromamba-shell {0}
         working-directory: preprocessing/nextclade/
9 changes: 9 additions & 0 deletions preprocessing/nextclade/.justfile
@@ -1,5 +1,14 @@
 all: ruff_format ruff_check run_mypy
 
+create_env:
+    micromamba create -f environment.yml --rc-file .mambarc
+
+install:
+    pip install -e .
+
+install_test:
+    pip install -e .[test]
+
 r: ruff
 
 ruff: ruff_check ruff_format
24 changes: 13 additions & 11 deletions preprocessing/nextclade/README.md
@@ -15,18 +15,20 @@ This preprocessing pipeline is still a work in progress. It requests unaligned n

## Setup

-### Start directly
+### Installation
 
 1. Install `conda`/`mamba`/`micromamba`: see e.g. [micromamba installation docs](https://mamba.readthedocs.io/en/latest/micromamba-installation.html#umamba-install)
-2. Install environment:
+1. Install environment:
 
-   ```bash
+   ```sh
    mamba env create -n loculus-nextclade -f environment.yml
    ```
 
-3. Start backend (see [backend README](../backend/README.md)), run ingest script to submit sequences from INSDC. (Alternatively you can run `./deploy.py --enablePreprocessing` to start the backend and preprocessing pods in one command.)
+### Running
+
+1. Start backend (see [backend README](../backend/README.md)), run ingest script to submit sequences from INSDC. (Alternatively you can run `./deploy.py --enablePreprocessing` to start the backend and preprocessing pods in one command.)
 
-4. Run pipeline
+1. Run pipeline
 
   ```bash
   mamba activate loculus-nextclade
@@ -38,10 +38,10 @@ This preprocessing pipeline is still a work in progress. It requests unaligned n

 Tests can be run from the same directory
 
-```bash
+```sh
 mamba activate loculus-nextclade
-pip install -e .
-python3 tests/test.py
+pip install -e '.[test]'
+pytest
 ```
 
 Note that we do not add the tests folder to the docker image. In CI, tests are run using the same mamba environment as the preprocessing docker image, but not the actual docker image. We chose this approach because it makes the CI tests faster, but it could potentially lead to the tests using a different program version than the one in the docker image.
@@ -66,13 +68,13 @@ docker run -it --platform=linux/amd64 --network host --rm nextclade_processing p

 When deployed on Kubernetes, the preprocessing pipeline reads config files created by `loculus/kubernetes/loculus/templates/loculus-preprocessing-config.yaml`. When run locally, the pipeline uses only the default values defined in `preprocessing/nextclade/src/loculus_preprocessing/config.py`. When running the preprocessing pipeline locally, it makes sense to create a local config file using the command:
 
-```
+```sh
 ../../generate_local_test_config.sh
 ```
 
 and use this in the pipeline as follows:
 
-```
+```sh
 prepro --config-file=../../temp/preprocessing-config.{organism}.yaml --keep-tmp-dir
 ```
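For orientation, the generated config is a YAML mapping whose keys mirror the fields of the `Config` dataclass in `config.py`. A minimal hand-written sketch (values illustrative, not the actual generated output) might look like:

```yaml
# Hypothetical minimal preprocessing config; keys mirror the Config dataclass.
organism: mpox
backend_host: http://127.0.0.1:8079/mpox
keycloak_host: http://127.0.0.1:8083
log_level: DEBUG
nucleotideSequences:
  - main
batch_size: 5
```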

@@ -103,7 +105,7 @@ However, the `preprocessing` field can be customized to take an arbitrary number

 Using these functions in your `values.yaml` will look like:
 
-```
+```yaml
 - name: sampleCollectionDate
   type: date
   preprocessing:
2 changes: 1 addition & 1 deletion preprocessing/nextclade/pyproject.toml
@@ -15,4 +15,4 @@ build-backend = "hatchling.build"
 packages = ["src/loculus_preprocessing"]
 
 [project.optional-dependencies]
-test = ["pytest"]
+test = ["pytest", "mypy", "types-pytz"]
5 changes: 2 additions & 3 deletions preprocessing/nextclade/src/loculus_preprocessing/backend.py
@@ -140,10 +140,9 @@ def submit_processed_sequences(
     if not response.ok:
         Path("failed_submission.json").write_text(ndjson_string, encoding="utf-8")
         msg = (
-            f"Submitting processed data failed. Status code: {
-                response.status_code}\n"
+            f"Submitting processed data failed. Status code: {response.status_code}\n"
             f"Response: {response.text}\n"
-            f"Data sent in request: {ndjson_string[0:1000]}...\n"
+            f"Data sent: {ndjson_string[:1000]}...\n"
         )
         raise RuntimeError(msg)
     logging.info("Processed data submitted successfully")
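The reformatted message above can be exercised in isolation. A minimal sketch of the same truncation pattern, with hypothetical stand-in values instead of a real HTTP response:

```python
# Sketch of the error-message construction above, without the HTTP request.
# status_code, response_text, and ndjson_string are illustrative stand-ins.
status_code = 422
response_text = '{"error": "unprocessable"}'
ndjson_string = '{"accession": "LOC_1.1"}\n' * 200  # long payload worth truncating

msg = (
    f"Submitting processed data failed. Status code: {status_code}\n"
    f"Response: {response_text}\n"
    f"Data sent: {ndjson_string[:1000]}...\n"  # [:1000] is equivalent to [0:1000]
)

# Only the first 1000 characters of the payload end up in the message.
assert len(ndjson_string[:1000]) == 1000
print(msg.splitlines()[0])
```

The `[:1000]` slice never raises even when the payload is shorter than 1000 characters, which is why no length check is needed before truncating.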
34 changes: 20 additions & 14 deletions preprocessing/nextclade/src/loculus_preprocessing/config.py
@@ -18,7 +18,7 @@
 @dataclass
 class Config:
     organism: str = "mpox"
-    backend_host: str = ""  # populated in get_config if left empty
+    backend_host: str = ""  # populated in get_config if left empty, so we can use organism
     keycloak_host: str = "http://127.0.0.1:8083"
     keycloak_user: str = "preprocessing_pipeline"
     keycloak_password: str = "preprocessing_pipeline"
@@ -29,16 +29,16 @@ class Config:
     config_file: str | None = None
     log_level: str = "DEBUG"
     genes: list[str] = dataclasses.field(default_factory=list)
-    nucleotideSequences: list[str] = dataclasses.field(default_factory=lambda: ["main"])
+    nucleotideSequences: list[str] = dataclasses.field(default_factory=lambda: ["main"])  # noqa: N815
     keep_tmp_dir: bool = False
     reference_length: int = 197209
     batch_size: int = 5
     processing_spec: dict[str, dict[str, Any]] = dataclasses.field(default_factory=dict)
     pipeline_version: int = 1
 
 
-def load_config_from_yaml(config_file: str, config: Config) -> Config:
-    config = copy.deepcopy(config)
+def load_config_from_yaml(config_file: str, config: Config | None = None) -> Config:
+    config = Config() if config is None else copy.deepcopy(config)
     with open(config_file, encoding="utf-8") as file:
         yaml_config = yaml.safe_load(file)
         logging.debug(f"Loaded config from {config_file}: {yaml_config}")
@@ -78,25 +78,28 @@ def generate_argparse_from_dataclass(config_cls: type[Config]) -> argparse.Argum


 def get_config(config_file: str | None = None) -> Config:
-    # Config precedence: CLI args > ENV variables > config file > default
+    """
+    Config precedence: Direct function args > CLI args > ENV variables > config file > default
+
+    args:
+        config_file: Path to YAML config file - only used by tests
+    """
+
+    # Set just log level this early from env, so we can debug log during config loading
+    env_log_level = os.environ.get("PREPROCESSING_LOG_LEVEL")
+    if env_log_level:
+        logging.basicConfig(level=env_log_level)
+
     parser = generate_argparse_from_dataclass(Config)
     args = parser.parse_args()
 
-    # Load default config
-    config = Config()
+    # Use first config file present in order of precedence
+    config_file_path = (
+        config_file or args.config_file or os.environ.get("PREPROCESSING_CONFIG_FILE")
+    )
 
-    # Overwrite config with config in config_file
-    if config_file:
-        config = load_config_from_yaml(config_file, config)
-    if args.config_file:
-        config = load_config_from_yaml(args.config_file, config)
-    if not config.backend_host:  # Check if backend_host wasn't set during initialization
-        config.backend_host = f"http://127.0.0.1:8079/{config.organism}"
+    # Start with lowest precedence config, then overwrite with higher precedence
+    config = load_config_from_yaml(config_file_path) if config_file_path else Config()
 
     # Use environment variables if available
     for key in config.__dict__:
@@ -109,4 +112,7 @@ if value is not None:
         if value is not None:
             setattr(config, key, value)
 
+    if not config.backend_host:  # Set here so we can use organism
+        config.backend_host = f"http://127.0.0.1:8079/{config.organism}"
+
     return config
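The layered precedence that `get_config` implements can be sketched in isolation. In this sketch, `resolve`, `file_values`, and the dict arguments are illustrative stand-ins rather than the pipeline's actual API; only the `PREPROCESSING_` environment-variable prefix and the derived `backend_host` default are taken from the code above:

```python
from dataclasses import dataclass, fields


@dataclass
class Config:
    organism: str = "mpox"
    backend_host: str = ""  # derived last, so the final organism wins
    log_level: str = "DEBUG"


def resolve(file_values: dict, env: dict, cli: dict) -> Config:
    """Apply layers lowest-precedence first: defaults < file < env < CLI."""
    config = Config(**file_values)  # config file overrides dataclass defaults
    for f in fields(Config):       # env overrides file
        env_key = f"PREPROCESSING_{f.name.upper()}"
        if env_key in env:
            setattr(config, f.name, env[env_key])
    for key, value in cli.items():  # CLI overrides env; unset args are skipped
        if value is not None:
            setattr(config, key, value)
    if not config.backend_host:     # derive only after all layers are applied
        config.backend_host = f"http://127.0.0.1:8079/{config.organism}"
    return config


cfg = resolve(
    file_values={"organism": "ebola"},
    env={"PREPROCESSING_LOG_LEVEL": "INFO"},
    cli={"organism": None},
)
print(cfg.backend_host)  # http://127.0.0.1:8079/ebola
```

Deriving `backend_host` as the very last step is the point of the PR's change: it guarantees the default URL uses the organism that won the precedence battle, not the built-in default.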
28 changes: 14 additions & 14 deletions preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
@@ -50,12 +50,12 @@ def __hash__(self):
 class UnprocessedData:
     submitter: str
     metadata: InputMetadata
-    unalignedNucleotideSequences: dict[str, NucleotideSequence]
+    unalignedNucleotideSequences: dict[str, NucleotideSequence]  # noqa: N815
 
 
 @dataclass
 class UnprocessedEntry:
-    accessionVersion: AccessionVersion  # {accession}.{version}
+    accessionVersion: AccessionVersion  # {accession}.{version}  # noqa: N815
     data: UnprocessedData


@@ -74,25 +74,25 @@ class ProcessingSpec:
 # For single segment, need to generalize for multi segments later
 @dataclass
 class UnprocessedAfterNextclade:
-    inputMetadata: InputMetadata
+    inputMetadata: InputMetadata  # noqa: N815
     # Derived metadata produced by Nextclade
-    nextcladeMetadata: dict[SegmentName, Any] | None
-    unalignedNucleotideSequences: dict[SegmentName, NucleotideSequence | None]
-    alignedNucleotideSequences: dict[SegmentName, NucleotideSequence | None]
-    nucleotideInsertions: dict[SegmentName, list[NucleotideInsertion]]
-    alignedAminoAcidSequences: dict[GeneName, AminoAcidSequence | None]
-    aminoAcidInsertions: dict[GeneName, list[AminoAcidInsertion]]
+    nextcladeMetadata: dict[SegmentName, Any] | None  # noqa: N815
+    unalignedNucleotideSequences: dict[SegmentName, NucleotideSequence | None]  # noqa: N815
+    alignedNucleotideSequences: dict[SegmentName, NucleotideSequence | None]  # noqa: N815
+    nucleotideInsertions: dict[SegmentName, list[NucleotideInsertion]]  # noqa: N815
+    alignedAminoAcidSequences: dict[GeneName, AminoAcidSequence | None]  # noqa: N815
+    aminoAcidInsertions: dict[GeneName, list[AminoAcidInsertion]]  # noqa: N815
     errors: list[ProcessingAnnotation]
 
 
 @dataclass
 class ProcessedData:
     metadata: ProcessedMetadata
-    unalignedNucleotideSequences: dict[str, Any]
-    alignedNucleotideSequences: dict[str, Any]
-    nucleotideInsertions: dict[str, Any]
-    alignedAminoAcidSequences: dict[str, Any]
-    aminoAcidInsertions: dict[str, Any]
+    unalignedNucleotideSequences: dict[str, Any]  # noqa: N815
+    alignedNucleotideSequences: dict[str, Any]  # noqa: N815
+    nucleotideInsertions: dict[str, Any]  # noqa: N815
+    alignedAminoAcidSequences: dict[str, Any]  # noqa: N815
+    aminoAcidInsertions: dict[str, Any]  # noqa: N815


@dataclass