add initial support to run processing in single command

ElementoLab · Jul 1, 2021 · 09a97ba · 09a97ba
1 parent fd0c7fb
commit 09a97ba
Show file tree

Hide file tree

Showing 11 changed files with 166 additions and 80 deletions.
diff --git a/README.md b/README.md
@@ -5,16 +5,15 @@
 
 # Imaging mass cytometry
 
-This is a package for the analysis of imaging mass cytometry (IMC) data.
+This is a package for processing and analysis of imaging mass cytometry (IMC) data.
 
 It implements image- and channel-wise quality control, quantification of cell
 intensity and morphology, cell type discovery through clustering, automated
 cell type labeling, community and super-community finding and differential
 comparisons between sample groups, in addition to many handy visualization tools.
-
 Above all, it is a tool for the use of IMC data at scale.
-To do that, it implements out-of-memory handling of image stacks and masks.
-Other backends to handle multiplexed images are under development.
+
+Development is still underway, so use at your own risk.
 
 
 ## Requirements and installation
@@ -26,27 +25,35 @@ Install with `pip`:
 pip install git+https://github.com/ElementoLab/imc.git
 ```
 
+> N.B.: at the moment, the `predict` step of pipeline processing only works on Linux; testing on macOS is under way.
 
 ## Quick start
 
 ### Use case 1 (pipeline processing)
 
 #### Example: Lung sample processing from MCD to single-cell h5ad
 
-Pipeline processing:
+One-line IMC data processing:
 ```bash
 # Install imc package (do this inside virtual environment for example)
 pip install git+https://github.com/ElementoLab/imc.git#egg=imc[deepcell]
 
-# Get example data
+# Download some example data
 SAMPLE=20200629_NL1915A
 MCD_URL=https://zenodo.org/record/4110560/files/data/${SAMPLE}/${SAMPLE}.mcd
 mkdir -p imctest/data
 wget -q -O imctest/data/${SAMPLE}.mcd $MCD_URL
-
-# Run pipeline
 cd imctest/
 
+# Run pipeline in one step:
+imc process data/${SAMPLE}.mcd
+```
+Several MCD or TIFF files can be given to `imc process`. See more with the `--help` option.
+
+`imc` is nonetheless very modular and allows the user to run any of the steps separately as well.
+
+The above is also equivalent to the following:
+```bash
 ## output description of acquired data
 imc inspect data/${SAMPLE}.mcd
 
@@ -70,18 +77,17 @@ imc segment \
   --model deepcell \
   --compartment both $TIFFS
 
-## Quantify channel intensity for each single cell in every image
+## Quantify channel intensity and morphology for each single cell in every image
 imc quantify $TIFFS
+```
+There are many customization options for each step. Do `imc --help` or `imc <subcommand> --help` to see all.
 
-
-# `imc` also includes a lightweight interactive image viewer
+`imc` also includes a lightweight interactive image viewer:
+```bash
 imc view $TIFFS
 ```
 
-There are many customization options for each step. Do `imc --help` or `imc <subcommand> --help` to see all.
-A `all` command is in the making which will make sample processing as easy as `imc process <mcdfile>`.
-
-Quick analysis of single cell data downstream in IPython/Jupyter notebook:
+A quick example of further downstream analysis of single-cell data in an IPython/Jupyter notebook:
 ```python
 import scanpy as sc
 a = sc.read('processed/quantification.h5ad')
@@ -234,7 +240,7 @@ The expected files are produced by common preprocessing pipelines such as
 
 ## Documentation
 
-Documentation is for now mostly a skeleton but will be enlarged soon:
+Documentation is for now mostly a skeleton but will be expanded soon:
 
 ```bash
 make docs
@@ -245,5 +251,8 @@ make docs
 Tests are still very limited, but you can run tests this way:
 
 ```bash
+pip install pytest  # install testing package
 python -m pytest --pyargs imc
 ```
+
+For data processing, running the pipeline on the example lung data should confirm that everything is running smoothly.
diff --git a/imc/cli.py b/imc/cli.py
@@ -7,26 +7,27 @@
 
 import sys
 import argparse
-from typing import List
+import typing as tp
 
-import pandas as pd
-
-from imc.types import Path
+from imc.scripts.process import main as process
 from imc.scripts.inspect_mcds import main as inspect
 from imc.scripts.prepare_mcds import main as prepare
 from imc.scripts.predict import main as predict
 from imc.scripts.segment_stacks import main as segment
 from imc.scripts.quantify import main as quantify
 from imc.scripts.view import main as view
-from imc.utils import mcd_to_dir
+
+cli_config: tp.Dict[str, tp.Any]
 from imc.scripts import cli_config
 
 
-def main(cli: List[str] = None) -> int:
+def main(cli: tp.Sequence[str] = None) -> int:
     parser = get_args()
-    main_args, cmd_args = parser.parse_known_args()
+    main_args, cmd_args = parser.parse_known_args(cli)
 
-    if main_args.command == "inspect":
+    if main_args.command == "process":
+        process(cmd_args)
+    elif main_args.command == "inspect":
         inspect(cmd_args)
     elif main_args.command == "prepare":
         prepare(cmd_args)

diff --git a/imc/scripts/__init__.py b/imc/scripts/__init__.py
@@ -3,9 +3,9 @@
 
 from imc.types import Path
 
-
-DEFAULT_LIB_DIR = Path("~/.imc/lib").expanduser().mkdir()
-DEFAULT_MODELS_DIR = Path("~/.imc/models").expanduser().mkdir()
+DEFAULT_IMC_DIR = Path("~/.imc").expanduser().mkdir()
+DEFAULT_LIB_DIR = (DEFAULT_IMC_DIR / "lib").mkdir()
+DEFAULT_MODELS_DIR = (DEFAULT_IMC_DIR / "models").mkdir()
 
 
 epilog = "https://github.com/ElementoLab/imc"
@@ -16,6 +16,11 @@
         "epilog": epilog,
     },
     "subcommands": {
+        "process": {
+            "prog": "imc process",
+            "description": "Process raw IMC files end-to-end.",
+            "epilog": epilog,
+        },
         "inspect": {
             "prog": "imc inspect",
             "description": "Inspect MCD files and extract metadata.",
@@ -48,6 +53,16 @@
         },
     },
     "subcommand_arguments": {
+        "process": [
+            {
+                "kwargs": {
+                    "dest": "files",
+                    "nargs": "+",
+                    "type": Path,
+                    "help": "Input files to process. Can be MCD or TIFF.",
+                }
+            }
+        ],
         "inspect": [
             {"kwargs": {"dest": "mcd_files", "nargs": "+", "type": Path}},
             {

diff --git a/imc/scripts/inspect_mcds.py b/imc/scripts/inspect_mcds.py
@@ -9,7 +9,7 @@
 import yaml
 import argparse
 from collections import OrderedDict
-from typing import List, Tuple, Any
+import typing as tp
 
 import pandas as pd
 
@@ -20,12 +20,12 @@
 from imc.scripts import build_cli
 
 
-def main(cli: List[str] = None) -> int:
+def main(cli: tp.Sequence[str] = None) -> int:
     parser = build_cli("inspect")
     args = parser.parse_args(cli)
 
     fs = "\n\t- " + "\n\t- ".join([f.as_posix() for f in args.mcd_files])
-    print(f"Starting analysis of {len(args.mcd_files)} MCD files: {fs}!")
+    print(f"Starting inspection step for {len(args.mcd_files)} MCD files: {fs}!")
 
     # Inspect each MCD
     metas = dict()
@@ -59,15 +59,13 @@ def main(cli: List[str] = None) -> int:
         print(f"MCD files use different panels, {n_panels} in total.")
 
     if not args.no_write:
-        channels.to_csv(
-            args.output_prefix + ".all_mcds.channel_labels.csv", index=False
-        )
+        channels.to_csv(args.output_prefix + ".all_mcds.channel_labels.csv", index=False)
 
-    print("Finished with all files!")
+    print("Finished inspect step!")
     return 0
 
 
-def inspect_mcd(mcd_file: Path, args: Args) -> Tuple[DataFrame, DataFrame]:
+def inspect_mcd(mcd_file: Path, args: Args) -> tp.Tuple[DataFrame, DataFrame]:
     cols = [
         "Target",
         "Metal_Tag",
@@ -77,7 +75,7 @@ def inspect_mcd(mcd_file: Path, args: Args) -> Tuple[DataFrame, DataFrame]:
     ]
     exclude_channels = ["EMPTY", "190BCKG", "80Ar", "89Y", "127I", "124Xe"]
 
-    print(f"Started analyzing '{mcd_file}'!")
+    print(f"    Analyzing '{mcd_file}':")
 
     mcd = McdParser(mcd_file)
     session = mcd.session
@@ -89,9 +87,7 @@ def inspect_mcd(mcd_file: Path, args: Args) -> Tuple[DataFrame, DataFrame]:
             # ac_id: pd.Series(cleanup_channel_names(
             #     session.acquisitions[ac_id].channel_labels
             # ).values, index=session.acquisitions[ac_id].channel_masses)
-            ac_id: cleanup_channel_names(
-                session.acquisitions[ac_id].channel_labels
-            )
+            ac_id: cleanup_channel_names(session.acquisitions[ac_id].channel_labels)
             for ac_id in ac_ids
         }
     )
@@ -128,9 +124,9 @@ def inspect_mcd(mcd_file: Path, args: Args) -> Tuple[DataFrame, DataFrame]:
 
         annot = pd.DataFrame(ids, columns=cols)
         annot["Atom"] = annot["Metal_Tag"].str.extract(r"(\d+)")[0]
-        annot["full"] = (
-            ~annot.index.str.contains("|".join(exclude_channels))
-        ).astype(int)
+        annot["full"] = (~annot.index.str.contains("|".join(exclude_channels))).astype(
+            int
+        )
         annot["ilastik"] = (
             annot.index.str.contains("DNA") | annot.index.str.startswith("CD")
         ).astype(int)
@@ -153,9 +149,7 @@ def inspect_mcd(mcd_file: Path, args: Args) -> Tuple[DataFrame, DataFrame]:
     meta["consensus_channels"] = (
         channel_names.iloc[:, 0].to_dict() if same_channels else None
     )
-    meta["panoramas"] = {
-        p: v.get_csv_dict() for p, v in session.panoramas.items()
-    }
+    meta["panoramas"] = {p: v.get_csv_dict() for p, v in session.panoramas.items()}
     meta["acquisitions"] = {
         a: ac.get_csv_dict() for a, ac in session.acquisitions.items()
     }
@@ -174,7 +168,7 @@ def inspect_mcd(mcd_file: Path, args: Args) -> Tuple[DataFrame, DataFrame]:
     return meta, annot
 
 
-def encode(obj: Any) -> Any:
+def encode(obj: tp.Any) -> tp.Any:
     """
     For serializing to JSON or YAML with no special Python object references.
 

diff --git a/imc/scripts/predict.py b/imc/scripts/predict.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 """
-Generate probablity maps for each image.
+Generate probability maps for each pixel in each image.
 """
 
 import sys
@@ -12,13 +12,14 @@
 from imc.scripts import build_cli
 
 
-def main(cli: tp.List[str] = None) -> int:
+def main(cli: tp.Sequence[str] = None) -> int:
     """Generate probability maps for each ROI using ilastik."""
     parser = build_cli("predict")
+    print(cli)
     args = parser.parse_args(cli)
 
     fs = "\n\t- " + "\n\t- ".join([f.as_posix() for f in args.tiffs])
-    print(f"Starting analysis of {len(args.tiffs)} TIFF files: {fs}!")
+    print(f"Starting predict step for {len(args.tiffs)} TIFF files: {fs}!")
 
     # Prepare ROI objects
     rois = list()
@@ -41,19 +42,20 @@ def main(cli: tp.List[str] = None) -> int:
         model_ilp = args.custom_model
 
     # Predict
+    print("Starting ilastik pixel classification.")
     tiff_files = [roi._get_input_filename("ilastik_input") for roi in rois]
-    predict(tiff_files, ilastik_sh, model_ilp, args.quiet)
+    predict_with_ilastik(tiff_files, ilastik_sh, model_ilp, args.quiet)
 
     for roi in rois:
         _in = roi.root_dir / roi.name + "_ilastik_s2_Probabilities.tiff"
         if _in.exists():
             _in.rename(roi._get_input_filename("probabilities"))
 
-    print("Finished with all files!")
+    print("Finished predict step!")
     return 0
 
 
-def predict(
+def predict_with_ilastik(
     tiff_files: tp.Sequence[Path], ilastik_sh: Path, model_ilp: Path, quiet: bool = True
 ) -> int:
     """
@@ -88,6 +90,7 @@ def get_ilastik(lib_dir: Path, version: str = "1.3.3post2") -> Path:
         print("Extracting ilastik archive.")
         with tarfile.open(lib_dir / file, "r:bz2") as tar:
             tar.extractall(lib_dir)
+        (lib_dir / file).unlink()
     return f
 
 
@@ -124,7 +127,7 @@ def download_file(url: str, output_file: tp.Union[Path, str], chunk_size=1024) -
         Size in bytes of chunk to write to disk at a time.
     """
     import shutil
-    import urllib.request as request
+    from urllib import request
     from contextlib import closing
     import requests
 
@@ -153,7 +156,7 @@ def run_shell_command(cmd: str, dry_run: bool = False) -> int:
     # the subprocess call must have its own shell
     # this should only occur if cellprofiler is being run uncontainerized
     # and needs a command to be called prior such as conda activate, etc
-    symbol = any([x in cmd for x in ["&", "&&", "|"]])
+    symbol = any(x in cmd for x in ["&", "&&", "|"])
     source = cmd.startswith("source")
     shell = bool(symbol or source)
     print(

diff --git a/imc/scripts/prepare_mcds.py b/imc/scripts/prepare_mcds.py
@@ -30,7 +30,7 @@ def main(cli: tp.Sequence[str] = None) -> int:
         args.sample_names = [None] * len(args.mcd_files)
 
     fs = "\n\t- " + "\n\t- ".join([f.as_posix() for f in args.mcd_files])
-    print(f"Starting analysis of {len(args.mcd_files)} MCD files: {fs}!")
+    print(f"Starting prepare step for {len(args.mcd_files)} MCD files: {fs}!")
 
     for mcd_file, pannel_csv, sample_name in zip(
         args.mcd_files, args.pannel_csvs, args.sample_names
@@ -56,10 +56,9 @@ def main(cli: tp.Sequence[str] = None) -> int:
             panorama_image_prefix=args.root_output_dir / mcd_file.stem / "Panorama_",
             save_roi_arrays=False,
         )
+        print(f"Finished with '{mcd_file}'.")
 
-        print(f"Finished processing '{mcd_file}'.")
-
-    print("Finished with all files!")
+    print("Finished prepare step!")
     return 0