Fix problems with ancestry aggregation & scaling on biobank-data (#19)

* Don't perform ancestry adjustments/keep AVG columns. * Edit expected column list * Bump versions (calc=0.2.0;utils=1.1.0) * simplify polygenicscore class (remove batches until we run into problems) * Fix pgscatalog.match performance regression (#22) * drop pyarrow support, it doesn't scale well, and be more consistent about public path properties * refactor to use polars for reading and writing IPC files to improve scalability * fix map_elements deprecation warning * update lockfiles * fix weird path -> is_path refactor that broke this test * missed one >_> * fix pyproject * update dockerfile * fix exception handling when one score fails matching * fix merging scoring files with different column sets * set pgscatalog package logging levels to INFO * Improve aggregation (#23) * export key functions for sorting chromosomes / effect types * use new key functions for sorting * reduce memory usage during aggregation * fix doctest output * make aggregation steps clearer * bump minor version of pgscatalog.core * minor version bump pgscatalog.match --------- Co-authored-by: Benjamin Wingfield <bwingfield@ebi.ac.uk>
PGScatalog · Jun 12, 2024 · cd027b3 · cd027b3
1 parent e88b41f
commit cd027b3
Show file tree

Hide file tree

Showing 32 changed files with 1,134 additions and 1,112 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -9,19 +9,20 @@ WORKDIR /app
 
 RUN pip install poetry
 
-COPY . .
+COPY pgscatalog.core /app/pgscatalog.core
+
+COPY pgscatalog.calc /app/pgscatalog.calc
+
+COPY pgscatalog.match /app/pgscatalog.match
+
+COPY pgscatalog.utils /app/pgscatalog.utils
 
 WORKDIR /app/pgscatalog.utils
 
 RUN poetry install --no-root && rm -rf $POETRY_CACHE_DIR
 
-FROM python:3.11-slim-bullseye
-
-ENV VIRTUAL_ENV=/app/pgscatalog.utils/.venv \
-    PATH="/app/pgscatalog.utils/.venv/bin:$PATH"
-
-COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
-
 RUN apt-get update && apt-get install -y procps && rm -rf /var/lib/apt/lists/*
 
-ENV PATH="/venv/bin:${PATH}"
+ENV PATH="/app/pgscatalog.utils/.venv/bin:$PATH"
+
+
diff --git a/pgscatalog.calc/poetry.lock b/pgscatalog.calc/poetry.lock
diff --git a/pgscatalog.calc/pyproject.toml b/pgscatalog.calc/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pgscatalog.calc"
-version = "0.1.1"
+version = "0.2.0"
 description = "Libraries and applications for working with calculated polygenic scores"
 authors = ["Benjamin Wingfield <bwingfield@ebi.ac.uk>", "Samuel Lambert <sl925@medschl.cam.ac.uk>", "Laurent Gil <lg10@sanger.ac.uk>"]
 readme = "README.md"
@@ -10,7 +10,7 @@ packages = [
 
 [tool.poetry.dependencies]
 python = "^3.11"
-"pgscatalog.core" = "^0.1.0"
+"pgscatalog.core" = {path = "../pgscatalog.core", develop = true}
 numpy = "^1.26.4"
 pandas = "^2.2.0"
 pyarrow = "^15.0.0"

diff --git a/pgscatalog.calc/src/pgscatalog/calc/__init__.py b/pgscatalog.calc/src/pgscatalog/calc/__init__.py
@@ -21,4 +21,4 @@
     "AdjustResults",
 ]
 
-__version__ = "0.1.1"
+__version__ = "0.2.0"
diff --git a/pgscatalog.calc/src/pgscatalog/calc/cli/__init__.py b/pgscatalog.calc/src/pgscatalog/calc/cli/__init__.py
@@ -1,3 +1 @@
-from .aggregate_cli import run_aggregate
-
-__all__ = ["run_aggregate"]
+__all__ = []
diff --git a/pgscatalog.calc/src/pgscatalog/calc/cli/aggregate_cli.py b/pgscatalog.calc/src/pgscatalog/calc/cli/aggregate_cli.py
@@ -2,10 +2,11 @@
 import logging
 import pathlib
 import textwrap
-import operator
-import functools
+from collections import deque
+from typing import Optional
 
-from ..lib.polygenicscore import PolygenicScore
+from ..lib import PolygenicScore
+from pgscatalog.core import chrom_keyfunc
 
 logger = logging.getLogger(__name__)
 
@@ -21,15 +22,50 @@ def run_aggregate():
 
     if args.verbose:
         logger.setLevel(logging.INFO)
+        logging.getLogger("pgscatalog.core").setLevel(logging.INFO)
+        logging.getLogger("pgscatalog.calc").setLevel(logging.INFO)
 
     if not (outdir := pathlib.Path(args.outdir)).exists():
         raise FileNotFoundError(f"--outdir {outdir.name} doesn't exist")
 
-    score_paths = [pathlib.Path(x) for x in args.scores]
-    pgs = [PolygenicScore(path=x) for x in score_paths]
-    # call __add__ a lot
-    aggregated = functools.reduce(operator.add, pgs)
+    score_paths = sorted([pathlib.Path(x) for x in args.scores], key=chrom_keyfunc())
+    # dfs are only read into memory after accessing them explicitly e.g. pgs[0].df
+    pgs = deque(PolygenicScore(path=x) for x in score_paths)
+
+    observed_columns = set()
+    aggregated: Optional[PolygenicScore] = None
+
+    # first, use PolygenicScore's __add__ method, which implements df.add(fill_value=0)
+    while pgs:
+        # popleft ensures that dfs are removed from memory after each aggregation
+        score: PolygenicScore = pgs.popleft()
+        if aggregated is None:
+            logger.info(f"Initialising aggregation with {score}")
+            aggregated: PolygenicScore = score
+        else:
+            logger.info(f"Adding {score}")
+            aggregated += score
+        observed_columns.update(set(score.df.columns))
+
+    # check to make sure that every column we saw in the dataframes is in the output
+    if (dfcols := set(aggregated.df.columns)) != observed_columns:
+        raise ValueError(
+            f"Missing columns in aggregated file!. "
+            f"Observed: {observed_columns}. "
+            f"In aggregated: {dfcols}"
+        )
+    else:
+        logger.info("Aggregated columns match observed columns")
+
+    # next, melt the plink2 scoring files from wide (many columns) format to long format
+    aggregated.melt()
+
+    # recalculate PGS average using aggregated SUM and DENOM
+    aggregated.average()
+
+    logger.info("Aggregation finished! Writing to a file")
     aggregated.write(outdir=args.outdir, split=args.split)
+    logger.info("all done. bye :)")
 
 
 def _description_text() -> str:

diff --git a/pgscatalog.calc/src/pgscatalog/calc/cli/ancestry_cli.py b/pgscatalog.calc/src/pgscatalog/calc/cli/ancestry_cli.py
@@ -20,6 +20,8 @@ def run_ancestry():
 
     if args.verbose:
         logger.setLevel(logging.INFO)
+        logging.getLogger("pgscatalog.core").setLevel(logging.INFO)
+        logging.getLogger("pgscatalog.calc").setLevel(logging.INFO)
         logger.info("Starting ancestry adjustment")
         logger.info("Verbose mode enabled")
 

diff --git a/pgscatalog.calc/src/pgscatalog/calc/lib/_ancestry/read.py b/pgscatalog.calc/src/pgscatalog/calc/lib/_ancestry/read.py
@@ -87,8 +87,8 @@ def read_pgs(loc_aggscore):
         index_col=["sampleset", "IID"],
         converters={"IID": str},
         header=0,
-    ).pivot(columns=["PGS"], values=["SUM", "AVG"])
-    # join column levels ({PGS}_{VALUE})
-    df.columns = [f"{j}_{i}" for i, j in df.columns]
+    ).pivot(columns=["PGS"], values=["SUM"])
+    # rename to PGS only
+    df.columns = [f"{j}" for i, j in df.columns]
 
     return df