From 5a529762dcdbb9aa572c6288d597fc87060ac8fb Mon Sep 17 00:00:00 2001 From: LLehner Date: Mon, 4 Mar 2024 23:33:19 +0100 Subject: [PATCH 01/20] Add method to calculate embeddings for variable by distance aggregation --- src/squidpy/tl/__init__.py | 1 + src/squidpy/tl/_var_embeddings.py | 92 +++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 src/squidpy/tl/_var_embeddings.py diff --git a/src/squidpy/tl/__init__.py b/src/squidpy/tl/__init__.py index edaa3182..eb3ac595 100644 --- a/src/squidpy/tl/__init__.py +++ b/src/squidpy/tl/__init__.py @@ -3,3 +3,4 @@ from __future__ import annotations from squidpy.tl._var_by_distance import var_by_distance +from squidpy.tl._var_embeddings import var_embeddings diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py new file mode 100644 index 00000000..d3308a87 --- /dev/null +++ b/src/squidpy/tl/_var_embeddings.py @@ -0,0 +1,92 @@ +from __future__ import annotations + + +import pandas as pd +from anndata import AnnData +from scanpy import logging as logg +from sklearn.preprocessing import StandardScaler +import umap + +from squidpy._docs import d + +__all__ = ["var_embeddings"] + + +@d.dedent +def var_embeddings( + adata: AnnData, + cluster_key: str, + design_matrix_key: str = "design_matrix", + n_bins: int = 100, + include_anchor: bool = False, +) -> AnnData: + """ + Cluster variables by previously calculated distance to an anchor point. + + Parameters + ---------- + %(adata)s + cluster_key + Annotation column in `.obs` that is used as anchor. + design_matrix_key + Name of the design matrix saved to `.obsm`. + n_bins + Number of bins to use for aggregation. + include_anchor + Whether to include the variable counts belonging to the anchor point in the aggregation. + Returns + ------- + If ``copy = True``, returns the design_matrix with the distances to an anchor point + Otherwise, stores design_matrix in `.obsm`. + """ + if design_matrix_key not in adata.obsm.keys(): + raise ValueError(f"`.obsm['{design_matrix_key}']` does not exist. Aborting.") + + logg.info("Calculating embeddings for distance aggregations by gene.") + + df = adata.obsm[design_matrix_key].copy() + + # bin the data by distance + df["bins"] = pd.cut(df[cluster_key], bins=n_bins) + + # get median value of each interval + df['median_value'] = df['bins'].apply(calculate_median) + + # turn categorical NaNs into float 0s + df['median_value'] = pd.to_numeric(df['median_value'], errors='coerce').fillna(0).astype(float) + + # get count matrix and add binned distance to each .obs + X_df = adata.to_df() + X_df["distance"] = df["median_value"].copy() + + # transpose the count matrix + X_df_T = X_df.T + + # aggregate the transposed count matrix by the distances and remove the distance row + mth_row_values = X_df_T.iloc[-1] + result = X_df_T.groupby(mth_row_values, axis=1).sum() + result.drop(result.tail(1).index,inplace=True) + + # optionally include or remove variable values for distance 0 (anchro point) + if not include_anchor: + result = result.drop(result.columns[0], axis=1) + + reducer = umap.UMAP() + + # scale the data and reduce dimensionality + scaled_exp = StandardScaler().fit_transform(result.values) + scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) + embedding = reducer.fit_transform(scaled_exp_df) + + adata.varm[f"{n_bins}_bins_distance_aggregation"] = result + embedding_df = pd.DataFrame(embedding, index=result.index) + embedding_df["var"] = result.index + adata.uns[f"{n_bins}_bins_distance_embeddings"] = embedding_df + + return + +def calculate_median(interval): + median = interval.mid + + return median + From eb84518f6aa68893c98503822427e2a9a2b85fe0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Mar 2024 22:57:34 +0000 Subject: [PATCH 02/20] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/tl/_var_embeddings.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index d3308a87..03868aea 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,11 +1,10 @@ from __future__ import annotations - import pandas as pd +import umap from anndata import AnnData from scanpy import logging as logg from sklearn.preprocessing import StandardScaler -import umap from squidpy._docs import d @@ -50,10 +49,10 @@ def var_embeddings( df["bins"] = pd.cut(df[cluster_key], bins=n_bins) # get median value of each interval - df['median_value'] = df['bins'].apply(calculate_median) + df["median_value"] = df["bins"].apply(calculate_median) # turn categorical NaNs into float 0s - df['median_value'] = pd.to_numeric(df['median_value'], errors='coerce').fillna(0).astype(float) + df["median_value"] = pd.to_numeric(df["median_value"], errors="coerce").fillna(0).astype(float) # get count matrix and add binned distance to each .obs X_df = adata.to_df() @@ -65,14 +64,14 @@ def var_embeddings( # aggregate the transposed count matrix by the distances and remove the distance row mth_row_values = X_df_T.iloc[-1] result = X_df_T.groupby(mth_row_values, axis=1).sum() - result.drop(result.tail(1).index,inplace=True) + result.drop(result.tail(1).index, inplace=True) # optionally include or remove variable values for distance 0 (anchro point) if not include_anchor: result = result.drop(result.columns[0], axis=1) reducer = umap.UMAP() - + # scale the data and reduce dimensionality scaled_exp = StandardScaler().fit_transform(result.values) scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) @@ -85,8 +84,8 @@ def var_embeddings( return + def calculate_median(interval): median = interval.mid return median - From 488da20df560179f360e9f9c5b519e5bc94331ec Mon Sep 17 00:00:00 2001 From: LLehner Date: Mon, 4 Mar 2024 23:58:46 +0100 Subject: [PATCH 03/20] Fix pre-commit --- src/squidpy/tl/_var_embeddings.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index d3308a87..f45942ff 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,11 +1,12 @@ from __future__ import annotations +from typing import Any import pandas as pd +import umap from anndata import AnnData from scanpy import logging as logg from sklearn.preprocessing import StandardScaler -import umap from squidpy._docs import d @@ -50,10 +51,10 @@ def var_embeddings( df["bins"] = pd.cut(df[cluster_key], bins=n_bins) # get median value of each interval - df['median_value'] = df['bins'].apply(calculate_median) + df["median_value"] = df["bins"].apply(calculate_median) # turn categorical NaNs into float 0s - df['median_value'] = pd.to_numeric(df['median_value'], errors='coerce').fillna(0).astype(float) + df["median_value"] = pd.to_numeric(df["median_value"], errors="coerce").fillna(0).astype(float) # get count matrix and add binned distance to each .obs X_df = adata.to_df() @@ -65,14 +66,14 @@ def var_embeddings( # aggregate the transposed count matrix by the distances and remove the distance row mth_row_values = X_df_T.iloc[-1] result = X_df_T.groupby(mth_row_values, axis=1).sum() - result.drop(result.tail(1).index,inplace=True) + result.drop(result.tail(1).index, inplace=True) # optionally include or remove variable values for distance 0 (anchro point) if not include_anchor: result = result.drop(result.columns[0], axis=1) reducer = umap.UMAP() - + # scale the data and reduce dimensionality scaled_exp = StandardScaler().fit_transform(result.values) scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) @@ -85,8 +86,8 @@ def var_embeddings( return -def calculate_median(interval): + +def calculate_median(interval: pd.Interval) -> Any: median = interval.mid return median - From 0b724941c56b289139b47d9d17b90d978dd00997 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Mar 2024 23:00:55 +0000 Subject: [PATCH 04/20] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/tl/_var_embeddings.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index c575de4b..60139f3f 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -4,7 +4,6 @@ import pandas as pd import umap -import umap from anndata import AnnData from scanpy import logging as logg from sklearn.preprocessing import StandardScaler @@ -78,7 +77,6 @@ def var_embeddings( reducer = umap.UMAP() - # scale the data and reduce dimensionality scaled_exp = StandardScaler().fit_transform(result.values) scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) From edcca877dbf21b542b160abd62a91cf183d8bc94 Mon Sep 17 00:00:00 2001 From: LLehner Date: Tue, 5 Mar 2024 00:05:23 +0100 Subject: [PATCH 05/20] Update param name --- src/squidpy/tl/_var_embeddings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index c575de4b..87c75d23 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -17,7 +17,7 @@ @d.dedent def var_embeddings( adata: AnnData, - cluster_key: str, + group: str, design_matrix_key: str = "design_matrix", n_bins: int = 100, include_anchor: bool = False, @@ -28,7 +28,7 @@ def var_embeddings( Parameters ---------- %(adata)s - cluster_key + group Annotation column in `.obs` that is used as anchor. design_matrix_key Name of the design matrix saved to `.obsm`. @@ -49,7 +49,7 @@ def var_embeddings( df = adata.obsm[design_matrix_key].copy() # bin the data by distance - df["bins"] = pd.cut(df[cluster_key], bins=n_bins) + df["bins"] = pd.cut(df[group], bins=n_bins) # get median value of each interval df["median_value"] = df["bins"].apply(calculate_median) From 4be2529ef43bd6ecbea0d26464409b242d2e7899 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Mar 2024 23:00:55 +0000 Subject: [PATCH 06/20] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/tl/_var_embeddings.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 87c75d23..91f1d176 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -4,7 +4,6 @@ import pandas as pd import umap -import umap from anndata import AnnData from scanpy import logging as logg from sklearn.preprocessing import StandardScaler @@ -78,7 +77,6 @@ def var_embeddings( reducer = umap.UMAP() - # scale the data and reduce dimensionality scaled_exp = StandardScaler().fit_transform(result.values) scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) From cfe496cbddc5c2aa31aa034f71f155b72d26e784 Mon Sep 17 00:00:00 2001 From: LLehner Date: Mon, 22 Apr 2024 19:14:40 +0200 Subject: [PATCH 07/20] Remove duplicate code --- src/squidpy/tl/_var_embeddings.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 91f1d176..7205c32e 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -52,11 +52,9 @@ def var_embeddings( # get median value of each interval df["median_value"] = df["bins"].apply(calculate_median) - df["median_value"] = df["bins"].apply(calculate_median) # turn categorical NaNs into float 0s df["median_value"] = pd.to_numeric(df["median_value"], errors="coerce").fillna(0).astype(float) - df["median_value"] = pd.to_numeric(df["median_value"], errors="coerce").fillna(0).astype(float) # get count matrix and add binned distance to each .obs X_df = adata.to_df() @@ -69,23 +67,22 @@ def var_embeddings( mth_row_values = X_df_T.iloc[-1] result = X_df_T.groupby(mth_row_values, axis=1).sum() result.drop(result.tail(1).index, inplace=True) - result.drop(result.tail(1).index, inplace=True) - # optionally include or remove variable values for distance 0 (anchro point) + # optionally include or remove variable values for distance 0 (anchor point) if not include_anchor: result = result.drop(result.columns[0], axis=1) - reducer = umap.UMAP() + #reducer = umap.UMAP() # scale the data and reduce dimensionality - scaled_exp = StandardScaler().fit_transform(result.values) - scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) - embedding = reducer.fit_transform(scaled_exp_df) + #scaled_exp = StandardScaler().fit_transform(result.values) + #scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) + #embedding = reducer.fit_transform(scaled_exp_df) adata.varm[f"{n_bins}_bins_distance_aggregation"] = result - embedding_df = pd.DataFrame(embedding, index=result.index) - embedding_df["var"] = result.index - adata.uns[f"{n_bins}_bins_distance_embeddings"] = embedding_df + #embedding_df = pd.DataFrame(embedding, index=result.index) + #embedding_df["var"] = result.index + #adata.uns[f"{n_bins}_bins_distance_embeddings"] = embedding_df return From c4fca2920c82a9b17eab9c8b15471b0c4145e394 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Apr 2024 17:15:01 +0000 Subject: [PATCH 08/20] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/tl/_var_embeddings.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 7205c32e..e9b289d5 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -72,17 +72,17 @@ def var_embeddings( if not include_anchor: result = result.drop(result.columns[0], axis=1) - #reducer = umap.UMAP() + # reducer = umap.UMAP() # scale the data and reduce dimensionality - #scaled_exp = StandardScaler().fit_transform(result.values) - #scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) - #embedding = reducer.fit_transform(scaled_exp_df) + # scaled_exp = StandardScaler().fit_transform(result.values) + # scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) + # embedding = reducer.fit_transform(scaled_exp_df) adata.varm[f"{n_bins}_bins_distance_aggregation"] = result - #embedding_df = pd.DataFrame(embedding, index=result.index) - #embedding_df["var"] = result.index - #adata.uns[f"{n_bins}_bins_distance_embeddings"] = embedding_df + # embedding_df = pd.DataFrame(embedding, index=result.index) + # embedding_df["var"] = result.index + # adata.uns[f"{n_bins}_bins_distance_embeddings"] = embedding_df return From 64e38dfb55b0c1b765a8476c09760f5170cd97b6 Mon Sep 17 00:00:00 2001 From: LLehner Date: Mon, 22 Apr 2024 23:50:10 +0200 Subject: [PATCH 09/20] Improve performance, Update output --- src/squidpy/tl/_var_embeddings.py | 50 ++++++++++++++----------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 7205c32e..67545b5f 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -2,11 +2,11 @@ from typing import Any +import numpy as np import pandas as pd -import umap +import scanpy as sc from anndata import AnnData from scanpy import logging as logg -from sklearn.preprocessing import StandardScaler from squidpy._docs import d @@ -46,45 +46,39 @@ def var_embeddings( logg.info("Calculating embeddings for distance aggregations by gene.") df = adata.obsm[design_matrix_key].copy() - # bin the data by distance df["bins"] = pd.cut(df[group], bins=n_bins) - # get median value of each interval df["median_value"] = df["bins"].apply(calculate_median) - # turn categorical NaNs into float 0s df["median_value"] = pd.to_numeric(df["median_value"], errors="coerce").fillna(0).astype(float) - # get count matrix and add binned distance to each .obs X_df = adata.to_df() - X_df["distance"] = df["median_value"].copy() - + X_df["distance"] = df["median_value"] + # aggregate the count matrix by the bins + aggregated_df = X_df.groupby(["distance"]).sum() # transpose the count matrix - X_df_T = X_df.T - - # aggregate the transposed count matrix by the distances and remove the distance row - mth_row_values = X_df_T.iloc[-1] - result = X_df_T.groupby(mth_row_values, axis=1).sum() - result.drop(result.tail(1).index, inplace=True) + result = aggregated_df.T # optionally include or remove variable values for distance 0 (anchor point) + start_bin = 0 if not include_anchor: result = result.drop(result.columns[0], axis=1) - - #reducer = umap.UMAP() - - # scale the data and reduce dimensionality - #scaled_exp = StandardScaler().fit_transform(result.values) - #scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) - #embedding = reducer.fit_transform(scaled_exp_df) - - adata.varm[f"{n_bins}_bins_distance_aggregation"] = result - #embedding_df = pd.DataFrame(embedding, index=result.index) - #embedding_df["var"] = result.index - #adata.uns[f"{n_bins}_bins_distance_embeddings"] = embedding_df - - return + start_bin = 1 + + # set genes x bins to count matrix (required for embeddings and clustering) + var_by_bins = sc.AnnData(result) + # set genes x bins to .obs (required for plotting counts by distance) + var_by_bins.obs = result + # rename column names for plotting + var_by_bins.obs.columns = range(start_bin, 101) + # create genes x genes identity matrix + identity_df = pd.DataFrame(np.eye(len(var_by_bins.obs)), columns=var_by_bins.obs.index, dtype="category") + # append identity matrix to obs column wise (required for highlighting genes in plot) + identity_df.index = var_by_bins.obs.index + var_by_bins.obs = pd.concat([var_by_bins.obs, identity_df], axis=1) + + return var_by_bins def calculate_median(interval: pd.Interval) -> Any: From 9eabd0d196ce0bf6522a6d6d3a4c477b60d363b8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Apr 2024 21:53:23 +0000 Subject: [PATCH 10/20] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/tl/_var_embeddings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 233cb6d3..a9055f69 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,13 +1,13 @@ from __future__ import annotations +import time from typing import Any import numpy as np import pandas as pd -from anndata import AnnData import scanpy as sc +from anndata import AnnData from scanpy import logging as logg -import time from squidpy._docs import d @@ -72,7 +72,7 @@ def var_embeddings( # set genes x bins to .obs (required for plotting counts by distance) var_by_bins.obs = result # rename column names for plotting - var_by_bins.obs.columns = range(start_bin,101) + var_by_bins.obs.columns = range(start_bin, 101) # create genes x genes identity matrix identity_df = pd.DataFrame(np.eye(len(var_by_bins.obs)), columns=var_by_bins.obs.index, dtype="category") # append identity matrix to obs column wise (required for highlighting genes in plot) From a40a8cfcddafddd18402f2df4ad433edaf13dc4a Mon Sep 17 00:00:00 2001 From: LLehner Date: Mon, 22 Apr 2024 23:55:55 +0200 Subject: [PATCH 11/20] Remove import --- src/squidpy/tl/_var_embeddings.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 233cb6d3..67545b5f 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -4,10 +4,9 @@ import numpy as np import pandas as pd -from anndata import AnnData import scanpy as sc +from anndata import AnnData from scanpy import logging as logg -import time from squidpy._docs import d @@ -72,7 +71,7 @@ def var_embeddings( # set genes x bins to .obs (required for plotting counts by distance) var_by_bins.obs = result # rename column names for plotting - var_by_bins.obs.columns = range(start_bin,101) + var_by_bins.obs.columns = range(start_bin, 101) # create genes x genes identity matrix identity_df = pd.DataFrame(np.eye(len(var_by_bins.obs)), columns=var_by_bins.obs.index, dtype="category") # append identity matrix to obs column wise (required for highlighting genes in plot) From 09c72b08bcbeafd73e27b69553d75a0eb8ad104e Mon Sep 17 00:00:00 2001 From: LLehner <64135338+LLehner@users.noreply.github.com> Date: Tue, 23 Apr 2024 00:01:38 +0200 Subject: [PATCH 12/20] Remove import --- src/squidpy/tl/_var_embeddings.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index a9055f69..67545b5f 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,6 +1,5 @@ from __future__ import annotations -import time from typing import Any import numpy as np From 3396146dcc8fb475e5e14910f7d4f61237edbde3 Mon Sep 17 00:00:00 2001 From: LLehner Date: Sun, 26 May 2024 23:12:21 +0200 Subject: [PATCH 13/20] Update return --- src/squidpy/tl/_var_embeddings.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index a9055f69..2ce5e4c6 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,7 +1,6 @@ from __future__ import annotations -import time -from typing import Any +from typing import Any, Union import numpy as np import pandas as pd @@ -21,7 +20,8 @@ def var_embeddings( design_matrix_key: str = "design_matrix", n_bins: int = 100, include_anchor: bool = False, -) -> AnnData: + copy: bool = False, +) -> Union[AnnData, (pd.DataFrame, pd.DataFrame)]: """ Cluster variables by previously calculated distance to an anchor point. @@ -38,8 +38,8 @@ def var_embeddings( Whether to include the variable counts belonging to the anchor point in the aggregation. Returns ------- - If ``copy = True``, returns the design_matrix with the distances to an anchor point - Otherwise, stores design_matrix in `.obsm`. + If ``copy = True``, returns var by distance matrices. + Otherwise, stores var by distance bin matrices in `.obsm`. """ if design_matrix_key not in adata.obsm.keys(): raise ValueError(f"`.obsm['{design_matrix_key}']` does not exist. Aborting.") @@ -67,19 +67,17 @@ def var_embeddings( result = result.drop(result.columns[0], axis=1) start_bin = 1 - # set genes x bins to count matrix (required for embeddings and clustering) - var_by_bins = sc.AnnData(result) - # set genes x bins to .obs (required for plotting counts by distance) - var_by_bins.obs = result # rename column names for plotting - var_by_bins.obs.columns = range(start_bin, 101) + result.columns = range(start_bin, 101) # create genes x genes identity matrix - identity_df = pd.DataFrame(np.eye(len(var_by_bins.obs)), columns=var_by_bins.obs.index, dtype="category") + obs = pd.DataFrame(np.eye(len(result)), columns=result.index, dtype="category") # append identity matrix to obs column wise (required for highlighting genes in plot) - identity_df.index = var_by_bins.obs.index - var_by_bins.obs = pd.concat([var_by_bins.obs, identity_df], axis=1) + obs.index = result.index + adata.obsm["var_by_distance_X"] = result + adata.obsm["var_by_distance_obs"] = obs - return var_by_bins + if copy: + return (result, obs) def calculate_median(interval: pd.Interval) -> Any: From 67bdd5c146b16815d7cf4e7ca850770ca449b461 Mon Sep 17 00:00:00 2001 From: LLehner Date: Sun, 26 May 2024 23:16:00 +0200 Subject: [PATCH 14/20] Fix pre-commit --- src/squidpy/tl/_var_embeddings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 2ce5e4c6..d2e3aa02 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Union +from typing import Any, Union, Tuple import numpy as np import pandas as pd @@ -21,7 +21,7 @@ def var_embeddings( n_bins: int = 100, include_anchor: bool = False, copy: bool = False, -) -> Union[AnnData, (pd.DataFrame, pd.DataFrame)]: +) -> Union[AnnData, Tuple[pd.DataFrame, pd.DataFrame]]: """ Cluster variables by previously calculated distance to an anchor point. From 876c4edf26ab2cdf27c2f8121b5d9d1f4c1568e1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 26 May 2024 21:16:24 +0000 Subject: [PATCH 15/20] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/tl/_var_embeddings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index d2e3aa02..e5b4bff2 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Union, Tuple +from typing import Any, Tuple, Union import numpy as np import pandas as pd From 8ee07bafc0e02b86a9b7fd0d5d95f499574deca5 Mon Sep 17 00:00:00 2001 From: LLehner Date: Sun, 26 May 2024 23:57:17 +0200 Subject: [PATCH 16/20] Fix pre-commit --- src/squidpy/tl/_var_embeddings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index e5b4bff2..ea9a4201 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Tuple, Union +from typing import Any, Union import numpy as np import pandas as pd @@ -21,7 +21,7 @@ def var_embeddings( n_bins: int = 100, include_anchor: bool = False, copy: bool = False, -) -> Union[AnnData, Tuple[pd.DataFrame, pd.DataFrame]]: +) -> Union[AnnData, pd.DataFrame]: """ Cluster variables by previously calculated distance to an anchor point. From d3cefff90876ce64dcff83f7fa617363b21f942a Mon Sep 17 00:00:00 2001 From: LLehner Date: Mon, 27 May 2024 19:06:35 +0200 Subject: [PATCH 17/20] Fix pre-commit --- src/squidpy/tl/_var_embeddings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index ea9a4201..398a847c 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Union +from typing import Any import numpy as np import pandas as pd @@ -21,7 +21,7 @@ def var_embeddings( n_bins: int = 100, include_anchor: bool = False, copy: bool = False, -) -> Union[AnnData, pd.DataFrame]: +) -> AnnData | pd.DataFrame: """ Cluster variables by previously calculated distance to an anchor point. From 57296760d44ce609f684ac966a08e4dc2a595631 Mon Sep 17 00:00:00 2001 From: Laurens Lehner Date: Thu, 8 Aug 2024 13:11:41 +0200 Subject: [PATCH 18/20] Fix indices; Update return type --- src/squidpy/tl/_var_embeddings.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 398a847c..63630254 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -20,7 +20,6 @@ def var_embeddings( design_matrix_key: str = "design_matrix", n_bins: int = 100, include_anchor: bool = False, - copy: bool = False, ) -> AnnData | pd.DataFrame: """ Cluster variables by previously calculated distance to an anchor point. @@ -60,7 +59,6 @@ def var_embeddings( aggregated_df = X_df.groupby(["distance"]).sum() # transpose the count matrix result = aggregated_df.T - # optionally include or remove variable values for distance 0 (anchor point) start_bin = 0 if not include_anchor: @@ -69,15 +67,15 @@ def var_embeddings( # rename column names for plotting result.columns = range(start_bin, 101) - # create genes x genes identity matrix - obs = pd.DataFrame(np.eye(len(result)), columns=result.index, dtype="category") - # append identity matrix to obs column wise (required for highlighting genes in plot) + # create genes x genes identity matrix (required for highlighting genes in plot) + obs = pd.DataFrame(np.eye(len(result)), columns=result.index) + obs.replace(1.0, pd.Series(obs.columns, obs.columns), inplace=True) + obs.replace(0.0, "other", inplace=True) + obs = obs.astype("category") obs.index = result.index - adata.obsm["var_by_distance_X"] = result - adata.obsm["var_by_distance_obs"] = obs + adata_new = AnnData(X=result, obs=obs, var=pd.DataFrame(index=result.columns)) - if copy: - return (result, obs) + return adata_new def calculate_median(interval: pd.Interval) -> Any: From 7dfa9332302621ab6ccb3291a764f0bbf771bf1e Mon Sep 17 00:00:00 2001 From: LLehner Date: Mon, 26 Aug 2024 19:03:02 +0200 Subject: [PATCH 19/20] Add spatialdata as input --- src/squidpy/tl/_var_embeddings.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 63630254..ac6ef210 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,12 +1,13 @@ from __future__ import annotations -from typing import Any +from typing import Any, Optional import numpy as np import pandas as pd import scanpy as sc from anndata import AnnData from scanpy import logging as logg +from spatialdata import SpatialData from squidpy._docs import d @@ -15,20 +16,23 @@ @d.dedent def var_embeddings( - adata: AnnData, + sdata: SpatialData, + table: str, group: str, design_matrix_key: str = "design_matrix", n_bins: int = 100, include_anchor: bool = False, ) -> AnnData | pd.DataFrame: """ - Cluster variables by previously calculated distance to an anchor point. + Bin variables by previously calculated distance to an anchor point. Parameters ---------- %(adata)s + table + Name of the table in `SpatialData` object. group - Annotation column in `.obs` that is used as anchor. + Annotation column in design matrix, given by `design_matrix_key`, that is used as anchor. design_matrix_key Name of the design matrix saved to `.obsm`. n_bins @@ -40,6 +44,9 @@ def var_embeddings( If ``copy = True``, returns var by distance matrices. Otherwise, stores var by distance bin matrices in `.obsm`. """ + + adata = sdata.tables[table] + if design_matrix_key not in adata.obsm.keys(): raise ValueError(f"`.obsm['{design_matrix_key}']` does not exist. Aborting.") @@ -75,7 +82,7 @@ def var_embeddings( obs.index = result.index adata_new = AnnData(X=result, obs=obs, var=pd.DataFrame(index=result.columns)) - return adata_new + sdata.tables["var_by_dist_bins"] = adata_new def calculate_median(interval: pd.Interval) -> Any: From d6e5ecd59789010043f5c704a72b2c90ab44e586 Mon Sep 17 00:00:00 2001 From: LLehner Date: Tue, 27 Aug 2024 11:21:17 +0200 Subject: [PATCH 20/20] Update docstring --- src/squidpy/tl/_var_embeddings.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index ac6ef210..44ae3d2e 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -41,8 +41,7 @@ def var_embeddings( Whether to include the variable counts belonging to the anchor point in the aggregation. Returns ------- - If ``copy = True``, returns var by distance matrices. - Otherwise, stores var by distance bin matrices in `.obsm`. + Stores binned count matrices in `sdata.tables["var_by_dist_bins"]`. """ adata = sdata.tables[table]