Skip to content

Commit

Permalink
Merge pull request #126 from haniffalab/dev
Browse files Browse the repository at this point in the history
Create release 0.5.0
  • Loading branch information
davehorsfall authored Dec 21, 2023
2 parents 9c68ff9 + 2d1f98a commit 86f9251
Show file tree
Hide file tree
Showing 32 changed files with 478 additions and 196 deletions.
70 changes: 59 additions & 11 deletions bin/integrate_anndata.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
#!/usr/bin/env python3

from typing import Union
import typing as T
import os
import fire
import zarr
import h5py
import logging
import numpy as np
import pandas as pd
import anndata as ad
Expand All @@ -13,25 +15,27 @@
from pathlib import Path


def reindex_and_concat(
    path: str,
    offset: int,
    features: str = None,
    args: dict[str, T.Any] = None,
    **kwargs,
):
    """Reindex an AnnData file's obs index and optionally concatenate features.

    Reads the AnnData object at `path`, offsets its obs index by `offset`,
    optionally concatenates an external feature matrix, and writes the result
    out as ``reindexed-concat-<input basename>``.

    Args:
        path: Path to the input AnnData (.h5ad) file.
        offset: Integer offset applied to the obs index by `reindex_anndata`.
        features: Feature source spec forwarded to `concat_features`
            ("obs/<col>", "obsm/<key>" or a cell2location .h5ad path).
            Defaults to None (no concatenation).
        args: Extra keyword arguments unpacked into each processing step.
            Defaults to None. (A `None` sentinel is used instead of a
            mutable `{}` default to avoid sharing one dict across calls.)
        **kwargs: Additional keyword arguments forwarded unchanged.
    """
    args = {} if args is None else args
    adata = read_anndata(path)

    adata = reindex_anndata(adata, offset, **args, **kwargs)
    if features:
        adata = concat_features(adata, features, **args, **kwargs)

    out_filename = "reindexed-concat-{}".format(
        os.path.splitext(os.path.basename(path))[0]
    )
    write_anndata(adata, out_filename, **args, **kwargs)

    return


def reindex_anndata(
data: Union[ad.AnnData, str],
offset: int,
no_save: bool = False,
no_save: bool = True,
out_filename: str = None,
**kwargs,
):
Expand All @@ -55,7 +59,7 @@ def reindex_anndata(
def concat_features(
data: Union[ad.AnnData, str],
features: str,
no_save: bool = False,
no_save: bool = True,
out_filename: str = None,
**kwargs,
):
Expand All @@ -68,11 +72,11 @@ def concat_features(
)

if features.endswith(".h5ad") and os.path.isfile(features):
adata = concat_matrix_from_cell2location(adata, features)
adata = concat_matrix_from_cell2location(adata, features, **kwargs)
elif features.startswith("obs/"):
adata = concat_matrix_from_obs(adata, features.split("/")[1])
adata = concat_matrix_from_obs(adata, features.split("/")[1], **kwargs)
elif features.startswith("obsm/"):
adata = concat_matrix_from_obsm(adata, features.split("/")[1])
adata = concat_matrix_from_obsm(adata, features.split("/")[1], **kwargs)

if no_save:
return adata
Expand Down Expand Up @@ -134,10 +138,14 @@ def concat_matrix_from_cell2location(
data: Union[ad.AnnData, str],
c2l_file: str,
q: str = "q05_cell_abundance_w_sf",
sample: str = None,
sample: tuple[str, str] = None,
feature_name: str = "gene",
obs_feature_name: str = None,
sort: bool = True,
sort_index: str = None,
**kwargs,
):
sort = sort or sort_index is not None
if isinstance(data, ad.AnnData):
adata = data
else:
Expand All @@ -153,6 +161,38 @@ def concat_matrix_from_cell2location(
if sample:
c2l_adata = c2l_adata[c2l_adata.obs[sample[0]] == sample[1]]

if sort:
if not sort_index and adata.uns.get("webatlas_reindexed"):
sort_index = "label_id"
if sort_index:
data_idx = adata.obs[sort_index]
else:
data_idx = adata.obs.index
idx = c2l_adata.obs.index.get_indexer(data_idx.tolist())
if -1 in idx: # Indices do not match
logging.error(
"Values do not match between AnnData object's"
f" `{sort_index or 'index'}`"
" and cell2location output index."
)

logging.info("Attempting to match indices as substrings")
try:
data_idx = match_substring_indices(c2l_adata.obs.index, data_idx)
if not data_idx.is_unique:
raise Exception(
"Found non-unique matches between indices as substrings."
)
idx = c2l_adata.obs.index.get_indexer(data_idx.tolist())
if -1 in idx:
raise Exception("Non-matching indices present.")
except Exception:
raise SystemError(
"Failed to find a match between indices as substrings."
)

c2l_adata = c2l_adata[idx,]

c2l_df = pd.DataFrame(
c2l_adata.obsm[q].to_numpy(),
index=c2l_adata.obs.index,
Expand All @@ -162,7 +202,9 @@ def concat_matrix_from_cell2location(
dtype="float32",
)

return concat_matrices(adata, c2l_df, "celltype", feature_name, obs_feature_name)
return concat_matrices(
adata, c2l_df, "celltype", feature_name, obs_feature_name, **kwargs
)


def concat_matrices(
Expand Down Expand Up @@ -269,5 +311,11 @@ def write_anndata(
return


def match_substring_indices(fullstring_idx, substring_idx):
    """Map each value of `substring_idx` to the first entry of
    `fullstring_idx` that contains it as a literal substring.

    Args:
        fullstring_idx: pandas Index of full strings to search within.
        substring_idx: iterable of substrings to locate.

    Returns:
        pd.Series of matched full strings, aligned with `substring_idx`.

    Raises:
        IndexError: if some substring has no match in `fullstring_idx`.
    """
    return pd.Series(substring_idx).apply(
        # regex=False: match literally, so regex metacharacters common in
        # cell IDs (".", "+", "-") cannot change or break the match
        lambda x: fullstring_idx[
            fullstring_idx.str.contains(x, regex=False)
        ].values[0]
    )


if __name__ == "__main__":
fire.Fire()
78 changes: 49 additions & 29 deletions bin/process_h5ad.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,57 @@ def h5ad_to_zarr(
return zarr_file


def reindex_anndata_obs(adata: "ad.AnnData") -> "ad.AnnData":
    """Ensure the obs index is a string-typed numerical index.

    If the current obs index is not numerical, it is preserved in a new
    ``label_id`` column and replaced by 1-based categorical codes (0 is
    avoided because it is typically the background value in label images).
    The reindexing is recorded in ``adata.uns["webatlas_reindexed"]``.

    Args:
        adata: AnnData object to reindex (modified and returned).

    Returns:
        The AnnData object with a string-typed, numerical obs index.
    """
    # check if index is numerical, if not reindex
    if not adata.obs.index.is_integer() and not (
        adata.obs.index.is_object() and all(adata.obs.index.str.isnumeric())
    ):
        IDX_NAME = "label_id"
        if IDX_NAME in adata.obs:
            # rename() is not in-place: assign the result back so the
            # pre-existing column is moved aside before reset_index below,
            # which would otherwise fail on a duplicate column name
            adata.obs = adata.obs.rename(columns={IDX_NAME: f"_{IDX_NAME}"})
        adata.obs = adata.obs.reset_index(names=IDX_NAME)
        adata.obs.index = (
            pd.Categorical(adata.obs[IDX_NAME]).codes + 1
        )  # avoid 0's for possible label images
        adata.uns["webatlas_reindexed"] = True
    adata.obs.index = adata.obs.index.astype(str)

    return adata


def subset_anndata(
    adata: "ad.AnnData",
    obs_subset: tuple[str, T.Any] = None,
    var_subset: tuple[str, T.Any] = None,
) -> "ad.AnnData":
    """Subset an AnnData object by obs and/or var column values.

    Args:
        adata: AnnData object to subset.
        obs_subset: (column name, value or list/tuple of values) pair;
            keeps observations whose column value is among the values.
            Defaults to None (no obs filtering).
        var_subset: (column name, value or list/tuple of values) pair;
            keeps variables whose column value is among the values.
            Defaults to None (no var filtering).

    Returns:
        A (possibly) subsetted view of `adata`.
    """
    # Subset adata by obs.
    # Values are normalized into a local list instead of writing back into
    # the argument: item assignment would mutate the caller's container and
    # raises TypeError when an actual tuple (as annotated) is passed.
    if obs_subset:
        col, values = obs_subset
        if not isinstance(values, (list, tuple)):
            values = [values]
        adata = adata[adata.obs[col].isin(values)]

    # Subset adata by var
    if var_subset:
        col, values = var_subset
        if not isinstance(values, (list, tuple)):
            values = [values]
        adata = adata[:, adata.var[col].isin(values)]

    return adata


def preprocess_anndata(
adata: ad.AnnData,
compute_embeddings: bool = False,
var_index: str = None,
obs_subset: tuple[str, T.Any] = None,
var_subset: tuple[str, T.Any] = None,
**kwargs,
):
"""This function preprocesses an AnnData object, ensuring correct dtypes for zarr conversion
Expand All @@ -140,23 +185,7 @@ def preprocess_anndata(
to use to subset the AnnData object. Defaults to None.
"""

# Subset adata by obs
if obs_subset:
obs_subset[1] = (
[obs_subset[1]]
if not isinstance(obs_subset[1], (list, tuple))
else obs_subset[1]
)
adata = adata[adata.obs[obs_subset[0]].isin(obs_subset[1])]

# Subset adata by var
if var_subset:
var_subset[1] = (
[var_subset[1]]
if not isinstance(var_subset[1], (list, tuple))
else var_subset[1]
)
adata = adata[:, adata.var[var_subset[0]].isin(var_subset[1])]
adata = subset_anndata(adata, obs_subset=obs_subset, var_subset=var_subset)

# reindex var with a specified column
if var_index and var_index in adata.var:
Expand All @@ -165,14 +194,7 @@ def preprocess_anndata(
adata.var.index = adata.var.index.astype(str)
adata.var_names_make_unique()

# check if index is numerical, if not reindex
if not adata.obs.index.is_integer() and not (
adata.obs.index.is_object() and all(adata.obs.index.str.isnumeric())
):
adata.obs["label_id"] = adata.obs.index
adata.obs.index = pd.Categorical(adata.obs.index)
adata.obs.index = adata.obs.index.codes
adata.obs.index = adata.obs.index.astype(str)
adata = reindex_anndata_obs(adata)

# turn obsm into a numpy array
for k in adata.obsm_keys():
Expand All @@ -186,16 +208,14 @@ def preprocess_anndata(
sc.pp.neighbors(adata)
sc.tl.umap(adata)

# ensure data types for obs
for col in adata.obs:
# anndata >= 0.8.0
# if data type is categorical vitessce will throw "path obs/X contains a group" and won"t find .zarray
# if adata.obs[col].dtype == "category":
# adata.obs[col] = adata.obs[col].cat.codes
if adata.obs[col].dtype in ["int8", "int64"]:
adata.obs[col] = adata.obs[col].astype("int32")
if adata.obs[col].dtype == "bool":
adata.obs[col] = adata.obs[col].astype(str).astype("category")

# ensure data types for obsm
for col in adata.obsm:
if type(adata.obsm[col]).__name__ in ["DataFrame", "Series"]:
adata.obsm[col] = adata.obsm[col].to_numpy()
Expand Down
21 changes: 3 additions & 18 deletions bin/process_spaceranger.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import tifffile as tf
from pathlib import Path
from skimage.draw import disk
from process_h5ad import h5ad_to_zarr
from process_h5ad import h5ad_to_zarr, reindex_anndata_obs, subset_anndata


def spaceranger_to_anndata(
Expand Down Expand Up @@ -164,23 +164,9 @@ def visium_label(
scalef = adata.uns["spatial"][sample_id]["scalefactors"]["tissue_hires_scalef"]
shape = [int(hires_shape[0] / scalef), int(hires_shape[1] / scalef)]

# Subset adata by obs
if obs_subset:
obs_subset[1] = (
[obs_subset[1]]
if not isinstance(obs_subset[1], (list, tuple))
else obs_subset[1]
)
adata = adata[adata.obs[obs_subset[0]].isin(obs_subset[1])]
adata = subset_anndata(adata, obs_subset=obs_subset)

# check if index is numerical, if not reindex
if not adata.obs.index.is_integer() and not (
adata.obs.index.is_object() and all(adata.obs.index.str.isnumeric())
):
adata.obs["label_id"] = adata.obs.index
adata.obs.index = pd.Categorical(adata.obs.index)
adata.obs.index = adata.obs.index.codes
adata.obs.index = adata.obs.index.astype(str)
adata = reindex_anndata_obs(adata)

# turn obsm into a numpy array
for k in adata.obsm_keys():
Expand All @@ -189,7 +175,6 @@ def visium_label(
spot_diameter_fullres = adata.uns["spatial"][sample_id]["scalefactors"][
"spot_diameter_fullres"
]
# hires_scalef = adata.uns["spatial"][sample_id]["scalefactors"]["tissue_hires_scalef"]
spot_coords = adata.obsm["spatial"]
assert adata.obs.shape[0] == spot_coords.shape[0]

Expand Down
4 changes: 2 additions & 2 deletions bin/process_xenium.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def xenium_to_anndata(
# pd.Categorical.codes converts them to int this is done manually at this step
# instead of reindex_anndata so we control what matches the label image
adata.obs = adata.obs.reset_index()
adata.obs.index = pd.Categorical(adata.obs["cell_id"]).codes.astype(str)
adata.obs.index = (pd.Categorical(adata.obs["cell_id"]).codes + 1).astype(str)

return adata

Expand Down Expand Up @@ -163,7 +163,7 @@ def xenium_label(
# starting on v1.3 cell_id looks like "aaabinlp-1"
# pd.Categorical.codes converts them to int
# this is required so the label image matches the h5ad ids
ids = pd.Categorical(ids).codes
ids = pd.Categorical(ids).codes + 1

pols = z["polygon_vertices"][1]

Expand Down
Loading

0 comments on commit 86f9251

Please sign in to comment.