Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update tutorial for Xenium pooling #80

Merged
merged 12 commits into from
Dec 14, 2024
13 changes: 8 additions & 5 deletions docs/source/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
.. autosummary::
:toctree: generated

load_hest
iter_hest
```

## Run HEST-Benchmark
Expand Down Expand Up @@ -44,6 +44,8 @@

## Pooling of transcripts, binning

Methods used to pool Xenium transcripts and Visium-HD bins into square bins of custom size

```{eval-rst}
.. currentmodule:: hest.readers

Expand Down Expand Up @@ -72,7 +74,7 @@
correct_batch_effect
```

## Resolving gene name aliases
## Gene names manipulation

```{eval-rst}
.. currentmodule:: hest.HESTData
Expand All @@ -81,12 +83,13 @@
:toctree: generated

unify_gene_names
ensembl_id_to_gene
```


## Readers to augment HEST-1k
## Readers to expand HEST-1k

Readers to create new HEST-1k samples.
Readers to expand HEST-1k with additional samples.

```{eval-rst}
.. currentmodule:: hest.readers
Expand All @@ -103,7 +106,7 @@ Readers to create new HEST-1k samples.


## CellViT segmentation
Nuclei segmentation methods
Simplified API for nuclei segmentation


```{eval-rst}
Expand Down
2 changes: 1 addition & 1 deletion docs/source/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Welcome to hest's documentation!

`hest` is a python library for H&E/ST pairs manipulation. It was used to assemble the <em>HEST-1k</em> dataset.

For the documentations of core WSI manipulations methods please visit the [hestcore documentation](https://hestcore.readthedocs.io/en/latest/)
For the documentations of core WSI manipulations methods please visit the [hestcore documentation](https://hestcore.readthedocs.io/en/latest/) (work in progress)

```{eval-rst}
.. card:: Installation
Expand Down
15 changes: 12 additions & 3 deletions src/hest/HESTData.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,8 +681,8 @@ def read_hest_wsi(wsi: WSI, width, height):

return SpatialData(tables=new_table, images=images, shapes=shapes)

def ensembleID_to_gene(self):
ensembleID_to_gene(self)
def ensembl_id_to_gene(self):
ensembl_id_to_gene(self)


class VisiumHESTData(HESTData):
Expand Down Expand Up @@ -1058,6 +1058,15 @@ def __len__(self):
return len(self.id_list)

def iter_hest(hest_dir: str, id_list: List[str] = None, **read_kwargs) -> HESTIterator:
""" Iterate through the HEST samples contained in `hest_dir`

Args:
hest_dir (str): hest directory containing folders: st, wsis, metadata, tissue_seg (optional)
id_list (List[str], Optional): list of ids to read (ex: ['TENX96', 'TENX99']), pass None to read all available samples. Default to None

Returns:
HESTIterator: HESTData iterator
"""
return HESTIterator(hest_dir, id_list, **read_kwargs)

def _read_st(hest_dir, st_filename, load_transcripts=False):
Expand Down Expand Up @@ -1247,7 +1256,7 @@ def unify_gene_names(adata: sc.AnnData, species="human", drop=False) -> sc.AnnDa
# TODO return dict map of renamed, and remaining
return adata

def ensembleID_to_gene(st: HESTData, filter_na = False) -> HESTData:
def ensembl_id_to_gene(st: HESTData, filter_na = False) -> HESTData:
"""
Converts ensemble gene IDs of a HESTData object using Biomart annotations and filter out genes with no matching Ensembl ID

Expand Down
4 changes: 2 additions & 2 deletions src/hest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from .utils import tiff_save, find_pixel_size_from_spot_coords, write_10X_h5, get_k_genes, SpotPacking
from .autoalign import autoalign_visium
from .readers import *
from .HESTData import HESTData, read_HESTData, load_hest, iter_hest, ensembleID_to_gene
from .HESTData import HESTData, read_HESTData, load_hest, iter_hest, ensembl_id_to_gene
from .segmentation.cell_segmenters import segment_cellvit

__all__ = [
Expand All @@ -21,5 +21,5 @@
'write_10X_h5',
'HESTData',
'segment_cellvit',
'ensembleID_to_gene'
'ensembl_id_to_gene'
]
29 changes: 16 additions & 13 deletions src/hest/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -903,7 +903,8 @@ def read(
cell_bound_path: str = None,
dapi_path = None,
load_img=True,
use_dask=False
use_dask=False,
spot_size_um=100.
) -> XeniumHESTData:
""" Read a Xenium sample

Expand All @@ -919,6 +920,7 @@ def read(
dapi_path (_type_, optional): path to a `morphology_focus_0000.ome.tif`/`morphology_focus.ome.tif` file. Defaults to None.
load_img (bool, optional): whenever to load the WSI. Defaults to True.
use_dask (bool, optional): whenever to load the transcript dataframe with DASK (recommended if the transcript dataframe does not fit into the RAM). Defaults to False.
spot_size_um (float, optional): transcripts are pooled into squares of spot_size_um x spot_size_um mirometers and then stored in `HESTData.adata`

Returns:
XeniumHESTData: Xenium sample
Expand Down Expand Up @@ -963,11 +965,12 @@ def read(
transcript_df,
dict['pixel_size_um_estimated'],
key_x='he_x',
key_y='he_y'
key_y='he_y',
spot_size_um=spot_size_um
)

dict['spot_diameter'] = 55.
dict['inter_spot_dist'] = 100.
dict['spot_diameter'] = spot_size_um
dict['inter_spot_dist'] = spot_size_um
dict['spots_under_tissue'] = len(adata.obs)

else:
Expand Down Expand Up @@ -1044,20 +1047,20 @@ def pool_transcripts_xenium(
key_x='he_x',
key_y='he_y',
) -> sc.AnnData: # type: ignore
""" Pool a xenium transcript dataframe by square spots of size 100um
""" Pool a xenium transcript dataframe by square spots of `spot_size_um` micrometers.

Args:
df (pd.DataFrame): xenium transcipts dataframe containing columns:
- 'he_x' and 'he_y' indicating the pixel coordinates of each transcripts in the morphology image
- 'feature_name' indicating the transcript name
df (Union[pd.DataFrame, dd.DataFrame]): xenium transcipts dataframe containing columns:
- 'he_x' and 'he_y' indicating the pixel coordinates of each transcripts in the morphology image
- 'feature_name' indicating the transcript name
pixel_size_he (float): pixel_size in um on the he image
spot_size_um: pooling rectangle width in um
key_x: column name of pixel x coordinate of each transcript in `df`
key_y: column name of pixel y coordinate of each transcript in `df`


Returns:
sc.AnnData: AnnData object, each obs represents the sum of transcripts within that bin, coordinates of the center of each bin (in pixel on WSI) are in adata.obsm['spatial']
sc.AnnData: AnnData object, each row in .obs represents a bin, each row in `.X` represents the sum of transcripts within that bin. Center coordinates of each bin (in pixel on WSI) are in adata.obsm['spatial']
"""
import scanpy as sc
import dask.dataframe as dd
Expand Down Expand Up @@ -1129,18 +1132,18 @@ def pool_transcripts_xenium(
return adata


def pool_bins_visiumhd(adata: sc.AnnData, pixel_size: float, dst_bin_size_um=128, src_bin_size_um=16, chunk_len=50000) -> sc.AnnData: # type: ignore
""" Pool visium hd bins
def pool_bins_visiumhd(adata: sc.AnnData, pixel_size: float, dst_bin_size_um=128, src_bin_size_um: Literal[2, 8, 16]=16, chunk_len=50000) -> sc.AnnData: # type: ignore
""" Pool a Visium HD (with a source resolution of `src_bin_size_um`) by square spots of `spot_size_um` micrometers.

Args:
adata (sc.AnnData): adata containing spot center coordiniates in `pxl_row_in_fullres` and `pxl_col_in_fullres`
pixel_size (float): pixel size of the WSI in um/px
dst_bin_size_um (int, optional): target bin size in um. Defaults to 128.
src_bin_size_um (int, optional): bin size of `adata` in um. Defaults to 16.
src_bin_size_um (Literal[2, 8, 16], optional): bin size of `adata` in um. Defaults to 16.
chunk_len (int, optional): chunk size when binning a larger than RAM `adata` (this is for RAM optimization only). Defaults to 50000.

Returns:
sc.AnnData: adata with pooled bins
sc.AnnData: AnnData object, each row in .obs represents a bin, each row in `.X` represents the sum of visium-hd sub-bins (of size `src_bin_size_um`) within that larger bin (of size `dst_bin_size_um`). Center coordinates of each bin (in pixel on WSI) are in adata.obsm['spatial']
"""
import scanpy as sc

Expand Down
4 changes: 2 additions & 2 deletions tests/hest_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from hest.autoalign import autoalign_visium
from hest.readers import VisiumReader
from hest.HESTData import ensembleID_to_gene
from hest.HESTData import ensembl_id_to_gene
from hest.utils import load_image


Expand Down Expand Up @@ -136,7 +136,7 @@ def setUpClass(self):
#def test_conversion_ensembleID(self):
# for idx, st in enumerate(self.sts):
# with self.subTest(st_object=idx):
# ensembleID_to_gene(st)
# ensembl_id_to_gene(st)


def test_tissue_seg(self):
Expand Down
131 changes: 126 additions & 5 deletions tutorials/2-Interacting-with-HEST-1k.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,62 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\HEST\\HEST\\lib\\site-packages\\hestcore\\wsi.py:27: UserWarning: CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\n",
" warnings.warn(\"CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\")\n",
"d:\\HEST\\HEST\\lib\\site-packages\\scanpy\\preprocessing\\_qc.py:432: RuntimeWarning: invalid value encountered in divide\n",
" return values / sums[:, None]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"* Scanpy adata:\n",
"AnnData object with n_obs × n_vars = 11845 × 541\n",
" obs: 'in_tissue', 'pxl_col_in_fullres', 'pxl_row_in_fullres', 'array_col', 'array_row', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito'\n",
" var: 'mito', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'\n",
" uns: 'spatial'\n",
" obsm: 'spatial'\n",
"\n",
"* WSI:\n",
"<width=48376, height=53738, backend=OpenSlideWSI>\n",
"\n",
"* Shapes:\n",
"[name: cellvit, coord-system: he, <not loaded>, name: xenium_cell, coord-system: he, <not loaded>, name: xenium_nucleus, coord-system: he, <not loaded>]\n",
"\n",
"* Tissue contours:\n",
" tissue_id geometry\n",
"0 0 POLYGON ((14052 2848, 14025 2874, 13998 2874, ...\n",
"\n",
"* SpatialData conversion:\n",
"SpatialData object\n",
"├── Images\n",
"│ ├── 'ST_downscaled_hires_image': SpatialImage[cyx] (3, 3358, 3023)\n",
"│ └── 'ST_downscaled_lowres_image': SpatialImage[cyx] (3, 1000, 900)\n",
"├── Shapes\n",
"│ ├── 'cellvit': GeoDataFrame shape: (497508, 3) (2D shapes)\n",
"│ ├── 'locations': GeoDataFrame shape: (11845, 2) (2D shapes)\n",
"│ ├── 'tissue_contours': GeoDataFrame shape: (1, 2) (2D shapes)\n",
"│ ├── 'xenium_cell': GeoDataFrame shape: (574852, 1) (2D shapes)\n",
"│ └── 'xenium_nucleus': GeoDataFrame shape: (574852, 1) (2D shapes)\n",
"└── Tables\n",
" └── 'table': AnnData (11845, 541)\n",
"with coordinate systems:\n",
" ▸ 'ST_downscaled_hires', with elements:\n",
" ST_downscaled_hires_image (Images), cellvit (Shapes), locations (Shapes), tissue_contours (Shapes), xenium_cell (Shapes), xenium_nucleus (Shapes)\n",
" ▸ 'ST_downscaled_lowres', with elements:\n",
" ST_downscaled_lowres_image (Images), cellvit (Shapes), locations (Shapes), tissue_contours (Shapes), xenium_cell (Shapes), xenium_nucleus (Shapes)\n"
]
}
],
"source": [
"from hest import iter_hest\n",
"\n",
Expand Down Expand Up @@ -168,7 +221,10 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Patching"
"## Changing the Patching/pooling size\n",
"\n",
"### Patching\n",
"You can change the size of patches around the spots with `dump_patches`:"
]
},
{
Expand All @@ -188,6 +244,71 @@
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Changing the Pooling size\n",
"\n",
"You can change the pooling size of Xenium/Visium HD samples stored in `HESTData.adata` with the following:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\HEST\\HEST\\lib\\site-packages\\hestcore\\wsi.py:27: UserWarning: CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\n",
" warnings.warn(\"CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\")\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
"from hest.readers import pool_transcripts_xenium\n",
"from hest import iter_hest\n",
"\n",
"new_spot_size = 200\n",
"\n",
"\n",
"# Iterate through a subset of hest\n",
"for st in iter_hest('../hest_data', id_list=['TENX95'], load_transcripts=True,):\n",
" print(st.transcript_df)\n",
"\n",
" # Feel free to convert st.transcript_df to a Dask DataFrame if you are working with limited RAM.\n",
"\n",
" st.adata = pool_transcripts_xenium(\n",
" st.transcript_df, \n",
" st.pixel_size, \n",
" key_x='he_x',\n",
" key_y='he_y',\n",
" spot_size_um=new_spot_size\n",
" )\n",
"\n",
"\n",
" # We change the spots, so we need to re-extract the patches\n",
" st.dump_patches(\n",
" patch_save_dir,\n",
" name='demo',\n",
" target_patch_size=224, # target patch size in 224\n",
" target_pixel_size=0.5 # pixel size of the patches in um/px after rescaling\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -292,7 +413,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "cuml",
"display_name": "HEST",
"language": "python",
"name": "python3"
},
Expand All @@ -306,7 +427,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
"version": "3.10.11"
}
},
"nbformat": 4,
Expand Down
Loading