Skip to content

Commit

Permalink
AnnData Ops: Adds cell metadata merging (#289)
Browse files Browse the repository at this point in the history
* Adds cell metadata merging

* Avoids pandas object dtype and adds passing tests
  • Loading branch information
pcm32 authored Mar 30, 2023
1 parent 6c9d530 commit e3c8659
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 2 deletions.
63 changes: 61 additions & 2 deletions tools/tertiary-analysis/scanpy/anndata_operations.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy9" profile="@PROFILE@">
<tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy91" profile="@PROFILE@">
<description>modifies metadata and flags genes</description>
<macros>
<import>scanpy_macros2.xml</import>
Expand Down Expand Up @@ -41,6 +41,10 @@ ln -s '${copy_r.r_source}' r_source.h5 &&
ln -s '${us}' uns_source_${i}.h5 &&
#end for
#end if
## Expose the selected metadata dataset under the fixed name the Python script reads.
## The path is single-quoted, like the other ln -s calls in this command block,
## so dataset paths containing spaces or shell metacharacters stay one argument.
#if $add_cell_metadata.default:
ln -s '${add_cell_metadata.file}' cell_metadata.tsv &&
#end if
python $operations
]]></command>
<configfiles>
Expand All @@ -57,9 +61,45 @@ def make_column_values_unique(df, field, new_field=None, suffix = '-duplicate-')
appendents = (suffix + df.groupby(field).cumcount().astype(str).replace('0','')).replace(suffix, '')
df[new_field] = df[field].astype(str) + appendents.astype(str)
return df

adata = sc.read('input.h5')

#if $add_cell_metadata.default:
import pandas as pd

def add_cell_metadata(ad, metadata_file="cell_metadata.tsv", drop_duplicates=True):
    """Left-join a tab-separated cell metadata table onto ``ad.obs``.

    The first column of the metadata file is used as its index and is matched
    against the index of ``ad.obs``. Rows of ``ad.obs`` without a match get
    missing values in the added columns.

    Args:
        ad: object exposing an ``obs`` pandas DataFrame.
        metadata_file: path or file-like object accepted by ``pandas.read_csv``.
        drop_duplicates: when True, metadata columns whose name already exists
            in ``ad.obs`` are discarded; when False they are kept under the
            same name with an ``_x`` suffix.

    Returns:
        A new DataFrame holding the columns of ``ad.obs`` followed by the
        metadata columns. ``ad.obs`` itself is not modified.
    """
    metadata_df = pd.read_csv(metadata_file, sep="\t", index_col=0)

    # Columns already present in obs are never overwritten: the clashing
    # metadata column is either discarded or moved to a suffixed name.
    for col in ad.obs.columns:
        if col not in metadata_df.columns:
            continue
        if drop_duplicates:
            print(f"Dropping {col} from the metadata, keeping the column already in obs")
            metadata_df = metadata_df.drop(col, axis=1)
        else:
            print(f"Renaming {col} to {col}_x")
            metadata_df = metadata_df.rename(columns={col: col + "_x"})

    merged_obs = ad.obs.merge(
        metadata_df, left_index=True, right_index=True, how="left"
    )

    # Metadata columns that came out of the merge as object dtype are recast.
    for o_col in metadata_df.columns:
        # Prefer the exact name. The "_y" form is only consulted when pandas
        # had to suffix the right-hand column because of a remaining clash;
        # looking at suffixed names first could pick an unrelated obs column.
        if o_col in merged_obs.columns:
            col = o_col
        elif o_col + "_y" in merged_obs.columns:
            col = o_col + "_y"
        else:
            continue

        if merged_obs[col].dtype != object:
            continue

        prev_dtype = metadata_df[o_col].dtype
        # Text columns become categorical. So does any column that picked up
        # missing values in the join: casting it back to its source dtype
        # (bool, for example) would silently turn every missing entry into a
        # regular value.
        # NOTE(review): confirm the AnnData writer in use accepts categoricals
        # with non-string categories.
        if prev_dtype == str or prev_dtype == object or merged_obs[col].isna().any():
            prev_dtype = "category"
        print(f"Changing {col} from {merged_obs[col].dtype} to {prev_dtype}")
        merged_obs[col] = merged_obs[col].astype(prev_dtype)

    return merged_obs

# Swap the observation table for the merged one returned by add_cell_metadata.
adata.obs = add_cell_metadata(adata)
#end if

#if $copy_adata_to_raw:
adata.raw = adata
#end if
Expand Down Expand Up @@ -253,6 +293,13 @@ adata.write('output.h5', compression='gzip')
<inputs>
<param name="input_obj_file" argument="input-object-file" type="data" format="h5,h5ad" label="Input object in hdf5 AnnData format"/>
<expand macro="output_object_params_no_loom"/>
<conditional name="add_cell_metadata">
<param name="default" type="boolean" checked="false" label="Merge additional cell metadata"/>
<when value="true">
<param name="file" type="data" label="Cell metadata with headers" help="A tabular file with headers, where the first column contains cell barcodes. Will be merged via a left join, so not all cells in the obs need to be in the metadata. Currently duplicated column headers will be ignored and the originals in the AnnData will be kept." format="tsv,tabular"/>
</when>
<when value="false"/>
</conditional>
<param name="copy_adata_to_raw" type="boolean" label="Copy AnnData to .raw" help="If activated, it will do 'adata.raw = adata'" checked="false"/>
<repeat name="modifications" title="Change field names in AnnData observations" min="0">
<param name="from_obs" type="text" label="Original name" help="Name in observations that you want to change">
Expand Down Expand Up @@ -361,6 +408,18 @@ adata.write('output.h5', compression='gzip')
</assert_contents>
</output>
</test>
<test>
<param name="input_obj_file" value="anndata_ops.h5"/>
<conditional name="add_cell_metadata">
<param name="default" value="true"/>
<param name="file" value="test_incomplete_metadata.tsv"/>
</conditional>
<output name="output_h5ad" ftype="h5ad">
<assert_contents>
<has_h5_keys keys="obs/cell_type"/>
</assert_contents>
</output>
</test>
<test>
<param name="input_obj_file" value="anndata_ops.h5"/>
<repeat name="var_modifications" >
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
barcode cell_type
ERR2146881 A
ERR2146882 A
ERR2146883 B
ERR2146884 B
ERR2146885 C

0 comments on commit e3c8659

Please sign in to comment.