AnnData Ops: Adds cell metadata merging (#289)

* Adds cell metadata merging * Avoids pandas object dtype and adds passing tests
ebi-gene-expression-group · Mar 30, 2023 · e3c8659 · e3c8659
1 parent 6c9d530
commit e3c8659
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 2 deletions.
diff --git a/tools/tertiary-analysis/scanpy/anndata_operations.xml b/tools/tertiary-analysis/scanpy/anndata_operations.xml
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="utf-8"?>
-<tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy9" profile="@PROFILE@">
+<tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy91" profile="@PROFILE@">
   <description>modifies metadata and flags genes</description>
   <macros>
     <import>scanpy_macros2.xml</import>
@@ -41,6 +41,10 @@ ln -s '${copy_r.r_source}' r_source.h5 &&
   ln -s '${us}' uns_source_${i}.h5 &&
 #end for
 #end if
+
+#if $add_cell_metadata.default:
+  ## expose the metadata file under the fixed name read by the operations script
+  ln -s '${add_cell_metadata.file}' cell_metadata.tsv &&
+#end if
 python $operations
 ]]></command>
   <configfiles>
@@ -57,9 +61,45 @@ def make_column_values_unique(df, field, new_field=None, suffix = '-duplicate-')
   appendents = (suffix + df.groupby(field).cumcount().astype(str).replace('0','')).replace(suffix, '')
   df[new_field] = df[field].astype(str) + appendents.astype(str)
   return df
-
+	    
 adata = sc.read('input.h5')
 
+#if $add_cell_metadata.default:
+import pandas as pd
+
+def add_cell_metadata(ad, metadata_file="cell_metadata.tsv", drop_duplicates=True):
+  """Left-join a tab-separated cell metadata table onto ad.obs.
+
+  The first column of the file is used as the index and is matched against
+  the obs index. Cells without a metadata row get missing values.
+
+  Parameters:
+    ad: object exposing an obs DataFrame (the AnnData being edited).
+    metadata_file: path of the tab-separated file with a header line.
+    drop_duplicates: if True, metadata columns whose name already exists in
+      obs are discarded; if False they are kept with "_x" appended to the name.
+
+  Returns the merged DataFrame; ad itself is not modified.
+  """
+  metadata_df = pd.read_csv(metadata_file, sep="\t", index_col=0)
+  # Resolve name clashes up front so the obs columns keep their names.
+  for col in ad.obs.columns:
+      if col in metadata_df.columns:
+          if drop_duplicates:
+              print(f"Dropping {col} from the metadata, keeping the obs column")
+              metadata_df = metadata_df.drop(col, axis=1)
+          else:
+              print(f"Renaming {col} to {col}_x")
+              metadata_df.rename(columns={col: col + "_x"}, inplace=True)
+  merged_obs = ad.obs.merge(
+      metadata_df, left_index=True, right_index=True, how="left"
+  )
+  # Object dtype columns are avoided in the result, so convert them after the merge.
+  for o_col in metadata_df.columns:
+      # A name still shared with obs (possible after the "_x" rename) is
+      # suffixed by pandas: "_x" on the obs side, "_y" on the metadata side.
+      col = (o_col + "_y") if o_col in ad.obs.columns else o_col
+      if col not in merged_obs.columns:
+          continue
+      if merged_obs[col].dtype != object:
+          continue
+      prev_dtype = metadata_df[o_col].dtype
+      # Casting back to the original dtype is only safe when the join added no
+      # missing values: astype(bool), for instance, silently turns NaN into True.
+      if prev_dtype == str or prev_dtype == object or merged_obs[col].isna().any():
+          prev_dtype = "category"
+      print(f"Changing {col} from {merged_obs[col].dtype} to {prev_dtype}")
+      merged_obs[col] = merged_obs[col].astype(prev_dtype)
+  return merged_obs
+
+# Swap obs for the merged table; the left join keeps one row per original cell index entry.
+adata.obs = add_cell_metadata(adata)
+#end if
+
 #if $copy_adata_to_raw:
 adata.raw = adata
 #end if
@@ -253,6 +293,13 @@ adata.write('output.h5', compression='gzip')
   <inputs>
     <param name="input_obj_file" argument="input-object-file" type="data" format="h5,h5ad" label="Input object in hdf5 AnnData format"/>
     <expand macro="output_object_params_no_loom"/>
+    <conditional name="add_cell_metadata">
+      <param name="default" type="boolean" checked="false" label="Merge additional cell metadata"/>
+      <when value="true">
+        <param name="file" type="data" label="Cell metadata with headers" help="A tabular file with headers, where the first column contains cell barcodes. Will be merged via a left join, so not all cells in the obs need to be in the metadata. Currently duplicated column headers will be ignored and the originals in the AnnData will be kept." format="tsv,tabular"/>
+      </when>
+      <when value="false"/>
+    </conditional>
     <param name="copy_adata_to_raw" type="boolean" label="Copy AnnData to .raw" help="If activated, it will do 'adata.raw = adata'" checked="false"/>
     <repeat name="modifications" title="Change field names in AnnData observations" min="0">
       <param name="from_obs" type="text" label="Original name" help="Name in observations that you want to change">
@@ -361,6 +408,18 @@ adata.write('output.h5', compression='gzip')
         </assert_contents>
       </output>
     </test>
+    <test>
+      <param name="input_obj_file" value="anndata_ops.h5"/>
+      <conditional name="add_cell_metadata">
+        <param name="default" value="true"/>
+        <param name="file" value="test_incomplete_metadata.tsv"/>
+      </conditional>
+      <output name="output_h5ad" ftype="h5ad">
+        <assert_contents>
+          <has_h5_keys keys="obs/cell_type"/>
+        </assert_contents>
+      </output>
+    </test>
     <test>
       <param name="input_obj_file" value="anndata_ops.h5"/>
       <repeat name="var_modifications" >

diff --git a/tools/tertiary-analysis/scanpy/test-data/test_incomplete_metadata.tsv b/tools/tertiary-analysis/scanpy/test-data/test_incomplete_metadata.tsv
@@ -0,0 +1,6 @@
+barcode	cell_type
+ERR2146881	A
+ERR2146882	A
+ERR2146883	B
+ERR2146884	B
+ERR2146885	C