import pandas as pd


def add_cell_metadata(ad, metadata_file="cell_metadata.tsv", drop_duplicates=True):
    """Left-join a tab-separated cell metadata table onto ``ad.obs``.

    The first column of ``metadata_file`` is used as the index and matched
    against the index of ``ad.obs``. Rows of ``ad.obs`` without a matching
    metadata row are kept and receive missing values in the new columns.

    Args:
        ad: Object exposing an ``obs`` pandas DataFrame (an AnnData object in
            the calling script).
        metadata_file: Path or file-like object readable by
            ``pandas.read_csv`` with ``sep="\\t"``.
        drop_duplicates: When true, metadata columns whose name already
            exists in ``ad.obs`` are discarded; otherwise they are kept
            under the name ``<column>_x``.

    Returns:
        A new DataFrame holding the columns of ``ad.obs`` followed by the
        metadata columns. Metadata columns that would be ``object`` dtype
        after the merge are cast to ``category``, or back to their
        pre-merge dtype when that dtype can represent every merged value.
    """
    metadata_df = pd.read_csv(metadata_file, sep="\t", index_col=0)

    # Existing obs columns are never overwritten: a clashing metadata column
    # is either dropped or moved out of the way before merging.
    for col in ad.obs.columns:
        if col not in metadata_df.columns:
            continue
        if drop_duplicates:
            print(f"Dropping {col} from metadata, it already exists in obs")
            metadata_df = metadata_df.drop(col, axis=1)
        else:
            print(f"Renaming {col} to {col}_x")
            metadata_df = metadata_df.rename(columns={col: col + "_x"})

    merged_obs = ad.obs.merge(
        metadata_df, left_index=True, right_index=True, how="left"
    )

    # Avoid leaving object-dtype columns behind after the merge.
    for o_col in metadata_df.columns:
        # If a name clash survived the step above, pandas suffixes the
        # right-hand (metadata) column with "_y"; otherwise the name is kept.
        col = o_col if o_col in merged_obs.columns else o_col + "_y"
        if col not in merged_obs.columns:
            continue
        if merged_obs[col].dtype != object:
            continue
        prev_dtype = metadata_df[o_col].dtype
        # Casting back to e.g. bool would silently turn the missing values
        # introduced by the left join into True, so use category whenever
        # the merged column has gaps.
        has_missing = bool(merged_obs[col].isna().any())
        if prev_dtype == str or prev_dtype == object or has_missing:
            prev_dtype = "category"
        print(f"Changing {col} from {merged_obs[col].dtype} to {prev_dtype}")
        merged_obs[col] = merged_obs[col].astype(prev_dtype)
    return merged_obs