merge: Avoid overwriting the output id column with non-id columns

This edge case occurs when an input table has an id column used for joining (in tests, literally "id", but it may be any valid id column name) and also has another, non-id column whose name matches the output id column name (in tests, literally "strain").
nextstrain · Aug 21, 2024 · 40f4e2e · 40f4e2e
1 parent 242d67f
commit 40f4e2e
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 2 deletions.
diff --git a/augur/merge.py b/augur/merge.py
@@ -13,7 +13,9 @@
 column or overwriting values in an existing column.  For columns appearing in
 more than one table, non-empty values on the right hand side overwrite values
 on the left hand side.  The first table's id column name is used as the output
-id column name.
+id column name.  Non-id columns in other input tables that would conflict with
+this output id column name are prefixed with two underscores ("__") in the
+output table.
 
 One generated column per input table is appended to the end of the output
 table to identify the source of each row's data.  Column names are generated
@@ -186,7 +188,24 @@ def run(args):
             for column in m.columns:
                 # Match different id column names in different metadata files
                 # since they're logically equivalent.
-                output_column = output_id_column if column == m.id_column else column
+                if column == m.id_column:
+                    output_column = output_id_column
+
+                # Don't overwrite output id column (i.e. first table's id
+                # column) with a non-id column of the same name (i.e. from a
+                # subsequent table).
+                #
+                # XXX TODO: Alternatively, we could a) skip such columns rather
+                # than mangle their names, or b) allow for output columns with
+                # non-unique names (i.e. distinguishable only by position).
+                # Would either of those be preferable?  Should it be
+                # configurable?
+                #   -trs, 21 Aug 2024
+                elif column == output_id_column:
+                    output_column = f"__{column}"
+                    print_info(f"WARNING: Renaming column in {m.name!r} from {column!r} to {output_column!r} because it conflicts with the output id column name ({output_id_column!r}).")
+                else:
+                    output_column = column
 
                 output_columns.setdefault(output_column, [])
                 output_columns[output_column] += [(m.table_name, column)]

diff --git a/tests/functional/merge/cram/merge.t b/tests/functional/merge/cram/merge.t
@@ -143,6 +143,27 @@ Metadata field values with metachars (field or record delimiters) are handled pr
   x"	1	1
   two	X2a	X2b	X2c				1	1
 
+A non-id column is renamed in the output when its name conflicts with the output id column name.
+
+  $ cat >id-and-strain.csv <<~~
+  > id,strain
+  > one,1
+  > two,2
+  > three,3
+  > ~~
+  $ ${AUGUR} merge \
+  >   --metadata strain-only=x.tsv id-and-strain=id-and-strain.csv \
+  >   --metadata-id-columns id strain \
+  >   --output-metadata - | csv2tsv --csv-delim $'\t' | tsv-pretty
+  Reading 'strain-only' metadata from 'x.tsv'…
+  Reading 'id-and-strain' metadata from 'id-and-strain.csv'…
+  WARNING: Renaming column in 'id-and-strain' from 'strain' to '__strain' because it conflicts with the output id column name ('strain').
+  Merging metadata and writing to '-'…
+  strain  a    b    c    __strain  __source_metadata_strain-only  __source_metadata_id-and-strain
+  one     X1a  X1b  X1c         1                              1                                1
+  two     X2a  X2b  X2c         2                              1                                1
+  three                         3                              0                                1
+
 
 ERROR HANDLING