Skip to content

Commit

Permalink
[SYNPY-1521] Fixes asDataFrame kwarg Collision Issue (#1132)
Browse files Browse the repository at this point in the history
* fixes kwarg collision

* pre-commit

* adds typehints to _csv_to_pandas_df

* adds unit tests for _csv_to_pandas_df

* pre-commit fix

* fix line separator behavior

* splits comment lines

* pre-commit whitespaces

* removes extra **kwargs

* updates docstrings

* pre-commit
  • Loading branch information
BWMac authored Sep 24, 2024
1 parent 857154e commit 35db9a9
Show file tree
Hide file tree
Showing 2 changed files with 237 additions and 21 deletions.
64 changes: 43 additions & 21 deletions synapseclient/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
import re
import tempfile
from builtins import zip
from typing import Dict, List, Tuple, TypeVar, Union
from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union

from synapseclient.core.constants import concrete_types
from synapseclient.core.exceptions import SynapseError
Expand Down Expand Up @@ -397,16 +397,16 @@ def _convert_df_date_cols_to_datetime(


def _csv_to_pandas_df(
filepath,
separator=DEFAULT_SEPARATOR,
quote_char=DEFAULT_QUOTE_CHARACTER,
escape_char=DEFAULT_ESCAPSE_CHAR,
contain_headers=True,
lines_to_skip=0,
date_columns=None,
list_columns=None,
rowIdAndVersionInIndex=True,
dtype=None,
filepath: str,
separator: str = DEFAULT_SEPARATOR,
quote_char: str = DEFAULT_QUOTE_CHARACTER,
escape_char: str = DEFAULT_ESCAPSE_CHAR,
contain_headers: bool = True,
lines_to_skip: int = 0,
date_columns: Optional[List[str]] = None,
list_columns: Optional[List[str]] = None,
rowIdAndVersionInIndex: bool = True,
dtype: Optional[Dict[str, Any]] = None,
**kwargs,
):
"""
Expand All @@ -415,14 +415,20 @@ def _csv_to_pandas_df(
Arguments:
filepath: The path to the file.
separator: The separator for the file, Defaults to `DEFAULT_SEPARATOR`.
Passed as `sep` to pandas. If `sep` is supplied as a `kwarg`
it will be used instead of this `separator` argument.
quote_char: The quote character for the file,
Defaults to `DEFAULT_QUOTE_CHARACTER`.
Defaults to `DEFAULT_QUOTE_CHARACTER`.
Passed as `quotechar` to pandas. If `quotechar` is supplied as a `kwarg`
it will be used instead of this `quote_char` argument.
escape_char: The escape character for the file,
Defaults to `DEFAULT_ESCAPSE_CHAR`.
contain_headers: Whether the file contains headers,
contain_headers: Whether the file contains headers,
Defaults to `True`.
lines_to_skip: The number of lines to skip at the beginning of the file,
Defaults to `0`.
Defaults to `0`. Passed as `skiprows` to pandas.
If `skiprows` is supplied as a `kwarg`
it will be used instead of this `lines_to_skip` argument.
date_columns: The names of the date columns in the file
list_columns: The names of the list columns in the file
rowIdAndVersionInIndex: Whether the file contains rowId and
Expand All @@ -440,21 +446,26 @@ def _csv_to_pandas_df(

line_terminator = str(os.linesep)

pandas_args = {
"dtype": dtype,
"sep": separator,
"quotechar": quote_char,
"escapechar": escape_char,
"header": 0 if contain_headers else None,
"skiprows": lines_to_skip,
}
pandas_args.update(kwargs)

# assign a line terminator only for single-character
# line terminators (e.g. not '\r\n') because pandas doesn't
# support longer line terminators. See: <https://github.com/pydata/pandas/issues/3501>
# "ValueError: Only length-1 line terminators supported"
df = pd.read_csv(
filepath,
dtype=dtype,
sep=separator,
lineterminator=line_terminator if len(line_terminator) == 1 else None,
quotechar=quote_char,
escapechar=escape_char,
header=0 if contain_headers else None,
skiprows=lines_to_skip,
**kwargs,
**pandas_args,
)

# parse date columns if exists
if date_columns:
df = _convert_df_date_cols_to_datetime(df, date_columns)
Expand Down Expand Up @@ -2425,6 +2436,12 @@ def __init__(
includeRowIdAndRowVersion=None,
headers=None,
):
"""Initialize a CsvFileTable object.
Note: Some arguments provided to this constructor are passed to pandas.read_csv()
including `quoteCharacter`, `escapeCharacter`, `lineEnd`, and `separator`.
These arguments can be overwritten by passing `pandas.read_csv` kwargs to `asDataFrame`.
"""
self.filepath = filepath

self.includeRowIdAndRowVersion = includeRowIdAndRowVersion
Expand Down Expand Up @@ -2481,6 +2498,11 @@ def asDataFrame(
):
"""Convert query result to a Pandas DataFrame.
Note: Class attributes `quoteCharacter`, `escapeCharacter`, `lineEnd`, and `separator`
are used as `quotechar`, `escapechar`, `lineterminator`, and `sep` in `pandas.read_csv`
within this method. If you want to override these values, you can do so by passing the
appropriate keyword arguments to this method.
Arguments:
rowIdAndVersionInIndex: Make the dataframe index consist of the
row_id and row_version (and row_etag if it exists)
Expand Down
194 changes: 194 additions & 0 deletions tests/unit/synapseclient/unit_test_tables.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Unit test for synapseclient.table"""
import csv
import io
import json
import math
import os
import shutil
Expand Down Expand Up @@ -40,6 +41,7 @@
Table,
TableQueryResult,
_convert_df_date_cols_to_datetime,
_csv_to_pandas_df,
_get_view_type_mask,
_get_view_type_mask_for_deprecated_type,
as_table_columns,
Expand Down Expand Up @@ -251,6 +253,198 @@ def test_convert_df_date_cols_to_datetime() -> None:
assert_frame_equal(test_df2, expected_date_df)


def test_csv_to_pandas_df_no_kwargs() -> None:
    """Verify that _csv_to_pandas_df forwards its translated defaults to pandas.read_csv.

    os.linesep is patched to the two-character terminator "\r\n" so the function
    must pass lineterminator=None (pandas only supports length-1 line terminators).
    """
    # GIVEN a pandas DataFrame standing in for a CSV file on disk
    expected_df = pd.DataFrame(
        {"col1": [1, 2, 3], "col2": ["a", "b", "c"], "col3": [True, False, True]}
    )

    with patch.object(
        pd, "read_csv", return_value=expected_df
    ) as mock_read_csv, patch.object(os, "linesep", "\r\n"):
        # WHEN I call _csv_to_pandas_df with default parameters
        df = _csv_to_pandas_df(
            filepath="dummy_path.csv",
            separator=synapseclient.table.DEFAULT_SEPARATOR,
            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
            contain_headers=True,
            lines_to_skip=0,
            date_columns=None,
            list_columns=None,
            rowIdAndVersionInIndex=True,
            dtype=None,
        )

        # THEN I expect pandas.read_csv was called with default arguments;
        # lineterminator is None because the patched os.linesep is longer than
        # one character
        mock_read_csv.assert_called_once_with(
            "dummy_path.csv",
            dtype=None,
            sep=synapseclient.table.DEFAULT_SEPARATOR,
            quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
            escapechar=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
            header=0,
            skiprows=0,
            lineterminator=None,
        )

        # AND I expect the returned DataFrame to be
        # the same as the original DataFrame (file)
        pd.testing.assert_frame_equal(df, expected_df)


def test_csv_to_pandas_df_with_kwargs() -> None:
    """Extra pandas keyword arguments override the translated default arguments."""
    # A stand-in for the parsed CSV contents
    frame = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})

    with patch.object(os, "linesep", "\r\n"), patch.object(
        pd, "read_csv", return_value=frame
    ) as read_csv_mock:
        # Call with kwargs that both collide with a default (escapechar)
        # and add a brand-new pandas option (keep_default_na).
        result = _csv_to_pandas_df(
            filepath="dummy_path.csv",
            separator=synapseclient.table.DEFAULT_SEPARATOR,
            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
            contain_headers=True,
            lines_to_skip=0,
            date_columns=None,
            list_columns=None,
            rowIdAndVersionInIndex=True,
            dtype=None,
            **{"escapechar": "\\", "keep_default_na": False},
        )

        # The kwarg values must win over the defaults derived from the signature.
        read_csv_mock.assert_called_once_with(
            "dummy_path.csv",
            dtype=None,
            sep=synapseclient.table.DEFAULT_SEPARATOR,
            quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
            escapechar="\\",
            header=0,
            skiprows=0,
            keep_default_na=False,
            lineterminator=None,
        )

        # The DataFrame produced by read_csv comes back untouched.
        pd.testing.assert_frame_equal(result, frame)


def test_csv_to_pandas_df_calls_convert_date_cols() -> None:
    """Verify that date columns named in `date_columns` are routed through
    _convert_df_date_cols_to_datetime on the parsed DataFrame."""
    # GIVEN a pandas DataFrame (CSV file stand-in) with a date column
    expected_df = pd.DataFrame(
        {"col1": [1, 2, 3], "date_col": ["2021-01-01", "2021-01-02", "2021-01-03"]}
    )

    with patch.object(pd, "read_csv", return_value=expected_df), patch.object(
        synapseclient.table, "_convert_df_date_cols_to_datetime"
    ) as mock_convert_dates:
        # WHEN I call _csv_to_pandas_df with date_columns specified
        _csv_to_pandas_df(
            filepath="dummy_path.csv",
            separator=synapseclient.table.DEFAULT_SEPARATOR,
            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
            contain_headers=True,
            lines_to_skip=0,
            date_columns=["date_col"],  # Specify date column
            list_columns=None,
            rowIdAndVersionInIndex=True,
            dtype=None,
        )

        # THEN I expect _convert_df_date_cols_to_datetime to be
        # called with the expected DataFrame and date columns
        mock_convert_dates.assert_called_once_with(expected_df, ["date_col"])


def test_csv_to_pandas_df_handles_list_columns() -> None:
    """Verify that columns named in `list_columns` are deserialized from JSON
    strings into Python lists via Series.apply(json.loads)."""
    # GIVEN a pandas DataFrame (CSV file stand-in) with a list column
    initial_df = pd.DataFrame(
        {"col1": [1, 2, 3], "list_col": ["[1, 2, 3]", "[4, 5, 6]", "[7, 8, 9]"]}
    )

    # AND a pandas DataFrame (expected result) with the list column converted to a list
    expected_final_df = pd.DataFrame(
        {"col1": [1, 2, 3], "list_col": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}
    )

    with patch.object(pd, "read_csv", return_value=initial_df), patch.object(
        synapseclient.table, "_convert_df_date_cols_to_datetime"
    ), patch.object(
        pd.Series, "apply", return_value=expected_final_df["list_col"]
    ) as mock_apply:
        # WHEN I call _csv_to_pandas_df with list_columns specified
        result_df = synapseclient.table._csv_to_pandas_df(
            filepath="dummy_path.csv",
            separator=synapseclient.table.DEFAULT_SEPARATOR,
            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
            contain_headers=True,
            lines_to_skip=0,
            date_columns=None,
            list_columns=["list_col"],  # Specify list column
            rowIdAndVersionInIndex=True,
            dtype=None,
        )

        # THEN I expect json.loads to be applied to the list column
        mock_apply.assert_called_once_with(json.loads)

        # AND I expect the returned DataFrame to match the expected DataFrame
        pd.testing.assert_frame_equal(result_df, expected_final_df)


def test_csv_to_pandas_df_handles_row_id_and_version() -> None:
    """Verify that rowIdAndVersionInIndex=True moves ROW_ID/ROW_VERSION out of
    the columns and into a "<row_id>_<row_version>" index."""
    # GIVEN a pandas DataFrame (CSV file stand-in) with ROW_ID and ROW_VERSION columns
    initial_df = pd.DataFrame(
        {
            "ROW_ID": [1, 2, 3],
            "ROW_VERSION": [1, 1, 2],
            "col1": ["a", "b", "c"],
            "col2": [10, 20, 30],
        }
    )

    # AND a pandas DataFrame (expected result)
    # with the ROW_ID and ROW_VERSION columns removed
    expected_final_df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [10, 20, 30]}, index=["1_1", "2_1", "3_2"]
    )  # Index format: ROW_ID_ROW_VERSION

    with patch.object(pd, "read_csv", return_value=initial_df), patch.object(
        synapseclient.table,
        "row_labels_from_id_and_version",
        return_value=["1_1", "2_1", "3_2"],
    ) as mock_row_labels:
        # WHEN I call _csv_to_pandas_df with rowIdAndVersionInIndex=True
        result_df = synapseclient.table._csv_to_pandas_df(
            filepath="dummy_path.csv",
            separator=synapseclient.table.DEFAULT_SEPARATOR,
            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
            contain_headers=True,
            lines_to_skip=0,
            date_columns=None,
            list_columns=None,
            rowIdAndVersionInIndex=True,
            dtype=None,
        )

        # THEN I expect row_labels_from_id_and_version to be called once
        mock_row_labels.assert_called_once()

        # AND I expect the returned DataFrame to match the expected
        # DataFrame with the ROW_ID and ROW_VERSION columns removed
        pd.testing.assert_frame_equal(result_df, expected_final_df)

        # AND I expect the index of the result_df to be as expected
        assert list(result_df.index) == ["1_1", "2_1", "3_2"]


def test_schema() -> None:
schema = Schema(name="My Table", parent="syn1000001")

Expand Down

0 comments on commit 35db9a9

Please sign in to comment.