Skip to content

Commit

Permalink
[SYNPY-1521] Fixes asDataFrame kwarg Collision Issue (#1132)
Browse files Browse the repository at this point in the history
* fixes kwarg collision

* pre-commit

* adds typehints to _csv_to_pandas_df

* adds unit tests for _csv_to_pandas_df

* pre-commit fix

* fix line separator behavior

* splits comment lines

* pre-commit whitespaces

* removes extra **kwargs

* updates docstrings

* pre-commit
  • Loading branch information
BWMac authored Sep 24, 2024
1 parent 857154e commit 35db9a9
Show file tree
Hide file tree
Showing 2 changed files with 237 additions and 21 deletions.
64 changes: 43 additions & 21 deletions synapseclient/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
import re
import tempfile
from builtins import zip
from typing import Dict, List, Tuple, TypeVar, Union
from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union

from synapseclient.core.constants import concrete_types
from synapseclient.core.exceptions import SynapseError
Expand Down Expand Up @@ -397,16 +397,16 @@ def _convert_df_date_cols_to_datetime(


def _csv_to_pandas_df(
filepath,
separator=DEFAULT_SEPARATOR,
quote_char=DEFAULT_QUOTE_CHARACTER,
escape_char=DEFAULT_ESCAPSE_CHAR,
contain_headers=True,
lines_to_skip=0,
date_columns=None,
list_columns=None,
rowIdAndVersionInIndex=True,
dtype=None,
filepath: str,
separator: str = DEFAULT_SEPARATOR,
quote_char: str = DEFAULT_QUOTE_CHARACTER,
escape_char: str = DEFAULT_ESCAPSE_CHAR,
contain_headers: bool = True,
lines_to_skip: int = 0,
date_columns: Optional[List[str]] = None,
list_columns: Optional[List[str]] = None,
rowIdAndVersionInIndex: bool = True,
dtype: Optional[Dict[str, Any]] = None,
**kwargs,
):
"""
Expand All @@ -415,14 +415,20 @@ def _csv_to_pandas_df(
Arguments:
filepath: The path to the file.
separator: The separator for the file, Defaults to `DEFAULT_SEPARATOR`.
Passed as `sep` to pandas. If `sep` is supplied as a `kwarg`
it will be used instead of this `separator` argument.
quote_char: The quote character for the file,
Defaults to `DEFAULT_QUOTE_CHARACTER`.
Defaults to `DEFAULT_QUOTE_CHARACTER`.
Passed as `quotechar` to pandas. If `quotechar` is supplied as a `kwarg`
it will be used instead of this `quote_char` argument.
escape_char: The escape character for the file,
Defaults to `DEFAULT_ESCAPSE_CHAR`.
contain_headers: Whether the file contains headers,
contain_headers: Whether the file contains headers,
Defaults to `True`.
lines_to_skip: The number of lines to skip at the beginning of the file,
Defaults to `0`.
Defaults to `0`. Passed as `skiprows` to pandas.
If `skiprows` is supplied as a `kwarg`
it will be used instead of this `lines_to_skip` argument.
date_columns: The names of the date columns in the file
list_columns: The names of the list columns in the file
rowIdAndVersionInIndex: Whether the file contains rowId and
Expand All @@ -440,21 +446,26 @@ def _csv_to_pandas_df(

line_terminator = str(os.linesep)

pandas_args = {
"dtype": dtype,
"sep": separator,
"quotechar": quote_char,
"escapechar": escape_char,
"header": 0 if contain_headers else None,
"skiprows": lines_to_skip,
}
pandas_args.update(kwargs)

# assign a line terminator only for single-character
# line terminators (e.g. not '\r\n') because pandas doesn't
# support longer line terminators. See: <https://github.com/pydata/pandas/issues/3501>
# "ValueError: Only length-1 line terminators supported"
df = pd.read_csv(
filepath,
dtype=dtype,
sep=separator,
lineterminator=line_terminator if len(line_terminator) == 1 else None,
quotechar=quote_char,
escapechar=escape_char,
header=0 if contain_headers else None,
skiprows=lines_to_skip,
**kwargs,
**pandas_args,
)

# parse date columns if exists
if date_columns:
df = _convert_df_date_cols_to_datetime(df, date_columns)
Expand Down Expand Up @@ -2425,6 +2436,12 @@ def __init__(
includeRowIdAndRowVersion=None,
headers=None,
):
"""Initialize a CsvFileTable object.
Note: Some arguments provided to this constructor are passed to pandas.read_csv()
including `quoteCharacter`, `escapeCharacter`, `lineEnd`, and `separator`.
These arguments can be overwritten by passing `pandas.read_csv` kwargs to `asDataFrame`.
"""
self.filepath = filepath

self.includeRowIdAndRowVersion = includeRowIdAndRowVersion
Expand Down Expand Up @@ -2481,6 +2498,11 @@ def asDataFrame(
):
"""Convert query result to a Pandas DataFrame.
Note: Class attributes `quoteCharacter`, `escapeCharacter`, `lineEnd`, and `separator`
are used as `quotechar`, `escapechar`, `lineterminator`, and `sep` in `pandas.read_csv`
within this method. If you want to override these values, you can do so by passing the
appropriate keyword arguments to this method.
Arguments:
rowIdAndVersionInIndex: Make the dataframe index consist of the
row_id and row_version (and row_etag if it exists)
Expand Down
194 changes: 194 additions & 0 deletions tests/unit/synapseclient/unit_test_tables.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Unit test for synapseclient.table"""
import csv
import io
import json
import math
import os
import shutil
Expand Down Expand Up @@ -40,6 +41,7 @@
Table,
TableQueryResult,
_convert_df_date_cols_to_datetime,
_csv_to_pandas_df,
_get_view_type_mask,
_get_view_type_mask_for_deprecated_type,
as_table_columns,
Expand Down Expand Up @@ -251,6 +253,198 @@ def test_convert_df_date_cols_to_datetime() -> None:
assert_frame_equal(test_df2, expected_date_df)


def test_csv_to_pandas_df_no_kwargs() -> None:
    """Verify that _csv_to_pandas_df forwards its translated defaults to pandas.read_csv.

    os.linesep is patched to the two-character terminator "\r\n" so the function
    must pass lineterminator=None (pandas only supports length-1 line terminators).
    """
    # GIVEN a pandas DataFrame standing in for a CSV file on disk
    expected_df = pd.DataFrame(
        {"col1": [1, 2, 3], "col2": ["a", "b", "c"], "col3": [True, False, True]}
    )

    with patch.object(
        pd, "read_csv", return_value=expected_df
    ) as mock_read_csv, patch.object(os, "linesep", "\r\n"):
        # WHEN I call _csv_to_pandas_df with default parameters
        df = _csv_to_pandas_df(
            filepath="dummy_path.csv",
            separator=synapseclient.table.DEFAULT_SEPARATOR,
            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
            contain_headers=True,
            lines_to_skip=0,
            date_columns=None,
            list_columns=None,
            rowIdAndVersionInIndex=True,
            dtype=None,
        )

        # THEN I expect pandas.read_csv was called with default arguments;
        # lineterminator is None because the patched os.linesep is longer than
        # one character
        mock_read_csv.assert_called_once_with(
            "dummy_path.csv",
            dtype=None,
            sep=synapseclient.table.DEFAULT_SEPARATOR,
            quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
            escapechar=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
            header=0,
            skiprows=0,
            lineterminator=None,
        )

        # AND I expect the returned DataFrame to be
        # the same as the original DataFrame (file)
        pd.testing.assert_frame_equal(df, expected_df)


def test_csv_to_pandas_df_with_kwargs() -> None:
    """Extra pandas keyword arguments override the translated default arguments."""
    # A stand-in for the parsed CSV contents
    frame = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})

    with patch.object(os, "linesep", "\r\n"), patch.object(
        pd, "read_csv", return_value=frame
    ) as read_csv_mock:
        # Call with kwargs that both collide with a default (escapechar)
        # and add a brand-new pandas option (keep_default_na).
        result = _csv_to_pandas_df(
            filepath="dummy_path.csv",
            separator=synapseclient.table.DEFAULT_SEPARATOR,
            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
            contain_headers=True,
            lines_to_skip=0,
            date_columns=None,
            list_columns=None,
            rowIdAndVersionInIndex=True,
            dtype=None,
            **{"escapechar": "\\", "keep_default_na": False},
        )

        # The kwarg values must win over the defaults derived from the signature.
        read_csv_mock.assert_called_once_with(
            "dummy_path.csv",
            dtype=None,
            sep=synapseclient.table.DEFAULT_SEPARATOR,
            quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
            escapechar="\\",
            header=0,
            skiprows=0,
            keep_default_na=False,
            lineterminator=None,
        )

        # The DataFrame produced by read_csv comes back untouched.
        pd.testing.assert_frame_equal(result, frame)


def test_csv_to_pandas_df_calls_convert_date_cols() -> None:
    """Verify that date columns named in `date_columns` are routed through
    _convert_df_date_cols_to_datetime on the parsed DataFrame."""
    # GIVEN a pandas DataFrame (CSV file stand-in) with a date column
    expected_df = pd.DataFrame(
        {"col1": [1, 2, 3], "date_col": ["2021-01-01", "2021-01-02", "2021-01-03"]}
    )

    with patch.object(pd, "read_csv", return_value=expected_df), patch.object(
        synapseclient.table, "_convert_df_date_cols_to_datetime"
    ) as mock_convert_dates:
        # WHEN I call _csv_to_pandas_df with date_columns specified
        _csv_to_pandas_df(
            filepath="dummy_path.csv",
            separator=synapseclient.table.DEFAULT_SEPARATOR,
            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
            contain_headers=True,
            lines_to_skip=0,
            date_columns=["date_col"],  # Specify date column
            list_columns=None,
            rowIdAndVersionInIndex=True,
            dtype=None,
        )

        # THEN I expect _convert_df_date_cols_to_datetime to be
        # called with the expected DataFrame and date columns
        mock_convert_dates.assert_called_once_with(expected_df, ["date_col"])


def test_csv_to_pandas_df_handles_list_columns() -> None:
    """Verify that columns named in `list_columns` are deserialized from JSON
    strings into Python lists via Series.apply(json.loads)."""
    # GIVEN a pandas DataFrame (CSV file stand-in) with a list column
    initial_df = pd.DataFrame(
        {"col1": [1, 2, 3], "list_col": ["[1, 2, 3]", "[4, 5, 6]", "[7, 8, 9]"]}
    )

    # AND a pandas DataFrame (expected result) with the list column converted to a list
    expected_final_df = pd.DataFrame(
        {"col1": [1, 2, 3], "list_col": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}
    )

    with patch.object(pd, "read_csv", return_value=initial_df), patch.object(
        synapseclient.table, "_convert_df_date_cols_to_datetime"
    ), patch.object(
        pd.Series, "apply", return_value=expected_final_df["list_col"]
    ) as mock_apply:
        # WHEN I call _csv_to_pandas_df with list_columns specified
        result_df = synapseclient.table._csv_to_pandas_df(
            filepath="dummy_path.csv",
            separator=synapseclient.table.DEFAULT_SEPARATOR,
            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
            contain_headers=True,
            lines_to_skip=0,
            date_columns=None,
            list_columns=["list_col"],  # Specify list column
            rowIdAndVersionInIndex=True,
            dtype=None,
        )

        # THEN I expect json.loads to be applied to the list column
        mock_apply.assert_called_once_with(json.loads)

        # AND I expect the returned DataFrame to match the expected DataFrame
        pd.testing.assert_frame_equal(result_df, expected_final_df)


def test_csv_to_pandas_df_handles_row_id_and_version() -> None:
    """Verify that rowIdAndVersionInIndex=True moves ROW_ID/ROW_VERSION out of
    the columns and into a "<row_id>_<row_version>" index."""
    # GIVEN a pandas DataFrame (CSV file stand-in) with ROW_ID and ROW_VERSION columns
    initial_df = pd.DataFrame(
        {
            "ROW_ID": [1, 2, 3],
            "ROW_VERSION": [1, 1, 2],
            "col1": ["a", "b", "c"],
            "col2": [10, 20, 30],
        }
    )

    # AND a pandas DataFrame (expected result)
    # with the ROW_ID and ROW_VERSION columns removed
    expected_final_df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [10, 20, 30]}, index=["1_1", "2_1", "3_2"]
    )  # Index format: ROW_ID_ROW_VERSION

    with patch.object(pd, "read_csv", return_value=initial_df), patch.object(
        synapseclient.table,
        "row_labels_from_id_and_version",
        return_value=["1_1", "2_1", "3_2"],
    ) as mock_row_labels:
        # WHEN I call _csv_to_pandas_df with rowIdAndVersionInIndex=True
        result_df = synapseclient.table._csv_to_pandas_df(
            filepath="dummy_path.csv",
            separator=synapseclient.table.DEFAULT_SEPARATOR,
            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
            contain_headers=True,
            lines_to_skip=0,
            date_columns=None,
            list_columns=None,
            rowIdAndVersionInIndex=True,
            dtype=None,
        )

        # THEN I expect row_labels_from_id_and_version to be called once
        mock_row_labels.assert_called_once()

        # AND I expect the returned DataFrame to match the expected
        # DataFrame with the ROW_ID and ROW_VERSION columns removed
        pd.testing.assert_frame_equal(result_df, expected_final_df)

        # AND I expect the index of the result_df to be as expected
        assert list(result_df.index) == ["1_1", "2_1", "3_2"]


def test_schema() -> None:
schema = Schema(name="My Table", parent="syn1000001")

Expand Down

0 comments on commit 35db9a9

Please sign in to comment.