andim · andim · Mar 18, 2024 · Mar 13, 2024 · Mar 18, 2024 · Mar 18, 2024
diff --git a/pyrepseq/io.py b/pyrepseq/io.py
@@ -24,7 +24,7 @@ def standardize_dataframe(
     """
     This is a utility function to organise a table of TCR-pMHC data into the standard pyrepseq format and perform data cleaning/standardization to ensure that the TCR/MHC gene symbols are IMGT-compliant, the epitopes are all valid amino acid strings, and the CDR3s look valid.
     For further notes on data standardization, see below.
-    The standard format is a table with at least the following columns (not necessarily in order):
+    The standard format is a table with some or all of the following columns (not necessarily in order):
 
     +-----------------+------------------------------------------+-----------+
     | Column Name     | Column should contain                    | Data type |

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -3,11 +3,9 @@
 import pytest
 
 
-@pytest.fixture
-def mock_data_path():
-    return Path("tests") / "resources" / "mock_data.csv"
+RESOURCES_DIR = Path(__file__).resolve().parent / "resources"  # anchored to this file, independent of the CWD
 
 
 @pytest.fixture
-def mock_data_df(mock_data_path):
-    return pd.read_csv(mock_data_path)
+def mock_data_df():  # default (function) scope: the CSV is re-read for each requesting test
+    return pd.read_csv(RESOURCES_DIR.joinpath("mock_data.csv"))
diff --git a/tests/test_coincidence.py b/tests/test_coincidence.py
diff --git a/tests/test_pc.py b/tests/test_pc.py
@@ -0,0 +1,53 @@
+import pandas as pd
+import pyrepseq as prs
+from pytest import approx, mark
+
+
+@mark.filterwarnings("ignore:Inputting paired-chain CDR3 data as a tuple")
+@mark.parametrize(
+    ("arg", "expected"),
+    (
+        (["A", "A"], 1.0),
+        (["A", "B"], 0.0),
+        (["A", "A", "B"], 1.0 / 3.0),
+        ((["A", "A"], ["B", "B"]), 1.0),
+        ((["A", "B"], ["A", "B"]), 0.0),
+        ((["A", "A", "B"], ["C", "C", "C"]), 1.0 / 3.0),
+    ),
+)
+def test_with_one_arg(arg, expected):
+    """pc of one input (list, or tuple of two lists) is the share of item pairs that match."""
+    assert prs.pc(arg) == approx(expected)  # approx: exact == on floats like 1/3 is brittle
+
+
+def test_with_one_df(mock_data_df):
+    """Appending a copy of the last row should produce exactly one coinciding pair."""
+    # Do we want pc to detect clone count column and automatically compute based on that?
+    # Currently it just looks for duplicate rows and calculates based on that.
+    df_with_duplicate = pd.concat([mock_data_df, mock_data_df.iloc[[-1]]])
+    num_items = len(df_with_duplicate)
+    num_pairs = num_items * (num_items - 1) / 2
+    # Assumes the mock rows are otherwise unique -- TODO confirm against mock_data.csv.
+    assert prs.pc(df_with_duplicate) == approx(1.0 / num_pairs)
+
+
+@mark.filterwarnings("ignore:Inputting paired-chain CDR3 data as a tuple")
+@mark.parametrize(
+    ("arg1", "arg2", "expected"),
+    (
+        (["A", "A"], ["A", "A"], 1.0),
+        (["A", "A"], ["B", "B"], 0.0),
+        (["A", "B"], ["A", "B"], 0.5),
+        ((["A", "A"], ["B", "B"]), (["A", "A"], ["B", "B"]), 1.0),
+        ((["A", "B"], ["A", "B"]), (["A", "B"], ["A", "B"]), 0.5),
+    ),
+)
+def test_with_two_args(arg1, arg2, expected):
+    """pc of two inputs is the share of (arg1 item, arg2 item) pairs that match."""
+    assert prs.pc(arg1, arg2) == approx(expected)  # approx: avoid exact float equality
+
+
+def test_with_two_dfs(mock_data_df):
+    """A frame compared with itself: expects only the n self-pairs out of n * n to match."""
+    # Assumes the mock rows are all distinct -- TODO confirm against mock_data.csv.
+    assert prs.pc(mock_data_df, mock_data_df) == approx(1 / len(mock_data_df))
diff --git a/tests/test_pc_delta.py b/tests/test_pc_delta.py
@@ -0,0 +1,47 @@
+import numpy as np
+import pyrepseq as prs
+from pytest import mark
+
+
+@mark.filterwarnings("ignore:Inputting paired-chain CDR3 data as a tuple")
+@mark.parametrize(
+    ("arg", "expected"),
+    (
+        (["A", "A"], np.array([1, 0, 0, 0])),
+        (["A", "B"], np.array([0, 1, 0, 0])),
+        (["A", "A", "B"], np.array([1.0 / 3.0, 2.0 / 3.0, 0, 0])),
+        ((["A", "A"], ["B", "B"]), np.array([1, 0, 0, 0])),
+        ((["A", "B"], ["A", "B"]), np.array([0, 0, 1, 0])),
+    ),
+)
+def test_with_one_arg(arg, expected):
+    """Each expected vector is the fraction of item pairs in each of the 4 bins of range(5)."""
+    np.testing.assert_allclose(prs.pcDelta(arg, bins=range(5)), expected, atol=1e-12)
+
+
+def test_with_one_df(mock_data_df):
+    """Within the mock frame: expects 1/3 of pairs in bin 0 and 2/3 in the last of 11 bins."""
+    results = prs.pcDelta(mock_data_df, bins=range(12))
+    np.testing.assert_allclose(results, [1.0 / 3.0] + [0.0] * 9 + [2.0 / 3.0], atol=1e-12)
+
+
+@mark.filterwarnings("ignore:Inputting paired-chain CDR3 data as a tuple")
+@mark.parametrize(
+    ("arg1", "arg2", "expected"),
+    (
+        (["A", "A"], ["A", "A"], np.array([1, 0, 0, 0])),
+        (["A", "A"], ["B", "B"], np.array([0, 1, 0, 0])),
+        (["A", "B"], ["A", "B"], np.array([0.5, 0.5, 0, 0])),
+        ((["A", "A"], ["B", "B"]), (["A", "A"], ["B", "B"]), np.array([1, 0, 0, 0])),
+        ((["A", "B"], ["A", "B"]), (["A", "B"], ["A", "B"]), np.array([0.5, 0, 0.5, 0])),
+    ),
+)
+def test_with_two_args(arg1, arg2, expected):
+    """Each expected vector is the fraction of cross pairs in each of the 4 bins of range(5)."""
+    np.testing.assert_allclose(prs.pcDelta(arg1, arg2, bins=range(5)), expected, atol=1e-12)
+
+
+def test_with_two_dfs(mock_data_df):
+    """Frame vs. itself: expects 5/9 of pairs in bin 0 and 4/9 in the last of 11 bins."""
+    results = prs.pcDelta(mock_data_df, mock_data_df, bins=range(12))
+    np.testing.assert_allclose(results, [5.0 / 9.0] + [0.0] * 9 + [4.0 / 9.0], atol=1e-12)