Merge pull request #34 from sanogenetics/feature/upstream-v2.1.0

Feature/upstream v2.1.0
apriha · Nov 24, 2020 · 15246c2 · 15246c2
2 parents 48f3bee + 3a03565
commit 15246c2
Show file tree

Hide file tree

Showing 6 changed files with 83 additions and 6 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -20,7 +20,7 @@ script:
   # so this ensures all Python versions will be periodically integration tested with the
   # resource servers
   - set -e
-  - NUM_JOBS=3
+  - NUM_JOBS=4
   - SELECTED_JOB=$((10#$(date +%V) % $NUM_JOBS))  # identify a job based on week of the year
   - |
     if [[ $TRAVIS_PULL_REQUEST != "false" && $SELECTED_JOB == $JOB_ID ]]; then
@@ -58,6 +58,8 @@ jobs:
       env: JOB_ID=1
     - python: 3.8
       env: JOB_ID=2
+    - python: 3.9
+      env: JOB_ID=3
     - stage: deploy
       python: 3.6
       script: skip

diff --git a/README.rst b/README.rst
@@ -6,6 +6,9 @@ snps
 ====
 tools for reading, writing, merging, and remapping SNPs 🧬
 
+``snps`` *strives to be an easy-to-use and accessible open-source library for working with
+genotype data*
+
 Features
 --------
 Input / Output
@@ -162,6 +165,9 @@ discrepant SNPs are available for inspection after the merge via properties of t
 Additionally, any non-called / null genotypes will be updated during the merge, if the file
 being merged has a called genotype for the SNP.
 
+Moreover, ``merge`` takes an optional ``chrom`` parameter; when specified, only the SNPs
+associated with that chromosome (e.g., "Y" or "MT") are merged.
+
 Finally, ``merge`` returns a list of ``dict``, where each ``dict`` has information corresponding
 to the results of each merge (e.g., SNPs in common).
 

diff --git a/setup.py b/setup.py
@@ -107,6 +107,7 @@
         "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
         "Topic :: Scientific/Engineering",
         "Topic :: Scientific/Engineering :: Bio-Informatics",
         "Topic :: Scientific/Engineering :: Information Analysis",

diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py
@@ -606,12 +606,13 @@ def read_ancestry(self, file, compression):
         """
 
         def parser():
-            # read files with multiple separators
             df = pd.read_csv(
                 file,
                 comment="#",
                 header=0,
-                delim_whitespace=True,  # https://stackoverflow.com/a/41320761
+                engine="c",
+                sep="\s+",
+                # delim_whitespace=True,  # https://stackoverflow.com/a/15026839
                 na_values=0,
                 names=["rsid", "chrom", "pos", "allele1", "allele2"],
                 index_col=0,

diff --git a/src/snps/snps.py b/src/snps/snps.py
@@ -1250,6 +1250,7 @@ def merge(
         discrepant_positions_threshold=100,
         discrepant_genotypes_threshold=500,
         remap=True,
+        chrom="",
     ):
         """ Merge other ``SNPs`` objects into this ``SNPs`` object.
 
@@ -1266,6 +1267,8 @@ def merge(
         remap : bool
             if necessary, remap other ``SNPs`` objects to have the same build as this ``SNPs`` object
             before merging
+        chrom : str, optional
+            chromosome to merge (e.g., "1", "Y", "MT"); if empty (default), all chromosomes are merged
 
         Returns
         -------
@@ -1334,11 +1337,19 @@ def merge_dfs(s):
                 s.discrepant_vcf_position
             )
 
-        def merge_snps(s, positions_threshold, genotypes_threshold):
+        def merge_snps(s, positions_threshold, genotypes_threshold, merge_chrom):
             # merge SNPs, identifying those with discrepant positions and genotypes; update NAs
 
             # identify common SNPs (i.e., any rsids being added that already exist in self.snps)
-            df = self.snps.join(s.snps, how="inner", rsuffix="_added")
+            df = (
+                self.snps.join(s.snps, how="inner", rsuffix="_added")
+                if not merge_chrom
+                else self.snps.loc[self.snps.chrom == merge_chrom].join(
+                    s.snps.loc[s.snps.chrom == merge_chrom],
+                    how="inner",
+                    rsuffix="_added",
+                )
+            )
 
             common_rsids = df.index
 
@@ -1386,7 +1397,11 @@ def merge_snps(s, positions_threshold, genotypes_threshold):
                 return (False,)
 
             # add new SNPs
-            self._snps = self.snps.combine_first(s.snps)
+            self._snps = (
+                self.snps.combine_first(s.snps)
+                if not merge_chrom
+                else self.snps.combine_first(s.snps.loc[s.snps.chrom == merge_chrom])
+            )
             # combine_first converts position to float64, so convert it back to uint32
             self._snps["pos"] = self.snps["pos"].astype(np.uint32)
 
@@ -1455,6 +1470,7 @@ def merge_snps(s, positions_threshold, genotypes_threshold):
                     snps_object,
                     discrepant_positions_threshold,
                     discrepant_genotypes_threshold,
+                    chrom,
                 )
 
                 if merged:

diff --git a/tests/test_snps.py b/tests/test_snps.py
@@ -768,6 +768,57 @@ def test_appending_dfs(self):
             s.discrepant_vcf_position, get_empty_snps_dataframe(), check_exact=True
         )
 
+    def test_merge_chrom(self):
+        s1 = SNPs("tests/input/generic.csv")
+        df = s1.snps.append(
+            self.create_snp_df(
+                rsid=["rs100", "rs101", "rs102", "rs103"],
+                chrom=["Y", "Y", "Y", "Y"],
+                pos=[100, 101, 102, 103],
+                genotype=["A", np.nan, "A", "A"],
+            )
+        )
+        s1._snps = df.copy()
+        s2 = SNPs()
+        s2._build = 37
+        s2._snps = df.copy()
+
+        # set values for chrom that will be ignored (that would otherwise result in
+        # identification of discrepant SNPs or updating genotype)
+        s2._snps.loc["rs3", "pos"] = 1003  # discrepant position
+        s2._snps.loc["rs4", "genotype"] = "AA"  # discrepant genotype
+        s2._snps.loc["rs5", "genotype"] = "AA"
+
+        # set values for chrom to be merged
+        s2._snps.loc["rs100", "genotype"] = "T"  # discrepant genotype
+        s2._snps.loc["rs101", "genotype"] = "A"
+        s2._snps.loc["rs102", "pos"] = 1002  # discrepant position
+
+        # set expected values for merge result
+        df.loc["rs100", "genotype"] = np.nan  # discrepant genotype sets to np.nan
+        df.loc["rs101", "genotype"] = "A"  # updates np.nan
+
+        results = s1.merge([s2], chrom="Y")
+
+        pd.testing.assert_frame_equal(s1.snps, df, check_exact=True)
+
+        self.assert_results(
+            results,
+            [
+                {
+                    "merged": True,
+                    "common_rsids": pd.Index(
+                        ["rs100", "rs101", "rs102", "rs103"], name="rsid"
+                    ),
+                    "discrepant_position_rsids": pd.Index(["rs102"], name="rsid"),
+                    "discrepant_genotype_rsids": pd.Index(["rs100"], name="rsid"),
+                }
+            ],
+        )
+
+        self.assertEqual(len(s1.discrepant_merge_positions), 1)
+        self.assertEqual(len(s1.discrepant_merge_genotypes), 1)
+
 
 class TestDeprecatedMethods(TestSnps):
     def run_deprecated_test(self, f, msg):