Skip to content

Commit

Permalink
Merge pull request #34 from sanogenetics/feature/upstream-v2.1.0
Browse files Browse the repository at this point in the history
Feature/upstream v2.1.0
  • Loading branch information
afaulconbridge committed Nov 24, 2020
2 parents 48f3bee + 3a03565 commit 15246c2
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 6 deletions.
4 changes: 3 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ script:
# so this ensures all Python versions will be periodically integration tested with the
# resource servers
- set -e
- NUM_JOBS=3
- NUM_JOBS=4
- SELECTED_JOB=$((10#$(date +%V) % $NUM_JOBS)) # identify a job based on week of the year
- |
if [[ $TRAVIS_PULL_REQUEST != "false" && $SELECTED_JOB == $JOB_ID ]]; then
Expand Down Expand Up @@ -58,6 +58,8 @@ jobs:
env: JOB_ID=1
- python: 3.8
env: JOB_ID=2
- python: 3.9
env: JOB_ID=3
- stage: deploy
python: 3.6
script: skip
Expand Down
6 changes: 6 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ snps
====
tools for reading, writing, merging, and remapping SNPs 🧬

``snps`` *strives to be an easy-to-use and accessible open-source library for working with
genotype data*

Features
--------
Input / Output
Expand Down Expand Up @@ -162,6 +165,9 @@ discrepant SNPs are available for inspection after the merge via properties of t
Additionally, any non-called / null genotypes will be updated during the merge, if the file
being merged has a called genotype for the SNP.

Moreover, ``merge`` takes a ``chrom`` parameter - this enables merging of only SNPs associated
with the specified chromosome (e.g., "Y" or "MT").

Finally, ``merge`` returns a list of ``dict``, where each ``dict`` has information corresponding
to the results of each merge (e.g., SNPs in common).

Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Topic :: Scientific/Engineering",
"Topic :: Scientific/Engineering :: Bio-Informatics",
"Topic :: Scientific/Engineering :: Information Analysis",
Expand Down
5 changes: 3 additions & 2 deletions src/snps/io/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,12 +606,13 @@ def read_ancestry(self, file, compression):
"""

def parser():
# read files with multiple separators
df = pd.read_csv(
file,
comment="#",
header=0,
delim_whitespace=True, # https://stackoverflow.com/a/41320761
engine="c",
sep="\s+",
# delim_whitespace=True, # https://stackoverflow.com/a/15026839
na_values=0,
names=["rsid", "chrom", "pos", "allele1", "allele2"],
index_col=0,
Expand Down
22 changes: 19 additions & 3 deletions src/snps/snps.py
Original file line number Diff line number Diff line change
Expand Up @@ -1250,6 +1250,7 @@ def merge(
discrepant_positions_threshold=100,
discrepant_genotypes_threshold=500,
remap=True,
chrom="",
):
""" Merge other ``SNPs`` objects into this ``SNPs`` object.
Expand All @@ -1266,6 +1267,8 @@ def merge(
remap : bool
if necessary, remap other ``SNPs`` objects to have the same build as this ``SNPs`` object
before merging
chrom : str, optional
chromosome to merge (e.g., "1", "Y", "MT")
Returns
-------
Expand Down Expand Up @@ -1334,11 +1337,19 @@ def merge_dfs(s):
s.discrepant_vcf_position
)

def merge_snps(s, positions_threshold, genotypes_threshold):
def merge_snps(s, positions_threshold, genotypes_threshold, merge_chrom):
# merge SNPs, identifying those with discrepant positions and genotypes; update NAs

# identify common SNPs (i.e., any rsids being added that already exist in self.snps)
df = self.snps.join(s.snps, how="inner", rsuffix="_added")
df = (
self.snps.join(s.snps, how="inner", rsuffix="_added")
if not merge_chrom
else self.snps.loc[self.snps.chrom == merge_chrom].join(
s.snps.loc[s.snps.chrom == merge_chrom],
how="inner",
rsuffix="_added",
)
)

common_rsids = df.index

Expand Down Expand Up @@ -1386,7 +1397,11 @@ def merge_snps(s, positions_threshold, genotypes_threshold):
return (False,)

# add new SNPs
self._snps = self.snps.combine_first(s.snps)
self._snps = (
self.snps.combine_first(s.snps)
if not merge_chrom
else self.snps.combine_first(s.snps.loc[s.snps.chrom == merge_chrom])
)
# combine_first converts position to float64, so convert it back to uint32
self._snps["pos"] = self.snps["pos"].astype(np.uint32)

Expand Down Expand Up @@ -1455,6 +1470,7 @@ def merge_snps(s, positions_threshold, genotypes_threshold):
snps_object,
discrepant_positions_threshold,
discrepant_genotypes_threshold,
chrom,
)

if merged:
Expand Down
51 changes: 51 additions & 0 deletions tests/test_snps.py
Original file line number Diff line number Diff line change
Expand Up @@ -768,6 +768,57 @@ def test_appending_dfs(self):
s.discrepant_vcf_position, get_empty_snps_dataframe(), check_exact=True
)

def test_merge_chrom(self):
    """Check that ``merge`` with ``chrom="Y"`` only processes chromosome Y SNPs.

    Two ``SNPs`` objects are built from the same base data plus four extra
    SNPs on chromosome Y (rs100 - rs103). The second object is then modified
    both on SNPs that should be ignored by the merge and on chromosome Y SNPs.
    The merge result must only reflect the chromosome Y changes.
    """
    s1 = SNPs("tests/input/generic.csv")
    # pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
    # and removed in pandas 2.0
    df = pd.concat(
        [
            s1.snps,
            self.create_snp_df(
                rsid=["rs100", "rs101", "rs102", "rs103"],
                chrom=["Y", "Y", "Y", "Y"],
                pos=[100, 101, 102, 103],
                genotype=["A", np.nan, "A", "A"],
            ),
        ]
    )
    s1._snps = df.copy()
    s2 = SNPs()
    s2._build = 37
    s2._snps = df.copy()

    # set values for chrom that will be ignored (that would otherwise result in
    # identification of discrepant SNPs or updating genotype)
    s2._snps.loc["rs3", "pos"] = 1003  # discrepant position
    s2._snps.loc["rs4", "genotype"] = "AA"  # discrepant genotype
    s2._snps.loc["rs5", "genotype"] = "AA"

    # set values for chrom to be merged
    s2._snps.loc["rs100", "genotype"] = "T"  # discrepant genotype
    s2._snps.loc["rs101", "genotype"] = "A"
    s2._snps.loc["rs102", "pos"] = 1002  # discrepant position

    # set expected values for merge result
    df.loc["rs100", "genotype"] = np.nan  # discrepant genotype sets to np.nan
    df.loc["rs101", "genotype"] = "A"  # updates np.nan

    results = s1.merge([s2], chrom="Y")

    pd.testing.assert_frame_equal(s1.snps, df, check_exact=True)

    # only the chromosome Y SNPs are reported as common / discrepant
    self.assert_results(
        results,
        [
            {
                "merged": True,
                "common_rsids": pd.Index(
                    ["rs100", "rs101", "rs102", "rs103"], name="rsid"
                ),
                "discrepant_position_rsids": pd.Index(["rs102"], name="rsid"),
                "discrepant_genotype_rsids": pd.Index(["rs100"], name="rsid"),
            }
        ],
    )

    self.assertEqual(len(s1.discrepant_merge_positions), 1)
    self.assertEqual(len(s1.discrepant_merge_genotypes), 1)


class TestDeprecatedMethods(TestSnps):
def run_deprecated_test(self, f, msg):
Expand Down

0 comments on commit 15246c2

Please sign in to comment.