Skip to content

Commit

Permalink
updated sop function
Browse files Browse the repository at this point in the history
  • Loading branch information
JLSteenwyk committed Sep 12, 2024
1 parent 233240a commit 83d93e2
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 61 deletions.
1 change: 0 additions & 1 deletion phykit/services/alignment/column_score.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from collections import Counter
from typing import Dict, List, Tuple

from Bio import AlignIO
Expand Down
4 changes: 1 addition & 3 deletions phykit/services/alignment/create_concatenation_matrix.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import sys
from textwrap import dedent
from typing import Dict, List, Tuple, Union
from typing import Dict, List, Tuple
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections import defaultdict

from .base import Alignment
from ...helpers.files import read_single_column_file_to_list
Expand Down
10 changes: 7 additions & 3 deletions phykit/services/alignment/rename_fasta_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@ def run(self):

def process_args(self, args) -> Dict[str, str]:
output_file_path = f"{args.output or args.fasta}.renamed.fa"
return dict(fasta=args.fasta, idmap=args.idmap, output_file_path=output_file_path)
return dict(
fasta=args.fasta,
idmap=args.idmap,
output_file_path=output_file_path,
)

def load_idmap(self, idmap_file: str) -> Dict[str, str]:
try:
Expand All @@ -44,6 +48,6 @@ def replace_ids_and_write(
with open(output_file_path, "w") as output_file:
for record in records:
if record.id in idmap:
record.id = idmap[record.id] # Replace ID
record.description = "" # Remove description
record.id = idmap[record.id]
record.description = ""
SeqIO.write(record, output_file, "fasta")
77 changes: 28 additions & 49 deletions phykit/services/alignment/sum_of_pairs_score.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from collections import Counter
import itertools
from typing import Dict, List, Tuple

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

from .base import Alignment

Expand All @@ -11,66 +12,44 @@ def __init__(self, args) -> None:
super().__init__(**self.process_args(args))

def run(self):
# create biopython object of sequences for query and
# reference alignments
query_records = SeqIO.to_dict(SeqIO.parse(self.fasta, "fasta"))
reference_records = SeqIO.to_dict(SeqIO.parse(self.reference, "fasta"))

# get all record pairs
record_id_pairs = self.get_record_ids(reference_records)

# calculate how many matches there are and how many total pairs there are
(
number_of_matches,
number_of_total_pairs,
) = self.determine_number_of_matches_and_total_pairs(
record_id_pairs, reference_records, query_records
record_id_pairs = list(
itertools.combinations(reference_records.keys(), 2)
)

# print res
number_of_matches, number_of_total_pairs = \
self.determine_number_of_matches_and_total_pairs(
record_id_pairs, reference_records, query_records
)

print(round(number_of_matches / number_of_total_pairs, 4))

def process_args(self, args) -> dict:
def process_args(self, args) -> Dict[str, str]:
return dict(fasta=args.fasta, reference=args.reference)

def get_record_ids(self, reference_records: dict) -> list:
# loop through record names and save each to
record_ids = []
for entry_name in reference_records:
record_ids.append(entry_name)
# create all pairwise combinations
record_id_pairs = list(itertools.combinations(record_ids, 2))
return record_id_pairs

def determine_number_of_matches_and_total_pairs(
self, record_id_pairs: list, reference_records: dict, query_records: dict
):
self,
record_id_pairs: List[Tuple[str, str]],
reference_records: Dict[str, SeqRecord],
query_records: Dict[str, SeqRecord],
) -> Tuple[int, int]:
print(query_records)
number_of_matches = 0
number_of_total_pairs = 0
# loop through each pair
for record_pair in record_id_pairs:
first_in_pair = record_pair[0]
second_in_pair = record_pair[1]

pairs_in_reference = []
pairs_in_query = []
# for each pair, loop through the length of the alignment and get sequence at each site
for i in range(0, len(reference_records[first_in_pair].seq)):
pairs_in_reference.append(
reference_records[first_in_pair].seq[i]
+ reference_records[second_in_pair].seq[i]
)
for i in range(0, len(query_records[first_in_pair].seq)):
pairs_in_query.append(
query_records[first_in_pair].seq[i]
+ query_records[second_in_pair].seq[i]
)

# count the number of matches and total pairs
matches = list(
(Counter(pairs_in_reference) & Counter(pairs_in_query)).elements()
)
number_of_matches += len(matches)
number_of_total_pairs += len(pairs_in_reference)
for first_in_pair, second_in_pair in record_id_pairs:
ref_seq1 = reference_records[first_in_pair].seq
ref_seq2 = reference_records[second_in_pair].seq
query_seq1 = query_records[first_in_pair].seq
query_seq2 = query_records[second_in_pair].seq

for ref_res1, ref_res2, query_res1, query_res2 in zip(
ref_seq1, ref_seq2, query_seq1, query_seq2
):
number_of_total_pairs += 1
if ref_res1 == query_res1 and ref_res2 == query_res2:
number_of_matches += 1

return number_of_matches, number_of_total_pairs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import sys
from mock import patch, call
from pathlib import Path
from textwrap import dedent

from phykit.phykit import Phykit

Expand All @@ -27,7 +26,7 @@ def test_sum_of_pairs_score_full_ref(self, mocked_print):

@patch("builtins.print")
def test_sum_of_pairs_score_short_ref(self, mocked_print):
expected_result = 0.7714
expected_result = 0.4
testargs = [
"phykit",
"sum_of_pairs_score",
Expand All @@ -41,7 +40,7 @@ def test_sum_of_pairs_score_short_ref(self, mocked_print):

@patch("builtins.print")
def test_sum_of_pairs_score_alias0(self, mocked_print):
expected_result = 0.7714
expected_result = 0.4
testargs = [
"phykit",
"sops",
Expand All @@ -55,7 +54,7 @@ def test_sum_of_pairs_score_alias0(self, mocked_print):

@patch("builtins.print")
def test_sum_of_pairs_score_alias1(self, mocked_print):
expected_result = 0.7714
expected_result = 0.4
testargs = [
"phykit",
"sop",
Expand All @@ -65,4 +64,4 @@ def test_sum_of_pairs_score_alias1(self, mocked_print):
]
with patch.object(sys, "argv", testargs):
Phykit()
assert mocked_print.mock_calls == [call(expected_result)]
assert mocked_print.mock_calls == [call(expected_result)]

0 comments on commit 83d93e2

Please sign in to comment.