Skip to content

Commit

Permalink
improved species-specific inparalog pruning
Browse files Browse the repository at this point in the history
  • Loading branch information
JLSteenwyk committed Jun 23, 2023
1 parent d2062c1 commit f154433
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 26 deletions.
5 changes: 4 additions & 1 deletion docs/change_log/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,7 @@ Change log
Major changes to OrthoSNAP are summarized here.

**0.1.0**
Added -r/\-\-rooted, -st/\-\-snap_trees, and -ip/\-\-inparalog_to_keep functions
Added -r/\-\-rooted, -st/\-\-snap_trees, and -ip/\-\-inparalog_to_keep functions

**1.0.0**
Improved species-specific inparalog pruning
3 changes: 1 addition & 2 deletions orthosnap/args_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,12 @@ def process_args(args) -> dict:
if not output_path.endswith("/"):
output_path = output_path + "/"
else:
output_path = re.sub("/[^/]+$", '', fasta)
output_path = re.sub("/[^/]+$", "", fasta)
if output_path == fasta:
output_path = "./"
elif not output_path.endswith("/"):
output_path = output_path + "/"


if args.inparalog_to_keep:
inparalog_to_keep = InparalogToKeep(args.inparalog_to_keep)
else:
Expand Down
51 changes: 35 additions & 16 deletions orthosnap/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,29 @@ def collapse_low_support_bipartitions(newtree, support: float):
return newtree


def determine_if_dups_are_sister(subtree_tips: list):
def determine_if_dups_are_sister(subtree_tips: list, newtree):
"""
determine if dups are sister to one another, i.e., the clade defined by their common ancestor contains no tips other than the dups
"""
# get first set of subtree tips
first_set_of_subtree_tips = subtree_tips[0]
# first_set_of_subtree_tips = subtree_tips[0]
# first_set_of_subtree_tips = subtree_tips
# set if duplicate sequences are sister as True
are_sisters = True
# check if duplicate sequences are sister
for set_of_subtree_tips in subtree_tips[1:]:
if first_set_of_subtree_tips != set_of_subtree_tips:
are_sisters = False
if not are_sisters:
break
# create a copy of the tree
dup_tree = copy.deepcopy(newtree)

dup_tree = dup_tree.common_ancestor(subtree_tips)
_, all_tips = get_all_tips_and_taxa_names(dup_tree)
if set(all_tips) != set(subtree_tips):
are_sisters = False

# # check if duplicate sequences are sister
# for set_of_subtree_tips in subtree_tips[1:]:
# if first_set_of_subtree_tips != set_of_subtree_tips:
# are_sisters = False
# if not are_sisters:
# break

return are_sisters

Expand Down Expand Up @@ -154,10 +163,11 @@ def handle_multi_copy_subtree(
# if the taxon is represented by more than one sequence
if counts_of_taxa_from_terms[name] > 1:
# get subtree tips
subtree_tips, dups = get_subtree_tips(terms, name, tree)
_, dups = get_subtree_tips(terms, name, tree)

# check if duplicate sequences are sister to one another
are_sisters = determine_if_dups_are_sister(subtree_tips)
# are_sisters = determine_if_dups_are_sister(subtree_tips)
are_sisters = determine_if_dups_are_sister(dups, newtree)

# if duplicate sequences are sister, get the longest sequence
if are_sisters:
Expand Down Expand Up @@ -214,7 +224,14 @@ def handle_single_copy_subtree(
subgroup_counter,
assigned_tips,
) = write_output_fasta_and_account_for_assigned_tips_single_copy_case(
fasta, subgroup_counter, terms, fasta_dict, assigned_tips, snap_trees, newtree, output_path
fasta,
subgroup_counter,
terms,
fasta_dict,
assigned_tips,
snap_trees,
newtree,
output_path,
)

return subgroup_counter, assigned_tips
Expand All @@ -230,7 +247,6 @@ def inparalog_to_keep_determination(
"""
determine which inparalog to keep among duplicate sequences that are sister
"""

lengths = dict()
# keep inparalog based on sequence length
if inparalog_to_keep.value in [
Expand Down Expand Up @@ -313,17 +329,20 @@ def write_output_fasta_and_account_for_assigned_tips_single_copy_case(
newtree,
output_path: str,
):

# write output
fasta_path_stripped = re.sub("^.*/", '', fasta)
output_file_name = f"{output_path}/{fasta_path_stripped}.orthosnap.{subgroup_counter}.fa"
fasta_path_stripped = re.sub("^.*/", "", fasta)
output_file_name = (
f"{output_path}/{fasta_path_stripped}.orthosnap.{subgroup_counter}.fa"
)
with open(output_file_name, "w") as output_handle:
for term in terms:
SeqIO.write(fasta_dict[term], output_handle, "fasta")
assigned_tips.append(term)

if snap_tree:
output_file_name = f"{output_path}/{fasta_path_stripped}.orthosnap.{subgroup_counter}.tre"
output_file_name = (
f"{output_path}/{fasta_path_stripped}.orthosnap.{subgroup_counter}.tre"
)
Phylo.write(newtree, output_file_name, "newick")

subgroup_counter += 1
Expand Down
9 changes: 8 additions & 1 deletion orthosnap/orthosnap.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,14 @@ def execute(

# write user args to stdout
write_user_args(
tree, fasta, support, occupancy, rooted, snap_trees, inparalog_to_keep, output_path
tree,
fasta,
support,
occupancy,
rooted,
snap_trees,
inparalog_to_keep,
output_path,
)

# create start time logger
Expand Down
2 changes: 1 addition & 1 deletion orthosnap/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.1.1"
__version__ = "1.0.0"
7 changes: 5 additions & 2 deletions orthosnap/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,16 @@ def write_user_args(
)
)


def write_output_stats(fasta, subgroup_counter, start_time, snap_trees, output_path):
"""
Function to print out output statistics
"""

fasta_path_stripped = re.sub("^.*/", '', fasta)
output_file_name = f"{output_path}/{fasta_path_stripped}.orthosnap.{subgroup_counter}.fa"
fasta_path_stripped = re.sub("^.*/", "", fasta)
output_file_name = (
f"{output_path}/{fasta_path_stripped}.orthosnap.{subgroup_counter}.fa"
)

if subgroup_counter > 0:
print(
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/test_args_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def test_inparalog_to_keep_longest_branch_len(self, args):
assert res == InparalogToKeep.longest_branch_len

def test_output_path(self, args):
args.output_path="./tests/samples/"
args.output_path = "./tests/samples/"
res = process_args(args)
assert res["output_path"] == "./tests/samples/"

Expand All @@ -118,14 +118,14 @@ def test_output_path_none(self, args):
args.fasta = "tests/expected/test_support_value_60_OG0000010/OG0000010.renamed.fa.mafft.clipkit.orthosnap.0.fa"
res = process_args(args)
assert res["output_path"] == "tests/expected/test_support_value_60_OG0000010/"

def test_output_path_no_slash(self, args):
args.output_path = "./tests/samples"
res = process_args(args)
assert res["output_path"] == "./tests/samples/"

def test_output_path_none(self, args):
args.fasta = "requirements.txt" # fake stand in file
args.fasta = "requirements.txt" # fake stand in file
args.output_path = None
res = process_args(args)
assert res["output_path"] == "./"

0 comments on commit f154433

Please sign in to comment.