diff --git a/docs/change_log/index.rst b/docs/change_log/index.rst index 6c892b93..3cb29487 100644 --- a/docs/change_log/index.rst +++ b/docs/change_log/index.rst @@ -9,4 +9,7 @@ Change log Major changes to OrthoSNAP are summarized here. **0.1.0** -Added -r/\-\-rooted, -st/\-\-snap_trees, and -ip/\-\-inparalog_to_keep functions \ No newline at end of file +Added -r/\-\-rooted, -st/\-\-snap_trees, and -ip/\-\-inparalog_to_keep functions + +**1.0.0** +Improved species inparalog pruning diff --git a/orthosnap/args_processing.py b/orthosnap/args_processing.py index d888b448..834f880f 100644 --- a/orthosnap/args_processing.py +++ b/orthosnap/args_processing.py @@ -52,13 +52,12 @@ def process_args(args) -> dict: if not output_path.endswith("/"): output_path = output_path + "/" else: - output_path = re.sub("/[^/]+$", '', fasta) + output_path = re.sub("/[^/]+$", "", fasta) if output_path == fasta: output_path = "./" elif not output_path.endswith("/"): output_path = output_path + "/" - if args.inparalog_to_keep: inparalog_to_keep = InparalogToKeep(args.inparalog_to_keep) else: diff --git a/orthosnap/helper.py b/orthosnap/helper.py index 2c85f7be..9007439e 100644 --- a/orthosnap/helper.py +++ b/orthosnap/helper.py @@ -28,20 +28,29 @@ def collapse_low_support_bipartitions(newtree, support: float): return newtree -def determine_if_dups_are_sister(subtree_tips: list): +def determine_if_dups_are_sister(subtree_tips: list, newtree): """ determine if dups are sister to one another """ # get first set of subtree tips - first_set_of_subtree_tips = subtree_tips[0] + # first_set_of_subtree_tips = subtree_tips[0] + # first_set_of_subtree_tips = subtree_tips # set if duplicate sequences are sister as True are_sisters = True - # check if duplicate sequences are sister - for set_of_subtree_tips in subtree_tips[1:]: - if first_set_of_subtree_tips != set_of_subtree_tips: - are_sisters = False - if not are_sisters: - break + # create a copy of the tree + dup_tree = copy.deepcopy(newtree) + + dup_tree = dup_tree.common_ancestor(subtree_tips) + _, all_tips = get_all_tips_and_taxa_names(dup_tree) + if set(all_tips) != set(subtree_tips): + are_sisters = False + + # # check if duplicate sequences are sister + # for set_of_subtree_tips in subtree_tips[1:]: + # if first_set_of_subtree_tips != set_of_subtree_tips: + # are_sisters = False + # if not are_sisters: + # break return are_sisters @@ -154,10 +163,11 @@ def handle_multi_copy_subtree( # if the taxon is represented by more than one sequence if counts_of_taxa_from_terms[name] > 1: # get subtree tips - subtree_tips, dups = get_subtree_tips(terms, name, tree) + _, dups = get_subtree_tips(terms, name, tree) # check if subtrees are sister to one another - are_sisters = determine_if_dups_are_sister(subtree_tips) + # are_sisters = determine_if_dups_are_sister(subtree_tips) + are_sisters = determine_if_dups_are_sister(dups, newtree) # if duplicate sequences are sister, get the longest sequence if are_sisters: @@ -214,7 +224,14 @@ def handle_single_copy_subtree( subgroup_counter, assigned_tips, ) = write_output_fasta_and_account_for_assigned_tips_single_copy_case( - fasta, subgroup_counter, terms, fasta_dict, assigned_tips, snap_trees, newtree, output_path + fasta, + subgroup_counter, + terms, + fasta_dict, + assigned_tips, + snap_trees, + newtree, + output_path, ) return subgroup_counter, assigned_tips @@ -230,7 +247,6 @@ def inparalog_to_keep_determination( """ remove_short_sequences_among_duplicates_that_are_sister """ - lengths = dict() # keep inparalog based on sequence length if inparalog_to_keep.value in [ @@ -313,17 +329,20 @@ def write_output_fasta_and_account_for_assigned_tips_single_copy_case( newtree, output_path: str, ): - # write output - fasta_path_stripped = re.sub("^.*/", '', fasta) - output_file_name = f"{output_path}/{fasta_path_stripped}.orthosnap.{subgroup_counter}.fa" + fasta_path_stripped = re.sub("^.*/", "", fasta) + output_file_name = ( + f"{output_path}/{fasta_path_stripped}.orthosnap.{subgroup_counter}.fa" + ) with open(output_file_name, "w") as output_handle: for term in terms: SeqIO.write(fasta_dict[term], output_handle, "fasta") assigned_tips.append(term) if snap_tree: - output_file_name = f"{output_path}/{fasta_path_stripped}.orthosnap.{subgroup_counter}.tre" + output_file_name = ( + f"{output_path}/{fasta_path_stripped}.orthosnap.{subgroup_counter}.tre" + ) Phylo.write(newtree, output_file_name, "newick") subgroup_counter += 1 diff --git a/orthosnap/orthosnap.py b/orthosnap/orthosnap.py index c8c43984..7288be5a 100644 --- a/orthosnap/orthosnap.py +++ b/orthosnap/orthosnap.py @@ -38,7 +38,14 @@ def execute( # write user args to stdout write_user_args( - tree, fasta, support, occupancy, rooted, snap_trees, inparalog_to_keep, output_path + tree, + fasta, + support, + occupancy, + rooted, + snap_trees, + inparalog_to_keep, + output_path, ) # create start time logger diff --git a/orthosnap/version.py b/orthosnap/version.py index 485f44ac..5becc17c 100644 --- a/orthosnap/version.py +++ b/orthosnap/version.py @@ -1 +1 @@ -__version__ = "0.1.1" +__version__ = "1.0.0" diff --git a/orthosnap/writer.py b/orthosnap/writer.py index 08cb413a..f0e44639 100644 --- a/orthosnap/writer.py +++ b/orthosnap/writer.py @@ -35,13 +35,16 @@ def write_user_args( ) ) + def write_output_stats(fasta, subgroup_counter, start_time, snap_trees, output_path): """ Function to print out output statistics """ - fasta_path_stripped = re.sub("^.*/", '', fasta) - output_file_name = f"{output_path}/{fasta_path_stripped}.orthosnap.{subgroup_counter}.fa" + fasta_path_stripped = re.sub("^.*/", "", fasta) + output_file_name = ( + f"{output_path}/{fasta_path_stripped}.orthosnap.{subgroup_counter}.fa" + ) if subgroup_counter > 0: print( diff --git a/tests/unit/test_args_parsing.py b/tests/unit/test_args_parsing.py index 7e5002e7..a5b83e44 100644 --- a/tests/unit/test_args_parsing.py +++ b/tests/unit/test_args_parsing.py @@ -109,7 +109,7 @@ def test_inparalog_to_keep_longest_branch_len(self, args): assert res == InparalogToKeep.longest_branch_len def test_output_path(self, args): - args.output_path="./tests/samples/" + args.output_path = "./tests/samples/" res = process_args(args) assert res["output_path"] == "./tests/samples/" @@ -118,14 +118,14 @@ def test_output_path_none(self, args): args.fasta = "tests/expected/test_support_value_60_OG0000010/OG0000010.renamed.fa.mafft.clipkit.orthosnap.0.fa" res = process_args(args) assert res["output_path"] == "tests/expected/test_support_value_60_OG0000010/" - + def test_output_path_no_slash(self, args): args.output_path = "./tests/samples" res = process_args(args) assert res["output_path"] == "./tests/samples/" def test_output_path_none(self, args): - args.fasta = "requirements.txt" # fake stand in file + args.fasta = "requirements.txt" # fake stand in file args.output_path = None res = process_args(args) assert res["output_path"] == "./"