From 1b26e0406d4de497a69a7192272575e1ff996528 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 8 Aug 2023 17:58:09 +0300 Subject: [PATCH 01/44] keep cigartuples in read assignment --- src/alignment_processor.py | 1 + src/isoform_assignment.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/alignment_processor.py b/src/alignment_processor.py index 8e130ed2..b93dd571 100644 --- a/src/alignment_processor.py +++ b/src/alignment_processor.py @@ -358,6 +358,7 @@ def process_genic(self, alignment_storage, gene_info): read_assignment.polya_info = alignment_info.polya_info read_assignment.cage_found = len(alignment_info.cage_hits) > 0 read_assignment.exons = alignment_info.read_exons + read_assignment.cigartuples = alignment.cigartuples read_assignment.corrected_exons = exon_corrector.correct_assigned_read(alignment_info, read_assignment) read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons) diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 47d73552..ffd57645 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -477,6 +477,7 @@ def __init__(self, read_id, assignment_type, match=None): self.assignment_id = ReadAssignment.assignment_id_generator.increment() self.read_id = read_id self.exons = None + self.cigartuples = None self.corrected_exons = None self.corrected_introns = None self.gene_info = None @@ -507,6 +508,7 @@ def deserialize(cls, infile, gene_info): read_assignment.assignment_id = read_int(infile) read_assignment.read_id = read_string(infile) read_assignment.exons = read_list_of_pairs(infile, read_int) + read_assignment.cigartuples = read_list_of_pairs(infile, read_int) read_assignment.corrected_exons = read_list_of_pairs(infile, read_int) read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons) read_assignment.gene_info = gene_info @@ -532,6 +534,7 @@ def serialize(self, outfile): write_int(self.assignment_id, outfile) write_string(self.read_id, outfile) write_list_of_pairs(self.exons, outfile, write_int) + write_list_of_pairs(self.cigartuples, outfile, write_int) write_list_of_pairs(self.corrected_exons, outfile, write_int) write_bool_array([self.multimapper, self.polyA_found, self.cage_found], outfile) write_int_neg(self.polya_info.external_polya_pos, outfile) From 27f52fe4a9b03417ca352ff9a3309cf29beb217d Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 17 Aug 2023 15:56:05 +0300 Subject: [PATCH 02/44] template for transcript correction --- src/graph_based_model_construction.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 7e94114c..c70c98b4 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -130,6 +130,7 @@ def process(self, read_assignment_storage): self.construct_assignment_based_isoforms(read_assignment_storage) self.assign_reads_to_models(read_assignment_storage) self.filter_transcripts() + self.correct_transcripts() if self.params.genedb: self.create_extended_annotation() @@ -198,6 +199,23 @@ def compare_models_with_known(self): model.add_additional_attribute("alternatives", event_string) self.transcript2transcript.append(assignment) + def correct_transcripts(self): + for model in self.transcript_model_storage: + exons = model.exon_blocks + assigned_reads = self.transcript_read_ids[model.transcript_id] + corrected_exons = self.correct_transcript_splice_sites(exons, assigned_reads) + 
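        # Illustration (hypothetical exon coordinates, not from the patch): correct_transcript_splice_sites()
        # is expected to return either None (no correction) or a complete corrected exon list, e.g.
        #   exons           = [(100, 200), (300, 400)]
        #   corrected_exons = [(100, 200), (304, 400)]   # acceptor start shifted by a 4 bp deletion
        # so model.exon_blocks is replaced only when a corrected list comes back.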
if corrected_exons: + model.exon_blocks = corrected_exons + + def correct_transcript_splice_sites(self, exons, assigned_reads): + # exons: list of coordinate pairs + # assigned_reads: list of ReadAssignment, contains read_id and cigartuples + # self.chr_record - FASTA recored, i.e. a single chromosome from a reference + # returns: a list of corrected exons if correction takes place, None - otherwise + # TODO Heidi: insert your code here + return None + + def filter_transcripts(self): filtered_storage = [] confirmed_transcipt_ids = set() From 65f324d57f1099b979190a2fba916d3a9121ac0e Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Mon, 21 Aug 2023 14:59:26 +0300 Subject: [PATCH 03/44] Add initial implementation for transcript_splice_site_corrector and unittests --- src/graph_based_model_construction.py | 70 ++++- src/transcript_splice_site_corrector.py | 241 +++++++++++++++++ .../test_transcript_splice_site_corrector.py | 251 ++++++++++++++++++ 3 files changed, 560 insertions(+), 2 deletions(-) create mode 100644 src/transcript_splice_site_corrector.py create mode 100644 tests/test_transcript_splice_site_corrector.py diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index c70c98b4..6a50da63 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -26,6 +26,9 @@ from .long_read_profiles import CombinedProfileConstructor from .polya_finder import PolyAInfo +from .transcript_splice_site_corrector import count_deletions_for_splice_site_locations +from .transcript_splice_site_corrector import compute_most_common_del_and_verify_nucleotides +from .transcript_splice_site_corrector import sublist_largest_values_exists logger = logging.getLogger('IsoQuant') @@ -207,13 +210,76 @@ def correct_transcripts(self): if corrected_exons: model.exon_blocks = corrected_exons - def correct_transcript_splice_sites(self, exons, assigned_reads): + def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): # exons: list of coordinate pairs # assigned_reads: list of ReadAssignment, contains read_id and cigartuples # self.chr_record - FASTA recored, i.e. a single chromosome from a reference # returns: a list of corrected exons if correction takes place, None - otherwise # TODO Heidi: insert your code here - return None + + + # Constants + ACCEPTED_DEL_CASES = [3, 4, 5, 6] + SUPPORTED_STRANDS = ['+', '-'] + THRESHOLD_CASES_AT_LOCATION = 0.7 + MIN_N_OF_ALIGNED_READS = 5 + + MORE_CONSERVATIVE_STRATEGY = False + + + strand = assigned_reads[0].strand + if strand not in SUPPORTED_STRANDS: + return None + + splice_site_cases = {} + # Iterate assigned_reads list and count deletions for splice site locations + for read_assignment in assigned_reads: + count_deletions_for_splice_site_locations(read_assignment, exons, splice_site_cases) + + # Second iteration + # 1. Count most common deletion at each splice site location + # 2. For interesting cases count nucleotides at deletion positions + # 3. 
If canonical nucleotides are found, correct splice site + + corrected_exons = [] + for splice_site_location, splice_site_data in splice_site_cases.items(): + + reads = sum(splice_site_data["deletions"].values()) + if reads < MIN_N_OF_ALIGNED_READS: + continue + + compute_most_common_del_and_verify_nucleotides( + splice_site_location, + splice_site_data, + self.chr_record, + ACCEPTED_DEL_CASES, + strand + ) + if MORE_CONSERVATIVE_STRATEGY: + if not sublist_largest_values_exists( + splice_site_data["del_pos_distr"], + abs(splice_site_data["most_common_del"])): + continue + pass + + if splice_site_data["del_location_has_canonical_nucleotides"]: + corrected_exons.append(splice_site_location) + + # If correction took place, return corrected exons + if not corrected_exons: + return None + + corrected_exons = [] + for exon in exons: + corrected_exon = exon + if exon[0] in splice_site_cases: + corrected_location = exon[0] + splice_site_cases[exon[0]]["most_common_del"] + corrected_exon = (corrected_location, exon[1]) + if exon[1] in splice_site_cases: + corrected_location = exon[1] + splice_site_cases[exon[1]]["most_common_del"] + corrected_exon = (exon[0], corrected_location) + corrected_exons.append(corrected_exon) + return corrected_exons def filter_transcripts(self): diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py new file mode 100644 index 00000000..7ee8e727 --- /dev/null +++ b/src/transcript_splice_site_corrector.py @@ -0,0 +1,241 @@ +def extract_location_from_cigar_string(cigartuples: list, + read_start: int, + read_end: int, + splice_site_location: int): + """ + Extract location from cigar string. + + Args: + cigar_tuples (list): list of cigar tuples (cigar code, aligned position). + See pysam documentation for more information + read_start (int): the start location for the read (base-1) + read_end (int): the end location for the read (base-1) + splice_site_location (int): location of interest (base-1) + + Returns: + _type_: _description_ + """ + relative_position = splice_site_location - read_start + alignment_position = 0 + ref_position = 0 + + for cigar_code in cigartuples: + + if cigar_code[0] in [0, 2, 3, 7, 8]: + ref_position += cigar_code[1] + if ref_position <= relative_position and not \ + read_start + ref_position == read_end: + alignment_position += cigar_code[1] + else: + return alignment_position + (cigar_code[1] - (ref_position - relative_position)) + + return -1 + + +def count_deletions_from_cigar_codes_in_given_window(cigartuples: list, + aligned_location: int, + location_is_end: bool, + splice_site_data: dict, + window_size: int): + """ + Get cigar codes in a given window. + + Args: + cigar_tuples (list): list of cigar tuples (cigar code, aligned position). 
See + pysam documentation for more information + aligned_location (int): aligned location + loc_type (str): type of location (start or end) + """ + + deletions = 0 + + + cigar_code_list = [] + location = 0 + + if location_is_end: + aligned_location = aligned_location - window_size + 1 + + for cigar_code in cigartuples: + if window_size == len(cigar_code_list): + break + if location + cigar_code[1] > aligned_location: + overlap = location + \ + cigar_code[1] - (aligned_location + len(cigar_code_list)) + cigar_code_list.extend( + [cigar_code[0] for _ in range(min(window_size - + len(cigar_code_list), overlap))]) + location += cigar_code[1] + + for i in range(window_size): + if i >= len(cigar_code_list): + break + if cigar_code_list[i] == 2: + deletions += 1 + splice_site_data["del_pos_distr"][i] += 1 + + if deletions not in splice_site_data: + splice_site_data["deletions"][deletions] = 0 + + splice_site_data["deletions"][deletions] += 1 + + +def extract_splice_site_locations_within_aligned_read(read_start: int, read_end: int, exons:list): + matching_locations = [] + for exon_start, exon_end in exons: + if read_start <= exon_start <= read_end: + location_is_end = False + matching_locations.append((exon_start, location_is_end)) + if read_start <= exon_end <= read_end: + location_is_end = True + matching_locations.append((exon_end, location_is_end)) + if read_end <= exon_end: + break + return matching_locations + + +def count_deletions_for_splice_site_locations(assigned_read, exons: list, splice_site_cases: dict): + """ + + Args: + assigned_read (ReadAssignment): read assignment + exons (list): tuple of exons (start, end) + splice_site_cases (dict): a dictionary for storing splice site cases + """ + + # Extract read start and end + read_start = assigned_read.corrected_exons[0][0] + read_end = assigned_read.corrected_exons[-1][1] + cigartuples = assigned_read.cigartuples + + # Constant window size for counting deletions + WINDOW_SIZE = 8 + + # Extract splice site locations within aligned read + matching_locations = extract_splice_site_locations_within_aligned_read(read_start, read_end, exons) + + # Count deletions for each splice site location + for splice_site_location, location_type in matching_locations: + if splice_site_location not in splice_site_cases: + splice_site_cases[splice_site_location] = { + 'location_is_end': location_type, + 'deletions': {}, + 'del_pos_distr': [0 for _ in range(WINDOW_SIZE)], + 'most_common_deletion': -1, + 'del_location_has_canonical_nucleotides': False + } + + # Processing cigartuples + # 1. Find the aligned location + aligned_location = extract_location_from_cigar_string(cigartuples, read_start, read_end, splice_site_location) + # 2. 
Count deletions in a predefined window + count_deletions_from_cigar_codes_in_given_window( + cigartuples, + aligned_location, + location_type, + splice_site_cases[splice_site_location], + WINDOW_SIZE) + + + +def compute_most_common_case_of_deletions(deletions: dict, location_is_end: bool): + del_most_common_case = [k for k, v in deletions.items( + ) if v == max(deletions.values())] + if len(del_most_common_case) == 1: + if location_is_end: + return -del_most_common_case[0] + return del_most_common_case[0] + return -1 + + +def extract_nucleotides_from_most_common_del_location( + location: int, + splice_site_data: dict, + chr_record, + strand: str): + most_common_del = splice_site_data["most_common_del"] + idx_correction = -1 + extraction_start = location + most_common_del + idx_correction + extraction_end = location + most_common_del + 2 + idx_correction + try: + extracted_canonicals = chr_record[extraction_start:extraction_end] + except KeyError: + extracted_canonicals = 'XX' + + + canonical_pairs = { + '+': { + 'start': ['AG', 'AC'], + 'end': ['GT', 'GC', 'AT'] + }, + '-': { + 'start': ['AC', 'GC', 'AC'], + 'end': ['CT', 'GT'] + } + } + if splice_site_data["location_is_end"]: + possible_canonicals = canonical_pairs[strand]['end'] + else: + possible_canonicals = canonical_pairs[strand]['start'] + + if extracted_canonicals in possible_canonicals: + splice_site_data["del_location_has_canonical_nucleotides"] = True + +def compute_most_common_del_and_verify_nucleotides( + splice_site_location: int, + splice_site_data: dict, + chr_record, + ACCEPTED_DEL_CASES: list, + strand: str,): + + + # Compute most common case of deletions + splice_site_data["most_common_deletion"] = compute_most_common_case_of_deletions( + splice_site_data["deletions"], + splice_site_data["location_is_end"]) + + # Extract nucleotides from most common deletion location if it is an accepted case + if splice_site_data["most_common_deletion"] in ACCEPTED_DEL_CASES: + extract_nucleotides_from_most_common_del_location( + splice_site_location, + splice_site_data, + chr_record, + strand) + + + +def threshold_exceeded( + del_pos_distr: list, + deletions: dict, + most_common_del: int, + THRESHOLD_CASES_AT_LOCATION): + total_cases = sum(deletions.values()) + nucleotides_exceeding_treshold = 0 + for value in del_pos_distr: + if value / total_cases > THRESHOLD_CASES_AT_LOCATION: + nucleotides_exceeding_treshold += 1 + return bool(nucleotides_exceeding_treshold >= abs(most_common_del)) + +def sublist_largest_values_exists(lst, n): + """ + Verifies that there is a sublist of size n that contains the largest values in the list. + Not currently in use, but may be included in the error prediction strategy for stricter prediction. 
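    Example (mirrors the unit tests): with n = 4,
    sublist_largest_values_exists([0, 0, 10, 10, 10, 10, 0, 0], 4) returns True because the
    four largest values form one contiguous run, whereas
    sublist_largest_values_exists([0, 0, 10, 10, 10, 0, 6, 0], 4) returns False.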
+ Args: + lst (int): list of deletion distribution + n (int): most common case of deletions + + Returns: + _type_: _description_ + """ + largest_values = set(sorted(lst, reverse=True)[:n]) + count = 0 + + for num in lst: + if num in largest_values: + count += 1 + if count >= n: + return True + else: + count = 0 + + return False \ No newline at end of file diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py new file mode 100644 index 00000000..77d3f5ae --- /dev/null +++ b/tests/test_transcript_splice_site_corrector.py @@ -0,0 +1,251 @@ +from unittest import TestCase +from unittest import main as unittest_main + + +from src.transcript_splice_site_corrector import threshold_exceeded +from src.transcript_splice_site_corrector import sublist_largest_values_exists +from src.transcript_splice_site_corrector import extract_location_from_cigar_string +from src.transcript_splice_site_corrector import count_deletions_from_cigar_codes_in_given_window +from src.transcript_splice_site_corrector import extract_splice_site_locations_within_aligned_read +class TestMoreConservativeStrategyConditions(TestCase): + + def test_threshold_exceeds_returns_true(self): + THRESHOLD = 0.7 + del_pos_distr = [0, 0, 10, 10, 10, 10, 0, 0] + deletions = {4: 10} + most_common_del = 4 + result = threshold_exceeded( + del_pos_distr, + deletions, + most_common_del, + THRESHOLD) + self.assertTrue(result) + + def test_threshold_not_exceeded_returns_false(self): + THRESHOLD = 0.7 + del_pos_distr = [0, 0, 10, 10, 10, 6, 0, 0] + deletions = {4: 6, 3: 4} + most_common_del = 4 + result = threshold_exceeded( + del_pos_distr, + deletions, + most_common_del, + THRESHOLD) + self.assertFalse(result) + + def test_sublist_largest_values_exists_returns_true(self): + lst = [0, 0, 10, 10, 10, 10, 0, 0] + n = 4 + result = sublist_largest_values_exists(lst, n) + self.assertTrue(result) + + def test_sublist_largest_values_exists_returns_false(self): + lst = [0, 0, 10, 10, 10, 6, 0, 0] + n = 4 + result = sublist_largest_values_exists(lst, n) + self.assertFalse(result) + + +class TestExtractingLocationFromCigarString(TestCase): + + def test_cigar_string_with_soft_clip_and_one_match_is_parsed_correctly(self): + cigar = [(4, 50), (0, 10)] + reference_start = 100 + reference_end = 160 + location = 105 + expected_output = 55 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + + def test_cigar_string_with_soft_clip_insertion_and_one_match_is_parsed_correctly(self): + cigar = [(4, 50), (1, 10), (0, 10)] + reference_start = 100 + reference_end = 160 + location = 105 + expected_output = 65 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + + def test_cigar_str_with_s_d_i_m_gives_correct_output(self): + cigar = [(4, 50), (2, 10), (1, 10), (0, 10)] + reference_start = 100 + reference_end = 160 + location = 115 + expected_output = 75 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + def test_cigar_str_with_s_d_n_m_gives_correct_output(self): + cigar = [(4, 50), (2, 10), (3, 100), (0, 10)] + reference_start = 100 + reference_end = 160 + location = 215 + expected_output = 165 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + def 
test_cigar_str_with_s_m_i_n_m_gives_correct_output(self): + cigar = [(4, 50), (0, 10), (1, 10), (3, 100), (0, 10)] + reference_start = 100 + reference_end = 160 + location = 215 + expected_output = 175 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + def test_location_outside_of_cigar_str_returns_minus_one(self): + cigar = [(4, 50), (0, 10)] + reference_start = 100 + reference_end = 160 + location = 199 + expected_output = -1 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + def test_more_complicated_test_returns_correct_position(self): + cigar_tuples = [(4, 156), (0, 12), (2, 3), (0, 2), (2, 2), (0, 10), (2, 2), (0, 4), (2, 3), (0, 7), (1, 1), (0, 16), (1, 4), (0, 23), (1, 1), (0, 7), + (1, 1), (0, 9), (2, 1), (0, 13), (2, 1), (0, 15), (2, 2), (0, 3), (1, 2), (0, 19), (2, 2), (0, 20), (2, 1), (0, 32), (3, 294), (0, 36), (4, 25)] + reference_start = 72822568 + reference_end = 73822568 + position = 72823071 + expected_output = 668 + result = extract_location_from_cigar_string( + cigar_tuples, reference_start, reference_end, position) + self.assertEqual(result, expected_output) + + def test_case_that_does_not_consume_any_reference_returns_the_correct_location(self): + cigar = [(4, 50), (0, 10)] + reference_start = 100 + reference_end = 160 + location = 100 + expected_output = 50 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + def test_case_that_has_no_reference_consuming_codes_returns_minus_one_as_error(self): + cigar = [(4, 50), (1, 10)] + reference_start = 100 + reference_end = 160 + location = 100 + expected_output = -1 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + def test_case_that_has_no_reference_consuming_codes_at_the_end_returns_minus_one_as_error(self): + cigar = [(4, 50), (0, 10), (1, 10)] + reference_start = 100 + reference_end = 160 + location = 110 + expected_output = -1 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + def test_case_that_has_it_s_location_at_final_match_returns_correct_value(self): + cigar = [(4, 50), (0, 10), (1, 10)] + reference_start = 100 + reference_end = 110 + location = 110 + expected_output = 60 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + +class TestIndelCountingFromCigarCodes(TestCase): + + def setUp(self): + self.window_size = 8 + + def test_indel_counter_returns_false_and_an_empty_debug_list_for_given_empty_list(self): + cigar_tuples = [] + aligned_location = 100 + location_is_end = False + splice_site_data = { + 'deletions': {}, + "del_pos_distr": [0] * self.window_size, + } + expected_result = { + 'deletions': {0: 1}, + "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0] + } + count_deletions_from_cigar_codes_in_given_window( + cigar_tuples, + aligned_location, + location_is_end, + splice_site_data, + self.window_size) + + self.assertEqual(splice_site_data['deletions'], expected_result['deletions']) + self.assertEqual(splice_site_data['del_pos_distr'], expected_result['del_pos_distr']) + + + + def test_indels_are_counted_correctly(self): + cigar_tuples = [(0, 20), (2, 3), (1, 2), 
(0, 10)] + aligned_location = 27 + location_is_end = True + splice_site_data = { + 'deletions': {}, + "del_pos_distr": [0] * self.window_size, + } + + + expected_result = { + 'deletions': {3: 1}, + "del_pos_distr": [1, 1, 1, 0, 0, 0, 0, 0] + } + + count_deletions_from_cigar_codes_in_given_window( + cigar_tuples, + aligned_location, + location_is_end, + splice_site_data, + self.window_size) + + self.assertEqual(splice_site_data['deletions'], expected_result['deletions']) + self.assertEqual(splice_site_data['del_pos_distr'], expected_result['del_pos_distr']) + + def test_full_window_of_dels_returns_true_for_errors(self): + cigar_tuples = [(0, 20), (2, 8), (1, 2), (0, 10)] + aligned_location = 20 + location_is_end = False + splice_site_data = { + 'deletions': {}, + "del_pos_distr": [0] * self.window_size, + } + expected_result = { + 'deletions': {8: 1}, + "del_pos_distr": [1, 1, 1, 1, 1, 1, 1, 1] + } + + count_deletions_from_cigar_codes_in_given_window( + cigar_tuples, + aligned_location, + location_is_end, + splice_site_data, + self.window_size) + + self.assertEqual(splice_site_data['deletions'], expected_result['deletions']) + self.assertEqual(splice_site_data['del_pos_distr'], expected_result['del_pos_distr']) + +class ExtractSpliceSiteLocationsFromAlignedRead(TestCase): + + def test_correct_splice_sites_are_extracted(self): + exons = [(1, 10), (20, 30), (40, 50)] + read_start = 20 + read_end = 40 + result = extract_splice_site_locations_within_aligned_read( + read_start, read_end, exons) + expected_output = [20, 30 , 40] + self.assertEqual(result, expected_output) \ No newline at end of file From bd52a52380a69da269cd98a0c3449062a039ef47 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 08:46:42 +0300 Subject: [PATCH 04/44] Fix issues with two unittests --- tests/test_transcript_splice_site_corrector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 77d3f5ae..5b96543e 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -40,7 +40,7 @@ def test_sublist_largest_values_exists_returns_true(self): self.assertTrue(result) def test_sublist_largest_values_exists_returns_false(self): - lst = [0, 0, 10, 10, 10, 6, 0, 0] + lst = [0, 0, 10, 10, 10, 0, 6, 0] n = 4 result = sublist_largest_values_exists(lst, n) self.assertFalse(result) @@ -247,5 +247,5 @@ def test_correct_splice_sites_are_extracted(self): read_end = 40 result = extract_splice_site_locations_within_aligned_read( read_start, read_end, exons) - expected_output = [20, 30 , 40] + expected_output = [(20, False), (30, True) , (40, False)] self.assertEqual(result, expected_output) \ No newline at end of file From 99c9d393fdf752675f8ba25c12bf4b5d4d30c509 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 09:01:46 +0300 Subject: [PATCH 05/44] Fix issue with two datastructures having the same var name --- src/graph_based_model_construction.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 6a50da63..e012ab94 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -269,17 +269,17 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): if not corrected_exons: return None - corrected_exons = [] + final_corrected_exons = [] for exon in exons: - corrected_exon 
= exon + new_corrected_exon = exon if exon[0] in splice_site_cases: corrected_location = exon[0] + splice_site_cases[exon[0]]["most_common_del"] corrected_exon = (corrected_location, exon[1]) if exon[1] in splice_site_cases: corrected_location = exon[1] + splice_site_cases[exon[1]]["most_common_del"] corrected_exon = (exon[0], corrected_location) - corrected_exons.append(corrected_exon) - return corrected_exons + final_corrected_exons.append(new_corrected_exon) + return final_corrected_exons def filter_transcripts(self): From 98619dbd680622e3cb854156d4d92d92db496ec3 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 09:44:14 +0300 Subject: [PATCH 06/44] Refactor code into separate functions --- src/graph_based_model_construction.py | 72 +++++++++++---------------- 1 file changed, 29 insertions(+), 43 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index e012ab94..99d5cee5 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -26,9 +26,11 @@ from .long_read_profiles import CombinedProfileConstructor from .polya_finder import PolyAInfo -from .transcript_splice_site_corrector import count_deletions_for_splice_site_locations -from .transcript_splice_site_corrector import compute_most_common_del_and_verify_nucleotides -from .transcript_splice_site_corrector import sublist_largest_values_exists +from .transcript_splice_site_corrector import ( + count_deletions_for_splice_site_locations, + correct_splice_site_errors, + generate_updated_exon_list + ) logger = logging.getLogger('IsoQuant') @@ -234,52 +236,36 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): splice_site_cases = {} # Iterate assigned_reads list and count deletions for splice site locations for read_assignment in assigned_reads: - count_deletions_for_splice_site_locations(read_assignment, exons, splice_site_cases) + read_start = read_assignment.corrected_exons[0][0] + read_end = read_assignment.corrected_exons[-1][1] + cigartuples = read_assignment.cigartuples + count_deletions_for_splice_site_locations( + read_start, + read_end, + cigartuples, + exons, + splice_site_cases) - # Second iteration - # 1. Count most common deletion at each splice site location - # 2. For interesting cases count nucleotides at deletion positions - # 3. 
If canonical nucleotides are found, correct splice site - corrected_exons = [] - for splice_site_location, splice_site_data in splice_site_cases.items(): - - reads = sum(splice_site_data["deletions"].values()) - if reads < MIN_N_OF_ALIGNED_READS: - continue - - compute_most_common_del_and_verify_nucleotides( - splice_site_location, - splice_site_data, - self.chr_record, - ACCEPTED_DEL_CASES, - strand - ) - if MORE_CONSERVATIVE_STRATEGY: - if not sublist_largest_values_exists( - splice_site_data["del_pos_distr"], - abs(splice_site_data["most_common_del"])): - continue - pass + corrected_exons = correct_splice_site_errors( + splice_site_cases, + MIN_N_OF_ALIGNED_READS, + ACCEPTED_DEL_CASES, + MORE_CONSERVATIVE_STRATEGY, + strand, + self.chr_record + ) - if splice_site_data["del_location_has_canonical_nucleotides"]: - corrected_exons.append(splice_site_location) - - # If correction took place, return corrected exons if not corrected_exons: return None - final_corrected_exons = [] - for exon in exons: - new_corrected_exon = exon - if exon[0] in splice_site_cases: - corrected_location = exon[0] + splice_site_cases[exon[0]]["most_common_del"] - corrected_exon = (corrected_location, exon[1]) - if exon[1] in splice_site_cases: - corrected_location = exon[1] + splice_site_cases[exon[1]]["most_common_del"] - corrected_exon = (exon[0], corrected_location) - final_corrected_exons.append(new_corrected_exon) - return final_corrected_exons + updated_exons = generate_updated_exon_list( + splice_site_cases, + corrected_exons, + exons + ) + + return updated_exons def filter_transcripts(self): From 321be9c00beaa1b2ad4d72a26b6c369750a81751 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 09:44:29 +0300 Subject: [PATCH 07/44] Refactor code into separate functions --- src/transcript_splice_site_corrector.py | 76 +++++++++++++++++++++++-- 1 file changed, 70 insertions(+), 6 deletions(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index 7ee8e727..be17594e 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -94,7 +94,12 @@ def extract_splice_site_locations_within_aligned_read(read_start: int, read_end: return matching_locations -def count_deletions_for_splice_site_locations(assigned_read, exons: list, splice_site_cases: dict): +def count_deletions_for_splice_site_locations( + read_start: int, + read_end: int, + cigartuples: list, + exons: list, + splice_site_cases: dict): """ Args: @@ -103,10 +108,6 @@ def count_deletions_for_splice_site_locations(assigned_read, exons: list, splice splice_site_cases (dict): a dictionary for storing splice site cases """ - # Extract read start and end - read_start = assigned_read.corrected_exons[0][0] - read_end = assigned_read.corrected_exons[-1][1] - cigartuples = assigned_read.cigartuples # Constant window size for counting deletions WINDOW_SIZE = 8 @@ -238,4 +239,67 @@ def sublist_largest_values_exists(lst, n): else: count = 0 - return False \ No newline at end of file + return False + + +def correct_splice_site_errors( + splice_site_cases: dict, + MIN_N_OF_ALIGNED_READS: int, + ACCEPTED_DEL_CASES: list, + MORE_CONSERVATIVE_STRATEGY: bool, + strand: str, + chr_record): + """ 1. Count most common deletion at each splice site location + 2. For interesting cases count nucleotides at deletion positions + 3. 
If canonical nucleotides are found, correct splice site + + Args: + splice_site_cases (dict): collected splice site cases + MIN_N_OF_ALIGNED_READS (int): constant for minimum number of aligned reads + ACCEPTED_DEL_CASES (list): constant for accepted cases of deletions + MORE_CONSERVATIVE_STRATEGY (bool): constant for more conservative strategy + strand (str): transcript strand (extracted from first ReadAssignment-object in read_assignments list) + chr_record (Fasta): FASTA recored, i.e. a single chromosome from a reference + """ + + locations_with_errors = [] + for splice_site_location, splice_site_data in splice_site_cases.items(): + + reads = sum(splice_site_data["deletions"].values()) + if reads < MIN_N_OF_ALIGNED_READS: + continue + + compute_most_common_del_and_verify_nucleotides( + splice_site_location, + splice_site_data, + chr_record, + ACCEPTED_DEL_CASES, + strand + ) + if MORE_CONSERVATIVE_STRATEGY: + if not sublist_largest_values_exists( + splice_site_data["del_pos_distr"], + abs(splice_site_data["most_common_del"])): + continue + pass + + if splice_site_data["del_location_has_canonical_nucleotides"]: + locations_with_errors.append(splice_site_location) + + return locations_with_errors + +def generate_updated_exon_list( + splice_site_cases: dict, + locations_with_errors: list, + exons: list): + updated_exons = [] + for exon in exons: + updated_exon = exon + if exon[0] in locations_with_errors: + corrected_location = exon[0] + splice_site_cases[exon[0]]["most_common_del"] + updated_exon = (corrected_location, exon[1]) + if exon[1] in locations_with_errors: + corrected_location = exon[1] + splice_site_cases[exon[1]]["most_common_del"] + updated_exon = (exon[0], corrected_location) + updated_exons.append(updated_exon) + return updated_exons \ No newline at end of file From f30996ba400a19f4b2e4ff3d650b35ab6aeb0deb Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 09:44:46 +0300 Subject: [PATCH 08/44] Expand tests for untested functions --- .../test_transcript_splice_site_corrector.py | 78 +++++++++++++++++-- 1 file changed, 71 insertions(+), 7 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 5b96543e..071bd309 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -2,11 +2,20 @@ from unittest import main as unittest_main -from src.transcript_splice_site_corrector import threshold_exceeded -from src.transcript_splice_site_corrector import sublist_largest_values_exists -from src.transcript_splice_site_corrector import extract_location_from_cigar_string -from src.transcript_splice_site_corrector import count_deletions_from_cigar_codes_in_given_window -from src.transcript_splice_site_corrector import extract_splice_site_locations_within_aligned_read +from src.transcript_splice_site_corrector import ( + extract_location_from_cigar_string, + count_deletions_from_cigar_codes_in_given_window, + extract_splice_site_locations_within_aligned_read, + count_deletions_for_splice_site_locations, + compute_most_common_case_of_deletions, + extract_nucleotides_from_most_common_del_location, + compute_most_common_del_and_verify_nucleotides, + threshold_exceeded, + sublist_largest_values_exists, + correct_splice_site_errors, + generate_updated_exon_list, +) + class TestMoreConservativeStrategyConditions(TestCase): def test_threshold_exceeds_returns_true(self): @@ -239,7 +248,7 @@ def test_full_window_of_dels_returns_true_for_errors(self): 
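        # With a window of 8 starting at aligned position 20, the 8 bp deletion (cigar op 2)
        # covers every window position, so each slot of del_pos_distr is incremented once and
        # a single read with 8 deleted bases is recorded in the deletions counter.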
self.assertEqual(splice_site_data['deletions'], expected_result['deletions']) self.assertEqual(splice_site_data['del_pos_distr'], expected_result['del_pos_distr']) -class ExtractSpliceSiteLocationsFromAlignedRead(TestCase): +class TestExtractSpliceSiteLocationsFromAlignedRead(TestCase): def test_correct_splice_sites_are_extracted(self): exons = [(1, 10), (20, 30), (40, 50)] @@ -248,4 +257,59 @@ def test_correct_splice_sites_are_extracted(self): result = extract_splice_site_locations_within_aligned_read( read_start, read_end, exons) expected_output = [(20, False), (30, True) , (40, False)] - self.assertEqual(result, expected_output) \ No newline at end of file + self.assertEqual(result, expected_output) + + +class TestExonListUpdater(TestCase): + + def test_error_at_location_start_is_corrected(self): + exons = [(1, 10), (20, 30), (40, 50)] + locations_with_errors = [20] + splice_site_cases = { + 20: { + "most_common_del": 4, + } + } + result = generate_updated_exon_list( + splice_site_cases, locations_with_errors, exons) + expected_result = [(1, 10), (24, 30), (40, 50)] + self.assertEqual(result, expected_result) + + def test_error_at_location_end_is_corrected(self): + exons = [(1, 10), (20, 30), (40, 50)] + locations_with_errors = [30] + splice_site_cases = { + 30: { + "most_common_del": -4, + } + } + result = generate_updated_exon_list( + splice_site_cases, locations_with_errors, exons) + expected_result = [(1, 10), (20, 26), (40, 50)] + self.assertEqual(result, expected_result) + + + pass + +class TestHelperFunctions(TestCase): + + def test_distinct_most_common_case_is_returned_for_location_end(self): + cases = {0: 10, 1: 2, 3: 0, 4: 20, 5: 1} + location_is_end = False + result = compute_most_common_case_of_deletions(cases, location_is_end) + expected_result = 4 + self.assertEqual(result, expected_result) + + def test_distinct_most_common_case_is_returned_for_location_start(self): + cases = {0: 10, 1: 2, 3: 0, 4: 20, 5: 1} + location_is_end = True + result = compute_most_common_case_of_deletions(cases, location_is_end) + expected_result = -4 + self.assertEqual(result, expected_result) + + def test_if_no_distinct_most_commont_del_exists_return_neg_one(self): + cases = {0: 10, 1: 2, 3: 20, 4: 20, 5: 1} + location_is_end = False + result = compute_most_common_case_of_deletions(cases, location_is_end) + expected_result = -1 + self.assertEqual(result, expected_result) \ No newline at end of file From 3f8aa163b482735492ddbbd1b0f379e81db2d254 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 11:38:10 +0300 Subject: [PATCH 09/44] Expand unittests --- .../test_transcript_splice_site_corrector.py | 156 +++++++++++++++++- 1 file changed, 153 insertions(+), 3 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 071bd309..ef20e0ef 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -1,6 +1,5 @@ from unittest import TestCase -from unittest import main as unittest_main - +from unittest.mock import MagicMock, patch from src.transcript_splice_site_corrector import ( extract_location_from_cigar_string, @@ -312,4 +311,155 @@ def test_if_no_distinct_most_commont_del_exists_return_neg_one(self): location_is_end = False result = compute_most_common_case_of_deletions(cases, location_is_end) expected_result = -1 - self.assertEqual(result, expected_result) \ No newline at end of file + self.assertEqual(result, expected_result) + + +class 
TestCorrectSpliceSiteErrors(TestCase): + + @patch('src.transcript_splice_site_corrector.compute_most_common_case_of_deletions') + def test_errors_are_correctly_returned(self, mock_compute_most_common_case_of_deletions): + splice_site_cases = { + 20: { + "del_location_has_canonical_nucleotides": False, + "deletions": {4: 10}, + "location_is_end": False, + "most_common_del": 4, + }, + 30: { + "del_location_has_canonical_nucleotides": True, + "deletions": {4: 10}, + "location_is_end": False, + "most_common_del": 4, + }, + } + MIN_N_ALIGNED_READS = 5 + ACCEPTED_DEL_CASES = [4] + MORE_CONSERVATIVE_STRATEGY = False + strand = "+" + chr_record = None + result = correct_splice_site_errors( + splice_site_cases, + MIN_N_ALIGNED_READS, + ACCEPTED_DEL_CASES, + MORE_CONSERVATIVE_STRATEGY, + strand, + chr_record) + expected_result = [30] + self.assertEqual(result, expected_result) + +class TestCountDeletionsFromSpliceSiteLocations(TestCase): + def test_count_deletions_from_splice_site_locations_extracts_correct_locations(self): + exons = [(1, 10), (20, 30), (40, 50)] + # 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 + # [M ,M, M, M, M, M, D, D, D, D, M, M, M, M, M, M, M, M, M, M, M] + cigartuples = [(0, 6), (2, 4), (0, 10)] + read_start = 20 + read_end = 40 + splice_site_cases = {} + count_deletions_for_splice_site_locations( + read_start, + read_end, + cigartuples, + exons, + splice_site_cases) + expected_result = { + 20: { + 'location_is_end': False, + 'deletions': {2: 1}, + 'del_pos_distr': [0, 0, 0, 0, 0, 0, 1, 1], + 'most_common_deletion': -1, + 'del_location_has_canonical_nucleotides': False + }, + 30: { + 'location_is_end': True, + 'deletions': {4: 1}, + 'del_pos_distr': [0, 0, 0, 1, 1, 1, 1, 0], + 'most_common_deletion': -1, + 'del_location_has_canonical_nucleotides': False + }, + 40: { + 'location_is_end': False, + 'deletions': {0: 1}, + 'del_pos_distr': [0, 0, 0, 0, 0, 0, 0, 0], + 'most_common_deletion': -1, + 'del_location_has_canonical_nucleotides': False + }, + } + self.assertEqual(splice_site_cases, expected_result) + + +class TestNucleotideExtraction(TestCase): + + def test_canonical_nucleotides_for_loc_start_pos_strand_are_extracted_correctly(self): + location = 10 + splice_site_data = { + "most_common_del": 4, + "location_is_end": False, + "del_location_has_canonical_nucleotides": False, + } + chr_record = "AAAAAAAAAAAAAAG" + + strand = "+" + extract_nucleotides_from_most_common_del_location( + location, + splice_site_data, + chr_record, + strand) + self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + + def test_canonical_nucleotides_for_loc_end_pos_strand_are_extracted_correctly(self): + location = 10 + splice_site_data = { + "most_common_del": -4, + "location_is_end": True, + "del_location_has_canonical_nucleotides": False, + } + + # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + # offset of -4 ^ + # | | + # v start pos + # A A A A A G C A A A A A A A A + chr_record = "AAAAAGCAAAAAAAA" + + strand = "+" + extract_nucleotides_from_most_common_del_location( + location, + splice_site_data, + chr_record, + strand) + self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + + def test_canonical_nucleotides_for_loc_start_neg_strand_are_extracted_correctly(self): + location = 10 + splice_site_data = { + "most_common_del": 4, + "location_is_end": False, + "del_location_has_canonical_nucleotides": False, + } + chr_record = "AAAAAAAAAAAAAAC" + + strand = "-" + extract_nucleotides_from_most_common_del_location( + location, + splice_site_data, + 
chr_record, + strand) + self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + + def test_canonical_nucleotides_for_loc_end_neg_strand_are_extracted_correctly(self): + location = 10 + splice_site_data = { + "most_common_del": -4, + "location_is_end": True, + "del_location_has_canonical_nucleotides": False, + } + chr_record = "AAAAACTAAAAAAAA" + + strand = "-" + extract_nucleotides_from_most_common_del_location( + location, + splice_site_data, + chr_record, + strand) + self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) \ No newline at end of file From 3061f0f808eba0da082e2231306834daee4a8e54 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 12:37:57 +0300 Subject: [PATCH 10/44] expand unittests --- .../test_transcript_splice_site_corrector.py | 97 ++++++++++++++++++- 1 file changed, 93 insertions(+), 4 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index ef20e0ef..13e45ec6 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -367,21 +367,21 @@ def test_count_deletions_from_splice_site_locations_extracts_correct_locations(s 'location_is_end': False, 'deletions': {2: 1}, 'del_pos_distr': [0, 0, 0, 0, 0, 0, 1, 1], - 'most_common_deletion': -1, + 'most_common_del': -1, 'del_location_has_canonical_nucleotides': False }, 30: { 'location_is_end': True, 'deletions': {4: 1}, 'del_pos_distr': [0, 0, 0, 1, 1, 1, 1, 0], - 'most_common_deletion': -1, + 'most_common_del': -1, 'del_location_has_canonical_nucleotides': False }, 40: { 'location_is_end': False, 'deletions': {0: 1}, 'del_pos_distr': [0, 0, 0, 0, 0, 0, 0, 0], - 'most_common_deletion': -1, + 'most_common_del': -1, 'del_location_has_canonical_nucleotides': False }, } @@ -462,4 +462,93 @@ def test_canonical_nucleotides_for_loc_end_neg_strand_are_extracted_correctly(se splice_site_data, chr_record, strand) - self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) \ No newline at end of file + self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + + +class TestDeletionComputationAndBaseExtraction(TestCase): + + def test_for_accepted_del_case_nucleotides_are_vefiried(self): + splice_site_location = 10 + splice_site_data = { + "most_common_del": -1, + "location_is_end": False, + "del_location_has_canonical_nucleotides": False, + "deletions": {4: 1}, + "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], + } + + chr_record = "AAAAAAAAAAAAAAG" + ACCEPTED_DEL_CASES = [4] + strand = "+" + compute_most_common_del_and_verify_nucleotides( + splice_site_location, + splice_site_data, + chr_record, + ACCEPTED_DEL_CASES, + strand) + expected_result = { + "most_common_del": 4, + "location_is_end": False, + "del_location_has_canonical_nucleotides": True, + "deletions": {4: 1}, + "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], + } + self.assertEqual(splice_site_data, expected_result) + + + def test_for_not_accepted_del_case_nucleotides_are_not_vefiried(self): + splice_site_location = 10 + splice_site_data = { + "most_common_del": -1, + "location_is_end": False, + "del_location_has_canonical_nucleotides": False, + "deletions": {2: 1}, + "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], + } + + chr_record = "AAAAAAAAAAAAAAG" + ACCEPTED_DEL_CASES = [4] + strand = "+" + compute_most_common_del_and_verify_nucleotides( + splice_site_location, + splice_site_data, + chr_record, + ACCEPTED_DEL_CASES, + strand) + expected_result = { + "most_common_del": 
2, + "location_is_end": False, + "del_location_has_canonical_nucleotides": False, + "deletions": {2: 1}, + "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], + } + self.assertEqual(splice_site_data, expected_result) + + def test_for_accepted_del_case_non_canonical_nucleotides_return_false(self): + splice_site_location = 10 + splice_site_data = { + "most_common_del": -1, + "location_is_end": False, + "del_location_has_canonical_nucleotides": False, + "deletions": {4: 1}, + "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], + } + + chr_record = "AAAAAAAAAAAAAXX" + ACCEPTED_DEL_CASES = [4] + strand = "+" + compute_most_common_del_and_verify_nucleotides( + splice_site_location, + splice_site_data, + chr_record, + ACCEPTED_DEL_CASES, + strand) + expected_result = { + "most_common_del": 4, + "location_is_end": False, + "del_location_has_canonical_nucleotides": False, + "deletions": {4: 1}, + "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], + } + self.assertEqual(splice_site_data, expected_result) + \ No newline at end of file From 8eefa311bbf70e07c3be2e0fd3a640f9486e4a14 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 12:39:08 +0300 Subject: [PATCH 11/44] Fix key-issue in splice_site_dict --- src/transcript_splice_site_corrector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index be17594e..d2c51d75 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -122,7 +122,7 @@ def count_deletions_for_splice_site_locations( 'location_is_end': location_type, 'deletions': {}, 'del_pos_distr': [0 for _ in range(WINDOW_SIZE)], - 'most_common_deletion': -1, + 'most_common_del': -1, 'del_location_has_canonical_nucleotides': False } @@ -191,12 +191,12 @@ def compute_most_common_del_and_verify_nucleotides( # Compute most common case of deletions - splice_site_data["most_common_deletion"] = compute_most_common_case_of_deletions( + splice_site_data["most_common_del"] = compute_most_common_case_of_deletions( splice_site_data["deletions"], splice_site_data["location_is_end"]) # Extract nucleotides from most common deletion location if it is an accepted case - if splice_site_data["most_common_deletion"] in ACCEPTED_DEL_CASES: + if splice_site_data["most_common_del"] in ACCEPTED_DEL_CASES: extract_nucleotides_from_most_common_del_location( splice_site_location, splice_site_data, From cc53125e8ecf222bc9a404ad8d2e8b4475cc3f58 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 12:52:34 +0300 Subject: [PATCH 12/44] Add threshold verification to conservative strategy --- src/transcript_splice_site_corrector.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index d2c51d75..a876e7fc 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -205,7 +205,7 @@ def compute_most_common_del_and_verify_nucleotides( -def threshold_exceeded( +def threshold_for_del_cases_exceeded( del_pos_distr: list, deletions: dict, most_common_del: int, @@ -246,6 +246,7 @@ def correct_splice_site_errors( splice_site_cases: dict, MIN_N_OF_ALIGNED_READS: int, ACCEPTED_DEL_CASES: list, + THRESHOLD_CASES_AT_LOCATION: float, MORE_CONSERVATIVE_STRATEGY: bool, strand: str, chr_record): @@ -281,7 +282,12 @@ def correct_splice_site_errors( splice_site_data["del_pos_distr"], abs(splice_site_data["most_common_del"])): continue - pass + 
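            # Under the conservative strategy the location is kept only if at least
            # abs(most_common_del) window positions carry a deletion in more than
            # THRESHOLD_CASES_AT_LOCATION (0.7 in graph_based_model_construction.py)
            # of the aligned reads; otherwise it is skipped below.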
if not threshold_for_del_cases_exceeded( + splice_site_data["del_pos_distr"], + splice_site_data["deletions"], + splice_site_data["most_common_del"], + THRESHOLD_CASES_AT_LOCATION): + continue if splice_site_data["del_location_has_canonical_nucleotides"]: locations_with_errors.append(splice_site_location) From af816240f18eda2d35f2be08a24e32799e387a14 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 12:52:51 +0300 Subject: [PATCH 13/44] Add constant for threshold to args --- src/graph_based_model_construction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 99d5cee5..be91655e 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -251,6 +251,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): splice_site_cases, MIN_N_OF_ALIGNED_READS, ACCEPTED_DEL_CASES, + THRESHOLD_CASES_AT_LOCATION, MORE_CONSERVATIVE_STRATEGY, strand, self.chr_record From 2740474c996bf0327f62e4333046efa8b5e0a9ff Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 12:53:02 +0300 Subject: [PATCH 14/44] Update function name --- tests/test_transcript_splice_site_corrector.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 13e45ec6..9b0d32aa 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -1,5 +1,5 @@ from unittest import TestCase -from unittest.mock import MagicMock, patch +from unittest.mock import patch from src.transcript_splice_site_corrector import ( extract_location_from_cigar_string, @@ -9,12 +9,18 @@ compute_most_common_case_of_deletions, extract_nucleotides_from_most_common_del_location, compute_most_common_del_and_verify_nucleotides, - threshold_exceeded, + threshold_for_del_cases_exceeded, sublist_largest_values_exists, correct_splice_site_errors, generate_updated_exon_list, ) +####################################################################### +## ## +## Run tests with: ## +## python -m unittest tests/test_transcript_splice_site_corrector.py ## +## ## +####################################################################### class TestMoreConservativeStrategyConditions(TestCase): def test_threshold_exceeds_returns_true(self): @@ -22,7 +28,7 @@ def test_threshold_exceeds_returns_true(self): del_pos_distr = [0, 0, 10, 10, 10, 10, 0, 0] deletions = {4: 10} most_common_del = 4 - result = threshold_exceeded( + result = threshold_for_del_cases_exceeded( del_pos_distr, deletions, most_common_del, @@ -34,7 +40,7 @@ def test_threshold_not_exceeded_returns_false(self): del_pos_distr = [0, 0, 10, 10, 10, 6, 0, 0] deletions = {4: 6, 3: 4} most_common_del = 4 - result = threshold_exceeded( + result = threshold_for_del_cases_exceeded( del_pos_distr, deletions, most_common_del, @@ -334,6 +340,7 @@ def test_errors_are_correctly_returned(self, mock_compute_most_common_case_of_de } MIN_N_ALIGNED_READS = 5 ACCEPTED_DEL_CASES = [4] + THRESHOLD_CASES_AT_LOCATION = 0.7 MORE_CONSERVATIVE_STRATEGY = False strand = "+" chr_record = None @@ -341,6 +348,7 @@ def test_errors_are_correctly_returned(self, mock_compute_most_common_case_of_de splice_site_cases, MIN_N_ALIGNED_READS, ACCEPTED_DEL_CASES, + THRESHOLD_CASES_AT_LOCATION, MORE_CONSERVATIVE_STRATEGY, strand, chr_record) @@ -350,6 +358,7 @@ def test_errors_are_correctly_returned(self, 
mock_compute_most_common_case_of_de class TestCountDeletionsFromSpliceSiteLocations(TestCase): def test_count_deletions_from_splice_site_locations_extracts_correct_locations(self): exons = [(1, 10), (20, 30), (40, 50)] + # Cigar codes for indeces 20-40: # 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 # [M ,M, M, M, M, M, D, D, D, D, M, M, M, M, M, M, M, M, M, M, M] cigartuples = [(0, 6), (2, 4), (0, 10)] @@ -415,6 +424,7 @@ def test_canonical_nucleotides_for_loc_end_pos_strand_are_extracted_correctly(se "del_location_has_canonical_nucleotides": False, } + # Fasta 1-based index extraction location: # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 # offset of -4 ^ # | | From e2e3ed5f3f0b0fa74882d6153d5712efcd0f07f2 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 22 Aug 2023 18:50:58 +0300 Subject: [PATCH 15/44] fix cigartuples, can be None sometimes --- src/graph_based_model_construction.py | 2 ++ src/isoform_assignment.py | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index be91655e..3e5eb63e 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -239,6 +239,8 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): read_start = read_assignment.corrected_exons[0][0] read_end = read_assignment.corrected_exons[-1][1] cigartuples = read_assignment.cigartuples + if not cigartuples: + continue count_deletions_for_splice_site_locations( read_start, read_end, diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index ffd57645..6e90a18c 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -509,6 +509,8 @@ def deserialize(cls, infile, gene_info): read_assignment.read_id = read_string(infile) read_assignment.exons = read_list_of_pairs(infile, read_int) read_assignment.cigartuples = read_list_of_pairs(infile, read_int) + if not read_assignment.cigartuples: + read_assignment.cigartuples = None read_assignment.corrected_exons = read_list_of_pairs(infile, read_int) read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons) read_assignment.gene_info = gene_info @@ -534,7 +536,10 @@ def serialize(self, outfile): write_int(self.assignment_id, outfile) write_string(self.read_id, outfile) write_list_of_pairs(self.exons, outfile, write_int) - write_list_of_pairs(self.cigartuples, outfile, write_int) + if self.cigartuples is None: + write_list_of_pairs([], outfile, write_int) + else: + write_list_of_pairs(self.cigartuples, outfile, write_int) write_list_of_pairs(self.corrected_exons, outfile, write_int) write_bool_array([self.multimapper, self.polyA_found, self.cage_found], outfile) write_int_neg(self.polya_info.external_polya_pos, outfile) From 07ab569411d3e2e18778036478983ee91dfe1167 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 23 Aug 2023 10:24:43 +0300 Subject: [PATCH 16/44] Add logger.debug to see corrected exons --- src/graph_based_model_construction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index be91655e..b90474fc 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -265,6 +265,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): corrected_exons, exons ) + logger.debug("Corrected exons: ", updated_exons) return updated_exons From 7953ff37885280c9b7c7549398f97e06b96ef419 Mon Sep 17 
00:00:00 2001 From: Heidi Holappa Date: Wed, 23 Aug 2023 10:44:14 +0300 Subject: [PATCH 17/44] Add logger.debug to see corrected exons --- src/graph_based_model_construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index cc5757c3..5bdac4e3 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -219,7 +219,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): # returns: a list of corrected exons if correction takes place, None - otherwise # TODO Heidi: insert your code here - + logger.debug("Correcting splice sites. n of exons: ", len(exons), " n of assigned reads: ", len(assigned_reads)) # Constants ACCEPTED_DEL_CASES = [3, 4, 5, 6] SUPPORTED_STRANDS = ['+', '-'] From bec61f878e44355d277b90ae4641f5e31f35d08b Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 23 Aug 2023 11:24:41 +0300 Subject: [PATCH 18/44] Add logger.debug to see corrected exons --- src/graph_based_model_construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 5bdac4e3..f8ac8808 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -219,7 +219,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): # returns: a list of corrected exons if correction takes place, None - otherwise # TODO Heidi: insert your code here - logger.debug("Correcting splice sites. n of exons: ", len(exons), " n of assigned reads: ", len(assigned_reads)) + logger.debug(f"Correcting splice sites. n of exons: {len(exons)}, n of assigned reads: {len(assigned_reads)}") # Constants ACCEPTED_DEL_CASES = [3, 4, 5, 6] SUPPORTED_STRANDS = ['+', '-'] From 8e46fc165c4310d7df1af9a58472cce559f32dbe Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 23 Aug 2023 11:28:50 +0300 Subject: [PATCH 19/44] Add logger.debug to see corrected exons --- src/graph_based_model_construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index f8ac8808..533ac757 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -267,7 +267,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): corrected_exons, exons ) - logger.debug("Corrected exons: ", updated_exons) + logger.debug(f"Corrected exons: {len(updated_exons)}, {updated_exons}") return updated_exons From 03591464876a897a1fee556b5929c93241f59f19 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 23 Aug 2023 12:34:48 +0300 Subject: [PATCH 20/44] Add logger.debug to see indel calc --- src/graph_based_model_construction.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 533ac757..aca26fd4 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -219,7 +219,6 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): # returns: a list of corrected exons if correction takes place, None - otherwise # TODO Heidi: insert your code here - logger.debug(f"Correcting splice sites. 
n of exons: {len(exons)}, n of assigned reads: {len(assigned_reads)}") # Constants ACCEPTED_DEL_CASES = [3, 4, 5, 6] SUPPORTED_STRANDS = ['+', '-'] @@ -230,6 +229,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): strand = assigned_reads[0].strand + logger.debug(f"Heidi: Correcting splice sites. n of exons: {len(exons)}, n of assigned reads: {len(assigned_reads)}, strand: {strand}") if strand not in SUPPORTED_STRANDS: return None @@ -248,7 +248,8 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): exons, splice_site_cases) - + logger.debug(f"Heidi: Splice site cases: {splice_site_cases}") + corrected_exons = correct_splice_site_errors( splice_site_cases, MIN_N_OF_ALIGNED_READS, @@ -267,7 +268,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): corrected_exons, exons ) - logger.debug(f"Corrected exons: {len(updated_exons)}, {updated_exons}") + logger.debug(f"Heidi: Corrected exons: {len(updated_exons)}, {updated_exons}") return updated_exons From bc91708fc78bb89df650f053a152a6114a9cc1be Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 23 Aug 2023 12:57:24 +0300 Subject: [PATCH 21/44] Add debugging to see matching cases list --- src/transcript_splice_site_corrector.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index a876e7fc..6710d8a9 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -1,3 +1,6 @@ +import logging +logger = logging.getLogger('IsoQuant') + def extract_location_from_cigar_string(cigartuples: list, read_start: int, read_end: int, @@ -114,7 +117,7 @@ def count_deletions_for_splice_site_locations( # Extract splice site locations within aligned read matching_locations = extract_splice_site_locations_within_aligned_read(read_start, read_end, exons) - + logger.debug(f"Matching locations: {matching_locations}") # Count deletions for each splice site location for splice_site_location, location_type in matching_locations: if splice_site_location not in splice_site_cases: From 054aa1c96cc4f7140c27ab858ac203bab08da8d8 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 23 Aug 2023 14:07:25 +0300 Subject: [PATCH 22/44] Add debugger to cases with no cigartuples --- src/graph_based_model_construction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index aca26fd4..b286c2d0 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -240,6 +240,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): read_end = read_assignment.corrected_exons[-1][1] cigartuples = read_assignment.cigartuples if not cigartuples: + logger.debug(f"Heidi: No cigar tuples for read {read_assignment.read_id}") continue count_deletions_for_splice_site_locations( read_start, From 5ccbc819824376664f562d60b77035db9f1fd052 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 24 Aug 2023 08:38:01 +0300 Subject: [PATCH 23/44] add debug line to verify if cigartuples are found on some reads --- src/graph_based_model_construction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index b286c2d0..0fec2044 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -242,13 +242,14 @@ def 
correct_transcript_splice_sites(self, exons: list, assigned_reads: list): if not cigartuples: logger.debug(f"Heidi: No cigar tuples for read {read_assignment.read_id}") continue + logger.debug(f"Heidi: Cigar tuples for read {read_assignment.read_id}: {cigartuples}") count_deletions_for_splice_site_locations( read_start, read_end, cigartuples, exons, splice_site_cases) - + logger.debug(f"Heidi: Splice site cases: {splice_site_cases}") corrected_exons = correct_splice_site_errors( From 6e217ec0e7c8e30e03eac6544bb37fc1355226b2 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 07:34:36 +0300 Subject: [PATCH 24/44] Move debug to correct transcripts --- src/graph_based_model_construction.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 0fec2044..7741ebc8 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -208,6 +208,15 @@ def correct_transcripts(self): for model in self.transcript_model_storage: exons = model.exon_blocks assigned_reads = self.transcript_read_ids[model.transcript_id] + cigartuples = False + for read in assigned_reads: + if read.cigartuples: + cigartuples = True + break + if not cigartuples: + logger.debug(f"Heidi: Method correct_transcripts. No cigar tuples for transcript {model.transcript_id}") + else: + logger.debug(f"Heidi: Method correct_transcripts. Yes cigar tuples for transcript {model.transcript_id}") corrected_exons = self.correct_transcript_splice_sites(exons, assigned_reads) if corrected_exons: model.exon_blocks = corrected_exons @@ -240,9 +249,9 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): read_end = read_assignment.corrected_exons[-1][1] cigartuples = read_assignment.cigartuples if not cigartuples: - logger.debug(f"Heidi: No cigar tuples for read {read_assignment.read_id}") + # logger.debug(f"Heidi: No cigar tuples for read {read_assignment.read_id}") continue - logger.debug(f"Heidi: Cigar tuples for read {read_assignment.read_id}: {cigartuples}") + # logger.debug(f"Heidi: Cigar tuples for read {read_assignment.read_id}: {cigartuples}") count_deletions_for_splice_site_locations( read_start, read_end, From 82fdc3ad47560f634e2de52684717dafec5e90c2 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 10:09:02 +0300 Subject: [PATCH 25/44] Move debug to correct transcripts --- src/graph_based_model_construction.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 7741ebc8..d18b5682 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -213,10 +213,7 @@ def correct_transcripts(self): if read.cigartuples: cigartuples = True break - if not cigartuples: - logger.debug(f"Heidi: Method correct_transcripts. No cigar tuples for transcript {model.transcript_id}") - else: - logger.debug(f"Heidi: Method correct_transcripts. Yes cigar tuples for transcript {model.transcript_id}") + logger.debug(f"Heidi: Method correct_transcripts. 
Transcript: {model.transcript_id}, four one or more cigartuples: {cigartuples}") corrected_exons = self.correct_transcript_splice_sites(exons, assigned_reads) if corrected_exons: model.exon_blocks = corrected_exons From 20440ae27b849caa9f374665ffeac9795a4bd6af Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 10:12:50 +0300 Subject: [PATCH 26/44] Move debug to correct transcripts --- src/graph_based_model_construction.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index d18b5682..1ed4243e 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -208,12 +208,12 @@ def correct_transcripts(self): for model in self.transcript_model_storage: exons = model.exon_blocks assigned_reads = self.transcript_read_ids[model.transcript_id] - cigartuples = False + found_cigartuples = False for read in assigned_reads: if read.cigartuples: - cigartuples = True + found_cigartuples = True break - logger.debug(f"Heidi: Method correct_transcripts. Transcript: {model.transcript_id}, four one or more cigartuples: {cigartuples}") + logger.debug(f"Heidi: Method correct_transcripts. Transcript: {model.transcript_id}, four one or more cigartuples: {found_cigartuples}") corrected_exons = self.correct_transcript_splice_sites(exons, assigned_reads) if corrected_exons: model.exon_blocks = corrected_exons From 83f5ae63fe94801f676abf75c1a5ea0a7a18c527 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 10:23:03 +0300 Subject: [PATCH 27/44] Move debug to correct transcripts --- src/graph_based_model_construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 1ed4243e..451f538a 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -213,7 +213,7 @@ def correct_transcripts(self): if read.cigartuples: found_cigartuples = True break - logger.debug(f"Heidi: Method correct_transcripts. Transcript: {model.transcript_id}, four one or more cigartuples: {found_cigartuples}") + logger.debug(f"Heidi: Method correct_transcripts. 
Transcript: {model.transcript_id}, found one or more cigartuples: {found_cigartuples}") corrected_exons = self.correct_transcript_splice_sites(exons, assigned_reads) if corrected_exons: model.exon_blocks = corrected_exons From f8f363fcaaced755d4d9210ed8ea87a2e11dcb86 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 13:08:08 +0300 Subject: [PATCH 28/44] Move debug to correct transcripts --- src/graph_based_model_construction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 451f538a..732171d8 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -209,6 +209,7 @@ def correct_transcripts(self): exons = model.exon_blocks assigned_reads = self.transcript_read_ids[model.transcript_id] found_cigartuples = False + # TODO: REMOVE NEXT FIVE LINES AFTER CIAGRTUPLES ARE FIXED for read in assigned_reads: if read.cigartuples: found_cigartuples = True From b37f59b8e9afc7f17197d8413c80b9568d1f3f68 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 14:10:49 +0300 Subject: [PATCH 29/44] Fix bug with dict key ref --- src/transcript_splice_site_corrector.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index 6710d8a9..0e0f820e 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -50,7 +50,7 @@ def count_deletions_from_cigar_codes_in_given_window(cigartuples: list, loc_type (str): type of location (start or end) """ - deletions = 0 + count_of_deletions = 0 cigar_code_list = [] @@ -74,13 +74,13 @@ def count_deletions_from_cigar_codes_in_given_window(cigartuples: list, if i >= len(cigar_code_list): break if cigar_code_list[i] == 2: - deletions += 1 + count_of_deletions += 1 splice_site_data["del_pos_distr"][i] += 1 - if deletions not in splice_site_data: - splice_site_data["deletions"][deletions] = 0 + if count_of_deletions not in splice_site_data["deletions"]: + splice_site_data["deletions"][count_of_deletions] = 0 - splice_site_data["deletions"][deletions] += 1 + splice_site_data["deletions"][count_of_deletions] += 1 def extract_splice_site_locations_within_aligned_read(read_start: int, read_end: int, exons:list): @@ -117,6 +117,7 @@ def count_deletions_for_splice_site_locations( # Extract splice site locations within aligned read matching_locations = extract_splice_site_locations_within_aligned_read(read_start, read_end, exons) + logger.debug(f"Matching locations: {matching_locations}") # Count deletions for each splice site location for splice_site_location, location_type in matching_locations: @@ -265,6 +266,8 @@ def correct_splice_site_errors( strand (str): transcript strand (extracted from first ReadAssignment-object in read_assignments list) chr_record (Fasta): FASTA recored, i.e. 
a single chromosome from a reference """ + + locations_with_errors = [] for splice_site_location, splice_site_data in splice_site_cases.items(): From 8c240f48e3dd183d5f9d524061bdeba685b1f8e3 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 14:11:26 +0300 Subject: [PATCH 30/44] Add test for GraphBasedModelConstructor --- .../test_transcript_splice_site_corrector.py | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 9b0d32aa..37c70772 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -1,6 +1,9 @@ from unittest import TestCase -from unittest.mock import patch +from unittest.mock import patch, MagicMock +from src.isoform_assignment import ReadAssignment + +from src.graph_based_model_construction import GraphBasedModelConstructor from src.transcript_splice_site_corrector import ( extract_location_from_cigar_string, count_deletions_from_cigar_codes_in_given_window, @@ -561,4 +564,29 @@ def test_for_accepted_del_case_non_canonical_nucleotides_return_false(self): "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } self.assertEqual(splice_site_data, expected_result) - \ No newline at end of file + +class TestSpliceSiteCorrector(TestCase): + + + def test_error_is_corrected(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 10), (2, 4), (0, 6)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "+" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = [(0, 5), (10, 20)] + + constructor = GraphBasedModelConstructor( + gene_info=MagicMock(), + chr_record= "ABCDEFGHIJKLMAGPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + expected_result = [(0, 5), (14, 20)] + self.assertTrue(result == expected_result) + + + + \ No newline at end of file From 2360b6215c2ee1c1e62498660ca7765d1d914f19 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 18:27:40 +0300 Subject: [PATCH 31/44] Check for abs value --- src/transcript_splice_site_corrector.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index 0e0f820e..fd3aee23 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -182,7 +182,6 @@ def extract_nucleotides_from_most_common_del_location( possible_canonicals = canonical_pairs[strand]['end'] else: possible_canonicals = canonical_pairs[strand]['start'] - if extracted_canonicals in possible_canonicals: splice_site_data["del_location_has_canonical_nucleotides"] = True @@ -200,7 +199,7 @@ def compute_most_common_del_and_verify_nucleotides( splice_site_data["location_is_end"]) # Extract nucleotides from most common deletion location if it is an accepted case - if splice_site_data["most_common_del"] in ACCEPTED_DEL_CASES: + if abs(splice_site_data["most_common_del"]) in ACCEPTED_DEL_CASES: extract_nucleotides_from_most_common_del_location( splice_site_location, splice_site_data, From 898f75ce354eba3be3c1de7dbd62522146d46cf3 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 18:28:00 +0300 Subject: [PATCH 32/44] Expand unittests for GraphBaseModelConstructor method --- 
.../test_transcript_splice_site_corrector.py | 82 ++++++++++++++++++- 1 file changed, 80 insertions(+), 2 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 37c70772..42bb029f 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -568,7 +568,7 @@ def test_for_accepted_del_case_non_canonical_nucleotides_return_false(self): class TestSpliceSiteCorrector(TestCase): - def test_error_is_corrected(self): + def test_error_in_start_on_pos_strand_is_corrected(self): assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") assigned_read_1.cigartuples = [(0, 10), (2, 4), (0, 6)] assigned_read_1.corrected_exons = [(0, 20)] @@ -588,5 +588,83 @@ def test_error_is_corrected(self): self.assertTrue(result == expected_result) - + def test_error_in_end_on_pos_strand_is_corrected(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 10), (2, 4), (0, 16)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "+" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = [(0, 14), (20, 30)] + + constructor = GraphBasedModelConstructor( + gene_info=MagicMock(), + chr_record= "ABCDEFGIJGCMNOPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + expected_result = [(0, 10), (20, 30)] + self.assertTrue(result == expected_result) + + + def test_error_in_start_on_neg_strand_is_corrected(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 10), (2, 4), (0, 6)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "-" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = [(0, 5), (10, 20)] + + constructor = GraphBasedModelConstructor( + gene_info=MagicMock(), + chr_record= "ABCDEFGHIJKLMGCPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + expected_result = [(0, 5), (14, 20)] + self.assertTrue(result == expected_result) + + + def test_error_in_end_on_neg_strand_is_corrected(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 10), (2, 4), (0, 16)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "-" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = [(0, 14), (20, 30)] + + constructor = GraphBasedModelConstructor( + gene_info=MagicMock(), + chr_record= "ABCDEFGIJCTMNOPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + expected_result = [(0, 10), (20, 30)] + self.assertTrue(result == expected_result) + + + def test_case_with_dels_but_no_canonicals_in_end_on_neg_strand_returns_none(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 10), (2, 4), (0, 16)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "-" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = [(0, 14), (20, 30)] + + constructor = GraphBasedModelConstructor( + 
gene_info=MagicMock(), + chr_record= "ABCDEFGIJKLMNOPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + expected_result = None + self.assertTrue(result == expected_result) + \ No newline at end of file From 05de4f301e7a25229db1d844fee064582adddc39 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 18:37:33 +0300 Subject: [PATCH 33/44] Improve debugger stdouts --- src/graph_based_model_construction.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 732171d8..2887880c 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -236,7 +236,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): strand = assigned_reads[0].strand - logger.debug(f"Heidi: Correcting splice sites. n of exons: {len(exons)}, n of assigned reads: {len(assigned_reads)}, strand: {strand}") + logger.debug(f"correct_transcript_splice_sites. Correcting splice sites. n of exons: {len(exons)}, n of assigned reads: {len(assigned_reads)}, strand: {strand}") if strand not in SUPPORTED_STRANDS: return None @@ -257,7 +257,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): exons, splice_site_cases) - logger.debug(f"Heidi: Splice site cases: {splice_site_cases}") + logger.debug(f"correct_transcript_splice_sites. Splice site cases: {splice_site_cases}") corrected_exons = correct_splice_site_errors( splice_site_cases, @@ -272,12 +272,15 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): if not corrected_exons: return None + cases = [str(exon) + ": " + str(splice_site_cases[exon]) for exon in corrected_exons] + logger.debug(f"correct_transcript_splice_sites. 
Corrected exons: {len(corrected_exons)}, {corrected_exons} {cases}") + + updated_exons = generate_updated_exon_list( splice_site_cases, corrected_exons, exons ) - logger.debug(f"Heidi: Corrected exons: {len(updated_exons)}, {updated_exons}") return updated_exons From c9d09b69acbf4af06e269f60dacbc3a8a0a944aa Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Mon, 28 Aug 2023 09:40:19 +0300 Subject: [PATCH 34/44] Update unittest after changing constant positioning --- tests/test_transcript_splice_site_corrector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 42bb029f..32bc5b25 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -368,12 +368,14 @@ def test_count_deletions_from_splice_site_locations_extracts_correct_locations(s read_start = 20 read_end = 40 splice_site_cases = {} + WINDOW_SIZE = 8 count_deletions_for_splice_site_locations( read_start, read_end, cigartuples, exons, - splice_site_cases) + splice_site_cases, + WINDOW_SIZE) expected_result = { 20: { 'location_is_end': False, From 35a0942b3a4a294ae0096a05c79e73d3c8209818 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Mon, 28 Aug 2023 09:40:32 +0300 Subject: [PATCH 35/44] Move WINDOW_SIZE to main func --- src/graph_based_model_construction.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 2887880c..9e55793b 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -231,6 +231,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): SUPPORTED_STRANDS = ['+', '-'] THRESHOLD_CASES_AT_LOCATION = 0.7 MIN_N_OF_ALIGNED_READS = 5 + WINDOW_SIZE = 8 MORE_CONSERVATIVE_STRATEGY = False @@ -255,7 +256,8 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): read_end, cigartuples, exons, - splice_site_cases) + splice_site_cases, + WINDOW_SIZE) logger.debug(f"correct_transcript_splice_sites. 
Splice site cases: {splice_site_cases}") From 9e2c29d7182b5512058708b3f2d38727df821b4b Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Mon, 28 Aug 2023 09:40:44 +0300 Subject: [PATCH 36/44] Move const WINDOW_SIZE upper --- src/transcript_splice_site_corrector.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index fd3aee23..dc9dd750 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -102,7 +102,8 @@ def count_deletions_for_splice_site_locations( read_end: int, cigartuples: list, exons: list, - splice_site_cases: dict): + splice_site_cases: dict, + WINDOW_SIZE: int): """ Args: @@ -112,9 +113,6 @@ def count_deletions_for_splice_site_locations( """ - # Constant window size for counting deletions - WINDOW_SIZE = 8 - # Extract splice site locations within aligned read matching_locations = extract_splice_site_locations_within_aligned_read(read_start, read_end, exons) From 5c22578999504185e3d091da61451ee0d0436fa5 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Mon, 28 Aug 2023 15:48:37 +0300 Subject: [PATCH 37/44] Change division to multiplication --- src/transcript_splice_site_corrector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index dc9dd750..a46ff330 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -214,7 +214,7 @@ def threshold_for_del_cases_exceeded( total_cases = sum(deletions.values()) nucleotides_exceeding_treshold = 0 for value in del_pos_distr: - if value / total_cases > THRESHOLD_CASES_AT_LOCATION: + if value > total_cases * THRESHOLD_CASES_AT_LOCATION: nucleotides_exceeding_treshold += 1 return bool(nucleotides_exceeding_treshold >= abs(most_common_del)) From 46d01bbef3e4f9c40e69a202e8430d02e7f316e0 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Mon, 28 Aug 2023 15:48:52 +0300 Subject: [PATCH 38/44] Expand tests --- .../test_transcript_splice_site_corrector.py | 64 +++++++++++++++++-- 1 file changed, 60 insertions(+), 4 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 32bc5b25..38bd103a 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -600,7 +600,7 @@ def test_error_in_end_on_pos_strand_is_corrected(self): constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGIJGCMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHIGCLMNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -640,7 +640,7 @@ def test_error_in_end_on_neg_strand_is_corrected(self): constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGIJCTMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHICTLMNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -649,6 +649,45 @@ def test_error_in_end_on_neg_strand_is_corrected(self): expected_result = [(0, 10), (20, 30)] self.assertTrue(result == expected_result) + def test_error_in_end_on_neg_strand_and_min_accepted_del_cases_is_corrected(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 10), (2, 3), (0, 17)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "-" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = 
[(0, 14), (20, 30)] + + constructor = GraphBasedModelConstructor( + gene_info=MagicMock(), + chr_record= "ABCDEFGHIJCTMNOPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + expected_result = [(0, 11), (20, 30)] + self.assertTrue(result == expected_result) + + def test_error_in_end_on_neg_strand_and_max_accepted_del_cases_is_corrected(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 8), (2, 6), (0, 16)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "-" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = [(0, 14), (20, 30)] + + constructor = GraphBasedModelConstructor( + gene_info=MagicMock(), + chr_record= "ABCDEFGCTJKLMNOPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + + expected_result = [(0, 8), (20, 30)] + self.assertTrue(result == expected_result) + def test_case_with_dels_but_no_canonicals_in_end_on_neg_strand_returns_none(self): assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") @@ -660,7 +699,7 @@ def test_case_with_dels_but_no_canonicals_in_end_on_neg_strand_returns_none(self constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGIJKLMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHIJKLMNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -669,4 +708,21 @@ def test_case_with_dels_but_no_canonicals_in_end_on_neg_strand_returns_none(self expected_result = None self.assertTrue(result == expected_result) - \ No newline at end of file + def test_case_with_not_enough_dels_but_canonicals_in_end_on_pos_strand_returns_none(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 10), (2, 2), (0, 18)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "-" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = [(0, 14), (20, 30)] + + constructor = GraphBasedModelConstructor( + gene_info=MagicMock(), + chr_record= "ABCDEFGHIGCLMNOPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + expected_result = None + self.assertTrue(result == expected_result) From 5a908646ff57c6f19a1631ffe8c09a556f3b4270 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 29 Aug 2023 10:40:28 +0300 Subject: [PATCH 39/44] Shorten key name --- src/transcript_splice_site_corrector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index a46ff330..292f4a26 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -125,7 +125,7 @@ def count_deletions_for_splice_site_locations( 'deletions': {}, 'del_pos_distr': [0 for _ in range(WINDOW_SIZE)], 'most_common_del': -1, - 'del_location_has_canonical_nucleotides': False + 'canonical_bases_found': False } # Processing cigartuples @@ -181,7 +181,7 @@ def extract_nucleotides_from_most_common_del_location( else: possible_canonicals = canonical_pairs[strand]['start'] if extracted_canonicals in possible_canonicals: - splice_site_data["del_location_has_canonical_nucleotides"] = True + 
splice_site_data["canonical_bases_found"] = True def compute_most_common_del_and_verify_nucleotides( splice_site_location: int, @@ -292,7 +292,7 @@ def correct_splice_site_errors( THRESHOLD_CASES_AT_LOCATION): continue - if splice_site_data["del_location_has_canonical_nucleotides"]: + if splice_site_data["canonical_bases_found"]: locations_with_errors.append(splice_site_location) return locations_with_errors From 47ca8b33f7bf1769c5faded8e3eff7806873bee9 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 29 Aug 2023 10:40:39 +0300 Subject: [PATCH 40/44] Update tests after key name change --- .../test_transcript_splice_site_corrector.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 38bd103a..d926570a 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -329,13 +329,13 @@ class TestCorrectSpliceSiteErrors(TestCase): def test_errors_are_correctly_returned(self, mock_compute_most_common_case_of_deletions): splice_site_cases = { 20: { - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, "deletions": {4: 10}, "location_is_end": False, "most_common_del": 4, }, 30: { - "del_location_has_canonical_nucleotides": True, + "canonical_bases_found": True, "deletions": {4: 10}, "location_is_end": False, "most_common_del": 4, @@ -382,21 +382,21 @@ def test_count_deletions_from_splice_site_locations_extracts_correct_locations(s 'deletions': {2: 1}, 'del_pos_distr': [0, 0, 0, 0, 0, 0, 1, 1], 'most_common_del': -1, - 'del_location_has_canonical_nucleotides': False + 'canonical_bases_found': False }, 30: { 'location_is_end': True, 'deletions': {4: 1}, 'del_pos_distr': [0, 0, 0, 1, 1, 1, 1, 0], 'most_common_del': -1, - 'del_location_has_canonical_nucleotides': False + 'canonical_bases_found': False }, 40: { 'location_is_end': False, 'deletions': {0: 1}, 'del_pos_distr': [0, 0, 0, 0, 0, 0, 0, 0], 'most_common_del': -1, - 'del_location_has_canonical_nucleotides': False + 'canonical_bases_found': False }, } self.assertEqual(splice_site_cases, expected_result) @@ -409,7 +409,7 @@ def test_canonical_nucleotides_for_loc_start_pos_strand_are_extracted_correctly( splice_site_data = { "most_common_del": 4, "location_is_end": False, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, } chr_record = "AAAAAAAAAAAAAAG" @@ -419,14 +419,14 @@ def test_canonical_nucleotides_for_loc_start_pos_strand_are_extracted_correctly( splice_site_data, chr_record, strand) - self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + self.assertTrue(splice_site_data["canonical_bases_found"]) def test_canonical_nucleotides_for_loc_end_pos_strand_are_extracted_correctly(self): location = 10 splice_site_data = { "most_common_del": -4, "location_is_end": True, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, } # Fasta 1-based index extraction location: @@ -443,14 +443,14 @@ def test_canonical_nucleotides_for_loc_end_pos_strand_are_extracted_correctly(se splice_site_data, chr_record, strand) - self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + self.assertTrue(splice_site_data["canonical_bases_found"]) def test_canonical_nucleotides_for_loc_start_neg_strand_are_extracted_correctly(self): location = 10 splice_site_data = { "most_common_del": 4, "location_is_end": False, - 
"del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, } chr_record = "AAAAAAAAAAAAAAC" @@ -460,14 +460,14 @@ def test_canonical_nucleotides_for_loc_start_neg_strand_are_extracted_correctly( splice_site_data, chr_record, strand) - self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + self.assertTrue(splice_site_data["canonical_bases_found"]) def test_canonical_nucleotides_for_loc_end_neg_strand_are_extracted_correctly(self): location = 10 splice_site_data = { "most_common_del": -4, "location_is_end": True, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, } chr_record = "AAAAACTAAAAAAAA" @@ -477,7 +477,7 @@ def test_canonical_nucleotides_for_loc_end_neg_strand_are_extracted_correctly(se splice_site_data, chr_record, strand) - self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + self.assertTrue(splice_site_data["canonical_bases_found"]) class TestDeletionComputationAndBaseExtraction(TestCase): @@ -487,7 +487,7 @@ def test_for_accepted_del_case_nucleotides_are_vefiried(self): splice_site_data = { "most_common_del": -1, "location_is_end": False, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, "deletions": {4: 1}, "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } @@ -504,7 +504,7 @@ def test_for_accepted_del_case_nucleotides_are_vefiried(self): expected_result = { "most_common_del": 4, "location_is_end": False, - "del_location_has_canonical_nucleotides": True, + "canonical_bases_found": True, "deletions": {4: 1}, "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } @@ -516,7 +516,7 @@ def test_for_not_accepted_del_case_nucleotides_are_not_vefiried(self): splice_site_data = { "most_common_del": -1, "location_is_end": False, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, "deletions": {2: 1}, "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } @@ -533,7 +533,7 @@ def test_for_not_accepted_del_case_nucleotides_are_not_vefiried(self): expected_result = { "most_common_del": 2, "location_is_end": False, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, "deletions": {2: 1}, "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } @@ -544,7 +544,7 @@ def test_for_accepted_del_case_non_canonical_nucleotides_return_false(self): splice_site_data = { "most_common_del": -1, "location_is_end": False, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, "deletions": {4: 1}, "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } @@ -561,7 +561,7 @@ def test_for_accepted_del_case_non_canonical_nucleotides_return_false(self): expected_result = { "most_common_del": 4, "location_is_end": False, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, "deletions": {4: 1}, "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } From b825a5ce784147238fc30bb18a998921e834b4fe Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 29 Aug 2023 18:52:00 +0300 Subject: [PATCH 41/44] fix cigartuples exactly where they needed to be --- src/alignment_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/alignment_processor.py b/src/alignment_processor.py index b93dd571..e584649a 100644 --- a/src/alignment_processor.py +++ b/src/alignment_processor.py @@ -307,6 +307,7 @@ def process_intergenic(self, alignment_storage): read_assignment.polya_info = alignment_info.polya_info read_assignment.cage_found = len(alignment_info.cage_hits) > 0 read_assignment.exons = alignment_info.read_exons 
+ read_assignment.cigartuples = alignment.cigartuples read_assignment.corrected_exons = alignment_info.read_exons read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons) From 3c07e177a7aaf02860278580b813cc1ecdfd02af Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 30 Aug 2023 06:54:07 +0300 Subject: [PATCH 42/44] Change idx correction for FASTA extract --- src/transcript_splice_site_corrector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index 292f4a26..fee5711c 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -157,7 +157,7 @@ def extract_nucleotides_from_most_common_del_location( chr_record, strand: str): most_common_del = splice_site_data["most_common_del"] - idx_correction = -1 + idx_correction = 0 extraction_start = location + most_common_del + idx_correction extraction_end = location + most_common_del + 2 + idx_correction try: From ce52d631a079c52dc4860de3f661e5548d2272e8 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 30 Aug 2023 07:04:02 +0300 Subject: [PATCH 43/44] Fix unittests after fixing issue with chr_record idx-correction --- .../test_transcript_splice_site_corrector.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index d926570a..010dc7fb 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -411,7 +411,7 @@ def test_canonical_nucleotides_for_loc_start_pos_strand_are_extracted_correctly( "location_is_end": False, "canonical_bases_found": False, } - chr_record = "AAAAAAAAAAAAAAG" + chr_record = "AAAAAAAAAAAAAAAG" strand = "+" extract_nucleotides_from_most_common_del_location( @@ -435,7 +435,7 @@ def test_canonical_nucleotides_for_loc_end_pos_strand_are_extracted_correctly(se # | | # v start pos # A A A A A G C A A A A A A A A - chr_record = "AAAAAGCAAAAAAAA" + chr_record = "AAAAAAGCAAAAAAAA" strand = "+" extract_nucleotides_from_most_common_del_location( @@ -452,7 +452,7 @@ def test_canonical_nucleotides_for_loc_start_neg_strand_are_extracted_correctly( "location_is_end": False, "canonical_bases_found": False, } - chr_record = "AAAAAAAAAAAAAAC" + chr_record = "AAAAAAAAAAAAAAAC" strand = "-" extract_nucleotides_from_most_common_del_location( @@ -469,7 +469,7 @@ def test_canonical_nucleotides_for_loc_end_neg_strand_are_extracted_correctly(se "location_is_end": True, "canonical_bases_found": False, } - chr_record = "AAAAACTAAAAAAAA" + chr_record = "AAAAAACTAAAAAAAA" strand = "-" extract_nucleotides_from_most_common_del_location( @@ -492,7 +492,7 @@ def test_for_accepted_del_case_nucleotides_are_vefiried(self): "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } - chr_record = "AAAAAAAAAAAAAAG" + chr_record = "AAAAAAAAAAAAAAAG" ACCEPTED_DEL_CASES = [4] strand = "+" compute_most_common_del_and_verify_nucleotides( @@ -521,7 +521,7 @@ def test_for_not_accepted_del_case_nucleotides_are_not_vefiried(self): "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } - chr_record = "AAAAAAAAAAAAAAG" + chr_record = "AAAAAAAAAAAAAAAG" ACCEPTED_DEL_CASES = [4] strand = "+" compute_most_common_del_and_verify_nucleotides( @@ -549,7 +549,7 @@ def test_for_accepted_del_case_non_canonical_nucleotides_return_false(self): "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } - chr_record = "AAAAAAAAAAAAAXX" + chr_record = 
"AAAAAAAAAAAAAAXX" ACCEPTED_DEL_CASES = [4] strand = "+" compute_most_common_del_and_verify_nucleotides( @@ -580,7 +580,7 @@ def test_error_in_start_on_pos_strand_is_corrected(self): constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGHIJKLMAGPQRSTUVWXYZ", + chr_record= "ABCDEFGHIJKLMNAGQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -600,7 +600,7 @@ def test_error_in_end_on_pos_strand_is_corrected(self): constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGHIGCLMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHIJGCMNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -620,7 +620,7 @@ def test_error_in_start_on_neg_strand_is_corrected(self): constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGHIJKLMGCPQRSTUVWXYZ", + chr_record= "ABCDEFGHIJKLMNGCQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -640,7 +640,7 @@ def test_error_in_end_on_neg_strand_is_corrected(self): constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGHICTLMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHIJCTMNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -659,7 +659,7 @@ def test_error_in_end_on_neg_strand_and_min_accepted_del_cases_is_corrected(self constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGHIJCTMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHIJKCTNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -678,7 +678,7 @@ def test_error_in_end_on_neg_strand_and_max_accepted_del_cases_is_corrected(self constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGCTJKLMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHCTKLMNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -718,7 +718,7 @@ def test_case_with_not_enough_dels_but_canonicals_in_end_on_pos_strand_returns_n constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGHIGCLMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHIJGCMNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) From fb7db12671b63eb8340b3e506da211db9e612fc8 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 30 Aug 2023 09:04:55 +0300 Subject: [PATCH 44/44] Remove unneeded logger.debugs --- src/graph_based_model_construction.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 9e55793b..991fbd17 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -208,15 +208,9 @@ def correct_transcripts(self): for model in self.transcript_model_storage: exons = model.exon_blocks assigned_reads = self.transcript_read_ids[model.transcript_id] - found_cigartuples = False - # TODO: REMOVE NEXT FIVE LINES AFTER CIAGRTUPLES ARE FIXED - for read in assigned_reads: - if read.cigartuples: - found_cigartuples = True - break - logger.debug(f"Heidi: Method correct_transcripts. Transcript: {model.transcript_id}, found one or more cigartuples: {found_cigartuples}") corrected_exons = self.correct_transcript_splice_sites(exons, assigned_reads) if corrected_exons: + logger.debug(f"correct_transcripts. 
Corrected exons: {corrected_exons}, original exons: {exons}") model.exon_blocks = corrected_exons def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): @@ -237,7 +231,6 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): strand = assigned_reads[0].strand - logger.debug(f"correct_transcript_splice_sites. Correcting splice sites. n of exons: {len(exons)}, n of assigned reads: {len(assigned_reads)}, strand: {strand}") if strand not in SUPPORTED_STRANDS: return None @@ -259,7 +252,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): splice_site_cases, WINDOW_SIZE) - logger.debug(f"correct_transcript_splice_sites. Splice site cases: {splice_site_cases}") + corrected_exons = correct_splice_site_errors( splice_site_cases, @@ -275,7 +268,6 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): return None cases = [str(exon) + ": " + str(splice_site_cases[exon]) for exon in corrected_exons] - logger.debug(f"correct_transcript_splice_sites. Corrected exons: {len(corrected_exons)}, {corrected_exons} {cases}") updated_exons = generate_updated_exon_list(