From 1b26e0406d4de497a69a7192272575e1ff996528 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 8 Aug 2023 17:58:09 +0300 Subject: [PATCH 01/44] keep cigartuples in read assignment --- src/alignment_processor.py | 1 + src/isoform_assignment.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/alignment_processor.py b/src/alignment_processor.py index 8e130ed2..b93dd571 100644 --- a/src/alignment_processor.py +++ b/src/alignment_processor.py @@ -358,6 +358,7 @@ def process_genic(self, alignment_storage, gene_info): read_assignment.polya_info = alignment_info.polya_info read_assignment.cage_found = len(alignment_info.cage_hits) > 0 read_assignment.exons = alignment_info.read_exons + read_assignment.cigartuples = alignment.cigartuples read_assignment.corrected_exons = exon_corrector.correct_assigned_read(alignment_info, read_assignment) read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons) diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 47d73552..ffd57645 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -477,6 +477,7 @@ def __init__(self, read_id, assignment_type, match=None): self.assignment_id = ReadAssignment.assignment_id_generator.increment() self.read_id = read_id self.exons = None + self.cigartuples = None self.corrected_exons = None self.corrected_introns = None self.gene_info = None @@ -507,6 +508,7 @@ def deserialize(cls, infile, gene_info): read_assignment.assignment_id = read_int(infile) read_assignment.read_id = read_string(infile) read_assignment.exons = read_list_of_pairs(infile, read_int) + read_assignment.cigartuples = read_list_of_pairs(infile, read_int) read_assignment.corrected_exons = read_list_of_pairs(infile, read_int) read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons) read_assignment.gene_info = gene_info @@ -532,6 +534,7 @@ def serialize(self, outfile): write_int(self.assignment_id, outfile) write_string(self.read_id, outfile) write_list_of_pairs(self.exons, outfile, write_int) + write_list_of_pairs(self.cigartuples, outfile, write_int) write_list_of_pairs(self.corrected_exons, outfile, write_int) write_bool_array([self.multimapper, self.polyA_found, self.cage_found], outfile) write_int_neg(self.polya_info.external_polya_pos, outfile) From 27f52fe4a9b03417ca352ff9a3309cf29beb217d Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 17 Aug 2023 15:56:05 +0300 Subject: [PATCH 02/44] template for transcript correction --- src/graph_based_model_construction.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 7e94114c..c70c98b4 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -130,6 +130,7 @@ def process(self, read_assignment_storage): self.construct_assignment_based_isoforms(read_assignment_storage) self.assign_reads_to_models(read_assignment_storage) self.filter_transcripts() + self.correct_transcripts() if self.params.genedb: self.create_extended_annotation() @@ -198,6 +199,23 @@ def compare_models_with_known(self): model.add_additional_attribute("alternatives", event_string) self.transcript2transcript.append(assignment) + def correct_transcripts(self): + for model in self.transcript_model_storage: + exons = model.exon_blocks + assigned_reads = self.transcript_read_ids[model.transcript_id] + corrected_exons = self.correct_transcript_splice_sites(exons, assigned_reads) + 
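        # Illustration (hypothetical exon coordinates, not from the patch): correct_transcript_splice_sites()
        # is expected to return either None (no correction) or a complete corrected exon list, e.g.
        #   exons           = [(100, 200), (300, 400)]
        #   corrected_exons = [(100, 200), (304, 400)]   # acceptor start shifted by a 4 bp deletion
        # so model.exon_blocks is replaced only when a corrected list comes back.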
if corrected_exons: + model.exon_blocks = corrected_exons + + def correct_transcript_splice_sites(self, exons, assigned_reads): + # exons: list of coordinate pairs + # assigned_reads: list of ReadAssignment, contains read_id and cigartuples + # self.chr_record - FASTA recored, i.e. a single chromosome from a reference + # returns: a list of corrected exons if correction takes place, None - otherwise + # TODO Heidi: insert your code here + return None + + def filter_transcripts(self): filtered_storage = [] confirmed_transcipt_ids = set() From 65f324d57f1099b979190a2fba916d3a9121ac0e Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Mon, 21 Aug 2023 14:59:26 +0300 Subject: [PATCH 03/44] Add initial implementation for transcript_splice_site_corrector and unittests --- src/graph_based_model_construction.py | 70 ++++- src/transcript_splice_site_corrector.py | 241 +++++++++++++++++ .../test_transcript_splice_site_corrector.py | 251 ++++++++++++++++++ 3 files changed, 560 insertions(+), 2 deletions(-) create mode 100644 src/transcript_splice_site_corrector.py create mode 100644 tests/test_transcript_splice_site_corrector.py diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index c70c98b4..6a50da63 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -26,6 +26,9 @@ from .long_read_profiles import CombinedProfileConstructor from .polya_finder import PolyAInfo +from .transcript_splice_site_corrector import count_deletions_for_splice_site_locations +from .transcript_splice_site_corrector import compute_most_common_del_and_verify_nucleotides +from .transcript_splice_site_corrector import sublist_largest_values_exists logger = logging.getLogger('IsoQuant') @@ -207,13 +210,76 @@ def correct_transcripts(self): if corrected_exons: model.exon_blocks = corrected_exons - def correct_transcript_splice_sites(self, exons, assigned_reads): + def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): # exons: list of coordinate pairs # assigned_reads: list of ReadAssignment, contains read_id and cigartuples # self.chr_record - FASTA recored, i.e. a single chromosome from a reference # returns: a list of corrected exons if correction takes place, None - otherwise # TODO Heidi: insert your code here - return None + + + # Constants + ACCEPTED_DEL_CASES = [3, 4, 5, 6] + SUPPORTED_STRANDS = ['+', '-'] + THRESHOLD_CASES_AT_LOCATION = 0.7 + MIN_N_OF_ALIGNED_READS = 5 + + MORE_CONSERVATIVE_STRATEGY = False + + + strand = assigned_reads[0].strand + if strand not in SUPPORTED_STRANDS: + return None + + splice_site_cases = {} + # Iterate assigned_reads list and count deletions for splice site locations + for read_assignment in assigned_reads: + count_deletions_for_splice_site_locations(read_assignment, exons, splice_site_cases) + + # Second iteration + # 1. Count most common deletion at each splice site location + # 2. For interesting cases count nucleotides at deletion positions + # 3. 
If canonical nucleotides are found, correct splice site + + corrected_exons = [] + for splice_site_location, splice_site_data in splice_site_cases.items(): + + reads = sum(splice_site_data["deletions"].values()) + if reads < MIN_N_OF_ALIGNED_READS: + continue + + compute_most_common_del_and_verify_nucleotides( + splice_site_location, + splice_site_data, + self.chr_record, + ACCEPTED_DEL_CASES, + strand + ) + if MORE_CONSERVATIVE_STRATEGY: + if not sublist_largest_values_exists( + splice_site_data["del_pos_distr"], + abs(splice_site_data["most_common_del"])): + continue + pass + + if splice_site_data["del_location_has_canonical_nucleotides"]: + corrected_exons.append(splice_site_location) + + # If correction took place, return corrected exons + if not corrected_exons: + return None + + corrected_exons = [] + for exon in exons: + corrected_exon = exon + if exon[0] in splice_site_cases: + corrected_location = exon[0] + splice_site_cases[exon[0]]["most_common_del"] + corrected_exon = (corrected_location, exon[1]) + if exon[1] in splice_site_cases: + corrected_location = exon[1] + splice_site_cases[exon[1]]["most_common_del"] + corrected_exon = (exon[0], corrected_location) + corrected_exons.append(corrected_exon) + return corrected_exons def filter_transcripts(self): diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py new file mode 100644 index 00000000..7ee8e727 --- /dev/null +++ b/src/transcript_splice_site_corrector.py @@ -0,0 +1,241 @@ +def extract_location_from_cigar_string(cigartuples: list, + read_start: int, + read_end: int, + splice_site_location: int): + """ + Extract location from cigar string. + + Args: + cigar_tuples (list): list of cigar tuples (cigar code, aligned position). + See pysam documentation for more information + read_start (int): the start location for the read (base-1) + read_end (int): the end location for the read (base-1) + splice_site_location (int): location of interest (base-1) + + Returns: + _type_: _description_ + """ + relative_position = splice_site_location - read_start + alignment_position = 0 + ref_position = 0 + + for cigar_code in cigartuples: + + if cigar_code[0] in [0, 2, 3, 7, 8]: + ref_position += cigar_code[1] + if ref_position <= relative_position and not \ + read_start + ref_position == read_end: + alignment_position += cigar_code[1] + else: + return alignment_position + (cigar_code[1] - (ref_position - relative_position)) + + return -1 + + +def count_deletions_from_cigar_codes_in_given_window(cigartuples: list, + aligned_location: int, + location_is_end: bool, + splice_site_data: dict, + window_size: int): + """ + Get cigar codes in a given window. + + Args: + cigar_tuples (list): list of cigar tuples (cigar code, aligned position). 
See + pysam documentation for more information + aligned_location (int): aligned location + loc_type (str): type of location (start or end) + """ + + deletions = 0 + + + cigar_code_list = [] + location = 0 + + if location_is_end: + aligned_location = aligned_location - window_size + 1 + + for cigar_code in cigartuples: + if window_size == len(cigar_code_list): + break + if location + cigar_code[1] > aligned_location: + overlap = location + \ + cigar_code[1] - (aligned_location + len(cigar_code_list)) + cigar_code_list.extend( + [cigar_code[0] for _ in range(min(window_size - + len(cigar_code_list), overlap))]) + location += cigar_code[1] + + for i in range(window_size): + if i >= len(cigar_code_list): + break + if cigar_code_list[i] == 2: + deletions += 1 + splice_site_data["del_pos_distr"][i] += 1 + + if deletions not in splice_site_data: + splice_site_data["deletions"][deletions] = 0 + + splice_site_data["deletions"][deletions] += 1 + + +def extract_splice_site_locations_within_aligned_read(read_start: int, read_end: int, exons:list): + matching_locations = [] + for exon_start, exon_end in exons: + if read_start <= exon_start <= read_end: + location_is_end = False + matching_locations.append((exon_start, location_is_end)) + if read_start <= exon_end <= read_end: + location_is_end = True + matching_locations.append((exon_end, location_is_end)) + if read_end <= exon_end: + break + return matching_locations + + +def count_deletions_for_splice_site_locations(assigned_read, exons: list, splice_site_cases: dict): + """ + + Args: + assigned_read (ReadAssignment): read assignment + exons (list): tuple of exons (start, end) + splice_site_cases (dict): a dictionary for storing splice site cases + """ + + # Extract read start and end + read_start = assigned_read.corrected_exons[0][0] + read_end = assigned_read.corrected_exons[-1][1] + cigartuples = assigned_read.cigartuples + + # Constant window size for counting deletions + WINDOW_SIZE = 8 + + # Extract splice site locations within aligned read + matching_locations = extract_splice_site_locations_within_aligned_read(read_start, read_end, exons) + + # Count deletions for each splice site location + for splice_site_location, location_type in matching_locations: + if splice_site_location not in splice_site_cases: + splice_site_cases[splice_site_location] = { + 'location_is_end': location_type, + 'deletions': {}, + 'del_pos_distr': [0 for _ in range(WINDOW_SIZE)], + 'most_common_deletion': -1, + 'del_location_has_canonical_nucleotides': False + } + + # Processing cigartuples + # 1. Find the aligned location + aligned_location = extract_location_from_cigar_string(cigartuples, read_start, read_end, splice_site_location) + # 2. 
Count deletions in a predefined window + count_deletions_from_cigar_codes_in_given_window( + cigartuples, + aligned_location, + location_type, + splice_site_cases[splice_site_location], + WINDOW_SIZE) + + + +def compute_most_common_case_of_deletions(deletions: dict, location_is_end: bool): + del_most_common_case = [k for k, v in deletions.items( + ) if v == max(deletions.values())] + if len(del_most_common_case) == 1: + if location_is_end: + return -del_most_common_case[0] + return del_most_common_case[0] + return -1 + + +def extract_nucleotides_from_most_common_del_location( + location: int, + splice_site_data: dict, + chr_record, + strand: str): + most_common_del = splice_site_data["most_common_del"] + idx_correction = -1 + extraction_start = location + most_common_del + idx_correction + extraction_end = location + most_common_del + 2 + idx_correction + try: + extracted_canonicals = chr_record[extraction_start:extraction_end] + except KeyError: + extracted_canonicals = 'XX' + + + canonical_pairs = { + '+': { + 'start': ['AG', 'AC'], + 'end': ['GT', 'GC', 'AT'] + }, + '-': { + 'start': ['AC', 'GC', 'AC'], + 'end': ['CT', 'GT'] + } + } + if splice_site_data["location_is_end"]: + possible_canonicals = canonical_pairs[strand]['end'] + else: + possible_canonicals = canonical_pairs[strand]['start'] + + if extracted_canonicals in possible_canonicals: + splice_site_data["del_location_has_canonical_nucleotides"] = True + +def compute_most_common_del_and_verify_nucleotides( + splice_site_location: int, + splice_site_data: dict, + chr_record, + ACCEPTED_DEL_CASES: list, + strand: str,): + + + # Compute most common case of deletions + splice_site_data["most_common_deletion"] = compute_most_common_case_of_deletions( + splice_site_data["deletions"], + splice_site_data["location_is_end"]) + + # Extract nucleotides from most common deletion location if it is an accepted case + if splice_site_data["most_common_deletion"] in ACCEPTED_DEL_CASES: + extract_nucleotides_from_most_common_del_location( + splice_site_location, + splice_site_data, + chr_record, + strand) + + + +def threshold_exceeded( + del_pos_distr: list, + deletions: dict, + most_common_del: int, + THRESHOLD_CASES_AT_LOCATION): + total_cases = sum(deletions.values()) + nucleotides_exceeding_treshold = 0 + for value in del_pos_distr: + if value / total_cases > THRESHOLD_CASES_AT_LOCATION: + nucleotides_exceeding_treshold += 1 + return bool(nucleotides_exceeding_treshold >= abs(most_common_del)) + +def sublist_largest_values_exists(lst, n): + """ + Verifies that there is a sublist of size n that contains the largest values in the list. + Not currently in use, but may be included in the error prediction strategy for stricter prediction. 
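    Example (mirrors the unit tests): with n = 4,
    sublist_largest_values_exists([0, 0, 10, 10, 10, 10, 0, 0], 4) returns True because the
    four largest values form one contiguous run, whereas
    sublist_largest_values_exists([0, 0, 10, 10, 10, 0, 6, 0], 4) returns False.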
+ Args: + lst (int): list of deletion distribution + n (int): most common case of deletions + + Returns: + _type_: _description_ + """ + largest_values = set(sorted(lst, reverse=True)[:n]) + count = 0 + + for num in lst: + if num in largest_values: + count += 1 + if count >= n: + return True + else: + count = 0 + + return False \ No newline at end of file diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py new file mode 100644 index 00000000..77d3f5ae --- /dev/null +++ b/tests/test_transcript_splice_site_corrector.py @@ -0,0 +1,251 @@ +from unittest import TestCase +from unittest import main as unittest_main + + +from src.transcript_splice_site_corrector import threshold_exceeded +from src.transcript_splice_site_corrector import sublist_largest_values_exists +from src.transcript_splice_site_corrector import extract_location_from_cigar_string +from src.transcript_splice_site_corrector import count_deletions_from_cigar_codes_in_given_window +from src.transcript_splice_site_corrector import extract_splice_site_locations_within_aligned_read +class TestMoreConservativeStrategyConditions(TestCase): + + def test_threshold_exceeds_returns_true(self): + THRESHOLD = 0.7 + del_pos_distr = [0, 0, 10, 10, 10, 10, 0, 0] + deletions = {4: 10} + most_common_del = 4 + result = threshold_exceeded( + del_pos_distr, + deletions, + most_common_del, + THRESHOLD) + self.assertTrue(result) + + def test_threshold_not_exceeded_returns_false(self): + THRESHOLD = 0.7 + del_pos_distr = [0, 0, 10, 10, 10, 6, 0, 0] + deletions = {4: 6, 3: 4} + most_common_del = 4 + result = threshold_exceeded( + del_pos_distr, + deletions, + most_common_del, + THRESHOLD) + self.assertFalse(result) + + def test_sublist_largest_values_exists_returns_true(self): + lst = [0, 0, 10, 10, 10, 10, 0, 0] + n = 4 + result = sublist_largest_values_exists(lst, n) + self.assertTrue(result) + + def test_sublist_largest_values_exists_returns_false(self): + lst = [0, 0, 10, 10, 10, 6, 0, 0] + n = 4 + result = sublist_largest_values_exists(lst, n) + self.assertFalse(result) + + +class TestExtractingLocationFromCigarString(TestCase): + + def test_cigar_string_with_soft_clip_and_one_match_is_parsed_correctly(self): + cigar = [(4, 50), (0, 10)] + reference_start = 100 + reference_end = 160 + location = 105 + expected_output = 55 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + + def test_cigar_string_with_soft_clip_insertion_and_one_match_is_parsed_correctly(self): + cigar = [(4, 50), (1, 10), (0, 10)] + reference_start = 100 + reference_end = 160 + location = 105 + expected_output = 65 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + + def test_cigar_str_with_s_d_i_m_gives_correct_output(self): + cigar = [(4, 50), (2, 10), (1, 10), (0, 10)] + reference_start = 100 + reference_end = 160 + location = 115 + expected_output = 75 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + def test_cigar_str_with_s_d_n_m_gives_correct_output(self): + cigar = [(4, 50), (2, 10), (3, 100), (0, 10)] + reference_start = 100 + reference_end = 160 + location = 215 + expected_output = 165 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + def 
test_cigar_str_with_s_m_i_n_m_gives_correct_output(self): + cigar = [(4, 50), (0, 10), (1, 10), (3, 100), (0, 10)] + reference_start = 100 + reference_end = 160 + location = 215 + expected_output = 175 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + def test_location_outside_of_cigar_str_returns_minus_one(self): + cigar = [(4, 50), (0, 10)] + reference_start = 100 + reference_end = 160 + location = 199 + expected_output = -1 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + def test_more_complicated_test_returns_correct_position(self): + cigar_tuples = [(4, 156), (0, 12), (2, 3), (0, 2), (2, 2), (0, 10), (2, 2), (0, 4), (2, 3), (0, 7), (1, 1), (0, 16), (1, 4), (0, 23), (1, 1), (0, 7), + (1, 1), (0, 9), (2, 1), (0, 13), (2, 1), (0, 15), (2, 2), (0, 3), (1, 2), (0, 19), (2, 2), (0, 20), (2, 1), (0, 32), (3, 294), (0, 36), (4, 25)] + reference_start = 72822568 + reference_end = 73822568 + position = 72823071 + expected_output = 668 + result = extract_location_from_cigar_string( + cigar_tuples, reference_start, reference_end, position) + self.assertEqual(result, expected_output) + + def test_case_that_does_not_consume_any_reference_returns_the_correct_location(self): + cigar = [(4, 50), (0, 10)] + reference_start = 100 + reference_end = 160 + location = 100 + expected_output = 50 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + def test_case_that_has_no_reference_consuming_codes_returns_minus_one_as_error(self): + cigar = [(4, 50), (1, 10)] + reference_start = 100 + reference_end = 160 + location = 100 + expected_output = -1 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + def test_case_that_has_no_reference_consuming_codes_at_the_end_returns_minus_one_as_error(self): + cigar = [(4, 50), (0, 10), (1, 10)] + reference_start = 100 + reference_end = 160 + location = 110 + expected_output = -1 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + def test_case_that_has_it_s_location_at_final_match_returns_correct_value(self): + cigar = [(4, 50), (0, 10), (1, 10)] + reference_start = 100 + reference_end = 110 + location = 110 + expected_output = 60 + result = extract_location_from_cigar_string( + cigar, reference_start, reference_end, location) + self.assertEqual(result, expected_output) + + +class TestIndelCountingFromCigarCodes(TestCase): + + def setUp(self): + self.window_size = 8 + + def test_indel_counter_returns_false_and_an_empty_debug_list_for_given_empty_list(self): + cigar_tuples = [] + aligned_location = 100 + location_is_end = False + splice_site_data = { + 'deletions': {}, + "del_pos_distr": [0] * self.window_size, + } + expected_result = { + 'deletions': {0: 1}, + "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0] + } + count_deletions_from_cigar_codes_in_given_window( + cigar_tuples, + aligned_location, + location_is_end, + splice_site_data, + self.window_size) + + self.assertEqual(splice_site_data['deletions'], expected_result['deletions']) + self.assertEqual(splice_site_data['del_pos_distr'], expected_result['del_pos_distr']) + + + + def test_indels_are_counted_correctly(self): + cigar_tuples = [(0, 20), (2, 3), (1, 2), 
(0, 10)] + aligned_location = 27 + location_is_end = True + splice_site_data = { + 'deletions': {}, + "del_pos_distr": [0] * self.window_size, + } + + + expected_result = { + 'deletions': {3: 1}, + "del_pos_distr": [1, 1, 1, 0, 0, 0, 0, 0] + } + + count_deletions_from_cigar_codes_in_given_window( + cigar_tuples, + aligned_location, + location_is_end, + splice_site_data, + self.window_size) + + self.assertEqual(splice_site_data['deletions'], expected_result['deletions']) + self.assertEqual(splice_site_data['del_pos_distr'], expected_result['del_pos_distr']) + + def test_full_window_of_dels_returns_true_for_errors(self): + cigar_tuples = [(0, 20), (2, 8), (1, 2), (0, 10)] + aligned_location = 20 + location_is_end = False + splice_site_data = { + 'deletions': {}, + "del_pos_distr": [0] * self.window_size, + } + expected_result = { + 'deletions': {8: 1}, + "del_pos_distr": [1, 1, 1, 1, 1, 1, 1, 1] + } + + count_deletions_from_cigar_codes_in_given_window( + cigar_tuples, + aligned_location, + location_is_end, + splice_site_data, + self.window_size) + + self.assertEqual(splice_site_data['deletions'], expected_result['deletions']) + self.assertEqual(splice_site_data['del_pos_distr'], expected_result['del_pos_distr']) + +class ExtractSpliceSiteLocationsFromAlignedRead(TestCase): + + def test_correct_splice_sites_are_extracted(self): + exons = [(1, 10), (20, 30), (40, 50)] + read_start = 20 + read_end = 40 + result = extract_splice_site_locations_within_aligned_read( + read_start, read_end, exons) + expected_output = [20, 30 , 40] + self.assertEqual(result, expected_output) \ No newline at end of file From bd52a52380a69da269cd98a0c3449062a039ef47 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 08:46:42 +0300 Subject: [PATCH 04/44] Fix issues with two unittests --- tests/test_transcript_splice_site_corrector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 77d3f5ae..5b96543e 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -40,7 +40,7 @@ def test_sublist_largest_values_exists_returns_true(self): self.assertTrue(result) def test_sublist_largest_values_exists_returns_false(self): - lst = [0, 0, 10, 10, 10, 6, 0, 0] + lst = [0, 0, 10, 10, 10, 0, 6, 0] n = 4 result = sublist_largest_values_exists(lst, n) self.assertFalse(result) @@ -247,5 +247,5 @@ def test_correct_splice_sites_are_extracted(self): read_end = 40 result = extract_splice_site_locations_within_aligned_read( read_start, read_end, exons) - expected_output = [20, 30 , 40] + expected_output = [(20, False), (30, True) , (40, False)] self.assertEqual(result, expected_output) \ No newline at end of file From 99c9d393fdf752675f8ba25c12bf4b5d4d30c509 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 09:01:46 +0300 Subject: [PATCH 05/44] Fix issue with two datastructures having the same var name --- src/graph_based_model_construction.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 6a50da63..e012ab94 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -269,17 +269,17 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): if not corrected_exons: return None - corrected_exons = [] + final_corrected_exons = [] for exon in exons: - corrected_exon 
= exon + new_corrected_exon = exon if exon[0] in splice_site_cases: corrected_location = exon[0] + splice_site_cases[exon[0]]["most_common_del"] corrected_exon = (corrected_location, exon[1]) if exon[1] in splice_site_cases: corrected_location = exon[1] + splice_site_cases[exon[1]]["most_common_del"] corrected_exon = (exon[0], corrected_location) - corrected_exons.append(corrected_exon) - return corrected_exons + final_corrected_exons.append(new_corrected_exon) + return final_corrected_exons def filter_transcripts(self): From 98619dbd680622e3cb854156d4d92d92db496ec3 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 09:44:14 +0300 Subject: [PATCH 06/44] Refactor code into separate functions --- src/graph_based_model_construction.py | 72 +++++++++++---------------- 1 file changed, 29 insertions(+), 43 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index e012ab94..99d5cee5 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -26,9 +26,11 @@ from .long_read_profiles import CombinedProfileConstructor from .polya_finder import PolyAInfo -from .transcript_splice_site_corrector import count_deletions_for_splice_site_locations -from .transcript_splice_site_corrector import compute_most_common_del_and_verify_nucleotides -from .transcript_splice_site_corrector import sublist_largest_values_exists +from .transcript_splice_site_corrector import ( + count_deletions_for_splice_site_locations, + correct_splice_site_errors, + generate_updated_exon_list + ) logger = logging.getLogger('IsoQuant') @@ -234,52 +236,36 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): splice_site_cases = {} # Iterate assigned_reads list and count deletions for splice site locations for read_assignment in assigned_reads: - count_deletions_for_splice_site_locations(read_assignment, exons, splice_site_cases) + read_start = read_assignment.corrected_exons[0][0] + read_end = read_assignment.corrected_exons[-1][1] + cigartuples = read_assignment.cigartuples + count_deletions_for_splice_site_locations( + read_start, + read_end, + cigartuples, + exons, + splice_site_cases) - # Second iteration - # 1. Count most common deletion at each splice site location - # 2. For interesting cases count nucleotides at deletion positions - # 3. 
If canonical nucleotides are found, correct splice site - corrected_exons = [] - for splice_site_location, splice_site_data in splice_site_cases.items(): - - reads = sum(splice_site_data["deletions"].values()) - if reads < MIN_N_OF_ALIGNED_READS: - continue - - compute_most_common_del_and_verify_nucleotides( - splice_site_location, - splice_site_data, - self.chr_record, - ACCEPTED_DEL_CASES, - strand - ) - if MORE_CONSERVATIVE_STRATEGY: - if not sublist_largest_values_exists( - splice_site_data["del_pos_distr"], - abs(splice_site_data["most_common_del"])): - continue - pass + corrected_exons = correct_splice_site_errors( + splice_site_cases, + MIN_N_OF_ALIGNED_READS, + ACCEPTED_DEL_CASES, + MORE_CONSERVATIVE_STRATEGY, + strand, + self.chr_record + ) - if splice_site_data["del_location_has_canonical_nucleotides"]: - corrected_exons.append(splice_site_location) - - # If correction took place, return corrected exons if not corrected_exons: return None - final_corrected_exons = [] - for exon in exons: - new_corrected_exon = exon - if exon[0] in splice_site_cases: - corrected_location = exon[0] + splice_site_cases[exon[0]]["most_common_del"] - corrected_exon = (corrected_location, exon[1]) - if exon[1] in splice_site_cases: - corrected_location = exon[1] + splice_site_cases[exon[1]]["most_common_del"] - corrected_exon = (exon[0], corrected_location) - final_corrected_exons.append(new_corrected_exon) - return final_corrected_exons + updated_exons = generate_updated_exon_list( + splice_site_cases, + corrected_exons, + exons + ) + + return updated_exons def filter_transcripts(self): From 321be9c00beaa1b2ad4d72a26b6c369750a81751 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 09:44:29 +0300 Subject: [PATCH 07/44] Refactor code into separate functions --- src/transcript_splice_site_corrector.py | 76 +++++++++++++++++++++++-- 1 file changed, 70 insertions(+), 6 deletions(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index 7ee8e727..be17594e 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -94,7 +94,12 @@ def extract_splice_site_locations_within_aligned_read(read_start: int, read_end: return matching_locations -def count_deletions_for_splice_site_locations(assigned_read, exons: list, splice_site_cases: dict): +def count_deletions_for_splice_site_locations( + read_start: int, + read_end: int, + cigartuples: list, + exons: list, + splice_site_cases: dict): """ Args: @@ -103,10 +108,6 @@ def count_deletions_for_splice_site_locations(assigned_read, exons: list, splice splice_site_cases (dict): a dictionary for storing splice site cases """ - # Extract read start and end - read_start = assigned_read.corrected_exons[0][0] - read_end = assigned_read.corrected_exons[-1][1] - cigartuples = assigned_read.cigartuples # Constant window size for counting deletions WINDOW_SIZE = 8 @@ -238,4 +239,67 @@ def sublist_largest_values_exists(lst, n): else: count = 0 - return False \ No newline at end of file + return False + + +def correct_splice_site_errors( + splice_site_cases: dict, + MIN_N_OF_ALIGNED_READS: int, + ACCEPTED_DEL_CASES: list, + MORE_CONSERVATIVE_STRATEGY: bool, + strand: str, + chr_record): + """ 1. Count most common deletion at each splice site location + 2. For interesting cases count nucleotides at deletion positions + 3. 
If canonical nucleotides are found, correct splice site + + Args: + splice_site_cases (dict): collected splice site cases + MIN_N_OF_ALIGNED_READS (int): constant for minimum number of aligned reads + ACCEPTED_DEL_CASES (list): constant for accepted cases of deletions + MORE_CONSERVATIVE_STRATEGY (bool): constant for more conservative strategy + strand (str): transcript strand (extracted from first ReadAssignment-object in read_assignments list) + chr_record (Fasta): FASTA recored, i.e. a single chromosome from a reference + """ + + locations_with_errors = [] + for splice_site_location, splice_site_data in splice_site_cases.items(): + + reads = sum(splice_site_data["deletions"].values()) + if reads < MIN_N_OF_ALIGNED_READS: + continue + + compute_most_common_del_and_verify_nucleotides( + splice_site_location, + splice_site_data, + chr_record, + ACCEPTED_DEL_CASES, + strand + ) + if MORE_CONSERVATIVE_STRATEGY: + if not sublist_largest_values_exists( + splice_site_data["del_pos_distr"], + abs(splice_site_data["most_common_del"])): + continue + pass + + if splice_site_data["del_location_has_canonical_nucleotides"]: + locations_with_errors.append(splice_site_location) + + return locations_with_errors + +def generate_updated_exon_list( + splice_site_cases: dict, + locations_with_errors: list, + exons: list): + updated_exons = [] + for exon in exons: + updated_exon = exon + if exon[0] in locations_with_errors: + corrected_location = exon[0] + splice_site_cases[exon[0]]["most_common_del"] + updated_exon = (corrected_location, exon[1]) + if exon[1] in locations_with_errors: + corrected_location = exon[1] + splice_site_cases[exon[1]]["most_common_del"] + updated_exon = (exon[0], corrected_location) + updated_exons.append(updated_exon) + return updated_exons \ No newline at end of file From f30996ba400a19f4b2e4ff3d650b35ab6aeb0deb Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 09:44:46 +0300 Subject: [PATCH 08/44] Expand tests for untested functions --- .../test_transcript_splice_site_corrector.py | 78 +++++++++++++++++-- 1 file changed, 71 insertions(+), 7 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 5b96543e..071bd309 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -2,11 +2,20 @@ from unittest import main as unittest_main -from src.transcript_splice_site_corrector import threshold_exceeded -from src.transcript_splice_site_corrector import sublist_largest_values_exists -from src.transcript_splice_site_corrector import extract_location_from_cigar_string -from src.transcript_splice_site_corrector import count_deletions_from_cigar_codes_in_given_window -from src.transcript_splice_site_corrector import extract_splice_site_locations_within_aligned_read +from src.transcript_splice_site_corrector import ( + extract_location_from_cigar_string, + count_deletions_from_cigar_codes_in_given_window, + extract_splice_site_locations_within_aligned_read, + count_deletions_for_splice_site_locations, + compute_most_common_case_of_deletions, + extract_nucleotides_from_most_common_del_location, + compute_most_common_del_and_verify_nucleotides, + threshold_exceeded, + sublist_largest_values_exists, + correct_splice_site_errors, + generate_updated_exon_list, +) + class TestMoreConservativeStrategyConditions(TestCase): def test_threshold_exceeds_returns_true(self): @@ -239,7 +248,7 @@ def test_full_window_of_dels_returns_true_for_errors(self): 
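        # With a window of 8 starting at aligned position 20, the 8 bp deletion (cigar op 2)
        # covers every window position, so each slot of del_pos_distr is incremented once and
        # a single read with 8 deleted bases is recorded in the deletions counter.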
self.assertEqual(splice_site_data['deletions'], expected_result['deletions']) self.assertEqual(splice_site_data['del_pos_distr'], expected_result['del_pos_distr']) -class ExtractSpliceSiteLocationsFromAlignedRead(TestCase): +class TestExtractSpliceSiteLocationsFromAlignedRead(TestCase): def test_correct_splice_sites_are_extracted(self): exons = [(1, 10), (20, 30), (40, 50)] @@ -248,4 +257,59 @@ def test_correct_splice_sites_are_extracted(self): result = extract_splice_site_locations_within_aligned_read( read_start, read_end, exons) expected_output = [(20, False), (30, True) , (40, False)] - self.assertEqual(result, expected_output) \ No newline at end of file + self.assertEqual(result, expected_output) + + +class TestExonListUpdater(TestCase): + + def test_error_at_location_start_is_corrected(self): + exons = [(1, 10), (20, 30), (40, 50)] + locations_with_errors = [20] + splice_site_cases = { + 20: { + "most_common_del": 4, + } + } + result = generate_updated_exon_list( + splice_site_cases, locations_with_errors, exons) + expected_result = [(1, 10), (24, 30), (40, 50)] + self.assertEqual(result, expected_result) + + def test_error_at_location_end_is_corrected(self): + exons = [(1, 10), (20, 30), (40, 50)] + locations_with_errors = [30] + splice_site_cases = { + 30: { + "most_common_del": -4, + } + } + result = generate_updated_exon_list( + splice_site_cases, locations_with_errors, exons) + expected_result = [(1, 10), (20, 26), (40, 50)] + self.assertEqual(result, expected_result) + + + pass + +class TestHelperFunctions(TestCase): + + def test_distinct_most_common_case_is_returned_for_location_end(self): + cases = {0: 10, 1: 2, 3: 0, 4: 20, 5: 1} + location_is_end = False + result = compute_most_common_case_of_deletions(cases, location_is_end) + expected_result = 4 + self.assertEqual(result, expected_result) + + def test_distinct_most_common_case_is_returned_for_location_start(self): + cases = {0: 10, 1: 2, 3: 0, 4: 20, 5: 1} + location_is_end = True + result = compute_most_common_case_of_deletions(cases, location_is_end) + expected_result = -4 + self.assertEqual(result, expected_result) + + def test_if_no_distinct_most_commont_del_exists_return_neg_one(self): + cases = {0: 10, 1: 2, 3: 20, 4: 20, 5: 1} + location_is_end = False + result = compute_most_common_case_of_deletions(cases, location_is_end) + expected_result = -1 + self.assertEqual(result, expected_result) \ No newline at end of file From 3f8aa163b482735492ddbbd1b0f379e81db2d254 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 11:38:10 +0300 Subject: [PATCH 09/44] Expand unittests --- .../test_transcript_splice_site_corrector.py | 156 +++++++++++++++++- 1 file changed, 153 insertions(+), 3 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 071bd309..ef20e0ef 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -1,6 +1,5 @@ from unittest import TestCase -from unittest import main as unittest_main - +from unittest.mock import MagicMock, patch from src.transcript_splice_site_corrector import ( extract_location_from_cigar_string, @@ -312,4 +311,155 @@ def test_if_no_distinct_most_commont_del_exists_return_neg_one(self): location_is_end = False result = compute_most_common_case_of_deletions(cases, location_is_end) expected_result = -1 - self.assertEqual(result, expected_result) \ No newline at end of file + self.assertEqual(result, expected_result) + + +class 
TestCorrectSpliceSiteErrors(TestCase): + + @patch('src.transcript_splice_site_corrector.compute_most_common_case_of_deletions') + def test_errors_are_correctly_returned(self, mock_compute_most_common_case_of_deletions): + splice_site_cases = { + 20: { + "del_location_has_canonical_nucleotides": False, + "deletions": {4: 10}, + "location_is_end": False, + "most_common_del": 4, + }, + 30: { + "del_location_has_canonical_nucleotides": True, + "deletions": {4: 10}, + "location_is_end": False, + "most_common_del": 4, + }, + } + MIN_N_ALIGNED_READS = 5 + ACCEPTED_DEL_CASES = [4] + MORE_CONSERVATIVE_STRATEGY = False + strand = "+" + chr_record = None + result = correct_splice_site_errors( + splice_site_cases, + MIN_N_ALIGNED_READS, + ACCEPTED_DEL_CASES, + MORE_CONSERVATIVE_STRATEGY, + strand, + chr_record) + expected_result = [30] + self.assertEqual(result, expected_result) + +class TestCountDeletionsFromSpliceSiteLocations(TestCase): + def test_count_deletions_from_splice_site_locations_extracts_correct_locations(self): + exons = [(1, 10), (20, 30), (40, 50)] + # 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 + # [M ,M, M, M, M, M, D, D, D, D, M, M, M, M, M, M, M, M, M, M, M] + cigartuples = [(0, 6), (2, 4), (0, 10)] + read_start = 20 + read_end = 40 + splice_site_cases = {} + count_deletions_for_splice_site_locations( + read_start, + read_end, + cigartuples, + exons, + splice_site_cases) + expected_result = { + 20: { + 'location_is_end': False, + 'deletions': {2: 1}, + 'del_pos_distr': [0, 0, 0, 0, 0, 0, 1, 1], + 'most_common_deletion': -1, + 'del_location_has_canonical_nucleotides': False + }, + 30: { + 'location_is_end': True, + 'deletions': {4: 1}, + 'del_pos_distr': [0, 0, 0, 1, 1, 1, 1, 0], + 'most_common_deletion': -1, + 'del_location_has_canonical_nucleotides': False + }, + 40: { + 'location_is_end': False, + 'deletions': {0: 1}, + 'del_pos_distr': [0, 0, 0, 0, 0, 0, 0, 0], + 'most_common_deletion': -1, + 'del_location_has_canonical_nucleotides': False + }, + } + self.assertEqual(splice_site_cases, expected_result) + + +class TestNucleotideExtraction(TestCase): + + def test_canonical_nucleotides_for_loc_start_pos_strand_are_extracted_correctly(self): + location = 10 + splice_site_data = { + "most_common_del": 4, + "location_is_end": False, + "del_location_has_canonical_nucleotides": False, + } + chr_record = "AAAAAAAAAAAAAAG" + + strand = "+" + extract_nucleotides_from_most_common_del_location( + location, + splice_site_data, + chr_record, + strand) + self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + + def test_canonical_nucleotides_for_loc_end_pos_strand_are_extracted_correctly(self): + location = 10 + splice_site_data = { + "most_common_del": -4, + "location_is_end": True, + "del_location_has_canonical_nucleotides": False, + } + + # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + # offset of -4 ^ + # | | + # v start pos + # A A A A A G C A A A A A A A A + chr_record = "AAAAAGCAAAAAAAA" + + strand = "+" + extract_nucleotides_from_most_common_del_location( + location, + splice_site_data, + chr_record, + strand) + self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + + def test_canonical_nucleotides_for_loc_start_neg_strand_are_extracted_correctly(self): + location = 10 + splice_site_data = { + "most_common_del": 4, + "location_is_end": False, + "del_location_has_canonical_nucleotides": False, + } + chr_record = "AAAAAAAAAAAAAAC" + + strand = "-" + extract_nucleotides_from_most_common_del_location( + location, + splice_site_data, + 
chr_record, + strand) + self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + + def test_canonical_nucleotides_for_loc_end_neg_strand_are_extracted_correctly(self): + location = 10 + splice_site_data = { + "most_common_del": -4, + "location_is_end": True, + "del_location_has_canonical_nucleotides": False, + } + chr_record = "AAAAACTAAAAAAAA" + + strand = "-" + extract_nucleotides_from_most_common_del_location( + location, + splice_site_data, + chr_record, + strand) + self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) \ No newline at end of file From 3061f0f808eba0da082e2231306834daee4a8e54 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 12:37:57 +0300 Subject: [PATCH 10/44] expand unittests --- .../test_transcript_splice_site_corrector.py | 97 ++++++++++++++++++- 1 file changed, 93 insertions(+), 4 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index ef20e0ef..13e45ec6 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -367,21 +367,21 @@ def test_count_deletions_from_splice_site_locations_extracts_correct_locations(s 'location_is_end': False, 'deletions': {2: 1}, 'del_pos_distr': [0, 0, 0, 0, 0, 0, 1, 1], - 'most_common_deletion': -1, + 'most_common_del': -1, 'del_location_has_canonical_nucleotides': False }, 30: { 'location_is_end': True, 'deletions': {4: 1}, 'del_pos_distr': [0, 0, 0, 1, 1, 1, 1, 0], - 'most_common_deletion': -1, + 'most_common_del': -1, 'del_location_has_canonical_nucleotides': False }, 40: { 'location_is_end': False, 'deletions': {0: 1}, 'del_pos_distr': [0, 0, 0, 0, 0, 0, 0, 0], - 'most_common_deletion': -1, + 'most_common_del': -1, 'del_location_has_canonical_nucleotides': False }, } @@ -462,4 +462,93 @@ def test_canonical_nucleotides_for_loc_end_neg_strand_are_extracted_correctly(se splice_site_data, chr_record, strand) - self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) \ No newline at end of file + self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + + +class TestDeletionComputationAndBaseExtraction(TestCase): + + def test_for_accepted_del_case_nucleotides_are_vefiried(self): + splice_site_location = 10 + splice_site_data = { + "most_common_del": -1, + "location_is_end": False, + "del_location_has_canonical_nucleotides": False, + "deletions": {4: 1}, + "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], + } + + chr_record = "AAAAAAAAAAAAAAG" + ACCEPTED_DEL_CASES = [4] + strand = "+" + compute_most_common_del_and_verify_nucleotides( + splice_site_location, + splice_site_data, + chr_record, + ACCEPTED_DEL_CASES, + strand) + expected_result = { + "most_common_del": 4, + "location_is_end": False, + "del_location_has_canonical_nucleotides": True, + "deletions": {4: 1}, + "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], + } + self.assertEqual(splice_site_data, expected_result) + + + def test_for_not_accepted_del_case_nucleotides_are_not_vefiried(self): + splice_site_location = 10 + splice_site_data = { + "most_common_del": -1, + "location_is_end": False, + "del_location_has_canonical_nucleotides": False, + "deletions": {2: 1}, + "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], + } + + chr_record = "AAAAAAAAAAAAAAG" + ACCEPTED_DEL_CASES = [4] + strand = "+" + compute_most_common_del_and_verify_nucleotides( + splice_site_location, + splice_site_data, + chr_record, + ACCEPTED_DEL_CASES, + strand) + expected_result = { + "most_common_del": 
2, + "location_is_end": False, + "del_location_has_canonical_nucleotides": False, + "deletions": {2: 1}, + "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], + } + self.assertEqual(splice_site_data, expected_result) + + def test_for_accepted_del_case_non_canonical_nucleotides_return_false(self): + splice_site_location = 10 + splice_site_data = { + "most_common_del": -1, + "location_is_end": False, + "del_location_has_canonical_nucleotides": False, + "deletions": {4: 1}, + "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], + } + + chr_record = "AAAAAAAAAAAAAXX" + ACCEPTED_DEL_CASES = [4] + strand = "+" + compute_most_common_del_and_verify_nucleotides( + splice_site_location, + splice_site_data, + chr_record, + ACCEPTED_DEL_CASES, + strand) + expected_result = { + "most_common_del": 4, + "location_is_end": False, + "del_location_has_canonical_nucleotides": False, + "deletions": {4: 1}, + "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], + } + self.assertEqual(splice_site_data, expected_result) + \ No newline at end of file From 8eefa311bbf70e07c3be2e0fd3a640f9486e4a14 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 12:39:08 +0300 Subject: [PATCH 11/44] Fix key-issue in splice_site_dict --- src/transcript_splice_site_corrector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index be17594e..d2c51d75 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -122,7 +122,7 @@ def count_deletions_for_splice_site_locations( 'location_is_end': location_type, 'deletions': {}, 'del_pos_distr': [0 for _ in range(WINDOW_SIZE)], - 'most_common_deletion': -1, + 'most_common_del': -1, 'del_location_has_canonical_nucleotides': False } @@ -191,12 +191,12 @@ def compute_most_common_del_and_verify_nucleotides( # Compute most common case of deletions - splice_site_data["most_common_deletion"] = compute_most_common_case_of_deletions( + splice_site_data["most_common_del"] = compute_most_common_case_of_deletions( splice_site_data["deletions"], splice_site_data["location_is_end"]) # Extract nucleotides from most common deletion location if it is an accepted case - if splice_site_data["most_common_deletion"] in ACCEPTED_DEL_CASES: + if splice_site_data["most_common_del"] in ACCEPTED_DEL_CASES: extract_nucleotides_from_most_common_del_location( splice_site_location, splice_site_data, From cc53125e8ecf222bc9a404ad8d2e8b4475cc3f58 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 12:52:34 +0300 Subject: [PATCH 12/44] Add threshold verification to conservative strategy --- src/transcript_splice_site_corrector.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index d2c51d75..a876e7fc 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -205,7 +205,7 @@ def compute_most_common_del_and_verify_nucleotides( -def threshold_exceeded( +def threshold_for_del_cases_exceeded( del_pos_distr: list, deletions: dict, most_common_del: int, @@ -246,6 +246,7 @@ def correct_splice_site_errors( splice_site_cases: dict, MIN_N_OF_ALIGNED_READS: int, ACCEPTED_DEL_CASES: list, + THRESHOLD_CASES_AT_LOCATION: float, MORE_CONSERVATIVE_STRATEGY: bool, strand: str, chr_record): @@ -281,7 +282,12 @@ def correct_splice_site_errors( splice_site_data["del_pos_distr"], abs(splice_site_data["most_common_del"])): continue - pass + 
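            # Under the conservative strategy the location is kept only if at least
            # abs(most_common_del) window positions carry a deletion in more than
            # THRESHOLD_CASES_AT_LOCATION (0.7 in graph_based_model_construction.py)
            # of the aligned reads; otherwise it is skipped below.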
if not threshold_for_del_cases_exceeded( + splice_site_data["del_pos_distr"], + splice_site_data["deletions"], + splice_site_data["most_common_del"], + THRESHOLD_CASES_AT_LOCATION): + continue if splice_site_data["del_location_has_canonical_nucleotides"]: locations_with_errors.append(splice_site_location) From af816240f18eda2d35f2be08a24e32799e387a14 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 12:52:51 +0300 Subject: [PATCH 13/44] Add constant for threshold to args --- src/graph_based_model_construction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 99d5cee5..be91655e 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -251,6 +251,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): splice_site_cases, MIN_N_OF_ALIGNED_READS, ACCEPTED_DEL_CASES, + THRESHOLD_CASES_AT_LOCATION, MORE_CONSERVATIVE_STRATEGY, strand, self.chr_record From 2740474c996bf0327f62e4333046efa8b5e0a9ff Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 22 Aug 2023 12:53:02 +0300 Subject: [PATCH 14/44] Update function name --- tests/test_transcript_splice_site_corrector.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 13e45ec6..9b0d32aa 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -1,5 +1,5 @@ from unittest import TestCase -from unittest.mock import MagicMock, patch +from unittest.mock import patch from src.transcript_splice_site_corrector import ( extract_location_from_cigar_string, @@ -9,12 +9,18 @@ compute_most_common_case_of_deletions, extract_nucleotides_from_most_common_del_location, compute_most_common_del_and_verify_nucleotides, - threshold_exceeded, + threshold_for_del_cases_exceeded, sublist_largest_values_exists, correct_splice_site_errors, generate_updated_exon_list, ) +####################################################################### +## ## +## Run tests with: ## +## python -m unittest tests/test_transcript_splice_site_corrector.py ## +## ## +####################################################################### class TestMoreConservativeStrategyConditions(TestCase): def test_threshold_exceeds_returns_true(self): @@ -22,7 +28,7 @@ def test_threshold_exceeds_returns_true(self): del_pos_distr = [0, 0, 10, 10, 10, 10, 0, 0] deletions = {4: 10} most_common_del = 4 - result = threshold_exceeded( + result = threshold_for_del_cases_exceeded( del_pos_distr, deletions, most_common_del, @@ -34,7 +40,7 @@ def test_threshold_not_exceeded_returns_false(self): del_pos_distr = [0, 0, 10, 10, 10, 6, 0, 0] deletions = {4: 6, 3: 4} most_common_del = 4 - result = threshold_exceeded( + result = threshold_for_del_cases_exceeded( del_pos_distr, deletions, most_common_del, @@ -334,6 +340,7 @@ def test_errors_are_correctly_returned(self, mock_compute_most_common_case_of_de } MIN_N_ALIGNED_READS = 5 ACCEPTED_DEL_CASES = [4] + THRESHOLD_CASES_AT_LOCATION = 0.7 MORE_CONSERVATIVE_STRATEGY = False strand = "+" chr_record = None @@ -341,6 +348,7 @@ def test_errors_are_correctly_returned(self, mock_compute_most_common_case_of_de splice_site_cases, MIN_N_ALIGNED_READS, ACCEPTED_DEL_CASES, + THRESHOLD_CASES_AT_LOCATION, MORE_CONSERVATIVE_STRATEGY, strand, chr_record) @@ -350,6 +358,7 @@ def test_errors_are_correctly_returned(self, 
mock_compute_most_common_case_of_de class TestCountDeletionsFromSpliceSiteLocations(TestCase): def test_count_deletions_from_splice_site_locations_extracts_correct_locations(self): exons = [(1, 10), (20, 30), (40, 50)] + # Cigar codes for indeces 20-40: # 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 # [M ,M, M, M, M, M, D, D, D, D, M, M, M, M, M, M, M, M, M, M, M] cigartuples = [(0, 6), (2, 4), (0, 10)] @@ -415,6 +424,7 @@ def test_canonical_nucleotides_for_loc_end_pos_strand_are_extracted_correctly(se "del_location_has_canonical_nucleotides": False, } + # Fasta 1-based index extraction location: # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 # offset of -4 ^ # | | From e2e3ed5f3f0b0fa74882d6153d5712efcd0f07f2 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 22 Aug 2023 18:50:58 +0300 Subject: [PATCH 15/44] fix cigartuples, can be None sometimes --- src/graph_based_model_construction.py | 2 ++ src/isoform_assignment.py | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index be91655e..3e5eb63e 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -239,6 +239,8 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): read_start = read_assignment.corrected_exons[0][0] read_end = read_assignment.corrected_exons[-1][1] cigartuples = read_assignment.cigartuples + if not cigartuples: + continue count_deletions_for_splice_site_locations( read_start, read_end, diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index ffd57645..6e90a18c 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -509,6 +509,8 @@ def deserialize(cls, infile, gene_info): read_assignment.read_id = read_string(infile) read_assignment.exons = read_list_of_pairs(infile, read_int) read_assignment.cigartuples = read_list_of_pairs(infile, read_int) + if not read_assignment.cigartuples: + read_assignment.cigartuples = None read_assignment.corrected_exons = read_list_of_pairs(infile, read_int) read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons) read_assignment.gene_info = gene_info @@ -534,7 +536,10 @@ def serialize(self, outfile): write_int(self.assignment_id, outfile) write_string(self.read_id, outfile) write_list_of_pairs(self.exons, outfile, write_int) - write_list_of_pairs(self.cigartuples, outfile, write_int) + if self.cigartuples is None: + write_list_of_pairs([], outfile, write_int) + else: + write_list_of_pairs(self.cigartuples, outfile, write_int) write_list_of_pairs(self.corrected_exons, outfile, write_int) write_bool_array([self.multimapper, self.polyA_found, self.cage_found], outfile) write_int_neg(self.polya_info.external_polya_pos, outfile) From 07ab569411d3e2e18778036478983ee91dfe1167 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 23 Aug 2023 10:24:43 +0300 Subject: [PATCH 16/44] Add logger.debug to see corrected exons --- src/graph_based_model_construction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index be91655e..b90474fc 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -265,6 +265,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): corrected_exons, exons ) + logger.debug("Corrected exons: ", updated_exons) return updated_exons From 7953ff37885280c9b7c7549398f97e06b96ef419 Mon Sep 17 
00:00:00 2001 From: Heidi Holappa Date: Wed, 23 Aug 2023 10:44:14 +0300 Subject: [PATCH 17/44] Add logger.debug to see corrected exons --- src/graph_based_model_construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index cc5757c3..5bdac4e3 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -219,7 +219,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): # returns: a list of corrected exons if correction takes place, None - otherwise # TODO Heidi: insert your code here - + logger.debug("Correcting splice sites. n of exons: ", len(exons), " n of assigned reads: ", len(assigned_reads)) # Constants ACCEPTED_DEL_CASES = [3, 4, 5, 6] SUPPORTED_STRANDS = ['+', '-'] From bec61f878e44355d277b90ae4641f5e31f35d08b Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 23 Aug 2023 11:24:41 +0300 Subject: [PATCH 18/44] Add logger.debug to see corrected exons --- src/graph_based_model_construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 5bdac4e3..f8ac8808 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -219,7 +219,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): # returns: a list of corrected exons if correction takes place, None - otherwise # TODO Heidi: insert your code here - logger.debug("Correcting splice sites. n of exons: ", len(exons), " n of assigned reads: ", len(assigned_reads)) + logger.debug(f"Correcting splice sites. n of exons: {len(exons)}, n of assigned reads: {len(assigned_reads)}") # Constants ACCEPTED_DEL_CASES = [3, 4, 5, 6] SUPPORTED_STRANDS = ['+', '-'] From 8e46fc165c4310d7df1af9a58472cce559f32dbe Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 23 Aug 2023 11:28:50 +0300 Subject: [PATCH 19/44] Add logger.debug to see corrected exons --- src/graph_based_model_construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index f8ac8808..533ac757 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -267,7 +267,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): corrected_exons, exons ) - logger.debug("Corrected exons: ", updated_exons) + logger.debug(f"Corrected exons: {len(updated_exons)}, {updated_exons}") return updated_exons From 03591464876a897a1fee556b5929c93241f59f19 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 23 Aug 2023 12:34:48 +0300 Subject: [PATCH 20/44] Add logger.debug to see indel calc --- src/graph_based_model_construction.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 533ac757..aca26fd4 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -219,7 +219,6 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): # returns: a list of corrected exons if correction takes place, None - otherwise # TODO Heidi: insert your code here - logger.debug(f"Correcting splice sites. 
n of exons: {len(exons)}, n of assigned reads: {len(assigned_reads)}") # Constants ACCEPTED_DEL_CASES = [3, 4, 5, 6] SUPPORTED_STRANDS = ['+', '-'] @@ -230,6 +229,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): strand = assigned_reads[0].strand + logger.debug(f"Heidi: Correcting splice sites. n of exons: {len(exons)}, n of assigned reads: {len(assigned_reads)}, strand: {strand}") if strand not in SUPPORTED_STRANDS: return None @@ -248,7 +248,8 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): exons, splice_site_cases) - + logger.debug(f"Heidi: Splice site cases: {splice_site_cases}") + corrected_exons = correct_splice_site_errors( splice_site_cases, MIN_N_OF_ALIGNED_READS, @@ -267,7 +268,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): corrected_exons, exons ) - logger.debug(f"Corrected exons: {len(updated_exons)}, {updated_exons}") + logger.debug(f"Heidi: Corrected exons: {len(updated_exons)}, {updated_exons}") return updated_exons From bc91708fc78bb89df650f053a152a6114a9cc1be Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 23 Aug 2023 12:57:24 +0300 Subject: [PATCH 21/44] Add debugging to see matching cases list --- src/transcript_splice_site_corrector.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index a876e7fc..6710d8a9 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -1,3 +1,6 @@ +import logging +logger = logging.getLogger('IsoQuant') + def extract_location_from_cigar_string(cigartuples: list, read_start: int, read_end: int, @@ -114,7 +117,7 @@ def count_deletions_for_splice_site_locations( # Extract splice site locations within aligned read matching_locations = extract_splice_site_locations_within_aligned_read(read_start, read_end, exons) - + logger.debug(f"Matching locations: {matching_locations}") # Count deletions for each splice site location for splice_site_location, location_type in matching_locations: if splice_site_location not in splice_site_cases: From 054aa1c96cc4f7140c27ab858ac203bab08da8d8 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 23 Aug 2023 14:07:25 +0300 Subject: [PATCH 22/44] Add debugger to cases with no cigartuples --- src/graph_based_model_construction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index aca26fd4..b286c2d0 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -240,6 +240,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): read_end = read_assignment.corrected_exons[-1][1] cigartuples = read_assignment.cigartuples if not cigartuples: + logger.debug(f"Heidi: No cigar tuples for read {read_assignment.read_id}") continue count_deletions_for_splice_site_locations( read_start, From 5ccbc819824376664f562d60b77035db9f1fd052 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 24 Aug 2023 08:38:01 +0300 Subject: [PATCH 23/44] add debug line to verify if cigartuples are found on some reads --- src/graph_based_model_construction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index b286c2d0..0fec2044 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -242,13 +242,14 @@ def 
correct_transcript_splice_sites(self, exons: list, assigned_reads: list): if not cigartuples: logger.debug(f"Heidi: No cigar tuples for read {read_assignment.read_id}") continue + logger.debug(f"Heidi: Cigar tuples for read {read_assignment.read_id}: {cigartuples}") count_deletions_for_splice_site_locations( read_start, read_end, cigartuples, exons, splice_site_cases) - + logger.debug(f"Heidi: Splice site cases: {splice_site_cases}") corrected_exons = correct_splice_site_errors( From 6e217ec0e7c8e30e03eac6544bb37fc1355226b2 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 07:34:36 +0300 Subject: [PATCH 24/44] Move debug to correct transcripts --- src/graph_based_model_construction.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 0fec2044..7741ebc8 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -208,6 +208,15 @@ def correct_transcripts(self): for model in self.transcript_model_storage: exons = model.exon_blocks assigned_reads = self.transcript_read_ids[model.transcript_id] + cigartuples = False + for read in assigned_reads: + if read.cigartuples: + cigartuples = True + break + if not cigartuples: + logger.debug(f"Heidi: Method correct_transcripts. No cigar tuples for transcript {model.transcript_id}") + else: + logger.debug(f"Heidi: Method correct_transcripts. Yes cigar tuples for transcript {model.transcript_id}") corrected_exons = self.correct_transcript_splice_sites(exons, assigned_reads) if corrected_exons: model.exon_blocks = corrected_exons @@ -240,9 +249,9 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): read_end = read_assignment.corrected_exons[-1][1] cigartuples = read_assignment.cigartuples if not cigartuples: - logger.debug(f"Heidi: No cigar tuples for read {read_assignment.read_id}") + # logger.debug(f"Heidi: No cigar tuples for read {read_assignment.read_id}") continue - logger.debug(f"Heidi: Cigar tuples for read {read_assignment.read_id}: {cigartuples}") + # logger.debug(f"Heidi: Cigar tuples for read {read_assignment.read_id}: {cigartuples}") count_deletions_for_splice_site_locations( read_start, read_end, From 82fdc3ad47560f634e2de52684717dafec5e90c2 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 10:09:02 +0300 Subject: [PATCH 25/44] Move debug to correct transcripts --- src/graph_based_model_construction.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 7741ebc8..d18b5682 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -213,10 +213,7 @@ def correct_transcripts(self): if read.cigartuples: cigartuples = True break - if not cigartuples: - logger.debug(f"Heidi: Method correct_transcripts. No cigar tuples for transcript {model.transcript_id}") - else: - logger.debug(f"Heidi: Method correct_transcripts. Yes cigar tuples for transcript {model.transcript_id}") + logger.debug(f"Heidi: Method correct_transcripts. 
Transcript: {model.transcript_id}, four one or more cigartuples: {cigartuples}") corrected_exons = self.correct_transcript_splice_sites(exons, assigned_reads) if corrected_exons: model.exon_blocks = corrected_exons From 20440ae27b849caa9f374665ffeac9795a4bd6af Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 10:12:50 +0300 Subject: [PATCH 26/44] Move debug to correct transcripts --- src/graph_based_model_construction.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index d18b5682..1ed4243e 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -208,12 +208,12 @@ def correct_transcripts(self): for model in self.transcript_model_storage: exons = model.exon_blocks assigned_reads = self.transcript_read_ids[model.transcript_id] - cigartuples = False + found_cigartuples = False for read in assigned_reads: if read.cigartuples: - cigartuples = True + found_cigartuples = True break - logger.debug(f"Heidi: Method correct_transcripts. Transcript: {model.transcript_id}, four one or more cigartuples: {cigartuples}") + logger.debug(f"Heidi: Method correct_transcripts. Transcript: {model.transcript_id}, four one or more cigartuples: {found_cigartuples}") corrected_exons = self.correct_transcript_splice_sites(exons, assigned_reads) if corrected_exons: model.exon_blocks = corrected_exons From 83f5ae63fe94801f676abf75c1a5ea0a7a18c527 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 10:23:03 +0300 Subject: [PATCH 27/44] Move debug to correct transcripts --- src/graph_based_model_construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 1ed4243e..451f538a 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -213,7 +213,7 @@ def correct_transcripts(self): if read.cigartuples: found_cigartuples = True break - logger.debug(f"Heidi: Method correct_transcripts. Transcript: {model.transcript_id}, four one or more cigartuples: {found_cigartuples}") + logger.debug(f"Heidi: Method correct_transcripts. 
Transcript: {model.transcript_id}, found one or more cigartuples: {found_cigartuples}") corrected_exons = self.correct_transcript_splice_sites(exons, assigned_reads) if corrected_exons: model.exon_blocks = corrected_exons From f8f363fcaaced755d4d9210ed8ea87a2e11dcb86 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 13:08:08 +0300 Subject: [PATCH 28/44] Move debug to correct transcripts --- src/graph_based_model_construction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 451f538a..732171d8 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -209,6 +209,7 @@ def correct_transcripts(self): exons = model.exon_blocks assigned_reads = self.transcript_read_ids[model.transcript_id] found_cigartuples = False + # TODO: REMOVE NEXT FIVE LINES AFTER CIAGRTUPLES ARE FIXED for read in assigned_reads: if read.cigartuples: found_cigartuples = True From b37f59b8e9afc7f17197d8413c80b9568d1f3f68 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 14:10:49 +0300 Subject: [PATCH 29/44] Fix bug with dict key ref --- src/transcript_splice_site_corrector.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index 6710d8a9..0e0f820e 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -50,7 +50,7 @@ def count_deletions_from_cigar_codes_in_given_window(cigartuples: list, loc_type (str): type of location (start or end) """ - deletions = 0 + count_of_deletions = 0 cigar_code_list = [] @@ -74,13 +74,13 @@ def count_deletions_from_cigar_codes_in_given_window(cigartuples: list, if i >= len(cigar_code_list): break if cigar_code_list[i] == 2: - deletions += 1 + count_of_deletions += 1 splice_site_data["del_pos_distr"][i] += 1 - if deletions not in splice_site_data: - splice_site_data["deletions"][deletions] = 0 + if count_of_deletions not in splice_site_data["deletions"]: + splice_site_data["deletions"][count_of_deletions] = 0 - splice_site_data["deletions"][deletions] += 1 + splice_site_data["deletions"][count_of_deletions] += 1 def extract_splice_site_locations_within_aligned_read(read_start: int, read_end: int, exons:list): @@ -117,6 +117,7 @@ def count_deletions_for_splice_site_locations( # Extract splice site locations within aligned read matching_locations = extract_splice_site_locations_within_aligned_read(read_start, read_end, exons) + logger.debug(f"Matching locations: {matching_locations}") # Count deletions for each splice site location for splice_site_location, location_type in matching_locations: @@ -265,6 +266,8 @@ def correct_splice_site_errors( strand (str): transcript strand (extracted from first ReadAssignment-object in read_assignments list) chr_record (Fasta): FASTA recored, i.e. 
a single chromosome from a reference """ + + locations_with_errors = [] for splice_site_location, splice_site_data in splice_site_cases.items(): From 8c240f48e3dd183d5f9d524061bdeba685b1f8e3 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 14:11:26 +0300 Subject: [PATCH 30/44] Add test for GraphBasedModelConstructor --- .../test_transcript_splice_site_corrector.py | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 9b0d32aa..37c70772 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -1,6 +1,9 @@ from unittest import TestCase -from unittest.mock import patch +from unittest.mock import patch, MagicMock +from src.isoform_assignment import ReadAssignment + +from src.graph_based_model_construction import GraphBasedModelConstructor from src.transcript_splice_site_corrector import ( extract_location_from_cigar_string, count_deletions_from_cigar_codes_in_given_window, @@ -561,4 +564,29 @@ def test_for_accepted_del_case_non_canonical_nucleotides_return_false(self): "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } self.assertEqual(splice_site_data, expected_result) - \ No newline at end of file + +class TestSpliceSiteCorrector(TestCase): + + + def test_error_is_corrected(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 10), (2, 4), (0, 6)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "+" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = [(0, 5), (10, 20)] + + constructor = GraphBasedModelConstructor( + gene_info=MagicMock(), + chr_record= "ABCDEFGHIJKLMAGPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + expected_result = [(0, 5), (14, 20)] + self.assertTrue(result == expected_result) + + + + \ No newline at end of file From 2360b6215c2ee1c1e62498660ca7765d1d914f19 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 18:27:40 +0300 Subject: [PATCH 31/44] Check for abs value --- src/transcript_splice_site_corrector.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index 0e0f820e..fd3aee23 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -182,7 +182,6 @@ def extract_nucleotides_from_most_common_del_location( possible_canonicals = canonical_pairs[strand]['end'] else: possible_canonicals = canonical_pairs[strand]['start'] - if extracted_canonicals in possible_canonicals: splice_site_data["del_location_has_canonical_nucleotides"] = True @@ -200,7 +199,7 @@ def compute_most_common_del_and_verify_nucleotides( splice_site_data["location_is_end"]) # Extract nucleotides from most common deletion location if it is an accepted case - if splice_site_data["most_common_del"] in ACCEPTED_DEL_CASES: + if abs(splice_site_data["most_common_del"]) in ACCEPTED_DEL_CASES: extract_nucleotides_from_most_common_del_location( splice_site_location, splice_site_data, From 898f75ce354eba3be3c1de7dbd62522146d46cf3 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 18:28:00 +0300 Subject: [PATCH 32/44] Expand unittests for GraphBaseModelConstructor method --- 
.../test_transcript_splice_site_corrector.py | 82 ++++++++++++++++++- 1 file changed, 80 insertions(+), 2 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 37c70772..42bb029f 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -568,7 +568,7 @@ def test_for_accepted_del_case_non_canonical_nucleotides_return_false(self): class TestSpliceSiteCorrector(TestCase): - def test_error_is_corrected(self): + def test_error_in_start_on_pos_strand_is_corrected(self): assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") assigned_read_1.cigartuples = [(0, 10), (2, 4), (0, 6)] assigned_read_1.corrected_exons = [(0, 20)] @@ -588,5 +588,83 @@ def test_error_is_corrected(self): self.assertTrue(result == expected_result) - + def test_error_in_end_on_pos_strand_is_corrected(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 10), (2, 4), (0, 16)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "+" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = [(0, 14), (20, 30)] + + constructor = GraphBasedModelConstructor( + gene_info=MagicMock(), + chr_record= "ABCDEFGIJGCMNOPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + expected_result = [(0, 10), (20, 30)] + self.assertTrue(result == expected_result) + + + def test_error_in_start_on_neg_strand_is_corrected(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 10), (2, 4), (0, 6)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "-" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = [(0, 5), (10, 20)] + + constructor = GraphBasedModelConstructor( + gene_info=MagicMock(), + chr_record= "ABCDEFGHIJKLMGCPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + expected_result = [(0, 5), (14, 20)] + self.assertTrue(result == expected_result) + + + def test_error_in_end_on_neg_strand_is_corrected(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 10), (2, 4), (0, 16)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "-" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = [(0, 14), (20, 30)] + + constructor = GraphBasedModelConstructor( + gene_info=MagicMock(), + chr_record= "ABCDEFGIJCTMNOPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + expected_result = [(0, 10), (20, 30)] + self.assertTrue(result == expected_result) + + + def test_case_with_dels_but_no_canonicals_in_end_on_neg_strand_returns_none(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 10), (2, 4), (0, 16)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "-" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = [(0, 14), (20, 30)] + + constructor = GraphBasedModelConstructor( + 
gene_info=MagicMock(), + chr_record= "ABCDEFGIJKLMNOPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + expected_result = None + self.assertTrue(result == expected_result) + \ No newline at end of file From 05de4f301e7a25229db1d844fee064582adddc39 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Fri, 25 Aug 2023 18:37:33 +0300 Subject: [PATCH 33/44] Improve debugger stdouts --- src/graph_based_model_construction.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 732171d8..2887880c 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -236,7 +236,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): strand = assigned_reads[0].strand - logger.debug(f"Heidi: Correcting splice sites. n of exons: {len(exons)}, n of assigned reads: {len(assigned_reads)}, strand: {strand}") + logger.debug(f"correct_transcript_splice_sites. Correcting splice sites. n of exons: {len(exons)}, n of assigned reads: {len(assigned_reads)}, strand: {strand}") if strand not in SUPPORTED_STRANDS: return None @@ -257,7 +257,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): exons, splice_site_cases) - logger.debug(f"Heidi: Splice site cases: {splice_site_cases}") + logger.debug(f"correct_transcript_splice_sites. Splice site cases: {splice_site_cases}") corrected_exons = correct_splice_site_errors( splice_site_cases, @@ -272,12 +272,15 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): if not corrected_exons: return None + cases = [str(exon) + ": " + str(splice_site_cases[exon]) for exon in corrected_exons] + logger.debug(f"correct_transcript_splice_sites. 
Corrected exons: {len(corrected_exons)}, {corrected_exons} {cases}") + + updated_exons = generate_updated_exon_list( splice_site_cases, corrected_exons, exons ) - logger.debug(f"Heidi: Corrected exons: {len(updated_exons)}, {updated_exons}") return updated_exons From c9d09b69acbf4af06e269f60dacbc3a8a0a944aa Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Mon, 28 Aug 2023 09:40:19 +0300 Subject: [PATCH 34/44] Update unittest after changing constant positioning --- tests/test_transcript_splice_site_corrector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 42bb029f..32bc5b25 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -368,12 +368,14 @@ def test_count_deletions_from_splice_site_locations_extracts_correct_locations(s read_start = 20 read_end = 40 splice_site_cases = {} + WINDOW_SIZE = 8 count_deletions_for_splice_site_locations( read_start, read_end, cigartuples, exons, - splice_site_cases) + splice_site_cases, + WINDOW_SIZE) expected_result = { 20: { 'location_is_end': False, From 35a0942b3a4a294ae0096a05c79e73d3c8209818 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Mon, 28 Aug 2023 09:40:32 +0300 Subject: [PATCH 35/44] Move WINDOW_SIZE to main func --- src/graph_based_model_construction.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 2887880c..9e55793b 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -231,6 +231,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): SUPPORTED_STRANDS = ['+', '-'] THRESHOLD_CASES_AT_LOCATION = 0.7 MIN_N_OF_ALIGNED_READS = 5 + WINDOW_SIZE = 8 MORE_CONSERVATIVE_STRATEGY = False @@ -255,7 +256,8 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): read_end, cigartuples, exons, - splice_site_cases) + splice_site_cases, + WINDOW_SIZE) logger.debug(f"correct_transcript_splice_sites. 
Splice site cases: {splice_site_cases}") From 9e2c29d7182b5512058708b3f2d38727df821b4b Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Mon, 28 Aug 2023 09:40:44 +0300 Subject: [PATCH 36/44] Move const WINDOW_SIZE upper --- src/transcript_splice_site_corrector.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index fd3aee23..dc9dd750 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -102,7 +102,8 @@ def count_deletions_for_splice_site_locations( read_end: int, cigartuples: list, exons: list, - splice_site_cases: dict): + splice_site_cases: dict, + WINDOW_SIZE: int): """ Args: @@ -112,9 +113,6 @@ def count_deletions_for_splice_site_locations( """ - # Constant window size for counting deletions - WINDOW_SIZE = 8 - # Extract splice site locations within aligned read matching_locations = extract_splice_site_locations_within_aligned_read(read_start, read_end, exons) From 5c22578999504185e3d091da61451ee0d0436fa5 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Mon, 28 Aug 2023 15:48:37 +0300 Subject: [PATCH 37/44] Change division to multiplication --- src/transcript_splice_site_corrector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index dc9dd750..a46ff330 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -214,7 +214,7 @@ def threshold_for_del_cases_exceeded( total_cases = sum(deletions.values()) nucleotides_exceeding_treshold = 0 for value in del_pos_distr: - if value / total_cases > THRESHOLD_CASES_AT_LOCATION: + if value > total_cases * THRESHOLD_CASES_AT_LOCATION: nucleotides_exceeding_treshold += 1 return bool(nucleotides_exceeding_treshold >= abs(most_common_del)) From 46d01bbef3e4f9c40e69a202e8430d02e7f316e0 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Mon, 28 Aug 2023 15:48:52 +0300 Subject: [PATCH 38/44] Expand tests --- .../test_transcript_splice_site_corrector.py | 64 +++++++++++++++++-- 1 file changed, 60 insertions(+), 4 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 32bc5b25..38bd103a 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -600,7 +600,7 @@ def test_error_in_end_on_pos_strand_is_corrected(self): constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGIJGCMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHIGCLMNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -640,7 +640,7 @@ def test_error_in_end_on_neg_strand_is_corrected(self): constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGIJCTMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHICTLMNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -649,6 +649,45 @@ def test_error_in_end_on_neg_strand_is_corrected(self): expected_result = [(0, 10), (20, 30)] self.assertTrue(result == expected_result) + def test_error_in_end_on_neg_strand_and_min_accepted_del_cases_is_corrected(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 10), (2, 3), (0, 17)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "-" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = 
[(0, 14), (20, 30)] + + constructor = GraphBasedModelConstructor( + gene_info=MagicMock(), + chr_record= "ABCDEFGHIJCTMNOPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + expected_result = [(0, 11), (20, 30)] + self.assertTrue(result == expected_result) + + def test_error_in_end_on_neg_strand_and_max_accepted_del_cases_is_corrected(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 8), (2, 6), (0, 16)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "-" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = [(0, 14), (20, 30)] + + constructor = GraphBasedModelConstructor( + gene_info=MagicMock(), + chr_record= "ABCDEFGCTJKLMNOPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + + expected_result = [(0, 8), (20, 30)] + self.assertTrue(result == expected_result) + def test_case_with_dels_but_no_canonicals_in_end_on_neg_strand_returns_none(self): assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") @@ -660,7 +699,7 @@ def test_case_with_dels_but_no_canonicals_in_end_on_neg_strand_returns_none(self constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGIJKLMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHIJKLMNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -669,4 +708,21 @@ def test_case_with_dels_but_no_canonicals_in_end_on_neg_strand_returns_none(self expected_result = None self.assertTrue(result == expected_result) - \ No newline at end of file + def test_case_with_not_enough_dels_but_canonicals_in_end_on_pos_strand_returns_none(self): + assigned_read_1 = ReadAssignment(read_id="1", assignment_type="test") + assigned_read_1.cigartuples = [(0, 10), (2, 2), (0, 18)] + assigned_read_1.corrected_exons = [(0, 20)] + assigned_read_1.strand = "-" + assigned_reads = [assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1, assigned_read_1] + exons = [(0, 14), (20, 30)] + + constructor = GraphBasedModelConstructor( + gene_info=MagicMock(), + chr_record= "ABCDEFGHIGCLMNOPQRSTUVWXYZ", + params=MagicMock(), + transcript_counter=0 + ) + result = constructor.correct_transcript_splice_sites(exons, assigned_reads) + + expected_result = None + self.assertTrue(result == expected_result) From 5a908646ff57c6f19a1631ffe8c09a556f3b4270 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 29 Aug 2023 10:40:28 +0300 Subject: [PATCH 39/44] Shorten key name --- src/transcript_splice_site_corrector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index a46ff330..292f4a26 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -125,7 +125,7 @@ def count_deletions_for_splice_site_locations( 'deletions': {}, 'del_pos_distr': [0 for _ in range(WINDOW_SIZE)], 'most_common_del': -1, - 'del_location_has_canonical_nucleotides': False + 'canonical_bases_found': False } # Processing cigartuples @@ -181,7 +181,7 @@ def extract_nucleotides_from_most_common_del_location( else: possible_canonicals = canonical_pairs[strand]['start'] if extracted_canonicals in possible_canonicals: - splice_site_data["del_location_has_canonical_nucleotides"] = True + 
splice_site_data["canonical_bases_found"] = True def compute_most_common_del_and_verify_nucleotides( splice_site_location: int, @@ -292,7 +292,7 @@ def correct_splice_site_errors( THRESHOLD_CASES_AT_LOCATION): continue - if splice_site_data["del_location_has_canonical_nucleotides"]: + if splice_site_data["canonical_bases_found"]: locations_with_errors.append(splice_site_location) return locations_with_errors From 47ca8b33f7bf1769c5faded8e3eff7806873bee9 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Tue, 29 Aug 2023 10:40:39 +0300 Subject: [PATCH 40/44] Update tests after key name change --- .../test_transcript_splice_site_corrector.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index 38bd103a..d926570a 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -329,13 +329,13 @@ class TestCorrectSpliceSiteErrors(TestCase): def test_errors_are_correctly_returned(self, mock_compute_most_common_case_of_deletions): splice_site_cases = { 20: { - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, "deletions": {4: 10}, "location_is_end": False, "most_common_del": 4, }, 30: { - "del_location_has_canonical_nucleotides": True, + "canonical_bases_found": True, "deletions": {4: 10}, "location_is_end": False, "most_common_del": 4, @@ -382,21 +382,21 @@ def test_count_deletions_from_splice_site_locations_extracts_correct_locations(s 'deletions': {2: 1}, 'del_pos_distr': [0, 0, 0, 0, 0, 0, 1, 1], 'most_common_del': -1, - 'del_location_has_canonical_nucleotides': False + 'canonical_bases_found': False }, 30: { 'location_is_end': True, 'deletions': {4: 1}, 'del_pos_distr': [0, 0, 0, 1, 1, 1, 1, 0], 'most_common_del': -1, - 'del_location_has_canonical_nucleotides': False + 'canonical_bases_found': False }, 40: { 'location_is_end': False, 'deletions': {0: 1}, 'del_pos_distr': [0, 0, 0, 0, 0, 0, 0, 0], 'most_common_del': -1, - 'del_location_has_canonical_nucleotides': False + 'canonical_bases_found': False }, } self.assertEqual(splice_site_cases, expected_result) @@ -409,7 +409,7 @@ def test_canonical_nucleotides_for_loc_start_pos_strand_are_extracted_correctly( splice_site_data = { "most_common_del": 4, "location_is_end": False, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, } chr_record = "AAAAAAAAAAAAAAG" @@ -419,14 +419,14 @@ def test_canonical_nucleotides_for_loc_start_pos_strand_are_extracted_correctly( splice_site_data, chr_record, strand) - self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + self.assertTrue(splice_site_data["canonical_bases_found"]) def test_canonical_nucleotides_for_loc_end_pos_strand_are_extracted_correctly(self): location = 10 splice_site_data = { "most_common_del": -4, "location_is_end": True, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, } # Fasta 1-based index extraction location: @@ -443,14 +443,14 @@ def test_canonical_nucleotides_for_loc_end_pos_strand_are_extracted_correctly(se splice_site_data, chr_record, strand) - self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + self.assertTrue(splice_site_data["canonical_bases_found"]) def test_canonical_nucleotides_for_loc_start_neg_strand_are_extracted_correctly(self): location = 10 splice_site_data = { "most_common_del": 4, "location_is_end": False, - 
"del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, } chr_record = "AAAAAAAAAAAAAAC" @@ -460,14 +460,14 @@ def test_canonical_nucleotides_for_loc_start_neg_strand_are_extracted_correctly( splice_site_data, chr_record, strand) - self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + self.assertTrue(splice_site_data["canonical_bases_found"]) def test_canonical_nucleotides_for_loc_end_neg_strand_are_extracted_correctly(self): location = 10 splice_site_data = { "most_common_del": -4, "location_is_end": True, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, } chr_record = "AAAAACTAAAAAAAA" @@ -477,7 +477,7 @@ def test_canonical_nucleotides_for_loc_end_neg_strand_are_extracted_correctly(se splice_site_data, chr_record, strand) - self.assertTrue(splice_site_data["del_location_has_canonical_nucleotides"]) + self.assertTrue(splice_site_data["canonical_bases_found"]) class TestDeletionComputationAndBaseExtraction(TestCase): @@ -487,7 +487,7 @@ def test_for_accepted_del_case_nucleotides_are_vefiried(self): splice_site_data = { "most_common_del": -1, "location_is_end": False, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, "deletions": {4: 1}, "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } @@ -504,7 +504,7 @@ def test_for_accepted_del_case_nucleotides_are_vefiried(self): expected_result = { "most_common_del": 4, "location_is_end": False, - "del_location_has_canonical_nucleotides": True, + "canonical_bases_found": True, "deletions": {4: 1}, "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } @@ -516,7 +516,7 @@ def test_for_not_accepted_del_case_nucleotides_are_not_vefiried(self): splice_site_data = { "most_common_del": -1, "location_is_end": False, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, "deletions": {2: 1}, "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } @@ -533,7 +533,7 @@ def test_for_not_accepted_del_case_nucleotides_are_not_vefiried(self): expected_result = { "most_common_del": 2, "location_is_end": False, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, "deletions": {2: 1}, "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } @@ -544,7 +544,7 @@ def test_for_accepted_del_case_non_canonical_nucleotides_return_false(self): splice_site_data = { "most_common_del": -1, "location_is_end": False, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, "deletions": {4: 1}, "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } @@ -561,7 +561,7 @@ def test_for_accepted_del_case_non_canonical_nucleotides_return_false(self): expected_result = { "most_common_del": 4, "location_is_end": False, - "del_location_has_canonical_nucleotides": False, + "canonical_bases_found": False, "deletions": {4: 1}, "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } From b825a5ce784147238fc30bb18a998921e834b4fe Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 29 Aug 2023 18:52:00 +0300 Subject: [PATCH 41/44] fix cigartuples exactly where they needed to be --- src/alignment_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/alignment_processor.py b/src/alignment_processor.py index b93dd571..e584649a 100644 --- a/src/alignment_processor.py +++ b/src/alignment_processor.py @@ -307,6 +307,7 @@ def process_intergenic(self, alignment_storage): read_assignment.polya_info = alignment_info.polya_info read_assignment.cage_found = len(alignment_info.cage_hits) > 0 read_assignment.exons = alignment_info.read_exons 
+ read_assignment.cigartuples = alignment.cigartuples read_assignment.corrected_exons = alignment_info.read_exons read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons) From 3c07e177a7aaf02860278580b813cc1ecdfd02af Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 30 Aug 2023 06:54:07 +0300 Subject: [PATCH 42/44] Change idx correction for FASTA extract --- src/transcript_splice_site_corrector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transcript_splice_site_corrector.py b/src/transcript_splice_site_corrector.py index 292f4a26..fee5711c 100644 --- a/src/transcript_splice_site_corrector.py +++ b/src/transcript_splice_site_corrector.py @@ -157,7 +157,7 @@ def extract_nucleotides_from_most_common_del_location( chr_record, strand: str): most_common_del = splice_site_data["most_common_del"] - idx_correction = -1 + idx_correction = 0 extraction_start = location + most_common_del + idx_correction extraction_end = location + most_common_del + 2 + idx_correction try: From ce52d631a079c52dc4860de3f661e5548d2272e8 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 30 Aug 2023 07:04:02 +0300 Subject: [PATCH 43/44] Fix unittests after fixing issue with chr_record idx-correction --- .../test_transcript_splice_site_corrector.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test_transcript_splice_site_corrector.py b/tests/test_transcript_splice_site_corrector.py index d926570a..010dc7fb 100644 --- a/tests/test_transcript_splice_site_corrector.py +++ b/tests/test_transcript_splice_site_corrector.py @@ -411,7 +411,7 @@ def test_canonical_nucleotides_for_loc_start_pos_strand_are_extracted_correctly( "location_is_end": False, "canonical_bases_found": False, } - chr_record = "AAAAAAAAAAAAAAG" + chr_record = "AAAAAAAAAAAAAAAG" strand = "+" extract_nucleotides_from_most_common_del_location( @@ -435,7 +435,7 @@ def test_canonical_nucleotides_for_loc_end_pos_strand_are_extracted_correctly(se # | | # v start pos # A A A A A G C A A A A A A A A - chr_record = "AAAAAGCAAAAAAAA" + chr_record = "AAAAAAGCAAAAAAAA" strand = "+" extract_nucleotides_from_most_common_del_location( @@ -452,7 +452,7 @@ def test_canonical_nucleotides_for_loc_start_neg_strand_are_extracted_correctly( "location_is_end": False, "canonical_bases_found": False, } - chr_record = "AAAAAAAAAAAAAAC" + chr_record = "AAAAAAAAAAAAAAAC" strand = "-" extract_nucleotides_from_most_common_del_location( @@ -469,7 +469,7 @@ def test_canonical_nucleotides_for_loc_end_neg_strand_are_extracted_correctly(se "location_is_end": True, "canonical_bases_found": False, } - chr_record = "AAAAACTAAAAAAAA" + chr_record = "AAAAAACTAAAAAAAA" strand = "-" extract_nucleotides_from_most_common_del_location( @@ -492,7 +492,7 @@ def test_for_accepted_del_case_nucleotides_are_vefiried(self): "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } - chr_record = "AAAAAAAAAAAAAAG" + chr_record = "AAAAAAAAAAAAAAAG" ACCEPTED_DEL_CASES = [4] strand = "+" compute_most_common_del_and_verify_nucleotides( @@ -521,7 +521,7 @@ def test_for_not_accepted_del_case_nucleotides_are_not_vefiried(self): "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } - chr_record = "AAAAAAAAAAAAAAG" + chr_record = "AAAAAAAAAAAAAAAG" ACCEPTED_DEL_CASES = [4] strand = "+" compute_most_common_del_and_verify_nucleotides( @@ -549,7 +549,7 @@ def test_for_accepted_del_case_non_canonical_nucleotides_return_false(self): "del_pos_distr": [0, 0, 0, 0, 0, 0, 0, 0], } - chr_record = "AAAAAAAAAAAAAXX" + chr_record = 
"AAAAAAAAAAAAAAXX" ACCEPTED_DEL_CASES = [4] strand = "+" compute_most_common_del_and_verify_nucleotides( @@ -580,7 +580,7 @@ def test_error_in_start_on_pos_strand_is_corrected(self): constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGHIJKLMAGPQRSTUVWXYZ", + chr_record= "ABCDEFGHIJKLMNAGQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -600,7 +600,7 @@ def test_error_in_end_on_pos_strand_is_corrected(self): constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGHIGCLMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHIJGCMNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -620,7 +620,7 @@ def test_error_in_start_on_neg_strand_is_corrected(self): constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGHIJKLMGCPQRSTUVWXYZ", + chr_record= "ABCDEFGHIJKLMNGCQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -640,7 +640,7 @@ def test_error_in_end_on_neg_strand_is_corrected(self): constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGHICTLMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHIJCTMNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -659,7 +659,7 @@ def test_error_in_end_on_neg_strand_and_min_accepted_del_cases_is_corrected(self constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGHIJCTMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHIJKCTNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -678,7 +678,7 @@ def test_error_in_end_on_neg_strand_and_max_accepted_del_cases_is_corrected(self constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGCTJKLMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHCTKLMNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) @@ -718,7 +718,7 @@ def test_case_with_not_enough_dels_but_canonicals_in_end_on_pos_strand_returns_n constructor = GraphBasedModelConstructor( gene_info=MagicMock(), - chr_record= "ABCDEFGHIGCLMNOPQRSTUVWXYZ", + chr_record= "ABCDEFGHIJGCMNOPQRSTUVWXYZ", params=MagicMock(), transcript_counter=0 ) From fb7db12671b63eb8340b3e506da211db9e612fc8 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 30 Aug 2023 09:04:55 +0300 Subject: [PATCH 44/44] Remove unneeded logger.debugs --- src/graph_based_model_construction.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 9e55793b..991fbd17 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -208,15 +208,9 @@ def correct_transcripts(self): for model in self.transcript_model_storage: exons = model.exon_blocks assigned_reads = self.transcript_read_ids[model.transcript_id] - found_cigartuples = False - # TODO: REMOVE NEXT FIVE LINES AFTER CIAGRTUPLES ARE FIXED - for read in assigned_reads: - if read.cigartuples: - found_cigartuples = True - break - logger.debug(f"Heidi: Method correct_transcripts. Transcript: {model.transcript_id}, found one or more cigartuples: {found_cigartuples}") corrected_exons = self.correct_transcript_splice_sites(exons, assigned_reads) if corrected_exons: + logger.debug(f"correct_transcripts. 
Corrected exons: {corrected_exons}, original exons: {exons}") model.exon_blocks = corrected_exons def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): @@ -237,7 +231,6 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): strand = assigned_reads[0].strand - logger.debug(f"correct_transcript_splice_sites. Correcting splice sites. n of exons: {len(exons)}, n of assigned reads: {len(assigned_reads)}, strand: {strand}") if strand not in SUPPORTED_STRANDS: return None @@ -259,7 +252,7 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): splice_site_cases, WINDOW_SIZE) - logger.debug(f"correct_transcript_splice_sites. Splice site cases: {splice_site_cases}") + corrected_exons = correct_splice_site_errors( splice_site_cases, @@ -275,7 +268,6 @@ def correct_transcript_splice_sites(self, exons: list, assigned_reads: list): return None cases = [str(exon) + ": " + str(splice_site_cases[exon]) for exon in corrected_exons] - logger.debug(f"correct_transcript_splice_sites. Corrected exons: {len(corrected_exons)}, {corrected_exons} {cases}") updated_exons = generate_updated_exon_list(