FIX coordinates of orfs with start/stop codon on exon boundary

This addresses Issue #64.
dieterich-lab · Mar 31, 2017 · 63ca81c · 63ca81c
1 parent dfec093
commit 63ca81c
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 7 deletions.
diff --git a/rpbp/__init__.py b/rpbp/__init__.py
@@ -1,2 +1,2 @@
-__version_info__ = ('1', '1', '5')
+__version_info__ = ('1', '1', '6')
 __version__ = '.'.join(__version_info__)
diff --git a/rpbp/reference_preprocessing/extract_orf_coordinates.py b/rpbp/reference_preprocessing/extract_orf_coordinates.py
@@ -160,7 +160,11 @@ def get_orfs(transcript_and_sequence, start_codons_re, stop_codons_re):
     transcript_length = len(transcript_sequence)
 
     # get the ORFs for this entry
-    orf_rel_positions = get_orf_positions(transcript_sequence, start_codons_re, stop_codons_re)
+    orf_rel_positions = get_orf_positions(
+        transcript_sequence,
+        start_codons_re,
+        stop_codons_re
+    )
 
     #if logger.isEnabledFor(logging.DEBUG):
     #   s = ["({},{})".format(o.start, o.end) for o in orf_rel_positions]
@@ -186,15 +190,39 @@ def get_orfs(transcript_and_sequence, start_codons_re, stop_codons_re):
 
     # we need the block information to convert between relative and genomic coordinates
     start = transcript['start']
-    block_lengths = np.fromstring(transcript['exon_lengths'], sep=',', dtype=int)
+
+    block_lengths = np.fromstring(
+        transcript['exon_lengths'],
+        sep=',',
+        dtype=int
+    )
+
     block_starts = np.zeros(len(block_lengths), dtype=int)
     block_starts[1:] = np.cumsum(block_lengths)[:-1]
-    block_relative_starts = np.fromstring(transcript['exon_genomic_relative_starts'], sep=',', dtype=int)
 
+    block_relative_starts = np.fromstring(
+        transcript['exon_genomic_relative_starts'],
+        sep=',',
+        dtype=int
+    )
+
+    # for a discussion about why the positions are shifted by -1 before and +1 after get_gen_pos,
+    # please see Issue #64: https://github.com/dieterich-lab/rp-bp/issues/64
     orf_gen_positions = [
         orf_position(
-            start=bed_utils.get_gen_pos(o.start, start, block_lengths, block_starts, block_relative_starts),
-            end=bed_utils.get_gen_pos(o.end, start, block_lengths, block_starts, block_relative_starts)
+            start=bed_utils.get_gen_pos(
+                o.start-1,
+                start,
+                block_lengths,
+                block_starts,
+                block_relative_starts)+1,
+
+            end=bed_utils.get_gen_pos(
+                o.end-1,
+                start,
+                block_lengths,
+                block_starts,
+                block_relative_starts)+1
         ) for o in orf_rel_positions
     ]
 

diff --git a/setup.py b/setup.py
@@ -208,7 +208,7 @@ def readme():
         return f.read()
 
 setup(name='rpbp',
-        version='1.1.5',
+        version='1.1.6',
         description="This package contains the Rp-Bp pipeline for predicting translation of open reading frames from ribosome profiling data.",
         long_description=readme(),
         keywords="rpbp ribosome profiling bayesian inference markov chain monte carlo translation",