Skip to content

Commit

Permalink
fix detection of start/stop codons for 'complete' input
Browse files Browse the repository at this point in the history
* fix/complete-start:
  run prodigal without meta option
  allow empty sequences
  include start and stop codons in output
  add last nights benchmarks to the evaluation README
  add quality measuring scripts
  compare to negative strand for dna_start_t_withstop
  invert s_save to avoid negative numbers
  negate strcmp
  dropme: start with start states
  infinity is sometimes smaller in FGS
  unfix start1/stop1 filenames
  include reverse stop codon in meta output
  fix ACGT typo to correct complete predictions
  revert "extend genes in complete genomes to start/stop codons"
  use corrected dna_start_t
  extend genes in complete genomes to start/stop codons
  • Loading branch information
Felix Van der Jeugt committed Oct 6, 2021
2 parents 7bddb73 + 437ef43 commit 2757c4e
Show file tree
Hide file tree
Showing 10 changed files with 228 additions and 29 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,8 @@
/target
FragGeneScan
FGS+
prodigal
*.ffn
*.faa
*.gff
*.csv
2 changes: 2 additions & 0 deletions meta/evaluation/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*.fasta
*.txt
67 changes: 67 additions & 0 deletions meta/evaluation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Evaluation of FGS, FGSrs, FGS+, Prodigal on whole genomes

Source assembly: https://www.ebi.ac.uk/ena/browser/view/GCA_001628815?show=chromosomes

The 'FASTA' download `ena_data_20210917-1328.fasta` is the complete assembly.

The 'TEXT' download `ena_data_20210917-1328.txt` also contains annotated genes.

## Create the annotations and lengths files

Execute `annotations.py`.

## Create the FGS/FGS+ files (from .aa)

(swap directories to execute these)

```sh
cd path/to/FGS
./FragGeneScan -s ~-/ena_data_20210917-1328.fasta -o ~-/FGS -t complete -w 1
./FGS+ -s ~-/ena_data_20210917-1328.fasta -o ~-/FGS+ -t complete -w 1
cd -
rm FGS.out FGS.ffn
sed -n 's/^>ENA|\([^|]*\)|.*_\([0-9]*\)_\([0-9]*\)_\([+-]\)$/\1,\2,\3,\4/p' FGS.faa > FGS.csv
sed -n 's/^>ENA|\([^|]*\)|.*_\([0-9]*\)_\([0-9]*\)_\([+-]\)$/\1,\2,\3,\4/p' FGS+.faa > FGS+.csv
```

## Create the FGSrs/Prodigal files (from .gff)

```sh
FragGeneScanRs -s ena_data_20210917-1328.fasta -g FGSrs.gff -t complete -w 1
prodigal -i ena_data_20210917-1328.fasta -f gff -o prodigal.gff
grep -v '^#' FGSrs.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > FGSrs.csv
grep -v '^#' prodigal.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > prodigal.csv
```

## Print comparison table

Execute `rates.py`.

## Timings for these predictions using [hyperfine](https://github.com/sharkdp/hyperfine)

Run in the FGS or FGS+ directory (for the training files).

```sh
hyperfine 'FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1' \
'FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1' \
'FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1' \
'prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -f gff -o meta/evaluation/prodigal.gff'
```

```
Benchmark #1: ./FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1
Time (mean ± σ): 3.797 s ± 0.006 s [User: 3.413 s, System: 0.348 s]
Range (min … max): 3.792 s … 3.807 s 5 runs
Benchmark #2: ./FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1
Time (mean ± σ): 369.979 s ± 25.774 s [User: 367.679 s, System: 0.517 s]
Range (min … max): 353.713 s … 415.649 s 5 runs
Benchmark #1: FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1
Time (mean ± σ): 1.703 s ± 0.014 s [User: 1.395 s, System: 0.275 s]
Range (min … max): 1.684 s … 1.719 s 5 runs
Benchmark #4: prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -f gff -o meta/evaluation/prodigal.gff
Time (mean ± σ): 8.533 s ± 0.038 s [User: 8.453 s, System: 0.047 s]
Range (min … max): 8.493 s … 8.573 s 5 runs
```
18 changes: 18 additions & 0 deletions meta/evaluation/annotations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
with open('ena_data_20210917-1328.txt') as f, open('annotations.csv', 'w') as a, open('readlengths.csv', 'w') as l:
for line in f:
if line.startswith('AC'):
accession = line[2:].strip()[:-1]
elif line.startswith('FT gene'):
span = line[9:].strip()
if span.startswith('complement('):
span = span[11:-1]
strand = '-'
else:
strand = '+'
start, end = span.split('..')
print(accession, start, end, strand, sep=',', file=a)
elif line.startswith('SQ'):
parts = line[2:].strip().split(' ')
assert parts[0] == "Sequence"
print(accession, parts[1], sep=',', file=l)
assert parts[2].startswith("BP")
75 changes: 75 additions & 0 deletions meta/evaluation/rates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env python
from collections import namedtuple
from operator import itemgetter

Annotation = namedtuple('Annotation', ['name', 'start', 'end', 'strand'])
Event = namedtuple('Event', ['location', 'annotation', 'prediction'])

def parse_readlengths(filename):
with open(filename) as f:
for line in f:
name, length = line.strip().split(',')
yield name, int(length)

def parse_csv(filename):
with open(filename) as f:
for line in f:
name, start, end, strand = line.strip().split(',')
yield Annotation(name, int(start), int(end), 1 if strand == '+' else -1)

def events(read_lengths, annotations, predictions):
names = dict()
total = 0
for name, length in parse_readlengths(read_lengths):
names[name] = total
total += length

for name, start, end, strand in parse_csv(annotations):
yield Event(names[name] + start, strand, None)
yield Event(names[name] + end, 0, None)

for name, start, end, strand in parse_csv(predictions):
yield Event(names[name] + start, None, strand)
yield Event(names[name] + end, None, 0)

def rates(read_lengths, annotations, predictions):
rates = dict(tp=0, fp=0, tn=0, fn=0)
cl = 0 # current location
ca = 0 # current annotation
cp = 0 # current prediction
for l, a, p in sorted(events(read_lengths, annotations, predictions), key=itemgetter(0)):
if ca == 0 and cp == 0:
rates['tn'] += l - cl
elif ca == cp:
rates['tp'] += l - cl
elif ca == 0 and cp != 0:
rates['fp'] += l - cl
elif ca != 0 and cp == 0:
rates['fn'] += l - cl
else: # different strands
rates['fp'] += l - cl

if a is not None: ca = a
if p is not None: cp = p
cl = l
return rates

body = '{:<10}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}'
head = '{:<10}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}'
print(head.format('tool', 'TP', 'FP', 'TN', 'FN', 'precision', 'sensitivity', 'specificity', 'NPV', 'MCC'))
for tool in ['FGS', 'FGS+', 'prodigal', 'FGSrs']:
r = rates('readlengths.csv', 'annotations.csv', f'{tool}.csv')
tp, fp, tn, fn = r['tp'], r['fp'], r['tn'], r['fn']
t = tp + fp + tn + fn
print(body.format(
tool,
tp / t,
fp / t,
tn / t,
fn / t,
tp / (tp + fp),
tp / (tp + fn),
tn / (tn + fp),
tn / (tn + fn),
(tp * tn - fp * fn) / ((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))**0.5
))
11 changes: 11 additions & 0 deletions meta/out-to-gff.awk
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
BEGIN { print "##gff-version 3"; }
{
s = substr($1, 1, 1)
if (s == ">") {
seqid = substr($1, 2)
} else {
s = split($0, t, "\t")
id = "ID=" seqid "_" t[1] "_" t[2] "_" t[3] ";product=predicted protein"
print seqid "\tFGS\tCDS\t" t[1] "\t" t[2] "\t.\t" t[3] "\t" int(t[4] - 1) "\t" id
}
}
19 changes: 12 additions & 7 deletions src/bin/FragGeneScanRs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use rayon::ThreadPoolBuilder;

extern crate frag_gene_scan_rs;
use frag_gene_scan_rs::dna::{count_cg_content, Nuc};
use frag_gene_scan_rs::gene;
use frag_gene_scan_rs::hmm;
use frag_gene_scan_rs::viterbi::viterbi;

Expand Down Expand Up @@ -236,13 +237,17 @@ fn run<R: Read + Send, W: WritingBuffer + Send>(
let fasta::OwnedRecord { mut head, seq } = record?;
head = head.into_iter().take_while(u8::is_ascii_graphic).collect();
let nseq: Vec<Nuc> = seq.into_iter().map(Nuc::from).collect();
let read_prediction = viterbi(
&global,
&locals[count_cg_content(&nseq)],
head,
nseq,
whole_genome,
);
let read_prediction = if nseq.is_empty() {
gene::ReadPrediction::new(head)
} else {
viterbi(
&global,
&locals[count_cg_content(&nseq)],
head,
nseq,
whole_genome,
)
};
if meta_buffer.is_some() {
read_prediction.meta(&mut metabuf)?;
}
Expand Down
7 changes: 3 additions & 4 deletions src/gene.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ impl ReadPrediction {

pub struct Gene {
pub start: usize,
pub metastart: usize,
pub end: usize,
pub frame: usize,
pub score: f64,
Expand All @@ -89,7 +88,7 @@ impl Gene {
buf.append(
&mut format!(
"{}\t{}\t{}\t{}\t{:.6}\tI:{}\tD:{}\n",
self.metastart,
self.start,
self.end,
if self.forward_strand { '+' } else { '-' },
self.frame,
Expand All @@ -112,12 +111,12 @@ impl Gene {
&mut format!(
"{}\tFGS\tCDS\t{}\t{}\t.\t{}\t{}\tID={}_{}_{}_{};product=predicted protein\n",
head,
self.metastart,
self.start,
self.end,
if self.forward_strand { '+' } else { '-' },
self.frame - 1,
head,
self.metastart,
self.start,
self.end,
if self.forward_strand { '+' } else { '-' }
)
Expand Down
4 changes: 2 additions & 2 deletions src/hmm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,8 @@ pub fn get_train_from_file(
read_noncoding(&mut locals, train_dir.join("noncoding"))?;
read_start(&mut locals, train_dir.join("start"))?;
read_stop(&mut locals, train_dir.join("stop"))?;
read_start1(&mut locals, train_dir.join("start1"))?;
read_stop1(&mut locals, train_dir.join("stop1"))?;
read_start1(&mut locals, train_dir.join("stop1"))?; // keep FGS naming scheme
read_stop1(&mut locals, train_dir.join("start1"))?; // keep FGS naming scheme
read_pwm(&mut locals, train_dir.join("pwm"))?;

Ok((global, locals))
Expand Down
Loading

0 comments on commit 2757c4e

Please sign in to comment.