Added new isON_pipeline.sh script and already updated isONform version

aljpetri · Jan 24, 2024 · ac45072 · ac45072
1 parent a336dc2
commit ac45072
Show file tree

Hide file tree

Showing 5 changed files with 207 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -3,9 +3,10 @@
 1. [Installation](#installation)
 2. [Introduction](#introduction)
 3. [Output](#output) 
-4. [Running isONform](#Running)
+4. [Input data](#Input_data)
+5. [Running isONform](#Running)
 	1. [Running a test](#runtest)
-5. [Credits](#credits)
+6. [Credits](#credits)
 
 ## Installation <a name="installation"></a>
 
@@ -46,11 +47,13 @@ This command installs isONforms dependencies:
 IsONform generates isoforms out of clustered and corrected long reads.
 For this a graph is built up using the networkx api and different simplification strategies are applied to it, such as bubble popping and node merging.
 The algorithm uses spoa to generate the final isoforms.<br />
-
+## Input data <a name="Input_data"></a>
+The isONpipeline takes .fastq files generated with long-read sequencing techniques (ONT or PacBio) as input. The reads must additionally have been cleaned of barcodes.
+Please make sure that you run the isONpipeline on data that have been processed with [LIMA](https://lima.how/) (PacBio data) or [Pychopper](https://github.com/epi2me-labs/pychopper) (ONT data), so that all barcodes are removed from the reads.
 
 ## Running isONform <a name="Running"></a>
 
-To run the algorithm:<br />
+To only run the isONform algorithm:<br />
 
 
 ```
@@ -59,12 +62,12 @@ python isONform_parallel.py --fastq_folder path/to/input/files --t <nr_cores> --
 
 Note: Please always give absolute paths to the files or folders
 
-the isON-pipeline (isONclust, isONcorrect, isONform) can be run via:
+The full isON-pipeline (isONclust, isONcorrect, isONform) can be found [here](https://github.com/aljpetri/isONform/blob/master/isON_pipeline.sh) and is run via:
 
 ```
 ./isON_pipeline.sh --raw_reads <raw_reads.fq> --outfolder <outfolder> --num_cores <num_cores> --isONform_folder <isONform_folder> --iso_abundance <iso_abundance> --mode <mode>
 ```
-(Note that this requires pychopper, isONclust and isONcorrect to be installed)
+(Please note that this requires [isONclust](https://github.com/ksahlin/isONclust) and [isONcorrect](https://github.com/ksahlin/isONcorrect) to be installed in addition to isONform; the mode `ont_with_pychopper` additionally requires pychopper.)
 
 ## Outputs <a name="Outputs"></a>
 IsONform outputs three main files: transcriptome.fasta, mapping.txt, and support.txt.
@@ -75,6 +78,7 @@ As we cluster reads as in isONcorrect in batches of 1000 reads the 'y' denotes f
 The 'z' denotes a unique identifier which enables us to have unique ids for each isoform that we reconstructed.
 In mapping.txt it is indicated from which original reads an isoform has been reconstructed.
 support.txt gives the support (i.e. how many original reads make up the isoform).
+
 ## Contact <a name="Contact"></a>
 If you encounter any problems, please raise an issue on the issues page, you can also contact the developer of this repository via:
 alexander.petri[at]math.su.se
@@ -86,3 +90,7 @@ Please cite [1] when using isONform.
 
 1. Petri, A. J., & Sahlin, K. (2023). isONform: reference-free transcriptome reconstruction from Oxford Nanopore data. Bioinformatics, 39(Supplement_1), i222-i231. https://academic.oup.com/bioinformatics/article/39/Supplement_1/i222/7210488 .
 
+Please additionally cite [2] and [3] when running the full pipeline.
+
+2. Kristoffer Sahlin, Paul Medvedev. De Novo Clustering of Long-Read Transcriptome Data Using a Greedy, Quality-Value Based Algorithm, Journal of Computational Biology 2020, 27:4, 472-484. [Link](https://www.liebertpub.com/doi/abs/10.1089/cmb.2019.0299).
+3. Sahlin, K., Medvedev, P. Error correction enables use of Oxford Nanopore technology for reference-free transcriptome analysis. Nat Commun 12, 2 (2021). https://doi.org/10.1038/s41467-020-20340-8  [Link](https://www.nature.com/articles/s41467-020-20340-8).
diff --git a/isON_pipeline.sh b/isON_pipeline.sh
@@ -0,0 +1,186 @@
+#!/bin/bash
+set -e
+#the pipeline can be run in different modes:
+
+####ONT data
+# ont_with_pychopper: the full pipeline is run in addition to pychopper (pychopper, isONclust,isONcorrect,isONform)
+# ont_no_pychopper: only the isONpipeline is run without pychopper (isONclust,isONcorrect, isONform)
+
+####PACBIO data
+# pacbio: for PacBio data runs isONclust and isONform
+
+
+###### test modes (only for internal use)
+# analysis: analysis of ont data:  isONclust,isONcorrect and isONform are run (e.g. analyses on the paper)
+# only_isonform: only isONform is run
+#!/bin/bash
+
+programname=$0
+function usage {
+    echo ""
+    echo "Runs the full isON pipeline. Please make sure that the input file has been preprocessed with pychopper for ONT data "
+    echo ""
+    echo "usage: $programname --raw_reads string --outfolder string --num_cores integer --isONform_folder string  --iso_abundance integer --mode string"
+    echo ""
+    echo "  --raw_reads   	        absolute path to the input file (in fastq format)"
+    echo "                          (example: /home/user/Rawdata/raw_reads.fq)"
+    echo "  --outfolder             absolute path to the output folder (the folder in which all outputs are stored)"
+    echo "                          (example: /home/user/analysis_output)"
+    echo "  --num_cores             the number of processors the pipeline may use"
+    echo "                          (example: 8)"
+    echo "  --isONform_folder       the absolute path to the isONform installation on your machine (leave empty if you have installed isONform via pip)"
+    echo "                          (example: /home/user/isONform )"
+    echo "  --iso_abundance         threshold which denotes the minimum read support neccessary for an isoform to be called (also minimum number of reads per cluster in isONclust)"
+    echo "                          (example: 5)"
+    echo "  --mode                  Run mode of the pipeline, possible modes are 'ont_no_pyc' and 'ont_with_pc' for ont data and 'pacbio' for pacbio data"
+    echo "                          (example: ont_no_pychopper/ont_with_pychopper/pacbio)"
+    echo " For ONT data: use 'ont_no_pychopper' if you want to run the isON pipeline and pychopper, use 'ont_with_pychopper' if you only want to run the isON pipeline. Please run pychopper yourself before running the pipeline."
+    echo ""
+}
+
+while [ $# -gt 0 ]; do
+    # Check if the current argument is "--help"
+    if [[ $1 == "--help" ]]; then
+        # Call the usage function and exit with status code 0
+        usage
+        exit 0
+    # Check if the current argument is an option starting with "--"
+    elif [[ $1 == "--"* ]]; then
+        # Extract the option name by removing the leading dashes
+        v="${1/--/}"
+        # Check if the argument for this option was left empty (then we would have the next argument name as next entry)
+        if [[ $2 == "--"* ]]; then
+           # The argument was left empty therefore we leave the argument for the option empty
+           declare "$v"=''
+        else
+           #The argument was not left empty, therefore we properly set the argument as the value for option
+           declare "$v"="$2"
+           #We have to shift only in this case
+           shift
+        fi
+    fi
+    #This is the shift we have to perform each time
+    shift
+done
+
+# Print an error message to stderr and abort the pipeline with exit status 1.
+# Arguments: $* - the message to print
+function die {
+    echo "$*" >&2
+    exit 1
+}
+
+# Validate the parameters that every run of the pipeline needs.
+# On the first missing parameter the help text is shown and the script aborts.
+# --num_cores and --iso_abundance are checked as well because they are passed
+# as option values to every tool call further down.
+if [[ -z $raw_reads ]]; then
+    usage
+    die "Missing parameter --raw_reads"
+elif [[ -z $outfolder ]]; then
+    usage
+    die "Missing parameter --outfolder"
+elif [[ -z $mode ]]; then
+    usage
+    die "Missing parameter --mode"
+elif [[ -z $num_cores ]]; then
+    usage
+    die "Missing parameter --num_cores"
+elif [[ -z $iso_abundance ]]; then
+    usage
+    die "Missing parameter --iso_abundance"
+#elif [[ -z $isONform_folder ]]; then
+#    isONform_folder=''
+    #TODO set isONform folder to '' if not given
+fi
+
+echo "Running `basename $0` raw reads: '$raw_reads' outfolder: '$outfolder' num_cores: '$num_cores' isONform_folder:'$isONform_folder' iso_abundance: '$iso_abundance' mode: '$mode'"
+
+mkdir -p $outfolder
+
+if [ $mode == "ont_with_pychopper" ]
+then
+echo
+echo "Will run pychopper (cdna_classifier.py), isONclust, isONcorrect and isONform. Make sure you have these tools installed."
+echo "For installation see: https://github.com/ksahlin/isONcorrect#installation and  https://github.com/aljpetri/isONform"
+echo
+
+echo
+echo "Running pychopper"
+echo
+
+pychopper  $raw_reads $outfolder/full_length.fq -t $num_cores
+
+echo
+echo "Finished pychopper"
+echo
+
+fi
+
+
+if [ $mode != "only_isonform" ] # this if statement prevents isONclust and isONcorrect from being run
+  then
+    echo
+    echo "Running isONclust"
+    echo
+  if [ $mode == "ont_with_pychopper" ]
+    then
+        /usr/bin/time -v isONclust  --t $num_cores  --ont --fastq $outfolder/full_length.fq \
+             --outfolder $outfolder/clustering
+        /usr/bin/time -v isONclust write_fastq --N $iso_abundance --clusters $outfolder/clustering/final_clusters.tsv \
+                      --fastq $outfolder/full_length.fq --outfolder  $outfolder/clustering/fastq_files
+  elif [ $mode == "ont_no_pychopper" ]
+   then
+       /usr/bin/time -v  isONclust  --t $num_cores  --ont --fastq $raw_reads \
+             --outfolder $outfolder/clustering
+       /usr/bin/time -v isONclust write_fastq --N $iso_abundance --clusters $outfolder/clustering/final_clusters.tsv \
+                      --fastq $raw_reads --outfolder  $outfolder/clustering/fastq_files
+  elif [ $mode == "pacbio" ]
+    then
+       /usr/bin/time -v  isONclust  --t $num_cores  --isoseq  --fastq $raw_reads \
+             --outfolder $outfolder/clustering
+       /usr/bin/time -v isONclust write_fastq --N $iso_abundance --clusters $outfolder/clustering/final_clusters.tsv \
+                      --fastq $raw_reads --outfolder  $outfolder/clustering/fastq_files
+
+  else #[ $mode != "pacbio" ] && [ $mode != "'ont'" ]
+       /usr/bin/time -v  isONclust  --t $num_cores   --fastq $raw_reads \
+             --outfolder $outfolder/clustering
+       /usr/bin/time -v isONclust write_fastq --N $iso_abundance --clusters $outfolder/clustering/final_clusters.tsv \
+                      --fastq $raw_reads --outfolder  $outfolder/clustering/fastq_files
+#This is the pacbio mode
+
+  fi
+
+  echo
+  echo "Finished isONclust"
+  echo
+#conda activate isON311
+
+  if [ $mode != "pacbio" ]
+  then
+    echo
+    echo "Running isONcorrect"
+    echo
+
+    /usr/bin/time -v python3.11 run_isoncorrect --t $num_cores  --fastq_folder $outfolder/clustering/fastq_files  --outfolder $outfolder/correction/
+
+    echo
+    echo "Finished isONcorrect"
+    echo
+  fi
+fi
+echo
+echo "Merging reads back to single file. Corrected reads per cluster are still stored in: " $outfolder/correction/
+echo
+
+echo
+echo "Running isONform"
+echo
+if [ -n "$isONform_folder"  ] #the user has given a path to isONform (cloned from github)
+  then
+  if [ $mode != "pacbio" ] #i.e. we run in ONT mode
+    then
+        /usr/bin/time -v  $isONform_folder/isONform_parallel --t $num_cores  --fastq_folder $outfolder/correction/ --exact_instance_limit 50 --k 20 --w 31 --xmin 14 --xmax 80 --max_seqs_to_spoa 200 --delta_len 10 --outfolder $outfolder/isoforms --iso_abundance $iso_abundance --split_wrt_batches  --delta_iso_len_3 30 --delta_iso_len_5 50
+  else #we run isONform in pacbio mode (adding the keyword clustered to the command)
+        /usr/bin/time -v   $isONform_folder/isONform_parallel --t $num_cores --fastq_folder $outfolder/clustering/fastq_files --exact_instance_limit 50 --k 20 --w 31 --xmin 14 --xmax 80 --max_seqs_to_spoa 200 --delta_len 10 --outfolder $outfolder/isoforms --iso_abundance $iso_abundance --split_wrt_batches --delta_iso_len_3 30 --delta_iso_len_5 50 --clustered
+  fi
+
+
+else #the user has not given a path to isONform (pip installation
+  if [ $mode != "pacbio" ] #i.e. we run in ONT mode
+    then
+        /usr/bin/time -v  isONform_parallel --t $num_cores  --fastq_folder $outfolder/correction/ --exact_instance_limit 50 --k 20 --w 31 --xmin 14 --xmax 80 --max_seqs_to_spoa 200 --delta_len 10 --outfolder $outfolder/isoforms --iso_abundance $iso_abundance --split_wrt_batches  --delta_iso_len_3 30 --delta_iso_len_5 50
+  else #we run isONform in pacbio mode (adding the keyword clustered to the command)
+        /usr/bin/time -v   isONform_parallel --t $num_cores --fastq_folder $outfolder/clustering/fastq_files --exact_instance_limit 50 --k 20 --w 31 --xmin 14 --xmax 80 --max_seqs_to_spoa 200 --delta_len 10 --outfolder $outfolder/isoforms --iso_abundance $iso_abundance --split_wrt_batches --delta_iso_len_3 30 --delta_iso_len_5 50 --clustered
+  fi
+fi
+echo
+echo "Finished isONform"
+echo
+
+echo
+echo "Finished with pipeline and wrote corrected reads into: " $outfolder
+echo
diff --git a/isONform_parallel.py → isONform_parallel b/isONform_parallel.py → isONform_parallel
@@ -5,7 +5,7 @@
 by Kristoffer Sahlin and changed by Alexander Petri to be usable with the isONform code base.
 
 """
-# ! /usr/bin/env python
+
 
 from __future__ import print_function
 import argparse
@@ -42,7 +42,7 @@ def isONform(data):
     help_functions.mkdir_p(outfolder)
     #print("OUT",outfolder)
     #print("Algoparams",isONform_algorithm_params)
-    isONform_exec = os.path.join(isONform_location, "main.py")
+    isONform_exec = os.path.join(isONform_location, "main")
     isONform_error_file = os.path.join(outfolder, "stderr.txt")
     with open(isONform_error_file, "w") as error_file:
         print('Running isONform batch_id:{0}.{1}...'.format(cl_id,batch_id), end=' ')
@@ -311,7 +311,7 @@ def main(args):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="De novo reconstruction of long-read transcriptome reads",
                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--version', action='version', version='%(prog)s 0.3.3')
+    parser.add_argument('--version', action='version', version='%(prog)s 0.3.4')
     parser.add_argument('--fastq_folder', type=str, default=False,
                         help='Path to input fastq folder with reads in clusters')
     parser.add_argument('--t', dest="nr_cores", type=int, default=8, help='Number of cores allocated for clustering')

diff --git a/main.py → main b/main.py → main
@@ -586,7 +586,7 @@ def main(args):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="De novo error correction of long-read transcriptome reads",
                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--version', action='version', version='%(prog)s 0.3.3')
+    parser.add_argument('--version', action='version', version='%(prog)s 0.3.4')
     parser.add_argument('--fastq', type=str, default=False, help='Path to input fastq file with reads')
 
     parser.add_argument('--k', type=int, default=20, help='Kmer size')

diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@
 setup(
 
     name='isONform',  # Required
-    version='0.3.3',  # Required
+    version='0.3.4',  # Required
     description='De novo construction of isoforms from long-read data ',  # Required
     long_description=long_description,  # Optional
     long_description_content_type='text/markdown',
@@ -55,7 +55,7 @@
         'Programming Language :: Python :: 3.11',
     ],
 
-    keywords='Oxford Nanopore isoform prediction',  # Optional
+    keywords='Oxford Nanopore isoform prediction, Pacific Biosciences isoform prediction',  # Optional
 
     # You can just specify package directories manually here if your project is
     # simple. Or you can use find_packages().
@@ -102,5 +102,5 @@
     #         'IsoCon=IsoCon.__main__()',
     #     ],
     # },
-    scripts=['isONform_parallel.py','main.py'],
+    scripts=['isONform_parallel','main'],
 )