diff --git a/virsorter/.gitignore b/virsorter/.gitignore new file mode 100644 index 0000000..ba077a4 --- /dev/null +++ b/virsorter/.gitignore @@ -0,0 +1 @@ +bin diff --git a/virsorter/Dockerfile b/virsorter/Dockerfile new file mode 100644 index 0000000..20d2a1f --- /dev/null +++ b/virsorter/Dockerfile @@ -0,0 +1,21 @@ +FROM perl:latest + +MAINTAINER Ken Youens-Clark + +RUN apt-get update && apt-get install libdb-dev -y + +RUN cpanm --force Capture::Tiny + +RUN cpanm --force BioPerl + +RUN cpanm File::Which + +COPY wrapper_phage_contigs_sorter_iPlant.pl /usr/local/bin/ + +COPY Scripts /usr/local/bin/Scripts/ + +COPY bin /usr/local/bin/ + +ENTRYPOINT ["wrapper_phage_contigs_sorter_iPlant.pl"] + +CMD ["-h"] diff --git a/virsorter/LICENSE b/virsorter/LICENSE new file mode 100644 index 0000000..d6a9326 --- /dev/null +++ b/virsorter/LICENSE @@ -0,0 +1,340 @@ +GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. 
+ + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. 
You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + {description} + Copyright (C) {year} {fullname} + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + {signature of Ty Coon}, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
+ diff --git a/virsorter/README.md b/virsorter/README.md new file mode 100644 index 0000000..779b123 --- /dev/null +++ b/virsorter/README.md @@ -0,0 +1,64 @@ +# VirSorter + +Source code of the VirSorter App, available on iPlant (https://de.iplantcollaborative.org/de/). + +# Dependencies + +Install the following into a "bin" directory: + +* HMMER (http://hmmer.janelia.org/) +* MCL (http://micans.org/mcl/) +* Metagene Annotator (http://metagene.nig.ac.jp/metagene/download_mga.html) +* MUSCLE (http://www.drive5.com/muscle/) +* BLAST (ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/LATEST/; the legacy release that provides blastall and formatdb, not BLAST+) + +# Docker + +## Data Container + +The 12 GB of reference data that VirSorter depends on lives in a separate data container +called "virsorter-data". + +The data container is built from the following Dockerfile: + + FROM perl:latest + + MAINTAINER Ken Youens-Clark + + COPY Generic_ref_file.refs /data/ + + COPY PFAM_27 /data/PFAM_27 + + COPY Phage_gene_catalog /data/Phage_gene_catalog + + COPY Phage_gene_catalog_plus_viromes /data/Phage_gene_catalog_plus_viromes + + COPY SUP05_SAGs_with_viruses.fna /data/ + + COPY VirSorter_Readme.txt /data + + COPY VirSorter_Readme_viromes.txt /data + + VOLUME ["/data"] + +Then build the image and create the container: + + $ docker build -t kyclark/virsorter-data . + $ docker create --name virsorter-data kyclark/virsorter-data /bin/true + +## Build + + $ docker build -t kyclark/virsorter . + +## Run + +A sample "run" command that uses the current working directory for input/output: + + $ docker run --rm --volumes-from virsorter-data -v $(pwd):/de-app-work \ + -w /de-app-work kyclark/virsorter --fna Mic_1.fna --db 1 + +# Authors + +Simon Roux is the author of VirSorter. + +Ken Youens-Clark packaged this for Docker/iPlant. 
diff --git a/virsorter/Scripts/Sliding_windows_3 b/virsorter/Scripts/Sliding_windows_3 new file mode 100755 index 0000000..5a432e4 Binary files /dev/null and b/virsorter/Scripts/Sliding_windows_3 differ diff --git a/virsorter/Scripts/Sliding_windows_3.c b/virsorter/Scripts/Sliding_windows_3.c new file mode 100755 index 0000000..cc18a06 --- /dev/null +++ b/virsorter/Scripts/Sliding_windows_3.c @@ -0,0 +1,259 @@ +#include +#include +#include +#include + +long double factorial(unsigned n){ + long double f=1; + while(n>0){f*=n--;} + return f; +} + +long double combination(unsigned k,unsigned n){ + long double f=(factorial(n) / (factorial(k) * factorial(n-k))); + return f; +} + + +long double combination_eff(unsigned k,unsigned n){ + long double num=1; + if (k<(n/2)){k=n-k;} + int n_2=n; + while (n_2>k){num*=n_2--;} + long double f= num / factorial(n-k); + return f; +} + +long double proba_n(unsigned n,unsigned k, long double proba){ + long double result=combination_eff(k,n) * powl(proba,k) * powl((1-proba),(n-k)); // New way more efficient to compute combination + return result; +} + + +long double proba_more_than(int n,int k, long double proba){ + long double result=0.0; + while(k<=n) { + result+=combination_eff(k,n) * powl(proba,k) * powl((1-proba),(n-k)); + k++; + } + return result; +} + + +long double proba_less_than(int n,int k, long double proba){ + long double result=0.0; + while(k>=0) { + result+=combination_eff(k,n) * powl(proba,k) * powl((1-proba),(n-k)); + k--; + } + return result; +} + +int get_th(int size_window,long double threshold, long double proba){ + int th_nb_gene=size_window+1; + long double p_t=0.0; +// printf("starting at %d / with proba %LE\n",th_nb_gene,proba); + while(p_t<=threshold && th_nb_gene>0){ + th_nb_gene--; + p_t = p_t + proba_n(size_window,th_nb_gene,proba); +// printf("\tp(x>=%d) = %LE\n",th_nb_gene,p_t); + } + return th_nb_gene; +} + + +int get_th_less(int size_window,long double threshold, long double proba){ + int th_nb_gene=-1; 
+ long double p_t=0.0; + while(p_t<=threshold && th_nb_gene=0 && j>=0 && istore[start][size][type]){ + result=0; +// i=start+hood+1;j=size+hood+1; + } + } + } + } + return result; +} + + +long double log10perso(long double x){ + return log(x)/log(10); +} + + +int main(int argc, char *argv[]) +{ +// printf( "I am alive! Beware.\n" ); + FILE *ifp, *reffile; + char* refFilename=argv[1];char* inputFilename=argv[2];char* outputFilename=argv[3]; + reffile=fopen(refFilename,"r"); + int nb_genes=0,phage=0,pfam=0,unch=0,size=0,strand=0,hallmark=0,i=0,noncaudo=0; + float f_size=0.0; + long double p_phage=0.0,p_pfam=0.0,p_unch=0.0,p_strand=0.0,p_noncaudo=0.0; + if (reffile == NULL) { + fprintf(stderr, "Can't open input file %s\n",refFilename); + exit(1); + } + while (fscanf(reffile,"%Lf %Lf %Lf %Lf %f %Lf", &p_phage, &p_pfam, &p_unch, &p_strand, &f_size, &p_noncaudo) == 6) {} + printf("refs => %LE %LE %LE %LE %f %LE\n", p_phage, p_pfam, p_unch, p_strand, f_size, p_noncaudo); + fclose(reffile); + ifp = fopen(inputFilename, "r"); + if (ifp == NULL) { + fprintf(stderr, "Can't open input file %s!\n",inputFilename); + exit(1); + } + if (fscanf(ifp, "%d", &nb_genes) == 1){ +// printf("%d genes\n",nb_genes); + } + // Alloc memory for gene tables + int t_phage[nb_genes],t_pfam[nb_genes],t_unch[nb_genes], t_size[nb_genes],t_strand[nb_genes],t_hallmark[nb_genes],t_noncaudo[nb_genes]; + while (fscanf(ifp,"%d %d %d %d %d %d %d", &phage, &noncaudo, &pfam, &unch, &size, &strand, &hallmark) == 7) { +// printf("gene %d => %d %d %d %d %d %d %d\n", i, phage, noncaudo, pfam, unch, size, strand, hallmark); + t_phage[i]=phage; + t_noncaudo[i]=noncaudo; + t_pfam[i]=pfam; + t_unch[i]=unch; + t_size[i]=size; + t_strand[i]=strand; + t_hallmark[i]=hallmark; + i++; + } + fclose(ifp); + if (nb_genes!=i){ + printf("Houston we got a problem !!!!!! 
: we had %d genes and we count %d lines\n",nb_genes,i); + exit(1); + } +// // set up sliding windows + int min=10,max=100; + if (min>nb_genes){min=nb_genes;} + if (max>nb_genes){max=nb_genes;} +// // how many sliding windows will we have ? + int k=0,j=0,max_g=0,c_phage=0,c_pfam=0,pred_nb_s_w=0,t=0,th_nb_gene=0; + for (k=min;k<=max;k++){ + pred_nb_s_w+=nb_genes-k+1; + } +// printf("Predicting %d sliding windows\n",pred_nb_s_w); + // computing the threshold for each size of sliding window +// printf("Trying to allocate the memory 1\n"); + long double th=0.01/pred_nb_s_w,p_t=0.0; + // alloc memory for score matrix for the 6 metrics + double ***store=malloc(nb_genes*sizeof(double **)); + if (store==NULL){printf("out of memory\n");exit(1);} + for(i=0; i < nb_genes; i++){ + store[i] = malloc(max * sizeof(double *)); + if(store[i] == NULL){printf("out of memory\n");exit(1);} + for (j=0;j<=max;j++){ + store[i][j] = malloc(6 * sizeof(double )); + if(store[i][j] == NULL){printf("out of memory\n");exit(1);} + for (k=0;k<6;k++){store[i][j][k]=0;} + } + } +// printf("Memory Allocated and Initialized for %d %d 5\n",nb_genes,max); + int store_h[nb_genes][max]; + int n_phage=0,n_pfam=0,n_short=0,n_switch=0,n_unch=0,n_hallmark=0,n_noncaudo=0; + printf("For this contig we'll have %d sliding windows (= nb of comparison)\n",pred_nb_s_w); + for (k=max;k>=min;k--){ + int th_phage=k,th_pfam=k,th_size=k,th_unch=k,th_strand=k,th_noncaudo=k; + // we get all thresholds + th_phage=get_th(k,th,p_phage); +// printf("For window size %d, you will need at least %d phage genes to be significant\n",k,th_phage); + th_pfam=get_th_less(k,th,p_pfam); + th_unch=get_th(k,th,p_unch); +// printf("For window size %d, you will need at least %d uncharacterized genes to be significant\n",k,th_unch); + th_size=get_th(k,th,0.1); + th_strand=get_th_less(k,th,p_strand); + th_noncaudo=get_th(k,th,p_noncaudo); +// printf("For window size %d, you will need at least %d noncaudo genes to be 
significant\n",k,th_noncaudo); +// printf("////// Sliding window of %d genes -> th %d\n",k,th_phage); + // For all the sliding windows of this size, we count and compute and store the significativity value if > sig + for (i=0;i<(nb_genes-k+1);i++){ + n_phage=0;n_pfam=0;n_unch=0;n_short=0;n_switch=0;n_hallmark=0;n_noncaudo=0; +// // Counting + for (j=i;j<(i+k);j++){ + n_phage+=t_phage[j]; +// printf("Adding %d to the number of phage genes (%d)\n",t_phage[j],j); + n_pfam+=t_pfam[j]; + n_unch+=t_unch[j]; + n_short+=t_size[j]; + n_switch+=t_strand[j]; + n_hallmark+=t_hallmark[j]; + n_noncaudo+=t_noncaudo[j]; + } + unsigned tag=0; +// // If above thresholds + if (n_phage>th_phage){ +// // Calculate and store significativity + store[i][k][0]=-1*log10(proba_more_than(k,n_phage,p_phage)*pred_nb_s_w);tag=1; +// printf("Phage => %d is beyond the threshold %d, so we compute its significativity %E, that we store in %d, %d, 0\n",n_phage,th_phage,store[i][k][0],i,k); + } + if (n_pfam %d is below the threshold %d, so we compute its significativity %E, that we store in %d, %d, 1\n",n_pfam,th_pfam,store[i][k][1],i,k); + } + if (n_unch>th_unch){ +// // Calculate and store significativity + store[i][k][2]=-1*log10(proba_more_than(k,n_unch,p_unch)*pred_nb_s_w);tag=1; +// printf("Unch => %d is beyond the threshold %d, so we compute its significativity %E, that we store in %d, %d, 2\n",n_unch,th_unch,store[i][k][2],i,k); + } + if (n_short>th_size){ +// // Calculate and store significativity + store[i][k][3]=-1*log10(proba_more_than(k,n_short,0.1)*pred_nb_s_w);tag=1; +// printf("Short => %d is beyond the threshold %d, so we compute its significativity %E, that we store in %d, %d, 3\n",n_short,th_size,store[i][k][3],i,k); + } + if (n_switch %d is below the threshold %d, so we compute its significativity %E, that we store in %d, %d, 4\n",n_switch,th_strand,store[i][k][4],i,k); + } + if (n_noncaudo>th_noncaudo){ +// // Calculate and store significativity + 
store[i][k][5]=-1*log10(proba_more_than(k,n_noncaudo,p_noncaudo)*pred_nb_s_w);tag=1; +// printf("Phage => %d is beyond the threshold %d, so we compute its significativity %E, that we store in %d, %d, 0\n",n_phage,th_phage,store[i][k][0],i,k); + } + if (tag==1){store_h[i][k]=n_hallmark;} + } + } + // We look for local maxima and export the results + FILE *ofp; + ofp = fopen(outputFilename, "w"); + if (ofp == NULL) { + fprintf(stderr, "Can't open output file %s!\n",outputFilename); + exit(1); + } + for (k=max;k>=min;k--){ + for (i=0;i<(nb_genes-k+1);i++){ + for (j=0;j<6;j++){ + if (store[i][k][j] != 0.0){ // the stored value is not null +// printf("potential local maximum %d %d %d %E %d\n",i,k,j,store[i][k][j],store_h[i][k]); + if (is_local_maximum(i,k,j,nb_genes-1,max,store)==1){ // and is a local maxima + // so we print it, with the nb_hallmark (start / window size / type / sig / nb_hallmark) +// printf("local maximum ! %d %d %d %E %d\n",i,k,j,store[i][k][j],store_h[i][k]); + // i - start gene / k - sliding window size / j - proof typ (0 - phage / 1 - pfam / 2 - unch / 3 - size / 4 - strand) + fprintf(ofp, "%d\t%d\t%d\t%.14lf\t%d\n",i,k,j,store[i][k][j],store_h[i][k]); + } + } + } + } + } + fclose(ofp); + printf("done"); + // We export the results + return 0; +} diff --git a/virsorter/Scripts/Step_0_make_new_clusters.pl b/virsorter/Scripts/Step_0_make_new_clusters.pl new file mode 100755 index 0000000..1982cb7 --- /dev/null +++ b/virsorter/Scripts/Step_0_make_new_clusters.pl @@ -0,0 +1,257 @@ +#!/usr/bin/env perl + +use strict; +use autodie; +use File::Spec::Functions; +use File::Path 'mkpath'; +use File::Which 'which'; + +# Script to generate a new db with putative new clusters +# Argument 0 : revision directory +# Argument 1 : Fasta file of the predicted proteins +# Argument 2 : Fasta file of the unclustered from previous Runs +# Argument 3 : Liste of prots to try to cluster +if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq 
"--help") || (!defined($ARGV[3]))) +{ + print "# Script to generate a new db with putative new clusters +# Argument 0 : revision directory +# Argument 1 : Fasta file of the predicted proteins +# Argument 2 : Fasta file of the unclustered from previous Runs +# Argument 3 : Liste of prots to try to cluster\n"; + die "\n"; +} + +my $path_to_blastall = which("blastall"); +my $MCX_LOAD = which("mcxload"); +my $MCL = which("mcl"); + +my $r_dir=$ARGV[0]; +$r_dir=~/(r_\d*)\/?$/; +my $r_number=$1; +print "Revision $r_number\n"; +my $fasta_prot_contigs=$ARGV[1]; +my $fasta_prot_unclustered=$ARGV[2]; +my $liste=$ARGV[3]; +my $blast_unclustered=$fasta_prot_unclustered; +$blast_unclustered=~s/Pool_unclustered.faa/Blast_unclustered.tab/; + +my $path_to_formatdb = which("formatdb"); +my $path_to_blastal = which("blastall"); + +my $min_seq_in_a_cluster=3; + +my $path_to_muscle= which("muscle"); +my $path_to_hmmbuild= which("hmmbuild"); +my $path_to_hmmpress= which("hmmpress"); + + +my %check; +open(LI,"<$liste") || die ("pblm opening liste $liste\n"); +while (
  • ){ + chomp($_); + my @tab=split(",",$_); + foreach(@tab){$check{$_}=1;} +} +close LI; + +my $pool_new= catfile($r_dir, "pool_new_proteins.fasta"); +open(S1,">$pool_new") || die ("pblm opening file $pool_new"); +open(FA,"<$fasta_prot_contigs") || die ("pblm opening file $fasta_prot_contigs"); +my $tag=0; +while (){ + chomp($_); + if ($_=~/^>(.*)/){ + my $seq=$1; + $tag=0; + if ($check{$seq}==1){ + print S1 "$_\n"; + $tag=1; + } + } + elsif($tag==1){ + print S1 "$_\n"; + } +} +close FA; +close S1; + +my $db= catfile($r_dir, "pool_new_proteins"); +my $cmd_format="$path_to_formatdb -i $pool_new -n $db"; +print "$cmd_format\n"; +my $out=`$cmd_format`; +print "Formatdb : $out\n"; +my $cmd_cat="cat $fasta_prot_unclustered >> $pool_new"; +print "$cmd_cat\n"; +$out=`$cmd_cat`; +print "Cat : $cmd_cat\n"; +# BLAST des unclustered et des new contre les new +my $out_blast=catfile($r_dir, "pool_unclustered-and-new-proteins-vs-new-proteins.tab"); +my $cmd_blast="$path_to_blastall -p blastp -i $pool_new -d $db -o $out_blast -m 8 -a 10 -e 0.00001"; # On 10 cores to keep a few alive for the rest of the scripts +print "$cmd_blast\n"; +$out=`$cmd_blast`; +print "Blast : $out\n"; +$cmd_cat="cat $blast_unclustered >> $out_blast"; +print "$cmd_cat\n"; +$out=`$cmd_cat`; +print "Cat : $out\n"; +print "Generating abc file\n"; +my $out_abc= catfile($r_dir, "new_clusters.abc"); +my $th_score=50; +my $th_evalue=0.00001; +my $max=200; # Max on sig +open(S1,">$out_abc") || die ("pblm opening file $out_abc\n"); +open(BL,"<$out_blast") || die ("pblm opening file $out_blast\n"); +while(){ + chomp($_); + my @tab=split("\t",$_); + if ($tab[11]>$th_score && $tab[10]<$th_evalue && $tab[0] ne $tab[1]){ + my $evalue=$tab[10]; +# $evalue=-log10($evalue); +# if ($evalue>$max){$evalue=$max;} + print S1 "$tab[0]\t$tab[1]\t$evalue\n"; + } +} +close BL; +close S1; +my $out_mci=catfile($r_dir, "new_clusters.mci"); +my $out_tab=catfile($r_dir, "new_clusters.tab"); +my $cmd_mcxload="$MCX_LOAD -abc $out_abc 
--stream-mirror --stream-neg-log10 -stream-tf 'ceil(200)' -o $out_mci -write-tab $out_tab"; +print "$cmd_mcxload\n"; +$out=`$cmd_mcxload`; +print "Mxc Load : $out\n"; +my $dump_file=catfile($r_dir, "new_clusters.csv"); +my $cmd_mcl="$MCL $out_mci -I 2 -use-tab $out_tab -o $dump_file"; +print "$cmd_mcl\n"; +$out=`$cmd_mcl`; +print "Mcl : $out\n"; + +my %unclustered; +my %clusters; +my %check_cluster; +my $last_cluster_id=0; +# toutes les séquences clusterisées dans des groupes de plus de 2 (3 et plus) -> on prend / Toutes les autres on les garde en tant qu'unclustered +open(DUMP,"<$dump_file") || die "pblm ouverture fichier $dump_file\n"; +while(){ + chomp($_); + my @tab=split("\t",$_); + my $n_s_c=$#tab+1; + if ($n_s_c>=$min_seq_in_a_cluster){ + # on a trouvé un cluster de plus de deux + my $cluster_id=$last_cluster_id+1; + $cluster_id="Phage_cluster_".$cluster_id."-".$r_number; + print "We found a cluster with $n_s_c sequences => Cluster $cluster_id\n"; + $last_cluster_id++; + foreach(@tab){ + $clusters{$cluster_id}{$_}=1; + $check_cluster{$_}=1; + } + } + else{ + foreach(@tab){ + $unclustered{$_}=1; + $check_cluster{$_}=1; + } + } +} +close DUMP; +my %seq_temp; +my $id_c=""; +open(FA,"<$pool_new") || die "pblm ouverture fichier $pool_new\n"; +while(){ + chomp($_); + if ($_=~/^>(\S*)/){ + $id_c=$1; + if (!defined($check_cluster{$id_c})){$unclustered{$id_c}=1;$check_cluster{$id_c}=1;} + } + else{$seq_temp{$id_c}.=$_;} +} +close FA; + +`mkdir $r_dir/clusts`; +foreach(keys %clusters){ + my $cluster_id=$_; + my $out_file=catfile($r_dir, "clusts", $cluster_id . 
".faa"); + open(S1,">$out_file") || die "pblm ouverture fichier $out_file\n"; + foreach(keys %{$clusters{$cluster_id}}){ + print S1 ">$_\n$seq_temp{$_}\n"; + } + close S1; +} + +mkpath(catdir($r_dir, 'db')); +my $pool_unclustered=catfile($r_dir, "db", "Pool_unclustered.faa"); +my $blast_unclustered=catfile($r_dir, "db", "Blast_unclustered.tab"); +my $pool_new_unclustered=catfile($r_dir, "db", "Pool_new_unclustered.faa"); +my $blastable_new_unclustered=$pool_new_unclustered; +$blastable_new_unclustered=~s/\.faa//; + +open(S2,">$pool_new_unclustered") || die "pblm ouverture fichier $pool_new_unclustered\n"; +open(S1,">$pool_unclustered") || die "pblm ouverture fichier $pool_unclustered\n"; +foreach(keys %unclustered){ + print S1 ">$_\n$seq_temp{$_}\n"; + if ($check{$_}==1){ + print S2 ">$_\n$seq_temp{$_}\n"; + } +} +close S1; +close S2; +print "making a blastable db from the new unclustered\n"; +$out=`$path_to_formatdb -i $pool_new_unclustered -n $blastable_new_unclustered`; +# on réduit aussi le fichier blast qu'on ajoute au blast des unclustered +open(BL,"<$out_blast") || die "pblm ouverture fichier $out_blast\n"; +open(S1,">$blast_unclustered") || die "pblm ouverture fichier $blast_unclustered\n"; +while(){ + chomp($_); + my @tab=split("\t",$_); + if ($unclustered{$tab[0]}==1 && $unclustered{$tab[1]}==1){ + print S1 "$_\n"; + } +} +close BL; +close S1; + +my $tag=0; +foreach(sort keys %clusters){ + $tag=1; + my $ali_id=$_; + my $path_to_file= catfile($r_dir, "clusts", $ali_id); + my $path_to_fasta=catfile($r_dir, "clusts", $ali_id . ".faa"); + my $path_to_ali=catfile($r_dir, "clusts", $ali_id . ".ali_faa"); + my $path_to_hmm=catfile($r_dir, "clusts", $ali_id . 
"_ali.hmm"); + if (-e $path_to_ali){ + `rm $path_to_ali $path_to_hmm`; + } + my $muscle_out= catfile($r_dir, "log_out_muscle"); + my $muscle_err= catfile($r_dir, "log_err_muscle"); + `$path_to_muscle -in $path_to_fasta -out $path_to_ali > $muscle_out 2> $muscle_err`; + my $out_stokcholm=$path_to_ali.".stockholm"; + open(S1,">$out_stokcholm") || die "pblm opening $out_stokcholm\n"; + print S1 "# STOCKHOLM 1.0\n"; + open(FA,"<$path_to_ali") || die "pblm ouverture $path_to_ali\n"; + while(){ + chomp($_); + if ($_=~/^>(.*)/){ + my $id=$1; + $id=~s/\s/_/g; + print S1 "\n$id "; + + } + else{print S1 "$_";} + } + close FA; + print S1 "\n//\n"; + `$path_to_hmmbuild --amino $path_to_hmm $out_stokcholm`; +} + +my @tab_hmm=<$r_dir/clusts/*.hmm>; +if ($#tab_hmm>=0){ + # we gather all hmm and fasta (if any) + $out=`cat $r_dir/clusts/*.hmm >> $r_dir/db/Pool_clusters.hmm`; + print "cat new hmm : $out\n"; + # and create a new database for hmmsearch + $out=`$path_to_hmmpress $r_dir/db/Pool_clusters.hmm`; + print "hmm press :$out\n"; +} +else{ + $out=`touch $r_dir/db/Pool_clusters.hmm`; +} diff --git a/virsorter/Scripts/Step_1_contigs_cleaning_and_gene_prediction.pl b/virsorter/Scripts/Step_1_contigs_cleaning_and_gene_prediction.pl new file mode 100755 index 0000000..98aac5d --- /dev/null +++ b/virsorter/Scripts/Step_1_contigs_cleaning_and_gene_prediction.pl @@ -0,0 +1,356 @@ +#!/usr/bin/env perl + +use strict; +use autodie; +use Bio::Seq; +use File::Spec::Functions; +use File::Which 'which'; + +# Script to detect circular contigs, nett sequences, and predict genes with mga +# Argument 0 : Fasta file of contigs +if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq "--help") || (!defined($ARGV[3]))) +{ + print "# Script to detect circular contigs, nett sequences, and predict genes with mga +# Argument 0 : Id du dataset +# Argument 1 : Working dir +# Argument 2 : Fasta file of contigs +# Argument 3 : Threshold on the number of genes \n"; + die "\n"; 
}

# Positional arguments and the file names derived from them.
my $id            = $ARGV[0];    # dataset id, used as prefix of every output file
my $tmp_dir       = $ARGV[1];    # working directory
my $fasta_contigs = $ARGV[2];    # input contigs (fasta)
my $th_nb_genes   = $ARGV[3];    # minimum number of complete genes to keep a contig
my $path_to_mga   = which('mga_linux_ia64');
my $in_file           = catfile($tmp_dir, $id . "_nett.fasta");
my $circu_file        = catfile($tmp_dir, $id . "_circu.list");
my $out_special_circu = catfile($tmp_dir, $id . "_contigs_circu_temp.fasta");

# Reading fasta file of the contigs
open my $fa, '<', $fasta_contigs;
my %seq_base;
my $id_seq = "";
while (<$fa>) {
    $_ =~ s/\r\n/\n/g;    # input may be a Windows file
    chomp($_);
    if ($_ =~ /^>(\S*)/) { $id_seq = $1; }
    else                 { $seq_base{$id_seq} .= $_; }
}
close $fa;

## DETECTION OF CIRCULAR CONTIG AND CLEANING OF THESE CIRCULAR (REMOVE THE MATCHING ENDS)
my $minimum_size = 1500;
my %order_contig;
my %length;
my $n1 = 0;

open my $s1, '>', $in_file;
open my $s2, '>', $circu_file;
for my $id_contig (
    sort { length($seq_base{$b}) <=> length($seq_base{$a}) } keys %seq_base) {
    my $rank = $n1;
    $n1++;
    $order_contig{$id_contig} = $rank;
    my $s = $seq_base{$id_contig};
    $length{$id_contig} = length($seq_base{$id_contig});
    # Test its circularity: the contig is circular when its first 10 bases
    # occur again further on and everything from that occurrence to the end
    # is identical to the start of the contig.
    # \Q..\E keeps the prefix literal inside the pattern.
    my $prefix = substr($seq_base{$id_contig}, 0, 10);
    if ($s =~ /(.+)(\Q$prefix\E.*?)$/) {
        my $sequence = $1;
        my $suffixe  = $2;
        my $test     = substr($seq_base{$id_contig}, 0, length($suffixe));
        if ($suffixe eq $test) {
            my $l = $length{$id_contig};
            $id_contig = $id_contig . "-circular";
            $length{$id_contig} = $l;
            # The renamed id is the one used everywhere downstream, so it
            # needs its own rank; otherwise every later sort on
            # %order_contig compares undefined values for circular contigs.
            $order_contig{$id_contig} = $rank;
            print $s2 "$id_contig\t$length{$id_contig}\n";
            $seq_base{$id_contig} = $sequence;    # stored without the repeated end
        }
    }
    # Update the length of the contig
    $length{$id_contig} = length($seq_base{$id_contig});
    print $s1 ">$id_contig\n$seq_base{$id_contig}\n";
}
close $s1;
close $s2;

# Gene prediction for all contigs
my $out_file = $tmp_dir . "/" . $id . "_mga.predict";
print "$path_to_mga $in_file -m > $out_file\n";
my $mga = `$path_to_mga $in_file -m > $out_file`;

# Special prediction for circular contigs if we have some
# NOTE(review): $circu_file is always created above, so this test looks
# always true; the real guard is $n_circu below.
my $out_file_circu = "";
my %circu;
if (-e $circu_file) {
    open my $tsv, '<', $circu_file;
    while (<$tsv>) {
        chomp($_);
        my @tab  = split("\t", $_);
        my $id_c = $tab[0];
        $circu{$id_c} = 1;
    }
    close $tsv;
    open my $s3, '>', $out_special_circu;
    my $long        = 1000; # we cp the 1000 first bases to the end of the contig
    my $seuil_long  = 1000;
    my $n_circu     = 0;
    foreach (sort { $order_contig{$a} <=> $order_contig{$b} } keys %circu) {
        my $id_c = $_;
        my $s    = $seq_base{$id_c} . substr($seq_base{$id_c}, 0, $long);
        print $s3 ">$id_c\n$s\n";
        $n_circu++;
    }
    close $s3;
    $out_file_circu = $tmp_dir . "/" . $id . "_special_circus_mga.predict";
    if ($n_circu > 0) {
        my $mga = `$path_to_mga $out_special_circu -m > $out_file_circu`;
    }
    else {
        `touch $out_file_circu`;
    }
}

# Mix 'n match of the two results of gene prediction
my %order_gene;
my $n2 = 0;
open my $fts, '<', $out_file;
my %predict;
my %type;
my $id_c = "";
while (<$fts>) {
    chomp($_);
    if ($_ =~ /^# gc/) { }
    elsif ($_ =~ /^# self: (.*)/) { $type{$id_c} = $1; }
    elsif ($_ =~ /^# (.*)/) {
        $id_c = $1;
        $n2   = 0;
    }
    else {
        my @tab = split("\t", $_);
        $predict{$id_c}{$tab[0]} = $_;
        if (!defined($order_gene{$id_c}{$tab[0]})) { $order_gene{$id_c}{$tab[0]} = $n2; $n2++; }
    }
}
close $fts;

# Clean up the predictions of one circular contig after some of its ORFs
# were remapped across the origin.
#   $predict_of    : hashref gene name -> prediction line (modified in place)
#   $order_of      : hashref gene name -> rank of the gene on the contig
#   $contig_length : length of the contig
# Genes are visited in contig order: incomplete ORFs (column 5 != 11) that
# touch either end are removed, remembering those starting at the origin, and
# an incomplete ORF stopping at 998-1000 is extended to the stop of a
# remembered ORF when the joined length is a multiple of 3.
# NOTE(review): the 998-1000 window presumably matches the 1000 bases
# appended to circular contigs above - confirm.
sub clean_origin_spanning_orfs {
    my ($predict_of, $order_of, $contig_length) = @_;
    my %to_start;
    foreach my $gene (sort { $order_of->{$a} <=> $order_of->{$b} } keys %{$predict_of}) {
        my @tab = split("\t", $predict_of->{$gene});
        next if $tab[5] == 11;
        # $tab[0] miss start and/or stop codon
        if (($tab[1] < 3) || ($tab[2] > ($contig_length - 3))) {
            # And it spans the origin, so we can remove it
            if ($tab[1] < 3) {
                $to_start{$tab[0]}{"start"} = $tab[1];
                $to_start{$tab[0]}{"stop"}  = $tab[2];
            }
            delete($predict_of->{$tab[0]});
        }
        elsif (($tab[2] > 997) && ($tab[2] < 1001)) { # if we are around the zone of ~ 1000
            foreach my $start_gene (keys %to_start) {
                my $total = ($contig_length - $tab[1] + 1) + ($to_start{$start_gene}{"stop"});
                if ($total % 3 == 0) {
                    $tab[2] = $to_start{$start_gene}{"stop"};
                    $tab[5] = 11;
                    $predict_of->{$tab[0]} = join("\t", @tab);
                }
            }
        }
    }
    return;
}

if (-e $circu_file) {
    open my $fts_c, '<', $out_file_circu;
    my $tag = 0;    # 1 when a prediction of the current contig was remapped
    while (<$fts_c>) {
        chomp($_);
        if ($_ =~ /^# gc/) { }
        elsif ($_ =~ /^# self: (.*)/) { $type{$id_c} = $1; }
        elsif ($_ =~ /^# (.*)/) {
            my $next_id = $1;
            if ($tag == 1) {
                # Some ORFs were modified, we clean up
                $predict{$id_c} ||= {};
                clean_origin_spanning_orfs($predict{$id_c}, $order_gene{$id_c} || {}, $length{$id_c});
            }
            $id_c = $next_id;
            $tag  = 0;
        }
        else {
            my @tab = split("\t", $_);
            if (defined($predict{$id_c}{$tab[0]})) {
                my @tab2 = split("\t", $predict{$id_c}{$tab[0]});
                if (($tab2[1] == $tab[1]) && ($tab2[2] == $tab[2])) { } # same prediction, we don't change anything
                else {
                    if (($tab[1] < $length{$id_c}) && ($tab[2] > $length{$id_c})) {
                        # we span the origin, we replace the prediction
                        $tag = 1;
                        my $stop = $tab[2] - $length{$id_c};
                        $tab[2] = $stop;
                        my $new_line = join("\t", @tab);
                        $predict{$id_c}{$tab[0]} = $new_line;
                    }
                }
            }
            else {
                # we predict a new gene, we keep only if at the start / end of the contig
                if (($tab[1] < $length{$id_c}) && ($tab[2] > $length{$id_c})) {
                    $tag = 1;
                    my $stop = $tab[2] - $length{$id_c};
                    $tab[2] = $stop;
                    my $new_line = join("\t", @tab);
                    $predict{$id_c}{$tab[0]} = $new_line;
                }
            }
        }
    }
    if ($tag == 1) {
        # we changed some things on the last contig, we clean up
        $predict{$id_c} ||= {};
        clean_origin_spanning_orfs($predict{$id_c}, $order_gene{$id_c} || {}, $length{$id_c});
    }
    close $fts_c;
}

## Generation of the final files
## One with all sequences nett and filtered (based on number of genes) - Fasta
## One of the associated gene prediction - MGA-like
## One of the predicted protein sequences - Fasta
my $final_file = $tmp_dir . "/" . $id . "_nett_filtered.fasta";
my $out_final  = $tmp_dir . "/" . $id . "_mga_final.predict";
my $prot_file  = $tmp_dir . "/" . $id . "_prots.fasta";

open my $fa_s,   '>', $final_file;
open my $out_s,  '>', $out_final;
open my $prot_s, '>', $prot_file;
my $n = 0;
foreach (sort { $order_contig{$a} <=> $order_contig{$b} } keys %predict) {
    $n++;
    if ($n % 10000 == 0) { print "$n-ieme contig\n"; }
    my $id = $_;
    my @tab_genes = sort { $order_gene{$id}{$a} <=> $order_gene{$id}{$b} } keys %{$predict{$id}};
    # A gene is complete when column 5 of its prediction line is 11.
    my $n_complete_genes = 0;
    for (my $i = 0; $i <= $#tab_genes; $i++) {
        my @tab = split("\t", $predict{$id}{$tab_genes[$i]});
        if ($tab[5] == 11) { $n_complete_genes++; }
    }
    if ($n_complete_genes < $th_nb_genes) {
        # contig excluded: not enough complete genes
    }
    else {
        ## We check the first gene and modify it if needed
        my @tab_first  = split("\t", $predict{$id}{$tab_genes[0]});
        my @tab_second = split("\t", $predict{$id}{$tab_genes[1]});
        # Capture the gene numbers explicitly: with a single gene the second
        # match fails and $1 would silently keep the first gene's number.
        my ($n_1) = $tab_first[0] =~ /gene_(\d*)/;
        my ($n_2) = defined($tab_second[0]) ? $tab_second[0] =~ /gene_(\d*)/ : ();
        if (defined($n_1) && defined($n_2) && $n_1 > $n_2) {
            print "We probably have a circular contig ($id) as the first gene $tab_first[0] is beyond the second gene $tab_second[0] ($n_1>$n_2), so we switch $tab_first[0] ";
            $tab_first[0] = "gene_0";
            print "to $tab_first[0]\n";
            $predict{$id}{$tab_genes[0]} = join("\t", @tab_first);
        }
        print $out_s ">$id\t$length{$id}\n";
        print $fa_s ">$id\n";
        my $seq_c = $seq_base{$id};
        print $fa_s "$seq_c\n";
        foreach (@tab_genes) {
            my @tab = split("\t", $predict{$id}{$_});
            if ($tab[5] != 11) {
                # Incomplete gene: either at the very start of the sequence or
                # at its very end (in theory). Shift the open end by the
                # value of column 4 so the coordinates stay in frame.
                if ($tab[4] != 0) {
                    if ($tab[3] eq "-") {
                        $tab[2] -= $tab[4];
                    }
                    elsif ($tab[3] eq "+") {
                        $tab[1] += $tab[4];
                    }
                    else {
                        print
"%%%%%% pblm on a pas de sens pour $id : @tab\n"; + } + } + my $new_line=join("\t",@tab); + $predict{$id}{$_}=$new_line; + } + print $out_s "$predict{$id}{$_}\n"; + @tab=split("\t",$predict{$id}{$_}); + my $name = $tab[0]; + my $start = $tab[1]; + my $stop = $tab[2]; + my $sens = $tab[3]; + my $frame = $tab[4]; + my $frag = ""; + # Regular case (not spanning the origin) + if ($start<$stop){ + my $long=$stop-$start+1; + $frag=substr($seq_c,$start-1,$long); + } + # Exceptional case, we span the origin + else{ + my $l1=length($seq_c)-$start+1; + $frag=substr($seq_c,$start-1,$l1); + $frag.=substr($seq_c,0,$stop); + } + ## WE GET THE PREDICTED PROTEIN SEQUENCE + if ($frag eq ""){ + print "!!!! FRAG IS $frag\n"; + } + my $seq_bio = Bio::Seq->new(-id => "dummy_id" , -seq =>$frag, -alphabet => 'dna', -verbose => -1); + # Test to catch the Bio SeqUtils warning + my @seqs; + eval{ + @seqs = Bio::SeqUtils->translate_6frames($seq_bio, -verbose => -1); + }; + if ( $@ ){ + print "We got the error $@\n"; + } + #my @seqs = Bio::SeqUtils->translate_6frames($seq_bio, -verbose => -1); + # End of test + my $cadre=0; + if ($sens eq "-"){$cadre=3;} + my $prot=$seqs[$cadre]; + my $prot_sequence=$prot->seq; + if ($prot_sequence=~/\*$/){ + # we remove the stop codon + chop($prot_sequence); + } + my $id_out=$id."-".$name; + if (($prot_sequence=~/X{50,}/) || ($prot_sequence=~/F{50,}/) || ($prot_sequence=~/A{50,}/) || ($prot_sequence=~/K{50,}/) || ($prot_sequence=~/P{50,}/)){ + print "we exclude $id_out because there is a pblm with the sequence -> too many succesive X, F, A, K or P\n"; + } + else{ + print $prot_s ">$id_out\n$prot_sequence\n"; + } + } + } +} +close $fa_s; +close $out_s; +close $prot_s; diff --git a/virsorter/Scripts/Step_2_merge_contigs_annotation.pl b/virsorter/Scripts/Step_2_merge_contigs_annotation.pl new file mode 100755 index 0000000..f334619 --- /dev/null +++ b/virsorter/Scripts/Step_2_merge_contigs_annotation.pl @@ -0,0 +1,239 @@ +#!/usr/bin/env perl + +use strict; +use 
autodie;

# Script to generate the merged contig annotation (annotate each gene)
# Argument 0 : MGA predict file
# Argument 1 : HMMsearch vs Phage Clusters
# Argument 2 : BLast vs unclustered
# Argument 3 : HMMsearch vs PFAMa
# Argument 4 : HMMsearch vs PFAMb
# Argument 5 : Ref Phage Clusters
# Argument 6 : Out file
if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq "--help") || (!defined($ARGV[6])))
{
    print "# Script to generate the merged contig annotation (annotate each gene)
# Argument 0 : MGA predict file
# Argument 1 : HMMsearch vs Phage Clusters
# Argument 2 : BLast vs unclustered
# Argument 3 : HMMsearch vs PFAMa
# Argument 4 : HMMsearch vs PFAMb
# Argument 5 : Ref Phage Clusters
# Argument 6 : Out file\n";
    die "\n";
}

my $mga_file             = $ARGV[0];
my $hmm_phage_clusters   = $ARGV[1];
my $blast_vs_unclustered = $ARGV[2];
my $hmm_pfama            = $ARGV[3];
my $hmm_pfamb            = $ARGV[4];
my $ref_phage_clusters   = $ARGV[5];
my $out_file             = $ARGV[6];

# The list of circular contigs sits next to the MGA file, under the same prefix.
my $circu_file = $mga_file;
$circu_file =~ s/_mga_final\.predict/_circu.list/;
# Take list of circular contigs (first tab-separated column of each line)
my %circu;
open my $li, '<', $circu_file;
while (<$li>) {
    chomp($_);
    my @tab  = split("\t", $_);
    my $id_c = $tab[0];
    $circu{$id_c} = 1;
}
close $li;

my $n2 = 0;
my %size;
my %order_gene;
my %predict;
my %type;
my $id_c = "";
my @liste_contigs;
# Read all gene predictions: ">contig<TAB>size" header lines, each followed
# by one line per gene whose first column is the gene name.
open my $fts, '<', $mga_file;
while (<$fts>) {
    chomp($_);
    if ($_ =~ /^>(.*)/) {
        my @tab = split("\t", $1);
        $id_c = $tab[0];
        $size{$id_c} = $tab[1];
        $n2 = 0;
        push(@liste_contigs, $id_c);
    }
    else {
        my @tab = split("\t", $_);
        $predict{$id_c}{$tab[0]}    = $_;
        $order_gene{$id_c}{$tab[0]} = $n2;
        $n2++;
    }
}
close $fts;

# first the BLAST vs unclustered , which annotation will eventually be erased by the HMM vs Phage cluster if any (that we trust more)
my %affi_phage_cluster;
my $score_blast_th  = 50;
my $evalue_blast_th = 0.001;
open my $tsv, '<', $blast_vs_unclustered;
while (<$tsv>) {
    chomp($_);
    my @tab   = split("\t", $_);
    my $seq   = $tab[0];
    my $match = $tab[1];
    $match =~ s/\|/_/g;
    my $evalue = $tab[10];
    my $score  = $tab[11];
    ## We add the $seq ne $match so that we do not count a match to a phage sequence when it's only itself in the unclustered pool from a previous revision.
    if ($score >= $score_blast_th && $evalue <= $evalue_blast_th && (!defined($affi_phage_cluster{$seq}) || ($score > $affi_phage_cluster{$seq}{"score"})) && ($seq ne $match)) {
        $affi_phage_cluster{$seq}{"score"}  = $score;
        $affi_phage_cluster{$seq}{"evalue"} = $evalue;
        $affi_phage_cluster{$seq}{"match"}  = $match;
    }
}
close $tsv;

my $score_th  = 40;
my $evalue_th = 0.00001;

# Read one whitespace-separated hit table and keep, per query sequence, the
# best-scoring hit that passes the thresholds.
#   $file            : table to read; lines starting with '#' are skipped
#   $best_hit_of     : hashref seq -> {score, evalue, match}, updated in
#                      place; an existing entry is replaced only by a
#                      strictly higher score
#   $min_score       : minimum score (inclusive)
#   $max_evalue      : maximum e-value (inclusive)
#   $suffix_to_strip : optional literal text removed from the match name
# Columns 0, 2, 4 and 5 are read as query, match, e-value and score.
sub store_best_hmm_hits {
    my ($file, $best_hit_of, $min_score, $max_evalue, $suffix_to_strip) = @_;
    open my $table, '<', $file;
    while (my $line = <$table>) {
        chomp($line);
        next if $line =~ m/^#/;
        my @splign = split(m/\s+/, $line);
        my $seq    = $splign[0];
        my $match  = $splign[2];
        $match =~ s/\Q$suffix_to_strip\E//g if defined($suffix_to_strip);
        my $evalue = $splign[4];
        my $score  = $splign[5];
        if ($score >= $min_score && $evalue <= $max_evalue && (!defined($best_hit_of->{$seq}) || ($score > $best_hit_of->{$seq}{"score"}))) {
            $best_hit_of->{$seq}{"score"}  = $score;
            $best_hit_of->{$seq}{"evalue"} = $evalue;
            $best_hit_of->{$seq}{"match"}  = $match;
        }
    }
    close $table;
    return;
}

# Then reading the annotation from the HMM vs Phage Cluster; these hits share
# the table of the BLAST hits above and replace them when they score higher.
store_best_hmm_hits($hmm_phage_clusters, \%affi_phage_cluster, $score_th, $evalue_th, ".ali_faa");

# Then reading annotation from PFAM (a and b feed the same table)
my %affi_pfam;
store_best_hmm_hits($hmm_pfama, \%affi_pfam, $score_th, $evalue_th);
store_best_hmm_hits($hmm_pfamb, \%affi_pfam, $score_th, $evalue_th);


# We also read the annotation for each phage cluster, i.e. its category
my %phage_cluster;
open my $psv, '<', $ref_phage_clusters;
while (<$psv>) {
    chomp($_);
    my @tab = split(/\|/, $_);
    $phage_cluster{$tab[0]}{"category"} = $tab[1];
}
close $psv;


# Final output
# >Contig,nb_genes,circularity
# gene_id,start,stop,length,strand,affi_phage,score,evalue,category,affi_pfam,score,evalue,
open my $s1, '>', $out_file;
my $n = 0;
foreach my $contig_c (@liste_contigs) {
    $n++;
    if ($n % 10000 == 0) { print "$n-ieme contig\n"; }
    my $circ = "l";    # "l" by default, "c" when the contig is in the circular list
    if ($circu{$contig_c} == 1) { $circ = "c"; }
    my @tab_genes = sort { $order_gene{$contig_c}{$a} <=> $order_gene{$contig_c}{$b} } keys %{$predict{$contig_c}};
    my $n_g = $#tab_genes + 1;
    print $s1 ">$contig_c|$n_g|$circ\n";
    foreach my $gene (@tab_genes) {
        my @tab = split("\t", $predict{$contig_c}{$gene});
        # Hits are keyed by "<contig>-<gene>".
        my $g_c    = $contig_c . "-" . $gene;
        my $name   = $tab[0];
        my $start  = $tab[1];
        my $stop   = $tab[2];
        my $strand = $tab[3];
        my $frame  = $tab[4];
        my $affi_pc        = "-";
        my $affi_pc_score  = "-";
        my $affi_pc_evalue = "-";
        my $affi_category  = "-";
        if (defined($affi_phage_cluster{$g_c})) {
            my $phage_c = $affi_phage_cluster{$g_c}{"match"};
            # Blast unclustered do not have any category
            if (defined($phage_cluster{$phage_c}{"category"})) { $affi_category = $phage_cluster{$phage_c}{"category"}; }
            $affi_pc        = $phage_c;
            $affi_pc_score  = $affi_phage_cluster{$g_c}{"score"};
            $affi_pc_evalue = $affi_phage_cluster{$g_c}{"evalue"};
        }
        my $affi_pfam        = "-";
        my $affi_pfam_score  = "-";
        my $affi_pfam_evalue = "-";
        if
(defined($affi_pfam{$g_c})){ + $affi_pfam=$affi_pfam{$g_c}{"match"}; + $affi_pfam_score=$affi_pfam{$g_c}{"score"}; + $affi_pfam_evalue=$affi_pfam{$g_c}{"evalue"}; + } + my $length=$stop-$start; + if ($length<0){ # It can happen if one gene overlap the contig origin + $length=($size{$contig_c}-$start)+$stop; + } + print $s1 "$g_c|$start|$stop|$length|$strand|$affi_pc|$affi_pc_score|$affi_pc_evalue|$affi_category|$affi_pfam|$affi_pfam_score|$affi_pfam_evalue\n"; + } +} +close $s1; + + + + + + + diff --git a/virsorter/Scripts/Step_3_highlight_phage_signal.pl b/virsorter/Scripts/Step_3_highlight_phage_signal.pl new file mode 100755 index 0000000..8580025 --- /dev/null +++ b/virsorter/Scripts/Step_3_highlight_phage_signal.pl @@ -0,0 +1,650 @@ +#!/usr/bin/env perl + +use strict; +use autodie; +use File::Spec::Functions; +use FindBin '$Bin'; + +# Script to measure metrics on the sliding window +# Argument 0 : csv file of the contigs +# Argument 1 : summary file of the phage fragments +if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq "--help") || (!defined($ARGV[1]))) +{ + print "# Script to measure metrics on the sliding window +# Argument 0 : csv file of the contigs +# Argument 1 : summary file of the phage fragments +# Argument 2 (optional) : a file with the refs values that we could use instead of estimating them \n"; + die "\n"; +} +$| = 1; +my $csv_file = $ARGV[0]; +my $out_file = $ARGV[1]; +if ( -e $out_file ) { `rm $out_file`; } +my $ref_file = $ARGV[0]; +$ref_file =~ s/\.csv/.refs/g; +my $do_ref_estimation = 0; +if (defined($ARGV[2])){ +# $ref_file=$ARGV[2]; + `cp $ARGV[2] $ref_file`; # That way, the ref file is in the result directory if a use wants to check it + $do_ref_estimation=1; +} + +## ABSOLUTE THRESHOLDS ## +my $th_viral_hallmark=1; +my $th_sig=2; +my $th_sig_2=4; +my $th_nb_genes_covered=0.80; +my $th_nb_genes_noncaudo=1; +## END OF ABSOLUTE THRESHOLDS ## +my $script_dir= catfile($Bin); +my $path_to_c_script= 
catfile($script_dir, "Sliding_windows_3"); + +print "## Taking information from the contig info file ($csv_file)\n"; +open F1, '<', $csv_file; +my $n=0; +my $id_c=$_; +my %infos; +my @liste_contigs; +my %nb_genes; +while(){ + chomp($_); + if ($_=~/>(.*)/){ + my @tab=split(/\|/,$1); + $id_c=$tab[0]; + push(@liste_contigs,$id_c); + $nb_genes{$id_c}=$tab[1]; + $n++; + } + else{ + # 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 + # gene_id|start|stop|length|strand|affi_phage|score|evalue|category|affi_pfam|score|evalue| + my @tab=split(/\|/,$_); + my $gene=$tab[0]; + $gene=~/.*-(gene_\d*)/; + $gene=$1; + $infos{$id_c}{$gene}{"order"}=$n; + $infos{$id_c}{$gene}{"length"}=$tab[3]; + $infos{$id_c}{$gene}{"strand"}=$tab[4]; + $infos{$id_c}{$gene}{"category"}=-1; + if ($tab[5] eq "-"){ ## no Phage Cluster affiliation + if ($tab[9] eq "-"){ ## no PFAM either, ok.. + $infos{$id_c}{$gene}{"best_domain_hit"}="-"; + } + else{ + $infos{$id_c}{$gene}{"best_domain_hit"}="PFAM-".$tab[9]; + } + } + else{ + if ($tab[9] eq "-" || $tab[6]>$tab[10]){ ## no PFAM or Phage Cluster better than PFAM (score comparison) + $infos{$id_c}{$gene}{"best_domain_hit"}="PC-".$tab[5]; + if ($tab[9] ne ""){$infos{$id_c}{$gene}{"hit_PFAM"}="PFAM-".$tab[5];} + if ($tab[8] eq "-"){$infos{$id_c}{$gene}{"category"}=-1;} + else{$infos{$id_c}{$gene}{"category"}=$tab[8];} + } + else{ ## So we have a PFAM, which is clearly better than Phage Cluster, so we keep it + $infos{$id_c}{$gene}{"best_domain_hit"}="PFAM-".$tab[9]; + $infos{$id_c}{$gene}{"hit_PC"}="PC-".$tab[5]; + } + } + $n++; + } +} +close F1; + +my $th_gene_size=0; +# WE HAVE A REF FILE, WE DONT ESTIMATE +if ($do_ref_estimation==1){ + print "## We have a ref file : $ref_file , so will use it\n"; + open F1, '<', $ref_file; + while (){ + chomp($_); + my @tab=split("\t",$_); + $th_gene_size=$tab[4]; + } + close F1; +} +### ELSE, IF WE ESTIMATE THE PARAMETERS FROM THE DATASET +else{ + my %total; + my @store_avg_g_size; + # look at all contigs at once for 
the global metrics + print "## First look at everything to get the totals\n"; + foreach(@liste_contigs){ + my $contig=$_; + my @tab_genes=sort {$infos{$contig}{$a}{"order"} <=> $infos{$contig}{$b}{"order"}} keys %{$infos{$contig}}; + my $total_nb_genes=$#tab_genes+1; + my $n_f=0; + # First, taking all the metrics for the totals + my $last_strand=$infos{$contig}{$tab_genes[0]}{"strand"}; + for (my $i=0;$i<$total_nb_genes;$i++){ + if (defined($tab_genes[$i])){ + $total{"n_obs"}++; + if ($infos{$contig}{$tab_genes[$i]}{"best_domain_hit"}=~/^PC-/){ # look at best domain hit on a phage + $total{"phage"}++; + if (defined($infos{$contig}{$tab_genes[$i]}{"hit_PFAM"})){$total{"pfam"}++;} + if ($infos{$contig}{$tab_genes[$i]}{"category"}>=3){ + $total{"noncaudo"}++; + } + } + elsif($infos{$contig}{$tab_genes[$i]}{"best_domain_hit"}=~/^PFAM-/){ + $total{"pfam"}++; + if (defined($infos{$contig}{$tab_genes[$i]}{"hit_PC"})){$total{"phage"}++;} + } + elsif($infos{$contig}{$tab_genes[$i]}{"best_domain_hit"} eq "-"){ + $total{"unch"}++; + } + if ($infos{$contig}{$tab_genes[$i]}{"strand"} ne $last_strand){ + $total{"switch"}++; + $last_strand=$infos{$contig}{$tab_genes[$i]}{"strand"}; + } + push(@store_avg_g_size,$infos{$contig}{$tab_genes[$i]}{"length"}); + } + } + } + + print "## Transform it into probability and gene size decile\n"; + # Transform it into probability / ratios and sort the gene size table + $total{"phage"}/=$total{"n_obs"}; + $total{"noncaudo"}/=$total{"n_obs"}; + $total{"pfam"}/=$total{"n_obs"}; + $total{"unch"}/=$total{"n_obs"}; + $total{"switch"}/=$total{"n_obs"}; + # Determine d1 (first decile) of the gene size distribution, so we divide the distribution in 10 parts + $th_gene_size=get_th_gene_size(\@store_avg_g_size,10); + open S2, '>', $ref_file; + print S2 $total{"phage"}."\t".$total{"pfam"}."\t".$total{"unch"}."\t".$total{"switch"}."\t".$th_gene_size."\t".$total{"noncaudo"}; + close S2; +} + +my $nb_gene_th=2; +# Now the sliding windows +print "## Then look 
at each contig and each sliding window\n"; +open S1, '>', $out_file; +close S1; +my $i=0; +foreach(@liste_contigs){ + my $contig_c=$_; + my @tab_genes=sort {$infos{$contig_c}{$a}{"order"} <=> $infos{$contig_c}{$b}{"order"}} keys %{$infos{$contig_c}}; + my $total_nb_genes=$#tab_genes+1; + ### Preparing data for C program + my $out_file_c=$ref_file; + $out_file_c=~s/\.refs/.tmp_$i/g; + my $out_file_c2=$ref_file; + $out_file_c2=~s/\.refs/.out_$i/g; + my $out_file_c3=$ref_file; + $out_file_c3=~s/\.refs/.out_$i-sorted/g; +# print "we have $out_file_c $out_file_c2 $out_file_c3\n"; + open MAP_C, '>', $out_file_c; + print MAP_C "$nb_genes{$contig_c}\n"; + my $last_strand="0"; + my $total_hallmark=0; + my $total_noncaudo=0; + foreach(@tab_genes){ + my $gene=$_; + my $tag=""; + # Line : PC / PFAM / UNCH / SIZE / STRAND / HALLMARK + if($infos{$contig_c}{$gene}{"best_domain_hit"}=~/^PC/){ + if ($infos{$contig_c}{$gene}{"category"}>=3){$tag="1\t1\t0\t0\t";$total_noncaudo++;} + else{$tag="1\t0\t0\t0\t";} + } + elsif($infos{$contig_c}{$gene}{"best_domain_hit"}=~/^PFAM/){$tag="0\t0\t1\t0\t";} + else{$tag="0\t0\t0\t1\t";} + if ($infos{$contig_c}{$gene}{"length"}<$th_gene_size){$tag.="1\t";} + else{$tag.="0\t";} + if (($last_strand eq "0") || ($infos{$contig_c}{$gene}{"strand"} eq $last_strand)){$tag.="0\t";} + else{$tag.="1\t";} + $last_strand=$infos{$contig_c}{$gene}{"strand"}; + if (($infos{$contig_c}{$gene}{"category"}==0) || ($infos{$contig_c}{$gene}{"category"}==3)){ + $tag.="1\t";$total_hallmark++; + print "Gene $contig_c / $gene -> category $infos{$contig_c}{$gene}{category} -> putative hallmark\n"; + } # look at putative hallmarklmark + else{$tag.="0\t";} + print MAP_C "$tag\n"; + } + close MAP_C; + ### Now go execute the C program + my $c_cmd="$path_to_c_script $ref_file $out_file_c $out_file_c2"; +# print "Step 1 - $c_cmd\n"; + my $out=`$c_cmd`; +# print "$out\n"; + $c_cmd="sort -r -n -k 4 $out_file_c2 > $out_file_c3"; +# print "Step 2 - $c_cmd\n"; + $out=`$c_cmd`; +# 
print "$out\n"; + ### reading the c program output to fill the match hash table / and removing overlap + my %match; + my %check; + my @check_gene; + open OUT_C, '<', $out_file_c3; + while(){ + chomp($_); + my @tab=split("\t",$_); + my $start=$tab[0]; + my $last=$tab[0]+$tab[1]-1; + my $fragment_id=$contig_c."-".$tab_genes[$start]."-".$tab_genes[$last]; + my $tag=0; + # Code : 0 phage / 1 pfam / 2 unch / 3 size / 4 strand switch + if ($tab[2]==0){ + if (overlap($fragment_id,$check{"phage"})==0){ + $match{$fragment_id}{"proof"}{"phage"}=$tab[3]; + $check{"phage"}{$fragment_id}=1; + $tag=1; + for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;} + } + } + if ($tab[2]==1){ + if (overlap($fragment_id,$check{"pfam"})==0){ + $match{$fragment_id}{"proof"}{"pfam"}=$tab[3]; + $check{"pfam"}{$fragment_id}=1; + $tag=1; + for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;} + } + } + if ($tab[2]==2){ + if (overlap($fragment_id,$check{"unch"})==0){ + $match{$fragment_id}{"proof"}{"unch"}=$tab[3]; + $check{"unch"}{$fragment_id}=1; + $tag=1; + for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;} + } + } + if ($tab[2]==3){ + if (overlap($fragment_id,$check{"avg_g_size"})==0){ + $match{$fragment_id}{"proof"}{"avg_g_size"}=$tab[3]; + $check{"avg_g_size"}{$fragment_id}=1; + $tag=1; + } + } + if ($tab[2]==4){ + if (overlap($fragment_id,$check{"switch"})==0){ + $match{$fragment_id}{"proof"}{"switch"}=$tab[3]; + $check{"switch"}{$fragment_id}=1; + $tag=1; + } + } + if ($tab[2]==5){ + if (overlap($fragment_id,$check{"noncaudo"})==0){ + $match{$fragment_id}{"proof"}{"noncaudo"}=$tab[3]; + $check{"noncaudo"}{$fragment_id}=1; + $tag=1; + for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;} + } + } + if ($tag==1){ + # If a match, we also take the nb of hallmark genes, and the size + if ($tab[4]>0){$match{$fragment_id}{"hallmark"}=$tab[4];} + $match{$fragment_id}{"size"}=$tab[1]; + } + } + close OUT_C; + ### Ok, we read the C output, no we try (neatly) to merge all predictions for this 
sequence + my $n=0; + my %merged_match; + my $th_contig_size=$th_nb_genes_covered*$total_nb_genes; + my @tab_matches=sort { $match{$b}{"size"} <=> $match{$a}{"size"} } keys %match; + if (!defined($match{$tab_matches[0]}{"size"})){} # Not even an interesting region, skip to the next sequence + else{ + my $tag_complete=0; + my $i=0; + while ($match{$tab_matches[$i]}{"size"}>$th_contig_size && $tag_complete==0){ + if ($match{$tab_matches[$i]}{"size"}>$th_contig_size && (defined($match{$tab_matches[$i]}{"proof"}{"pfam"}) || defined($match{$tab_matches[$i]}{"proof"}{"phage"}) || defined($match{$tab_matches[$i]}{"proof"}{"unch"}) || defined($match{$tab_matches[$i]}{"proof"}{"noncaudo"}))){ # SEEMS LIKE WE HAVE A COMPLETE PHAGE SEQUENCE + $tag_complete=1; + my $fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes]; + if (defined($match{$fragment_id})){ + $merged_match{$fragment_id}=$match{$fragment_id}; # If we indeed have complete metrics, we take themn + } + else{ + $merged_match{$fragment_id}{"size"}=$total_nb_genes;# Otherwise we store just the size + $merged_match{$fragment_id}{"hallmark"}=$total_hallmark;# And the total number of hallmark genes on this fragment + } + $merged_match{$fragment_id}{"type"}="complete_phage";# And we store the type of fragment + foreach(@tab_matches){ + my $fragment_id=$_; + if ($match{$fragment_id}{"size"}<$total_nb_genes){ + my $r=get_overlap($fragment_id,\%merged_match); + if ($r eq "no"){ # if no overlap + $merged_match{$fragment_id}=$match{$fragment_id}; # NO OVERLAP WITH THE COMPLETE + print "!!!!!!!!!!!!!!!!!!! 
THIS SHOULD NOT BE POSSIBLE\n"; + } + else{ + # Overlap, we propagate the proof and note it "partial" + foreach(keys %{$match{$fragment_id}{"proof"}}){ + if (defined($merged_match{$r}{"proof"}{$_})){ + if ($merged_match{$r}{"proof"}{$_}=~/:/){ + $fragment_id=~/.*-(gene_\d*-gene_\d*)/; + $merged_match{$r}{"proof"}{$_}.=$1.":".$match{$fragment_id}{"proof"}{$_}.","; + } + else{} # already a score for the entire match, no pblm + } + else { + $fragment_id=~/.*-(gene_\d*-gene_\d*)/; + $merged_match{$r}{"proof"}{$_}=$1.":".$match{$fragment_id}{"proof"}{$_}.","; + } + } + } + } + } + } + $i++; + } + if($tag_complete==0){ # No complete phage, putatively one or several prophages + # First get all the phage region + # We look for interesting regions my $fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes]; + my $tag=-1; + my $tag_h=0; + for (my $i=0;$i<$total_nb_genes;$i++){ + if ($tag>=0 && (!defined($check_gene[$i]) || $check_gene[$i]<1)){ # end of an interesting region + my $fragment_id.=$contig_c."-".$tab_genes[$tag]."-".$tab_genes[$i-1]; + if ($merged_match{$fragment_id}{"size"}>$th_contig_size){ # Complete phage + $fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes]; + $merged_match{$fragment_id}{"type"}="complete_phage"; + $merged_match{$fragment_id}{"size"}=$total_nb_genes; + $merged_match{$fragment_id}{"hallmark"}=$tag_h; + } + else{ # Prophage + $merged_match{$fragment_id}{"size"}=$i-$tag; + $merged_match{$fragment_id}{"type"}="prophage"; + $merged_match{$fragment_id}{"hallmark"}=$tag_h; + } + $tag=-1; + $tag_h=0; + } + elsif ($tag==-1 && $check_gene[$i]>=1){ + $tag=$i; + $tag_h=0; + } + if ($infos{$contig_c}{$tab_genes[$i]}{"category"}==0 || $infos{$contig_c}{$tab_genes[$i]}{"category"}==3){$tag_h++;} # look at putative hallmark + } + if ($tag>=0){ + my $fragment_id.=$contig_c."-".$tab_genes[$tag]."-".$tab_genes[$#tab_genes]; + print "Region is $fragment_id .."; + if ($merged_match{$fragment_id}{"size"}>$th_contig_size){ # Complete 
phage + print "which is a complete phage\n"; + $fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes]; + $merged_match{$fragment_id}{"type"}="complete_phage"; + $merged_match{$fragment_id}{"size"}=$total_nb_genes; + $merged_match{$fragment_id}{"hallmark"}=$tag_h; + } + else{ # Prophage + print "which is a prophage\n"; + $merged_match{$fragment_id}{"size"}=$total_nb_genes-$tag; + $merged_match{$fragment_id}{"type"}="prophage"; + $merged_match{$fragment_id}{"hallmark"}=$tag_h; + } + } + # Now we merge the annotation in these regions + foreach(@tab_matches){ + my $fragment_id=$_; + # Check if overlap + my $r=get_overlap($fragment_id,\%merged_match); + if ($r eq "no"){ } # if no overlap # not in an interesting region + else{ + # Overlap, we propagate the proof and note it "partial" + foreach(keys %{$match{$fragment_id}{"proof"}}){ + if (defined($merged_match{$r}{"proof"}{$_})){ + if ($merged_match{$r}{"proof"}{$_}=~/:/){ + $fragment_id=~/.*-(gene_\d*-gene_\d*)/; + $merged_match{$r}{"proof"}{$_}.=$1.":".$match{$fragment_id}{"proof"}{$_}.","; + } + else{} # already a score for the entire match, no pblm + } + else { + $fragment_id=~/.*-(gene_\d*-gene_\d*)/; + $merged_match{$r}{"proof"}{$_}=$1.":".$match{$fragment_id}{"proof"}{$_}.","; + } + } + delete($match{$fragment_id}); + } + } + ## New addition that should help to get the prophage coordinates correctly ! 
+ # And now check if one of the prophage map to the whole sequence + foreach(keys %merged_match){ + print "This is a prophage\n"; + my $fragment_id=$_; + if ($merged_match{$fragment_id}{"size"}>$th_contig_size){ + $tag_complete=1; + my $new_fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes]; + print "We have a complete prophage -- we add it $new_fragment_id !\n"; + ; + foreach(keys %{$merged_match{$fragment_id}}){ + $merged_match{$new_fragment_id}{$_}=$merged_match{$fragment_id}{$_}; + } + $merged_match{$new_fragment_id}{"type"}="complete_phage";# And we store the type of fragment + } + } + if ($tag_complete==1){ + # We can remove all the prophages + my @tab_temp=keys %merged_match; + foreach(@tab_temp){ + if ($merged_match{$_}{"type"} eq "complete_phage"){} + else{ + delete($merged_match{$_}); + } + } + } + ## END OF THE NEW ADDITION + } + open S1, '>>', $out_file; + foreach(sort { $merged_match{$b}{"size"} <=> $merged_match{$a}{"size"} } keys %merged_match){ ## IMPORTANT, HAVE TO BE SIZE ORDERED + my $fragment_id=$_; + $fragment_id=~/.*-(gene_\d+-gene_\d+)/; + my $zone=$1; + my $type_detection=$merged_match{$fragment_id}{"type"}; + print "$fragment_id\t$merged_match{$fragment_id}{size}\t$merged_match{$fragment_id}{hallmark}\t$merged_match{$fragment_id}{proof}{phage}\t$merged_match{$fragment_id}{proof}{pfam}\t$merged_match{$fragment_id}{proof}{unch}\t$merged_match{$fragment_id}{proof}{switch}\t$merged_match{$fragment_id}{proof}{avg_g_size}\n"; + my $category=3; + if ($merged_match{$fragment_id}{"hallmark"}==0){delete($merged_match{$fragment_id}{"hallmark"});} + # Determine the category. 
To check this, we want several good indicators - And also remove prediction based on one single indicator, unless it's a strong one (sig >2) + # New categories : + # Cat 1 - hallmark + gene phage enrichment + # Cat 2 - gene phage or hallmark without gene phage + # Cat 3 - no hallmark or gene phage, but other signal + my @tab_proof=keys %{$merged_match{$fragment_id}{"proof"}}; + if ($merged_match{$fragment_id}{"hallmark"}>0){ + if (defined($merged_match{$fragment_id}{"proof"}{"noncaudo"}) || defined($merged_match{$fragment_id}{"proof"}{"phage"})){ + if ($merged_match{$fragment_id}{"proof"}{"noncaudo"}=~/(gene_\d+-gene_\d+):(\d+)/){ + my $match_region=$1; + my $score=$2; + if ($match_region eq $zone && $score>=$th_sig){$category=1;} # Phage metric on the whole region + } + elsif ($merged_match{$fragment_id}{"proof"}{"noncaudo"}>=$th_sig){$category=1;} # if we have hallmark or gene_size + a phage metric on the whole fragment -> should be quite sure $category=1; ## THRESHOLD TO REMOVE THE NONCAUDO ON THE SMALL SMALL CONTIGS + if ($merged_match{$fragment_id}{"proof"}{"phage"}=~/(gene_\d+-gene_\d+):(\d+)/){ + my $match_region=$1; + my $score=$2; + if ($match_region eq $zone && $score>=$th_sig){$category=1;} # Phage metric on the whole region + } + elsif ($merged_match{$fragment_id}{"proof"}{"phage"}>=$th_sig){$category=1;} # if we have hallmark or gene_size + a phage metric on the whole fragment -> should be quite sure $category=1; + if ($category==3){ # no match complete, so category 2 + $category=2; + } + } + else{ + foreach(@tab_proof){ + if ($merged_match{$fragment_id}{"proof"}{$_}=~/(gene_\d+-gene_\d+):(\d+)/){ + my $match_region=$1; + my $score=$2; + print "Hallmark but no phage or noncaudo, but other proof $_ -> $match_region / $score ($merged_match{$fragment_id}{proof}{$_})\n"; + if ($match_region eq $zone && $score>=$th_sig){$category=2;} # other metric on the whole region + elsif($score>=$th_sig_2){$category=2;} # metric partial only but strong enough so we 
keep it + } + elsif ($merged_match{$fragment_id}{"proof"}{$_}>=$th_sig){ + if ($_ eq "pfam" || $_ eq "unch"){ + $category=2; # if we have hallmark or gene_size + a metric pfam or unch on the whole fragment -> should be quite sure + } + } + } + } + } + elsif (defined($merged_match{$fragment_id}{"proof"}{"phage"}) || defined($merged_match{$fragment_id}{"proof"}{"noncaudo"})){# If we have some phage signal, + if ($merged_match{$fragment_id}{"proof"}{"phage"}=~/:(\d*)/){ + if ($1>=$th_sig){ + $category=2; # Good, phage signal significant -> should be pretty sure + } + } + elsif($merged_match{$fragment_id}{"proof"}{"phage"}>=$th_sig){ + $category=2; # Good, phage signal significant -> should be pretty sure + } + if ($merged_match{$fragment_id}{"proof"}{"noncaudo"}=~/:(\d*)/){ ## THRESHOLD TO AVOID SHORT CONTIGS BIAS + if ($1>=$th_sig && $total_noncaudo>$th_nb_genes_noncaudo){ + $category=2; # Good, phage signal significant -> should be pretty sure + } + } + elsif($merged_match{$fragment_id}{"proof"}{"noncaudo"}>=$th_sig && $total_noncaudo>$th_nb_genes_noncaudo){ ## THRESHOLD TO AVOID SHORT CONTIGS BIAS + $category=2; # Good, phage signal significant -> should be pretty sure + } + } + if ($category==3){ # If the category is still 3, meaning that the phage signal (if there was any) was not that strong .. + if ($#tab_proof==0){ + $category=0; # No phage signal nor hallmark gene, and only one metric, we remove + } + else{ + my $tag1=0; + foreach(@tab_proof){ + if ($merged_match{$fragment_id}{"proof"}{$_}=~/:(\d*)/){ + if ($1>=$th_sig_2){ + $tag1=1; # Good, one signal very significant + } + } + elsif ($merged_match{$fragment_id}{"proof"}{$_}>=$th_sig_2){ + $tag1=1; # Good, one signal very significant + } + } + if ($tag1==0){ # If none of the metrics is really strong ... + $category=0; # .. 
# ---------------------------------------------------------------------------
# Helper subs of the Step 3 script: binomial probabilities, fragment-overlap
# tests and small gene-size statistics.
# ---------------------------------------------------------------------------

sub factorial {    # factorial $n
    # Iterative n!. Any $n <= 0 yields 1. The product is accumulated in a
    # plain Perl number, so it turns into "inf" once it exceeds the range of
    # a double (on the usual IEEE-754 build that is anything above 170!).
    my $n = shift;
    my $f = 1;
    $f *= $n-- while $n > 0;    # Multiply, then decrement
    return $f;
}

sub combine {    # combination of $k elements in $n ensemble
    # Binomial coefficient C(n, k).
    #
    # The quotient of three factorials used previously evaluated to inf/inf
    # (NaN) as soon as $n exceeded 170, and returned a meaningless fraction
    # for $k > $n. The running product below stays an exact integer after
    # every step (c_i = C(n-k+i, i)) and only overflows when the result
    # itself does not fit in a double.
    my ($k, $n) = @_;
    return 0 if $k < 0 || $k > $n;      # no way to pick that many elements
    $k = $n - $k if $k > $n - $k;       # C(n,k) == C(n,n-k): shorter loop
    my $c = 1;
    for my $i (1 .. $k) {
        $c = $c * ($n - $k + $i) / $i;
    }
    return $c;
}

sub proba {    # probability of x=$i knowing nb_obs $n and p $p
    # Binomial probability mass P(X = $i) for $n trials of probability $p.
    my ($i, $n, $p) = @_;
    # Outside 0..$n the mass is zero; returning early also avoids the
    # negative exponent (and a possible 0 * inf = NaN) further down.
    return 0 if $i < 0 || $i > $n;
    return combine($i, $n) * ($p**$i) * ((1 - $p)**($n - $i));
}

sub proba_more_than {    # probability of x>=$s knowing nb_obs $n and p $p
    # Upper tail: sum of proba() for x = $s .. $n.
    my ($s, $n, $p) = @_;
    my $f = 0;
    for (my $i = $s; $i <= $n; $i++) {
        $f += proba($i, $n, $p);
    }
    return $f;
}

sub proba_less_than {    # probability of x<=$s knowing nb_obs $n and p $p
    # Lower tail: sum of proba() for x = 0 .. $s.
    my ($s, $n, $p) = @_;
    my $f = 0;
    for (my $i = 0; $i <= $s; $i++) {
        $f += proba($i, $n, $p);
    }
    return $f;
}

sub log10 {
    # Base-10 logarithm. Like the builtin log(), dies for $n <= 0.
    my $n = shift;
    return log($n) / log(10);
}

# Private helper: pull the numeric (first_gene, last_gene) pair out of a
# fragment id shaped like "<contig>-gene_<a>-gene_<b>". Returns the empty
# list when the id does not have that shape, so callers never read capture
# variables left over from an earlier, unrelated match.
sub _fragment_gene_span {
    my ($fragment_id) = @_;
    return unless defined $fragment_id;
    if ($fragment_id =~ /.*-gene_(\d+)-gene_(\d+)/) {
        return ($1, $2);
    }
    return;
}

sub overlap {    # To check if a prediction is not within another of the same type, in which case we don't really care
    # Arguments : $pred   - fragment id of the candidate prediction
    #             $p_hash - hash ref whose keys are already accepted
    #                       fragment ids (may be undef when none yet)
    # Returns   : 1 if $pred overlaps any key of $p_hash, 0 otherwise.
    #
    # Ids that do not parse are ignored. Previously a failed match left $1/$2
    # holding the candidate's own coordinates, which made every malformed key
    # look like a perfect overlap.
    my ($pred, $p_hash) = @_;
    my ($start_pred, $end_pred) = _fragment_gene_span($pred);
    return 0 unless defined $start_pred;
    return 0 unless ref($p_hash) eq 'HASH';
    foreach my $other (keys %{$p_hash}) {
        my ($start, $end) = _fragment_gene_span($other);
        next unless defined $start;
        # NOTE(review): the first two tests are strict on one side, so two
        # regions that only share a boundary gene (5-10 vs 10-15) are not
        # reported as overlapping. Kept as in the original - confirm intent.
        if (   ($start_pred <= $start && $start < $end_pred)
            || ($start_pred < $end && $end <= $end_pred)
            || ($start <= $start_pred && $end >= $end_pred))
        {
            return 1;
        }
    }
    return 0;
}


sub get_overlap {    # To get the overlapping if any
    # Arguments : $pred   - fragment id to locate
    #             $p_hash - hash ref whose keys are fragment ids
    # Returns   : the key of $p_hash that fully contains $pred, or the
    #             string "no" when there is none.
    #
    # Keys are visited in sorted order and the last enclosing key wins, so
    # the answer no longer depends on hash iteration order when several
    # regions contain $pred. Unparsable ids are skipped instead of being
    # compared through stale capture variables.
    my ($pred, $p_hash) = @_;
    my $o = "no";
    my ($start_pred, $end_pred) = _fragment_gene_span($pred);
    return $o unless defined $start_pred;
    return $o unless ref($p_hash) eq 'HASH';
    foreach my $other (sort keys %{$p_hash}) {
        my ($start, $end) = _fragment_gene_span($other);
        next unless defined $start;
        if ($start_pred >= $start && $end_pred <= $end) {
            $o = $other;
        }
    }
    return $o;
}

# Private helper: read $p_metrics->{$s}{$w}{$c} without creating any of the
# intermediate hashes when they are absent. Returns undef if the cell or the
# path to it does not exist.
sub _metric_value {
    my ($p_metrics, $s, $w, $c) = @_;
    return unless ref($p_metrics) eq 'HASH' && exists $p_metrics->{$s};
    my $row = $p_metrics->{$s};
    return unless ref($row) eq 'HASH' && exists $row->{$w};
    my $cell = $row->{$w};
    return unless ref($cell) eq 'HASH';
    return $cell->{$c};
}

sub is_local_max {
    # Arguments : $p_metrics - ref to a three-level hash {$s}{$w}{$c}
    #             $s, $w     - numeric keys of the cell under test
    #             $c         - third-level key of the value to compare
    # Returns   : 1 if no cell within +/-5 on both the $s and $w keys holds
    #             a strictly greater value for $c, else 0.
    #
    # Neighbours are looked up without autovivification: the previous
    # defined($$p_metrics{$s+$i}{$w+$j}) probe silently created an empty
    # {$s+$i} entry for every offset, polluting the caller's hash.
    my ($p_metrics, $s, $w, $c) = @_;
    my $v = _metric_value($p_metrics, $s, $w, $c);
    $v = 0 unless defined $v;    # an absent value compares as 0, as before
    my $how_much_to_look = 5;
    for my $i (-$how_much_to_look .. $how_much_to_look) {
        for my $j (-$how_much_to_look .. $how_much_to_look) {
            next if $i == 0 && $j == 0;    # the cell itself
            my $neighbour = _metric_value($p_metrics, $s + $i, $w + $j, $c);
            next unless defined $neighbour;
            # we found a neighbour with a greater value, not a local maximum
            return 0 if $neighbour > $v;
        }
    }
    return 1;
}

sub get_position {
    # Arguments : $value - number to locate
    #             $p_tab - array ref of numbers (sorted here, caller's
    #                      array is left untouched)
    # Returns   : index of the first sorted element >= $value (capped at
    #             the last index) divided by the last index, i.e. a ratio
    #             in 0..1. Tables with fewer than two elements return 0;
    #             a one-element table used to die with a division by zero.
    my ($value, $p_tab) = @_;
    my @tab = sort { $a <=> $b } @$p_tab;
    print "looking for $value in the gene size table\n";
    my $index = 0;
    while ($index < $#tab && $tab[$index] < $value) { $index++; }
    my $found = @tab ? $tab[$index] : '';
    print "found at index $index - $found - total : $#tab\n";
    my $ratio = $#tab > 0 ? $index / $#tab : 0;
    print "which gives a ratio of $ratio\n";
    return $ratio;
}

sub get_th_gene_size {
    # Arguments : $_[0] - array ref of gene sizes
    #             $_[1] - divisor $div (must be non-zero)
    # Returns   : the sorted element at position last_index/$div when that
    #             division is exact, otherwise the mean of the elements at
    #             (last_index-1)/$div and (last_index+1)/$div.
    #
    # Fractional positions are truncated toward zero. Perl did this
    # implicitly when subscripting; int() makes it explicit.
    my @tab = sort { $a <=> $b } (@{ $_[0] });
    my $div = $_[1];
    if ($#tab % $div == 0) {
        return $tab[ int($#tab / $div) ];
    }
    my $below = $tab[ int(($#tab - 1) / $div) ];
    my $above = $tab[ int(($#tab + 1) / $div) ];
    $below = 0 unless defined $below;
    $above = 0 unless defined $above;
    return ($below + $above) / 2;
}
#!/usr/bin/env perl

use strict;
use warnings;
use autodie;

# Script to make a summary of the predictions to add to previous predictions
# Argument 0 : affiliation file of the contigs
# Argument 1 : summary file of the phage fragments (tab-separated, Step 3)
# Argument 2 : global summary to be completed (comma-separated, rewritten)
# Argument 3 : Out file for new prot list
#
# Data model: %infos is keyed by display class.
#   classes 1-3 (complete phage contigs) : $infos{class}{contig}{field}
#   classes 4-6 (prophages)              : $infos{class}{contig}{fragment}{field}
my %HELP_FLAG = map { $_ => 1 } ('-h', '--h', '-help', '--help');
if (@ARGV < 4 || $HELP_FLAG{ $ARGV[0] }) {
    print "# Script to make a summary of the predictions to add to previous predictions
# Argument 0 : affiliation file of the contigs
# Argument 1 : summary file of the phage fragments
# Argument 2 : global summary to be completed
# Argument 3 : Out file for new prot list\n";
    die "\n";
}

my ($affi_contigs, $new_summary, $global_summary, $new_prot_list) = @ARGV;

# Signal columns, in file order.
# Global summary : columns 4..11.
my @GLOBAL_FIELDS = qw(category hallmark phage noncaudo pfam unch switch size);
# Step 3 summary : columns 5..12.
my @STEP3_FIELDS = qw(category phage noncaudo pfam unch switch size hallmark);

my %infos;
my %check_prot_old;
my %check_contig_old;

# --- 1. Reload the existing global summary, if there is one -----------------
if (-e $global_summary) {
    my $tag = 0;    # class announced by the last "## <n> ..." section header
    open my $old_fh, '<', $global_summary;
    # The loop condition must read from the handle: an empty "while ()" is
    # an endless loop that never fetches a line.
    while (my $line = <$old_fh>) {
        chomp $line;
        if ($line =~ /^## (\d+)/) {
            $tag = $1;
            next;
        }
        next if $line =~ /^##/ || $line eq "";    # column header / blank line
        my @tab = split /,/, $line;
        next if @tab < 3;    # contig and fragment columns are required
        if ($tag <= 3) {
            my $rec = ($infos{$tag}{ $tab[2] } ||= {});
            $rec->{"nb_gene"} = $tab[1];
            @{$rec}{@GLOBAL_FIELDS} = @tab[4 .. 11];
            $check_contig_old{ $tab[2] } = 1;
            print "checking old $tab[2]\n";
        }
        else {
            print "we had $tab[2] -> tag $tag\n";
            my $rec = ($infos{$tag}{ $tab[0] }{ $tab[2] } ||= {});
            $rec->{"nb_gene_contig"} = $tab[1];
            $rec->{"nb_gene"}        = $tab[3];
            @{$rec}{@GLOBAL_FIELDS} = @tab[4 .. 11];
            # Category 1 prophage: remember every protein of the fragment
            # as already known.
            if (defined $rec->{"category"} && $rec->{"category"} == 1) {
                $check_prot_old{$_} = 1 for _fragment_proteins($tab[0], $tab[2]);
            }
        }
    }
    close $old_fh;
}
else {
    print "This is the first global summary that we'll do\n";
}

# --- 2. Merge the new Step 3 predictions ------------------------------------
#  0      / 1              / 2        / 3    / 4              / 5        / 6            / 7                / 8           / 9           / 10            / 11         / 12
# Contig / Total Nb Genes / Fragment / Size / Type detection / Category / Enrich Phage / Enrich Non Caudo / Enrich Pfam / Enrich Unch / Enrich Switch / Avg_g_size / Nb Hallmark
my %check_prot_new;
my %check_contig_new;
open my $new_fh, '<', $new_summary;
while (my $line = <$new_fh>) {
    chomp $line;
    $line =~ s/,/;/g;    # commas are the separator of the global summary
    my @tab = split /\t/, $line;
    next if @tab < 6;    # blank or truncated line: nothing to classify
    my ($contig, $fragment, $type, $category) = @tab[0, 2, 4, 5];

    if ($type eq "complete_phage") {
        # Determine order in which this contig will be displayed
        my $class = $category == 1 ? 1 : $category == 2 ? 2 : 3;
        # If the category is 1, we check all the prot from this contig
        $check_contig_new{$contig} = 1 if $class == 1;
        s/;$// for @tab[5 .. $#tab];    # drop the trailing list separator
        my $rec = ($infos{$class}{$contig} ||= {});
        $rec->{"nb_gene"} = $tab[1];
        @{$rec}{@STEP3_FIELDS} = @tab[5 .. 12];
    }
    else {
        my $class = $category == 1 ? 4 : $category == 2 ? 5 : 6;
        if ($class == 4) {
            # If the category is 1, we check all the prot from this fragment
            for my $prot_id (_fragment_proteins($contig, $fragment)) {
                $check_prot_new{$prot_id} = 1;
                print "we check new $prot_id\n";
            }
        }
        # Remove all former prophages (if any) if there is an overlap
        for my $old_class (4 .. 6) {
            my $old_frags = $infos{$old_class} && $infos{$old_class}{$contig}
                or next;
            for my $old_fragment (keys %{$old_frags}) {
                next unless overlap($fragment, $old_fragment) == 1;
                # The message used to print the gene count ($tab[1]) where
                # the new fragment id was meant.
                print "Overlap between $fragment and $old_fragment, we remove $old_fragment ($contig - 3)\n";
                delete $old_frags->{$old_fragment};
            }
        }
        s/;$// for @tab[5 .. $#tab];
        my $rec = ($infos{$class}{$contig}{$fragment} ||= {});
        $rec->{"nb_gene_contig"} = $tab[1];
        $rec->{"nb_gene"}        = $tab[3];
        @{$rec}{@STEP3_FIELDS} = @tab[5 .. 12];
    }
}
close $new_fh;

# --- 3. Remove redundancy ---------------------------------------------------
foreach my $class (sort { $a <=> $b } keys %infos) {
    my @liste_contigs = keys %{ $infos{$class} };
    if ($class <= 3) {
        ## For complete phages, remove all predictions with higher categories
        for my $lower ($class + 1 .. 6) {
            my $lower_set = $infos{$lower} or next;
            foreach my $contig (@liste_contigs) {
                next unless defined $lower_set->{$contig};
                print "$contig defined in $class, so we remove its info for $lower\n";
                delete $lower_set->{$contig};
            }
        }
    }
    else {
        ## For prophages, remove the prediction of the same prophages with higher categories
        foreach my $contig (@liste_contigs) {
            foreach my $fragment (keys %{ $infos{$class}{$contig} }) {
                for my $lower ($class + 1 .. 6) {
                    my $lower_frags = $infos{$lower} && $infos{$lower}{$contig}
                        or next;
                    next unless defined $lower_frags->{$fragment};
                    print "$fragment defined in $class, so we remove its info for $lower\n";
                    delete $lower_frags->{$fragment};
                }
            }
        }
    }
}

# --- 4. Rewrite the global summary ------------------------------------------
my %CLASS_TITLE = (
    1 => "## 1 - Complete phage contigs - category 1 (pretty sure)",
    2 => "## 2 - Complete phage contigs - category 2 (quite sure)",
    3 => "## 3 - Complete phage contigs - category 3 (not so sure)",
    4 => "## 4 - Prophages - category 1 (pretty sure)",
    5 => "## 5 - Prophages - category 2 (quite sure)",
    6 => "## 6 - Prophages - category 3 (not so sure)",
);
my $COLUMN_HEADER = "## Contig_id,Nb genes contigs,Fragment,Nb genes,Category,Nb phage hallmark genes,Phage gene enrichment sig,Non-Caudovirales phage gene enrichment sig,Pfam depletion sig,Uncharacterized enrichment sig,Strand switch depletion sig,Short genes enrichment sig";

open my $sum_out, '>', $global_summary;
for my $class (1 .. 6) {
    print {$sum_out} "$CLASS_TITLE{$class}\n";
    print {$sum_out} "$COLUMN_HEADER\n";
    my $by_contig = $infos{$class} || {};
    foreach my $contig (sort keys %{$by_contig}) {
        if ($class <= 3) {
            # A complete contig is its own fragment: id and gene count are
            # written twice to keep the column layout of the prophage rows.
            my $rec = $by_contig->{$contig};
            print {$sum_out} _csv_row($contig, $rec->{"nb_gene"}, $contig,
                $rec->{"nb_gene"}, @{$rec}{@GLOBAL_FIELDS});
        }
        else {
            foreach my $fragment (sort keys %{ $by_contig->{$contig} }) {
                my $rec = $by_contig->{$contig}{$fragment};
                print {$sum_out} _csv_row($contig, $rec->{"nb_gene_contig"},
                    $fragment, $rec->{"nb_gene"}, @{$rec}{@GLOBAL_FIELDS});
            }
        }
    }
}
close $sum_out;

# --- 5. Check if they could be new clusters among the new proteins ----------
my @liste_to_add = ();
my $th_evalue = 0.0000000001;    # Big threshold, to prevent too much false positive
open my $affi_fh, '<', $affi_contigs;
my $contig_c = "";
while (my $line = <$affi_fh>) {
    chomp $line;
    if ($line =~ />(.*)/) {
        my @tab = split /\|/, $1;
        $contig_c = defined $tab[0] ? $tab[0] : "";
        next;
    }
    #    0   |  1  | 2  |  3   |  4   |     5    |  6  |  7   |   8    |    9    | 10  | 11
    # gene_id|start|stop|length|strand|affi_phage|score|evalue|category|affi_pfam|score|evalue|
    my @tab = split /\|/, $line;
    my $gene = $tab[0];
    next unless defined $gene;
    my $new_prot   = $check_prot_new{$gene}       && !defined $check_prot_old{$gene};
    my $new_contig = $check_contig_new{$contig_c} && !defined $check_contig_old{$contig_c};
    next unless $new_prot || $new_contig;
    my $affi_phage = defined $tab[5] ? $tab[5] : "";
    # A confident hit on an existing phage cluster will likely cluster with
    # that PC, so the protein is not listed. Everything else is kept: no
    # phage affiliation, an e-value above the threshold, or a non-clustered
    # phage protein.
    next
        if $affi_phage ne "-"
        && $affi_phage =~ /^Phage_cluster_\d+/
        && (defined $tab[7] ? $tab[7] : 0) <= $th_evalue;
    push @liste_to_add, $gene;
}
close $affi_fh;

if (@liste_to_add) {
    print "Listing the new prots to add\n";
    my $l = join(",", @liste_to_add);
    open my $list_fh, '>', $new_prot_list;
    print {$list_fh} "$l\n";
    close $list_fh;
}
else {
    print "No new prots, no list\n";
}


# Private helper: numeric (first_gene, last_gene) of a fragment id shaped
# like "<contig>-gene_<a>-gene_<b>"; empty list when the id does not parse.
sub _gene_range {
    my ($fragment_id) = @_;
    return unless defined $fragment_id;
    if ($fragment_id =~ /.*-gene_(\d+)-gene_(\d+)/) {
        return ($1 + 0, $2 + 0);
    }
    return;
}

# Private helper: ids ("<contig>-gene_<n>") of every protein covered by a
# fragment; empty list when the fragment id does not parse.
sub _fragment_proteins {
    my ($contig, $fragment_id) = @_;
    my ($first, $last) = _gene_range($fragment_id) or return;
    return map { $contig . "-gene_" . $_ } $first .. $last;
}

# Private helper: one comma-separated output row; undefined fields are
# written as empty columns.
sub _csv_row {
    my @fields = @_;
    return join(",", map { defined $_ ? $_ : "" } @fields) . "\n";
}

sub overlap {    # To check if a prediction is not within another of the same type, in which case we don't really care
    # Arguments : $pred_wide  - fragment id of the candidate outer region
    #             $pred_short - fragment id of the candidate inner region
    # Returns   : 1 when $pred_short lies within $pred_wide and the two are
    #             not the very same interval, 0 otherwise. Ids that do not
    #             parse never overlap (the captures of a failed match used
    #             to be read as if they belonged to that id).
    my ($pred_wide, $pred_short) = @_;
    my ($start_pred_wide,  $end_pred_wide)  = _gene_range($pred_wide)  or return 0;
    my ($start_pred_short, $end_pred_short) = _gene_range($pred_short) or return 0;
    my $same_interval = $start_pred_short == $start_pred_wide
        && $end_pred_short == $end_pred_wide;
    my $contained = $start_pred_short >= $start_pred_wide
        && $end_pred_short <= $end_pred_wide;
    return ($contained && !$same_interval) ? 1 : 0;
}
decal each zone by 50 nt before and beyond +my $decal=50; +print "Code $code\n"; +my $out_file_1 = catfile( $dir_out, $code . '_cat-1.fasta' ); +my $out_file_2 = catfile( $dir_out, $code . '_cat-2.fasta' ); +my $out_file_3 = catfile( $dir_out, $code . '_cat-3.fasta' ); +my $out_file_p1 = catfile( $dir_out, $code . '_prophages_cat-4.fasta' ); +my $out_file_p2 = catfile( $dir_out, $code . '_prophages_cat-5.fasta' ); +my $out_file_p3 = catfile( $dir_out, $code . '_prophages_cat-6.fasta' ); +my $gb_file_1 = catfile( $dir_out, $code . '_cat-1.gb' ); +my $gb_file_2 = catfile( $dir_out, $code . '_cat-2.gb' ); +my $gb_file_3 = catfile( $dir_out, $code . '_cat-3.gb' ); +my $gb_file_p1 = catfile( $dir_out, $code . '_prophages_cat-4.gb' ); +my $gb_file_p2 = catfile( $dir_out, $code . '_prophages_cat-5.gb' ); +my $gb_file_p3 = catfile( $dir_out, $code . '_prophages_cat-6.gb' ); +print join("\n", "The sequences will be put in:", + ( map { " - $_" } + $out_file_1, + $out_file_2, + $out_file_3, + $out_file_p1, + $out_file_p2, + $out_file_p3, + ), + '' +); + +my $summary = catfile($wdir, $code . '_global-phage-signal.csv'); +my $last_affi = catfile($wdir, $code . '_phage-signal.csv'); +my $affi_contigs = catfile($wdir, $code . '_affi-contigs.csv'); +my $fasta_contigs = catfile($wdir, 'fasta', $code . '_nett_filtered.fasta'); +my $fasta_prot = catfile($wdir, 'fasta', $code . '_prots.fasta'); + +print "Checking '$last_affi'\n"; + +if (-e $last_affi){ + my %compte=(1=>0,2=>0,3=>0,4=>0,5=>0,6=>0); + my %check; + my $current_c=""; + open(SUM, '<', $summary); + + while (){ + chomp($_); + if ($_=~/## (\d)/){$current_c=$1;} + elsif ($_=~/##/){} + else{ + my @tab=split(",",$_); + $tab[0]=~s/\(/_/g; + $tab[0]=~s/\)/_/g; + $tab[2]=~s/\(/_/g; + $tab[2]=~s/\)/_/g; + $tab[0]=~s/\[/_/g; + $tab[0]=~s/\]/_/g; + $tab[2]=~s/\[/_/g; + $tab[2]=~s/\]/_/g; + if($tab[0]=~/(.*)-circular/){ +# $tab[2]=~s/-circular//g; + $check{$tab[0]}{$tab[2]}{"circular"}=1; + } + if($tab[0] eq ""){ + print "!!!!! 
void\n"; + } + else{ + $check{$tab[0]}{$tab[2]}{"prophage"}=$tab[2]; + $check{$tab[0]}{$tab[2]}{"line"}=$_; + $compte{$current_c}++; + $check{$tab[0]}{$tab[2]}{"category"}=$current_c; + } + } + } + close SUM; + print "$code\t$compte{1}\t$compte{2}\t$compte{3}\t$compte{4}\t$compte{5}\t$compte{6}\n"; + # Get the sequence annotation + my $id_c=""; + my %infos; + my $i=0; + open(ANOT,"<$affi_contigs") || die ("pblm opening file $affi_contigs\n"); + while(){ + chomp($_); + if ($_=~/>(.*)/){ + my @tab=split(/\|/,$1); + $tab[0]=~s/\(/_/g; + $tab[0]=~s/\)/_/g; + $tab[0]=~s/\[/_/g; + $tab[0]=~s/\]/_/g; + $id_c=$tab[0]; + } + else{ + # 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 + # gene_id|start|stop|length|strand|affi_phage|score|evalue|category|affi_pfam|score|evalue| + my @tab=split(/\|/,$_); + my $gene=$tab[0]; + $gene=~/.*-(gene_\d*)/; + $gene=$1; + $infos{$id_c}{$gene}{"start"}=$tab[1]; + $infos{$id_c}{$gene}{"stop"}=$tab[2]; + $infos{$id_c}{$gene}{"length"}=$tab[3]; + $infos{$id_c}{$gene}{"strand"}=$tab[4]; + $infos{$id_c}{$gene}{"category"}=-1; + $infos{$id_c}{$gene}{"order"}=$i; + $i++; + if ($tab[5] eq "-"){ ## no Phage Cluster affiliation + if ($tab[9] eq "-"){ ## no PFAM either, ok.. 
+ $infos{$id_c}{$gene}{"affi"}="hypothetical protein"; + } + else{ + $infos{$id_c}{$gene}{"affi"}="PFAM-".$tab[9]; + } + } + else{ + if ($tab[9] eq "-"){ ## no PFAM or Phage Cluster better than PFAM (score comparison) + $infos{$id_c}{$gene}{"affi"}=$tab[5]; + } + else{ ## So we have a PFAM, which is clearly better than Phage Cluster, so we keep it + $infos{$id_c}{$gene}{"affi"}=$tab[5]."_"."PFAM-".$tab[9]; + } + } + } + } + close ANOT; + open(FA,"<$fasta_prot") || die ("pblm opening file $fasta_prot"); + my $gene_c=""; + my $tag=0; + while (){ + chomp($_); + if ($_=~/^>(\S*)/){ + $tag=0; + my $gene_temp=$1; + $gene_temp=~/(.*)-(gene_\d*)/; + $id_c=$1; + $gene_c=$2; + if(defined($infos{$id_c}{$gene_c})){$tag=1;} + } + elsif($tag==1){ + $infos{$id_c}{$gene_c}{"seq"}.=$_; + } + } + close FA; + # Now get all the fasta cut of the contigs + open(SP1,">$out_file_p1") || die ("pblm opening file $out_file_p1\n"); + open(SP2,">$out_file_p2") || die ("pblm opening file $out_file_p2\n"); + open(SP3,">$out_file_p3") || die ("pblm opening file $out_file_p3\n"); + open(S1,">$out_file_1") || die ("pblm opening file $out_file_1\n"); + open(S2,">$out_file_2") || die ("pblm opening file $out_file_2\n"); + open(S3,">$out_file_3") || die ("pblm opening file $out_file_3\n"); + my $output_1 = Bio::SeqIO->new(-file => ">$gb_file_1",-format => 'GenBank'); + my $output_2 = Bio::SeqIO->new(-file => ">$gb_file_2",-format => 'GenBank'); + my $output_3 = Bio::SeqIO->new(-file => ">$gb_file_3",-format => 'GenBank'); + my $output_p1 = Bio::SeqIO->new(-file => ">$gb_file_p1",-format => 'GenBank'); + my $output_p2 = Bio::SeqIO->new(-file => ">$gb_file_p2",-format => 'GenBank'); + my $output_p3 = Bio::SeqIO->new(-file => ">$gb_file_p3",-format => 'GenBank'); + my $sequence=0; + open(FASTA,"<$fasta_contigs") || die ("pblm opening file $fasta_contigs\n"); + $id_c=""; + my $seq_c=""; + while (){ + chomp($_); + if ($_=~/^>(.*)/){ + my $id_c_temp=$1; + $id_c_temp=~s/\(/_/g; + $id_c_temp=~s/\)/_/g; + 
$id_c_temp=~s/\[/_/g; + $id_c_temp=~s/\]/_/g; + if (defined($check{$id_c})){ + my $id_red=$id_c; + print "We had checked $id_c -> $id_red\n"; + foreach(keys %{$check{$id_c}}){ + $id_red=$id_c; + my @tab=split(",",$check{$id_c}{$_}{"line"}); + $tab[0]=~s/\(/_/g; + $tab[0]=~s/\)/_/g; + $tab[2]=~s/\(/_/g; + $tab[2]=~s/\)/_/g; + $tab[0]=~s/\[/_/g; + $tab[0]=~s/\]/_/g; + $tab[2]=~s/\[/_/g; + $tab[2]=~s/\]/_/g; + # $tab[2]=~s/-circular//g; + my $desc="Putative phage sequence (category $check{$id_c}{$_}{category}), predicted by PhageSorter"; + my $iscirc=0; + if ($check{$id_c}{$tab[2]}{"circular"}==1){ + # $id_red.="-circ"; + $iscirc=1; + } + if ($check{$id_c}{$_}{"category"}<=3){ + print ".. predicted to be a complete phage..\n"; + $sequence = Bio::Seq::RichSeq->new(-display_id => "$id_red", -accession_number => "$id_red", -desc => $desc ,-seq =>"$seq_c",-is_circular =>$iscirc,-division => "ENV",-alphabet => "dna"); + $sequence->add_date(`date +%D`); + my $featsource = Bio::SeqFeature::Generic->new(-start => 1,-end => length($seq_c),-primary => "source",-tag => {'organism' => "$desc"}); + $sequence->add_SeqFeature($featsource); + foreach(sort { $infos{$id_c}{$a}{"order"} <=> $infos{$id_c}{$b}{"order"} } keys %{$infos{$id_c}}){ + my $gene=$_; + my $splitlocation = Bio::Location::Split->new(); + my $strand=0; + if ($infos{$id_c}{$gene}{"strand"} eq "-"){$strand=-1;} + # si on est sur un join, etc.. 
+ if ($infos{$id_c}{$gene}{"stop"} < $infos{$id_c}{$gene}{"start"}){ + $splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>$infos{$id_c}{$gene}{"start"},-end=>length($seq_c),-strand=>$strand)); + $splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>1,-end=>$infos{$id_c}{$gene}{"stop"},-strand=>$strand)); + } + else{ + $splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>$infos{$id_c}{$gene}{"start"},-end=>$infos{$id_c}{$gene}{"stop"},-strand=>$strand)); + } + my $featgene = Bio::SeqFeature::Generic->new(-location => $splitlocation,-primary => "gene",-tag => {'gene' => "$gene",'locus_tag' => $id_c."_".$gene}); + $sequence->add_SeqFeature($featgene); + my $product=$infos{$id_c}{$gene}{"affi"}; + my $note="Predicted by MGA"; + my $featcds = Bio::SeqFeature::Generic->new(-location=>$splitlocation,-primary => "CDS",-tag => {'product' => "$product",'note' => "$note",'locus_tag' => $id_c."_".$gene,'codon_start' => "1",'gene' => "$gene",'transl_table' => "11"}); + $sequence->add_SeqFeature($featcds); + $featcds->add_tag_value('translation',$infos{$id_c}{$gene}{"seq"}); + } + if ($check{$id_c}{$_}{"category"}==1){ + print S1 ">$id_red-cat_$check{$id_c}{$_}{category}\n$seq_c\n"; + $output_1->write_seq($sequence); + } + elsif ($check{$id_c}{$_}{"category"}==2){ + print S2 ">$id_red-cat_$check{$id_c}{$_}{category}\n$seq_c\n"; + $output_2->write_seq($sequence); + } + elsif ($check{$id_c}{$_}{"category"}==3){ + print S3 ">$id_red-cat_$check{$id_c}{$_}{category}\n$seq_c\n"; + $output_3->write_seq($sequence); + } + } + else{ + print ".. predicted to be a prophage..\n"; + if ($tab[2]=~/^$id_c-(gene_\d+)-(gene_\d+)/){ + my $gene_start=$1; + my $start=$infos{$id_c}{$gene_start}{"start"}-$decal; + if ($start<0){$start=0;} + my $gene_stop=$2; + my $stop=$infos{$id_c}{$gene_stop}{"stop"}+$decal; + my $length=$stop-$start; + print " from $1 to $2 .. 
from $start to $stop ($length)\n"; + my $substr=substr($seq_c,$start,$length); + $iscirc=0; # An integrated prophage cannot be circular, so set this to linear + my $display_id=$id_red."_".$gene_start."_".$gene_stop."-".$start."-".$stop."-cat_".$check{$id_c}{$_}{"category"}; + $sequence = Bio::Seq::RichSeq->new(-display_id => "$display_id", -accession_number => "$display_id", -desc => $desc ,-seq =>"$substr",-is_circular =>$iscirc,-division => "ENV",-alphabet => "dna"); + $sequence->add_date(`date +%D`); + my $featsource = Bio::SeqFeature::Generic->new(-start => 1,-end => length($substr),-primary => "source",-tag => {'organism' => "$desc"}); + $sequence->add_SeqFeature($featsource); + foreach(sort { $infos{$id_c}{$a}{"order"} <=> $infos{$id_c}{$b}{"order"} } keys %{$infos{$id_c}}){ + my $gene=$_; + # Check if the gene is in the fragment entirely + if (($infos{$id_c}{$gene}{"start"}>=$start) && ($infos{$id_c}{$gene}{"start"}<=$stop) && ($infos{$id_c}{$gene}{"stop"}>=$start) && ($infos{$id_c}{$gene}{"stop"}<=$stop)){ + my $splitlocation = Bio::Location::Split->new(); + my $strand=0; + if ($infos{$id_c}{$gene}{"strand"} eq "-"){$strand=-1;} + $splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>$infos{$id_c}{$gene}{"start"}-$start,-end=>$infos{$id_c}{$gene}{"stop"}-$start,-strand=>$strand)); + my $featgene = Bio::SeqFeature::Generic->new(-location => $splitlocation,-primary => "gene",-tag => {'gene' => "$gene",'locus_tag' => $id_c."_".$gene}); + $sequence->add_SeqFeature($featgene); + my $product=$infos{$id_c}{$gene}{"affi"}; + my $note="Predicted by MGA"; + my $featcds = Bio::SeqFeature::Generic->new(-location=>$splitlocation,-primary => "CDS",-tag => {'product' => "$product",'note' => "$note",'locus_tag' => $id_c."_".$gene,'codon_start' => "1",'gene' => "$gene",'transl_table' => "11"}); + $sequence->add_SeqFeature($featcds); + $featcds->add_tag_value('translation',$infos{$id_c}{$gene}{"seq"}); + } + } + if ($check{$id_c}{$_}{"category"}==4){ + print 
SP1 ">".$display_id."\n".$substr."\n"; + $output_p1->write_seq($sequence); + } + elsif ($check{$id_c}{$_}{"category"}==5){ + print SP2 ">".$display_id."\n".$substr."\n"; + $output_p2->write_seq($sequence); + } + elsif($check{$id_c}{$_}{"category"}==6){ + print SP3 ">".$display_id."\n".$substr."\n"; + $output_p3->write_seq($sequence); + } + } + else{ + print "Pblm with $tab[2] - tab 2\n"; + } + } + } + } + $id_c=$id_c_temp; + $seq_c=""; + } + else{$seq_c.=$_;} + } + close FASTA; + # We do not forget the last one + if (defined($check{$id_c})){ + my $id_red=$id_c; + print "We had checked $id_c -> $id_red\n"; + foreach(keys %{$check{$id_c}}){ + $id_red=$id_c; + my @tab=split(",",$check{$id_c}{$_}{"line"}); + $tab[0]=~s/\(/_/g; + $tab[0]=~s/\)/_/g; + $tab[2]=~s/\(/_/g; + $tab[2]=~s/\)/_/g; + $tab[0]=~s/\[/_/g; + $tab[0]=~s/\]/_/g; + $tab[2]=~s/\[/_/g; + $tab[2]=~s/\]/_/g; + # $tab[2]=~s/-circular//g; + my $desc="Putative phage sequence (category $check{$id_c}{$_}{category}), predicted by PhageSorter"; + my $iscirc=0; + if ($check{$id_c}{$tab[2]}{"circular"}==1){ + # $id_red.="-circ"; + $iscirc=1; + } + if ($check{$id_c}{$_}{"category"}<=3){ + print ".. predicted to be a complete phage..\n"; + $sequence = Bio::Seq::RichSeq->new(-display_id => "$id_red", -accession_number => "$id_red", -desc => $desc ,-seq =>"$seq_c",-is_circular =>$iscirc,-division => "ENV",-alphabet => "dna"); + $sequence->add_date(`date +%D`); + my $featsource = Bio::SeqFeature::Generic->new(-start => 1,-end => length($seq_c),-primary => "source",-tag => {'organism' => "$desc"}); + $sequence->add_SeqFeature($featsource); + foreach(sort { $infos{$id_c}{$a}{"order"} <=> $infos{$id_c}{$b}{"order"} } keys %{$infos{$id_c}}){ + my $gene=$_; + my $splitlocation = Bio::Location::Split->new(); + my $strand=0; + if ($infos{$id_c}{$gene}{"strand"} eq "-"){$strand=-1;} + # si on est sur un join, etc.. 
+ if ($infos{$id_c}{$gene}{"stop"} < $infos{$id_c}{$gene}{"start"}){ + $splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>$infos{$id_c}{$gene}{"start"},-end=>length($seq_c),-strand=>$strand)); + $splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>1,-end=>$infos{$id_c}{$gene}{"stop"},-strand=>$strand)); + } + else{ + $splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>$infos{$id_c}{$gene}{"start"},-end=>$infos{$id_c}{$gene}{"stop"},-strand=>$strand)); + } + my $featgene = Bio::SeqFeature::Generic->new(-location => $splitlocation,-primary => "gene",-tag => {'gene' => "$gene",'locus_tag' => $id_c."_".$gene}); + $sequence->add_SeqFeature($featgene); + my $product=$infos{$id_c}{$gene}{"affi"}; + my $note="Predicted by MGA"; + my $featcds = Bio::SeqFeature::Generic->new(-location=>$splitlocation,-primary => "CDS",-tag => {'product' => "$product",'note' => "$note",'locus_tag' => $id_c."_".$gene,'codon_start' => "1",'gene' => "$gene",'transl_table' => "11"}); + $sequence->add_SeqFeature($featcds); + $featcds->add_tag_value('translation',$infos{$id_c}{$gene}{"seq"}); + } + if ($check{$id_c}{$_}{"category"}==1){ + print S1 ">$id_red-cat_$check{$id_c}{$_}{category}\n$seq_c\n"; + $output_1->write_seq($sequence); + } + if ($check{$id_c}{$_}{"category"}==2){ + print S2 ">$id_red-cat_$check{$id_c}{$_}{category}\n$seq_c\n"; + $output_2->write_seq($sequence); + } + elsif ($check{$id_c}{$_}{"category"}==3){ + print S3 ">$id_red-cat_$check{$id_c}{$_}{category}\n$seq_c\n"; + $output_3->write_seq($sequence); + } + } + else{ + print ".. predicted to be a prophage..\n"; + if ($tab[2]=~/^$id_c-(gene_\d+)-(gene_\d+)/){ + my $gene_start=$1; + my $start=$infos{$id_c}{$gene_start}{"start"}-$decal; + if ($start<0){$start=0;} + my $gene_stop=$2; + my $stop=$infos{$id_c}{$gene_stop}{"stop"}+$decal; + my $length=$stop-$start; + print " from $1 to $2 .. 
from $start to $stop ($length)\n"; + $iscirc=0; # An integrated prophage cannot be circular, so set this to linear + my $display_id=$id_red."_".$gene_start."_".$gene_stop."-".$start."-".$stop."-cat_".$check{$id_c}{$_}{"category"}; + my $substr=substr($seq_c,$start,$length); + $sequence = Bio::Seq::RichSeq->new(-display_id => "$display_id", -accession_number => "$display_id", -desc => $desc ,-seq =>"$substr",-is_circular =>$iscirc,-division => "ENV",-alphabet => "dna"); + $sequence->add_date(`date +%D`); + my $featsource = Bio::SeqFeature::Generic->new(-start => 1,-end => length($substr),-primary => "source",-tag => {'organism' => "$desc"}); + $sequence->add_SeqFeature($featsource); + foreach(sort { $infos{$id_c}{$a}{"order"} <=> $infos{$id_c}{$b}{"order"} } keys %{$infos{$id_c}}){ + my $gene=$_; + if ((($infos{$id_c}{$gene}{"start"}-$start)>0) && (($infos{$id_c}{$gene}{"start"}-$start)<=$stop) && (($infos{$id_c}{$gene}{"stop"}-$start)>0) && (($infos{$id_c}{$gene}{"stop"}-$start)<=$stop)){ + my $splitlocation = Bio::Location::Split->new(); + my $strand=0; + if ($infos{$id_c}{$gene}{"strand"} eq "-"){$strand=-1;} + $splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>$infos{$id_c}{$gene}{"start"}-$start,-end=>$infos{$id_c}{$gene}{"stop"}-$start,-strand=>$strand)); + my $featgene = Bio::SeqFeature::Generic->new(-location => $splitlocation,-primary => "gene",-tag => {'gene' => "$gene",'locus_tag' => $id_c."_".$gene}); + $sequence->add_SeqFeature($featgene); + my $product=$infos{$id_c}{$gene}{"affi"}; + my $note="Predicted by MGA"; + my $featcds = Bio::SeqFeature::Generic->new(-location=>$splitlocation,-primary => "CDS",-tag => {'product' => "$product",'note' => "$note",'locus_tag' => $id_c."_".$gene,'codon_start' => "1",'gene' => "$gene",'transl_table' => "11"}); + $sequence->add_SeqFeature($featcds); + $featcds->add_tag_value('translation',$infos{$id_c}{$gene}{"seq"}); + } + } + if ($check{$id_c}{$_}{"category"}==4){ + print SP1 
">".$display_id."\n".$substr."\n"; + $output_p1->write_seq($sequence); + } + elsif ($check{$id_c}{$_}{"category"}==5){ + print SP2 ">".$display_id."\n".$substr."\n"; + $output_p2->write_seq($sequence); + } + elsif($check{$id_c}{$_}{"category"}==6){ + print SP3 ">".$display_id."\n".$substr."\n"; + $output_p3->write_seq($sequence); + } + } + else{ + print "Pblm with $tab[2] - tab 2\n"; + } + } + } + } + close S1; + close S2; + close S3; + close SP1; + close SP2; + close SP3; + $output_1->close(); + $output_2->close(); + $output_3->close(); + $output_p1->close(); + $output_p2->close(); + $output_p3->close(); +} +else{ + print "$code\tin progress\n"; +} diff --git a/virsorter/Scripts/Step_first_add_custom_phage_sequence.pl b/virsorter/Scripts/Step_first_add_custom_phage_sequence.pl new file mode 100755 index 0000000..8db79f4 --- /dev/null +++ b/virsorter/Scripts/Step_first_add_custom_phage_sequence.pl @@ -0,0 +1,389 @@ +#!/usr/bin/env perl + +use strict; +use autodie; +use FindBin '$Bin'; +use Bio::Seq; +use File::Spec::Functions; +use File::Path 'mkpath'; +use File::Which 'which'; +# Script to generate a new db with putative new clusters +# Argument 0 : Fasta file of the new phages + +if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq "--help") || (!defined($ARGV[2]))) +{ + print "# Script to generate a new db with putative new clusters +# Argument 0 : fasta of custom phages +# Argument 1 : db-in directory +# Argument 2 : db-out directory +\n"; + die "\n"; +} + + +my $virsorter_dir = "/usr/local/bin/Virsorter/"; +my $path_to_formatdb = which("formatdb"); +my $path_to_blastall = which("blastall"); +my $path_to_muscle = which("muscle"); +my $path_to_hmmbuild = which("hmmbuild"); +my $path_to_hmmpress = which("hmmpress"); +my $path_hmmsearch = which("hmmsearch"); +my $path_to_mga = which("mga_linux_ia64"); +my $MCX_LOAD = which("mcxload"); +my $MCL = which("mcl"); + +my $min_seq_in_a_cluster=3; +my $n_cpus=8; + +my 
$fasta_contigs=$ARGV[0]; +my $db_in=$ARGV[1]; +my $db_out=$ARGV[2]; + +my $tmp_dir=$db_out."/initial_db"; +`mkdir $tmp_dir`; +`cp $db_in/* $tmp_dir/`; +$tmp_dir.="/"; +my $log_out=$tmp_dir."log_out_step_custom_phage"; +my $log_err=$tmp_dir."log_err_step_custom_phage"; + +my $db_phage = $tmp_dir . "Pool_clusters.hmm"; +my $blastable_unclustered = $tmp_dir . "Pool_new_unclustered"; +my $fasta_unclustered = $tmp_dir . "Pool_new_unclustered.faa"; +my $ref_phage_clusters = $tmp_dir . "Phage_Clusters_current.tab"; +my $blast_unclustered = $tmp_dir . "Blast_unclustered.tab"; + +open(F1,"<$fasta_contigs") || die "pblm ouverture fichier $fasta_contigs\n"; +my %seq_base; +my $id_seq=""; +my $i=0; +my %order_contig; +while(){ + $_=~s/\r\n/\n/g; #Cas d'un fichier windows ##AJOUT + chomp($_); + if ($_=~/^>(\S*)/){$id_seq=$1;$order_contig{$id_seq}=$i;$i++} + else{$seq_base{$id_seq}.=$_;} +} +close F1; + + +# Predict genes on the new phages +my $out_file= $db_out."/Custom_phages_mga.predict"; +print "$path_to_mga $fasta_contigs -m > $out_file\n"; +my $mga=`$path_to_mga $fasta_contigs -m > $out_file`; +my %order_gene; +my $n2=0; +open(RESU,"<$out_file") || die "pblm ouverture fichier $out_file\n"; +my %predict; +my %type; +my $id_c=""; +while(){ + chomp($_); + if($_=~/^# gc/){} + elsif($_=~/^# self: (.*)/){$type{$id_c}=$1;} + elsif ($_=~/^# (.*)/){ + $id_c=$1; + $n2=0; + } + else{ + my @tab=split("\t",$_); + $predict{$id_c}{$tab[0]}=$_; + if (!defined($order_gene{$id_c}{$tab[0]})){$order_gene{$id_c}{$tab[0]}=$n2;$n2++;} + } +} +close RESU; +my %check_prot; +my $prot_file=$tmp_dir."/Custom_phages_mga_prots.fasta"; +open(PROT,">$prot_file") || die "pblm ouverture fichier $prot_file\n"; +my $n=0; +foreach(sort {$order_contig{$a} <=> $order_contig{$b} } keys %predict){ + $n++; + my $id=$_; + my @tab_genes=sort {$order_gene{$id}{$a} <=> $order_gene{$id}{$b} } keys %{$predict{$id}}; + ## We check the first gene and modify it if needed + my $seq_c=$seq_base{$id}; + foreach(@tab_genes){ + 
my @tab=split("\t",$predict{$id}{$_}); + if ($tab[5]!=11){ + # soit on est au début de séquence, soit en toute fin (théoriquement) + if ($tab[4]!=0){ + if ($tab[3] eq "-"){ + $tab[2]-=$tab[4]; + } + elsif($tab[3] eq "+"){ + $tab[1]+=$tab[4]; + } + else{ + print "%%%%%% pblm on a pas de sens pour $id : @tab\n"; + } + } + my $new_line=join("\t",@tab); + $predict{$id}{$_}=$new_line; + } + + @tab=split("\t",$predict{$id}{$_}); + my $name=$tab[0]; + my $start=$tab[1]; + my $stop=$tab[2]; + my $sens=$tab[3]; + my $frame=$tab[4]; + my $frag=""; + # Cas "normal" (on chevauche pas l'origine) + if ($start<$stop){ + my $long=$stop-$start+1; + $frag=substr($seq_c,$start-1,$long); + } + # Cas exceptionnel, on chevauche l'origine du contig + else{ + my $l1=length($seq_c)-$start+1; + $frag=substr($seq_c,$start-1,$l1); + $frag.=substr($seq_c,0,$stop); + } + ## POUR RECUPERER LA SEQ PROT + my $seq_bio = Bio::Seq->new(-seq =>$frag,-alphabet => 'dna' ); + my @seqs = Bio::SeqUtils->translate_6frames($seq_bio); + my $cadre=0; + if ($sens eq "-"){$cadre=3;} + my $prot=$seqs[$cadre]; + my $prot_sequence=$prot->seq; + if ($prot_sequence=~/\*$/){ +# print "on enlève le codon stop final pour muscle\n"; + chop($prot_sequence); + } + my $id_out=$id."-".$name; + print PROT ">$id_out\n$prot_sequence\n"; + $check_prot{$id_out}=1; + } +} +close PROT; +# Clustering the proteins +# - 1 - Hmmsearch vs the original db +my $out_hmmsearch=$tmp_dir."New_prots_vs_Phagedb.tab"; +my $out_hmmsearch_bis=$tmp_dir."New_prots_vs_Phagedb.out"; +my $cmd_hmm_phage="$path_hmmsearch --tblout $out_hmmsearch --cpu $n_cpus -o $out_hmmsearch_bis --noali $db_phage $prot_file >> $log_out 2>> $log_err"; +print "Step 0.9 : $cmd_hmm_phage\n"; +`echo $cmd_hmm_phage >> $log_out 2>> $log_err`; +my $out=`$cmd_hmm_phage`; +print "$out\n"; +open(HMM,"<$out_hmmsearch") || die ("pblm opening file $out_hmmsearch\n"); +my $score_th=200; +my $evalue_th=0.0000000001; +while(){ + chomp($_); + if ($_=~m/^#/){ + next; + } + else{ + my 
@splign=split(m/\s+/,$_); + my $seq=$splign[0]; + my $match=$splign[2]; + my $evalue=$splign[4]; + my $score=$splign[5]; + if ($score>=$score_th && $evalue<=$evalue_th){ + $check_prot{$seq}=0; + } + } +} +close HMM; +# - 2 - All which does not match a known -> get it +my $prot_file_to_cluster=$tmp_dir."/Custom_phages_mga_prots-to-cluster.fasta"; +my $tag=0; +my %seq_temp; +open(PROT,"<$prot_file") || die ("pblm opening file $prot_file\n"); +open(NEWPROT,">$prot_file_to_cluster") || die ("pblm opening file $prot_file_to_cluster\n"); +my $id_c=""; +while (){ + chomp($_); + if ($_=~/^>(.*)/){ + my $id=$1; + $id_c=$id; + $tag=0; + if ($check_prot{$id}==1){ + print NEWPROT "$_\n"; + $tag=1; + } + } + elsif($tag==1){ + print NEWPROT "$_\n"; + $seq_temp{$id_c}.=$_; + } +} +close PROT; +close NEWPROT; +# - 3 - and make new clusters +my $db=$tmp_dir."Custom_phages_mga_prots-to-cluster"; +my $cmd_format="$path_to_formatdb -i $prot_file_to_cluster -n $db"; +print "$cmd_format\n"; +my $out=`$cmd_format`; +print "Formatdb : $out\n"; +my $cmd_cat="cat $fasta_unclustered >> $prot_file_to_cluster"; +print "$cmd_cat\n"; +$out=`$cmd_cat`; +print "Cat : $cmd_cat\n"; +# - blast vs themselves and the unclustered +my $out_blast=$tmp_dir."pool_unclustered-and-custom-phages-vs-custom-phages.tab"; +my $cmd_blast="$path_to_blastall -p blastp -i $prot_file_to_cluster -d $db -o $out_blast -m 8 -a 10 -e 0.00001"; # On 10 cores to keep a few alive for the rest of the scripts +print "$cmd_blast\n"; +$out=`$cmd_blast`; +print "Blast : $out\n"; +$cmd_cat="cat $blast_unclustered >> $out_blast"; +print "$cmd_cat\n"; +$out=`$cmd_cat`; +print "Cat : $out\n"; +print "Generating abc file\n"; +# - mcl +my $out_abc=$tmp_dir."new_clusters.abc"; +my $th_score=50; +my $th_evalue=0.00001; +my $max=200; # Max on sig +open(S1,">$out_abc") || die ("pblm opening file $out_abc\n"); +open(BL,"<$out_blast") || die ("pblm opening file $out_blast\n"); +while(){ + chomp($_); + my @tab=split("\t",$_); + if 
($tab[11]>$th_score && $tab[10]<$th_evalue && $tab[0] ne $tab[1]){ + my $evalue=$tab[10]; +# $evalue=-log10($evalue); +# if ($evalue>$max){$evalue=$max;} + print S1 "$tab[0]\t$tab[1]\t$evalue\n"; + } +} +close BL; +close S1; +my $out_mci=$tmp_dir."new_clusters.mci"; +my $out_tab=$tmp_dir."new_clusters.tab"; +my $cmd_mcxload="$MCX_LOAD -abc $out_abc --stream-mirror --stream-neg-log10 -stream-tf 'ceil(200)' -o $out_mci -write-tab $out_tab"; +print "$cmd_mcxload\n"; +$out=`$cmd_mcxload`; +print "Mxc Load : $out\n"; +my $dump_file=$tmp_dir."new_clusters.csv"; +my $cmd_mcl="$MCL $out_mci -I 2 -use-tab $out_tab -o $dump_file"; +print "$cmd_mcl\n"; +$out=`$cmd_mcl`; +print "Mcl : $out\n"; +# - make new cluster +my %unclustered; +my %clusters; +my %check_cluster; +my $last_cluster_id=0; +# toutes les séquences clusterisées dans des groupes de plus de 2 (3 et plus) -> on prend / Toutes les autres on les garde en tant qu'unclustered +open(DUMP,"<$dump_file") || die "pblm ouverture fichier $dump_file\n"; +while(){ + chomp($_); + my @tab=split("\t",$_); + my $n_s_c=$#tab+1; + if ($n_s_c>=$min_seq_in_a_cluster){ + # on a trouvé un cluster de plus de deux + my $cluster_id=$last_cluster_id+1; + $cluster_id="Phage_cluster_".$cluster_id."-c"; + print "We found a cluster with $n_s_c sequences => Cluster $cluster_id\n"; + $last_cluster_id++; + foreach(@tab){ + $clusters{$cluster_id}{$_}=1; + $check_cluster{$_}=1; + } + } + else{ + foreach(@tab){ + $unclustered{$_}=1; + $check_cluster{$_}=1; + } + } +} +close DUMP; +my %seq_temp; +my $id_c=""; +open(FA,"<$prot_file_to_cluster") || die "pblm ouverture fichier $prot_file_to_cluster\n"; +while(){ + chomp($_); + if ($_=~/^>(\S*)/){ + $id_c=$1; + if (!defined($check_cluster{$id_c})){$unclustered{$id_c}=1;$check_cluster{$id_c}=1;} + } + else{$seq_temp{$id_c}.=$_;} +} +close FA; +`mkdir $tmp_dir/clusts`; +foreach(keys %clusters){ + my $cluster_id=$_; + my $out_file=$tmp_dir."clusts/".$cluster_id.".faa"; + open(S1,">$out_file") || die "pblm 
ouverture fichier $out_file\n"; + foreach(keys %{$clusters{$cluster_id}}){ + print S1 ">$_\n$seq_temp{$_}\n"; + } + close S1; +} +# - 4 - Plus add the unclustered to the unclustered database +my $final_pool_unclustered=$db_out."/Pool_new_unclustered.faa"; +my $final_blastable_unclustered=$final_pool_unclustered; +$final_blastable_unclustered=~s/\.faa//; +my $final_blast_unclustered=$db_out."/Blast_unclustered.tab"; +open(S1,">$final_pool_unclustered") || die "pblm ouverture fichier $final_pool_unclustered\n"; +foreach(keys %unclustered){ + print S1 ">$_\n$seq_temp{$_}\n"; +} +close S1; +print "making a blastable db from the new unclustered\n"; +$out=`$path_to_formatdb -i $final_pool_unclustered -n $final_blastable_unclustered`; +# on réduit aussi le fichier blast qu'on ajoute au blast des unclustered +open(BL,"<$out_blast") || die "pblm ouverture fichier $out_blast\n"; +open(S1,">$final_blast_unclustered") || die "pblm ouverture fichier $final_blast_unclustered\n"; +while(){ + chomp($_); + my @tab=split("\t",$_); + if ($unclustered{$tab[0]}==1 && $unclustered{$tab[1]}==1){ + print S1 "$_\n"; + } +} +close BL; +close S1; +# Generating the new database +my $tag=0; +foreach(sort keys %clusters){ + $tag=1; + my $ali_id=$_; + my $path_to_file=$tmp_dir."clusts/".$ali_id; + my $path_to_fasta=$tmp_dir."clusts/".$ali_id.".faa"; + my $path_to_ali=$tmp_dir."clusts/".$ali_id.".ali_faa"; + my $path_to_hmm=$tmp_dir."clusts/".$ali_id."_ali.hmm"; + if (-e $path_to_ali){ + `rm $path_to_ali $path_to_hmm`; + } + my $muscle_out=$tmp_dir."log_out_muscle"; + my $muscle_err=$tmp_dir."log_err_muscle"; + `$path_to_muscle -in $path_to_fasta -out $path_to_ali > $muscle_out 2> $muscle_err`; + my $out_stokcholm=$path_to_ali.".stockholm"; + open(S1,">$out_stokcholm") || die "pblm opening $out_stokcholm\n"; + print S1 "# STOCKHOLM 1.0\n"; + open(FA,"<$path_to_ali") || die "pblm ouverture $path_to_ali\n"; + while(){ + chomp($_); + if ($_=~/^>(.*)/){ + my $id=$1; + $id=~s/\s/_/g; + print S1 "\n$id 
"; + + } + else{print S1 "$_";} + } + close FA; + print S1 "\n//\n"; + `$path_to_hmmbuild --amino $path_to_hmm $out_stokcholm`; +} +# on poole tous les hmm et les fasta, y compris les precedentes ! +$out=`cat $db_phage > $db_out/Pool_clusters.hmm`; +print "cat previous hmm : $out\n"; +$out=`cat $tmp_dir/clusts/*.hmm >> $db_out/Pool_clusters.hmm`; +print "cat new hmm : $out\n"; +# on en fait une base de données screenable par hmmscan +$out=`$path_to_hmmpress $db_out/Pool_clusters.hmm`; +print "hmm press :$out\n"; +# update the phage cluster catalog +my $final_catalog=$db_out."/Phage_Clusters_current.tab"; +$out=`cat $ref_phage_clusters > $final_catalog`; +print "Cat old catalog : $out\n"; +open(CA,">>$final_catalog") || die ("pblm opening file $final_catalog\n"); +foreach(keys %clusters){ + my $liste=join(" ",keys %{$clusters{$_}}); + print CA "$_|2||$liste\n"; +} +close CA; diff --git a/virsorter/wrapper_phage_contigs_sorter_iPlant.pl b/virsorter/wrapper_phage_contigs_sorter_iPlant.pl new file mode 100755 index 0000000..9a141d4 --- /dev/null +++ b/virsorter/wrapper_phage_contigs_sorter_iPlant.pl @@ -0,0 +1,441 @@ +#!/usr/bin/env perl + +=head1 USAGE + + ./wrapper_phage_contigs_sorter_iPlant.pl -d Code_dataset --fna Fasta file of contigs --db 1 --wdir /path/to/working_directory + Database codes : 1 for RefseqABVir only, 2 for RefseqABVir + Viromes + An additional set of reference sequences can be added to the database as a fasta file with the argument cp (--cp /path/to/fasta_file) + +=cut + +use strict; +use warnings; +use autodie; +use FindBin '$Bin'; +use File::Spec::Functions; +use File::Path 'mkpath'; +use File::Which 'which'; +use Getopt::Long 'GetOptions'; +use Pod::Usage; +use Cwd 'cwd'; + +my $help = ''; +my $code_dataset = 'VIRSorter'; +my $original_fna_file = ''; +my $choice_database = ''; +my $tag_virome = 0; +my $custom_phage = ''; +my $data_dir = '/data'; +my $wdir = cwd(); + +GetOptions( + 'fna=s' => \$original_fna_file, + 'd|dataset:s' => 
\$code_dataset, + 'db:i' => \$choice_database, + 'virome:i' => \$tag_virome, + 'wdir:s' => \$wdir, + 'cp:s' => \$custom_phage, + 'data-dir:s' => \$data_dir, + 'h|help' => \$help, +); + +if ($help) { + pod2usage(); +} + +unless ($original_fna_file) { + pod2usage('Missing FASTA file'); +} + +unless ($choice_database == 1 || $choice_database == 2) { + pod2usage('choice_database must be 1 or 2'); +} + +print join("\n", + "Dataset : $code_dataset", + "Fna file : $original_fna_file", + "Db : $choice_database", + "Wdir : $wdir", + "Custom phages: $custom_phage", + '' +); + +# +# This code does nothing useful. +# +# We check if the custom phage is an actual fasta file, or if it's the working dir (which means -> no custom phage). +#if ($custom_phage=~/.*\.f.*/){} +#else{ +# if($wdir=~/.*\/$custom_phage$/){ +# print "The custom phage is actually the wdir id, so we remove it\n"; +# $custom_phage=""; +# } +# else{ +# die("we do not understand this custom phage : $custom_phage"); +# } +#} + +if ($tag_virome) { + print "!!! 
THIS WILL BE A VIROME DECONTAMINATION RUN\n"; +} + +# Need 2 databases +# PCs from Refseq (phages) or PCs from Refseq+Viromes +# PFAM (26.0) + +my $n_cpus = 8; + +# my $code_dataset = $ARGV[0]; +# my $original_fna_file = $ARGV[1]; +# my $choice_database = $ARGV[2]; +# my $wdir = $ARGV[3]; +# my $custom_phage = ""; +# if ( defined( $ARGV[4] ) ) { $custom_phage = $ARGV[4]; } +print "#%#%#%#%#%# Processing $code_dataset....\n"; +my $microbial_base_needed = 0; +## replace this directory with the iPlant dir +#my $wdir=$wdir."/".$code_dataset."/"; + +my $path_to_mga = which('mga_linux_ia64'); +my $path_hmmsearch = which('hmmsearch'); +my $path_blastall = which('blastall'); +my $path_to_formatdb = which('formatdb'); +my $log_out = catfile($wdir, 'log_out'); +my $log_err = catfile($wdir, 'log_err'); +my $script_dir = catdir($Bin, 'Scripts'); +my $dir_Phage_genes = catdir($data_dir,'Phage_gene_catalog'); +my $ref_phage_clusters = catfile($data_dir, + 'Phage_gene_catalog/Phage_Clusters_current.tab'); +# my $readme_file = catfile($script_dir,"VirSorter_Readme.txt"); +my $readme_file = catfile($data_dir,"VirSorter_Readme.txt"); + +if ( $tag_virome == 1 ){ +# $readme_file = catfile($script_dir,"VirSorter_Readme_viromes.txt"); + $readme_file = catfile($data_dir,"VirSorter_Readme_viromes.txt"); +} + +# my $generic_ref_file = catfile($script_dir,"Generic_ref_file.refs"); +my $generic_ref_file = catfile($data_dir,"Generic_ref_file.refs"); + +if ( $choice_database == 2 ) { + $dir_Phage_genes = catdir($data_dir, "Phage_gene_catalog_plus_viromes/"); + $ref_phage_clusters = catfile($data_dir, + "Phage_gene_catalog_plus_viromes/Phage_Clusters_current.tab"); +} + +my $db_PFAM_a = catfile($data_dir, "PFAM_27/Pfam-A.hmm"); +my $db_PFAM_b = catfile($data_dir, "PFAM_27/Pfam-B.hmm"); + +my $out = ""; + +## SETTING UP THE WORKING DIRECTORY +my $log_dir = catdir($wdir, 'logs'); +if (-d $log_dir) { +## Commented on iPlant, but can be useful when running VirSorter on a directory already processed 
+## (to avoid recomputing the gene prediction and comparison to PFAM especially) +# $out = `rm -r $log_dir/* *.csv`; +# print "rm -r log* *.csv => $out\n"; +} +else { + mkpath($log_dir); +} + +# cp fasta file in the wdir +my $fastadir = catdir($wdir, "fasta"); +if ( !-d $fastadir ) { + mkpath($fastadir); + my $fna_file = catfile( $fastadir, "input_sequences.fna" ); + open my $fa, '<', $original_fna_file; + open my $s1, '>', $fna_file; + while (<$fa>) { + chomp($_); + if ( $_ =~ /^>(.*)/ ) { + my $id = $1; + $id =~ s/[\/\.,\|\s?!\*%]/_/g; + my $new_id = $code_dataset . "_" . $id; + print $s1 ">$new_id\n"; + } + else { + print $s1 "$_\n"; + } + + } + close $fa; + close $s1; + # detect circular, predict genes on contigs and extract proteins, as well + # as filtering on size (nb genes) and/or circular + my $nb_gene_th = 2; # At least two complete genes on the contig + my $path_script_step_1 = catfile($script_dir,"Step_1_contigs_cleaning_and_gene_prediction.pl"); + my $cmd_step_1 = "$path_script_step_1 $code_dataset $fastadir $fna_file $nb_gene_th >> $log_out 2>> $log_err"; + print "Step 0.5 : $cmd_step_1\n"; + `echo $cmd_step_1 >> $log_out 2>> $log_err`; + { $out = `$cmd_step_1`; } +} + +print "\t$out\n"; +my $fasta_contigs_nett = catfile( $fastadir, $code_dataset . "_nett_filtered.fasta" ); +my $fasta_file_prots = catfile( $fastadir, $code_dataset . 
"_prots.fasta" ); + +# Match against PFAM, once for all +# compare to PFAM a then b (hmmsearch) +my $out_hmmsearch_pfama = "Contigs_prots_vs_PFAMa.tab"; +my $out_hmmsearch_pfama_bis = "Contigs_prots_vs_PFAMa.out"; +my $cmd_hmm_pfama = +"$path_hmmsearch --tblout $out_hmmsearch_pfama --cpu $n_cpus -o $out_hmmsearch_pfama_bis --noali $db_PFAM_a $fasta_file_prots >> $log_out 2>> $log_err"; +print "Step 0.8 : $cmd_hmm_pfama\n"; + +`echo $cmd_hmm_pfama >> $log_out 2>> $log_err`; + +if ( !( -e $out_hmmsearch_pfama ) ) { + $out = `$cmd_hmm_pfama`; + print "\t$out\n"; +} + +my $out_hmmsearch_pfamb = "Contigs_prots_vs_PFAMb.tab"; +my $out_hmmsearch_pfamb_bis = "Contigs_prots_vs_PFAMb.out"; +my $cmd_hmm_pfamb = +"$path_hmmsearch --tblout $out_hmmsearch_pfamb --cpu $n_cpus -o $out_hmmsearch_pfamb_bis --noali $db_PFAM_b $fasta_file_prots >> $log_out 2>> $log_err"; +print "Step 0.9 : $cmd_hmm_pfamb\n"; +`echo $cmd_hmm_pfamb >> $log_out 2>> $log_err`; + +if ( !( -e $out_hmmsearch_pfamb ) ) { + $out = `$cmd_hmm_pfamb`; + print "\t$out\n"; +} +else { + $out = "Already a results for PFAM B .. skipping (the great guru)\n"; +} + +# Now work on the phage gene catalog + +# Files that will stay along the computations +my $predict_file = catfile( $fastadir, $code_dataset . "_mga_final.predict" ); +my $out_hmmsearch = "Contigs_prots_vs_Phage_Gene_Catalog.tab"; +my $out_hmmsearch_bis = "Contigs_prots_vs_Phage_Gene_Catalog.out"; +my $out_blast_unclustered = "Contigs_prots_vs_Phage_Gene_unclustered.tab"; +my $out_file_affi = $code_dataset . "_affi-contigs.csv"; +my $out_file_phage_fragments = $code_dataset . "_phage-signal.csv"; +my $global_out_file = $code_dataset . "_global-phage-signal.csv"; +my $new_prots_to_cluster = $code_dataset . 
"_new_prot_list.csv"; + +# Constant scripts +my $script_merge_annot = catfile($script_dir,"Step_2_merge_contigs_annotation.pl"); +my $cmd_merge = +"$script_merge_annot $predict_file $out_hmmsearch $out_blast_unclustered $out_hmmsearch_pfama $out_hmmsearch_pfamb $ref_phage_clusters $out_file_affi >> $log_out 2>> $log_err"; +# my $cmd_merge = "$script_merge_annot -m $predict_file -hmm_pc $out_hmmsearch -blast_pc $out_blast_unclustered -hmm_pfa $out_hmmsearch_pfama -hmm_pfb $out_hmmsearch_pfamb -ref_pc $ref_phage_clusters -out_f $out_file_affi >> $log_out 2>> $log_err"; + +my $script_detect = catfile($script_dir,"Step_3_highlight_phage_signal.pl"); +my $cmd_detect = +"$script_detect $out_file_affi $out_file_phage_fragments >> $log_out 2>> $log_err"; +if ($tag_virome==1){$cmd_detect = +"$script_detect $out_file_affi $out_file_phage_fragments $generic_ref_file >> $log_out 2>> $log_err";} + +my $script_summary =catfile($script_dir,"Step_4_summarize_phage_signal.pl"); +my $cmd_summary = +"$script_summary $out_file_affi $out_file_phage_fragments $global_out_file $new_prots_to_cluster >> $log_out 2>> $log_err"; + +# # Get the final result file ready +`touch $global_out_file`; +my $r_n = -1; +# Si on a des nouvelles prots a clusteriser ou si on est dans la premiere +# revision +while ( (-e $new_prots_to_cluster || $r_n == -1) && ($r_n<=10) ) { + $r_n++; # New revision of the prediction + my $dir_revision = "r_" . 
$r_n; + print "### Revision $r_n\n"; + if ( !-d $dir_revision ) { + ## mkdir de la db de cette revision + #print "mkdir $dir_revision >> $log_out 2>> $log_err\n"; + #$out=`mkdir $dir_revision >> $log_out 2>> $log_err`; + mkpath($dir_revision); + print "Out : $out\n"; + ## Clustering of the new prots with the unclustered + my $script_new_cluster = catfile($script_dir,"Step_0_make_new_clusters.pl"); + # First revision, we just import the Refseq database + if ( $r_n == 0 ) { + #`mkdir $dir_revision/db`; + mkpath( catdir( $dir_revision, 'db' ) ); + + ## Adding custom sequences to the database if required by the user + if ( $custom_phage ne "" ) { + my $script_custom_phage = catfile($script_dir,"Step_first_add_custom_phage_sequence.pl"); + $out =`$script_custom_phage $custom_phage $dir_Phage_genes/ $dir_revision/db >> $log_out 2>> $log_err`; + print "Adding custom phage to the database : $out\n"; + } + # should replace Pool_cluster / Pool_unclustered and + # Pool_new_unclustered else , we just import the Refseq database + else { `cp $dir_Phage_genes/* $dir_revision/db/`; } + } + else { + my $previous_r = $r_n - 1; + my $previous_fasta_unclustered = + catfile( "r_" . 
$previous_r, "db", "Pool_unclustered.faa" ); + my $cmd_new_clusters = join(' ', + "$script_new_cluster $dir_revision $fasta_file_prots", + "$previous_fasta_unclustered", + "$new_prots_to_cluster >> $log_out 2>> $log_err" + ); + + print "$cmd_new_clusters\n"; + $out = `$cmd_new_clusters`; + print "Step 1.1 new clusters and new database : $out\n"; + # Rm the list of prots to be clustered now that they should be + # clustered + $out = `rm $new_prots_to_cluster`; + print "rm $new_prots_to_cluster -> $out\n"; + } + + # Check if there are some data in these new clusters, or if all the new + # proteins are unclustered + my $new_db_profil = catfile( $dir_revision, "db", "Pool_clusters.hmm" ); + my $check = 0; + open my $DB, '<', $new_db_profil; + while (<$DB>) { + chomp($_); + if ( $_ =~ /^NAME/ ) { + $check++; + # print "there is a cluster $_ in the database, so we're good\n"; + } + } + close $DB; + if ( $check == 0 ) { + print "There is no clusters in the database, so we skip the hmmsearch\n"; + } + else { + my $out_hmmsearch_new = + catfile( $dir_revision, "Contigs_prots_vs_New_clusters.tab" ); + my $out_hmmsearch_bis_new = + catfile( $dir_revision, "Contigs_prots_vs_New_clusters.out" ); + my $cmd_hmm_cluster = join(' ', + "$path_hmmsearch --tblout $out_hmmsearch_new --cpu $n_cpus", + "-o $out_hmmsearch_bis_new --noali $new_db_profil", + "$fasta_file_prots >> $log_out 2>> $log_err" + ); + + print "Step 1.2 : $cmd_hmm_cluster\n"; + + `echo $cmd_hmm_cluster >> $log_out 2>> $log_err`; + + $out = `$cmd_hmm_cluster`; + print "\t$out\n"; + + $out = `cat $out_hmmsearch_new >> $out_hmmsearch`; + print "\t$out\n"; + } + + my $out_blast_new_unclustered = + catfile( $dir_revision, "Contigs_prots_vs_New_unclustered.tab" ); + my $blastable_unclustered = + catfile( $dir_revision, 'db', 'Pool_new_unclustered' ); + my $cmd_blast_unclustered = join(' ', + "$path_blastall -p blastp -i $fasta_file_prots -d", + "$blastable_unclustered -o $out_blast_new_unclustered -a $n_cpus", + "-m 8 -e 
0.001 >> $log_out 2>> $log_err" + ); + + print "\nStep 1.3 : $cmd_blast_unclustered\n"; + `echo $cmd_blast_unclustered >> $log_out 2>> $log_err`; + $out = `$cmd_blast_unclustered`; + print "\t$out\n"; + $out = `cat $out_blast_new_unclustered >> $out_blast_unclustered`; + print "\t$out\n"; + ## Make backup of the previous files to have trace of the different steps + my $backup_affi = catfile( $dir_revision, "affi_backup.csv" ); + my $backup_phage_signal = + catfile( $dir_revision, "phage_signal_backup.csv" ); + my $backup_global_signal = + catfile( $dir_revision, "global_signal_backup.csv" ); + if ( -e $out_file_affi ) { `cp $out_file_affi $backup_affi`; } + if ( -e $out_file_phage_fragments ) { + `cp $out_file_phage_fragments $backup_phage_signal`; + } + if ( -e $global_out_file ) { + `cp $global_out_file $backup_global_signal`; + } + } + + ## Complete the affi + print "Step 2 : $cmd_merge\n"; + `echo $cmd_merge >> $log_out 2>> $log_err`; + $out = `$cmd_merge`; + ## This generate a csv table including the map of each contig, with PFAM + #and Viral PCs annotations, as well as strand and length of genes + + print "\t$out\n"; + ## Complete the summary + print "Step 3 : $cmd_detect\n"; + `echo $cmd_detect >> $log_out 2>> $log_err`; + $out = `$cmd_detect`; + print "\t$out\n"; + + # Decide which contigs are entirely viral and which are prophages, and + # which of both of these categories are phage enough to be added to the + # databases + print "Setting up the final result file\n"; + print "Step 4 : $cmd_summary\n"; + `echo $cmd_summary >> $log_out 2>> $log_err`; + $out = `$cmd_summary`; + print "\t$out\n"; +} + +# Last step -> extract all sequences as fasta files and gb +my $script_generate_output = catfile($script_dir,"Step_5_get_phage_fasta-gb.pl"); +my $cmd_step_5 = "$script_generate_output $code_dataset >> $log_out 2>> $log_err"; +print "\nStep 5 : $cmd_step_5\n"; + +`echo $cmd_step_5 >> $log_out 2>> $log_err`; + +$out = `$cmd_step_5`; +print "\t$out\n"; + + +# Plus 
# Clean the output directory: remove the first-revision database, gather the
# search tables, the logs and the metric files into dedicated sub-directories,
# then write the run-specific Readme.txt.
print "Cleaning the output directory\n";

# Move one file or directory with mv(1). The list form of system() bypasses
# the shell, so paths containing spaces or shell metacharacters reach mv
# untouched. mv reports its own failures on STDERR; as before, a failed move
# does not abort the clean-up.
my $move_path = sub {
    my ( $source, $destination ) = @_;
    system( 'mv', $source, $destination );
    return;
};

# We rm the first db to not overload user disk space
my $db_revision_0 = "r_0/db";
$out = `rm -r $db_revision_0`;
print "rm -r $db_revision_0 : $out\n";
$move_path->( 'fasta/', 'Fasta_files/' );

# We put all results from Hmmsearch and BLAST files in a separate directory.
# NOTE: $out_hmmsearch_bis is not moved (that move was already commented out).
my $store_database_comparison = "Tab_files";
mkpath($store_database_comparison);
for my $search_output (
    $out_hmmsearch,       $out_blast_unclustered,
    $out_hmmsearch_pfama, $out_hmmsearch_pfama_bis,
    $out_hmmsearch_pfamb, $out_hmmsearch_pfamb_bis,
  )
{
    $move_path->( $search_output, "$store_database_comparison/" );
}

# Logs go to the log directory; the two VirSorter logs get their final names.
$move_path->( 'error.log',    $log_dir );
$move_path->( 'formatdb.log', $log_dir );
my $final_error_log = catfile( $log_dir, 'Virsorter_stderr_log' );
$move_path->( 'log_err', $final_error_log );

# The error log used to be filtered with sed to strip a BioPerl
# "uninitialized value" warning. This is not needed anymore: the warning was
# caused by a seq object without an id, which has been fixed.
my $final_out_log = catfile( $log_dir, 'Virsorter_stdout_log' );
$move_path->( 'log_out', $final_out_log );

# We put all the files linked to the metric computation in a new directory
my $store_metric_files = "Metric_files";
mkpath($store_metric_files);
$move_path->( $out_file_affi,
    "$store_metric_files/VIRSorter_affi-contigs.tab" );
my $out_file_affi_ref = $code_dataset . "_affi-contigs.refs";
$move_path->( $out_file_affi_ref, "$store_metric_files/" );
$move_path->( $out_file_phage_fragments,
    "$store_metric_files/VIRSorter_phage_signal.tab" );
if ( -e $new_prots_to_cluster ) {
    $move_path->( $new_prots_to_cluster, "$store_metric_files/" );
}

# And we customize and add the readme file in the output directory
my $datestring        = localtime();
my $local_readme_file = "Readme.txt";
open my $s1, '>', $local_readme_file
    or die "Cannot write $local_readme_file: $!\n";
print $s1 "VirSorter parameters used :\n\n";
print $s1 "--> Fasta file mined for viral sequences : $original_fna_file\n";
print $s1 "--> Viral database used : ";
if ( $choice_database == 2 ) {
    print $s1 "Viromes : all bacterial and archaeal virus genomes in Refseq, "
        . "as of January 2014, plus non-redundant predicted genes from viral "
        . "metagenomes (including seawater, freshwater, and human-related "
        . "samples)\n";
}
else {
    print $s1 "RefseqABVir (all bacterial and archaeal virus genomes in "
        . "Refseq, as of January 2014)\n";
}
if ( $custom_phage eq "" ) {
    print $s1 "--> No custom reference sequence was added to the database\n";
}
else {
    print $s1 "--> Custom reference sequences from fasta file $custom_phage "
        . "were added to the database\n";
}
if ( $tag_virome == 1 ) {
    # No trailing newline here: the next print starts with "\n".
    print $s1 "VirSorter was run in the 'Virome Decontamination' mode: "
        . "overall metrics for microbial sequences were not evaluated from "
        . "the complete dataset, but instead pre-computed values based on "
        . "bacterial and archaeal genomes from Refseq were used.";
}
print $s1 "\nThis VirSorter computation finished on $datestring\n";

# Append the generic readme after the run-specific header. This is done in
# Perl rather than with `cat ... >> ...` so no path goes through the shell.
# An unreadable generic readme is reported but does not abort the run.
if ( open my $generic_readme, '<', $readme_file ) {
    while ( my $readme_line = <$generic_readme> ) {
        print {$s1} $readme_line;
    }
    close $generic_readme;
}
else {
    warn "Cannot read $readme_file: $!\n";
}
close $s1 or die "Cannot finish writing $local_readme_file: $!\n";