diff --git a/virsorter/.gitignore b/virsorter/.gitignore
new file mode 100644
index 0000000..ba077a4
--- /dev/null
+++ b/virsorter/.gitignore
@@ -0,0 +1 @@
+bin
diff --git a/virsorter/Dockerfile b/virsorter/Dockerfile
new file mode 100644
index 0000000..20d2a1f
--- /dev/null
+++ b/virsorter/Dockerfile
@@ -0,0 +1,21 @@
+# Image for the VirSorter wrapper script and its helper scripts/binaries.
+# NOTE(review): "latest" is a moving tag; consider pinning a Perl version
+# so that rebuilds are reproducible.
+FROM perl:latest
+
+# MAINTAINER is deprecated in Dockerfiles; the LABEL form carries the same
+# information and is visible through "docker inspect".
+LABEL maintainer="Ken Youens-Clark <kyclark@email.arizona.edu>"
+
+# libdb-dev is presumably needed to build one of the Perl modules installed
+# below (TODO confirm). The apt package lists are removed in the same layer
+# so they do not end up in the image.
+RUN apt-get update && apt-get install libdb-dev -y && rm -rf /var/lib/apt/lists/*
+
+# --force installs the modules even when their test suites fail.
+RUN cpanm --force Capture::Tiny
+
+RUN cpanm --force BioPerl
+
+RUN cpanm File::Which
+
+COPY wrapper_phage_contigs_sorter_iPlant.pl /usr/local/bin/
+
+COPY Scripts /usr/local/bin/Scripts/
+
+# Third-party executables (see the README "Dependencies" section) are
+# expected in a local "bin" directory at build time.
+COPY bin /usr/local/bin/
+
+ENTRYPOINT ["wrapper_phage_contigs_sorter_iPlant.pl"]
+
+# Default argument when the container is started without any: show the help.
+CMD ["-h"]
diff --git a/virsorter/LICENSE b/virsorter/LICENSE
new file mode 100644
index 0000000..d6a9326
--- /dev/null
+++ b/virsorter/LICENSE
@@ -0,0 +1,340 @@
+GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc., <http://fsf.org/>
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                            NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    {description}
+    Copyright (C) {year}  {fullname}
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  {signature of Ty Coon}, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
+
diff --git a/virsorter/README.md b/virsorter/README.md
new file mode 100644
index 0000000..779b123
--- /dev/null
+++ b/virsorter/README.md
@@ -0,0 +1,64 @@
+# VirSorter
+
+Source code of the VirSorter App, available on iPlant (https://de.iplantcollaborative.org/de/)
+
+# Dependencies
+
+Install the following into a "bin" directory:
+
+* HMMER (http://hmmer.janelia.org/)
+* MCL (http://micans.org/mcl/)
+* Metagene Annotator (http://metagene.nig.ac.jp/metagene/download_mga.html)
+* MUSCLE (http://www.drive5.com/muscle/)
+* BLAST (ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/LATEST/, not BLAST+)
+
+# Docker
+
+## Data Container
+
+The 12 GB of data that VirSorter depends on is kept in a separate data
+container called "virsorter-data".
+
+This is the Dockerfile for that:
+
+    FROM perl:latest
+
+    MAINTAINER Ken Youens-Clark <kyclark@email.arizona.edu>
+
+    COPY Generic_ref_file.refs /data/
+
+    COPY PFAM_27 /data/PFAM_27
+
+    COPY Phage_gene_catalog /data/Phage_gene_catalog
+
+    COPY Phage_gene_catalog_plus_viromes /data/Phage_gene_catalog_plus_viromes
+
+    COPY SUP05_SAGs_with_viruses.fna /data/
+
+    COPY VirSorter_Readme.txt /data
+
+    COPY VirSorter_Readme_viromes.txt /data
+
+    VOLUME ["/data"]
+  
+Then do:
+
+    $ docker build -t kyclark/virsorter-data .
+    $ docker create --name virsorter-data kyclark/virsorter-data /bin/true
+
+## Build
+
+    $ docker build -t kyclark/virsorter .
+
+## Run
+
+A sample "run" command to use the current working directory for input/output:
+
+    $ docker run --rm --volumes-from virsorter-data -v $(pwd):/de-app-work \
+    -w /de-app-work kyclark/virsorter --fna Mic_1.fna --db 1
+
+# Authors
+
+Simon Roux <roux.8@osu.edu> is the author of VirSorter.
+
+Ken Youens-Clark <kyclark@email.arizona.edu> packaged this for Docker/iPlant.
diff --git a/virsorter/Scripts/Sliding_windows_3 b/virsorter/Scripts/Sliding_windows_3
new file mode 100755
index 0000000..5a432e4
Binary files /dev/null and b/virsorter/Scripts/Sliding_windows_3 differ
diff --git a/virsorter/Scripts/Sliding_windows_3.c b/virsorter/Scripts/Sliding_windows_3.c
new file mode 100755
index 0000000..cc18a06
--- /dev/null
+++ b/virsorter/Scripts/Sliding_windows_3.c
@@ -0,0 +1,259 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/*
+ * Returns n! as a long double.
+ * The product is accumulated from n down to 1; factorial(0) is 1.
+ */
+long double factorial(unsigned n){
+	long double product=1;
+	unsigned term;
+	for (term=n;term>0;term--){
+		product*=term;
+	}
+	return product;
+}
+
+/*
+ * Binomial coefficient "n choose k" computed from three full factorials:
+ * n! / (k! * (n-k)!).
+ * Note the argument order: k comes first, then n.
+ */
+long double combination(unsigned k,unsigned n){
+	long double numerator=factorial(n);
+	long double denominator=factorial(k) * factorial(n-k);
+	return numerator / denominator;
+}
+
+
+/*
+ * Binomial coefficient "n choose k" (argument order: k, then n), computed
+ * without the full n! : the larger of k and n-k is cancelled out of the
+ * numerator, which is then divided by the factorial of the smaller one.
+ *
+ * Returns 0 when k > n (there is no way to choose more items than exist).
+ * Without this guard the unsigned subtraction n-k wraps around and
+ * factorial() loops about four billion times before the division by
+ * infinity finally yields the same 0.
+ */
+long double combination_eff(unsigned k,unsigned n){
+	if (k>n){return 0;}
+	// C(n,k) == C(n,n-k): keep the larger of the two so the loop below is short
+	if (k<(n/2)){k=n-k;}
+	long double num=1;
+	unsigned n_2=n;
+	// num = n * (n-1) * ... * (k+1)
+	while (n_2>k){num*=n_2--;}
+	long double f= num / factorial(n-k);
+	return f;
+}
+
+/*
+ * Binomial probability of exactly k successes among n trials, each trial
+ * succeeding with probability proba:
+ *   C(n,k) * proba^k * (1-proba)^(n-k)
+ */
+long double proba_n(unsigned n,unsigned k, long double proba){
+	long double ways=combination_eff(k,n);
+	long double p_successes=powl(proba,k);
+	long double p_failures=powl((1-proba),(n-k));
+	return ways * p_successes * p_failures;
+}
+
+
+/*
+ * Upper binomial tail: sum of the probabilities of k, k+1, ..., n successes
+ * among n trials of probability proba (the bound k itself is included).
+ * Returns 0 when k > n, since the sum is then empty.
+ */
+long double proba_more_than(int n,int k, long double proba){
+	long double tail=0.0;
+	int successes;
+	for (successes=k;successes<=n;successes++){
+		tail+=combination_eff(successes,n) * powl(proba,successes) * powl((1-proba),(n-successes));
+	}
+	return tail;
+}
+
+
+/*
+ * Lower binomial tail: sum of the probabilities of k, k-1, ..., 0 successes
+ * among n trials of probability proba (the bound k itself is included).
+ * Returns 0 when k is negative, since the sum is then empty.
+ */
+long double proba_less_than(int n,int k, long double proba){
+	long double tail=0.0;
+	int successes;
+	for (successes=k;successes>=0;successes--){
+		tail+=combination_eff(successes,n) * powl(proba,successes) * powl((1-proba),(n-successes));
+	}
+	return tail;
+}
+
+/*
+ * Threshold for an "enriched" window of size_window genes.
+ * Starting from size_window and walking downwards, the probabilities of
+ * observing exactly that many genes are accumulated; the walk stops as soon
+ * as the accumulated probability exceeds threshold (or the count reaches 0).
+ * The count reached at that point is returned.
+ */
+int get_th(int size_window,long double threshold, long double proba){
+	int count=size_window+1;
+	long double cumulated=0.0;
+	while(count>0 && cumulated<=threshold){
+		count--;
+		cumulated+=proba_n(size_window,count,proba);
+	}
+	return count;
+}
+
+
+/*
+ * Threshold for a "depleted" window of size_window genes.
+ * Starting from 0 and walking upwards, the probabilities of observing
+ * exactly that many genes are accumulated; the walk stops as soon as the
+ * accumulated probability exceeds threshold (or the count reaches
+ * size_window). The count reached at that point is returned.
+ */
+int get_th_less(int size_window,long double threshold, long double proba){
+	int count=-1;
+	long double cumulated=0.0;
+	while(count<size_window && cumulated<=threshold){
+		count++;
+		cumulated+=proba_n(size_window,count,proba);
+	}
+	return count;
+}
+
+
+/*
+ * Tells whether store[start][size][type] is a local maximum of the score
+ * table, i.e. whether no cell within 5 rows and 5 columns of it holds a
+ * strictly greater value for the same type.
+ *
+ *   start, size : row (first gene) and column (window size) of the candidate
+ *   type        : index of the metric to compare
+ *   p_nb_genes  : rows >= p_nb_genes are ignored
+ *   p_max       : columns > p_max are ignored
+ *   store       : score table, indexed [row][column][type]
+ *
+ * Returns 1 for a local maximum (ties count as maxima), 0 otherwise.
+ */
+int is_local_maximum(int start,int size,int type, int p_nb_genes, int p_max,double ***store){
+	const int hood=5; // half-width of the neighbourhood, in both directions
+	int first_row=start-hood,last_row=start+hood;
+	int first_col=size-hood,last_col=size+hood;
+	int row,col;
+	// Restrict the neighbourhood to the cells that exist in the table
+	if (first_row<0){first_row=0;}
+	if (last_row>p_nb_genes-1){last_row=p_nb_genes-1;}
+	if (first_col<0){first_col=0;}
+	if (last_col>p_max){last_col=p_max;}
+	for (row=first_row;row<=last_row;row++){
+		for (col=first_col;col<=last_col;col++){
+			if (store[row][col][type]>store[start][size][type]){
+				return 0; // one better neighbour is enough
+			}
+		}
+	}
+	return 1;
+}
+
+
+/*
+ * Base-10 logarithm of x in long double precision.
+ * Uses log10l() directly: going through the double-precision log() twice,
+ * as log(x)/log(10), discards the extra precision of the argument and adds
+ * a rounding error of its own (log(1000)/log(10) is not exactly 3).
+ */
+long double log10perso(long double x){
+	return log10l(x);
+}
+
+
+/*
+ * Scans the genes of one contig with sliding windows and reports the
+ * windows whose gene content is statistically unusual.
+ *
+ * Arguments:
+ *   argv[1]  reference file: records of 6 numbers (phage, pfam, unch and
+ *            strand probabilities, a size value, noncaudo probability);
+ *            the values of the last record read are used
+ *   argv[2]  input file: the number of genes, then one record of 7 integers
+ *            per gene (phage, noncaudo, pfam, unch, size, strand, hallmark)
+ *   argv[3]  output file: one tab-separated line per significant local
+ *            maximum (start gene, window size, metric, score, nb hallmark)
+ *
+ * Window sizes go from 10 to 100 genes (both capped to the number of
+ * genes). Metrics: 0 phage, 1 pfam, 2 unch, 3 size, 4 strand, 5 noncaudo.
+ *
+ * Exits with status 1 on a usage error, an unreadable file, an invalid gene
+ * count, a mismatch between the announced and actual number of gene records,
+ * or an allocation failure. Returns 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+	// argv[1..3] are all required
+	if (argc<4){
+		fprintf(stderr, "Usage: %s ref_file input_file output_file\n",(argc>0) ? argv[0] : "Sliding_windows_3");
+		exit(1);
+	}
+	FILE *ifp, *reffile;
+	char* refFilename=argv[1];char* inputFilename=argv[2];char* outputFilename=argv[3];
+	reffile=fopen(refFilename,"r");
+	int nb_genes=0,phage=0,pfam=0,unch=0,size=0,strand=0,hallmark=0,i=0,noncaudo=0;
+	float f_size=0.0;
+	long double p_phage=0.0,p_pfam=0.0,p_unch=0.0,p_strand=0.0,p_noncaudo=0.0;
+	if (reffile == NULL) {
+		fprintf(stderr, "Can't open input file %s\n",refFilename);
+		exit(1);
+	}
+	// Keep reading records until one fails to parse; the variables end up holding the last values scanned
+	while (fscanf(reffile,"%Lf %Lf %Lf %Lf %f %Lf", &p_phage, &p_pfam, &p_unch, &p_strand, &f_size, &p_noncaudo) == 6) {}
+	printf("refs => %LE %LE %LE %LE %f %LE\n", p_phage, p_pfam, p_unch, p_strand, f_size, p_noncaudo);
+	fclose(reffile);
+	ifp = fopen(inputFilename, "r");
+	if (ifp == NULL) {
+		fprintf(stderr, "Can't open input file %s!\n",inputFilename);
+		exit(1);
+	}
+	// The gene count sizes every table below, so it must be present and positive
+	if (fscanf(ifp, "%d", &nb_genes) != 1 || nb_genes<1){
+		fprintf(stderr, "Can't read a positive number of genes from %s\n",inputFilename);
+		fclose(ifp);
+		exit(1);
+	}
+	// Alloc memory for gene tables
+	int t_phage[nb_genes],t_pfam[nb_genes],t_unch[nb_genes], t_size[nb_genes],t_strand[nb_genes],t_hallmark[nb_genes],t_noncaudo[nb_genes];
+	while (fscanf(ifp,"%d %d %d %d %d %d %d", &phage, &noncaudo, &pfam, &unch, &size, &strand, &hallmark) == 7) {
+		// Records beyond the announced count are only counted (never stored past
+		// the end of the tables); the mismatch is reported right after the loop
+		if (i<nb_genes){
+			t_phage[i]=phage;
+			t_noncaudo[i]=noncaudo;
+			t_pfam[i]=pfam;
+			t_unch[i]=unch;
+			t_size[i]=size;
+			t_strand[i]=strand;
+			t_hallmark[i]=hallmark;
+		}
+		i++;
+	}
+	fclose(ifp);
+	if (nb_genes!=i){
+		printf("Houston we got a problem !!!!!! : we had %d genes and we count %d lines\n",nb_genes,i);
+		exit(1);
+	}
+	// set up sliding windows
+	int min=10,max=100;
+	if (min>nb_genes){min=nb_genes;}
+	if (max>nb_genes){max=nb_genes;}
+	// how many sliding windows will we have ?
+	int k=0,j=0,pred_nb_s_w=0;
+	for (k=min;k<=max;k++){
+		pred_nb_s_w+=nb_genes-k+1;
+	}
+	// significance level for one window: 0.01 divided by the number of windows tested
+	long double th=0.01/pred_nb_s_w;
+	// alloc memory for score matrix for the 6 metrics
+	// The second index is the window size itself (0..max), hence max+1 slots per gene
+	double ***store=malloc(nb_genes*sizeof(double **));
+	if (store==NULL){printf("out of memory\n");exit(1);}
+	for(i=0; i < nb_genes; i++){
+		store[i] = malloc((max+1) * sizeof(double *));
+		if(store[i] == NULL){printf("out of memory\n");exit(1);}
+		for (j=0;j<=max;j++){
+			store[i][j] = malloc(6 * sizeof(double ));
+			if(store[i][j] == NULL){printf("out of memory\n");exit(1);}
+			for (k=0;k<6;k++){store[i][j][k]=0;}
+		}
+	}
+	// Number of hallmark genes of each scored window, indexed like store ([start][window size]).
+	// Heap-allocated (it can be too large for the stack) and zero-initialised.
+	int (*store_h)[max+1]=calloc(nb_genes,sizeof *store_h);
+	if (store_h==NULL){printf("out of memory\n");exit(1);}
+	int n_phage=0,n_pfam=0,n_short=0,n_switch=0,n_unch=0,n_hallmark=0,n_noncaudo=0;
+	printf("For this contig we'll have %d sliding windows (= nb of comparison)\n",pred_nb_s_w);
+	for (k=max;k>=min;k--){
+		int th_phage=k,th_pfam=k,th_size=k,th_unch=k,th_strand=k,th_noncaudo=k;
+		// we get all thresholds for this window size
+		th_phage=get_th(k,th,p_phage);
+		th_pfam=get_th_less(k,th,p_pfam);
+		th_unch=get_th(k,th,p_unch);
+		th_size=get_th(k,th,0.1);
+		th_strand=get_th_less(k,th,p_strand);
+		th_noncaudo=get_th(k,th,p_noncaudo);
+		// For all the sliding windows of this size, we count and compute and store the significativity value if > sig
+		for (i=0;i<(nb_genes-k+1);i++){
+			n_phage=0;n_pfam=0;n_unch=0;n_short=0;n_switch=0;n_hallmark=0;n_noncaudo=0;
+			// Counting
+			for (j=i;j<(i+k);j++){
+				n_phage+=t_phage[j];
+				n_pfam+=t_pfam[j];
+				n_unch+=t_unch[j];
+				n_short+=t_size[j];
+				n_switch+=t_strand[j];
+				n_hallmark+=t_hallmark[j];
+				n_noncaudo+=t_noncaudo[j];
+			}
+			unsigned tag=0; // set to 1 as soon as one metric is significant for this window
+			// Each score is -log10(tail probability * number of windows)
+			if (n_phage>th_phage){
+				store[i][k][0]=-1*log10(proba_more_than(k,n_phage,p_phage)*pred_nb_s_w);tag=1;
+			}
+			if (n_pfam<th_pfam){
+				store[i][k][1]=-1*log10(proba_less_than(k,n_pfam,p_pfam)*pred_nb_s_w);tag=1;
+			}
+			if (n_unch>th_unch){
+				store[i][k][2]=-1*log10(proba_more_than(k,n_unch,p_unch)*pred_nb_s_w);tag=1;
+			}
+			if (n_short>th_size){
+				store[i][k][3]=-1*log10(proba_more_than(k,n_short,0.1)*pred_nb_s_w);tag=1;
+			}
+			if (n_switch<th_strand){
+				store[i][k][4]=-1*log10(proba_less_than(k,n_switch,p_strand)*pred_nb_s_w);tag=1;
+			}
+			if (n_noncaudo>th_noncaudo){
+				store[i][k][5]=-1*log10(proba_more_than(k,n_noncaudo,p_noncaudo)*pred_nb_s_w);tag=1;
+			}
+			if (tag==1){store_h[i][k]=n_hallmark;}
+		}
+	}
+	// We look for local maxima and export the results
+	FILE *ofp;
+	ofp = fopen(outputFilename, "w");
+	if (ofp == NULL) {
+		fprintf(stderr, "Can't open output file %s!\n",outputFilename);
+		exit(1);
+	}
+	for (k=max;k>=min;k--){
+		for (i=0;i<(nb_genes-k+1);i++){
+			for (j=0;j<6;j++){
+				if (store[i][k][j] != 0.0){ // the stored value is not null
+					if (is_local_maximum(i,k,j,nb_genes-1,max,store)==1){ // and is a local maxima
+						// i - start gene / k - sliding window size / j - proof type (0 - phage / 1 - pfam / 2 - unch / 3 - size / 4 - strand / 5 - noncaudo)
+						fprintf(ofp, "%d\t%d\t%d\t%.14lf\t%d\n",i,k,j,store[i][k][j],store_h[i][k]);
+					}
+				}
+			}
+		}
+	}
+	fclose(ofp);
+	printf("done");
+	// The score tables are not freed: the process ends here
+	return 0;
+}
diff --git a/virsorter/Scripts/Step_0_make_new_clusters.pl b/virsorter/Scripts/Step_0_make_new_clusters.pl
new file mode 100755
index 0000000..1982cb7
--- /dev/null
+++ b/virsorter/Scripts/Step_0_make_new_clusters.pl
@@ -0,0 +1,257 @@
+#!/usr/bin/env perl
+
+use strict;
+use autodie;
+use File::Spec::Functions;
+use File::Path 'mkpath';
+use File::Which 'which';
+
+# Script to generate a new db with putative new clusters
+# Argument 0 : revision directory
+# Argument 1 : Fasta file of the predicted proteins
+# Argument 2 : Fasta file of the unclustered from previous Runs
+# Argument 3 : Liste of prots to try to cluster
+if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq "--help") || (!defined($ARGV[3])))
+{
+	print "# Script to generate a new db with putative new clusters
+# Argument 0 : revision directory
+# Argument 1 : Fasta file of the predicted proteins
+# Argument 2 : Fasta file of the unclustered from previous Runs
+# Argument 3 : Liste of prots to try to cluster\n";
+	die "\n";
+}
+
+my $path_to_blastall = which("blastall");
+my $MCX_LOAD         = which("mcxload");
+my $MCL              = which("mcl");
+
+my $r_dir=$ARGV[0];
+$r_dir=~/(r_\d*)\/?$/;
+my $r_number=$1;
+print "Revision $r_number\n";
+my $fasta_prot_contigs=$ARGV[1];
+my $fasta_prot_unclustered=$ARGV[2];
+my $liste=$ARGV[3];
+my $blast_unclustered=$fasta_prot_unclustered;
+$blast_unclustered=~s/Pool_unclustered.faa/Blast_unclustered.tab/;
+
+my $path_to_formatdb = which("formatdb");
+my $path_to_blastal = which("blastall");
+
+my $min_seq_in_a_cluster=3;
+
+my $path_to_muscle= which("muscle");
+my $path_to_hmmbuild= which("hmmbuild");
+my $path_to_hmmpress= which("hmmpress");
+
+
+my %check;
+open(LI,"<$liste") || die ("pblm opening liste $liste\n");
+while (<LI>){
+	chomp($_);
+	my @tab=split(",",$_);
+	foreach(@tab){$check{$_}=1;}
+}
+close LI;
+
+my $pool_new= catfile($r_dir, "pool_new_proteins.fasta");
+open(S1,">$pool_new") || die ("pblm opening file $pool_new");
+open(FA,"<$fasta_prot_contigs") || die ("pblm opening file $fasta_prot_contigs");
+my $tag=0;
+while (<FA>){
+	chomp($_);
+	if ($_=~/^>(.*)/){
+		my $seq=$1;
+		$tag=0;
+		if ($check{$seq}==1){
+			print S1 "$_\n";
+			$tag=1;
+		}
+	}
+	elsif($tag==1){
+		print S1 "$_\n";
+	}
+}
+close FA;
+close S1;
+
+my $db= catfile($r_dir, "pool_new_proteins"); # BLAST database name, built from the new proteins only
+my $cmd_format="$path_to_formatdb -i $pool_new -n $db";
+print "$cmd_format\n";
+my $out=`$cmd_format`;
+print "Formatdb : $out\n";
+my $cmd_cat="cat $fasta_prot_unclustered >> $pool_new"; # the query pool becomes new + previously unclustered proteins
+print "$cmd_cat\n";
+$out=`$cmd_cat`;
+print "Cat : $out\n"; # log the command output (the command line itself was already printed above)
+# BLAST of the unclustered and the new proteins against the new proteins
+my $out_blast=catfile($r_dir, "pool_unclustered-and-new-proteins-vs-new-proteins.tab");
+my $cmd_blast="$path_to_blastall -p blastp -i $pool_new -d $db -o $out_blast -m 8 -a 10 -e 0.00001"; # On 10 cores to keep a few alive for the rest of the scripts
+print "$cmd_blast\n";
+$out=`$cmd_blast`;
+print "Blast : $out\n";
+$cmd_cat="cat $blast_unclustered >> $out_blast"; # append the BLAST table from the previous runs
+print "$cmd_cat\n";
+$out=`$cmd_cat`;
+print "Cat : $out\n";
+print "Generating abc file\n";
+my $out_abc= catfile($r_dir, "new_clusters.abc");
+my $th_score=50;
+my $th_evalue=0.00001;
+my $max=200; # Max on sig
+open(S1,">$out_abc") || die ("pblm opening file $out_abc\n");
+open(BL,"<$out_blast") || die ("pblm opening file $out_blast\n");
+while(<BL>){
+	chomp($_);
+	my @tab=split("\t",$_);
+	if ($tab[11]>$th_score && $tab[10]<$th_evalue && $tab[0] ne $tab[1]){
+		my $evalue=$tab[10];
+# 		$evalue=-log10($evalue);
+# 		if ($evalue>$max){$evalue=$max;}
+		print S1 "$tab[0]\t$tab[1]\t$evalue\n";
+	}
+}
+close BL;
+close S1;
+my $out_mci=catfile($r_dir, "new_clusters.mci");
+my $out_tab=catfile($r_dir, "new_clusters.tab");
+my $cmd_mcxload="$MCX_LOAD -abc $out_abc --stream-mirror --stream-neg-log10 -stream-tf 'ceil(200)' -o $out_mci -write-tab $out_tab";
+print "$cmd_mcxload\n";
+$out=`$cmd_mcxload`;
+print "Mxc Load : $out\n";
+my $dump_file=catfile($r_dir, "new_clusters.csv");
+my $cmd_mcl="$MCL $out_mci -I 2  -use-tab $out_tab -o $dump_file";
+print "$cmd_mcl\n";
+$out=`$cmd_mcl`;
+print "Mcl : $out\n";
+
+my %unclustered;
+my %clusters;
+my %check_cluster;
+my $last_cluster_id=0;
+# Every group holding at least $min_seq_in_a_cluster sequences becomes a new cluster; all other sequences are kept as unclustered
+open(DUMP,"<$dump_file") || die "pblm ouverture fichier $dump_file\n";
+while(<DUMP>){
+	chomp($_);
+	my @tab=split("\t",$_);
+	my $n_s_c=$#tab+1;
+	if ($n_s_c>=$min_seq_in_a_cluster){
+		# we found a cluster that is large enough: give it the next id, suffixed with the revision number
+		my $cluster_id=$last_cluster_id+1;
+		$cluster_id="Phage_cluster_".$cluster_id."-".$r_number;
+		print "We found a cluster with $n_s_c sequences => Cluster $cluster_id\n";
+		$last_cluster_id++;
+		foreach(@tab){
+			$clusters{$cluster_id}{$_}=1;
+			$check_cluster{$_}=1;
+		}
+	}
+	else{
+		foreach(@tab){
+			$unclustered{$_}=1;
+			$check_cluster{$_}=1;
+		}
+	}
+}
+close DUMP;
+my %seq_temp;
+my $id_c="";
+open(FA,"<$pool_new") || die "pblm ouverture fichier $pool_new\n";
+while(<FA>){
+	chomp($_);
+	if ($_=~/^>(\S*)/){
+		$id_c=$1;
+		if (!defined($check_cluster{$id_c})){$unclustered{$id_c}=1;$check_cluster{$id_c}=1;}
+	}
+	else{$seq_temp{$id_c}.=$_;}
+}
+close FA;
+
+mkpath(catdir($r_dir, 'clusts')); # same helper as for the db directory: no shell involved, and no error if the directory already exists
+foreach(keys %clusters){
+	my $cluster_id=$_;
+	my $out_file=catfile($r_dir, "clusts", $cluster_id . ".faa"); # one fasta file per new cluster
+	open(S1,">$out_file") || die "pblm ouverture fichier $out_file\n";
+	foreach(keys %{$clusters{$cluster_id}}){
+		print S1 ">$_\n$seq_temp{$_}\n";
+	}
+	close S1;
+}
+
+mkpath(catdir($r_dir, 'db'));
+my $pool_unclustered=catfile($r_dir, "db", "Pool_unclustered.faa");
+my $blast_unclustered=catfile($r_dir, "db", "Blast_unclustered.tab");
+my $pool_new_unclustered=catfile($r_dir, "db", "Pool_new_unclustered.faa");
+my $blastable_new_unclustered=$pool_new_unclustered;
+$blastable_new_unclustered=~s/\.faa//;
+
+open(S2,">$pool_new_unclustered") || die "pblm ouverture fichier $pool_new_unclustered\n";
+open(S1,">$pool_unclustered") || die "pblm ouverture fichier $pool_unclustered\n";
+foreach(keys %unclustered){
+	print S1 ">$_\n$seq_temp{$_}\n";
+	if ($check{$_}==1){
+		print S2 ">$_\n$seq_temp{$_}\n";
+	}
+}
+close S1;
+close S2;
+print "making a blastable db from the new unclustered\n";
+$out=`$path_to_formatdb -i $pool_new_unclustered -n $blastable_new_unclustered`;
+# on réduit aussi le fichier blast qu'on ajoute au blast des unclustered
+open(BL,"<$out_blast") || die "pblm ouverture fichier $out_blast\n";
+open(S1,">$blast_unclustered") || die "pblm ouverture fichier $blast_unclustered\n";
+while(<BL>){
+	chomp($_);
+	my @tab=split("\t",$_);
+	if ($unclustered{$tab[0]}==1 && $unclustered{$tab[1]}==1){
+		print S1 "$_\n";
+	}
+}
+close BL;
+close S1;
+
+my $tag=0;
+foreach(sort keys %clusters){
+	$tag=1;
+	my $ali_id=$_;
+	# For each new cluster: align its fasta with muscle, rewrite the alignment as Stockholm, then build the HMM profile
+	my $path_to_fasta=catfile($r_dir, "clusts", $ali_id . ".faa");
+	my $path_to_ali=catfile($r_dir, "clusts", $ali_id . ".ali_faa");
+	my $path_to_hmm=catfile($r_dir, "clusts", $ali_id . "_ali.hmm");
+	if (-e $path_to_ali){
+		`rm $path_to_ali $path_to_hmm`;
+	}
+	my $muscle_out= catfile($r_dir, "log_out_muscle");
+	my $muscle_err= catfile($r_dir, "log_err_muscle");
+	`$path_to_muscle -in $path_to_fasta -out $path_to_ali > $muscle_out 2> $muscle_err`;
+	my $out_stokcholm=$path_to_ali.".stockholm";
+	open(S1,">$out_stokcholm") || die "pblm opening $out_stokcholm\n";
+	print S1 "# STOCKHOLM 1.0\n";
+	open(FA,"<$path_to_ali") || die "pblm ouverture $path_to_ali\n";
+	while(<FA>){
+		chomp($_);
+		if ($_=~/^>(.*)/){
+			my $id=$1;
+			$id=~s/\s/_/g; # Stockholm sequence names cannot contain whitespace
+			print S1 "\n$id  ";
+		}
+		else{print S1 "$_";} # sequence lines are concatenated after the name
+	}
+	close FA;
+	print S1 "\n//\n";
+	close S1; # make sure the alignment is completely written before hmmbuild reads it
+	`$path_to_hmmbuild --amino $path_to_hmm $out_stokcholm`;
+}
+
+my @tab_hmm=<$r_dir/clusts/*.hmm>;
+if ($#tab_hmm>=0){
+	# we gather all hmm and fasta (if any)
+	$out=`cat $r_dir/clusts/*.hmm >> $r_dir/db/Pool_clusters.hmm`;
+	print "cat new hmm : $out\n";
+	# and create a new database for hmmsearch
+	$out=`$path_to_hmmpress $r_dir/db/Pool_clusters.hmm`;
+	print "hmm press :$out\n";
+}
+else{
+	$out=`touch $r_dir/db/Pool_clusters.hmm`;
+}
diff --git a/virsorter/Scripts/Step_1_contigs_cleaning_and_gene_prediction.pl b/virsorter/Scripts/Step_1_contigs_cleaning_and_gene_prediction.pl
new file mode 100755
index 0000000..98aac5d
--- /dev/null
+++ b/virsorter/Scripts/Step_1_contigs_cleaning_and_gene_prediction.pl
@@ -0,0 +1,356 @@
+#!/usr/bin/env perl
+
+use strict;
+use autodie;
+use Bio::Seq;
+use File::Spec::Functions;
+use File::Which 'which';
+
+# Script to detect circular contigs, nett sequences, and predict genes with mga
+# Argument 0 : Fasta file of contigs
+if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq "--help") || (!defined($ARGV[3])))
+{
+	print "# Script to detect circular contigs, nett sequences, and predict genes with mga
+# Argument 0 : Id du dataset
+# Argument 1 : Working dir
+# Argument 2 : Fasta file of contigs
+# Argument 3 : Threshold on the number of genes \n";
+	die "\n";
+}
+
+my $id                = $ARGV[0];
+my $tmp_dir           = $ARGV[1];
+my $fasta_contigs     = $ARGV[2];
+my $th_nb_genes       = $ARGV[3];
+my $path_to_mga       = which('mga_linux_ia64');
+my $in_file           = catfile($tmp_dir, $id . "_nett.fasta");
+my $circu_file        = catfile($tmp_dir, $id . "_circu.list");
+my $out_special_circu = catfile($tmp_dir, $id . "_contigs_circu_temp.fasta");
+
+# Reading fasta file of the contigs
+open my $fa, '<', $fasta_contigs;
+my %seq_base;
+my $id_seq="";
+while(<$fa>){
+	$_=~s/\r\n/\n/g; #Cas d'un fichier windows ##AJOUT
+	chomp($_);
+	if ($_=~/^>(\S*)/){$id_seq=$1;}
+	else{$seq_base{$id_seq}.=$_;}
+}
+close $fa;
+
+## DETECTION OF CIRCULAR CONTIG AND CLEANING OF THESE CIRCULAR (REMOVE THE MATCHING ENDS)
+my $minimum_size=1500;
+my %order_contig;
+my %length;
+my $n1=0;
+
+open my $s1, '>', $in_file;
+open my $s2, '>', $circu_file;
+for my $id_contig (
+    sort {length($seq_base{$b}) <=> length($seq_base{$a})} keys %seq_base){
+	my $s=$seq_base{$id_contig};
+	$length{$id_contig}=length($seq_base{$id_contig});
+	# Test its circularity: look for the first 10 bases again further in the sequence (prefix quoted so it is matched literally)
+	my $prefix=substr($seq_base{$id_contig},0,10);
+	if ($s=~/(.+)(\Q$prefix\E.*?)$/){
+# 		print "we found the prefix ($prefix) again further in the sequence of $_\n";
+		my $sequence=$1;
+		my $suffixe=$2;
+		my $test=substr($seq_base{$id_contig},0,length($suffixe));
+# 		print "$suffixe\n$test\n";
+		if ($suffixe eq $test){
+# 			print " and it matches $suffixe, so this is a circular contig\n";
+			my $l=$length{$id_contig};
+			$id_contig=$id_contig."-circular";
+			$length{$id_contig}=$l;
+			print $s2 "$id_contig\t$length{$id_contig}\n";
+			$seq_base{$id_contig}=$sequence; # keep the sequence without the repeated end
+		}
+	}
+	# Update the length of the contig
+	$length{$id_contig}=length($seq_base{$id_contig});
+	# Register the rank under the final id (possibly renamed "-circular"), which is the id used by every later sort
+	$order_contig{$id_contig}=$n1++;
+	print $s1 ">$id_contig\n$seq_base{$id_contig}\n";
+}
+close $s1;
+close $s2;
+
+# Gene prediction for all contigs
+my $out_file= $tmp_dir."/".$id."_mga.predict";
+print "$path_to_mga $in_file -m > $out_file\n";
+my $mga=`$path_to_mga $in_file -m > $out_file`;
+
+# Special prediction for circular contigs if we have some
+my $out_file_circu="";
+my %circu;
+if (-e $circu_file){
+	open my $tsv, '<', $circu_file;
+	while(<$tsv>){
+		chomp($_);
+		my @tab=split("\t",$_);
+		my $id_c=$tab[0];
+		$circu{$id_c}=1;
+	}
+	close $tsv;
+	open my $s3, '>', $out_special_circu;
+	my $long=1000; # we cp the 1000 first bases to the end of the contig
+	my $seuil_long=1000;
+	my $n_circu=0;
+	foreach(sort {$order_contig{$a} <=> $order_contig{$b} } keys %circu){
+		my $id_c=$_;
+		my $s=$seq_base{$id_c}.substr($seq_base{$id_c},0,$long);
+		print $s3 ">$id_c\n$s\n";
+		$n_circu++;
+	}
+	close $s3;
+	$out_file_circu= $tmp_dir."/".$id."_special_circus_mga.predict";
+	if ($n_circu>0){
+		my $mga=`$path_to_mga $out_special_circu -m > $out_file_circu`;
+	}
+	else{
+		`touch $out_file_circu`;
+	}
+}
+
+# Mix 'n match of the two results of gene prediction
+my %order_gene;
+my $n2=0;
+open my $fts, '<', $out_file;
+my %predict;
+my %type;
+my $id_c="";
+while(<$fts>){
+	chomp($_);
+	if($_=~/^# gc/){}
+	elsif($_=~/^# self: (.*)/){$type{$id_c}=$1;}
+	elsif ($_=~/^# (.*)/){
+		$id_c=$1;
+		$n2=0;
+	}
+	else{
+		my @tab=split("\t",$_);
+		$predict{$id_c}{$tab[0]}=$_;
+		if (!defined($order_gene{$id_c}{$tab[0]})){$order_gene{$id_c}{$tab[0]}=$n2;$n2++;}
+	}
+}
+close $fts;
+if (-e $circu_file){
+	open my $fts_c, '<', $out_file_circu; # predictions made on the circular contigs extended with their own start
+	my $tag=0; # set to 1 as soon as a prediction of the current contig was replaced or added
+	while(<$fts_c>){
+		chomp($_);
+		if($_=~/^# gc/){}
+		elsif($_=~/^# self: (.*)/){$type{$id_c}=$1;}
+		elsif ($_=~/^# (.*)/){
+			if($tag==1){
+				my %to_start;
+				# Some ORFs were modified, we clean up (genes are visited in their prediction order within the contig)
+				foreach(sort {$order_gene{$id_c}{$a} <=> $order_gene{$id_c}{$b} } keys %{$predict{$id_c}}){
+					my @tab=split("\t",$predict{$id_c}{$_});
+					if ($tab[5]!=11){
+						# $tab[0] miss start and/or stop codon
+						if(($tab[1]<3) || ($tab[2]>($length{$id_c}-3))){
+							# And it spans the origin, so we can remove it
+							if ($tab[1]<3){
+								$to_start{$tab[0]}{"start"}=$tab[1];
+								$to_start{$tab[0]}{"stop"}=$tab[2];
+							}
+							delete($predict{$id_c}{$tab[0]});
+						}
+						elsif(($tab[2]>997) && ($tab[2]<1001)){ # if we are around the zone of ~ 1000
+							foreach(keys %to_start){
+								my $total=($length{$id_c}-$tab[1]+1)+($to_start{$_}{"stop"});
+								if ($total % 3 == 0){
+									$tab[2]=$to_start{$_}{"stop"};
+									$tab[5]=11;
+									my $new_line=join("\t",@tab);
+									$predict{$id_c}{$tab[0]}=$new_line;
+								}
+							}
+						}
+					}
+				}
+			}
+			$id_c=$1;
+			$tag=0;
+		}
+		else{
+			my @tab=split("\t",$_);
+			if (defined($predict{$id_c}{$tab[0]})){
+				my @tab2=split("\t",$predict{$id_c}{$tab[0]});
+				if (($tab2[1]==$tab[1]) && ($tab2[2]==$tab[2])){}# same prediction, we don't change anything
+				else{
+					if (($tab[1]<$length{$id_c}) && ($tab[2]>$length{$id_c})){
+						# we span the origin, we replace the prediction
+						$tag=1;
+						my $stop=$tab[2]-$length{$id_c};
+						$tab[2]=$stop;
+						my $new_line=join("\t",@tab);
+						$predict{$id_c}{$tab[0]}=$new_line;
+					}
+				}
+			}
+			else{
+				# we predict a new gene, we keep only if at the start / end of the contig
+				if (($tab[1]<$length{$id_c}) && ($tab[2]>$length{$id_c})){
+					$tag=1;
+					my $stop=$tab[2]-$length{$id_c};
+					$tab[2]=$stop;
+					my $new_line=join("\t",@tab);
+					$predict{$id_c}{$tab[0]}=$new_line;
+					$tag=1;
+				}
+			}
+		}
+	}
+	if($tag==1){
+		my %to_start;
+		# we changed some things, we clean up (same clean-up as above, for the last contig of the file)
+		foreach(sort {$order_gene{$id_c}{$a} <=> $order_gene{$id_c}{$b} } keys %{$predict{$id_c}}){
+			my @tab=split("\t",$predict{$id_c}{$_});
+			if ($tab[5]!=11){
+				if(($tab[1]<3) || ($tab[2]>($length{$id_c}-3))){
+					if ($tab[1]<3){
+						$to_start{$tab[0]}{"start"}=$tab[1];
+						$to_start{$tab[0]}{"stop"}=$tab[2];
+					}
+					delete($predict{$id_c}{$tab[0]});
+				}
+				elsif(($tab[2]>997) && ($tab[2]<1001)){
+					foreach(keys %to_start){
+						my $total=($length{$id_c}-$tab[1]+1)+($to_start{$_}{"stop"});
+						if ($total % 3 == 0){
+							$tab[2]=$to_start{$_}{"stop"};
+							$tab[5]=11;
+							my $new_line=join("\t",@tab);
+							$predict{$id_c}{$tab[0]}=$new_line;
+						}
+					}
+				}
+			}
+		}
+	}
+	close $fts_c;
+}
+
+## Generation of the final files
+## One with all sequences nett and filtered (based on number of genes) - Fasta
+## One of the associated gene prediction - MGA-like
+## One of the predicted protein sequences - Fasta
+my $final_file=$tmp_dir."/".$id."_nett_filtered.fasta";
+my $out_final=$tmp_dir."/".$id."_mga_final.predict";
+my $prot_file=$tmp_dir."/".$id."_prots.fasta";
+
+open my $fa_s,  '>', $final_file;
+open my $out_s, '>', $out_final;
+open my $prot_s,'>', $prot_file;
+my $n=0;
+foreach(sort {$order_contig{$a} <=> $order_contig{$b} } keys %predict){
+	$n++;
+	if ($n % 10000 == 0){print "$n-ieme contig\n";}
+	my $id=$_;
+	my @tab_genes=sort {$order_gene{$id}{$a} <=> $order_gene{$id}{$b} } keys %{$predict{$id}};
+	my $n_complete_genes=0;
+	for (my $i=0;$i<=$#tab_genes;$i++){
+		my @tab=split("\t",$predict{$id}{$tab_genes[$i]});
+		if ($tab[5]!=11){}
+		else{$n_complete_genes++;}
+	}
+	if ($n_complete_genes<$th_nb_genes){
+# 		print "$id is excluded because too short ($n_complete_genes) \n";
+	}
+	else{
+		## We check the first gene and modify it if needed
+		my @tab_first=split("\t",$predict{$id}{$tab_genes[0]});
+		my @tab_second=split("\t",$predict{$id}{$tab_genes[1]});
+		$tab_first[0]=~/gene_(\d*)/;
+		my $n_1=$1;
+		$tab_second[0]=~/gene_(\d*)/;
+		my $n_2=$1;
+		if ($n_1>$n_2){
+			print "We probably have a circular contig ($id) as the first gene $tab_first[0] is beyond the second gene $tab_second[0] ($n_1>$n_2), so we switch $tab_first[0] ";
+			$tab_first[0]="gene_0";
+			print "to $tab_first[0]\n";
+			$predict{$id}{$tab_genes[0]}=join("\t",@tab_first);
+		}
+# 		if ($n_complete_genes<$th_nb_genes){print "$id is saved because of its circularity\n";}
+# 		else{print "We keep $id = $#tab_genes +1 genes\n";}
+		print $out_s ">$id\t$length{$id}\n";
+		print $fa_s ">$id\n";
+		my $seq_c=$seq_base{$id};
+		print $fa_s "$seq_c\n";
+		foreach(@tab_genes){
+			my @tab=split("\t",$predict{$id}{$_});
+			if ($tab[5]!=11){
+				# soit on est au début de séquence, soit en toute fin (théoriquement)
+				if ($tab[4]!=0){
+					if ($tab[3] eq "-"){
+						$tab[2]-=$tab[4];
+					}
+					elsif($tab[3] eq "+"){
+						$tab[1]+=$tab[4];
+					}
+					else{
+						print "%%%%%% pblm on a pas de sens pour $id : @tab\n";
+					}
+				}
+				my $new_line=join("\t",@tab);
+				$predict{$id}{$_}=$new_line;
+			}
+			print $out_s "$predict{$id}{$_}\n";
+			@tab=split("\t",$predict{$id}{$_});
+	                my $name  = $tab[0];
+	                my $start = $tab[1];
+	                my $stop  = $tab[2];
+	                my $sens  = $tab[3];
+	                my $frame = $tab[4];
+	                my $frag  = "";
+			# Regular case (not spanning the origin)
+			if ($start<$stop){
+				my $long=$stop-$start+1;
+				$frag=substr($seq_c,$start-1,$long);
+			}
+			# Exceptional case, we span the origin
+			else{
+				my $l1=length($seq_c)-$start+1;
+				$frag=substr($seq_c,$start-1,$l1);
+				$frag.=substr($seq_c,0,$stop);
+			}
+			## WE GET THE PREDICTED PROTEIN SEQUENCE
+			if ($frag eq ""){
+				print "!!!! FRAG IS $frag\n";
+			}
+			my $seq_bio = Bio::Seq->new(-id => "dummy_id" , -seq =>$frag, -alphabet => 'dna', -verbose => -1);
+			# Test to catch the Bio SeqUtils warning
+			my @seqs;
+			eval{
+				@seqs = Bio::SeqUtils->translate_6frames($seq_bio, -verbose => -1);
+			};
+			if ( $@ ){
+				print "We got the error $@\n";
+			}
+			#my @seqs = Bio::SeqUtils->translate_6frames($seq_bio, -verbose => -1);
+			# End of test
+			my $cadre=0;
+			if ($sens eq "-"){$cadre=3;}
+			my $prot=$seqs[$cadre];
+			my $prot_sequence=$prot->seq;
+			if ($prot_sequence=~/\*$/){
+				# we remove the stop codon
+				chop($prot_sequence);
+			}
+			my $id_out=$id."-".$name;
+			if (($prot_sequence=~/X{50,}/) || ($prot_sequence=~/F{50,}/) || ($prot_sequence=~/A{50,}/) || ($prot_sequence=~/K{50,}/) || ($prot_sequence=~/P{50,}/)){
+				print "we exclude $id_out because there is a pblm with the sequence -> too many succesive X, F, A, K or P\n";
+			}
+			else{
+				print $prot_s ">$id_out\n$prot_sequence\n";
+			}
+		}
+	}
+}
+close $fa_s;
+close $out_s;
+close $prot_s;
diff --git a/virsorter/Scripts/Step_2_merge_contigs_annotation.pl b/virsorter/Scripts/Step_2_merge_contigs_annotation.pl
new file mode 100755
index 0000000..f334619
--- /dev/null
+++ b/virsorter/Scripts/Step_2_merge_contigs_annotation.pl
@@ -0,0 +1,239 @@
+#!/usr/bin/env perl
+
+use strict;
+use autodie;
+
+# Script to generate the merged contig annotation (annotate each gene)
+# Argument 0 : MGA predict file
+# Argument 1 : HMMsearch vs Phage Clusters
+# Argument 2 : BLast vs unclustered
+# Argument 3 : HMMsearch vs PFAMa
+# Argument 4 : HMMsearch vs PFAMb
+# Argument 5 : Ref Phage Clusters
+# Argument 6 : Out file
+if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq "--help") || (!defined($ARGV[6])))
+{
+	print "# Script to generate the merged contig annotation (annotate each gene)
+# Argument 0 : MGA predict file
+# Argument 1 : HMMsearch vs Phage Clusters
+# Argument 2 : BLast vs unclustered
+# Argument 3 : HMMsearch vs PFAMa
+# Argument 4 : HMMsearch vs PFAMb
+# Argument 5 : Ref Phage Clusters
+# Argument 6 : Out file\n";
+	die "\n";
+}
+
+my $mga_file             = $ARGV[0];
+my $hmm_phage_clusters   = $ARGV[1];
+my $blast_vs_unclustered = $ARGV[2];
+my $hmm_pfama            = $ARGV[3];
+my $hmm_pfamb            = $ARGV[4];
+my $ref_phage_clusters   = $ARGV[5];
+my $out_file             = $ARGV[6];
+
+my $circu_file=$mga_file;
+$circu_file=~s/_mga_final.predict/_circu.list/;
+# Take list of circular files
+my %circu;
+open my $li, '<', $circu_file;
+while(<$li>){
+	chomp($_);
+	my @tab=split("\t",$_);
+	my $id_c=$tab[0];
+	$circu{$id_c}=1;
+}
+close $li;
+
+my $n2=0;
+my %size;
+my %order_gene;
+my %predict;
+my %type;
+my $id_c="";
+my @liste_contigs;
+# Read all gene predictions
+open my $fts, '<',  $mga_file;
+while(<$fts>){
+	chomp($_);
+	if ($_=~/^>(.*)/){
+		my @tab=split("\t",$1);
+		$id_c=$tab[0];
+		$size{$id_c}=$tab[1];
+		$n2=0;
+		push(@liste_contigs,$id_c);
+	}
+	else{
+		my @tab=split("\t",$_);
+		$predict{$id_c}{$tab[0]}=$_;
+		$order_gene{$id_c}{$tab[0]}=$n2;
+		$n2++;
+	}
+}
+close $fts;
+
+# first the BLAST vs unclustered , which annotation will eventually be erased by the HMM vs Phage cluster if any (that we trust more)
+my %affi_phage_cluster;
+my $score_blast_th=50;
+my $evalue_blast_th=0.001;
+open my $tsv, '<', $blast_vs_unclustered;
+while (<$tsv>){
+	chomp($_);
+	my @tab=split("\t",$_);
+	my $seq=$tab[0];
+	my $match=$tab[1];
+	$match=~s/\|/_/g;
+	my $evalue=$tab[10];
+	my $score=$tab[11];
+	if ($score>=$score_blast_th && $evalue<=$evalue_blast_th && (!defined($affi_phage_cluster{$seq}) || ($score>$affi_phage_cluster{$seq}{"score"})) && ($seq ne $match)){ ## We add the $seq ne $match so that we do not count a match to a phage sequence when it's only itself in the unclustered pool from a previous revision.
+		$affi_phage_cluster{$seq}{"score"}=$score;
+		$affi_phage_cluster{$seq}{"evalue"}=$evalue;
+		$affi_phage_cluster{$seq}{"match"}=$match;
+# 				print "$seq match $match\n";
+	}
+	
+}
+close $tsv;
+
+
+my $score_th=40;
+my $evalue_th=0.00001;
+
+# Then reading the annotation from the HMM vs Phage Cluster
+open my $tsv, '<', $hmm_phage_clusters;
+while(<$tsv>){
+	chomp($_);
+	if ($_=~m/^#/){
+		next;
+	}
+	else{
+		my @splign=split(m/\s+/,$_);
+		my $seq=$splign[0];
+		my $match=$splign[2];
+		$match=~s/\.ali_faa//g;
+		my $evalue=$splign[4];
+		my $score=$splign[5];
+		if ($score>=$score_th && $evalue<=$evalue_th && (!defined($affi_phage_cluster{$seq}) || ($score>$affi_phage_cluster{$seq}{"score"}))){
+			$affi_phage_cluster{$seq}{"score"}=$score;
+			$affi_phage_cluster{$seq}{"evalue"}=$evalue;
+			$affi_phage_cluster{$seq}{"match"}=$match;
+# 				print "$seq match $match\n";
+		}
+	}
+}
+close $tsv;
+
+# Then reading annotation from PFAM
+my %affi_pfam;
+open my $tsv, '<', $hmm_pfama;
+while(<$tsv>){
+	chomp($_);
+	if ($_=~m/^#/){
+		next;
+	}
+	else{
+		my @splign=split(m/\s+/,$_);
+		my $seq=$splign[0];
+		my $match=$splign[2];
+		my $evalue=$splign[4];
+		my $score=$splign[5];
+		if ($score>=$score_th && $evalue<=$evalue_th && (!defined($affi_pfam{$seq}) || ($score>$affi_pfam{$seq}{"score"}))){
+			$affi_pfam{$seq}{"score"}=$score;
+			$affi_pfam{$seq}{"evalue"}=$evalue;
+			$affi_pfam{$seq}{"match"}=$match;
+		}
+	}
+}
+close $tsv;
+
+open my $tsv, '<', $hmm_pfamb;
+while(<$tsv>){
+	chomp($_);
+	if ($_=~m/^#/){
+		next;
+	}
+	else{
+		my @splign=split(m/\s+/,$_);
+		my $seq=$splign[0];
+		my $match=$splign[2];
+		my $evalue=$splign[4];
+		my $score=$splign[5];
+		if ($score>=$score_th && $evalue<=$evalue_th && (!defined($affi_pfam{$seq}) || ($score>$affi_pfam{$seq}{"score"}))){
+			$affi_pfam{$seq}{"score"}=$score;
+			$affi_pfam{$seq}{"evalue"}=$evalue;
+			$affi_pfam{$seq}{"match"}=$match;
+		}
+	}
+}
+close $tsv;
+
+
+# We also read the annotation for each phage cluster, i.e. its category
+my %phage_cluster;
+open my $psv, '<', $ref_phage_clusters;
+while (<$psv>){
+	chomp($_);
+	my @tab=split(/\|/,$_);
+	$phage_cluster{$tab[0]}{"category"}=$tab[1];
+}
+close $psv;
+
+
+# Final output
+# >Contig,nb_genes,circularity
+# gene_id,start,stop,length,strand,affi_phage,score,evalue,category,affi_pfam,score,evalue,
+open my $s1, '>', $out_file;
+my $n=0;
+foreach(@liste_contigs){
+	$n++;
+	if ($n % 10000 == 0){print "$n-ieme contig\n";}
+	my $contig_c=$_;
+	my $circ="l";
+	if ($circu{$contig_c}==1){$circ="c";}
+	my @tab_genes=sort {$order_gene{$contig_c}{$a} <=> $order_gene{$contig_c}{$b} } keys %{$predict{$contig_c}};
+	my $n_g=$#tab_genes+1;
+	print $s1 ">$contig_c|$n_g|$circ\n";
+	foreach(@tab_genes){
+		my $g_c=$_;
+		my @tab=split("\t",$predict{$contig_c}{$g_c});
+		$g_c=$contig_c."-".$g_c;
+		my $name=$tab[0];
+		my $start=$tab[1];
+		my $stop=$tab[2];
+		my $strand=$tab[3];
+		my $frame=$tab[4];
+		my $affi_pc="-";
+		my $affi_pc_score="-";
+		my $affi_pc_evalue="-";
+		my $affi_category="-";
+		if (defined($affi_phage_cluster{$g_c})){
+			my $phage_c=$affi_phage_cluster{$g_c}{"match"};
+			if (defined($phage_cluster{$phage_c}{"category"})){$affi_category=$phage_cluster{$phage_c}{"category"};}
+# 			else{print "No category for $phage_c ????????\n";} # Blast unclustered do not have any category
+			$affi_pc=$phage_c;
+			$affi_pc_score=$affi_phage_cluster{$g_c}{"score"};
+			$affi_pc_evalue=$affi_phage_cluster{$g_c}{"evalue"};
+		}
+		my $affi_pfam="-";
+		my $affi_pfam_score="-";
+		my $affi_pfam_evalue="-";
+		if (defined($affi_pfam{$g_c})){
+			$affi_pfam=$affi_pfam{$g_c}{"match"};
+			$affi_pfam_score=$affi_pfam{$g_c}{"score"};
+			$affi_pfam_evalue=$affi_pfam{$g_c}{"evalue"};
+		}
+		my $length=$stop-$start;
+		if ($length<0){ # It can happen if one gene overlap the contig origin
+			$length=($size{$contig_c}-$start)+$stop;
+		}
+		print $s1 "$g_c|$start|$stop|$length|$strand|$affi_pc|$affi_pc_score|$affi_pc_evalue|$affi_category|$affi_pfam|$affi_pfam_score|$affi_pfam_evalue\n";
+	}
+}
+close $s1;
+
+
+
+
+
+
+
diff --git a/virsorter/Scripts/Step_3_highlight_phage_signal.pl b/virsorter/Scripts/Step_3_highlight_phage_signal.pl
new file mode 100755
index 0000000..8580025
--- /dev/null
+++ b/virsorter/Scripts/Step_3_highlight_phage_signal.pl
@@ -0,0 +1,650 @@
+#!/usr/bin/env perl
+
+use strict;
+use autodie;
+use File::Spec::Functions;
+use FindBin '$Bin';
+
+# Script to measure metrics on the sliding window
+# Argument 0 : csv file of the contigs
+# Argument 1 : summary file of the phage fragments
+if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq "--help") || (!defined($ARGV[1])))
+{
+	print "# Script to measure metrics on the sliding window
+# Argument 0 : csv file of the contigs
+# Argument 1 : summary file of the phage fragments
+# Argument 2 (optional) : a file with the refs values that we could use instead of estimating them \n";
+	die "\n";
+}
+$| = 1; # unbuffered STDOUT
+my $csv_file = $ARGV[0];
+my $out_file = $ARGV[1];
+if ( -e $out_file ) { CORE::unlink($out_file) or warn "Could not remove $out_file: $!\n"; } # no shell involved, so any path is safe ; best effort, as the former `rm`
+my $ref_file = $ARGV[0];
+$ref_file =~ s/\.csv/.refs/g; # the refs file is named after the csv file
+my $do_ref_estimation = 0; # despite its name, 1 means "a refs file was supplied, do not estimate"
+if (defined($ARGV[2])){
+#	$ref_file=$ARGV[2];
+	system('cp', '--', $ARGV[2], $ref_file) == 0 or warn "Could not copy $ARGV[2] to $ref_file\n"; # list form (no shell) ; that way, the ref file is in the result directory if a user wants to check it
+	$do_ref_estimation=1;
+}
+
+## ABSOLUTE THRESHOLDS ##
+my $th_viral_hallmark=1; # NOTE(review): not referenced in the part of the script reviewed here
+my $th_sig=2; # minimum score for a signal to count when a category is assigned
+my $th_sig_2=4; # stricter minimum, required when a signal has to stand on its own
+my $th_nb_genes_covered=0.80; # a region covering more than this fraction of the genes is a complete phage
+my $th_nb_genes_noncaudo=1; # a contig needs more than this many non-caudo genes for that signal to count
+## END OF ABSOLUTE THRESHOLDS ##
+my $script_dir= catfile($Bin); # directory holding this script
+my $path_to_c_script= catfile($script_dir, "Sliding_windows_3"); # companion program run once per contig
+print "## Taking information from the contig info file ($csv_file)\n";
+open my $csv_fh, '<', $csv_file; # lexical handle ; autodie stops the script if the file cannot be opened
+my $n=0; # running line counter, stored as the "order" of each gene
+my $id_c; # id of the contig whose gene lines are currently being read
+my %infos; # $infos{contig}{gene}{order|length|strand|category|best_domain_hit|hit_PFAM|hit_PC}
+my @liste_contigs; # contig ids, in file order
+my %nb_genes; # number of genes announced in each contig header
+while(my $line=<$csv_fh>){
+	chomp($line);
+	if ($line=~/>(.*)/){ # contig header : the id is the first |-separated field, the gene count the second
+		my @tab=split(/\|/,$1);
+		$id_c=$tab[0];
+		push(@liste_contigs,$id_c);
+		$nb_genes{$id_c}=$tab[1];
+		$n++;
+	}
+	else{
+		#     0  | 1   | 2  |  3   |  4   |    5     |  6  |   7  |   8     |   9    | 10   | 11
+		# gene_id|start|stop|length|strand|affi_phage|score|evalue|category|affi_pfam|score|evalue|
+		my @tab=split(/\|/,$line);
+		# A line without a gene id (e.g. a blank line) is skipped instead of registering a bogus gene
+		next unless $tab[0]=~/.*-(gene_\d*)/;
+		my $gene=$1;
+		$infos{$id_c}{$gene}{"order"}=$n;
+		$infos{$id_c}{$gene}{"length"}=$tab[3];
+		$infos{$id_c}{$gene}{"strand"}=$tab[4];
+		$infos{$id_c}{$gene}{"category"}=-1;
+		if ($tab[5] eq "-"){ ## no Phage Cluster affiliation
+			if ($tab[9] eq "-"){ ## no PFAM either, ok..
+				$infos{$id_c}{$gene}{"best_domain_hit"}="-";
+			}
+			else{
+				$infos{$id_c}{$gene}{"best_domain_hit"}="PFAM-".$tab[9];
+			}
+		}
+		else{
+			if ($tab[9] eq "-" || $tab[6]>$tab[10]){ ## no PFAM or Phage Cluster better than PFAM (score comparison)
+				$infos{$id_c}{$gene}{"best_domain_hit"}="PC-".$tab[5];
+				if ($tab[9] ne "-"){$infos{$id_c}{$gene}{"hit_PFAM"}="PFAM-".$tab[9];} # only when a PFAM hit really exists ("-" means none), and named after the PFAM id
+				if ($tab[8] eq "-"){$infos{$id_c}{$gene}{"category"}=-1;}
+				else{$infos{$id_c}{$gene}{"category"}=$tab[8];}
+			}
+			else{ ## So we have a PFAM, which is clearly better than Phage Cluster, so we keep it
+				$infos{$id_c}{$gene}{"best_domain_hit"}="PFAM-".$tab[9];
+				$infos{$id_c}{$gene}{"hit_PC"}="PC-".$tab[5];
+			}
+		}
+		$n++;
+	}
+}
+close $csv_fh;
+
+my $th_gene_size=0; # genes shorter than this are flagged as short ; read from the refs file or estimated below
+# WE HAVE A REF FILE, WE DONT ESTIMATE
+if ($do_ref_estimation==1){
+	print "## We have a ref file : $ref_file , so will use it\n";
+	open my $ref_fh, '<', $ref_file;
+	while (my $line=<$ref_fh>){
+		chomp($line);
+		my @tab=split("\t",$line);
+		$th_gene_size=$tab[4]; # fifth column, same layout as the refs line written by the other branch
+	}
+	close $ref_fh;
+}
+### ELSE, IF WE ESTIMATE THE PARAMETERS FROM THE DATASET
+else{
+	my %total; # counters over all genes : n_obs, phage, pfam, unch, switch, noncaudo
+	my @store_avg_g_size; # length of every gene, used for the size decile
+	# look at all contigs at once for the global metrics
+	print "## First look at everything to get the totals\n";
+	foreach(@liste_contigs){
+		my $contig=$_;
+		my @tab_genes=sort {$infos{$contig}{$a}{"order"} <=> $infos{$contig}{$b}{"order"}} keys %{$infos{$contig}};
+		my $total_nb_genes=$#tab_genes+1;
+		# On a contig without genes, looking up the strand of a "first gene" would create a bogus "" gene in %infos
+		# First, taking all the metrics for the totals
+		my $last_strand=($total_nb_genes>0) ? $infos{$contig}{$tab_genes[0]}{"strand"} : undef;
+		for (my $i=0;$i<$total_nb_genes;$i++){
+			if (defined($tab_genes[$i])){
+				$total{"n_obs"}++;
+				if ($infos{$contig}{$tab_genes[$i]}{"best_domain_hit"}=~/^PC-/){ # look at best domain hit on a phage
+					$total{"phage"}++;
+					if (defined($infos{$contig}{$tab_genes[$i]}{"hit_PFAM"})){$total{"pfam"}++;}
+					if ($infos{$contig}{$tab_genes[$i]}{"category"}>=3){
+						$total{"noncaudo"}++;
+					}
+				}
+				elsif($infos{$contig}{$tab_genes[$i]}{"best_domain_hit"}=~/^PFAM-/){
+					$total{"pfam"}++;
+					if (defined($infos{$contig}{$tab_genes[$i]}{"hit_PC"})){$total{"phage"}++;}
+				}
+				elsif($infos{$contig}{$tab_genes[$i]}{"best_domain_hit"} eq "-"){
+					$total{"unch"}++;
+				}
+				if ($infos{$contig}{$tab_genes[$i]}{"strand"} ne $last_strand){
+					$total{"switch"}++;
+					$last_strand=$infos{$contig}{$tab_genes[$i]}{"strand"};
+				}
+				push(@store_avg_g_size,$infos{$contig}{$tab_genes[$i]}{"length"});
+			}
+		}
+	}
+
+	print "## Transform it into probability and gene size decile\n";
+	# Transform it into probability / ratios and sort the gene size table
+	my $n_obs=$total{"n_obs"} || 0;
+	foreach my $metric ("phage","noncaudo","pfam","unch","switch"){
+		# With no gene at all every ratio is 0, instead of dying on a division by zero
+		$total{$metric}=($n_obs>0) ? $total{$metric}/$n_obs : 0;
+	}
+	# Determine d1 (first decile) of the gene size distribution, so we divide the distribution in 10 parts
+	$th_gene_size=get_th_gene_size(\@store_avg_g_size,10);
+	open my $refs_out, '>', $ref_file;
+	print $refs_out $total{"phage"}."\t".$total{"pfam"}."\t".$total{"unch"}."\t".$total{"switch"}."\t".$th_gene_size."\t".$total{"noncaudo"};
+	close $refs_out;
+}
+
+my $nb_gene_th=2;
+# Now the sliding windows
+print "## Then look at each contig and each sliding window\n";
+open S1, '>', $out_file;
+close S1;
+my $i=0;
+foreach(@liste_contigs){
+	my $contig_c=$_;
+	my @tab_genes=sort {$infos{$contig_c}{$a}{"order"} <=> $infos{$contig_c}{$b}{"order"}} keys %{$infos{$contig_c}};
+	my $total_nb_genes=$#tab_genes+1;
+	### Preparing data for C program
+	my $out_file_c=$ref_file;
+	$out_file_c=~s/\.refs/.tmp_$i/g;
+	my $out_file_c2=$ref_file;
+	$out_file_c2=~s/\.refs/.out_$i/g;
+	my $out_file_c3=$ref_file;
+	$out_file_c3=~s/\.refs/.out_$i-sorted/g;
+# 	print "we have $out_file_c $out_file_c2 $out_file_c3\n";
+	open MAP_C, '>', $out_file_c;
+	print MAP_C "$nb_genes{$contig_c}\n";
+	my $last_strand="0";
+	my $total_hallmark=0;
+	my $total_noncaudo=0;
+	foreach(@tab_genes){
+		my $gene=$_;
+		my $tag="";
+		# Line : PC / PFAM / UNCH / SIZE / STRAND / HALLMARK
+		if($infos{$contig_c}{$gene}{"best_domain_hit"}=~/^PC/){
+			if ($infos{$contig_c}{$gene}{"category"}>=3){$tag="1\t1\t0\t0\t";$total_noncaudo++;}
+			else{$tag="1\t0\t0\t0\t";}
+		}
+		elsif($infos{$contig_c}{$gene}{"best_domain_hit"}=~/^PFAM/){$tag="0\t0\t1\t0\t";}
+		else{$tag="0\t0\t0\t1\t";}
+		if ($infos{$contig_c}{$gene}{"length"}<$th_gene_size){$tag.="1\t";}
+		else{$tag.="0\t";}
+		if (($last_strand eq "0") || ($infos{$contig_c}{$gene}{"strand"} eq $last_strand)){$tag.="0\t";}
+		else{$tag.="1\t";}
+		$last_strand=$infos{$contig_c}{$gene}{"strand"};
+		if (($infos{$contig_c}{$gene}{"category"}==0) || ($infos{$contig_c}{$gene}{"category"}==3)){
+			$tag.="1\t";$total_hallmark++;
+			print "Gene $contig_c / $gene -> category $infos{$contig_c}{$gene}{category} -> putative hallmark\n";
+		} # look at putative hallmarklmark
+		else{$tag.="0\t";}
+		print MAP_C "$tag\n";
+	}
+	close MAP_C;
+	### Now go execute the C program
+	my $c_cmd="$path_to_c_script $ref_file $out_file_c $out_file_c2";
+#        print "Step 1 - $c_cmd\n";
+	my $out=`$c_cmd`;
+# 	print "$out\n";
+	$c_cmd="sort -r -n -k 4 $out_file_c2 > $out_file_c3";
+#        print "Step 2 - $c_cmd\n";	
+	$out=`$c_cmd`;
+# 	print "$out\n";
+	### reading the c program output to fill the match hash table / and removing overlap
+	my %match;
+	my %check;
+	my @check_gene;
+	open OUT_C, '<', $out_file_c3;
+	while(<OUT_C>){
+		chomp($_);
+		my @tab=split("\t",$_);
+		my $start=$tab[0];
+		my $last=$tab[0]+$tab[1]-1;
+		my $fragment_id=$contig_c."-".$tab_genes[$start]."-".$tab_genes[$last];
+		my $tag=0;
+		# Code : 0 phage / 1 pfam / 2 unch / 3 size / 4 strand switch
+		if ($tab[2]==0){
+			if (overlap($fragment_id,$check{"phage"})==0){
+				$match{$fragment_id}{"proof"}{"phage"}=$tab[3];
+				$check{"phage"}{$fragment_id}=1;
+				$tag=1;
+				for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;}
+			}
+		}
+		if ($tab[2]==1){
+			if (overlap($fragment_id,$check{"pfam"})==0){
+				$match{$fragment_id}{"proof"}{"pfam"}=$tab[3];
+				$check{"pfam"}{$fragment_id}=1;
+				$tag=1;
+				for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;}
+			}
+		}
+		if ($tab[2]==2){
+			if (overlap($fragment_id,$check{"unch"})==0){
+				$match{$fragment_id}{"proof"}{"unch"}=$tab[3];
+				$check{"unch"}{$fragment_id}=1;
+				$tag=1;
+				for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;}
+			}
+		}
+		if ($tab[2]==3){
+			if (overlap($fragment_id,$check{"avg_g_size"})==0){
+				$match{$fragment_id}{"proof"}{"avg_g_size"}=$tab[3];
+				$check{"avg_g_size"}{$fragment_id}=1;
+				$tag=1;
+			}
+		}
+		if ($tab[2]==4){
+			if (overlap($fragment_id,$check{"switch"})==0){
+				$match{$fragment_id}{"proof"}{"switch"}=$tab[3];
+				$check{"switch"}{$fragment_id}=1;
+				$tag=1;
+			}
+		}
+		if ($tab[2]==5){
+			if (overlap($fragment_id,$check{"noncaudo"})==0){
+				$match{$fragment_id}{"proof"}{"noncaudo"}=$tab[3];
+				$check{"noncaudo"}{$fragment_id}=1;
+				$tag=1;
+				for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;}
+			}
+		}
+		if ($tag==1){
+			# If a match, we also take the nb of hallmark genes, and the size
+			if ($tab[4]>0){$match{$fragment_id}{"hallmark"}=$tab[4];}
+			$match{$fragment_id}{"size"}=$tab[1];
+		}
+	}
+	close OUT_C;
+	### Ok, we read the C output, no we try (neatly) to merge all predictions for this sequence
+	my $n=0;
+	my %merged_match;
+	my $th_contig_size=$th_nb_genes_covered*$total_nb_genes;
+	my @tab_matches=sort { $match{$b}{"size"} <=> $match{$a}{"size"} } keys %match;
+	if (!defined($match{$tab_matches[0]}{"size"})){} # Not even an interesting region, skip to the next sequence
+	else{
+		my $tag_complete=0;
+		my $i=0;
+		while ($match{$tab_matches[$i]}{"size"}>$th_contig_size && $tag_complete==0){
+			if ($match{$tab_matches[$i]}{"size"}>$th_contig_size && (defined($match{$tab_matches[$i]}{"proof"}{"pfam"}) || defined($match{$tab_matches[$i]}{"proof"}{"phage"}) || defined($match{$tab_matches[$i]}{"proof"}{"unch"}) || defined($match{$tab_matches[$i]}{"proof"}{"noncaudo"}))){ # SEEMS LIKE WE HAVE A COMPLETE PHAGE SEQUENCE 
+				$tag_complete=1;
+				my $fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes];
+				if (defined($match{$fragment_id})){
+					$merged_match{$fragment_id}=$match{$fragment_id}; # If we indeed have complete metrics, we take themn
+				}
+				else{
+					$merged_match{$fragment_id}{"size"}=$total_nb_genes;# Otherwise we store just the size
+					$merged_match{$fragment_id}{"hallmark"}=$total_hallmark;# And the total number of hallmark genes on this fragment
+				}
+				$merged_match{$fragment_id}{"type"}="complete_phage";# And we store the type of fragment
+				foreach(@tab_matches){
+					my $fragment_id=$_;
+					if ($match{$fragment_id}{"size"}<$total_nb_genes){
+						my $r=get_overlap($fragment_id,\%merged_match);
+						if ($r eq "no"){ # if no overlap
+							$merged_match{$fragment_id}=$match{$fragment_id}; # NO OVERLAP WITH THE COMPLETE 
+							print "!!!!!!!!!!!!!!!!!!! THIS SHOULD NOT BE POSSIBLE\n";
+						}
+						else{
+							# Overlap, we propagate the proof and note it "partial"
+							foreach(keys %{$match{$fragment_id}{"proof"}}){
+								if (defined($merged_match{$r}{"proof"}{$_})){
+									if ($merged_match{$r}{"proof"}{$_}=~/:/){
+										$fragment_id=~/.*-(gene_\d*-gene_\d*)/;
+										$merged_match{$r}{"proof"}{$_}.=$1.":".$match{$fragment_id}{"proof"}{$_}.",";
+									}
+									else{} # already a score for the entire match, no pblm
+								} 
+								else {
+									$fragment_id=~/.*-(gene_\d*-gene_\d*)/;
+									$merged_match{$r}{"proof"}{$_}=$1.":".$match{$fragment_id}{"proof"}{$_}.",";
+								}
+							}
+						}
+					}
+				}
+			}
+			$i++;
+		}
+		if($tag_complete==0){ # No complete phage, putatively one or several prophages
+			# First get all the phage region
+			# We look for interesting regions   my $fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes];
+			my $tag=-1;
+			my $tag_h=0;
+			for (my $i=0;$i<$total_nb_genes;$i++){
+				if ($tag>=0 && (!defined($check_gene[$i]) || $check_gene[$i]<1)){ # end of an interesting region
+					my $fragment_id.=$contig_c."-".$tab_genes[$tag]."-".$tab_genes[$i-1];
+					if ($merged_match{$fragment_id}{"size"}>$th_contig_size){ # Complete phage
+						$fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes];
+						$merged_match{$fragment_id}{"type"}="complete_phage";
+						$merged_match{$fragment_id}{"size"}=$total_nb_genes;
+						$merged_match{$fragment_id}{"hallmark"}=$tag_h;
+					} 
+					else{ # Prophage
+						$merged_match{$fragment_id}{"size"}=$i-$tag;
+						$merged_match{$fragment_id}{"type"}="prophage";
+						$merged_match{$fragment_id}{"hallmark"}=$tag_h;
+					}
+					$tag=-1;
+					$tag_h=0;
+				}
+				elsif ($tag==-1 && $check_gene[$i]>=1){
+					$tag=$i;
+					$tag_h=0;
+				}
+				if ($infos{$contig_c}{$tab_genes[$i]}{"category"}==0 || $infos{$contig_c}{$tab_genes[$i]}{"category"}==3){$tag_h++;} # look at putative hallmark
+			}
+			if ($tag>=0){
+				my $fragment_id.=$contig_c."-".$tab_genes[$tag]."-".$tab_genes[$#tab_genes];
+				print "Region is $fragment_id ..";
+				if ($merged_match{$fragment_id}{"size"}>$th_contig_size){ # Complete phage
+					print "which is a complete phage\n";
+					$fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes];
+					$merged_match{$fragment_id}{"type"}="complete_phage";
+					$merged_match{$fragment_id}{"size"}=$total_nb_genes;
+					$merged_match{$fragment_id}{"hallmark"}=$tag_h;
+				} 
+				else{ # Prophage
+					print "which is a prophage\n";
+					$merged_match{$fragment_id}{"size"}=$total_nb_genes-$tag;
+					$merged_match{$fragment_id}{"type"}="prophage";
+					$merged_match{$fragment_id}{"hallmark"}=$tag_h;
+				}
+			}
+			# Now we merge the annotation in these regions
+			foreach(@tab_matches){
+				my $fragment_id=$_;
+				# Check if overlap
+				my $r=get_overlap($fragment_id,\%merged_match);
+				if ($r eq "no"){ } # if no overlap # not in an interesting region 
+				else{
+					# Overlap, we propagate the proof and note it "partial"
+					foreach(keys %{$match{$fragment_id}{"proof"}}){
+						if (defined($merged_match{$r}{"proof"}{$_})){
+							if ($merged_match{$r}{"proof"}{$_}=~/:/){
+								$fragment_id=~/.*-(gene_\d*-gene_\d*)/;
+								$merged_match{$r}{"proof"}{$_}.=$1.":".$match{$fragment_id}{"proof"}{$_}.",";
+							}
+							else{} # already a score for the entire match, no pblm
+						} 
+						else {
+							$fragment_id=~/.*-(gene_\d*-gene_\d*)/;
+							$merged_match{$r}{"proof"}{$_}=$1.":".$match{$fragment_id}{"proof"}{$_}.",";
+						}
+					}
+					delete($match{$fragment_id});
+				}
+			}
+			## New addition that should help to get the prophage coordinates correctly !
+			# And now check if one of the prophage map to the whole sequence
+			foreach my $fragment_id (keys %merged_match){ # the key list is taken before the loop adds any entry
+				print "This is a prophage\n";
+				# A region larger than the contig-size threshold is promoted to a complete phage spanning all genes
+				if ($merged_match{$fragment_id}{"size"}>$th_contig_size){
+					$tag_complete=1;
+					my $new_fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes];
+					print "We have a complete prophage -- we add it $new_fragment_id !\n";
+					# No pause on STDIN here : a leftover debugging read would block an unattended run
+					foreach my $field (keys %{$merged_match{$fragment_id}}){
+						$merged_match{$new_fragment_id}{$field}=$merged_match{$fragment_id}{$field};
+					}
+					$merged_match{$new_fragment_id}{"type"}="complete_phage";# And we store the type of fragment
+				}
+			}
+			if ($tag_complete==1){
+				# We can remove all the prophages
+				my @tab_temp=keys %merged_match;
+				foreach(@tab_temp){
+					if ($merged_match{$_}{"type"} eq "complete_phage"){}
+					else{
+						delete($merged_match{$_});
+					}
+				}
+			}
+			## END OF THE NEW ADDITION
+		}
+		open S1, '>>', $out_file;
+		foreach(sort { $merged_match{$b}{"size"} <=> $merged_match{$a}{"size"} } keys %merged_match){ ## IMPORTANT, HAVE TO BE SIZE ORDERED
+			my $fragment_id=$_;
+			$fragment_id=~/.*-(gene_\d+-gene_\d+)/;
+			my $zone=$1;
+			my $type_detection=$merged_match{$fragment_id}{"type"};
+			print "$fragment_id\t$merged_match{$fragment_id}{size}\t$merged_match{$fragment_id}{hallmark}\t$merged_match{$fragment_id}{proof}{phage}\t$merged_match{$fragment_id}{proof}{pfam}\t$merged_match{$fragment_id}{proof}{unch}\t$merged_match{$fragment_id}{proof}{switch}\t$merged_match{$fragment_id}{proof}{avg_g_size}\n";
+			my $category=3;
+			if ($merged_match{$fragment_id}{"hallmark"}==0){delete($merged_match{$fragment_id}{"hallmark"});}
+			# Determine the category. To check this, we want several good indicators - And also remove prediction based on one single indicator, unless it's a strong one (sig >2)
+			# New categories : 
+			# Cat 1 - hallmark + gene phage enrichment
+			# Cat 2 - gene phage or hallmark without gene phage
+			# Cat 3 - no hallmark or gene phage, but other signal
+			my @tab_proof=keys %{$merged_match{$fragment_id}{"proof"}};
+			if ($merged_match{$fragment_id}{"hallmark"}>0){
+				if (defined($merged_match{$fragment_id}{"proof"}{"noncaudo"}) || defined($merged_match{$fragment_id}{"proof"}{"phage"})){
+					if ($merged_match{$fragment_id}{"proof"}{"noncaudo"}=~/(gene_\d+-gene_\d+):(\d+)/){
+						my $match_region=$1;
+						my $score=$2;
+						if ($match_region eq $zone && $score>=$th_sig){$category=1;} # Phage metric on the whole region
+					}
+					elsif ($merged_match{$fragment_id}{"proof"}{"noncaudo"}>=$th_sig){$category=1;} # if we have hallmark or gene_size + a phage metric on the whole fragment -> should be quite sure $category=1; ## THRESHOLD TO REMOVE THE NONCAUDO ON THE SMALL SMALL CONTIGS
+					if ($merged_match{$fragment_id}{"proof"}{"phage"}=~/(gene_\d+-gene_\d+):(\d+)/){
+						my $match_region=$1;
+						my $score=$2;
+						if ($match_region eq $zone && $score>=$th_sig){$category=1;} # Phage metric on the whole region
+					}
+					elsif ($merged_match{$fragment_id}{"proof"}{"phage"}>=$th_sig){$category=1;} # if we have hallmark or gene_size + a phage metric on the whole fragment -> should be quite sure $category=1;
+					if ($category==3){ # no match complete, so category 2
+						$category=2;
+					}
+				}
+				else{
+					foreach(@tab_proof){
+						if ($merged_match{$fragment_id}{"proof"}{$_}=~/(gene_\d+-gene_\d+):(\d+)/){
+							my $match_region=$1;
+							my $score=$2;
+							print "Hallmark but no phage or noncaudo, but other proof $_ -> $match_region / $score ($merged_match{$fragment_id}{proof}{$_})\n";
+							if ($match_region eq $zone && $score>=$th_sig){$category=2;} # other metric on the whole region
+							elsif($score>=$th_sig_2){$category=2;} # metric partial only but strong enough so we keep it
+						}
+						elsif ($merged_match{$fragment_id}{"proof"}{$_}>=$th_sig){
+							if ($_ eq "pfam" || $_ eq "unch"){
+								$category=2; # if we have hallmark or gene_size + a metric pfam or unch on the whole fragment -> should be quite sure
+							}
+						}
+					}
+				}
+			}
+			elsif (defined($merged_match{$fragment_id}{"proof"}{"phage"}) || defined($merged_match{$fragment_id}{"proof"}{"noncaudo"})){# If we have some phage signal, 
+				if ($merged_match{$fragment_id}{"proof"}{"phage"}=~/:(\d*)/){
+					if ($1>=$th_sig){
+						$category=2; # Good, phage signal significant -> should be pretty sure
+					}
+				} 
+				elsif($merged_match{$fragment_id}{"proof"}{"phage"}>=$th_sig){
+					$category=2; # Good, phage signal significant -> should be pretty sure
+				}
+				if ($merged_match{$fragment_id}{"proof"}{"noncaudo"}=~/:(\d*)/){ ## THRESHOLD TO AVOID SHORT CONTIGS BIAS
+					if ($1>=$th_sig && $total_noncaudo>$th_nb_genes_noncaudo){
+						$category=2; # Good, phage signal significant -> should be pretty sure
+					}
+				} 
+				elsif($merged_match{$fragment_id}{"proof"}{"noncaudo"}>=$th_sig && $total_noncaudo>$th_nb_genes_noncaudo){ ## THRESHOLD TO AVOID SHORT CONTIGS BIAS
+					$category=2; # Good, phage signal significant -> should be pretty sure
+				}
+			}
+			if ($category==3){ # If the category is still 3, meaning that the phage signal (if there was any) was not that strong ..
+				if ($#tab_proof==0){
+					$category=0; # No phage signal nor hallmark gene, and only one metric, we remove
+				}
+				else{
+					my $tag1=0;
+					foreach(@tab_proof){
+						if ($merged_match{$fragment_id}{"proof"}{$_}=~/:(\d*)/){
+							if ($1>=$th_sig_2){
+								$tag1=1; # Good, one signal very significant 
+							}
+						} 
+						elsif ($merged_match{$fragment_id}{"proof"}{$_}>=$th_sig_2){
+							$tag1=1; # Good, one signal very significant 
+						}
+					}
+					if ($tag1==0){ # If none of the metrics is really strong ...
+						$category=0; # .. we remove the detection
+					}
+				}
+			}
+			# Columns index :  0  /     1        /     2    /   3   /      4        /    5     /     6        /      7            /    8         /      9       /     10     /    11     /       12
+			# Columns : Contig / Total Nb Genes / Fragment / Size / Type detection / Category /  Enrich Phage / Enrich Non Caudo / Enrich Pfam / Enrich Unch / Enrich Switch / Avg_g_size / Nb Hallmark
+			if ($category>0){
+				print S1 "$contig_c\t$total_nb_genes\t$fragment_id\t$merged_match{$fragment_id}{size}\t$type_detection\t$category\t$merged_match{$fragment_id}{proof}{phage}\t$merged_match{$fragment_id}{proof}{noncaudo}\t$merged_match{$fragment_id}{proof}{pfam}\t$merged_match{$fragment_id}{proof}{unch}\t$merged_match{$fragment_id}{proof}{switch}\t$merged_match{$fragment_id}{proof}{avg_g_size}\t$merged_match{$fragment_id}{hallmark}\n";
+			}
+		}
+		close S1;
+	}
+	$i++;
+	`rm $out_file_c $out_file_c2 $out_file_c3`;
+}
+
+sub factorial { # factorial $n : product of $n, $n-1, ... as long as the term is > 0 (1 when $n <= 0)
+	my ($value) = @_;
+	my $product = 1;
+	for (my $term = $value; $term > 0; $term--) { $product *= $term; } # walk down by steps of 1
+	return $product;
+}
+
+sub combine { # number of ways to pick $k elements among $n : n! / (k! * (n-k)!)
+	my ($k, $n) = @_;
+	my $numerator = factorial($n);
+	my $denominator = factorial($k) * factorial($n - $k);
+	return $numerator / $denominator;
+}
+
+sub proba { # binomial probability of exactly $i successes among $n observations of probability $p
+	my ($i, $n, $p) = @_;
+	my $nb_ways = combine($i, $n);
+	my $p_success = $p ** $i;
+	my $p_failure = (1 - $p) ** ($n - $i);
+	return $nb_ways * $p_success * $p_failure;
+}
+
+sub proba_more_than { # upper binomial tail : P(x >= $s) for $n observations of probability $p
+	my ($s, $n, $p) = @_;
+	my $cumulated = 0;
+	my $nb_success = $s;
+	while ($nb_success <= $n) { # sum the exact probabilities of $s, $s+1, ... up to $n
+		$cumulated += proba($nb_success, $n, $p);
+		$nb_success++;
+	}
+	return $cumulated;
+}
+
+sub proba_less_than { # lower binomial tail : P(x <= $s) for $n observations of probability $p
+	my ($s, $n, $p) = @_;
+	my $cumulated = 0;
+	my $nb_success = 0;
+	while ($nb_success <= $s) { # sum the exact probabilities of 0, 1, ... up to $s
+		$cumulated += proba($nb_success, $n, $p);
+		$nb_success++;
+	}
+	return $cumulated;
+}
+
+sub log10 { # base-10 logarithm, from the natural one : ln(x) / ln(10)
+	my ($value) = @_;
+	return log($value) / log(10);
+}
+
+sub overlap { # Returns 1 if prediction $_[0] ("<contig>-gene_<first>-gene_<last>") overlaps a key of hash ref $_[1], else 0
+	my ($pred, $p_hash) = @_;
+	# A prediction without readable coordinates cannot overlap anything (stale captures were compared before)
+	return 0 unless $pred=~/.*-gene_(\d*)-gene_(\d*)/;
+	my ($start_pred, $end_pred) = ($1, $2);
+	return 0 unless defined($p_hash); # nothing has been accepted yet for this type of signal
+	foreach my $other (keys %{$p_hash}){
+		next unless $other=~/.*-gene_(\d*)-gene_(\d*)/; # a key without coordinates is ignored
+		my ($start_other, $end_other) = ($1, $2);
+		# Overlap when the other region starts inside ours, ends inside ours, or contains ours ; the first hit is enough
+		return 1 if (($start_pred<=$start_other && $start_other<$end_pred) || ($start_pred<$end_other && $end_other<=$end_pred) || ($start_other<=$start_pred && $end_other>=$end_pred));
+	}
+	return 0;
+}
+
+
+sub get_overlap { # Returns the key of hash ref $_[1] whose gene range contains prediction $_[0], or "no" if there is none
+	my ($pred, $p_hash) = @_;
+	# Without readable coordinates there is nothing to compare (stale captures were used before)
+	return "no" unless $pred=~/.*-gene_(\d*)-gene_(\d*)/;
+	my ($start_pred, $end_pred) = ($1, $2);
+	my $container = "no";
+	# Keys are visited in sorted order : if several regions contain the prediction, the last one in that
+	# order is returned, so the answer no longer depends on the internal ordering of the hash
+	foreach my $region (sort keys %{ $p_hash || {} }){
+		next unless $region=~/.*-gene_(\d*)-gene_(\d*)/; # a key without coordinates is ignored
+		if ($start_pred>=$1 && $end_pred<=$2){$container=$region;}
+	}
+	return $container;
+}
+
+sub is_local_max { # 1 if metric $c at ($s,$w) has no strictly greater neighbour in the surrounding square, 0 otherwise
+	my ($p_metrics, $s, $w, $c) = @_;
+	# $p_metrics is a reference to a hash indexed as {first coordinate}{second coordinate}{metric name}
+	my $center_value = $$p_metrics{$s}{$w}{$c};
+	my $is_max = 1;
+	my $radius = 5; # how far to look around ($s,$w) on each axis
+	# Every neighbour is visited, even once a greater one has been found
+	foreach my $ds (-$radius .. $radius){
+		foreach my $dw (-$radius .. $radius){
+			# The centre itself is not one of its neighbours
+			next if ($ds == 0 && $dw == 0);
+			# Neighbours missing from the table, or without this metric, are ignored
+			my $cell = $$p_metrics{$s+$ds}{$w+$dw};
+			next unless defined($cell);
+			next unless defined($$cell{$c});
+			if ($$cell{$c} > $center_value){
+				$is_max = 0; # a neighbour holds a greater value : not a local maximum
+			}
+		}
+	}
+	return $is_max;
+}
+
+sub get_position{ # relative rank (index / last index) where $value falls in the sorted table ; reports its progress on STDOUT
+	my ($value, $p_tab) = @_;
+	my @sorted_sizes = sort {$a <=> $b} @{$p_tab};
+	print "looking for $value in the gene size table\n";
+	my $last_index = $#sorted_sizes;
+	my $index = 0;
+	$index++ while ($sorted_sizes[$index]<$value && $index<$last_index); # stop on the first element >= $value, or on the last one
+	print "found at index $index - $sorted_sizes[$index] - total : $last_index\n";
+	my $ratio = $index/$last_index;
+	print "which gives a ratio of $ratio\n";
+	return $ratio;
+}
+
+sub get_th_gene_size{ # threshold at 1/$_[1] of the sorted values of array ref $_[0] (the first decile when $_[1] is 10)
+	my ($p_sizes, $div) = @_;
+	my @sorted = sort {$a <=> $b} @{$p_sizes};
+	my $last = $#sorted;
+	return $sorted[$last/$div] if ($last % $div == 0); # the last index divides evenly : a single element is the threshold
+	# Otherwise average the two elements around the cut (Perl truncates the fractional indexes)
+	return ($sorted[($last-1)/$div]+$sorted[($last+1)/$div])/2;
+}
diff --git a/virsorter/Scripts/Step_4_summarize_phage_signal.pl b/virsorter/Scripts/Step_4_summarize_phage_signal.pl
new file mode 100755
index 0000000..5683703
--- /dev/null
+++ b/virsorter/Scripts/Step_4_summarize_phage_signal.pl
@@ -0,0 +1,289 @@
+#!/usr/bin/env perl
+
+use strict;
+use autodie;
+
+# Script to make a summary of the predictions to add to previous predictions
+# Argument 0 : affiliation file of the contigs ; Argument 1 : summary file of the phage fragments
+# Argument 2 : global summary to be completed
+# Argument 3 : Out file for new prot list
+if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq "--help") || (!defined($ARGV[3])))
+{
+	print "# Script to make a summary of the predictions to add to previous predictions
+# Argument 0 : affiliation file of the contigs
+# Argument 1 : summary file of the phage fragments
+# Argument 2 : global summary to be completed
+# Argument 3 : Out file for new prot list\n";
+	die "\n";
+}
+
+my $affi_contigs   = $ARGV[0];
+my $new_summary    = $ARGV[1];
+my $global_summary = $ARGV[2];
+my $new_prot_list  = $ARGV[3];
+
+my %infos;
+my $tag=0;
+my %check_prot_old;
+my %check_contig_old;
+if (-e $global_summary){
+	 # Get info from global_summary
+	open SUM, '<', $global_summary;
+	while (<SUM>){
+		chomp($_);
+		if ($_=~/^## (\d+)/){
+			$tag=$1;
+		}
+		elsif($_=~/^##/ || $_ eq ""){}
+		elsif($tag<=3){
+# 			print "we had $_ -> tag $tag\n";
+			my @tab=split(",",$_);
+			$infos{$tag}{$tab[2]}{"nb_gene"}=$tab[1];
+			$infos{$tag}{$tab[2]}{"category"}=$tab[4];
+			$infos{$tag}{$tab[2]}{"hallmark"}=$tab[5];
+			$infos{$tag}{$tab[2]}{"phage"}=$tab[6];
+			$infos{$tag}{$tab[2]}{"noncaudo"}=$tab[7];
+			$infos{$tag}{$tab[2]}{"pfam"}=$tab[8];
+			$infos{$tag}{$tab[2]}{"unch"}=$tab[9];
+			$infos{$tag}{$tab[2]}{"switch"}=$tab[10];
+			$infos{$tag}{$tab[2]}{"size"}=$tab[11];
+			$check_contig_old{$tab[2]}=1;
+			print "checking old $tab[2]\n";
+		}
+		else{
+			my @tab=split(",",$_);
+			print "we had $tab[2] -> tag $tag\n";
+			$infos{$tag}{$tab[0]}{$tab[2]}{"nb_gene_contig"}=$tab[1];
+			$infos{$tag}{$tab[0]}{$tab[2]}{"nb_gene"}=$tab[3];
+			$infos{$tag}{$tab[0]}{$tab[2]}{"category"}=$tab[4];
+			$infos{$tag}{$tab[0]}{$tab[2]}{"hallmark"}=$tab[5];
+			$infos{$tag}{$tab[0]}{$tab[2]}{"phage"}=$tab[6];
+			$infos{$tag}{$tab[0]}{$tab[2]}{"noncaudo"}=$tab[7];
+			$infos{$tag}{$tab[0]}{$tab[2]}{"pfam"}=$tab[8];
+			$infos{$tag}{$tab[0]}{$tab[2]}{"unch"}=$tab[9];
+			$infos{$tag}{$tab[0]}{$tab[2]}{"switch"}=$tab[10];
+			$infos{$tag}{$tab[0]}{$tab[2]}{"size"}=$tab[11];
+			if($infos{$tag}{$tab[0]}{$tab[2]}{"category"}==1){ # If the category is 4, we check all the prot from this fragment
+				$tab[2]=~/.*-gene_(\d*)-gene_(\d*)/;				
+				for (my $i=$1;$i<=$2;$i++){
+					my $prot_id=$tab[0]."-gene_".$i;
+					$check_prot_old{$prot_id}=1;
+				}
+			}
+		}
+	}
+	close SUM;
+}
+else{
+	print "This is the first global summary that we'll do\n";
+}
+
+my %check_prot_new;
+my %check_contig_new;
+open SUM, '<', $new_summary;
+while (<SUM>){
+	chomp($_);
+	$_=~s/,/;/g;
+	my @tab=split("\t",$_);
+	if ($tab[4] eq "complete_phage"){
+#    0   /       1        /    2     /  3   /       4        /    5     /      6       /        7         /      8      /      9      /      10       /     11     /     12
+# Contig / Total Nb Genes / Fragment / Size / Type detection / Category / Enrich Phage / Enrich Non Caudo / Enrich Pfam / Enrich Unch / Enrich Switch / Avg_g_size / Nb Hallmark
+		# Determine order in which this contig will be displayed
+		my $class=3;
+		if ($tab[5]==1){# If the category is 1, we check all the prot from this contig
+			$class=1;
+			$check_contig_new{$tab[0]}=1;
+		}
+		elsif ($tab[5]==2){$class=2;}
+ 		for(my $i=5;$i<=$#tab;$i++){
+ 			if ($tab[$i]=~/(.*);$/){
+ 				$tab[$i]=$1;
+ 			}
+ 		}
+# 		print "$_ => tag $class\n";
+		$infos{$class}{$tab[0]}{"nb_gene"}=$tab[1];
+		$infos{$class}{$tab[0]}{"category"}=$tab[5];
+		$infos{$class}{$tab[0]}{"phage"}=$tab[6];
+		$infos{$class}{$tab[0]}{"noncaudo"}=$tab[7];
+		$infos{$class}{$tab[0]}{"pfam"}=$tab[8];
+		$infos{$class}{$tab[0]}{"unch"}=$tab[9];
+		$infos{$class}{$tab[0]}{"switch"}=$tab[10];
+		$infos{$class}{$tab[0]}{"size"}=$tab[11];
+		$infos{$class}{$tab[0]}{"hallmark"}=$tab[12];
+	}
+	else{
+		my $class=6;
+		if ($tab[5]==1){
+			$class=4;
+			# If the category is 1, we check all the prot from this fragment
+			$tab[2]=~/.*-gene_(\d*)-gene_(\d*)/;
+			for (my $i=$1;$i<=$2;$i++){
+				my $prot_id=$tab[0]."-gene_".$i;
+				$check_prot_new{$prot_id}=1;
+				print "we check new $prot_id\n";
+			}
+		}
+		elsif($tab[5]==2){$class=5;}
+		# Remove all former prophages (if any) is there is an overlap
+		for (my $i=4;$i<=6;$i++){
+			if (defined $infos{$i}{$tab[0]}){
+				foreach (keys %{$infos{$i}{$tab[0]}}){
+					if (overlap($tab[2],$_)==1){
+						print "Overlap between $tab[1] and $_, we remove $_ ($tab[0] - 3)\n";
+						delete($infos{$i}{$tab[0]}{$_});
+					}
+				}
+			}
+		}
+ 		for(my $i=5;$i<=$#tab;$i++){
+ 			if ($tab[$i]=~/(.*);$/){
+ 				$tab[$i]=$1;
+ 			}
+ 		}
+# 		print "Prophage $class / $tab[0] - $tab[2]\n";
+		$infos{$class}{$tab[0]}{$tab[2]}{"nb_gene_contig"}=$tab[1];
+		$infos{$class}{$tab[0]}{$tab[2]}{"nb_gene"}=$tab[3];
+		$infos{$class}{$tab[0]}{$tab[2]}{"category"}=$tab[5];
+		$infos{$class}{$tab[0]}{$tab[2]}{"phage"}=$tab[6];
+		$infos{$class}{$tab[0]}{$tab[2]}{"noncaudo"}=$tab[7];
+		$infos{$class}{$tab[0]}{$tab[2]}{"pfam"}=$tab[8];
+		$infos{$class}{$tab[0]}{$tab[2]}{"unch"}=$tab[9];
+		$infos{$class}{$tab[0]}{$tab[2]}{"switch"}=$tab[10];
+		$infos{$class}{$tab[0]}{$tab[2]}{"size"}=$tab[11];
+		$infos{$class}{$tab[0]}{$tab[2]}{"hallmark"}=$tab[12];
+	}
+}
+
+# Remove redundancy 
+foreach(sort {$a <=> $b } keys %infos){
+	my $class=$_;
+	my @liste_contigs=keys %{$infos{$class}};
+	if ($class<=3){ ## For complete phages, remove all predictions with higher categories
+		for (my $i=$class+1;$i<=6;$i++){
+			foreach(@liste_contigs){
+				if (defined($infos{$i}{$_})){
+					print "$_ defined in $class, so we remove its info for $i\n";
+					delete($infos{$i}{$_});
+				}
+			}
+		}
+	}
+	else{
+		foreach(@liste_contigs){ ## For prophages, remove the prediction of the same prophages with higher categories
+			my $contig=$_;
+			foreach(keys %{$infos{$class}{$contig}}){
+				for (my $i=$class+1;$i<=6;$i++){
+					if (defined($infos{$i}{$contig}{$_})){
+						print "$_ defined in $class, so we remove its info for $i\n";
+						delete($infos{$i}{$contig}{$_});
+					}
+				}
+			}
+			
+			
+		}
+	}
+}
+
+
+# Write the global summary : for each category from 1 to 6, a title line and the legend of the columns,
+# then one line for each prediction left in %infos (contigs, and fragments of a contig, sorted by id)
+my %summary_titles=(
+	1=>"Complete phage contigs - category 1 (pretty sure)",
+	2=>"Complete phage contigs - category 2 (quite sure)",
+	3=>"Complete phage contigs - category 3 (not so sure)",
+	4=>"Prophages - category 1 (pretty sure)",
+	5=>"Prophages - category 2 (quite sure)",
+	6=>"Prophages - category 3 (not so sure)",
+);
+# Legend of the columns, the same for all the categories
+my $summary_legend="## Contig_id,Nb genes contigs,Fragment,Nb genes,Category,Nb phage hallmark genes,Phage gene enrichment sig,Non-Caudovirales phage gene enrichment sig,Pfam depletion sig,Uncharacterized enrichment sig,Strand switch depletion sig,Short genes enrichment sig\n";
+# Keys of the values written in the last eight columns, in the order of the legend
+my @summary_fields=("category","hallmark","phage","noncaudo","pfam","unch","switch","size");
+
+open S1, '>', $global_summary;
+foreach my $class (1..6){
+	print S1 "## $class - $summary_titles{$class}\n";
+	print S1 $summary_legend;
+	foreach my $contig (sort keys %{$infos{$class}}){
+		if ($class>3){
+			# Prophages : the values are stored for each fragment of the contig, one line is written
+			# for each of them
+			foreach my $fragment (sort keys %{$infos{$class}{$contig}}){
+				my $pred=$infos{$class}{$contig}{$fragment};
+				my @signals=map { $pred->{$_} } @summary_fields;
+				print S1 join(",",$contig,$pred->{nb_gene_contig},$fragment,$pred->{nb_gene},@signals)."\n";
+			}
+		}
+		else{
+			# Complete contigs : the values are stored directly for the contig ; its id and its number of
+			# genes are written in both the contig and the fragment columns
+			my $pred=$infos{$class}{$contig};
+			my @signals=map { $pred->{$_} } @summary_fields;
+			print S1 join(",",$contig,$pred->{nb_gene},$contig,$pred->{nb_gene},@signals)."\n";
+		}
+	}
+}
+close S1;
+
+# Check if they could be new clusters among the new proteins : the new genes (new protein, or gene of a new contig) without a confident hit on an existing phage cluster are listed
+my @liste_to_add=();
+my $th_evalue=0.0000000001; # Big threshold, to prevent too much false positive
+open AFI, '<', $affi_contigs;
+my $contig_c=""; # id of the contig of the genes being read
+while (<AFI>){
+	chomp($_);
+	if ($_=~/>(.*)/){ # contig line : the contig id is the first |-separated field after the ">"
+		my @tab=split(/\|/,$1);
+		$contig_c=$tab[0];
+	}
+	else{
+		#     0  | 1   | 2  |  3   |  4   |    5     |  6  |   7  |   8     |   9    | 10   | 11
+		# gene_id|start|stop|length|strand|affi_phage|score|evalue|category|affi_pfam|score|evalue|
+		my @tab=split(/\|/,$_);
+		my $gene=$tab[0];
+		if (($check_prot_new{$gene}==1 && !defined($check_prot_old{$gene})) || (($check_contig_new{$contig_c}==1) && !defined($check_contig_old{$contig_c}))){
+			# A new prot, putatively a new cluster
+			if ($tab[5] eq "-"){
+				# No phage cluster affiliation, so we take this
+				push(@liste_to_add,$gene);
+			}
+			elsif($tab[7]<=$th_evalue && $tab[5]=~/^Phage_cluster_\d+/){
+				# Evalue at or below the threshold on a cluster, so it will likely cluster with an existing PC : not listed
+			}
+			else{
+				# We take this, because it's either a bigger evalue than the threshold or a non-clustered phage protein
+				push(@liste_to_add,$gene);
+			}
+		}
+	}
+}
+close AFI; # the annotation file is not left open once it is read
+if (@liste_to_add){
+	# The list is written on one line, the gene ids being separated by commas
+	print "Listing the new prots to add\n";
+	open S2, '>', $new_prot_list;
+	print S2 join(",",@liste_to_add)."\n";
+	close S2;
+}
+else{
+	print "No new prots, no list\n";
+}
+
+
+# Check if a prediction is within another of the same type, in which case we don't really care.
+# Arguments : the id of the wide prediction, then the id of the short one, both expected to
+#             look like "<anything>-gene_<number of the first gene>-gene_<number of the last gene>"
+# Returns 1 if the genes of the short prediction are all within the wide one and the two ranges
+# are not identical, 0 otherwise (including when one of the ids cannot be parsed)
+sub overlap {
+	my ($pred_wide,$pred_short)=@_;
+	# Matches in list context : a failed match returns an empty list, instead of leaving in $1 and $2
+	# the values captured by an earlier match
+	my ($start_pred_wide,$end_pred_wide)=($pred_wide=~/.*-gene_(\d+)-gene_(\d+)/);
+	my ($start_pred_short,$end_pred_short)=($pred_short=~/.*-gene_(\d+)-gene_(\d+)/);
+	return 0 unless (defined($end_pred_wide) && defined($end_pred_short));
+	my $is_same=($start_pred_short==$start_pred_wide && $end_pred_short==$end_pred_wide);
+	my $is_within=($start_pred_short>=$start_pred_wide && $end_pred_short<=$end_pred_wide);
+	return ($is_within && !$is_same) ? 1 : 0;
+}
diff --git a/virsorter/Scripts/Step_5_get_phage_fasta-gb.pl b/virsorter/Scripts/Step_5_get_phage_fasta-gb.pl
new file mode 100755
index 0000000..175b5ad
--- /dev/null
+++ b/virsorter/Scripts/Step_5_get_phage_fasta-gb.pl
@@ -0,0 +1,444 @@
+#!/usr/bin/env perl
+
+use strict;
+use autodie;
+use Bio::SeqIO;
+use Bio::Seq;
+use Bio::SeqFeature::Generic;
+use Bio::Location::Simple;
+use Bio::Location::Split;
+use Cwd 'cwd';
+use File::Spec::Functions;
+use File::Path 'mkpath';
+
+# Script to get the fasta and GenBank files of the predicted sequences from VirSorter results
+# Argument 0 : code of the run
+# Argument 1 : working directory (optional, the current directory is used when it is not given)
+if (!defined($ARGV[0]) || grep { $ARGV[0] eq $_ } ("-h","--h","-help","--help"))
+{
+	print "# Script to get fasta file from VirSorter results
+# Argument 0 : code of the run\n";
+	die "\n";
+}
+
+my $code=$ARGV[0] or die 'No code';
+my $wdir=$ARGV[1];
+$wdir=cwd() unless ($wdir);
+# The sequences are written in the Predicted_viral_sequences directory of the working directory
+my $dir_out=catdir($wdir,"Predicted_viral_sequences");
+# This directory is created if it does not exist yet
+mkpath($dir_out) unless (-d $dir_out);
+
+# Each prophage region is extended by 50 nt before and beyond
+my $decal=50;
+print "Code $code\n";
+# One fasta and one GenBank file for each category : 1 to 3, then 4 to 6 for the prophages
+my $out_file_1  = catfile($dir_out, "${code}_cat-1.fasta");
+my $out_file_2  = catfile($dir_out, "${code}_cat-2.fasta");
+my $out_file_3  = catfile($dir_out, "${code}_cat-3.fasta");
+my $out_file_p1 = catfile($dir_out, "${code}_prophages_cat-4.fasta");
+my $out_file_p2 = catfile($dir_out, "${code}_prophages_cat-5.fasta");
+my $out_file_p3 = catfile($dir_out, "${code}_prophages_cat-6.fasta");
+my $gb_file_1   = catfile($dir_out, "${code}_cat-1.gb");
+my $gb_file_2   = catfile($dir_out, "${code}_cat-2.gb");
+my $gb_file_3   = catfile($dir_out, "${code}_cat-3.gb");
+my $gb_file_p1  = catfile($dir_out, "${code}_prophages_cat-4.gb");
+my $gb_file_p2  = catfile($dir_out, "${code}_prophages_cat-5.gb");
+my $gb_file_p3  = catfile($dir_out, "${code}_prophages_cat-6.gb");
+# Only the fasta files are listed in the message
+print "The sequences will be put in:\n";
+foreach my $listed_file (
+    $out_file_1,
+    $out_file_2,
+    $out_file_3,
+    $out_file_p1, $out_file_p2, $out_file_p3,
+){
+    print " - $listed_file\n";
+}
+
+# Input files of the run, all of them in the working directory (the fasta files in its fasta sub-directory)
+my $summary       = catfile($wdir, "${code}_global-phage-signal.csv");
+my $last_affi     = catfile($wdir, "${code}_phage-signal.csv");
+my $affi_contigs  = catfile($wdir, "${code}_affi-contigs.csv");
+my $fasta_contigs = catfile($wdir, 'fasta', "${code}_nett_filtered.fasta");
+my $fasta_prot    = catfile($wdir, 'fasta', "${code}_prots.fasta");
+print "Checking '$last_affi'\n";
+
+if (-e $last_affi){
+	# Read the summary of the predictions : %compte counts the predictions of each category, %check
+	# holds, for each contig and fragment, the category, the summary line and the circular flag
+	my %compte=(1=>0,2=>0,3=>0,4=>0,5=>0,6=>0);
+	my %check;
+	my $current_c="";
+	open(SUM, '<', $summary);
+	while (my $line=<SUM>){
+		chomp($line);
+		# A "## <digit>" line starts the list of a category, the other "##" lines are skipped
+		if ($line=~/## (\d)/){
+			$current_c=$1;
+			next;
+		}
+		next if ($line=~/##/);
+		my @tab=split(",",$line);
+		# Parentheses and brackets are replaced by underscores in the contig (0) and fragment (2) ids
+		foreach my $col (0,2){
+			$tab[$col]=~s/[()\[\]]/_/g;
+		}
+		my ($contig,$fragment)=($tab[0],$tab[2]);
+		if ($contig=~/(.*)-circular/){
+			$check{$contig}{$fragment}{"circular"}=1;
+		}
+		if ($contig eq ""){
+			print "!!!!! void\n";
+			next;
+		}
+		$check{$contig}{$fragment}{"prophage"}=$fragment;
+		# The line is kept as it was read, it is split again when the sequences are written
+		$check{$contig}{$fragment}{"line"}=$line;
+		$compte{$current_c}++;
+		$check{$contig}{$fragment}{"category"}=$current_c;
+	}
+	close SUM;
+	# Code of the run, then number of predictions of each category
+	print "$code\t$compte{1}\t$compte{2}\t$compte{3}\t$compte{4}\t$compte{5}\t$compte{6}\n";
+	# Get the sequence annotation : coordinates, strand and affiliation of each gene of each contig,
+	# "order" being the rank of the gene in the file
+	my $id_c="";
+	my %infos;
+	my $i=0;
+	open(ANOT,'<',$affi_contigs) || die ("pblm opening file $affi_contigs\n"); # 3-arg open : the path is never read as a mode
+	while(<ANOT>){
+		chomp($_);
+		if ($_=~/>(.*)/){
+			# Contig line : parentheses and brackets are replaced by underscores in the contig id
+			my @tab=split(/\|/,$1);
+			$tab[0]=~s/[()\[\]]/_/g;
+			$id_c=$tab[0];
+		}
+		else{
+			#     0  | 1   | 2  |  3   |  4   |    5     |  6  |   7  |   8     |   9    | 10   | 11
+			# gene_id|start|stop|length|strand|affi_phage|score|evalue|category|affi_pfam|score|evalue|
+			my @tab=split(/\|/,$_);
+			# Only the gene_<number> part of the gene id is kept. A line without it (an empty line for
+			# instance) is skipped : $1 would otherwise be taken from an earlier match, or be undefined
+			my ($gene)=($tab[0]=~/.*-(gene_\d*)/);
+			next unless (defined($gene));
+			$infos{$id_c}{$gene}{"start"}=$tab[1];
+			$infos{$id_c}{$gene}{"stop"}=$tab[2];
+			$infos{$id_c}{$gene}{"length"}=$tab[3];
+			$infos{$id_c}{$gene}{"strand"}=$tab[4];
+			$infos{$id_c}{$gene}{"category"}=-1;
+			$infos{$id_c}{$gene}{"order"}=$i;
+			$i++;
+			if ($tab[5] eq "-"){ ## no Phage Cluster affiliation
+				if ($tab[9] eq "-"){ ## no PFAM either, ok..
+					$infos{$id_c}{$gene}{"affi"}="hypothetical protein";
+				}
+				else{
+					$infos{$id_c}{$gene}{"affi"}="PFAM-".$tab[9];
+				}
+			}
+			else{
+				if ($tab[9] eq "-"){ ## Phage Cluster affiliation only
+					$infos{$id_c}{$gene}{"affi"}=$tab[5];
+				}
+				else{ ## Both affiliations : the label is the Phage Cluster followed by the PFAM
+					$infos{$id_c}{$gene}{"affi"}=$tab[5]."_"."PFAM-".$tab[9];
+				}
+			}
+		}
+	}
+	close ANOT;
+	# Get the protein sequence ("seq") of each gene which has an annotation
+	open(FA,'<',$fasta_prot) || die ("pblm opening file $fasta_prot");
+	my $gene_c="";
+	my $tag=0; # 1 while the sequence lines of an annotated gene are read
+	while (<FA>){
+		chomp($_);
+		if ($_=~/^>(\S*)/){
+			$tag=0;
+			my $gene_temp=$1;
+			($id_c,$gene_c)=($gene_temp=~/(.*)-(gene_\d*)/); # both undefined if the id is not <contig id>-gene_<number>
+			$id_c=~s/[()\[\]]/_/g if (defined($id_c)); # same replacement as for the contig ids used as keys of %infos
+			if(defined($gene_c) && exists($infos{$id_c}) && defined($infos{$id_c}{$gene_c})){$tag=1;}
+		}
+		elsif($tag==1){
+			$infos{$id_c}{$gene_c}{"seq"}.=$_;
+		}
+	}
+	close FA;
+	# Now get all the fasta cut of the contigs : the fasta and GenBank outputs of the six categories are opened first
+	open(SP1,'>',$out_file_p1) || die ("pblm opening file $out_file_p1\n"); # 3-arg opens : the paths are never read as modes
+	open(SP2,'>',$out_file_p2) || die ("pblm opening file $out_file_p2\n");
+	open(SP3,'>',$out_file_p3) || die ("pblm opening file $out_file_p3\n");
+	open(S1,'>',$out_file_1) || die ("pblm opening file $out_file_1\n");
+	open(S2,'>',$out_file_2) || die ("pblm opening file $out_file_2\n");
+	open(S3,'>',$out_file_3) || die ("pblm opening file $out_file_3\n");
+	my $output_1 = Bio::SeqIO->new(-file => ">$gb_file_1",-format => 'GenBank');
+	my $output_2 = Bio::SeqIO->new(-file => ">$gb_file_2",-format => 'GenBank');
+	my $output_3 = Bio::SeqIO->new(-file => ">$gb_file_3",-format => 'GenBank');
+	my $output_p1 = Bio::SeqIO->new(-file => ">$gb_file_p1",-format => 'GenBank');
+	my $output_p2 = Bio::SeqIO->new(-file => ">$gb_file_p2",-format => 'GenBank');
+	my $output_p3 = Bio::SeqIO->new(-file => ">$gb_file_p3",-format => 'GenBank');
+	my $sequence=0; # sequence object being built, one for each prediction
+	open(FASTA,'<',$fasta_contigs) || die ("pblm opening file $fasta_contigs\n");
+	$id_c=""; # id of the contig being read
+	my $seq_c=""; # sequence of the contig being read
+	# Read the contigs one after the other : the predictions of a contig are written when the header of
+	# the next contig is reached, $id_c and $seq_c being then the id and the sequence of the finished contig
+	while (<FASTA>){
+		chomp($_);
+		if ($_=~/^>(.*)/){
+			my $id_c_temp=$1;
+			# Parentheses and brackets are replaced by underscores in the contig id
+			$id_c_temp=~s/[()\[\]]/_/g;
+			if (defined($check{$id_c})){
+				my $id_red=$id_c;
+				print "We had checked $id_c -> $id_red\n";
+				# One prediction is written for each fragment of this contig found in the summary
+				foreach(keys %{$check{$id_c}}){
+					$id_red=$id_c;
+					my $category=$check{$id_c}{$_}{"category"};
+					my @tab=split(",",$check{$id_c}{$_}{"line"});
+					$tab[0]=~s/[()\[\]]/_/g;
+					$tab[2]=~s/[()\[\]]/_/g;
+					my $desc="Putative phage sequence (category $category), predicted by PhageSorter";
+					my $iscirc=0;
+					if ($check{$id_c}{$tab[2]}{"circular"}==1){
+						$iscirc=1;
+					}
+					if ($category<=3){
+						# Categories 1 to 3 : the whole contig is written, annotated with all its genes
+						print ".. predicted to be a complete phage..\n";
+						$sequence = Bio::Seq::RichSeq->new(-display_id => "$id_red", -accession_number => "$id_red", -desc => $desc ,-seq =>"$seq_c",-is_circular =>$iscirc,-division => "ENV",-alphabet => "dna");
+						$sequence->add_date(`date +%D`);
+						my $featsource = Bio::SeqFeature::Generic->new(-start => 1,-end => length($seq_c),-primary => "source",-tag => {'organism' => "$desc"});
+						$sequence->add_SeqFeature($featsource);
+						foreach(sort { $infos{$id_c}{$a}{"order"} <=> $infos{$id_c}{$b}{"order"} } keys %{$infos{$id_c}}){
+							my $gene=$_;
+							my $splitlocation = Bio::Location::Split->new();
+							my $strand=0;
+							if ($infos{$id_c}{$gene}{"strand"} eq "-"){$strand=-1;}
+							# A gene with its stop before its start is written as a join of two locations :
+							# from its start to the end of the sequence, then from the first base to its stop
+							if ($infos{$id_c}{$gene}{"stop"} < $infos{$id_c}{$gene}{"start"}){
+								$splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>$infos{$id_c}{$gene}{"start"},-end=>length($seq_c),-strand=>$strand));
+								$splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>1,-end=>$infos{$id_c}{$gene}{"stop"},-strand=>$strand));
+							}
+							else{
+								$splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>$infos{$id_c}{$gene}{"start"},-end=>$infos{$id_c}{$gene}{"stop"},-strand=>$strand));
+							}
+							my $featgene = Bio::SeqFeature::Generic->new(-location => $splitlocation,-primary => "gene",-tag => {'gene' => "$gene",'locus_tag' => $id_c."_".$gene});
+							$sequence->add_SeqFeature($featgene);
+							my $product=$infos{$id_c}{$gene}{"affi"};
+							my $note="Predicted by MGA";
+							my $featcds = Bio::SeqFeature::Generic->new(-location=>$splitlocation,-primary => "CDS",-tag => {'product' => "$product",'note' => "$note",'locus_tag' => $id_c."_".$gene,'codon_start' => "1",'gene' => "$gene",'transl_table' => "11"});
+							$sequence->add_SeqFeature($featcds);
+							$featcds->add_tag_value('translation',$infos{$id_c}{$gene}{"seq"});
+						}
+						if ($category==1){
+							print S1 ">$id_red-cat_$category\n$seq_c\n";
+							$output_1->write_seq($sequence);
+						}
+						elsif ($category==2){
+							print S2 ">$id_red-cat_$category\n$seq_c\n";
+							$output_2->write_seq($sequence);
+						}
+						elsif ($category==3){
+							print S3 ">$id_red-cat_$category\n$seq_c\n";
+							$output_3->write_seq($sequence);
+						}
+					}
+					else{
+						# Categories 4 to 6 : only the region of the prophage is written, from the start of its
+						# first gene to the stop of its last gene, extended by $decal nt on both sides
+						print ".. predicted to be a prophage..\n";
+						# \Q..\E : the contig id is matched literally, whatever the characters it contains
+						if ($tab[2]=~/^\Q$id_c\E-(gene_\d+)-(gene_\d+)/){
+							my $gene_start=$1;
+							my $start=$infos{$id_c}{$gene_start}{"start"}-$decal;
+							if ($start<0){$start=0;}
+							my $gene_stop=$2;
+							my $stop=$infos{$id_c}{$gene_stop}{"stop"}+$decal;
+							my $length=$stop-$start;
+							print "  from $gene_start to $gene_stop .. from $start to $stop ($length)\n";
+							my $substr=substr($seq_c,$start,$length);
+							$iscirc=0; # An integrated prophage cannot be circular, so set this to linear
+							my $display_id=$id_red."_".$gene_start."_".$gene_stop."-".$start."-".$stop."-cat_".$category;
+							$sequence = Bio::Seq::RichSeq->new(-display_id => "$display_id", -accession_number => "$display_id", -desc => $desc ,-seq =>"$substr",-is_circular =>$iscirc,-division => "ENV",-alphabet => "dna");
+							$sequence->add_date(`date +%D`);
+							my $featsource = Bio::SeqFeature::Generic->new(-start => 1,-end => length($substr),-primary => "source",-tag => {'organism' => "$desc"});
+							$sequence->add_SeqFeature($featsource);
+							foreach(sort { $infos{$id_c}{$a}{"order"} <=> $infos{$id_c}{$b}{"order"} } keys %{$infos{$id_c}}){
+								my $gene=$_;
+								# Check if the gene is in the fragment entirely : the fragment goes from the position
+								# after $start to $stop, the coordinates written being the ones of the contig minus $start
+								if (($infos{$id_c}{$gene}{"start"}>$start) && ($infos{$id_c}{$gene}{"start"}<=$stop) && ($infos{$id_c}{$gene}{"stop"}>$start) && ($infos{$id_c}{$gene}{"stop"}<=$stop)){
+									my $splitlocation = Bio::Location::Split->new();
+									my $strand=0;
+									if ($infos{$id_c}{$gene}{"strand"} eq "-"){$strand=-1;}
+									$splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>$infos{$id_c}{$gene}{"start"}-$start,-end=>$infos{$id_c}{$gene}{"stop"}-$start,-strand=>$strand));
+									my $featgene = Bio::SeqFeature::Generic->new(-location => $splitlocation,-primary => "gene",-tag => {'gene' => "$gene",'locus_tag' => $id_c."_".$gene});
+									$sequence->add_SeqFeature($featgene);
+									my $product=$infos{$id_c}{$gene}{"affi"};
+									my $note="Predicted by MGA";
+									my $featcds = Bio::SeqFeature::Generic->new(-location=>$splitlocation,-primary => "CDS",-tag => {'product' => "$product",'note' => "$note",'locus_tag' => $id_c."_".$gene,'codon_start' => "1",'gene' => "$gene",'transl_table' => "11"});
+									$sequence->add_SeqFeature($featcds);
+									$featcds->add_tag_value('translation',$infos{$id_c}{$gene}{"seq"});
+								}
+							}
+							if ($category==4){
+								print SP1 ">".$display_id."\n".$substr."\n";
+								$output_p1->write_seq($sequence);
+							}
+							elsif ($category==5){
+								print SP2 ">".$display_id."\n".$substr."\n";
+								$output_p2->write_seq($sequence);
+							}
+							elsif($category==6){
+								print SP3 ">".$display_id."\n".$substr."\n";
+								$output_p3->write_seq($sequence);
+							}
+						}
+						else{
+							print "Pblm with $tab[2] - tab 2\n";
+						}
+					}
+				}
+			}
+			$id_c=$id_c_temp;
+			$seq_c="";
+		}
+		else{$seq_c.=$_;}
+	}
+	close FASTA;
+	# We do not forget the last one : no header follows the last contig of the file, so its predictions are written here
+	if (defined($check{$id_c})){
+		my $id_red=$id_c;
+		print "We had checked $id_c -> $id_red\n";
+		foreach(keys %{$check{$id_c}}){
+			$id_red=$id_c;
+			my $category=$check{$id_c}{$_}{"category"};
+			my @tab=split(",",$check{$id_c}{$_}{"line"});
+			$tab[0]=~s/[()\[\]]/_/g;
+			$tab[2]=~s/[()\[\]]/_/g;
+			my $desc="Putative phage sequence (category $category), predicted by PhageSorter";
+			my $iscirc=0;
+			if ($check{$id_c}{$tab[2]}{"circular"}==1){
+				$iscirc=1;
+			}
+			if ($category<=3){
+				# Categories 1 to 3 : the whole contig is written, annotated with all its genes
+				print ".. predicted to be a complete phage..\n";
+				$sequence = Bio::Seq::RichSeq->new(-display_id => "$id_red", -accession_number => "$id_red", -desc => $desc ,-seq =>"$seq_c",-is_circular =>$iscirc,-division => "ENV",-alphabet => "dna");
+				$sequence->add_date(`date +%D`);
+				my $featsource = Bio::SeqFeature::Generic->new(-start => 1,-end => length($seq_c),-primary => "source",-tag => {'organism' => "$desc"});
+				$sequence->add_SeqFeature($featsource);
+				foreach(sort { $infos{$id_c}{$a}{"order"} <=> $infos{$id_c}{$b}{"order"} } keys %{$infos{$id_c}}){
+					my $gene=$_;
+					my $splitlocation = Bio::Location::Split->new();
+					my $strand=0;
+					if ($infos{$id_c}{$gene}{"strand"} eq "-"){$strand=-1;}
+					# A gene with its stop before its start is written as a join of two locations :
+					# from its start to the end of the sequence, then from the first base to its stop
+					if ($infos{$id_c}{$gene}{"stop"} < $infos{$id_c}{$gene}{"start"}){
+						$splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>$infos{$id_c}{$gene}{"start"},-end=>length($seq_c),-strand=>$strand));
+						$splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>1,-end=>$infos{$id_c}{$gene}{"stop"},-strand=>$strand));
+					}
+					else{
+						$splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>$infos{$id_c}{$gene}{"start"},-end=>$infos{$id_c}{$gene}{"stop"},-strand=>$strand));
+					}
+					my $featgene = Bio::SeqFeature::Generic->new(-location => $splitlocation,-primary => "gene",-tag => {'gene' => "$gene",'locus_tag' => $id_c."_".$gene});
+					$sequence->add_SeqFeature($featgene);
+					my $product=$infos{$id_c}{$gene}{"affi"};
+					my $note="Predicted by MGA";
+					my $featcds = Bio::SeqFeature::Generic->new(-location=>$splitlocation,-primary => "CDS",-tag => {'product' => "$product",'note' => "$note",'locus_tag' => $id_c."_".$gene,'codon_start' => "1",'gene' => "$gene",'transl_table' => "11"});
+					$sequence->add_SeqFeature($featcds);
+					$featcds->add_tag_value('translation',$infos{$id_c}{$gene}{"seq"});
+				}
+				if ($category==1){
+					print S1 ">$id_red-cat_$category\n$seq_c\n";
+					$output_1->write_seq($sequence);
+				}
+				elsif ($category==2){
+					print S2 ">$id_red-cat_$category\n$seq_c\n";
+					$output_2->write_seq($sequence);
+				}
+				elsif ($category==3){
+					print S3 ">$id_red-cat_$category\n$seq_c\n";
+					$output_3->write_seq($sequence);
+				}
+			}
+			else{
+				# Categories 4 to 6 : only the region of the prophage is written, from the start of its
+				# first gene to the stop of its last gene, extended by $decal nt on both sides
+				print ".. predicted to be a prophage..\n";
+				# \Q..\E : the contig id is matched literally, whatever the characters it contains
+				if ($tab[2]=~/^\Q$id_c\E-(gene_\d+)-(gene_\d+)/){
+					my $gene_start=$1;
+					my $start=$infos{$id_c}{$gene_start}{"start"}-$decal;
+					if ($start<0){$start=0;}
+					my $gene_stop=$2;
+					my $stop=$infos{$id_c}{$gene_stop}{"stop"}+$decal;
+					my $length=$stop-$start;
+					print "  from $gene_start to $gene_stop .. from $start to $stop ($length)\n";
+					my $substr=substr($seq_c,$start,$length);
+					$iscirc=0; # An integrated prophage cannot be circular, so set this to linear
+					my $display_id=$id_red."_".$gene_start."_".$gene_stop."-".$start."-".$stop."-cat_".$category;
+					$sequence = Bio::Seq::RichSeq->new(-display_id => "$display_id", -accession_number => "$display_id", -desc => $desc ,-seq =>"$substr",-is_circular =>$iscirc,-division => "ENV",-alphabet => "dna");
+					$sequence->add_date(`date +%D`);
+					my $featsource = Bio::SeqFeature::Generic->new(-start => 1,-end => length($substr),-primary => "source",-tag => {'organism' => "$desc"});
+					$sequence->add_SeqFeature($featsource);
+					foreach(sort { $infos{$id_c}{$a}{"order"} <=> $infos{$id_c}{$b}{"order"} } keys %{$infos{$id_c}}){
+						my $gene=$_;
+						# Check if the gene is in the fragment entirely : start and stop are both compared, in the
+						# coordinates of the contig, to the fragment which goes from the position after $start to $stop
+						if (($infos{$id_c}{$gene}{"start"}>$start) && ($infos{$id_c}{$gene}{"start"}<=$stop) && ($infos{$id_c}{$gene}{"stop"}>$start) && ($infos{$id_c}{$gene}{"stop"}<=$stop)){
+							my $splitlocation = Bio::Location::Split->new();
+							my $strand=0;
+							if ($infos{$id_c}{$gene}{"strand"} eq "-"){$strand=-1;}
+							$splitlocation->add_sub_Location(Bio::Location::Simple->new(-start=>$infos{$id_c}{$gene}{"start"}-$start,-end=>$infos{$id_c}{$gene}{"stop"}-$start,-strand=>$strand));
+							my $featgene = Bio::SeqFeature::Generic->new(-location => $splitlocation,-primary => "gene",-tag => {'gene' => "$gene",'locus_tag' => $id_c."_".$gene});
+							$sequence->add_SeqFeature($featgene);
+							my $product=$infos{$id_c}{$gene}{"affi"};
+							my $note="Predicted by MGA";
+							my $featcds = Bio::SeqFeature::Generic->new(-location=>$splitlocation,-primary => "CDS",-tag => {'product' => "$product",'note' => "$note",'locus_tag' => $id_c."_".$gene,'codon_start' => "1",'gene' => "$gene",'transl_table' => "11"});
+							$sequence->add_SeqFeature($featcds);
+							$featcds->add_tag_value('translation',$infos{$id_c}{$gene}{"seq"});
+						}
+					}
+					if ($category==4){
+						print SP1 ">".$display_id."\n".$substr."\n";
+						$output_p1->write_seq($sequence);
+					}
+					elsif ($category==5){
+						print SP2 ">".$display_id."\n".$substr."\n";
+						$output_p2->write_seq($sequence);
+					}
+					elsif($category==6){
+						print SP3 ">".$display_id."\n".$substr."\n";
+						$output_p3->write_seq($sequence);
+					}
+				}
+				else{
+					print "Pblm with $tab[2] - tab 2\n";
+				}
+			}
+		}
+	}
+	# All the predictions are written : close the fasta files, then the GenBank ones
+	foreach my $fasta_out (
+		\*S1, \*S2, \*S3,
+		\*SP1, \*SP2, \*SP3,
+	){
+		close($fasta_out);
+	}
+	foreach my $gb_out (
+		$output_1, $output_2, $output_3, $output_p1, $output_p2, $output_p3,
+	){
+		$gb_out->close();
+	}
+}
+else{
+	print "$code\tin progress\n";
+}
diff --git a/virsorter/Scripts/Step_first_add_custom_phage_sequence.pl b/virsorter/Scripts/Step_first_add_custom_phage_sequence.pl
new file mode 100755
index 0000000..8db79f4
--- /dev/null
+++ b/virsorter/Scripts/Step_first_add_custom_phage_sequence.pl
@@ -0,0 +1,389 @@
+#!/usr/bin/env perl
+
+use strict;
+use autodie;
+use FindBin '$Bin';
+use Bio::Seq;
+use File::Spec::Functions;
+use File::Path 'mkpath';
+use File::Which 'which';
+# Script to generate a new db with putative new clusters
+# Argument 0 : fasta file of the custom phages
+# Argument 1 : db-in directory
+# Argument 2 : db-out directory
+if (!defined($ARGV[2]) || grep { $ARGV[0] eq $_ } ("-h","--h","-help","--help")){
+	print "# Script to generate a new db with putative new clusters
+# Argument 0 : fasta of custom phages
+# Argument 1 : db-in directory
+# Argument 2 : db-out directory
+\n";
+	die "\n";
+}
+
+
+# Arguments : fasta of the custom phages, directory of the db to start from, directory of the new db
+my ($fasta_contigs,$db_in,$db_out)=@ARGV[0..2];
+
+# Smallest number of sequences of a new cluster, and number of cpus given to hmmsearch
+my $min_seq_in_a_cluster=3;
+my $n_cpus=8;
+
+my $virsorter_dir    = '/usr/local/bin/Virsorter/';
+my $path_to_formatdb = which('formatdb'); # the external tools are looked for with which
+my $path_to_blastall = which('blastall');
+my $path_to_muscle   = which('muscle');
+my $path_to_hmmbuild = which('hmmbuild');
+my $path_to_hmmpress = which('hmmpress');
+my $path_hmmsearch   = which('hmmsearch');
+my $path_to_mga      = which('mga_linux_ia64');
+my $MCX_LOAD         = which('mcxload');
+my $MCL              = which('mcl');
+
+my $tmp_dir=$db_out."/initial_db"; # working copy of the input db, in the output directory
+mkpath($tmp_dir); # creates the missing directories, $db_out included, and dies if it cannot
+# cp gets its arguments as a list, so no shell parses the paths ; a failed copy is only reported
+system("cp",glob("$db_in/*"),$tmp_dir."/")==0 or warn "pblm copying the files of $db_in to $tmp_dir\n";
+$tmp_dir.="/";
+my $log_out=$tmp_dir."log_out_step_custom_phage";
+my $log_err=$tmp_dir."log_err_step_custom_phage";
+my $db_phage              = $tmp_dir . "Pool_clusters.hmm";
+my $blastable_unclustered = $tmp_dir . "Pool_new_unclustered";
+my $fasta_unclustered     = $tmp_dir . "Pool_new_unclustered.faa";
+my $ref_phage_clusters    = $tmp_dir . "Phage_Clusters_current.tab";
+my $blast_unclustered     = $tmp_dir . "Blast_unclustered.tab";
+
+# Read the custom phages : sequence (%seq_base) and rank in the file (%order_contig) of each contig
+my (%seq_base,%order_contig);
+my ($id_seq,$i)=("",0);
+open(F1,"<$fasta_contigs") || die "pblm ouverture fichier $fasta_contigs\n";
+while(my $line=<F1>){
+	$line=~s/\r\n/\n/g; # Case of a file from Windows
+	chomp($line);
+	# Header : the id of the contig is the text after ">", up to the first space
+	if ($line=~/^>(\S*)/){$id_seq=$1;$order_contig{$id_seq}=$i++;next;}
+	$seq_base{$id_seq}.=$line;
+}
+close F1;
+
+
+# Predict genes on the new phages (output of the MGA command written in the db-out directory), then read the predictions
+my $out_file= $db_out."/Custom_phages_mga.predict";
+print "$path_to_mga $fasta_contigs -m > $out_file\n";
+my $mga=`$path_to_mga $fasta_contigs -m > $out_file`;
+my %order_gene; # rank of each gene in its contig, in the order of the predictions
+my $n2=0; # next rank in the current contig
+open(RESU,"<$out_file")  || die "pblm ouverture fichier $out_file\n";
+my %predict; # prediction line of each gene of each contig
+my %type; # text of the "# self:" line of each contig
+my $id_c=""; # id of the current contig
+while(<RESU>){
+	chomp($_);
+	if($_=~/^# gc/){} # the "# gc" lines are ignored
+	elsif($_=~/^# self: (.*)/){$type{$id_c}=$1;}
+	elsif ($_=~/^# (.*)/){ # any other "# " line starts a new contig, the rest of the line being its id
+		$id_c=$1;
+		$n2=0;
+	}
+	else{ # prediction line, tab-separated, its first field being the key of the gene
+		my @tab=split("\t",$_);
+		$predict{$id_c}{$tab[0]}=$_;
+		if (!defined($order_gene{$id_c}{$tab[0]})){$order_gene{$id_c}{$tab[0]}=$n2;$n2++;}
+	}
+}
+close RESU;
+my %check_prot; # set to 1 for each protein written in the protein file below
+my $prot_file=$tmp_dir."/Custom_phages_mga_prots.fasta";
+open(PROT,">$prot_file") || die "pblm ouverture fichier $prot_file\n";
+my $n=0;
+foreach(sort {$order_contig{$a} <=> $order_contig{$b} } keys %predict){ # contigs, in the order of the fasta file
+	$n++;
+	my $id=$_;
+	my @tab_genes=sort {$order_gene{$id}{$a} <=> $order_gene{$id}{$b} } keys %{$predict{$id}};
+	## We check the first gene and modify it if needed
+	my $seq_c=$seq_base{$id};
+	foreach(@tab_genes){
+		my @tab=split("\t",$predict{$id}{$_});
+		if ($tab[5]!=11){ # NOTE(review): column 5 is presumably the complete/partial flag of MGA, 11 being a complete gene - confirm
+			# either we are at the very beginning of the sequence, or at its very end (theoretically)
+			if ($tab[4]!=0){ # column 4 (read below as the frame) is removed from the stop (- strand) or added to the start (+ strand)
+				if ($tab[3] eq "-"){
+					$tab[2]-=$tab[4];
+				}
+				elsif($tab[3] eq "+"){
+					$tab[1]+=$tab[4];
+				}
+				else{
+					print "%%%%%% pblm on a pas de sens pour $id : @tab\n";
+				}
+			}
+			my $new_line=join("\t",@tab);
+			$predict{$id}{$_}=$new_line;
+		}
+
+		@tab=split("\t",$predict{$id}{$_});
+		my $name=$tab[0];
+		my $start=$tab[1];
+		my $stop=$tab[2];
+		my $sens=$tab[3];
+		my $frame=$tab[4];
+		my $frag="";
+		# "Normal" case (the gene does not span the origin)
+		if ($start<$stop){
+			my $long=$stop-$start+1;
+			$frag=substr($seq_c,$start-1,$long);
+		}
+		# Exceptional case, the gene spans the origin of the contig : end of the sequence, then its beginning
+		else{
+			my $l1=length($seq_c)-$start+1;
+			$frag=substr($seq_c,$start-1,$l1);
+			$frag.=substr($seq_c,0,$stop);
+		}
+		## To get the protein sequence
+		my $seq_bio = Bio::Seq->new(-seq =>$frag,-alphabet => 'dna' );
+		my @seqs = Bio::SeqUtils->translate_6frames($seq_bio); # NOTE(review): Bio::SeqUtils is not in the use lines of this file - confirm that it is loaded by another module
+		my $cadre=0;
+		if ($sens eq "-"){$cadre=3;} # translation 0 is used for the + strand, translation 3 for the - strand
+		my $prot=$seqs[$cadre];
+		my $prot_sequence=$prot->seq;
+		if ($prot_sequence=~/\*$/){
+# 				the final stop codon is removed (the commented-out message said it was for muscle)
+			chop($prot_sequence);
+		}
+		my $id_out=$id."-".$name;
+		print PROT ">$id_out\n$prot_sequence\n";
+		$check_prot{$id_out}=1;
+	}
+}
+close PROT;
+# Clustering the proteins
+# - 1 - Hmmsearch vs the original db : the proteins with a good hit are set to 0 in %check_prot
+my $out_hmmsearch=$tmp_dir."New_prots_vs_Phagedb.tab"; # table of the hits (--tblout), read below
+my $out_hmmsearch_bis=$tmp_dir."New_prots_vs_Phagedb.out"; # main output of hmmsearch (-o)
+my $cmd_hmm_phage="$path_hmmsearch --tblout $out_hmmsearch --cpu $n_cpus -o $out_hmmsearch_bis --noali $db_phage $prot_file >> $log_out 2>> $log_err";
+print "Step 0.9 : $cmd_hmm_phage\n";
+`echo $cmd_hmm_phage >> $log_out 2>> $log_err`;
+my $out=`$cmd_hmm_phage`;
+print "$out\n";
+open(HMM,"<$out_hmmsearch") || die ("pblm opening file $out_hmmsearch\n");
+my $score_th=200; # lowest score of a good hit
+my $evalue_th=0.0000000001; # highest evalue of a good hit
+while(<HMM>){
+	chomp($_);
+	if ($_=~m/^#/){ # comment lines of the table
+		next;
+	}
+	else{
+		my @splign=split(m/\s+/,$_); # fields separated by any number of spaces
+		my $seq=$splign[0];
+		my $match=$splign[2];
+		my $evalue=$splign[4];
+		my $score=$splign[5];
+		if ($score>=$score_th && $evalue<=$evalue_th){
+			$check_prot{$seq}=0;
+		}
+	}
+}
+close HMM;
+# - 2 - All which does not match a known -> get it
+# The proteins still set to 1 in %check_prot are copied in the file to cluster, their sequences being also kept in %seq_temp
+my $prot_file_to_cluster=$tmp_dir."/Custom_phages_mga_prots-to-cluster.fasta";
+my $tag=0;
+my %seq_temp;
+my $id_c="";
+open(PROT,"<$prot_file") || die ("pblm opening file $prot_file\n");
+open(NEWPROT,">$prot_file_to_cluster") || die ("pblm opening file $prot_file_to_cluster\n");
+while (my $line=<PROT>){
+	chomp($line);
+	if ($line=~/^>(.*)/){
+		# New protein : its lines are copied only if it is set to 1
+		$id_c=$1;
+		$tag=($check_prot{$id_c}==1) ? 1 : 0;
+	}
+	elsif ($tag==1){
+		$seq_temp{$id_c}.=$line;
+	}
+	else{
+		next;
+	}
+	print NEWPROT "$line\n" if ($tag==1);
+}
+close PROT;
+close NEWPROT;
+# - 3 - and make new clusters : blastable db of the new proteins, made before the unclustered ones are added to the fasta file
+my $db=$tmp_dir."Custom_phages_mga_prots-to-cluster";
+my $cmd_format="$path_to_formatdb -i $prot_file_to_cluster -n $db";
+print "$cmd_format\n";
+my $out=`$cmd_format`;
+print "Formatdb : $out\n";
+my $cmd_cat="cat $fasta_unclustered >> $prot_file_to_cluster";
+print "$cmd_cat\n";
+$out=`$cmd_cat`;
+print "Cat : $out\n"; # output of the command, as for the other commands (the command itself is printed just above)
+#     - blast vs themselves and the unclustered
+my $out_blast=$tmp_dir."pool_unclustered-and-custom-phages-vs-custom-phages.tab";
+my $cmd_blast="$path_to_blastall -p blastp -i $prot_file_to_cluster -d $db -o $out_blast -m 8 -a 10 -e 0.00001"; # On 10 cores to keep a few alive for the rest of the scripts
+print "$cmd_blast\n";
+$out=`$cmd_blast`;
+print "Blast : $out\n";
+$cmd_cat="cat $blast_unclustered >> $out_blast"; # the blast results already in the db are added to the new ones
+print "$cmd_cat\n";
+$out=`$cmd_cat`;
+print "Cat : $out\n";
+print "Generating abc file\n";
+#     - mcl : the selected blast hits are written as "id <tab> id <tab> evalue" lines, loaded with mcxload then clustered with mcl
+my $out_abc=$tmp_dir."new_clusters.abc";
+my $th_score=50; # a hit is kept if its score (column 11) is above this value
+my $th_evalue=0.00001; # and if its evalue (column 10) is below this value
+my $max=200; # Max on sig (only used in the commented-out lines below)
+open(S1,">$out_abc") || die ("pblm opening file $out_abc\n");
+open(BL,"<$out_blast") || die ("pblm opening file $out_blast\n");
+while(<BL>){
+	chomp($_);
+	my @tab=split("\t",$_);
+	if ($tab[11]>$th_score && $tab[10]<$th_evalue && $tab[0] ne $tab[1]){ # the hits of a sequence on itself are not kept
+		my $evalue=$tab[10];
+# 		$evalue=-log10($evalue);
+# 		if ($evalue>$max){$evalue=$max;}
+		print S1 "$tab[0]\t$tab[1]\t$evalue\n";
+	}
+}
+close BL;
+close S1;
+my $out_mci=$tmp_dir."new_clusters.mci";
+my $out_tab=$tmp_dir."new_clusters.tab";
+my $cmd_mcxload="$MCX_LOAD -abc $out_abc --stream-mirror --stream-neg-log10 -stream-tf 'ceil(200)' -o $out_mci -write-tab $out_tab";
+print "$cmd_mcxload\n";
+$out=`$cmd_mcxload`;
+print "Mxc Load : $out\n";
+my $dump_file=$tmp_dir."new_clusters.csv"; # output of mcl, read afterwards with one group of ids on each line
+my $cmd_mcl="$MCL $out_mci -I 2  -use-tab $out_tab -o $dump_file";
+print "$cmd_mcl\n";
+$out=`$cmd_mcl`;
+print "Mcl : $out\n";
+#     - make new cluster
+my %unclustered;
+my %clusters;
+my %check_cluster;
+my $last_cluster_id=0;
+# Each line of the MCL output is one group of tab-separated sequence ids.
+# A group with at least $min_seq_in_a_cluster members becomes a new cluster, the members of the other groups are kept as unclustered
+open(my $dump_fh,"<$dump_file") || die "pblm ouverture fichier $dump_file\n";
+while(my $group=<$dump_fh>){
+	chomp($group);
+	my @members=split("\t",$group);
+	my $n_s_c=scalar(@members);
+	if ($n_s_c<$min_seq_in_a_cluster){
+		# group too small: its members stay unclustered
+		foreach my $member (@members){
+			$unclustered{$member}=1;
+			$check_cluster{$member}=1;
+		}
+		next;
+	}
+	# group large enough: clusters are numbered in the order they are read
+	$last_cluster_id++;
+	my $cluster_id="Phage_cluster_".$last_cluster_id."-c";
+	print "We found a cluster with $n_s_c sequences => Cluster $cluster_id\n";
+	foreach my $member (@members){
+		$clusters{$cluster_id}{$member}=1;
+		$check_cluster{$member}=1;
+	}
+}
+close $dump_fh;
+my %seq_temp; # sequence of every protein of the pooled file, keyed by the first word of its header
+my $id_c="";
+open(my $pool_fh,"<$prot_file_to_cluster") || die "pblm ouverture fichier $prot_file_to_cluster\n";
+while(my $line=<$pool_fh>){
+	chomp($line);
+	if (my ($header_id)=$line=~/^>(\S*)/){
+		$id_c=$header_id;
+		unless (defined($check_cluster{$id_c})){$unclustered{$id_c}=1;$check_cluster{$id_c}=1;} # not seen in the MCL output: unclustered
+	}
+	else{$seq_temp{$id_c}.=$line;}
+}
+close $pool_fh;
+`mkdir $tmp_dir/clusts`;
+foreach my $cluster_id (keys %clusters){
+	# one fasta file per new cluster
+	my $out_file=$tmp_dir."clusts/".$cluster_id.".faa";
+	open(my $faa_fh,">$out_file") || die "pblm ouverture fichier $out_file\n";
+	foreach my $member (keys %{$clusters{$cluster_id}}){
+		print $faa_fh ">$member\n$seq_temp{$member}\n";
+	}
+	close $faa_fh;
+}
+# - 4 - Plus add the unclustered to the unclustered database
+my $final_pool_unclustered=$db_out."/Pool_new_unclustered.faa";
+my $final_blastable_unclustered=$final_pool_unclustered;
+$final_blastable_unclustered=~s/\.faa$//; # anchored: only the extension is stripped, never a ".faa" found earlier in $db_out
+my $final_blast_unclustered=$db_out."/Blast_unclustered.tab";
+open(S1,">$final_pool_unclustered") || die "pblm ouverture fichier $final_pool_unclustered\n";
+foreach(keys %unclustered){
+	print S1 ">$_\n$seq_temp{$_}\n";
+}
+close S1;
+print "making a blastable db from the new unclustered\n";
+$out=`$path_to_formatdb -i $final_pool_unclustered -n $final_blastable_unclustered`;
+# the blast table is reduced as well: a hit is kept only when both of its proteins are unclustered (exists() also copes with short lines and unknown ids)
+open(BL,"<$out_blast") || die "pblm ouverture fichier $out_blast\n";
+open(S1,">$final_blast_unclustered") || die "pblm ouverture fichier $final_blast_unclustered\n";
+while(<BL>){
+	chomp($_);
+	my @tab=split("\t",$_);
+	if (@tab>1 && exists($unclustered{$tab[0]}) && exists($unclustered{$tab[1]})){
+		print S1 "$_\n";
+	}
+}
+close BL;
+close S1;
+# Generating the new database: one muscle alignment and one hmmbuild profile per new cluster
+my $tag=0;
+foreach(sort keys %clusters){
+	$tag=1;
+	my $ali_id=$_;
+	my $path_to_fasta=$tmp_dir."clusts/".$ali_id.".faa";
+	my $path_to_ali=$tmp_dir."clusts/".$ali_id.".ali_faa";
+	my $path_to_hmm=$tmp_dir."clusts/".$ali_id."_ali.hmm";
+	if (-e $path_to_ali){
+		`rm -f $path_to_ali $path_to_hmm`; # -f: the profile may be missing even when the alignment exists
+	}
+	my $muscle_out=$tmp_dir."log_out_muscle";
+	my $muscle_err=$tmp_dir."log_err_muscle";
+	`$path_to_muscle -in $path_to_fasta -out $path_to_ali > $muscle_out 2> $muscle_err`;
+	# rewrite the aligned fasta as a Stockholm file: one "id  sequence" line per protein, whitespace in ids replaced by "_"
+	my $out_stokcholm=$path_to_ali.".stockholm";
+	open(S1,">$out_stokcholm") || die "pblm opening $out_stokcholm\n";
+	print S1 "# STOCKHOLM 1.0\n";
+	open(FA,"<$path_to_ali") || die "pblm ouverture $path_to_ali\n";
+	while(<FA>){
+		chomp($_);
+		if ($_=~/^>(.*)/){
+			my $id=$1;
+			$id=~s/\s/_/g;
+			print S1 "\n$id  ";
+		}
+		else{print S1 "$_";}
+	}
+	close FA;
+	print S1 "\n//\n";
+	close S1; # the file has to be complete on disk before hmmbuild reads it
+	`$path_to_hmmbuild --amino $path_to_hmm $out_stokcholm`;
+}
+# Pool all the hmm profiles, the previous ones ($db_phage) included ! Only hmm files are concatenated here.
+$out=`cat $db_phage > $db_out/Pool_clusters.hmm`;
+print "cat previous hmm : $out\n";
+$out=`cat $tmp_dir/clusts/*.hmm >> $db_out/Pool_clusters.hmm`;
+print "cat new hmm : $out\n";
+# Run hmmpress on the pooled file to turn it into a database that hmmscan can screen
+$out=`$path_to_hmmpress $db_out/Pool_clusters.hmm`;
+print "hmm press :$out\n";
+# update the phage cluster catalog: copy the previous catalog, then append one "id|2||member list" line per new cluster
+my $final_catalog=$db_out."/Phage_Clusters_current.tab";
+$out=`cat $ref_phage_clusters > $final_catalog`;
+print "Cat old catalog : $out\n";
+open(CA,">>$final_catalog") || die ("pblm opening file $final_catalog\n");
+foreach(keys %clusters){
+	my $liste=join(" ",keys %{$clusters{$_}});
+	print CA "$_|2||$liste\n";
+}
+close CA;
diff --git a/virsorter/wrapper_phage_contigs_sorter_iPlant.pl b/virsorter/wrapper_phage_contigs_sorter_iPlant.pl
new file mode 100755
index 0000000..9a141d4
--- /dev/null
+++ b/virsorter/wrapper_phage_contigs_sorter_iPlant.pl
@@ -0,0 +1,441 @@
+#!/usr/bin/env perl
+
+=head1 USAGE 
+
+  ./wrapper_phage_contigs_sorter_iPlant.pl -d Code_dataset --fna Fasta file of contigs --db 1 --wdir /path/to/working_directory
+  Database codes : 1 for RefseqABVir only, 2 for RefseqABVir + Viromes
+  Optional arguments : --cp /path/to/fasta_file (additional reference sequences, added to the database), --virome 1 (virome decontamination run), --data-dir /path (directory of the reference data, /data by default)
+  
+=cut
+
+use strict;
+use warnings;
+use autodie;
+use FindBin '$Bin';
+use File::Spec::Functions;
+use File::Path 'mkpath';
+use File::Which 'which';
+use Getopt::Long 'GetOptions';
+use Pod::Usage;
+use Cwd 'cwd';
+
+# Command-line options and their defaults.  $choice_database starts at 0 (not
+# '') so that the numeric validation below never compares an empty string.
+my $help              = '';
+my $code_dataset      = 'VIRSorter';
+my $original_fna_file = '';
+my $choice_database   = 0;
+my $tag_virome        = 0;
+my $custom_phage      = '';
+my $data_dir          = '/data';
+my $wdir              = cwd();
+
+GetOptions(
+   'fna=s'       => \$original_fna_file,
+   'd|dataset:s' => \$code_dataset,
+   'db:i'        => \$choice_database,
+   'virome:i'    => \$tag_virome,
+   'wdir:s'      => \$wdir,
+   'cp:s'        => \$custom_phage,
+   'data-dir:s'  => \$data_dir,
+   'h|help'      => \$help,
+) or pod2usage(2);    # unknown or malformed option: print the usage and stop
+
+pod2usage() if $help;
+
+unless ($original_fna_file) {
+    pod2usage('Missing FASTA file');
+}
+
+unless ($choice_database == 1 || $choice_database == 2) {
+    pod2usage('choice_database must be 1 or 2');
+}
+
+print join("\n",
+    "Dataset      : $code_dataset", 
+    "Fna file     : $original_fna_file", 
+    "Db           : $choice_database", 
+    "Wdir         : $wdir", 
+    "Custom phages: $custom_phage",
+    ''
+);
+
+#
+# The result tables, the r_N revision directories and the final clean-up are
+# all addressed relative to the current directory, so the run has to take
+# place inside $wdir.  Paths given by the user are made absolute first, so
+# that they still point to the same files once the directory has changed.
+# (A former check guessing whether --cp was in fact the name of the working
+# directory used to live here; it was disabled and did nothing useful.)
+#
+$original_fna_file = File::Spec->rel2abs($original_fna_file);
+$custom_phage      = File::Spec->rel2abs($custom_phage) if $custom_phage ne '';
+$data_dir          = File::Spec->rel2abs($data_dir);
+$wdir              = File::Spec->rel2abs($wdir);
+mkpath($wdir) unless -d $wdir;
+chdir $wdir;    # autodie stops the run when the directory cannot be entered
+
+if ($tag_virome) {
+	print "!!! THIS WILL BE A VIROME DECONTAMINATION RUN\n";
+}
+
+# Need 2 databases
+# PCs from Refseq (phages) or PCs from Refseq+Viromes
+# PFAM (read from the PFAM_27 directory, see $db_PFAM_a and $db_PFAM_b)
+
+my $n_cpus = 8;
+
+print "#%#%#%#%#%# Processing $code_dataset....\n";
+my $microbial_base_needed = 0;
+## replace this directory with the iPlant dir
+#my $wdir=$wdir."/".$code_dataset."/";
+
+my $path_to_mga        = which('mga_linux_ia64');
+my $path_hmmsearch     = which('hmmsearch');
+my $path_blastall      = which('blastall');
+my $path_to_formatdb   = which('formatdb');
+# which() returns undef when a program is not on the PATH.  The two programs
+# that this wrapper runs itself are checked here, so that a missing install
+# stops the run at once instead of producing commands with an empty name.
+die "Cannot find 'hmmsearch' in PATH\n" unless defined $path_hmmsearch;
+die "Cannot find 'blastall' in PATH\n"  unless defined $path_blastall;
+
+my $log_out            = catfile($wdir, 'log_out');
+my $log_err            = catfile($wdir, 'log_err');
+my $script_dir         = catdir($Bin, 'Scripts');
+my $dir_Phage_genes    = catdir($data_dir,'Phage_gene_catalog');
+my $ref_phage_clusters = catfile($data_dir,
+                         'Phage_gene_catalog/Phage_Clusters_current.tab');
+# my $readme_file        = catfile($script_dir,"VirSorter_Readme.txt");
+my $readme_file        = catfile($data_dir,"VirSorter_Readme.txt");
+
+if ( $tag_virome == 1 ){
+#    $readme_file = catfile($script_dir,"VirSorter_Readme_viromes.txt");
+    $readme_file = catfile($data_dir,"VirSorter_Readme_viromes.txt");
+}
+
+# my $generic_ref_file = catfile($script_dir,"Generic_ref_file.refs");
+my $generic_ref_file = catfile($data_dir,"Generic_ref_file.refs");
+
+if ( $choice_database == 2 ) {
+    $dir_Phage_genes    = catdir($data_dir, "Phage_gene_catalog_plus_viromes/");
+    $ref_phage_clusters = catfile($data_dir,
+        "Phage_gene_catalog_plus_viromes/Phage_Clusters_current.tab");
+}
+
+my $db_PFAM_a = catfile($data_dir, "PFAM_27/Pfam-A.hmm");
+my $db_PFAM_b = catfile($data_dir, "PFAM_27/Pfam-B.hmm");
+
+my $out = "";
+
+## SETTING UP THE WORKING DIRECTORY
+## The logs directory is created when missing; an existing one is left as it is.
+my $log_dir = catdir($wdir, 'logs');
+unless (-d $log_dir) {
+    mkpath($log_dir);
+}
+## The two lines below were meant for an existing logs directory.
+## Commented on iPlant, but can be useful when running VirSorter on a directory already processed 
+## (to avoid recomputing the gene prediction and comparison to PFAM especially)
+#    $out = `rm -r $log_dir/* *.csv`; 
+#    print "rm -r log* *.csv => $out\n";
+
+# Copy the fasta file in the wdir and run step 1 (both skipped when the fasta directory already exists)
+my $fastadir = catdir($wdir, "fasta");
+unless ( -d $fastadir ) {
+    mkpath($fastadir);
+    my $fna_file = catfile( $fastadir, "input_sequences.fna" );
+    open my $input_fh,  '<', $original_fna_file;
+    open my $output_fh, '>', $fna_file;
+    while ( my $line = <$input_fh> ) {
+        chomp($line);
+        if ( my ($header) = $line =~ /^>(.*)/ ) {
+            # Headers are prefixed with the dataset code, and every character
+            # of the set below is replaced by an underscore
+            $header =~ s/[\/\.,\|\s?!\*%]/_/g;
+            print $output_fh ">" . $code_dataset . "_" . $header . "\n";
+        }
+        else {
+            print $output_fh "$line\n";
+        }
+    }
+    close $input_fh;
+    close $output_fh;
+
+    # detect circular, predict genes on contigs and extract proteins, as well
+    # as filtering on size (nb genes) and/or circular
+    my $nb_gene_th = 2; # At least two complete genes on the contig
+    my $path_script_step_1 = catfile($script_dir,"Step_1_contigs_cleaning_and_gene_prediction.pl");
+    my $cmd_step_1 = join(' ', $path_script_step_1, $code_dataset, $fastadir, $fna_file, $nb_gene_th, ">> $log_out 2>> $log_err");
+    print "Step 0.5 : $cmd_step_1\n";
+    `echo $cmd_step_1 >> $log_out 2>> $log_err`;
+    $out = `$cmd_step_1`;
+}
+
+print "\t$out\n";
+my $fasta_contigs_nett = catfile( $fastadir, "${code_dataset}_nett_filtered.fasta" );
+my $fasta_file_prots   = catfile( $fastadir, "${code_dataset}_prots.fasta" );
+
+# Match against PFAM, once for all: hmmsearch against PFAM a, then PFAM b.
+# Each search is run only when its --tblout file does not exist yet.
+my $out_hmmsearch_pfama     = "Contigs_prots_vs_PFAMa.tab";
+my $out_hmmsearch_pfama_bis = "Contigs_prots_vs_PFAMa.out";
+my $cmd_hmm_pfama = "$path_hmmsearch --tblout $out_hmmsearch_pfama --cpu $n_cpus"
+  . " -o $out_hmmsearch_pfama_bis --noali $db_PFAM_a $fasta_file_prots >> $log_out 2>> $log_err";
+print "Step 0.8 : $cmd_hmm_pfama\n";
+
+`echo $cmd_hmm_pfama >> $log_out 2>> $log_err`;
+
+unless ( -e $out_hmmsearch_pfama ) {
+    $out = `$cmd_hmm_pfama`;
+    print "\t$out\n";
+}
+
+my $out_hmmsearch_pfamb     = "Contigs_prots_vs_PFAMb.tab";
+my $out_hmmsearch_pfamb_bis = "Contigs_prots_vs_PFAMb.out";
+my $cmd_hmm_pfamb = "$path_hmmsearch --tblout $out_hmmsearch_pfamb --cpu $n_cpus"
+  . " -o $out_hmmsearch_pfamb_bis --noali $db_PFAM_b $fasta_file_prots >> $log_out 2>> $log_err";
+print "Step 0.9 : $cmd_hmm_pfamb\n";
+`echo $cmd_hmm_pfamb >> $log_out 2>> $log_err`;
+
+if ( -e $out_hmmsearch_pfamb ) {
+    $out = "Already a results for PFAM B .. skipping (the great guru)\n";
+}
+else {
+    $out = `$cmd_hmm_pfamb`;
+    print "\t$out\n";
+}
+
+# Now work on the phage gene catalog
+
+# Files that will stay along the computations
+my $predict_file             = catfile( $fastadir, "${code_dataset}_mga_final.predict" );
+my $out_hmmsearch            = "Contigs_prots_vs_Phage_Gene_Catalog.tab";
+my $out_hmmsearch_bis        = "Contigs_prots_vs_Phage_Gene_Catalog.out";
+my $out_blast_unclustered    = "Contigs_prots_vs_Phage_Gene_unclustered.tab";
+my $out_file_affi            = "${code_dataset}_affi-contigs.csv";
+my $out_file_phage_fragments = "${code_dataset}_phage-signal.csv";
+my $global_out_file          = "${code_dataset}_global-phage-signal.csv";
+my $new_prots_to_cluster     = "${code_dataset}_new_prot_list.csv";
+
+# Constant scripts: command lines of steps 2, 3 and 4, all appending to the two log files
+my $script_merge_annot = catfile($script_dir,"Step_2_merge_contigs_annotation.pl");
+my $cmd_merge = "$script_merge_annot $predict_file $out_hmmsearch $out_blast_unclustered $out_hmmsearch_pfama"
+  . " $out_hmmsearch_pfamb $ref_phage_clusters $out_file_affi >> $log_out 2>> $log_err";
+# my $cmd_merge = "$script_merge_annot -m $predict_file -hmm_pc $out_hmmsearch -blast_pc $out_blast_unclustered -hmm_pfa $out_hmmsearch_pfama -hmm_pfb $out_hmmsearch_pfamb -ref_pc $ref_phage_clusters -out_f $out_file_affi >> $log_out 2>> $log_err";
+
+# The generic reference file is handed to step 3 only when $tag_virome is 1
+my $script_detect = catfile($script_dir,"Step_3_highlight_phage_signal.pl");
+my $cmd_detect = ( $tag_virome == 1 )
+  ? "$script_detect $out_file_affi $out_file_phage_fragments $generic_ref_file >> $log_out 2>> $log_err"
+  : "$script_detect $out_file_affi $out_file_phage_fragments >> $log_out 2>> $log_err";
+
+my $script_summary = catfile($script_dir,"Step_4_summarize_phage_signal.pl");
+my $cmd_summary = "$script_summary $out_file_affi $out_file_phage_fragments $global_out_file"
+  . " $new_prots_to_cluster >> $log_out 2>> $log_err";
+
+# Get the final result file ready (touch creates it when it does not exist yet)
+`touch $global_out_file`;
+my $r_n = -1;
+# Loop while there are new proteins to cluster, or when this is the first
+# revision; the $r_n<=10 test bounds the number of revisions
+while ( (-e $new_prots_to_cluster || $r_n == -1) && ($r_n<=10) ) {
+    $r_n++;    # New revision of the prediction
+    my $dir_revision = "r_" . $r_n;
+    print "### Revision $r_n\n";
+    if ( !-d $dir_revision ) {
+        ## create the directory holding the db of this revision
+        #print "mkdir $dir_revision >> $log_out 2>> $log_err\n";
+        #$out=`mkdir $dir_revision >> $log_out 2>> $log_err`;
+        mkpath($dir_revision);
+        print "Out : $out\n";
+        ## Clustering of the new prots with the unclustered
+        my $script_new_cluster = catfile($script_dir,"Step_0_make_new_clusters.pl");
+        # First revision, we just import the Refseq database
+        if ( $r_n == 0 ) {
+            #`mkdir $dir_revision/db`;
+            mkpath( catdir( $dir_revision, 'db' ) );
+
+            ## Adding custom sequences to the database if required by the user
+            if ( $custom_phage ne "" ) {
+                my $script_custom_phage = catfile($script_dir,"Step_first_add_custom_phage_sequence.pl");
+                $out =`$script_custom_phage $custom_phage $dir_Phage_genes/ $dir_revision/db >> $log_out 2>> $log_err`;
+                print "Adding custom phage to the database : $out\n";
+            }
+            # should replace Pool_cluster / Pool_unclustered and
+            # Pool_new_unclustered else , we just import the Refseq database
+            else { `cp $dir_Phage_genes/* $dir_revision/db/`; }
+        }
+        else {
+            my $previous_r = $r_n - 1;
+            my $previous_fasta_unclustered =
+              catfile( "r_" . $previous_r, "db", "Pool_unclustered.faa" );
+            my $cmd_new_clusters = join(' ',
+                "$script_new_cluster $dir_revision $fasta_file_prots",
+                "$previous_fasta_unclustered",
+                "$new_prots_to_cluster >> $log_out 2>> $log_err"
+            );
+
+            print "$cmd_new_clusters\n";
+            $out = `$cmd_new_clusters`;
+            print "Step 1.1 new clusters and new database : $out\n";
+            # Rm the list of prots to be clustered now that they should be
+            # clustered
+            $out = `rm $new_prots_to_cluster`;
+            print "rm $new_prots_to_cluster -> $out\n";
+        }
+
+        # Check if there are some data in these new clusters, or if all the new
+        # proteins are unclustered
+        my $new_db_profil = catfile( $dir_revision, "db", "Pool_clusters.hmm" );
+        my $check = 0;
+        open my $DB, '<', $new_db_profil;
+        while (<$DB>) {
+	    chomp($_);
+            if ( $_ =~ /^NAME/ ) { 
+		$check++; 
+		# print "there is a cluster $_ in the database, so we're good\n"; 
+	    }
+        }
+        close $DB;
+        if ( $check == 0 ) {
+            print "There is no clusters in the database, so we skip the hmmsearch\n";
+        }
+        else {
+            my $out_hmmsearch_new =
+              catfile( $dir_revision, "Contigs_prots_vs_New_clusters.tab" );
+            my $out_hmmsearch_bis_new =
+              catfile( $dir_revision, "Contigs_prots_vs_New_clusters.out" );
+            my $cmd_hmm_cluster = join(' ',
+                "$path_hmmsearch --tblout $out_hmmsearch_new --cpu $n_cpus",
+                "-o $out_hmmsearch_bis_new --noali $new_db_profil",
+                "$fasta_file_prots >> $log_out 2>> $log_err"
+            );
+
+            print "Step 1.2 : $cmd_hmm_cluster\n";
+
+            `echo $cmd_hmm_cluster >> $log_out 2>> $log_err`;
+
+            $out = `$cmd_hmm_cluster`;
+            print "\t$out\n";
+
+            $out = `cat $out_hmmsearch_new >> $out_hmmsearch`;
+            print "\t$out\n";
+        }
+
+        my $out_blast_new_unclustered =
+          catfile( $dir_revision, "Contigs_prots_vs_New_unclustered.tab" );
+        my $blastable_unclustered =
+          catfile( $dir_revision, 'db', 'Pool_new_unclustered' );
+        my $cmd_blast_unclustered = join(' ',
+            "$path_blastall -p blastp -i $fasta_file_prots -d",
+            "$blastable_unclustered -o $out_blast_new_unclustered -a $n_cpus", 
+            "-m 8 -e 0.001 >> $log_out 2>> $log_err"
+        );
+
+        print "\nStep 1.3 : $cmd_blast_unclustered\n";
+        `echo $cmd_blast_unclustered >> $log_out 2>> $log_err`;
+        $out = `$cmd_blast_unclustered`;
+        print "\t$out\n";
+        $out = `cat $out_blast_new_unclustered >> $out_blast_unclustered`;
+        print "\t$out\n";
+        ## Make backup of the previous files to have trace of the different steps
+        my $backup_affi = catfile( $dir_revision, "affi_backup.csv" );
+        my $backup_phage_signal =
+          catfile( $dir_revision, "phage_signal_backup.csv" );
+        my $backup_global_signal =
+          catfile( $dir_revision, "global_signal_backup.csv" );
+        if ( -e $out_file_affi ) { `cp $out_file_affi $backup_affi`; }
+        if ( -e $out_file_phage_fragments ) {
+            `cp $out_file_phage_fragments $backup_phage_signal`;
+        }
+        if ( -e $global_out_file ) {
+            `cp $global_out_file $backup_global_signal`;
+        }
+    }
+
+    ## Complete the affi
+    print "Step 2 : $cmd_merge\n";
+    `echo $cmd_merge >> $log_out 2>> $log_err`;
+    $out = `$cmd_merge`; 
+    ## This generate a csv table including the map of each contig, with PFAM
+    #and Viral PCs annotations, as well as strand and length of genes
+
+    print "\t$out\n";
+    ## Complete the summary
+    print "Step 3 : $cmd_detect\n";
+    `echo $cmd_detect >> $log_out 2>> $log_err`;
+    $out = `$cmd_detect`;
+    print "\t$out\n";
+
+    # Decide which contigs are entirely viral and which are prophages, and
+    # which of both of these categories are phage enough to be added to the
+    # databases
+    print "Setting up the final result file\n";
+    print "Step 4 : $cmd_summary\n";
+    `echo $cmd_summary >> $log_out 2>> $log_err`;
+    $out = `$cmd_summary`;
+    print "\t$out\n";
+}
+
+# Last step -> extract all sequences as fasta files and gb
+my $script_generate_output = catfile($script_dir,"Step_5_get_phage_fasta-gb.pl");
+my $cmd_step_5 = "$script_generate_output $code_dataset >> $log_out 2>> $log_err";
+print "\nStep 5 : $cmd_step_5\n";
+
+`echo $cmd_step_5 >> $log_out 2>> $log_err`;
+
+$out = `$cmd_step_5`;
+print "\t$out\n";
+
+
+# Plus clean the output directory
+print "Cleaning the output directory\n";
+# We rm the first db to not overload user disk space
+my $db_revision_0="r_0/db";
+$out=`rm -r $db_revision_0`;
+print "rm -r $db_revision_0 : $out\n";
+`mv $fastadir Fasta_files/`; # $fastadir is where the fasta directory was actually created
+# We put all results from Hmmsearch and BLAST files in a separate directory
+my $store_database_comparison="Tab_files";
+mkpath($store_database_comparison);
+`mv $out_hmmsearch $store_database_comparison/`;
+# `mv $out_hmmsearch_bis $store_database_comparison/`;
+`mv $out_blast_unclustered $store_database_comparison/`;
+`mv $out_hmmsearch_pfama $store_database_comparison/`;
+`mv $out_hmmsearch_pfama_bis $store_database_comparison/`;
+`mv $out_hmmsearch_pfamb $store_database_comparison/`;
+`mv $out_hmmsearch_pfamb_bis $store_database_comparison/`;
+`mv error.log $log_dir`    if -e 'error.log';    # moved only when present
+`mv formatdb.log $log_dir` if -e 'formatdb.log';
+my $final_error_log=catfile($log_dir,'Virsorter_stderr_log');
+`mv $log_err $final_error_log`; # the log is moved from the path it was written to
+# Then we clean error log to remove the ugly (and unnecessary) warning from BioPerl - Not needed anymore, we (i.e. Ken) figured out what was causing the warning (seq object had no id)
+# my $cmd_sed="sed -i '/Use of uninitialized value in concatenation (.) or string at \\/usr\\/local\\/lib\\/perl5\\/site_perl\\/5.22.0\\/Bio\\/SeqUtils.pm line 375.\$\/d' $final_error_log";
+# print "$cmd_sed\n";
+# `$cmd_sed`;
+my $final_out_log=catfile($log_dir,'Virsorter_stdout_log');
+`mv $log_out $final_out_log`;
+# We put all the files linked to the metric computation in a new directory
+my $store_metric_files="Metric_files";
+mkpath($store_metric_files);
+`mv $out_file_affi $store_metric_files/VIRSorter_affi-contigs.tab`;
+my $out_file_affi_ref  = $code_dataset . "_affi-contigs.refs";
+`mv $out_file_affi_ref $store_metric_files/`;
+`mv $out_file_phage_fragments $store_metric_files/VIRSorter_phage_signal.tab`;
+if (-e $new_prots_to_cluster){`mv $new_prots_to_cluster $store_metric_files/`;}
+# And we customize and add the readme file in the output directory: the parameters of this run come first, the generic readme text is appended after them
+my $datestring=localtime();
+my $local_readme_file="Readme.txt";
+open my $s1,'>',$local_readme_file;
+print $s1 "VirSorter parameters used :\n\n";
+print $s1 "--> Fasta file mined for viral sequences : $original_fna_file\n";
+print $s1 "--> Viral database used : ";
+if ($choice_database==2){print $s1 "Viromes : all bacterial and archaeal virus genomes in Refseq, as of January 2014, plus non-redundant predicted genes from viral metagenomes (including seawater, freshwater, and human-related samples)\n"}
+else{print $s1 "RefseqABVir (all bacterial and archaeal virus genomes in Refseq, as of January 2014)\n";}
+if ($custom_phage eq ""){print $s1 "--> No custom reference sequence was added to the database\n";}
+else{print $s1 "--> Custom reference sequences from fasta file $custom_phage were added to the database\n";}
+if ($tag_virome==1){print $s1 "VirSorter was run in the 'Virome Decontamination' mode: overall metrics for microbial sequences were not evaluated from the complete dataset, but instead pre-computed values based on bacterial and archaeal genomes from Refseq were used."}
+print $s1 "\nThis VirSorter computation finished on $datestring\n";
+close $s1;
+`cat $readme_file >> $local_readme_file`;