cafe

#!/usr/bin/env perl 

use File::Copy;
use Bio::SeqIO;
use List::MoreUtils qw(uniq);
use List::Util 'max';
#Defaults
# Verbose, expert and visual modes start out disabled; the option callbacks
# verb(), expert() and visual() switch them to 1.
$verb=0;
$expert=0;
$visual=0;
# Parse the command line first. Several option callbacks (Thres, gbk, genus)
# take their own values off @ARGV while options are being processed.
getOptions();
#my(@Options, $annotation, $outfile);

#If output filename is provided
# -out is a plain flag; the output prefix is the next positional argument.
if (defined $outfile) {
	$name = $output = shift(@ARGV);
}

#If input file in Genbank format
if (defined($gbk)) {
	# gbk() has already parsed the GenBank file: it stored the sequence in
	# $sequence and wrote the _CAFE.fna/.ptt/.faa files named after $infile.
	$dna=$sequence;
	$name=$infile if(!defined$outfile);
	# The FASTA copy is always named after the input file, even when -out
	# supplies a different output prefix.
	$in_filename="$infile\_CAFE.fna";
}

#If input file in Fasta format
else {

	if (scalar(@ARGV)<1){
		print "Sequence file not provided\n";
		exit;
	}

	$f = shift @ARGV;
	$in_filename=$f;
	$name=$f if(!defined$outfile);

	#Check if sequence file exists
	unless (-e $f) {
		print "Sequence file does not exist\n";
		exit;
	}
	# Marks that a sequence has been loaded; gbk() exits when it sees this flag.
	$nce=1;

	open(F, '<', $f) or die "Cannot open sequence file $f: $!\n";

	#Check if sequence file is in fasta format (an empty file is rejected too)
	$line= <F>;
	if (!defined($line) || $line!~/^>/){
		print "Incorrect sequence file. Please provide sequence file in fasta format\n";
		exit;
	}

	#Get sequence from file; all records are concatenated into $dna
	while($line= <F>){
		next if ($line =~ /^>/);
		# Strip CR as well as LF so DOS line endings are not later treated
		# as ambiguous bases.
		$line =~ s/[\r\n]+//g;
		$dna .=uc($line);
	}
	close(F);

}


# Replace every character other than A, T, C or G with a randomly chosen
# nucleotide, write the cleaned sequence to "<name>_CAFE_1" and report its length.
@nucs= ('A','T','C','G');
$m = defined($dna) ? $dna : '';
# A single substitution pass avoids building a one-element-per-base array.
$m =~ s/[^ATCG]/$nucs[rand @nucs]/ge;
open (OUT, '>', "$name\_CAFE_1") or die "Cannot write $name\_CAFE_1: $!\n";
print OUT "$m";
# Close now so the file is complete before the segmentation program reads it.
close (OUT) or die "Cannot close $name\_CAFE_1: $!\n";
$lenm=length($m);
print "Genome loaded\nLength of genome is $lenm bp\n" if ($verb==1);

#If annotation file is provided

if (!defined$annotation){
	# -annot was not requested, so gene coordinates come from a ptt table:
	# a user-supplied file (Fasta input) or the one written by gbk().

	#If genbank file is not provided, check if annotation file exists
	if (!defined$gbk){

		if (scalar@ARGV < 1){
			print "Annotation file does not exist\n";
			exit;
		}

		$f1 = shift @ARGV;

		#Check if annotation file exists
		unless (-e $f1) {
			print "Annotation file does not exist\n";
			exit;
		}
	}
	else {
		$f1 = "$infile\_CAFE.ptt";
	}

	open(F1, '<', $f1) or die "Cannot open annotation file $f1: $!\n";
	@D=<F1>;
	close(F1);

	#Check if annotation file is in ptt format: the third line must be the
	#column header with Location, Strand and Product in columns 1, 2 and 9.
	@identifiers=split(/\t/, defined($D[2]) ? $D[2] : '');

	if (!defined($identifiers[0]) || !defined($identifiers[1]) || !defined($identifiers[8])
		|| $identifiers[0]!~/Location/ || $identifiers[1]!~/Strand/ || $identifiers[8]!~/Product/){
		print "Incorrect annotation file. Please provide annotation file in ptt format\n";
		exit;
	}

	#Get annotations from ptt file; gene rows start on the fourth line.
	$gene_counter=0;
	for($i=3;$i<scalar(@D);$i++){
		$D=$D[$i];
		$D=~s/\r?\n\z//;
		# Blank lines carry no gene.
		next if ($D!~/\S/);
		# Limit -1 keeps trailing empty columns so the product stays in column 9.
		@C=split(/\t/,$D,-1);
		# The Location column has the form "start..end".
		($start,$end)=split(/\.\./,$C[0]);
		$gene_start[$gene_counter]=$start;
		$gene_end[$gene_counter]=$end;
		$gene_func[$gene_counter]=$C[8];
		$gene_counter+=1;
	}

}

#If annotation file is not provided

else{
	#Run Prodigal for predicting genes
	print "Predicting genes using prodigal...\n" if ($verb==1);
	# List-form system() bypasses the shell, so file names are passed verbatim.
	system('prodigal', '-a', "$name\_CAFE.faa", '-f', 'sco', '-i', $in_filename,
		'-o', "$name\_CAFE_sco_file.txt", '-q') == 0
		or die "prodigal failed (status $?)\n";

	# Parse the sco file: lines containing '#' are skipped, the remaining
	# lines are split on '_' into index, start, end and strand.
	$sco="$name\_CAFE_sco_file.txt";
	open (S, '<', $sco) or die "Cannot open $sco: $!\n";
	@start=(); @end=(); @strand=();
	%hash=();
	while (<S>){
		chomp $_;
		next if $_=~/#/;
		next if $_!~/\S/;
		@coord=split('_', $_);
		push(@start,$coord[1]);
		push(@end,$coord[2]);
		push(@strand,$coord[3]);
	}
	close(S);
	my $gene_total=scalar(@start);
	print "Identified $gene_total genes\n" if ($verb==1);
	print "Scanning genome for presence of genomic island specific marker genes...\n" if ($verb==1);

	#Run hmmer for annnotating genes
	system('hmmscan', '-o', "$name\_CAFE_stdout_out.txt", '--tblout', "$name\_CAFE_table.txt",
		'cafe_database_table', "$name\_CAFE.faa") == 0
		or die "hmmscan failed (status $?)\n";
	$f="$name\_CAFE_table.txt";
	open(F, '<', $f) or die "Cannot open $f: $!\n";

	# Read the hmmscan table. Only the first row of each run of rows with the
	# same query is used, and only when its E-value is below 0.01.
	my $prev_query;
	while (<F>){
		chomp $_;
		next if $_=~/#/;
		@line=split(/\s+/,$_);
		my ($query,$evalue)=@line[2,4];
		# Compare with the previous row only; the very first row has no
		# predecessor and must never be skipped.
		next if (defined($prev_query) && $query eq $prev_query);
		$prev_query=$query;
		next unless ($evalue<0.01);
		# The query name is split on '_': first part is kept as $genome,
		# second part is the gene number used as key of %hash.
		@idd=split('_',$query);
		$genome=$idd[0];
		$hash{$idd[1]}=join(' ', @line[18..$#line]) if (defined $idd[1]);
	}
	close(F);

	#Make ptt file for annotated genes and fill the gene tables used downstream
	open (OUT, '>', "$name\_CAFE.ptt") or die "Cannot write $name\_CAFE.ptt: $!\n";
	print OUT "$genome\n";
	print OUT "$gene_total proteins\n";
	print OUT "Location\tStrand\tLength\tPID\tGene\tSynonym\tCode\tCOG\tProduct\n";

	$marker_counter=0;
	$gene_counter=0;
	for ($i=0;$i<$gene_total;$i++){
		$num=$i+1;
		$length=int(($end[$i]-$start[$i])/3);

		if (exists($hash{$num})){
			# The word "transposase" is appended to the product in the file only.
			$fn=$hash{$num};
			print OUT "$start[$i]..$end[$i]\t$strand[$i]\t$length\t-\t-\t-\t-\t-\t$fn transposase\n";
			$marker_counter+=1;
		}
		else {
			$fn="-";
			print OUT "$start[$i]..$end[$i]\t$strand[$i]\t$length\t-\t-\t-\t-\t-\t-\n";
		}

		$gene_start[$gene_counter]=$start[$i];
		$gene_end[$gene_counter]=$end[$i];
		$gene_func[$gene_counter]=$fn;
		$gene_counter+=1;
	}
	close(OUT);
	print "Identified $marker_counter genomic island marker genes\n" if ($verb==1);

}

#If phylo module is used
if ($phylo==1){
	print "Checking phylogenetic distribution of genes\n" if ($verb==1);
	shift(@ARGV);
	# The comparison database is the concatenation of every file in ./faa.
	system ("cat ./faa/* > faa_database");
	@faa_files=<./faa/*>;
	$expect=scalar (@faa_files);
	if ($expect < 5){
		print "Phylo module requires atleast 5 genomes for comparison\n";
		exit;
	}
	if(defined$annotation){$infile=$name;}
	##make database
	system ("makeblastdb -dbtype prot -in faa_database");#faa_database
	# Exactly five columns are requested because the parser below reads the
	# title from column 3, coverage from column 4 and identity from column 5.
	system ("blastp -db faa_database -query $infile\_CAFE.faa -outfmt \"6 qseqid sseqid salltitles qcovs pident\" -out $name\_blast_output -num_threads 4");
	$blastf="$name\_blast_output";
	open(BL, '<', $blastf) or die "Cannot open BLAST output $blastf: $!\n";
	open (BLOUT, '>', "$name\_phy") or die "Cannot write $name\_phy: $!\n";

	# Load the tabular hits row by row.
	$bf=0;
	while (<BL>){
		chomp;
		@BFC=split(/\t/, $_);
		$queryacc[$bf]=$BFC[0];
		$blsub[$bf]=$BFC[2];
		$coverage[$bf]=$BFC[3];
		$identities[$bf]=$BFC[4];
		$bf++;
	}
	close(BL);

	# For every query, $phash{query} is its number of hit rows and
	# $phash_cum{query} the row index just past its last hit, so its rows
	# are $phash_cum-$phash .. $phash_cum-1.
	$phprev="";
	$phcount=0;
	$phcum_count=0;
	foreach(@queryacc) {
		if($phprev ne $_) {
			if($phcount) {
				$phash{$phprev}=$phcount;
				$phash_cum{$phprev}=$phcum_count;
			}
			$phprev=$_;
			$phcount=0;
		}
		$phcount++;
		$phcum_count++;
	}
	# The final run has no following query to trigger the bookkeeping above,
	# so record its own counts here.
	if($phcount) {
		$phash{$phprev}=$phcount;
		$phash_cum{$phprev}=$phcum_count;
	}

	# The organism name is the bracketed part of the title; its first word is
	# taken as genus and everything from the third word on as strain.
	for ($i=0; $i<$bf; $i++){
		if (defined($blsub[$i]) && $blsub[$i]=~m/\[(.*)\]/){
			$blastname[$i]=$1;
			@blast=split(/\s+/,$blastname[$i]);
		}
		else {
			# No organism in this title: do not reuse the previous row's words.
			$blastname[$i]='';
			@blast=();
		}
		$blast_genus[$i]=$blast[0];
		$blast_strain[$i]=join('', @blast[2..$#blast]);
	}

	@uniq_queryacc=uniq(@queryacc);

	$phv=0;

	for ($k=0; $k<scalar(@uniq_queryacc); $k++){
		$cur_query=$uniq_queryacc[$k];
		$next_query=$uniq_queryacc[$k+1];

		# Collect the strains of all sufficiently similar hits within the query's genus.
		@all_strains=();
		$start=$phash_cum{$cur_query}-$phash{$cur_query};
		$end=$phash_cum{$cur_query};
		for ($i=$start;$i<$end;$i++){
			if ($blast_genus[$i]=~m/$input_genus/i){
				if ($coverage[$i]>=70&&$identities[$i]>=70){ #if identities and coverage are greater than 70
					push(@all_strains,$blast_strain[$i]);
				}
			}
		}
		$actual_count=scalar(uniq(@all_strains)); #Do not count multiple times if query matches multiple genes in same genome/strain

		# $expect is at least 5 here, so the division is safe.
		$pc=$actual_count/$expect;
		# 0 = typical phyletic spread (present in at least half of the genomes),
		# 1 = atypical phyletic pattern.
		$value=($pc>=0.5) ? 0 : 1;
		$phyvalue[$phv++]=$value;
		print BLOUT "$cur_query\t$expect\t$actual_count\t$phash{$cur_query}\t$value\n";

		# Query ids missing between this query and the next had no hit at
		# all; they are recorded as atypical.
		# NOTE(review): this arithmetic assumes numeric query ids - confirm for
		# Prodigal-derived protein names.
		if ($next_query-$cur_query>1){
			$diff=$next_query-$cur_query;
			for ($j=0; $j<($diff-1);$j++){
				$cur_query+=1;
				$phyvalue[$phv++]=1;
				print BLOUT "$cur_query\t$expect\t$actual_count\t0\t1\n";
			}
		}
	}
	close(BLOUT);

}

#Run segmentation and clustering algorithm using compiled c program.

#If user has defined thresholds

# Use the user's three thresholds when all were supplied, otherwise the defaults.
my @seg_clus_thresholds = (defined ($seg) && defined ($clus1) && defined ($clus2))
	? ($seg, $clus1, $clus2)
	: ('0.8', '0.99999', '0.999');
system("./cafe.out $name\_CAFE_1 @seg_clus_thresholds $verb");

# Locate the cafe_temp file that sits next to this script and make sure a
# copy is present in the working directory.
# The modules are loaded here and called by their full names because the file
# does not import abs_path or dirname.
require Cwd;
require File::Basename;
require File::Copy;
$cafetemp = File::Basename::dirname(Cwd::abs_path($0))."/cafe_temp";

# $cpcafetemp is 1 when cafe_temp was absent from the working directory and
# had to be copied in, 0 when it was already there.
$cpcafetemp = ((-e 'cafe_temp') || (-l 'cafe_temp')) ? 0 : 1;

if ($cpcafetemp == 1){
	File::Copy::copy($cafetemp, 'cafe_temp')
		or warn "Could not copy $cafetemp to cafe_temp: $!\n";
}


#Parse segment and clustering output file. Combine contiguous segments with same cluster id.

$sc_out='cafe_temp'; #seg-clus output file
open (SCOUT, '<', $sc_out) or die "Cannot open segmentation output $sc_out: $!\n";

# Each non-blank line is read as: segment start, segment end, cluster id.
@sc_arr1=(); @sc_arr2=(); @sc_arr3=();
while (my $sc_row=<SCOUT>){
	next if ($sc_row!~/\S/);
	@SC_C=split(' ',$sc_row);
	push(@sc_arr1,$SC_C[0]);
	push(@sc_arr2,$SC_C[1]);
	push(@sc_arr3,$SC_C[2]);
}
close(SCOUT);
$scc=scalar(@sc_arr3);

# When no segment carries cluster id 1, shift all ids so that the smallest
# id becomes 1.
if ($scc>0 && !grep { $_==1 } @sc_arr3){
	$min=List::Util::min(@sc_arr3);
	@sc_arr3=map { $_-$min+1 } @sc_arr3;
}

# Combine contiguous segments with the same cluster id into @cond_arr1
# (start), @cond_arr2 (end) and @cond_arr3 (cluster id); $k counts them.
$k=0;

for ($i=0; $i<$scc; $i++){
	if ($i<$scc-1 && $sc_arr3[$i]==$sc_arr3[$i+1]){
		# Same cluster continues: carry the start forward to the next segment.
		$sc_arr1[$i+1]=$sc_arr1[$i];
		next;
	}
	$cond_arr1[$k]=$sc_arr1[$i];
	$cond_arr2[$k]=$sc_arr2[$i];
	$cond_arr3[$k]=$sc_arr3[$i];
	$k++;
}
		
#Calculate size of each cluster as a percentage of the length of genome
#open(F3,">$name\_CAFE_clustersize");

$length=length($m);
die "Genome sequence is empty; cluster sizes cannot be computed\n" if ($length==0);

# Highest cluster id among the condensed segments.
$max=0;
for($i=0;$i<$k;$i++){
	if($cond_arr3[$i]>$max){
		$max=$cond_arr3[$i];
	}
}

# Total number of bases per cluster id (ids are expected to be integers >= 1).
@size=();
for($i=0;$i<$k;$i++){
	$j=$cond_arr3[$i];
	next if ($j<1 || $j!=int($j));
	$size[$j]+=$cond_arr2[$i]-$cond_arr1[$i]+1;
}

# $percent[$j] is the share of the genome covered by cluster $j;
# index 0 is a placeholder.
@percent=(0);
for($j=1;$j<($max+1);$j++){
	$percent[$j]=((($size[$j] || 0)/$length)*100);
}

#Identify two largest clusters by size.
# Cluster ids are ranked directly, so two clusters of identical size still
# yield two different ids (ties go to the lower id).
my @clusters_by_size=sort { $percent[$b] <=> $percent[$a] || $a <=> $b } (1..$max);
$lc1=$clusters_by_size[0];
$lc2=$clusters_by_size[1];
$large_cl1=defined($lc1) ? $percent[$lc1] : 0;
$large_cl2=defined($lc2) ? $percent[$lc2] : 0;

#print F3 "the two largest clusters are: $lc1 and $lc2\n";

#Assign genes to clusters identified by segmentation and clustering algorithm.

open (PYOUT, ">$name\_CAFE_extract_out");
$gex_counter=0;

# For every condensed segment, record each gene that lies inside it or
# overlaps one of its two ends.
for ($i=0;$i<$k;$i++){
	my ($seg_lo,$seg_hi,$seg_clu)=($cond_arr1[$i],$cond_arr2[$i],$cond_arr3[$i]);

	for (my $g=0;$g<$gene_counter;$g++){
		my ($g_lo,$g_hi)=($gene_start[$g],$gene_end[$g]);

		my $inside     = ($seg_lo<=$g_lo && $seg_lo<=$g_hi && $g_lo<=$seg_hi && $g_hi<=$seg_hi);
		my $over_start = ($seg_lo>=$g_lo && $g_hi>=$seg_lo && $seg_hi>=$g_hi);
		my $over_end   = ($seg_lo<=$g_lo && $g_lo<=$seg_hi && $seg_hi<=$g_hi);
		next unless ($inside || $over_start || $over_end);

		print PYOUT "$seg_clu\t$seg_lo\t$seg_hi\t$gene_func[$g]\t$g_lo\t$g_hi\t$phyvalue[$g]\n";
		$sc_clus[$gex_counter]=$seg_clu;
		$sc_start[$gex_counter]=$seg_lo;
		$sc_end[$gex_counter]=$seg_hi;
		$g_func[$gex_counter]=$gene_func[$g];
		$g_start[$gex_counter]=$g_lo;
		$g_end[$gex_counter]=$g_hi;
		$phylovalue[$gex_counter]=$phyvalue[$g];
		$gex_counter+=1;
	}
}

# Reduce every gene description to letters, digits and spaces so that it can
# be split into plain words.
for ($i=0;$i<$gex_counter;$i++){
	$g_func[$i]=~s/[^a-zA-Z0-9]/ /g;
}

# Keywords that mark a gene as a genomic island marker.
@Dlib=qw(transposase transposon integrase integration phage prophage bacteriophage mobile mobility insertion recombinase plasmid);

# Count all description words that equal a keyword (case-insensitive).
# A hash lookup replaces matching every keyword against every word.
my %is_marker_keyword=map { lc($_) => 1 } @Dlib;
$total_count=0;
for ($i=0;$i<$gex_counter;$i++){
	foreach my $word (split(/ /,$g_func[$i])){
		$total_count+=1 if ($is_marker_keyword{lc($word)});
	}
}
##print "Cluster_id\t" if ($verb==1);
# Count, for every cluster id 1..$max, how many words in the descriptions of
# its genes equal a marker keyword (case-insensitive). One pass over the
# genes with a hash lookup replaces the cluster x keyword x gene scan.
my %marker_keyword=map { lc($_) => 1 } @Dlib;

@total_words_per_cluster=();
for($j=1;$j<$max+1;$j++){#for all cluster ids
	$total_words_per_cluster[$j]=0;
}

# $total_word is the number of keyword hits summed over all clusters.
$total_word=0;
for($i=0;$i<$gex_counter;$i++){ #for all genes
	$j=$sc_clus[$i];
	next unless (defined($j) && $j>=1 && $j<=$max && $j==int($j));
	foreach my $word (split(/ /,$g_func[$i])){ #for each word in line
		if ($marker_keyword{lc($word)}){
			$total_words_per_cluster[$j]+=1;
			$total_word+=1;
		}
	}
}


# Marker enrichment per cluster: the cluster's share of all marker keyword
# hits divided by its share of the genome. One line per cluster is written
# to the enrichment file.
$j=1;

open(FOUT1, '>', "$name\_CAFE_enrichment.txt") or die "Cannot write $name\_CAFE_enrichment.txt: $!\n";

# Avoid dividing by zero when no marker keyword was found at all.
if($total_word==0){$total_word=1;}

# Drop the placeholder at index 0; from here on $percent[$n] belongs to cluster $n+1.
shift(@percent);
$e_counter=0;
foreach$D2(@percent){
	$percentage=($total_words_per_cluster[$j]/$total_word)*100;
	# A cluster id without any segment covers 0% of the genome; report an
	# enrichment of 0 for it instead of dividing by zero.
	$enrichment=($D2>0) ? $percentage/$D2 : 0;
	print FOUT1 "$j\t$D2\t$enrichment\n";
	push (@fout, $D2);

	# @enrich is indexed from 0: $enrich[$n] belongs to cluster $n+1.
	$enrich[$e_counter]=$enrichment;
	$e_counter+=1;
	$j+=1;
}
close(FOUT1);

#Merge two largest clusters if they are not enriched in marker genes

# $j is the largest cluster (treated as native), $k the second largest;
# $max and $max1 are their sizes in percent of the genome.
$j=$lc1;
$k=$lc2;
$max=$large_cl1;
$max1=$large_cl2;

# The second largest cluster is folded into the largest one when it covers
# at least 20% of the genome and its marker enrichment is at most 1.5.
# @enrich is indexed from 0, so the enrichment of cluster $k is $enrich[$k-1].
if($max1>=20 && defined($k) && $enrich[$k-1]<=1.5){
	for($i=0;$i<scalar(@cond_arr1);$i++){
		if ($cond_arr3[$i]==$k){
			$cond_arr3[$i]=$j;
		}
	}
}
else{
	print "Native cluster is cluster $j\n" if ($verb==1);
}

open (COUT, ">$name\_CAFE_seg_clus_out.txt");

#Assign labels to clusters (only required for CAFE_full version)
# The original cluster id is kept in @oriclusid; @cond_arr3 is overwritten
# with the code 343434 (native, i.e. cluster $j) or 121212 (alien).
$i=0;
while ($i<scalar(@cond_arr1)){
	$oriclusid[$i]=$cond_arr3[$i];
	my $is_native=($cond_arr3[$i]==$j);
	my $label=$is_native ? 'Native' : 'Alien';
	print COUT "$cond_arr1[$i]\t$cond_arr2[$i]\t$oriclusid[$i]\t$label\n";
	$cond_arr3[$i]=$is_native ? '343434' : '121212';
	$i++;
}
=start
#Get CAFE full version GIs
$out_file= "$name\_CAFE_full_version.txt";
open (OUTFILE, ">$out_file");
$is_counter=0;
$gic=0;
$cff=0;
for($i=0;$i<scalar(@cond_arr1);$i++){

	if($cond_arr3[$i]==$cond_arr3[$i+1]){
		$cond_arr1[$i+1]=$cond_arr1[$i];
	}

	else{
		if($cond_arr3[$i]==121212){
			if($cond_arr2[$i]-$cond_arr1[$i]>7999){
				$is_counter+=1;
				if ($is_counter==1){
					print OUTFILE "Genomic_island	Start	End	Length\n";
				}
				#$cfgi_start[$gic]=$cond_arr1[$i];
				#$cfgi_end[$gic]=$cond_arr2[$i];
				$gic+=1;
				$lenn=$cond_arr2[$i]-$cond_arr1[$i]+1;
				$cafe_f_gi_start[$cff]=$cond_arr1[$i];$cafe_f_gi_end[$cff]=$cond_arr2[$i];$cff+=1; 	
				print OUTFILE "GI-$is_counter\t$cond_arr1[$i]\t$cond_arr2[$i]\t$lenn\n";
			}
		}

	}
}


if ($is_counter==0){
	print OUTFILE "No Genomic islands detected\n";
}
=cut
#######################################################################################################################################################
##Identify aberrant clusters

# Largest genome share among the clusters.
$max=0;
foreach my $share (@fout){
	$max=$share if ($share>$max);
}

$clusno=max(@oriclusid); #total number of clusters

#initialize to 0
foreach my $cid (1..$clusno){
	$count_gene[$cid]=0;
	$count_aberrant[$cid]=0;
}

# Per cluster: number of assigned genes and how many of them have an
# atypical phyletic pattern (phylo value 1).
for ($i=0; $i<scalar(@phylovalue); $i++){
	foreach my $cid (1..$clusno){
		next unless ($sc_clus[$i]==$cid);
		$count_gene[$cid]+=1;
		$count_aberrant[$cid]+=1 if ($phylovalue[$i]==1);
	}
}

@abb='';

#Determine if a cluster is composed of genes with aberrant phyletic pattern
foreach my $cid (1..$clusno){
	# A cluster without genes gets the ratio 1/1 and is therefore flagged.
	if ($count_gene[$cid]==0){
		($count_aberrant[$cid],$count_gene[$cid])=(1,1);
	}
	$abb[$cid]=(($count_aberrant[$cid]/$count_gene[$cid])>=0.99) ? 1 : 0;
}

#Determine native and alien clusters
# Final label per segment: 343434 = native, 121212 = alien.
for ($i=0; $i<scalar(@cond_arr3); $i++){
	$val=$oriclusid[$i];

	# The largest cluster is always native.
	if ($val==$lc1){
		$cond_arr3[$i]=343434;
		next;
	}

	# Any other cluster is alien when it is enriched in marker genes or when
	# it consists of genes with an aberrant phyletic pattern.
	my $enriched=!($enrich[$val-1]<1.5);
	if ($enriched || $abb[$val]==1){
		$cond_arr3[$i]=121212;
	}
	else{
		$cond_arr3[$i]=343434;
	}
}

# Merge contiguous segments that carry the same native/alien label into
# @nca1 (start), @nca2 (end), @nca3 (label) and @nca4 (original cluster id).
$ncondcounter=0;

my $last_segment=scalar(@cond_arr3)-1;
for ($i=0; $i<=$last_segment; $i++){
	if ($i<$last_segment && $cond_arr3[$i]==$cond_arr3[$i+1]){
		# Same label continues: carry the start forward to the next segment.
		$cond_arr1[$i+1]=$cond_arr1[$i];
		next;
	}
	($nca1[$ncondcounter],$nca2[$ncondcounter],$nca3[$ncondcounter],$nca4[$ncondcounter])
		=($cond_arr1[$i],$cond_arr2[$i],$cond_arr3[$i],$oriclusid[$i]);
	$ncondcounter++;
}

open (OUTFILE1, ">$name\_CAFE.txt");
$nul=0;
$cffm=0;
#Identify islands from alien clusters
# An island is an alien region whose end minus start is at least 8000.
foreach my $n (0..$ncondcounter-1){
	next unless ($nca3[$n]==121212);
	next unless ($nca2[$n]-$nca1[$n]>=8000);

	$nul+=1;
	# The header is written together with the first island.
	print OUTFILE1 "Genomic_island_id	Start	End	Length\n" if ($nul==1);
	$length=$nca2[$n]-$nca1[$n]+1;
	$cafe_gi_start[$cffm]=$nca1[$n];
	$cafe_gi_end[$cffm]=$nca2[$n];
	$cffm+=1;
	print OUTFILE1 "GI-$nul\t$nca1[$n]\t$nca2[$n]\t$length\n";
}

print "No Genomic islands detected\n" if($nul==0);

#######################################################################################################################################################

#For generating genomic island map using CGview
if ($visual==1){
	print "Preparing files for making visualizing genomic islands\n" if ($verb==1);

	# Feature tables for CGView: one row per island.
	open (VOUT, ">$name\_CAFE_feature_table");
	open (VOUTF, ">$name\_CAFE_full_feature_table");
	foreach my $table_fh (\*VOUT, \*VOUTF){
		print $table_fh "seqname\tsource\tfeature\tstart\tend\tscore\tstrand\tframe\n";
	}
	for ($i=0;$i<$cffm;$i++){
		print VOUT "gene$i\t.\tgene\t$cafe_gi_start[$i]\t$cafe_gi_end[$i]\t1\t.\t.\n";
	}
	for ($i=0;$i<$cff;$i++){
		print VOUTF "gene$i\t.\tgene\t$cafe_f_gi_start[$i]\t$cafe_f_gi_end[$i]\t1\t.\t.\n";
	}

	open(VISL, ">$name\_CAFE_label");
	print VISL "gene\t1";

	# Without -verbose the XML builder is passed "-verbose s".
	my $builder_verbosity=($verb==0) ? " -verbose s" : "";
	foreach my $variant (['feature_table','marker'], ['full_feature_table','full']){
		my ($table,$tag)=@$variant;
		system ("perl cgview_xml_builder.pl -sequence $in_filename$builder_verbosity -genes $name\_CAFE_$table -title \"$name\" -labels_to_show $name\_CAFE_label -custom featureThickness=45 labelFontSize=45 -output $name\_CAFE_$tag.xml");
	}

	push (@vers, "$name\_CAFE_marker.xml", "$name\_CAFE_full.xml");
	$ci=0;

	#Edit XML file to include labels and color GIs
	foreach $vers (@vers){
		open(VISF, $vers);
		@file=<VISF>;
		open(VISF1,">$name\_CAFE1\_$ci.xml");

		# Labels are numbered downwards, starting from the number of
		# showLabel lines in the file.
		$count=scalar(grep { $_=~m/showLabel/ } @file);

		foreach(@file){
			if ($_=~m/showLabel/){
				$_=~s/showLabel="false"/label=\"GI-$count" showLabel="true"/;
				$count-=1;
			}
			$_=~s/plain, 20/plain, 45/;
			$_=~s/showShading="true"/showShading="false"/;
			$_=~s/51,51,51/0,0,204/;
			$_=~s/plain, 80" text/italics, 80" text/;

			print VISF1 "$_";
		}

		$ci+=1;
	}
	system ("java -jar cgview.jar -i $name\_CAFE1_0.xml -o $name\_CAFE_marker.png > $name\_CAFE_CGViewout");
}

#system ("rm temp");

# Remove intermediate files unless -expert asked to keep them.
if ($expert==0){
	my @temp_files=(
		"$name\_CAFE_1",
		"$name\_CAFE.ptt",
		"$name\_CAFE.faa",
		"$name\_CAFE_extract_out",
		"$name\_CAFE_seg_clus_out.txt",
		"$name\_CAFE_enrichment.txt",
	);

	# Prodigal / hmmscan output exists only with -annot.
	push(@temp_files, "$name\_CAFE_stdout_out.txt", "$name\_CAFE_sco_file.txt", "$name\_CAFE_table.txt")
		if (defined$annotation);

	push(@temp_files, "$name\_phy", "$name\_blast_output") if($phylo==1);
	push(@temp_files, "faa_database") if (defined$phylo);

	# Files written by gbk() are named after the input file, which differs
	# from $name when -out is used.
	push(@temp_files, "$infile\_CAFE.fna", "$infile\_CAFE.ptt", "$infile\_CAFE.faa")
		if (defined$gbk);

	push(@temp_files,
		"$name\_CAFE_full.xml", "$name\_CAFE_marker.xml",
		"$name\_CAFE1_0.xml", "$name\_CAFE1_1.xml",
		"$name\_CAFE_feature_table", "$name\_CAFE_full_feature_table",
		"$name\_CAFE_label", "$name\_CAFE_CGViewout",
	) if ($visual==1);

	push(@temp_files, $cafetemp);
	push(@temp_files, "cafe_temp") if ($cpcafetemp==1);

	# unlink() needs no shell and files that were never created are skipped quietly.
	foreach my $temp_file (@temp_files){
		next unless (defined($temp_file) && -e $temp_file);
		unlink($temp_file) or warn "Could not remove $temp_file: $!\n";
	}
}


# Print the option table built by getOptions() and the three supported
# invocation forms to STDERR, then exit.
sub usage {

	foreach (@Options) {
		if (ref) {
			# Option entry: show its name, description and default (if any).
			my $def = defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
			$def = ($def ? ' (default OFF)' : '(default ON)') if $_->{OPT} =~ m/!$/;
			my $opt = $_->{OPT};
			# Turn the Getopt::Long type suffix into a readable placeholder.
			$opt =~ s/!$//;
			$opt =~ s/=s$/ [X]/;
			$opt =~ s/=i$/ [N]/;
			$opt =~ s/=f$/ [n.n]/;
			printf STDERR "  --%-15s %s%s\n", $opt, $_->{description}, $def;
		}
		else {
			# Plain string: section heading.
			print STDERR "$_\n";
		}
	}
	print STDERR "\nUsage:\n  $0 [options] -gbk genome.gbk\n\t\tOR\n  $0 [options] -annot genome.fna\n\t\tOR\n  $0 [options] genome.fna genome.ptt\n\n";

	exit;
}

# Print the program banner and exit.
sub info {
	my $banner = "CAFE is a genomic island prediction tool. This is CAFE v1.0 2017, written by Mehul Jani <mehuljani\@my.unt.edu>\n";
	print $banner;
	exit;
}


# Option callback for -Thres: takes the next three command-line arguments as
# the segmentation, contiguous clustering and non-contiguous clustering
# thresholds. Exits with a message unless all three are numbers in 0..1.
sub Thres {
	($seg,$clus1,$clus2)=splice(@ARGV,0,3);

	my $number=qr/^-?\d+\.?\d*$/; #matches real numbers
	my $all_numeric=defined($seg) && defined($clus1) && defined($clus2)
		&& $seg=~$number && $clus1=~$number && $clus2=~$number;
	unless ($all_numeric){
		print "Input segmentation contiguous clustering and non-contiguous clustering thresholds\n";
		exit;
	}

	#check if thresholds are in range
	foreach my $threshold ($seg,$clus1,$clus2){
		next if ($threshold>=0 && $threshold<=1);
		print "Thresholds should be in range 0 to 1\n";
		exit;
	}

	print "Thresholds are $seg $clus1 $clus2\n";
}

# Option callback for -verbose: enable progress messages.
sub verb {
	return $verb = 1;
}

# Option callback for -expert: keep the temporary files.
sub expert {
	return $expert = 1;
}

# Option callback for -visual: request the genomic island map.
sub visual {
	return $visual = 1;
}

# Option callback for -gbk: takes the GenBank file name from @ARGV and
# converts it into three files named after it:
#   <infile>_CAFE.fna  nucleotide sequence (FASTA)
#   <infile>_CAFE.ptt  protein table (one row per non-pseudo CDS)
#   <infile>_CAFE.faa  protein translations; the FASTA ids are running gene
#                      numbers in the row order of the ptt table
# The sequence of the last record read is left in $sequence.
# Exits when a sequence has already been loaded ($nce defined) or when the
# file does not exist.
sub gbk {
	$gbk=1;

	$infile=shift(@ARGV);
	# A sequence was already loaded earlier: stop here.
	if (defined $nce){exit;}
	unless (defined($infile) && -e $infile) {
		print "Sequence file does not exist\n";
		exit;
	}
	$nce=1;
	my $seqio = Bio::SeqIO->new(
		                        -file   =>  $infile,
		                        -format =>  'genbank',
		                    );
	open(my $PTT, ">", $infile . "_CAFE.ptt") or die "Cannot write $infile\_CAFE.ptt: $!\n";
	open(FNA, ">", $infile . "_CAFE.fna") or die "Cannot write $infile\_CAFE.fna: $!\n";
	open(FAA, ">", $infile . "_CAFE.faa") or die "Cannot write $infile\_CAFE.faa: $!\n";
	$gene_no=1;
	while (my $seq = $seqio->next_seq()) {

		# Parentheses are required: '.' binds tighter than '||', so without
		# them the length suffix would be added to the 'unknown' fallback only.
		my $header1 = ($seq->desc() || 'unknown') . " - 1.." . $seq->length();
		$sequence=$seq->seq;
		# Terminate the record with a newline so a following record starts
		# on its own line.
		print FNA ">$header1 ", "\n", $sequence, "\n";

		my @CDS = $seq->get_SeqFeatures('CDS');

		print $PTT $header1 . "\n";
		print $PTT scalar(@CDS) . " proteins" . "\n";
		print $PTT "Location\tStrand\tLength\tPID\tGene\tSynonym\tCode\tCOG\tProduct\n";

		for my $feature (@CDS){

			next if $feature->has_tag('pseudo');

			my $start = $feature->start();
			my $stop = $feature->end();
			my $strand = $feature->strand();
			my $length = $feature->length();
			# Every tag is checked with has_tag() before it is read, so that a
			# CDS lacking one of them gets a '-' placeholder.
			my @synonym = $feature->has_tag('locus_tag') ? $feature->get_tag_values('locus_tag') : ('-');
			my @pid = $feature->has_tag('protein_id') ? $feature->get_tag_values('protein_id') : @synonym;
			my @gene = $feature->has_tag('gene') ? $feature->get_tag_values('gene') : ('-');
			my $code = '-';
			my $cog = '-';
			my @description = $feature->has_tag('product') ? $feature->get_tag_values('product') : ('-');
			my @protein = $feature->has_tag('translation') ? $feature->get_tag_values('translation') : ();

			if ($strand > 0) {
			    $strand = '+';
			} elsif ($strand < 0) {
			    $strand = '-';
			}
			print $PTT $start . ".." . "$stop\t$strand\t$length\t$pid[0]\t$gene[0]\t$synonym[0]\t$code\t$cog\t$description[0]\n";

			# A CDS without translation gets no protein record, but still
			# consumes its gene number so numbering stays aligned with the ptt rows.
			print FAA ">$gene_no\n$protein[0]\n" if (@protein);
			$gene_no++;
		}

	}
	close($PTT) or die "Cannot close $infile\_CAFE.ptt: $!\n";
	close(FNA) or die "Cannot close $infile\_CAFE.fna: $!\n";
	close(FAA) or die "Cannot close $infile\_CAFE.faa: $!\n";
}

# Option callback for -phylo: enable the phylogenetic distribution module.
sub phylo {
	return $phylo = 1;
}

# Option callback for -genus: takes the genus of the query genome from the
# next command-line argument, stores it in $input_genus and returns it.
sub genus {
	return $input_genus = shift(@ARGV);
}

# End of the pipeline when the phylo module was not requested.
# By the time execution reaches this point a sequence has always been loaded
# ($nce is set by the Fasta branch or by gbk()), so the gbk() call that used
# to stand here exited immediately and the BLAST-parsing code after it,
# including a "return" outside any subroutine, could never run. The explicit
# exit keeps that behaviour.
if ($phylo==0){
	exit;
}

#getOptions();


# Build the option table (@Options), show the usage text when no arguments
# were given or parsing fails, and let Getopt::Long fill the variables or
# invoke the callbacks listed under VAR.
sub getOptions {
	use Getopt::Long;

	# Plain strings are section headings for usage(); hash refs are options.
	@Options = (
	"",'Options:',
	{OPT=>"help",    VAR=>\&usage,             description=>"This help"},
	{OPT=>"info", VAR=>\&info, description=>"Information about program"},
	'Input:',
	{OPT=>"annot", VAR=>\$annotation, description=>"Annotate marker genes (Requires prodigal and Hmmer)"},
	{OPT=>"Thres", VAR=>\&Thres, description=>"Provide segmentation, contiguous clustering and non-contiguous clustering thresholds (range: 0-1)"},
	{OPT=>"gbk", VAR=>\&gbk, description=>"Use genbank as input file"},
	{OPT=>"phylo", VAR=>\&phylo, description=>"Use phylogenetics"},
	{OPT=>"genus", VAR=>\&genus, description=>"Write genus of the query genome"},
	'Output:',
	{OPT=>"out", VAR=>\$outfile, description=>"Output file name"},
	{OPT=>"verbose", VAR=>\&verb, description=>"print on screen"},
	{OPT=>"expert", VAR=>\&expert, description=>"keep temporary files for user analyses"},
	{OPT=>"visual", VAR=>\&visual, description=>"Make map of genomic islands (Requires CGView)"},
	);

	usage() unless @ARGV;

	my @option_entries = grep { ref } @Options;
	my @spec = map { ($_->{OPT}, $_->{VAR}) } @option_entries;
	GetOptions(@spec) or usage();

	# Apply declared defaults to options that were not set on the command line.
	foreach my $entry (@option_entries) {
		next unless defined($entry->{DEFAULT});
		next if defined(${$entry->{VAR}});
		${$entry->{VAR}} = $entry->{DEFAULT};
	}
}