#!/usr/bin/perl
# Author: Olivier Mirabeau
# Purpose:  process the signal peptide file EUKSIG.red (JP Vert 2002) and write files containing relevant variables related to the n,h,c regions of the signal peptide, like frequencies and length distributions

# Take the input file name from the first command-line argument; when no
# argument was given, prompt for it and read it from standard input.
if (@ARGV) {
	$inputfile = $ARGV[0];
}
else {
	print "Enter the signal peptide fasta file: ";
	$inputfile = <STDIN>;
	chomp($inputfile);
}


# Maps each one-letter amino-acid code to its column index (0..19); compFreq
# uses these indices for its frequency vector. The position of a letter in
# the list below is its index.
my %aminoacids;
@aminoacids{qw(A R N D C Q E G H I L K M F P S T W Y V)} = (0 .. 19);


# Per-sequence tables, all indexed by record number in input order.
my @indexc;      # column of the last "C" on the record's annotation line
my @indexh;      # start offset of the selected hydrophobic window
my @sequences;   # sequence line of the record (a line starting with "M")
my @names;       # identifier captured from the record's header line
my @smallnames;  # part of the identifier before the splitting underscore
my @organisms;   # part of the identifier after the splitting underscore
my @optwinsize;  # declared but not used anywhere in this file

# Three-argument open keeps the mode separate from the file name, so a name
# with leading/trailing blanks or shell-like characters is taken literally.
open(IN, '<', $inputfile) || die "couldn't open file ".$inputfile.": $!";

# Every output file is named "<input file><suffix>". The suffixes (including
# the ones without a leading underscore) are unchanged from the original
# script so that existing consumers of these files keep working.
# Opening through a glob reference fills the bareword handle of the same
# name, which is what the rest of the script prints to.
my @outputs = (
	[ \*OUTNTER,        "_nter_freq.txt" ],
	[ \*OUTHydro,       "_hydro_freq.txt" ],
	[ \*OUTCter,        "cter_freq.txt" ],
	[ \*OUTNTERLength,  "_nter_length.txt" ],
	[ \*OUTHydroLength, "_hydro_length.txt" ],
	[ \*OUTCterLength,  "cter_length.txt" ],
	[ \*OUTnames,       "names.txt" ],
	[ \*OUTseq,         "seq.txt" ],
	[ \*OUTseqn,        "seqn.txt" ],
	[ \*OUTseqh,        "seqh.txt" ],
	[ \*OUTseqc,        "seqc.txt" ],
	[ \*OUTLength,      "_sig_nhlength.txt" ],
	[ \*OUTnFreqMysql,  "_nfreqmysql.txt" ],
	[ \*OUThFreqMysql,  "_hfreqmysql.txt" ],
	[ \*OUTc1FreqMysql, "_c1freqmysql.txt" ],
	[ \*OUTc2FreqMysql, "_c2freqmysql.txt" ],
	[ \*OUTc3FreqMysql, "_c3freqmysql.txt" ],
	[ \*OUT,            "nhc_freq.txt" ],  # the n, h and c frequencies
);
foreach my $output (@outputs) {
	my ($handle, $suffix) = @$output;
	my $path = $inputfile.$suffix;
	# Fail loudly: an unchecked failed open would silently discard all output
	# later printed to this handle.
	open($handle, '>', $path) || die "couldn't open file ".$path.": $!";
}



my $compt=0;   # index of the record being read; ends up as the record count
my $line="";

######################## computes cleavage indexes #####################
# Each line of the input is classified as follows:
#   - header line     : contains at least one non-word character; the
#                       identifier is the first whitespace-delimited token
#                       of the form word_word
#   - sequence line   : only word characters and starts with "M"
#   - annotation line : anything else; the column of its last "C" is stored
#                       as the cleavage index and the record is complete
# NOTE(review): a sequence line that does not start with "M" is therefore
# treated as an annotation line - confirm the input never contains one.
while (<IN>) {
	# Remove LF and CRLF line endings; a leftover "\r" is a non-word
	# character and would make every line look like a header.
	s/[\r\n]+\z//;
	$line=$_;
	# A blank line carries no data; without this test it would fall into the
	# annotation branch and create an empty phantom record.
	next if $line eq "";
	if (/[^\w]/) {
		if ($line =~ /\s+(\w+\_\w+)\s+/) {
			$names[$compt]=$1;
		}
	}
	elsif (/^M/) {
		$sequences[$compt]=$line;
	}
	else {
		# Same result as scanning every column and keeping the last match.
		# The original also printed the index to a CLEAVOUT handle that is
		# never opened in this file, so nothing was ever written; that dead
		# print has been removed.
		my $lastc = rindex($line,"C");
		$indexc[$compt]=$lastc if $lastc>=0;
		$compt++;
	}
}

# Split every identifier around an underscore (greedy match, so the split
# happens at the last underscore that still leaves a non-empty suffix).
for (my $i=0;$i<$#names+1;$i++) {
	if ($names[$i] =~ /(\w+)_(\w+)/) {
		$smallnames[$i] = $1;
		$organisms[$i] = $2;
	}
}

my $numseq = $compt;
# The newline keeps this message from running into the next one.
print "number of sequences ".$numseq."\n";

# Largest cleavage index over all records.
my $maxsizecleav=0;
for (my $i=0;$i<$numseq;$i++) {
	if ($indexc[$i]>$maxsizecleav) {
		$maxsizecleav=$indexc[$i];
	}
}
print "maxsizecleav ".$maxsizecleav."\n";

my @bestsize;  # length of the hydrophobic window kept for each sequence
my @ratio;     # best hydrophobic fraction (count / window size) for each sequence

######################## picks, for each sequence, the window size (10 to 16) with the highest hydrophobic fraction #####################

my $maxbeforeh=0; # largest window start offset seen over all sequences
my $maxafterc=0;  # largest (cleavage index - 3 - window start) seen over all sequences
my $maxsize=0;
foreach my $s (0 .. $numseq-1) {
	$ratio[$s]=0;
	$bestsize[$s]=0;
	foreach my $win (10 .. 16) {
		# besthydroindex returns (start offset, hydrophobic count) of the best window
		my ($start,$count) = besthydroindex($sequences[$s],$win);
		my $fraction = $count/$win;
		# only a strictly better fraction replaces the current choice, so the
		# smallest window size wins ties
		next unless $fraction>$ratio[$s];
		$ratio[$s]=$fraction;
		$bestsize[$s]=$win;
		$indexh[$s]=$start;
	}
	$maxbeforeh=$indexh[$s] if $indexh[$s]>$maxbeforeh;

	# number of residues between the window start and the three residues
	# that precede the cleavage index
	my $span=$indexc[$s]-3-$indexh[$s];
	# the window is shortened when it would otherwise run past that point
	$bestsize[$s]=$span if $span<$bestsize[$s];
	$maxafterc=$span if $span>$maxafterc;
}
#print "maxbeforeh ".$maxbeforeh."\n";
#print "maxafter ".$maxafterc."\n";

##############BOUNDARIES ON THE GAPS
#$maxafterc -= 32;
#$maxbeforeh -= 65 ;
# Builds, for every sequence, a gap-padded string (n part, gap, h part, gap,
# rest up to the cleavage index, last three residues before the cleavage
# index) and checks that all of them have the same length as the first one.
# Sequences whose padded length differs are reported on standard output.
my $exprlength = 0;   # padded length of the first sequence, used as reference
for (my $i=0;$i<$numseq;$i++) {

	my $gapbeforeh = $maxbeforeh-$indexh[$i];
	my $gapafterc = $maxafterc - $indexc[$i]+$indexh[$i]+3;

	# substr() reads a negative length as "leave that many characters off the
	# end", so a window starting at offset 0 would turn the n part into almost
	# the whole sequence; clamp the length at 0 instead.
	my $nlength = $indexh[$i]-1;
	$nlength = 0 if $nlength<0;

	# Offset just past the hydrophobic window. The original read $indexh[i]
	# (bareword subscript, i.e. always element 0) here instead of $indexh[$i].
	my $hend = $indexh[$i]+$bestsize[$i];

	my $expression = substr($sequences[$i],0,$nlength)
		.insertions($gapbeforeh)
		.substr($sequences[$i],$indexh[$i],$bestsize[$i])
		.insertions($gapafterc)
		.substr($sequences[$i],$hend,$indexc[$i]-$hend)
		.substr($sequences[$i],$indexc[$i]-3,3);

	# lengths are numbers: compare them with ==, not eq
	if ($i==0 || length($expression)==$exprlength) {
	#	print OUT $names[$i]."   ".$expression."\n";
		$exprlength = length $expression;
	}
	else {
		print "length ".$i." ".length($expression)."\n";
	}
}


############### print the different lengths for matlab ################
# One line per kept sequence in each file: the hydrophobic window start
# offset goes to OUTNTERLength and the window length to OUTHydroLength.
# A sequence is kept when 0 <= start offset < 15 and 0 <= window length < 20.
# The original tested an undeclared array (@index) instead of @indexh, so
# the start-offset limits were never applied.
for (my $i=0;$i<$numseq;$i++) {
	if ($indexh[$i]>=0 && $indexh[$i]<15 && $bestsize[$i]>=0 && $bestsize[$i]<20) {
		print OUTNTERLength $indexh[$i]."\n";
		print OUTHydroLength $bestsize[$i]."\n";
	}
}

############### print the different lengths for mysql ################
# Tab-separated: short name, organism, window start offset, window length.
# No filtering is applied here: every sequence gets a line.
for (my $i=0;$i<$numseq;$i++) {
	print OUTLength $smallnames[$i]."\t".$organisms[$i]."\t".$indexh[$i]."\t".$bestsize[$i]."\n";
}



my $co=0;   # not used anywhere in this file

# Concatenation of one region over all sequences; these strings are the
# input of compFreq. They start as empty strings so that they are defined
# even when no sequence was read.
my $totalseqn="";
my $totalseqh="";
my $totalseqc="";
my $totalseqc1="";
my $totalseqc2="";
my $totalseqc3="";

# Region strings per sequence.
my @nseq;
my @hseq;
my @cseq;   # was an undeclared package global in the original
###### FOR THE THREE POSITIONS OF THE CLEAVAGE SITE #################
my @cseq1;
my @cseq2;
my @cseq3;

for (my $i=0;$i<$#sequences+1;$i++) {
	# n region: the first (window start - 1) residues. substr() reads a
	# negative length as "leave that many characters off the end", so a window
	# starting at offset 0 would give almost the whole sequence; clamp at 0.
	# NOTE(review): the residue just before the window start belongs to
	# neither the n nor the h region - confirm that this is intended.
	my $nlength = $indexh[$i]-1;
	$nlength = 0 if $nlength<0;

	$nseq[$i] = substr($sequences[$i],0,$nlength);
	# h region: the selected hydrophobic window
	$hseq[$i] = substr($sequences[$i],$indexh[$i],$bestsize[$i]);
	# c region: the three residues before the cleavage index
	# NOTE(review): with a cleavage index below 3 the offset is negative and
	# substr() counts from the end of the sequence - confirm inputs avoid this.
	$cseq[$i] = substr($sequences[$i],$indexc[$i]-3,3);

	print "n region : ".$nseq[$i]."\n";
	print "h region : ".$hseq[$i]."\n";
	print "c region : ".$cseq[$i]."\n";

	# the three c-region positions, taken one residue at a time
	$cseq1[$i] = substr($sequences[$i],$indexc[$i]-3,1);
	$cseq2[$i] = substr($sequences[$i],$indexc[$i]-2,1);
	$cseq3[$i] = substr($sequences[$i],$indexc[$i]-1,1);

	$totalseqn .= $nseq[$i];
	$totalseqh .= $hseq[$i];
	$totalseqc .= $cseq[$i];
	$totalseqc1 .= $cseq1[$i];
	$totalseqc2 .= $cseq2[$i];
	$totalseqc3 .= $cseq3[$i];

}
# Amino-acid letters ordered by their index in %aminoacids (alphabetical
# order would break a tie between equal indices).
my @sortedkeys = sort { $aminoacids{$a} <=> $aminoacids{$b} || $a cmp $b } keys %aminoacids;

# Residue frequencies of the pooled n regions, h regions and c regions, and
# of each of the three c-region positions taken separately.
my @nvec  = compFreq($totalseqn);
my @hvec  = compFreq($totalseqh);
my @cvec  = compFreq($totalseqc);
my @cvec1 = compFreq($totalseqc1);
my @cvec2 = compFreq($totalseqc2);
my @cvec3 = compFreq($totalseqc3);

my @columns = (0 .. $#sortedkeys);

################## header lines of the OUT file:
# number of states (5), number of symbols/amino acids (20), then the two
# further values 0 and 1, one value per line
print OUT join("\n", 5, 20, 0, 1), "\n";

############# state transition matrix: one tab-separated row per state ###################
# (a first-row variant with 0.65 / 0.35 was disabled in the original script)
foreach my $transitions (
	[qw(0.5 0.5 0 0 0)],
	[qw(0 0.75 0.25 0 0)],
	[qw(0 0 0 1 0)],
	[qw(0 0 0 0 1)],
	[qw(0 0 0 0 1)],
) {
	print OUT join("\t", @$transitions)."\n";
}

############# frequency rows: n, h, c-3, c-2, c-1 ###################
# Every value is followed by a tab; rows are separated by a newline and the
# last row is left without one (the next print to OUT starts with a newline).
my @emissionrows;
foreach my $vector (\@nvec, \@hvec, \@cvec1, \@cvec2, \@cvec3) {
	my $row = "";
	$row .= $vector->[$_]."\t" foreach @columns;
	push @emissionrows, $row;
}
print OUT join("\n", @emissionrows);

# the column order is echoed on standard output
print $_." " foreach @sortedkeys;

############# one tab-separated line per amino acid in each mysql frequency file
foreach my $col (@columns) {
	my $aa = $sortedkeys[$col];
	print OUTc1FreqMysql join("\t", "c-3signal", $aa, $cvec1[$col], "Euka")."\n";
	print OUTc2FreqMysql join("\t", "c-2signal", $aa, $cvec2[$col], "Euka")."\n";
	print OUTc3FreqMysql join("\t", "c-1signal", $aa, $cvec3[$col], "Euka")."\n";
	print OUTnFreqMysql  join("\t", "nsignal",   $aa, $nvec[$col],  "Euka")."\n";
	print OUThFreqMysql  join("\t", "hsignal",   $aa, $hvec[$col],  "Euka")."\n";
}

################ writing the initial pi probabilities ##############
print OUT "\n".join("\t", 1, 0, 0, 0, 0);



print "length of multiple ali ".$exprlength."\n";

#print "$atypical atypical sequences are ignored.\n";
#print "$seqnotaligned sequences are discarded because they are not well subdivided.\n";

# The original recomputed @sortedkeys, @nvec, @hvec and @cvec here with a
# second "my"; the copies masked the earlier variables and were never read
# afterwards, so the recomputation has been removed.

# now we print the files for loading into mysql
# One tab-separated line per identifier in each file:
#   OUTseq  : short name, organism, "signal", sequence, window start, window length
#   OUTseqn : short name, organism, "nsig", n region
#   OUTseqh : short name, organism, "hsig", h region
#   OUTseqc : short name, organism, "csig", c region
for (my $i=0;$i<$#names+1;$i++) {
	my $key = $smallnames[$i]."\t".$organisms[$i]."\t";
	print OUTseq  $key."signal\t".$sequences[$i]."\t".$indexh[$i]."\t".$bestsize[$i]."\n";
	print OUTseqn $key."nsig\t".$nseq[$i]."\n";
	print OUTseqh $key."hsig\t".$hseq[$i]."\n";
	print OUTseqc $key."csig\t".$cseq[$i]."\n";
}

# Close every handle this script opens. The original closed two handles that
# are never opened (OUTNTERLengthForMysql, OUTHydroLengthForMysql) and left
# OUTLength, OUTnFreqMysql and OUThFreqMysql to be closed at exit.
# Buffered write errors only show up at close time, so a failure is reported.
foreach my $handle (qw(
	IN OUT
	OUTNTER OUTHydro OUTCter
	OUTNTERLength OUTHydroLength OUTCterLength
	OUTnames OUTseq OUTseqn OUTseqh OUTseqc
	OUTLength
	OUTnFreqMysql OUThFreqMysql
	OUTc1FreqMysql OUTc2FreqMysql OUTc3FreqMysql
)) {
	no strict 'refs';   # the handles are package globals addressed by name
	close($handle) || warn "couldn't close ".$handle.": $!\n";
}

###########sub insertions#############################
# Returns a gap string made of "-" characters.
#   $_[0] : wanted number of characters; zero, negative or undefined gives ""
# NOTE: an identical insertions() is defined again further down in this
# file; Perl uses the definition that is compiled last.
sub insertions {
	my ($count) = @_;
	my $gap = "";
	$gap .= "-" while length($gap) < $count;
	return $gap;
}

###########sub compFreq#############################
# Relative frequency of each of the 20 amino acids in a string.
#   $_[0] : string of one-letter residue codes (may be empty or undefined)
# Returns a list of 20 numbers, indexed by the values of the file-level
# %aminoacids hash.
# Characters that are not keys of %aminoacids are ignored; the original
# looked them up anyway, got an undefined index and so counted them in
# column 0 ("A"). Frequencies are taken over the recognised residues only.
# An input without any recognised residue gives 20 zeros instead of dying
# with a division by zero.
# NOTE: compFreq() is defined again further down in this file; Perl uses the
# definition that is compiled last.
sub compFreq {
	my $seq = $_[0];
	$seq = "" unless defined $seq;
	my @countFreq = (0) x 20;
	my $counted = 0;   # number of recognised residues
	for (my $i=0;$i<length($seq);$i++) {
		my $column = $aminoacids{substr($seq,$i,1)};
		next unless defined $column;
		$countFreq[$column]++;
		$counted++;
	}
	return(@countFreq) if $counted==0;
	for (my $i=0;$i<20;$i++) {
		$countFreq[$i]=$countFreq[$i]/$counted;
	}
	return(@countFreq);
}

###########sub insertions#############################
# Builds a run of "-" characters used as alignment padding.
#   $_[0] : number of characters wanted; zero, negative or undefined gives ""
# NOTE: this is the second definition of insertions() in this file; it
# replaces the identical one declared earlier.
sub insertions {
	my ($wanted) = @_;
	my $dashes = "";
	while (length($dashes) < $wanted) {
		$dashes = $dashes."-";
	}
	return $dashes;
}

###########sub compFreq#############################
# Relative frequency of each of the 20 amino acids in a string.
#   $_[0] : string of one-letter residue codes (may be empty or undefined)
# Returns a list of 20 numbers, indexed by the values of the file-level
# %aminoacids hash.
# Characters that are not keys of %aminoacids are ignored; the original
# looked them up anyway, got an undefined index and so counted them in
# column 0 ("A"). Frequencies are taken over the recognised residues only.
# An input without any recognised residue gives 20 zeros instead of dying
# with a division by zero.
# NOTE: this is the second definition of compFreq() in this file; it
# replaces the one declared earlier.
sub compFreq {
	my $seq = $_[0];
	$seq = "" unless defined $seq;
	my @countFreq = (0) x 20;
	my $counted = 0;   # number of recognised residues
	for (my $i=0;$i<length($seq);$i++) {
		my $column = $aminoacids{substr($seq,$i,1)};
		next unless defined $column;
		$countFreq[$column]++;
		$counted++;
	}
	return(@countFreq) if $counted==0;
	for (my $i=0;$i<20;$i++) {
		$countFreq[$i]=$countFreq[$i]/$counted;
	}
	return(@countFreq);
}

###########sub besthydroindex#############################
# given a window of size n, choose "best index of hydrophobicity"
# Slides a window over a sequence and finds the window with the most
# hydrophobic residues, as counted by hydrophobicity().
#   $_[0] : sequence string
#   $_[1] : window size
# Returns the list (start offset of the best window, its hydrophobic count).
# The first window reaching the maximum wins ties. (0, 0) is returned when
# the sequence is shorter than the window or has no hydrophobic residue.
# The last valid start offset is length - window size; the original loop
# stopped one position early and never examined that final window.
# NOTE: besthydroindex() is defined again further down in this file; Perl
# uses the definition that is compiled last.
sub besthydroindex {
	my ($seq,$winsize) = @_;
	$seq = "" unless defined $seq;
	my $laststart = length($seq)-$winsize;
	my $max=0;
	my $indexmax=0;
	for (my $i=0;$i<=$laststart;$i++) {
		my $hydro = hydrophobicity(substr($seq,$i,$winsize));
		if ($hydro>$max) {
			$max = $hydro;
			$indexmax = $i;
		}
	}
	return ($indexmax,$max);
}
###########hydrophobicity#############################

# Counts the characters of a string for which ishydrophobic() is true.
#   $_[0] : string of one-letter residue codes
# Returns the count (0 for an empty or undefined string).
# NOTE: hydrophobicity() is defined again further down in this file; Perl
# uses the definition that is compiled last.
sub hydrophobicity {
	my ($residues) = @_;
	my $hits = grep { ishydrophobic($_) } split(//, $residues);
	return $hits;
}


####### simple aa evaluation methods #####################################
# Returns 1 when the argument contains one of the letters A, I, L or V
# (case-insensitive), 0 otherwise.
# A wider letter set, [AILMFVW], is present in the original as a disabled
# alternative.
sub ishydrophobic {
	my ($residue) = @_;
	return ($residue =~ /[AILV]/i) ? 1 : 0;
}

# Returns 1 when the argument contains one of the letters R, K, D or E
# (case-insensitive), 0 otherwise.
sub ischarged {
	my ($residue) = @_;
	return ($residue =~ /[RKDE]/i) ? 1 : 0;
}

# Non-hydrophobic test: returns 1 when the argument contains one of the
# letters C, D, E, G, H, K, N, P, Q, R, S, T or Y (case-insensitive),
# 0 otherwise.
sub nh {
	my ($residue) = @_;
	return ($residue =~ /[CDEGHKNPQRSTY]/i) ? 1 : 0;
}

###########sub besthydroindex#############################
# given a window of size n, choose "best index of hydrophobicity"
# Slides a window over a sequence and finds the window with the most
# hydrophobic residues, as counted by hydrophobicity().
#   $_[0] : sequence string
#   $_[1] : window size
# Returns the list (start offset of the best window, its hydrophobic count).
# The first window reaching the maximum wins ties. (0, 0) is returned when
# the sequence is shorter than the window or has no hydrophobic residue.
# The last valid start offset is length - window size; the original loop
# stopped one position early and never examined that final window.
# NOTE: this is the second definition of besthydroindex() in this file; it
# replaces the one declared earlier.
sub besthydroindex {
	my ($seq,$winsize) = @_;
	$seq = "" unless defined $seq;
	my $laststart = length($seq)-$winsize;
	my $max=0;
	my $indexmax=0;
	for (my $i=0;$i<=$laststart;$i++) {
		my $hydro = hydrophobicity(substr($seq,$i,$winsize));
		if ($hydro>$max) {
			$max = $hydro;
			$indexmax = $i;
		}
	}
	return ($indexmax,$max);
}
###########hydrophobicity#############################

# Number of characters in a string that ishydrophobic() accepts.
#   $_[0] : string of one-letter residue codes
# Returns the count (0 for an empty or undefined string).
# NOTE: this is the second definition of hydrophobicity() in this file; it
# replaces the identical one declared earlier.
sub hydrophobicity {
	my ($residues) = @_;
	my $total = 0;
	foreach my $residue (split(//, $residues)) {
		$total++ if ishydrophobic($residue);
	}
	return $total;
}


####### simple aa evaluation methods #####################################
# Tells whether the argument contains one of the letters A, I, L or V
# (case-insensitive): returns 1 if so, 0 otherwise.
# A wider letter set, [AILMFVW], is present in the original as a disabled
# alternative.
sub ishydrophobic {
	my ($aa) = @_;
	return 1 if $aa =~ /[AILV]/i;
	return 0;
}

# Tells whether the argument contains one of the letters R, K, D or E
# (case-insensitive): returns 1 if so, 0 otherwise.
sub ischarged {
	my ($aa) = @_;
	return 1 if $aa =~ /[RKDE]/i;
	return 0;
}

# Non-hydrophobic test: tells whether the argument contains one of the
# letters C, D, E, G, H, K, N, P, Q, R, S, T or Y (case-insensitive);
# returns 1 if so, 0 otherwise.
sub nh {
	my ($aa) = @_;
	return 1 if $aa =~ /[CDEGHKNPQRSTY]/i;
	return 0;
}

