#!/usr/bin/perl -w

# C.Vogel, UT Austin, TX; April 2008

# input: 
# 	list of proteins and their tryptic peptides
#	protein_id *tab* peptides [separated by *space*]
# output:
#	list of peptides and their properties
# 	pseudo-arff format (.arf)
# 	protein_id *tab* peptide *tab* properties [separated by *comma*]


# ======================================================================================
# Warm up:

use strict;
$| = 1;    # autoflush the currently selected output handle

# Exactly one command-line argument is accepted: the protein digest list
# (input format is described in the header comment of this script).
@ARGV == 1
	or die "\nnp_peptide_properties <list of proteins and peptides (format description see script)>\n";
my ($protein_digest) = @ARGV;

# ======================================================================================
# Features list:

# One-letter codes of the 20 amino acids; each yields an absolute-count
# feature (e.g. "A") and a relative-frequency feature (e.g. "A_rel").
my @alphabet     = qw(A C D E F G H I K L M N P Q R S T V W Y);
my @rel_alphabet = map { $_ . "_rel" } @alphabet;

# AAindex entries used as per-residue property tables. Every entry
# contributes two features, "<ID> sum" and "<ID> avg", in this order.
my @aaindex_ids = (
	"CHOP780201",    # Normalized frequency of alpha-helix (Chou-Fasman, 1978b)
	"CHOP780202",    # Normalized frequency of beta-sheet (Chou-Fasman, 1978b)
	"CHOP780203",    # Normalized frequency of beta-turn (Chou-Fasman, 1978b)

	"WERD780101",    # Propensity to be buried inside (Wertz-Scheraga, 1978)
	"ZIMJ680104",    # Isoelectric point (Zimmerman et al., 1968)
	"KLEP840101",    # Net charge (Klein et al., 1984)
	"EISD860102",    # Atom-based hydrophobic moment (Eisenberg-McLachlan, 1986)
	"FAUJ880111",    # Positive charge (Fauchere et al., 1988)

	"VINM940101",    # Normalized flexibility parameters (B-values), average (Vihinen et al., 1994)
	"FAUJ880103",    # Normalized van der Waals volume (Fauchere et al., 1988)
	"GUYH850105",    # Apparent partition energies calculated from Chothia index (Guy, 1985)
	"NOZY710101",    # Transfer energy, organic solvent/water (Nozaki-Tanford, 1971)
);

# Notes:
#	can possibly leave out @alphabet and all 'sums' of features as they are
#	likely to be correlated with sequence length
# 	can include further features from AAindex file
#
# Complete, ordered list of feature names; the order must match the order in
# which the feature values are pushed for each peptide further below.
my @features = (
	"length_aa",
	"molecular_weight",
	@alphabet,
	@rel_alphabet,
	map { ( "$_ sum", "$_ avg" ) } @aaindex_ids,
);

=pod
=cut

# Header: one summary line with the number of features, followed by one
# "@attribute <number>_<name> numeric" line per feature (numbering starts
# at 1; blanks inside the feature name are replaced by underscores).
print "\#YID\tSEQUENCE\tFEATURES TOTAL\t", scalar(@features), " \n";
my $attribute_number = 0;
foreach my $feature_name (@features) {
	$attribute_number++;
	my $attribute = "$attribute_number $feature_name";
	$attribute =~ tr/ /_/;
	print "\@attribute $attribute numeric\n";
} # foreach my $feature_name (@features)

# ======================================================================================
# Go through each line of input file, print properties of each peptide

# Per-residue property tables, built once and reused for every peptide.
# The order must match the "<ID> sum" / "<ID> avg" entries in @features.
my @property_tables = (
	# secondary structure attributes:
	aa_CHOP780201_hash(),	# Normalized frequency of alpha-helix (Chou-Fasman, 1978b)
	aa_CHOP780202_hash(),	# Normalized frequency of beta-sheet (Chou-Fasman, 1978b)
	aa_CHOP780203_hash(),	# Normalized frequency of beta-turn (Chou-Fasman, 1978b)

	# attributes identified by Mallick et al. [Nature Biotech, 2007] for MUDPIT-ESI:
	aa_WERD780101_hash(),	# Propensity to be buried inside (Wertz-Scheraga, 1978)
	aa_ZIMJ680104_hash(),	# Isoelectric point (Zimmerman et al., 1968)
	aa_KLEP840101_hash(),	# Net charge (Klein et al., 1984)
	aa_EISD860102_hash(),	# Atom-based hydrophobic moment (Eisenberg-McLachlan, 1986)
	aa_FAUJ880111_hash(),	# Positive charge (Fauchere et al., 1988)

	# additional attributes:
	aa_VINM940101_hash(),	# Normalized flexibility parameters (B-values), average (Vihinen et al., 1994)
	aa_FAUJ880103_hash(),	# Normalized van der Waals volume (Fauchere et al., 1988)
	aa_GUYH850105_hash(),	# Apparent partition energies calculated from Chothia index (Guy, 1985)
	aa_NOZY710101_hash(),	# Transfer energy, organic solvent/water (Nozaki-Tanford, 1971)
);

# Three-argument open with a lexical handle: the file name can never be
# interpreted as an open mode, and the OS error is reported on failure.
open(my $list_fh, '<', $protein_digest) or die "Cant open $protein_digest: $!\n";
while ( my $line = <$list_fh> ) {
	# chomp returns the number of characters removed, not the string, so the
	# line is chomped first and then tested.
	chomp $line;
	next unless $line =~ /\S/;	# skip empty / whitespace-only lines

	my ($YID, $peptides) = split("\t", $line); # see above for format of input list
	die "No ID for gene in line $line\n" unless defined($YID) && length($YID);

	# $peptides is undefined when the line contains no tab at all.
	my @peptides = defined($peptides) ? split(" ", $peptides) : ();
	die "No peptides for $YID in line $line\n" unless @peptides;

	foreach my $peptide (@peptides) {

		# analyse aa frequencies etc (basic attributes); 'X' residues are
		# removed before any property is computed:
		my $sequence = $peptide;
		$sequence =~ s/X//g;
		next unless length($sequence);

		# length, mass, absolute and relative frequencies of the sequence
		my ($length, $mass, $ar_absfreq, $ar_relfreq) = aa_frequencies($sequence);
		next unless $length >= 3; # minimum 3 amino acids per peptide

		# basic attributes:
		my @peptide_features = ($length, $mass, @$ar_absfreq, @$ar_relfreq);

		# sum and average of every per-residue property table:
		foreach my $table (@property_tables) {
			push(@peptide_features, sequence_feature($sequence, $table));
		}

		# the peptide is printed as given in the input (including any 'X')
		print "$YID\t$peptide\t", join(",", @peptide_features), "\n";
	} # foreach my $peptide (@peptides)
} # while ( my $line = <$list_fh> )
close($list_fh);

# ======================================================================================
# ======================================================================================
### SUBROUTINES ### 
# ======================================================================================
# ======================================================================================
# Sum and average of a per-residue property over a sequence.
#
# Arguments:
#	$seq            amino acid sequence (one-letter codes)
#	$hr_AA_FEATURE  reference to a hash: one-letter code => property value
# Returns:
#	(sum, average), each passed through d4(). Residues without an entry in
#	the table contribute nothing to either value, but they still count
#	towards the sequence length used as divisor for the average.
#	Returns (0, 0) for an empty or undefined sequence.
sub sequence_feature {
	my ($seq, $hr_AA_FEATURE) = @_;
	$seq = "" unless defined $seq;
	my @residues = split("", $seq);
	my $length = scalar(@residues);

	# Start from 0 so that sequences without any known residue do not hand
	# undefined values to d4() (which would raise warnings under -w).
	my ($sum, $avg) = (0, 0);
	foreach my $aa (@residues) {
		my $value = $hr_AA_FEATURE->{$aa};
		next unless defined $value;
		$sum += $value;
		$avg += $value/$length;	# $length >= 1 whenever this loop runs
	} # foreach my $aa (@residues)
	return(d4($sum), d4($avg));
} # sub sequence_feature
# ======================================================================================
# ======================================================================================
# Length, molecular mass and amino acid composition of a sequence.
#
# Argument:
#	$sequence  amino acid sequence (one-letter codes)
# Returns a four-element list:
#	length     number of characters in the sequence
#	mass       sum of residue masses (amino acid mass minus one water,
#	           18.02, per residue) plus one water for the whole chain;
#	           0 for an empty sequence
#	\@abs_freq counts of the 20 amino acids, in the order ACDEFGHIKLMNPQRSTVWY
#	\@rel_freq the same counts divided by the sequence length (all 0 for an
#	           empty sequence)
# Characters outside the 20-letter alphabet count towards the length but add
# nothing to the mass or to the frequency lists.
sub aa_frequencies  {

	my ($sequence) = @_;
	$sequence = "" unless defined $sequence;
	my @alphabet = split("", "ACDEFGHIKLMNPQRSTVWY");

	# molecular masses
	# AAindex entry:
	#   H FASG760101
	#   D Molecular weight (Fasman, 1976)
	#   A Fasman, G.D., ed.
	#   J "Handbook of Biochemistry and Molecular Biology", 3rd ed.,
	#     Proteins - Volume 1, CRC Press, Cleveland (1976)
	#   I    A/L     R/K     N/M     D/F     C/P     Q/S     E/T     G/W     H/Y     I/V
	#      89.09  174.20  132.12  133.10  121.15  146.15  147.13   75.07  155.16  131.17
	#     131.17  146.19  149.21  165.19  115.13  105.09  119.12  204.24  181.19  117.15
	my %AAHH = (
		A =>  89.09, R => 174.20, N => 132.12, D => 133.10, C => 121.15,
		Q => 146.15, E => 147.13, G =>  75.07, H => 155.16, I => 131.17,
		L => 131.17, K => 146.19, M => 149.21, F => 165.19, P => 115.13,
		S => 105.09, T => 119.12, W => 204.24, Y => 181.19, V => 117.15,
	);

	my $length = length($sequence);

	# Count every character in a single pass over the sequence.
	my %count_of;
	$count_of{$_}++ foreach split("", $sequence);

	my $mass = 0;
	my (@abs_freq, @rel_freq);
	foreach my $aa ( @alphabet ) {
		my $number = $count_of{$aa} || 0;
		push(@abs_freq, $number);
		# guard against division by zero for an empty sequence
		push(@rel_freq, $length ? $number/$length : 0);
		$mass += ( $number * ($AAHH{$aa}-18.02) );
	}  # foreach my $aa ( @alphabet )

	# one water for the two chain termini; an empty sequence has no mass
	$mass += 18.02 if $length;
	return($length, $mass, \@abs_freq, \@rel_freq);
} # sub aa_frequencies
# ======================================================================================
# ======================================================================================
=pod
# AA index ftp://ftp.genome.ad.jp/pub/db/genomenet/aaindex/aaindex1
//
H VINM940101
D Normalized flexibility parameters (B-values), average (Vihinen et al., 1994)
R LIT:2014123 PMID:8090708
A Vihinen, M., Torkkila, E. and Riikonen, P.
T Accuracy of protein flexibility predictions
J Proteins 19, 141-149 (1994)
C MIYS990104    0.965  MIYS990105    0.952  MIYS990103    0.951
  VINM940102    0.940  OOBM770103    0.936  GUYH850102    0.924
  VINM940103    0.921  FUKS010104    0.919  PARS000101    0.919
  FASG890101    0.904  MEIH800101    0.900  KRIW790101    0.890
  KARP850102    0.885  MIYS990102    0.883  MIYS990101    0.880
  PARS000102    0.877  FUKS010102    0.876  MEIH800102    0.872
  GRAR740102    0.869  CORJ870108    0.862  GUYH850103    0.860
  HOPT810101    0.859  KRIW790102    0.853  PUNT030102    0.850
  RACS770102    0.844  PARJ860101    0.837  RACS770101    0.835
  WOEC730101    0.834  RACS770103    0.830  GUYH850101    0.829
  FUKS010103    0.827  KARP850101    0.821  VINM940104    0.815
  LEVM760101    0.815  MUNV940103    0.811  PUNT030101    0.805
  PALJ810104   -0.801  WIMW960101   -0.804  DESM900101   -0.806
  NADH010105   -0.808  CORJ870102   -0.817  SWER830101   -0.817
  ROSM880105   -0.818  GEIM800107   -0.819  QIAN880120   -0.823
  CORJ870104   -0.826  QIAN880121   -0.828  LIFS790103   -0.829
  DESM900102   -0.829  CHOP780202   -0.831  ZHOH040101   -0.833
  LIFS790101   -0.834  MANP780101   -0.836  CIDH920103   -0.837
  CORJ870103   -0.848  CIDH920101   -0.854  ROBB790101   -0.858
  NADH010102   -0.859  MEIH800103   -0.861  BASU050101   -0.867
  FAUJ830101   -0.871  CIDH920102   -0.872  CORJ870105   -0.873
  CORJ870107   -0.877  PONP800103   -0.878  PONP800101   -0.878
  CORJ870106   -0.881  CIDH920104   -0.883  MIYS850101   -0.883
  PONP800102   -0.883  CIDH920105   -0.885  NADH010103   -0.889
  PONP800108   -0.891  NADH010104   -0.891  BAEK050101   -0.896
  BASU050103   -0.902  BASU050102   -0.904  RADA880108   -0.906
  PONP930101   -0.913  NISK800101   -0.922  ZHOH040103   -0.922
  CORJ870101   -0.924  BIOV880102   -0.929  WERD780101   -0.931
  BIOV880101   -0.941  ROSG850102   -0.943  CASG920101   -0.947
  NISK860101   -0.959
I    A/L     R/K     N/M     D/F     C/P     Q/S     E/T     G/W     H/Y     I/V
   0.984   1.008   1.048   1.068   0.906   1.037   1.094   1.031   0.950   0.927
   0.935   1.102   0.952   0.915   1.049   1.046   0.997   0.904   0.929   0.931
//
=cut
# AAindex VINM940101: normalized flexibility parameters (B-values), average
# (Vihinen et al., 1994).
# Returns a reference to a new hash: one-letter amino acid code => value.
sub aa_VINM940101_hash  {
	my %value_of = (
		A => 0.984, R => 1.008, N => 1.048, D => 1.068, C => 0.906,
		Q => 1.037, E => 1.094, G => 1.031, H => 0.950, I => 0.927,
		L => 0.935, K => 1.102, M => 0.952, F => 0.915, P => 1.049,
		S => 1.046, T => 0.997, W => 0.904, Y => 0.929, V => 0.931,
	);
	return(\%value_of);
} # sub aa_VINM940101_hash
# ======================================================================================# ======================================================================================
=pod
# AA index ftp://ftp.genome.ad.jp/pub/db/genomenet/aaindex/aaindex1
//
H FAUJ880103
D Normalized van der Waals volume (Fauchere et al., 1988)
R LIT:1414114 PMID:3209351
A Fauchere, J.L., Charton, M., Kier, L.B., Verloop, A. and Pliska, V.
T Amino acid side chain parameters for correlation studies in biology and 
  pharmacology
J Int. J. Peptide Protein Res. 32, 269-278 (1988) (Pro !) Original reference of 
  these two data: Fauchere, L.J. In "QSAR in Design of Bioactive Compounds", 
  (Kuchar, M., ed.), Prous, Barcelona pp.135-144 (1984)
C CHAM820101    0.992  CHOC750101    0.990  CHOC760101    0.985
  TSAJ990102    0.985  TSAJ990101    0.983  FASG760101    0.979
  BIGC670101    0.972  GOLD730102    0.972  KRIW790103    0.965
  PONJ960101    0.963  GRAR740103    0.959  HARY940101    0.951
  LEVM760102    0.947  LEVM760105    0.945  CHAM830106    0.927
  FAUJ880106    0.908  ROSG850101    0.892  DAWD720101    0.880
  LEVM760107    0.875  RADA880106    0.869  MCMT640101    0.847
  ZHOH040102    0.816  WOLS870102    0.814  CHAM830105    0.813
  HUTJ700102    0.807  FAUJ880104    0.804  OOBM770102    0.801
  RADA880103   -0.923
I    A/L     R/K     N/M     D/F     C/P     Q/S     E/T     G/W     H/Y     I/V
    1.00    6.13    2.95    2.78    2.43    3.95    3.78    0.00    4.66    4.00
    4.00    4.77    4.43    5.89    2.72    1.60    2.60    8.08    6.47    3.00
//
=cut
# AAindex FAUJ880103: normalized van der Waals volume (Fauchere et al., 1988).
# Returns a reference to a new hash: one-letter amino acid code => value.
sub aa_FAUJ880103_hash  {
	my %value_of = (
		A => 1.00, R => 6.13, N => 2.95, D => 2.78, C => 2.43,
		Q => 3.95, E => 3.78, G => 0.00, H => 4.66, I => 4.00,
		L => 4.00, K => 4.77, M => 4.43, F => 5.89, P => 2.72,
		S => 1.60, T => 2.60, W => 8.08, Y => 6.47, V => 3.00,
	);
	return(\%value_of);
} # sub aa_FAUJ880103_hash
# ======================================================================================
# ======================================================================================
=pod
# AA index ftp://ftp.genome.ad.jp/pub/db/genomenet/aaindex/aaindex1
//
H GUYH850105
D Apparent partition energies calculated from Chothia index (Guy, 1985)
R PMID:3978191
A Guy, H.R.
T Amino acid side-chain partition energies and distribution of residues in 
  soluble proteins
J Biophys. J. 47, 61-70 (1985)
C CHOC760102    0.946  FAUJ880109    0.927  JANJ780101    0.923
  GUYH850104    0.908  JANJ780103    0.885  OOBM770101    0.874
  ROSM880102    0.874  PRAM900101    0.867  ENGD860101    0.867
  PUNT030101    0.858  KUHL950101    0.850  ROSM880101    0.849
  VHEG790101    0.845  GUYH850101    0.843  MEIH800102    0.811
  RADA880105   -0.809  EISD860103   -0.812  NADH010103   -0.815
  DESM900102   -0.818  CHOC760104   -0.822  YUTK870101   -0.841
  NADH010101   -0.847  NADH010102   -0.867  KYTJ820101   -0.883
  JACR890101   -0.887  JANJ780102   -0.898  RADA880104   -0.899
  RADA880101   -0.899  JURD980101   -0.900  WOLR790101   -0.908
  JANJ790102   -0.913  WOLR810101   -0.916  OLSK800101   -0.927
  CHOC760103   -0.933  EISD840101   -0.951  RADA880107   -0.953
I    A/L     R/K     N/M     D/F     C/P     Q/S     E/T     G/W     H/Y     I/V
   -0.27    2.00    0.61    0.50   -0.23    1.00    0.33   -0.22    0.37   -0.80
   -0.44    1.17   -0.31   -0.55    0.36    0.17    0.18    0.05    0.48   -0.65
//
=cut
# AAindex GUYH850105: apparent partition energies calculated from Chothia
# index (Guy, 1985).
# Returns a reference to a new hash: one-letter amino acid code => value.
sub aa_GUYH850105_hash  {
	my %value_of = (
		A => -0.27, R =>  2.00, N =>  0.61, D =>  0.50, C => -0.23,
		Q =>  1.00, E =>  0.33, G => -0.22, H =>  0.37, I => -0.80,
		L => -0.44, K =>  1.17, M => -0.31, F => -0.55, P =>  0.36,
		S =>  0.17, T =>  0.18, W =>  0.05, Y =>  0.48, V => -0.65,
	);
	return(\%value_of);
} # sub aa_GUYH850105_hash
# ======================================================================================
# ======================================================================================
=pod
# AA index ftp://ftp.genome.ad.jp/pub/db/genomenet/aaindex/aaindex1
//
//
H NOZY710101
D Transfer energy, organic solvent/water (Nozaki-Tanford, 1971)
R PMID:5555568
A Nozaki, Y. and Tanford, C.
T The solubility of amino acids and two glycine peptides in aqueous ethanol and 
  dioxane solutions
J J. Biol. Chem. 246, 2211-2217 (1971) Missing values filled with zeros
C ZHOH040101    0.932  RADA880102    0.917  MEEJ810102    0.899
  VENT840101    0.897  ZHOH040102    0.897  MEEJ800102    0.895
  CIDH920102    0.889  TAKK010101    0.884  GUOD860101    0.884
  MEEJ810101    0.882  CIDH920105    0.857  ROSM880104    0.847
  BASU050102    0.847  LEVM760107    0.845  ZHOH040103    0.842
  PLIV810101    0.839  CORJ870102    0.838  ZIMJ680105    0.837
  SWER830101    0.836  ROSG850101    0.834  BROC820101    0.829
  EISD860101    0.822  GARJ730101    0.821  WIMW960101    0.818
  MIYS850101    0.810  SIMZ760101    0.807  FAUJ830101    0.803
  ARGP820101    0.800  MIYS990102   -0.819  MIYS990101   -0.821
  OOBM770103   -0.828  PARS000101   -0.829  WOLS870101   -0.874
  WEBA780101   -0.890  BULH740101   -0.892  PARJ860101   -0.900
I    A/L     R/K     N/M     D/F     C/P     Q/S     E/T     G/W     H/Y     I/V
     0.5     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.5     1.8
     1.8     0.0     1.3     2.5     0.0     0.0     0.4     3.4     2.3     1.5
//
=cut
# AAindex NOZY710101: transfer energy, organic solvent/water
# (Nozaki-Tanford, 1971); missing values are filled with zeros in AAindex.
# Returns a reference to a new hash: one-letter amino acid code => value.
sub aa_NOZY710101_hash  {
	my %value_of = (
		A => 0.5, R => 0.0, N => 0.0, D => 0.0, C => 0.0,
		Q => 0.0, E => 0.0, G => 0.0, H => 0.5, I => 1.8,
		L => 1.8, K => 0.0, M => 1.3, F => 2.5, P => 0.0,
		S => 0.0, T => 0.4, W => 3.4, Y => 2.3, V => 1.5,
	);
	return(\%value_of);
} # sub aa_NOZY710101_hash
# ======================================================================================
# ======================================================================================
=pod
# AA index ftp://ftp.genome.ad.jp/pub/db/genomenet/aaindex/aaindex1
//
H WERD780101
D Propensity to be buried inside (Wertz-Scheraga, 1978)
R LIT:0405105 PMID:621952
A Wertz, D.H. and Scheraga, H.A.
T Influence of water on protein structure. An analysis of the preferences of 
  amino acid residues for the inside or outside and for specific conformations 
  in a protein molecule
J Macromolecules 11, 9-15 (1978) Adjusted values
C NISK860101    0.966  BIOV880101    0.951  ROSG850102    0.943
  MIYS850101    0.934  RADA880108    0.930  BIOV880102    0.929
  CASG920101    0.927  ZHOH040103    0.923  BASU050102    0.920
  CIDH920105    0.905  CIDH920104    0.896  PONP930101    0.895
  MEIH800103    0.895  BAEK050101    0.895  NISK800101    0.891
  NADH010104    0.890  CORJ870107    0.887  PONP800102    0.883
  CORJ870103    0.882  CIDH920103    0.881  NADH010103    0.880
  PONP800101    0.880  CIDH920101    0.878  CORJ870106    0.878
  PONP800103    0.876  CORJ870101    0.873  ROBB790101    0.872
  CIDH920102    0.871  FAUJ830101    0.862  ZHOH040101    0.859
  CORJ870105    0.858  BASU050103    0.857  MANP780101    0.853
  CORJ870104    0.850  PONP800108    0.843  BASU050101    0.843
  PLIV810101    0.841  NADH010102    0.841  NADH010105    0.837
  MEEJ810101    0.825  DESM900102    0.814  CORJ870102    0.804
  SWER830101    0.804  FUKS010102   -0.801  BHAR880101   -0.803
  KRIW710101   -0.819  PUNT030101   -0.821  GRAR740102   -0.826
  FUKS010104   -0.832  KARP850101   -0.842  RACS770103   -0.846
  PARS000101   -0.853  PARJ860101   -0.869  FUKS010103   -0.869
  GUYH850101   -0.871  KRIW790102   -0.875  GUYH850103   -0.876
  CORJ870108   -0.878  VINM940102   -0.886  KRIW790101   -0.899
  MEIH800102   -0.903  RACS770102   -0.906  OOBM770103   -0.906
  KARP850102   -0.909  MIYS990101   -0.912  RACS770101   -0.912
  MIYS990102   -0.914  FASG890101   -0.926  VINM940103   -0.926
  VINM940101   -0.931  MIYS990105   -0.936  MIYS990103   -0.938
  MEIH800101   -0.943  MIYS990104   -0.949  GUYH850102   -0.976
I    A/L     R/K     N/M     D/F     C/P     Q/S     E/T     G/W     H/Y     I/V
    0.52    0.49    0.42    0.37    0.83    0.35    0.38    0.41    0.70    0.79
    0.77    0.31    0.76    0.87    0.35    0.49    0.38    0.86    0.64    0.72
//
=cut
# AAindex WERD780101: propensity to be buried inside (Wertz-Scheraga, 1978).
# Returns a reference to a new hash: one-letter amino acid code => value.
sub aa_WERD780101_hash  {
	my %value_of = (
		A => 0.52, R => 0.49, N => 0.42, D => 0.37, C => 0.83,
		Q => 0.35, E => 0.38, G => 0.41, H => 0.70, I => 0.79,
		L => 0.77, K => 0.31, M => 0.76, F => 0.87, P => 0.35,
		S => 0.49, T => 0.38, W => 0.86, Y => 0.64, V => 0.72,
	);
	return(\%value_of);
} # sub aa_WERD780101_hash
# ======================================================================================
# ======================================================================================
=pod
# AA index ftp://ftp.genome.ad.jp/pub/db/genomenet/aaindex/aaindex1
//
H ZIMJ680104
D Isoelectric point (Zimmerman et al., 1968)
R LIT:2004109b PMID:5700434
A Zimmerman, J.M., Eliezer, N. and Simha, R.
T The characterization of amino acid sequences in proteins by statistical 
  methods
J J. Theor. Biol. 21, 170-201 (1968)
C KLEP840101    0.941  FAUJ880111    0.813  FINA910103    0.805
I    A/L     R/K     N/M     D/F     C/P     Q/S     E/T     G/W     H/Y     I/V
    6.00   10.76    5.41    2.77    5.05    5.65    3.22    5.97    7.59    6.02
    5.98    9.74    5.74    5.48    6.30    5.68    5.66    5.89    5.66    5.96
//
=cut
# AAindex ZIMJ680104: isoelectric point (Zimmerman et al., 1968).
# Returns a reference to a new hash: one-letter amino acid code => value.
sub aa_ZIMJ680104_hash  {
	my %value_of = (
		A => 6.00, R => 10.76, N => 5.41, D => 2.77, C => 5.05,
		Q => 5.65, E =>  3.22, G => 5.97, H => 7.59, I => 6.02,
		L => 5.98, K =>  9.74, M => 5.74, F => 5.48, P => 6.30,
		S => 5.68, T =>  5.66, W => 5.89, Y => 5.66, V => 5.96,
	);
	return(\%value_of);
} # sub aa_ZIMJ680104_hash
# ======================================================================================
# ======================================================================================
=pod
# AA index ftp://ftp.genome.ad.jp/pub/db/genomenet/aaindex/aaindex1
//
H CHOP780201
D Normalized frequency of alpha-helix (Chou-Fasman, 1978b)
R PMID:364941
A Chou, P.Y. and Fasman, G.D.
T Prediction of the secondary structure of proteins from their amino acid 
  sequence
J Adv. Enzymol. 47, 45-148 (1978)
C PALJ810102    0.981  ROBB760101    0.969  ISOY800101    0.959
  MAXF760101    0.956  KANM800101    0.956  TANS770101    0.947
  BURA740101    0.917  GEIM800101    0.912  KANM800103    0.912
  NAGK730101    0.886  LEVM780104    0.886  PALJ810101    0.881
  QIAN880106    0.874  PRAM900102    0.873  LEVM780101    0.873
  GEIM800104    0.868  RACS820108    0.868  AURR980108    0.867
  AURR980109    0.859  AURR980112    0.856  CRAJ730101    0.851
  QIAN880107    0.843  BEGF750101    0.841  QIAN880105    0.835
  AURR980114    0.828  AURR980115    0.816  AURR980110    0.814
  PALJ810109    0.814  AURR980111    0.813  ROBB760103    0.806
  MUNV940101   -0.802  CRAJ730103   -0.808  ROBB760113   -0.811
  MUNV940102   -0.812  CHAM830101   -0.828  NAGK730103   -0.837
I    A/L     R/K     N/M     D/F     C/P     Q/S     E/T     G/W     H/Y     I/V
    1.42    0.98    0.67    1.01    0.70    1.11    1.51    0.57    1.00    1.08
    1.21    1.16    1.45    1.13    0.57    0.77    0.83    1.08    0.69    1.06
//
=cut
# AAindex CHOP780201: normalized frequency of alpha-helix
# (Chou-Fasman, 1978b).
# Returns a reference to a new hash: one-letter amino acid code => value.
sub aa_CHOP780201_hash  {
	my %value_of = (
		A => 1.42, R => 0.98, N => 0.67, D => 1.01, C => 0.70,
		Q => 1.11, E => 1.51, G => 0.57, H => 1.00, I => 1.08,
		L => 1.21, K => 1.16, M => 1.45, F => 1.13, P => 0.57,
		S => 0.77, T => 0.83, W => 1.08, Y => 0.69, V => 1.06,
	);
	return(\%value_of);
} # sub aa_CHOP780201_hash
# ======================================================================================
# ======================================================================================
=pod
# AA index ftp://ftp.genome.ad.jp/pub/db/genomenet/aaindex/aaindex1
//
H CHOP780202
D Normalized frequency of beta-sheet (Chou-Fasman, 1978b)
R PMID:364941
A Chou, P.Y. and Fasman, G.D.
T Prediction of the secondary structure of proteins from their amino acid 
  sequence
J Adv. Enzymol. 47, 45-148 (1978)
C PALJ810104    0.970  LIFS790101    0.947  KANM800102    0.945
  PALJ810103    0.937  ROBB760106    0.931  LEVM780105    0.930
  GEIM800107    0.929  QIAN880120    0.915  PTIO830102    0.913
  QIAN880121    0.911  LIFS790103    0.908  GEIM800105    0.890
  ROBB760105    0.885  BASU050101    0.883  BASU050103    0.874
  PONP930101    0.867  NAGK730102    0.858  QIAN880119    0.855
  CHOP780208    0.851  BASU050102    0.841  KANM800104    0.839
  GEIM800106    0.839  LEVM780102    0.833  PRAM900103    0.833
  NISK860101    0.832  SWER830101    0.823  CORJ870102    0.822
  CHOP780209    0.822  CORJ870101    0.815  PALJ810112    0.815
  PONP800108    0.809  PALJ810110    0.808  MANP780101    0.805
  VENT840101    0.805  MIYS990102   -0.801  PUNT030102   -0.803
  VINM940102   -0.810  OOBM770103   -0.820  GEIM800110   -0.824
  MIYS990103   -0.825  MIYS990104   -0.829  VINM940101   -0.831
  MUNV940103   -0.892
I    A/L     R/K     N/M     D/F     C/P     Q/S     E/T     G/W     H/Y     I/V
    0.83    0.93    0.89    0.54    1.19    1.10    0.37    0.75    0.87    1.60
    1.30    0.74    1.05    1.38    0.55    0.75    1.19    1.37    1.47    1.70
//
=cut
# AAindex CHOP780202: normalized frequency of beta-sheet
# (Chou-Fasman, 1978b).
# Returns a reference to a new hash: one-letter amino acid code => value.
sub aa_CHOP780202_hash  {
	my %value_of = (
		A => 0.83, R => 0.93, N => 0.89, D => 0.54, C => 1.19,
		Q => 1.10, E => 0.37, G => 0.75, H => 0.87, I => 1.60,
		L => 1.30, K => 0.74, M => 1.05, F => 1.38, P => 0.55,
		S => 0.75, T => 1.19, W => 1.37, Y => 1.47, V => 1.70,
	);
	return(\%value_of);
} # sub aa_CHOP780202_hash
# ======================================================================================
# ======================================================================================
=pod
# AA index ftp://ftp.genome.ad.jp/pub/db/genomenet/aaindex/aaindex1
//
H CHOP780203
D Normalized frequency of beta-turn (Chou-Fasman, 1978b)
R PMID:364941
A Chou, P.Y. and Fasman, G.D.
T Prediction of the secondary structure of proteins from their amino acid 
  sequence
J Adv. Enzymol. 47, 45-148 (1978)
C CHOP780216    0.979  CHOP780101    0.940  TANS770110    0.940
  LEVM780106    0.935  GEIM800111    0.933  ISOY800103    0.933
  CHAM830101    0.931  PRAM900104    0.928  QIAN880132    0.928
  LEVM780103    0.927  GEIM800108    0.925  CHOP780210    0.918
  QIAN880133    0.915  PALJ810106    0.907  PALJ810105    0.878
  QIAN880131    0.861  QIAN880134    0.838  RACS770101    0.827
  QIAN880135    0.811  CORJ870106   -0.813  QIAN880119   -0.814
  CORJ870105   -0.815  PONP800107   -0.818  SUEM840101   -0.892
I    A/L     R/K     N/M     D/F     C/P     Q/S     E/T     G/W     H/Y     I/V
    0.74    1.01    1.46    1.52    0.96    0.96    0.95    1.56    0.95    0.47
    0.50    1.19    0.60    0.66    1.56    1.43    0.98    0.60    1.14    0.59
//
=cut
# AAindex CHOP780203: normalized frequency of beta-turn
# (Chou-Fasman, 1978b).
# Returns a reference to a new hash: one-letter amino acid code => value.
sub aa_CHOP780203_hash  {
	my %value_of = (
		A => 0.74, R => 1.01, N => 1.46, D => 1.52, C => 0.96,
		Q => 0.96, E => 0.95, G => 1.56, H => 0.95, I => 0.47,
		L => 0.50, K => 1.19, M => 0.60, F => 0.66, P => 1.56,
		S => 1.43, T => 0.98, W => 0.60, Y => 1.14, V => 0.59,
	);
	return(\%value_of);
} # sub aa_CHOP780203_hash

# ======================================================================================
=pod
# AA index ftp://ftp.genome.ad.jp/pub/db/genomenet/aaindex/aaindex1
H KLEP840101
D Net charge (Klein et al., 1984)
R LIT:1008055 PMID:6547351
A Klein, P., Kanehisa, M. and DeLisi, C.
T Prediction of protein function from sequence properties: Discriminant 
  analysis of a data base
J Biochim. Biophys. Acta 787, 221-226 (1984)
C ZIMJ680104    0.941
I    A/L     R/K     N/M     D/F     C/P     Q/S     E/T     G/W     H/Y     I/V
      0.      1.      0.     -1.      0.      0.     -1.      0.      0.      0.
      0.      1.      0.      0.      0.      0.      0.      0.      0.      0.
=cut
# AAindex KLEP840101: net charge (Klein et al., 1984).
# Returns a reference to a new hash: one-letter amino acid code => value
# (-1 for D and E, 1 for K and R, 0 for all other amino acids).
sub aa_KLEP840101_hash  {
	my %value_of = (
		A => 0, R => 1, N => 0, D => -1, C => 0,
		Q => 0, E => -1, G => 0, H => 0, I => 0,
		L => 0, K => 1, M => 0, F => 0, P => 0,
		S => 0, T => 0, W => 0, Y => 0, V => 0,
	);
	return(\%value_of);
} # sub aa_KLEP840101_hash
# ======================================================================================

# ======================================================================================
=pod
# AA index ftp://ftp.genome.ad.jp/pub/db/genomenet/aaindex/aaindex1
H EISD860102
D Atom-based hydrophobic moment (Eisenberg-McLachlan, 1986)
R LIT:2004121b PMID: 3945310
A Eisenberg, D. and McLachlan, A.D.
T Solvation energy in protein folding and binding
J Nature 319, 199-203 (1986)
C FAUJ880109    0.841  HUTJ700103    0.841  RADA880107   -0.837
  YUTK870103   -0.839  YUTK870104   -0.840  JACR890101   -0.871
I    A/L     R/K     N/M     D/F     C/P     Q/S     E/T     G/W     H/Y     I/V
      0.     10.     1.3     1.9    0.17     1.9      3.      0.    0.99     1.2
     1.0     5.7     1.9     1.1    0.18    0.73     1.5     1.6     1.8    0.48
=cut
# AAindex EISD860102: atom-based hydrophobic moment
# (Eisenberg-McLachlan, 1986).
# Returns a reference to a new hash: one-letter amino acid code => value.
sub aa_EISD860102_hash  {
	my %value_of = (
		A => 0, R => 10,  N => 1.3, D => 1.9, C => 0.17,
		Q => 1.9, E => 3, G => 0, H => 0.99, I => 1.2,
		L => 1, K => 5.7, M => 1.9, F => 1.1, P => 0.18,
		S => 0.73, T => 1.5, W => 1.6, Y => 1.8, V => 0.48,
	);
	return(\%value_of);
} # sub aa_EISD860102_hash
# ======================================================================================
# ======================================================================================
=pod
# AA index ftp://ftp.genome.ad.jp/pub/db/genomenet/aaindex/aaindex1
H FAUJ880111
D Positive charge (Fauchere et al., 1988)
R LIT:1414114 PMID:3209351
A Fauchere, J.L., Charton, M., Kier, L.B., Verloop, A. and Pliska, V.
T Amino acid side chain parameters for correlation studies in biology and 
  pharmacology
J Int. J. Peptide Protein Res. 32, 269-278 (1988)
C ZIMJ680104    0.813
I    A/L     R/K     N/M     D/F     C/P     Q/S     E/T     G/W     H/Y     I/V
      0.      1.      0.      0.      0.      0.      0.      0.      1.      0.
      0.      1.      0.      0.      0.      0.      0.      0.      0.      0.
=cut
# AAindex FAUJ880111: positive charge (Fauchere et al., 1988).
# Returns a reference to a new hash: one-letter amino acid code => value
# (1 for K, R and H, 0 for all other amino acids).
sub aa_FAUJ880111_hash  {
	my %value_of = (
		A => 0, R => 1, N => 0, D => 0, C => 0,
		Q => 0, E => 0, G => 0, H => 1, I => 0,
		L => 0, K => 1, M => 0, F => 0, P => 0,
		S => 0, T => 0, W => 0, Y => 0, V => 0,
	);
	return(\%value_of);
} # sub aa_FAUJ880111_hash
# ======================================================================================
# ======================================================================================
# Round a number to four decimal places.
#
# Argument:
#	$value  number to round (an undefined value is treated as 0)
# Returns the rounded number; trailing zeros are not kept (0.57, not 0.5700).
#
# sprintf is used instead of int($value*10000)/10000: int() truncates, and
# because most decimal fractions have no exact binary representation a
# value such as 0.57 would come out as 0.5699.
sub d4 {
	my ($value) = @_;
	return(0) unless defined $value;
	my $new_value = sprintf("%.4f", $value) + 0;
	return(0) if $new_value == 0;	# never return a negative zero
	return($new_value);
} # sub d4
# ======================================================================================
# ======================================================================================



