#!/usr/bin/perl -w
# C.Vogel, UT Austin, TX; April 2008
#
# np_peptide_properties
#
# input:
#   list of proteins and their tryptic peptides
#   protein_id *tab* peptides [separated by *space*]
# output:
#   list of peptides and their properties
#   pseudo-arff format (.arf)
#   protein_id *tab* peptide *tab* properties [separated by *comma*]
#
# Notes:
#   - can possibly leave out the absolute residue counts and all 'sums' of
#     features as they are likely to be correlated with sequence length
#   - can include further features from the AAindex file
#   - the per-residue scales below are taken from AAindex1
#     (ftp://ftp.genome.ad.jp/pub/db/genomenet/aaindex/aaindex1); values are
#     listed in the order of the AAindex "I" record:
#         A R N D C Q E G H I / L K M F P S T W Y V
#     The full AAindex records (including the lists of correlated indices)
#     are available from that source.

# ======================================================================================
# Warm up:

use strict;
use warnings;

$| = 1;

# ======================================================================================
# Features list:

# The 20 standard amino acids, in the column order used for the absolute and
# relative frequency features.
my @ALPHABET = split //, 'ACDEFGHIKLMNPQRSTVWY';

# Per-residue scales, in output column order.  Each entry contributes two
# columns ("<accession> sum", "<accession> avg").  The header and the
# per-peptide values are both generated from this one table so they cannot
# drift apart.
my @AAINDEX_SCALES = (
    # secondary structure attributes:
    [ 'CHOP780201', \&aa_CHOP780201_hash ],    # Normalized frequency of alpha-helix (Chou-Fasman, 1978b)
    [ 'CHOP780202', \&aa_CHOP780202_hash ],    # Normalized frequency of beta-sheet (Chou-Fasman, 1978b)
    [ 'CHOP780203', \&aa_CHOP780203_hash ],    # Normalized frequency of beta-turn (Chou-Fasman, 1978b)
    # attributes identified by Mallick et al. [Nature Biotech, 2007] for MUDPIT-ESI:
    [ 'WERD780101', \&aa_WERD780101_hash ],    # Propensity to be buried inside (Wertz-Scheraga, 1978)
    [ 'ZIMJ680104', \&aa_ZIMJ680104_hash ],    # Isoelectric point (Zimmerman et al., 1968)
    [ 'KLEP840101', \&aa_KLEP840101_hash ],    # Net charge (Klein et al., 1984)
    [ 'EISD860102', \&aa_EISD860102_hash ],    # Atom-based hydrophobic moment (Eisenberg-McLachlan, 1986)
    [ 'FAUJ880111', \&aa_FAUJ880111_hash ],    # Positive charge (Fauchere et al., 1988)
    # additional attributes:
    [ 'VINM940101', \&aa_VINM940101_hash ],    # Normalized flexibility parameters (B-values), average (Vihinen et al., 1994)
    [ 'FAUJ880103', \&aa_FAUJ880103_hash ],    # Normalized van der Waals volume (Fauchere et al., 1988)
    [ 'GUYH850105', \&aa_GUYH850105_hash ],    # Apparent partition energies calculated from Chothia index (Guy, 1985)
    [ 'NOZY710101', \&aa_NOZY710101_hash ],    # Transfer energy, organic solvent/water (Nozaki-Tanford, 1971)
);

# Run as a script; stay silent when the file is loaded with require/do so the
# subroutines can be reused.
main(@ARGV) unless caller;

# ======================================================================================
# main(@args)
#   Expects exactly one argument: the path of the protein digest list.
#   Prints the pseudo-arff header followed by one line per peptide to STDOUT.
#   Dies on a wrong argument count, an unreadable file, a line without a
#   protein id, or a line without peptides.
sub main {
    my @args = @_;
    die "\nUsage: np_peptide_properties <protein_digest_file>\n"
        unless @args == 1;
    my ($protein_digest) = @args;

    # Build every scale table once instead of once per peptide.
    my @scales = map { [ $_->[0], $_->[1]->() ] } @AAINDEX_SCALES;

    # Header: feature count, then one numbered @attribute line per feature.
    my @features = (
        "length_aa",
        "molecular_weight",
        @ALPHABET,
        ( map { "${_}_rel" } @ALPHABET ),
        ( map { ( "$_->[0] sum", "$_->[0] avg" ) } @scales ),
    );
    print "\#YID\tSEQUENCE\tFEATURES TOTAL\t", scalar(@features), " \n";
    for my $k ( 0 .. $#features ) {
        my $attribute = ( $k + 1 ) . " $features[$k]";
        $attribute =~ s/ /_/g;
        print "\@attribute $attribute numeric\n";
    }

    # Go through each line of the input file, print properties of each peptide.
    open( my $list_fh, '<', $protein_digest )
        or die "Cant open $protein_digest: $!\n";

    while ( my $line = <$list_fh> ) {
        chomp $line;
        next if length($line) == 0;

        # see the file header for the format of the input list
        my ( $YID, $peptides ) = split( "\t", $line );
        die "No ID for gene in line $line\n"
            unless defined $YID && length($YID) > 0;

        my @peptides = defined $peptides ? split( " ", $peptides ) : ();
        die "No peptides for $YID in line $line\n" unless @peptides >= 1;

        for my $peptide (@peptides) {

            # 'X' residues are removed before any property is computed; the
            # peptide is still printed in its original form.
            my $sequence = $peptide;
            $sequence =~ s/X//g;
            next if length($sequence) == 0;

            # length, mass, absolute and relative frequencies of the sequence
            my ( $length, $mass, $ar_absfreq, $ar_relfreq ) =
                aa_frequencies($sequence);
            next unless $length >= 3;    # minimum 3 amino acids per peptide

            my @peptide_features =
                ( $length, $mass, @$ar_absfreq, @$ar_relfreq );
            for my $scale (@scales) {
                push @peptide_features,
                    sequence_feature( $sequence, $scale->[1] );
            }

            print "$YID\t$peptide\t", join( ",", @peptide_features ), "\n";
        }
    }
    close $list_fh;
    return;
}

# ======================================================================================
### SUBROUTINES ###
# ======================================================================================

# sequence_feature($seq, $hr_AA_FEATURE)
#   $seq           - peptide sequence (one character per residue)
#   $hr_AA_FEATURE - hash ref mapping residue letter => numeric value
#   Returns ($sum, $avg), each truncated to 4 decimals by d4().
#   Residues missing from the table contribute nothing to either value, but
#   the average is still taken over the full sequence length.
#   Returns (0, 0) when no residue of $seq is in the table.
sub sequence_feature {
    my ( $seq, $hr_AA_FEATURE ) = @_;
    my @residues = split //, $seq;
    my $length   = scalar @residues;

    my ( $sum, $avg ) = ( 0, 0 );
    for my $aa (@residues) {
        my $value = $hr_AA_FEATURE->{$aa};
        next unless defined $value;
        $sum += $value;
        $avg += $value / $length;
    }
    return ( d4($sum), d4($avg) );
}

# ======================================================================================
# aa_frequencies($sequence)
#   Returns ($length, $mass, \@abs_freq, \@rel_freq):
#     $length   - number of characters in $sequence
#     $mass     - sum of residue masses, each reduced by 18.02 (water lost per
#                 peptide bond), plus 18.02 once for the whole peptide
#     @abs_freq - count of each of the 20 standard residues, in
#                 ACDEFGHIKLMNPQRSTVWY order
#     @rel_freq - the same counts divided by $length (0 for an empty sequence)
#   Characters outside the 20-letter alphabet count towards $length but not
#   towards the frequencies or the mass.
#
#   Residue masses: AAindex FASG760101, Molecular weight (Fasman, 1976).
#   Fasman, G.D., ed. "Handbook of Biochemistry and Molecular Biology",
#   3rd ed., Proteins - Volume 1, CRC Press, Cleveland (1976)
sub aa_frequencies {
    my ($sequence) = @_;

    my %mass_of = (
        A => 89.09,  R => 174.20, N => 132.12, D => 133.10, C => 121.15,
        Q => 146.15, E => 147.13, G => 75.07,  H => 155.16, I => 131.17,
        L => 131.17, K => 146.19, M => 149.21, F => 165.19, P => 115.13,
        S => 105.09, T => 119.12, W => 204.24, Y => 181.19, V => 117.15,
    );

    my $length = length $sequence;

    # Count without modifying the sequence.
    my %count_of;
    $count_of{$_}++ for split //, $sequence;

    my ( @abs_freq, @rel_freq );
    my $mass = 0;
    for my $aa (@ALPHABET) {
        my $number = $count_of{$aa} || 0;
        push @abs_freq, $number;
        push @rel_freq, $length ? $number / $length : 0;
        $mass += $number * ( $mass_of{$aa} - 18.02 );
    }
    $mass += 18.02;

    return ( $length, $mass, \@abs_freq, \@rel_freq );
}

# ======================================================================================
# AAindex scale tables.  Each sub returns a reference to a fresh hash mapping
# the one-letter residue code to the scale value.
# ======================================================================================

# VINM940101 - Normalized flexibility parameters (B-values), average
# (Vihinen et al., 1994).  LIT:2014123 PMID:8090708
# Vihinen, M., Torkkila, E. and Riikonen, P.  "Accuracy of protein flexibility
# predictions", Proteins 19, 141-149 (1994)
sub aa_VINM940101_hash {
    my %AAHH = (
        A => 0.984, R => 1.008, N => 1.048, D => 1.068, C => 0.906,
        Q => 1.037, E => 1.094, G => 1.031, H => 0.950, I => 0.927,
        L => 0.935, K => 1.102, M => 0.952, F => 0.915, P => 1.049,
        S => 1.046, T => 0.997, W => 0.904, Y => 0.929, V => 0.931,
    );
    return ( \%AAHH );
}

# FAUJ880103 - Normalized van der Waals volume (Fauchere et al., 1988).
# LIT:1414114 PMID:3209351
# Fauchere, J.L., Charton, M., Kier, L.B., Verloop, A. and Pliska, V.  "Amino
# acid side chain parameters for correlation studies in biology and
# pharmacology", Int. J. Peptide Protein Res. 32, 269-278 (1988)
# (Pro !) Original reference of these two data: Fauchere, L.J. In "QSAR in
# Design of Bioactive Compounds", (Kuchar, M., ed.), Prous, Barcelona
# pp.135-144 (1984)
sub aa_FAUJ880103_hash {
    my %AAHH = (
        A => 1.00, R => 6.13, N => 2.95, D => 2.78, C => 2.43,
        Q => 3.95, E => 3.78, G => 0.00, H => 4.66, I => 4.00,
        L => 4.00, K => 4.77, M => 4.43, F => 5.89, P => 2.72,
        S => 1.60, T => 2.60, W => 8.08, Y => 6.47, V => 3.00,
    );
    return ( \%AAHH );
}

# GUYH850105 - Apparent partition energies calculated from Chothia index
# (Guy, 1985).  PMID:3978191
# Guy, H.R.  "Amino acid side-chain partition energies and distribution of
# residues in soluble proteins", Biophys. J. 47, 61-70 (1985)
sub aa_GUYH850105_hash {
    my %AAHH = (
        A => -0.27, R => 2.00, N => 0.61,  D => 0.50,  C => -0.23,
        Q => 1.00,  E => 0.33, G => -0.22, H => 0.37,  I => -0.80,
        L => -0.44, K => 1.17, M => -0.31, F => -0.55, P => 0.36,
        S => 0.17,  T => 0.18, W => 0.05,  Y => 0.48,  V => -0.65,
    );
    return ( \%AAHH );
}

# NOZY710101 - Transfer energy, organic solvent/water (Nozaki-Tanford, 1971).
# PMID:5555568
# Nozaki, Y. and Tanford, C.  "The solubility of amino acids and two glycine
# peptides in aqueous ethanol and dioxane solutions", J. Biol. Chem. 246,
# 2211-2217 (1971).  Missing values filled with zeros.
sub aa_NOZY710101_hash {
    my %AAHH = (
        A => 0.5, R => 0.0, N => 0.0, D => 0.0, C => 0.0,
        Q => 0.0, E => 0.0, G => 0.0, H => 0.5, I => 1.8,
        L => 1.8, K => 0.0, M => 1.3, F => 2.5, P => 0.0,
        S => 0.0, T => 0.4, W => 3.4, Y => 2.3, V => 1.5,
    );
    return ( \%AAHH );
}

# WERD780101 - Propensity to be buried inside (Wertz-Scheraga, 1978).
# LIT:0405105 PMID:621952
# Wertz, D.H. and Scheraga, H.A.  "Influence of water on protein structure.
# An analysis of the preferences of amino acid residues for the inside or
# outside and for specific conformations in a protein molecule",
# Macromolecules 11, 9-15 (1978).  Adjusted values.
sub aa_WERD780101_hash {
    my %AAHH = (
        A => 0.52, R => 0.49, N => 0.42, D => 0.37, C => 0.83,
        Q => 0.35, E => 0.38, G => 0.41, H => 0.70, I => 0.79,
        L => 0.77, K => 0.31, M => 0.76, F => 0.87, P => 0.35,
        S => 0.49, T => 0.38, W => 0.86, Y => 0.64, V => 0.72,
    );
    return ( \%AAHH );
}

# ZIMJ680104 - Isoelectric point (Zimmerman et al., 1968).
# LIT:2004109b PMID:5700434
# Zimmerman, J.M., Eliezer, N. and Simha, R.  "The characterization of amino
# acid sequences in proteins by statistical methods", J. Theor. Biol. 21,
# 170-201 (1968)
sub aa_ZIMJ680104_hash {
    my %AAHH = (
        A => 6.00, R => 10.76, N => 5.41, D => 2.77, C => 5.05,
        Q => 5.65, E => 3.22,  G => 5.97, H => 7.59, I => 6.02,
        L => 5.98, K => 9.74,  M => 5.74, F => 5.48, P => 6.30,
        S => 5.68, T => 5.66,  W => 5.89, Y => 5.66, V => 5.96,
    );
    return ( \%AAHH );
}

# CHOP780201 - Normalized frequency of alpha-helix (Chou-Fasman, 1978b).
# PMID:364941
# Chou, P.Y. and Fasman, G.D.  "Prediction of the secondary structure of
# proteins from their amino acid sequence", Adv. Enzymol. 47, 45-148 (1978)
sub aa_CHOP780201_hash {
    my %AAHH = (
        A => 1.42, R => 0.98, N => 0.67, D => 1.01, C => 0.70,
        Q => 1.11, E => 1.51, G => 0.57, H => 1.00, I => 1.08,
        L => 1.21, K => 1.16, M => 1.45, F => 1.13, P => 0.57,
        S => 0.77, T => 0.83, W => 1.08, Y => 0.69, V => 1.06,
    );
    return ( \%AAHH );
}

# CHOP780202 - Normalized frequency of beta-sheet (Chou-Fasman, 1978b).
# PMID:364941
# Chou, P.Y. and Fasman, G.D.  "Prediction of the secondary structure of
# proteins from their amino acid sequence", Adv. Enzymol. 47, 45-148 (1978)
sub aa_CHOP780202_hash {
    my %AAHH = (
        A => 0.83, R => 0.93, N => 0.89, D => 0.54, C => 1.19,
        Q => 1.10, E => 0.37, G => 0.75, H => 0.87, I => 1.60,
        L => 1.30, K => 0.74, M => 1.05, F => 1.38, P => 0.55,
        S => 0.75, T => 1.19, W => 1.37, Y => 1.47, V => 1.70,
    );
    return ( \%AAHH );
}

# CHOP780203 - Normalized frequency of beta-turn (Chou-Fasman, 1978b).
# PMID:364941
# Chou, P.Y. and Fasman, G.D.  "Prediction of the secondary structure of
# proteins from their amino acid sequence", Adv. Enzymol. 47, 45-148 (1978)
sub aa_CHOP780203_hash {
    my %AAHH = (
        A => 0.74, R => 1.01, N => 1.46, D => 1.52, C => 0.96,
        Q => 0.96, E => 0.95, G => 1.56, H => 0.95, I => 0.47,
        L => 0.50, K => 1.19, M => 0.60, F => 0.66, P => 1.56,
        S => 1.43, T => 0.98, W => 0.60, Y => 1.14, V => 0.59,
    );
    return ( \%AAHH );
}

# KLEP840101 - Net charge (Klein et al., 1984).  LIT:1008055 PMID:6547351
# Klein, P., Kanehisa, M. and DeLisi, C.  "Prediction of protein function from
# sequence properties: Discriminant analysis of a data base", Biochim.
# Biophys. Acta 787, 221-226 (1984)
sub aa_KLEP840101_hash {
    my %AAHH = (
        A => 0, R => 1, N => 0, D => -1, C => 0,
        Q => 0, E => -1, G => 0, H => 0, I => 0,
        L => 0, K => 1, M => 0, F => 0,  P => 0,
        S => 0, T => 0, W => 0, Y => 0,  V => 0,
    );
    return ( \%AAHH );
}

# EISD860102 - Atom-based hydrophobic moment (Eisenberg-McLachlan, 1986).
# LIT:2004121b PMID:3945310
# Eisenberg, D. and McLachlan, A.D.  "Solvation energy in protein folding and
# binding", Nature 319, 199-203 (1986)
sub aa_EISD860102_hash {
    my %AAHH = (
        A => 0,   R => 10,  N => 1.3, D => 1.9, C => 0.17,
        Q => 1.9, E => 3,   G => 0,   H => 0.99, I => 1.2,
        L => 1,   K => 5.7, M => 1.9, F => 1.1, P => 0.18,
        S => 0.73, T => 1.5, W => 1.6, Y => 1.8, V => 0.48,
    );
    return ( \%AAHH );
}

# FAUJ880111 - Positive charge (Fauchere et al., 1988).
# LIT:1414114 PMID:3209351
# Fauchere, J.L., Charton, M., Kier, L.B., Verloop, A. and Pliska, V.  "Amino
# acid side chain parameters for correlation studies in biology and
# pharmacology", Int. J. Peptide Protein Res. 32, 269-278 (1988)
sub aa_FAUJ880111_hash {
    my %AAHH = (
        A => 0, R => 1, N => 0, D => 0, C => 0,
        Q => 0, E => 0, G => 0, H => 1, I => 0,
        L => 0, K => 1, M => 0, F => 0, P => 0,
        S => 0, T => 0, W => 0, Y => 0, V => 0,
    );
    return ( \%AAHH );
}

# ======================================================================================
# d4($value)
#   Returns $value cut to 4 decimal places.  int() truncates towards zero, so
#   this is truncation, not rounding (kept as-is so emitted values do not
#   change).
sub d4 {
    my ($value) = @_;
    my $new_value = ( int( $value * 10000 ) ) / 10000;
    return ($new_value);
}

# ======================================================================================

1;