#!/usr/bin/perl -w # C.Vogel, UT Austin, TX; April 2008 # input: # .arf file created by np_peptide_properties.pl # lists attributes and rows of protein_id *tab* peptide *tab* properties [separated by *comma*] # .protlst file created by np_parse_ProteinProphet.pl # lists proteins observed, spectral counts, contributing peptides # output: # .arff file for WEKA input (TESTING) # with proteins filtered according to specified filter criteria (protein-probability; spectral count; peptide-probability) # lists each peptide as observed (1) or not-observed (0) in the .protlst file # .names file with Protein IDs and Peptide listed for corresponding .arff file # .log with data summary (also contains fractions useful for COST MATRIX) # ====================================================================================== use strict; $|=1; die "\nnp_arf_to_arff_TRAINING.pl <.arf file with all peptides, properties> <.protlist file with training data information> \n\n[Note: Set one or all thresholds to 0 and degenerate to Y for least stringent data selection]\n\n" unless ($#ARGV==7); my $arf = $ARGV[0]; my $training = $ARGV[1]; my $out = $ARGV[2]; # filters: # can be applied on their own or in combination # produce a set of peptides (proteins) identified with high confidence my $min_protein_prob = $ARGV[3]; # minimum ProteinProphet probability of correct protein identification my $min_total_spectra = $ARGV[4]; # total contributing spectra observed for protein my $min_peptide_prob = $ARGV[5]; # minimum PeptideProphet probability (adjusted) of correct peptide identification my $min_peptide_spectra = $ARGV[6]; my $degen = $ARGV[7]; # allows (or doesn't allow) for ambiguous protein and peptide identifications `rm -rf $out\_TRAINING\_prprob$min_protein_prob\_prspec$min_total_spectra\_peprob$min_peptide_prob\_pespec$min_peptide_spectra.log`; open (LOG, ">>$out\_TRAINING\_prprob$min_protein_prob\_prspec$min_total_spectra\_peprob$min_peptide_prob\_pespec$min_peptide_spectra.log"); print LOG "\# PARAMETER SETTINGS for TRAINING DATA collection:\n"; print LOG "\#\tMINIMUM PROTEIN PROBABILITY\t$min_protein_prob\n"; print LOG "\#\tMINIMUM SPECTRAL COUNT per PROTEIN\t$min_total_spectra\n"; print LOG "\#\tMINIMUM PEPTIDE PROBABILITY\t$min_peptide_prob\n"; print LOG "\#\tMINIMUM SPECTRAL COUNT per PEPTIDE\t$min_peptide_spectra\n"; print LOG "\#\tALLOW DEGENERATE PROTEINS or PEPTIDES\t$degen\n"; # ====================================================================================== # read through .protlst (training file) # collect information on # 1) observed proteins (passing all filters and have at least one observed peptide) # 2) observed peptides (passing all filters) # note that non-observed peptides will be all peptides listed in the .arf file # which belong to an observed protein (1) but are not an observed peptide (2) # format .protlst: # 0 PROTEIN_ID 1 PROTEINPROPHET_PROBABILITY 2 TOTAL_SPECTRAL_COUNT 3 NUMBER_INDISTINGUISHABLE_PROTEINS 4 IDS_INDISTINGUISHABLE_PROTEINS # 5 PEPTIDE_CHARGE 6 NON_DEGENERATE 7 INSTANCES 8 CONTRIBUTING 9 NSP_PROBABILITY 10 WEIGHT 11 PROTEIN_ANNOTATION print "\#... READING .protlist $training for OBSERVED DATA...\n"; my %OBSERVED_PEPTIDES; # hash{protein} = observed peptide open (INFO, $training) or die "Can't open $training\n"; while ( my $line = ) { next if length(chomp $line) == 0; next if $line =~ /^\#/; my ($id, $prot_prob, $total_spectra, $protein_degen, $degen_ids, $peptide_charge, $peptide_degen, $peptide_count, $contributing, $nsp_peptide_prob, $peptide_weight, $protein_annotation) = split("\t", $line); next if length(chomp $id) == 0; my $peptide = ""; if ( $peptide_charge =~ /(\w+)\_\d+/ ) { $peptide = $1; } # if # FILTERS -- this is where the quality of the training set will be determined next unless $prot_prob >= $min_protein_prob; next unless $nsp_peptide_prob >= $min_peptide_prob; next unless $total_spectra >= $min_total_spectra; next unless $peptide_count >= $min_peptide_spectra; if ( $degen eq "N" ) { next if $protein_degen > 1; # number of indistinguishable proteins > 1 next if $peptide_degen eq "N"; # if non-degenerate peptide (reused in different proteins) yes or no } # if # print "GOT THROUGH\n$id\t$peptide\n\tprotein prob $prot_prob\n\tpeptide prob\t$nsp_peptide_prob\n\ttotal spect $total_spectra\n\tdegen/non-degen $protein_degen\t$peptide_degen\n"; # get data $OBSERVED_PEPTIDES{$id}{$peptide}++; } # while close INFO; # summary printout for data collected: print LOG "\# PROTEINS OBSERVED in TRAINING DATA\t", scalar(keys(%OBSERVED_PEPTIDES)), "\n"; my $pept_cnt = 0; foreach my $protein ( keys(%OBSERVED_PEPTIDES)) { $pept_cnt += scalar( keys(%{$OBSERVED_PEPTIDES{$protein}}) ); } # foreach print LOG "\# TOTAL OBSERVED PEPTIDES in TRAINING DATA\t$pept_cnt\n"; # ====================================================================================== # first read through .arf # collecting attribute information my @HEADER; # header information (@ lines) open (IN, $arf) or die "Can't open $arf\n"; while ( my $line = ) { next if length(chomp $line) == 0; next if $line =~ /^\#/; next unless $line =~ /^\@/; # collect attributes only push(@HEADER, $line); } # close IN; # ====================================================================================== # second read through .arf # filter for only proteins with at least one observed peptide # make .arff files `rm -rf $out\_TRAINING\_prprob$min_protein_prob\_prspec$min_total_spectra\_peprob$min_peptide_prob\_pespec$min_peptide_spectra.arff`; `rm -rf $out\_TRAINING\_prprob$min_protein_prob\_prspec$min_total_spectra\_peprob$min_peptide_prob\_pespec$min_peptide_spectra.names`; print "\# ...SCREENING .arf $arf FILE...\n"; open (OUT, ">>$out\_TRAINING\_prprob$min_protein_prob\_prspec$min_total_spectra\_peprob$min_peptide_prob\_pespec$min_peptide_spectra.arff"); open (NAMES, ">>$out\_TRAINING\_prprob$min_protein_prob\_prspec$min_total_spectra\_peprob$min_peptide_prob\_pespec$min_peptide_spectra.names"); open (IN, $arf) or die ""; print OUT "\@relation $out\_TRAINING\_prprob$min_protein_prob\_prspec$min_total_spectra\_peprob$min_peptide_prob\_pespec$min_peptide_spectra.arff\n\n"; print OUT join("\n", @HEADER), "\n"; print OUT "\@attribute MSdetectability {1,0}"; print OUT "\n\n\@data\n\n"; my %PROT_CNT; # proteins in TRAINING ARFF my %OBS_CNT; # OBSERVED peptides in TRAINING ARFF my %NOBS_CNT; # NON_OBSERVED peptides in TRAINING ARFF while ( my $line = ) { next if length(chomp $line) == 0; next if $line =~ /^\#/; next if $line =~ /^\@/; my ($id, $pept, $features) = split("\t", $line); # protein observed and peptide observed and not yet printed if ( defined $OBSERVED_PEPTIDES{$id} and defined $OBSERVED_PEPTIDES{$id}{$pept} and not defined $OBS_CNT{$pept}) { print NAMES "$id\t$pept\n"; print OUT "$features,1\n"; $OBS_CNT{$pept}++; $PROT_CNT{$id}++; } # if # protein observed but peptide not-observed and not yet printed elsif ( defined $OBSERVED_PEPTIDES{$id} and not defined $OBSERVED_PEPTIDES{$id}{$pept} and not defined $NOBS_CNT{$pept}) { print NAMES "$id\t$pept\n"; print OUT "$features,0\n"; $NOBS_CNT{$pept}++; $PROT_CNT{$id}++; } # elsif # protein not observed (in data file above) else { next; } # else } # while ( my $line = ) { close IN; close OUT; close NAMES; # print some basic statistics print LOG "\n\# IN TRAINING ARFF:\n"; print LOG "\# PROTEINS\t", scalar(keys(%PROT_CNT)), "\n"; print LOG "\# OBSERVED PEPTIDES\t", scalar(keys(%OBS_CNT)),"\tFRACTION\t",scalar(keys(%OBS_CNT))/(scalar(keys(%OBS_CNT))+scalar(keys(%NOBS_CNT))),"\n"; print LOG "\# NON-OBSERVED PEPTIDES\t", scalar(keys(%NOBS_CNT)), "\tFRACTION\t",scalar(keys(%NOBS_CNT))/(scalar(keys(%OBS_CNT))+scalar(keys(%NOBS_CNT))),"\n"; print "\#... DONE\n"; close LOG; # ====================================================================================== # ======================================================================================