#!/usr/bin/perl -w
# C.Vogel, UT Austin, TX; April 2008
#
# Purpose:
#   Map per-peptide WEKA predictions back to their proteins and, for every
#   protein, sum the predicted probabilities (Oi) and count its peptides.
#
# Input:
#   <prefix><number>.wekaout  saved result buffers from WEKA predictions
#   <prefix><number>.names    ID/peptide lists belonging to the .arff files
#                             used as WEKA input.  The lines MUST be in the
#                             same order as the predictions in the matching
#                             .wekaout file.
#
# Output:
#   <out>.oi                  protein_id *tab* Oi-value *tab* total_peptides
#
# Notes:
#   Run from the directory that holds the WEKAOUT files (cut-and-paste from
#   the WEKA predictions) and the NAMES files (ID _tab_ peptide).
#   Files are numbered 0 .. (number of files - 1).
#
# File formats:
#   .names file (created by np_arf_to_arff_TEST.pl):
#       id *tab* peptide
#   .wekaout file (saved result buffer from WEKA predictions):
#       1 ? 2:0 + 0.01 *0.99
# ======================================================================================
use strict;
use warnings;

$| = 1;    # unbuffered STDOUT so progress lines appear immediately

my $USAGE =
      "\nnp_PeptidePredictions_to_ProteinOi.pl <.names file prefix> "
    . "<.wekaout file prefix> <number of files> <output prefix>\n"
    . "NOTES:\tBe in directory with WEKAOUT files (saved resultbuffer from WEKA prediction) "
    . "and NAMES file (ID _tab_ Peptide)\n"
    . "\tFiles should be numbered as _number.names or _number.wekaout\n";

die $USAGE unless @ARGV == 4;

my ( $namesfile, $wekafile, $number, $out ) = @ARGV;

# $namesfile : prefix of the <prefix><number>.names files
# $wekafile  : prefix of the <prefix><number>.wekaout files
# $number    : number of file pairs; they are numbered 0 .. $number-1
# $out       : prefix of the output file (<out>.oi)
die "ERROR number of files must be a non-negative integer, got '$number'\n$USAGE"
    unless $number =~ /\A\d+\z/;

# File numbers currently start from 0.
my @file_numbers = ( 0 .. $number - 1 );

my %OIS;     # $OIS{protein}  = sum of probabilities over all its peptides
my %PEPS;    # $PEPS{protein} = number of peptides counted for the protein

# ======================================================================================
# Helpers

# Read one .names file and return the protein IDs in file order.
# Blank lines are skipped; any other line must be exactly "id<TAB>peptide".
sub read_name_ids {
    my ($path) = @_;

    open( my $fh, '<', $path ) or die "Can't open $path: $!\n";
    my @ids;
    while ( my $line = <$fh> ) {
        chomp $line;

        # The original tested length(chomp $line), i.e. the length of
        # chomp's return value (a count), which is never 0.
        next if length($line) == 0;

        my @fields = split( "\t", $line );
        die "ERROR STRANGE LINE in $path at $line\n" unless scalar(@fields) == 2;
        push( @ids, $fields[0] );    # the peptide itself is not needed later
    }
    close $fh;
    return @ids;
}

# Read one .wekaout file and return its prediction lines: every line that
# contains a literal '?' (same selection as `grep '?'`), chomped.
sub read_prediction_lines {
    my ($path) = @_;

    open( my $fh, '<', $path ) or die "Can't open $path: $!\n";
    my @lines;
    while ( my $line = <$fh> ) {
        next unless index( $line, '?' ) >= 0;
        chomp $line;
        push( @lines, $line );
    }
    close $fh;
    return @lines;
}

# ======================================================================================
# Collect data

foreach my $numb (@file_numbers) {
    print "\# PARSING FILE NUMBER\t$numb\n";

    # Read in the IDs (in the same order as the predictions below!).
    my $names_path = "$namesfile$numb.names";
    my @ids        = read_name_ids($names_path);
    print "\# STORING\t", scalar(@ids), "\tNAMES from $names_path\n";

    # Read the predictions that belong to those IDs.
    my $weka_path = "$wekafile$numb.wekaout";
    my @preds     = read_prediction_lines($weka_path);
    print "\# PARSING \t", scalar(@preds), "\tPREDICTIONS from $weka_path\n";

    # Predictions are matched to IDs purely by position, so surplus
    # predictions cannot be assigned to any protein.
    die "ERROR more predictions ("
        . scalar(@preds)
        . ") in $weka_path than names ("
        . scalar(@ids)
        . ") in $names_path\n"
        if @preds > @ids;
    warn "WARNING fewer predictions ("
        . scalar(@preds)
        . ") in $weka_path than names ("
        . scalar(@ids)
        . ") in $names_path\n"
        if @preds < @ids;

    # Map each prediction to its protein and accumulate.
    for my $k ( 0 .. $#preds ) {
        my $id    = $ids[$k];
        my $cline = $preds[$k];
        $cline =~ s/\*//g;    # drop the '*' marker in front of a probability

        # After the '+' the line carries two probability columns; the first
        # one is the value that is summed.  Lines that do not match (for
        # example lines without a '+') are skipped silently, as before.
        if ( $cline =~ /\+\s+(.*\d+.*)\s+(.*\d+.*)/ ) {
            my $pred = $1;
            $PEPS{$id}++;
            $OIS{$id} += $pred;
        }
    }
}

# ======================================================================================
# Print out

my $oi_path = "$out.oi";

# Opening with '>' truncates an existing file, which replaces the former
# `rm -f` followed by an append.
open( my $out_fh, '>', $oi_path ) or die "Can't open $oi_path for writing: $!\n";
print "\# PRINTING Oi file $oi_path\n";
print {$out_fh} "\# PROTEIN_ID\tOi\tTOTAL_UNIQUE_PEPTIDES_EXPECTED\n";

# Sorted so that the output is reproducible from run to run.
foreach my $id ( sort keys %OIS ) {
    print {$out_fh} "$id\t$OIS{$id}\t$PEPS{$id}\n";
}
close $out_fh or die "Can't close $oi_path: $!\n";

# ======================================================================================
# ======================================================================================