#!/usr/bin/perl -w # C.Vogel, UT Austin, TX; April 2008 # input: # .oi file -- format: # 0 protein_id 1 Oi-value 2 total_peptides # .protlst: tab-delimited list of observed proteins (at certain FDR) and their observed peptides # 0 PROTEIN_ID 1 PROTEINPROPHET_PROBABILITY 2 TOTAL_SPECTRAL_COUNT 3 NUMBER_INDISTINGUISHABLE_PROTEINS 4 IDS_INDISTINGUISHABLE_PROTEINS # 5 PEPTIDE_CHARGE 6 NON_DEGENERATE 7 INSTANCES 8 CONTRIBUTING 9 NSP_PROBABILITY 10 WEIGHT 11 PROTEIN_ANNOTATION # output: # .apex file - format: # # 0 ID 1 OI_VALUE 2 PROTEINPROPHET_PROBABILITY 3 TOTAL_SPECTRAL_COUNTS 4 APEX_PROTEIN_ABUNDANCE # 5 NUMBER_OF_DEGENERATE_PROTEINS 6 IDS_DEGENERATE_PROTEINS 7 ANNOTATION # ====================================================================================== use strict; $|=1; die "\nnp_APEX_from_Oi_and_protlst.pl \n" unless ($#ARGV==2); my $file_Oi = $ARGV[0]; my $file_MSout = $ARGV[1]; my $avg_number = $ARGV[2]; `rm -f $file_MSout.apex`; open (OUT, ">> $file_MSout.apex"); # ====================================================================================== # get Oi values my %OI; # hash{ID} = Oi value open (IN, $file_Oi) or die "Cant open $file_Oi\n"; while ( my $line = ) { next if length(chomp $line) == 0; my @a = split("\t", $line); # next unless scalar(@a) == 2; next unless $a[0] =~ /\w+/; next unless $a[1] =~ /\d+/; $OI{$a[0]} = $a[1]; } # while ( my $line = ) { close IN; print OUT "\# NUMBER of proteins with Oi-value\t", scalar(keys(%OI)), "\n"; # ====================================================================================== # read in proteomics data and do pre-APEX calculations my %APEX_pre; # hash{ID} = ni * pi / Oi my $APEX_sum; open (IN, $file_MSout) or die "Cant open $file_MSout\n"; # .protlst format: # 0 PROTEIN_ID 1 PROTEINPROPHET_PROBABILITY 2 TOTAL_SPECTRAL_COUNT 3 NUMBER_INDISTINGUISHABLE_PROTEINS 4 IDS_INDISTINGUISHABLE_PROTEINS # 5 PEPTIDE_CHARGE 6 NON_DEGENERATE 7 INSTANCES 8 CONTRIBUTING 9 NSP_PROBABILITY 10 WEIGHT 11 PROTEIN_ANNOTATION while ( my $line = ) { next if length(chomp $line) == 0; next if $line =~ /^\#/; my @a = split("\t", $line); die "STRANGE LINE $line\n" unless scalar(@a) == 12; my ($id, $prob, $total_pepts, $protein_degeneracy, $ids_protein_degen, $peptide_charge, $peptide_degen, $peptide_instance, $peptide_contrib, $peptide_prob, $peptide_weight, $annot) = @a ; next if $prob =~ /\#/; next if defined $APEX_pre{$id}; # do not count a protein more than once (it mostly occurs redundantly in .protlst) print "ERROR\tNo Oi for $id\n" unless defined $OI{$id}; next unless defined $OI{$id}; $APEX_pre{$id} = $prob * $total_pepts / $OI{$id}; $APEX_sum += $APEX_pre{$id}; } # while ( my $line = ) { close IN; print OUT "\# NUMBER of proteins in calculation\t", scalar(keys(%APEX_pre)), "\n"; # ====================================================================================== # calculate APEX # NEW total number of molecules/cell # calculated from average of 4,000 mol/protein/cell and total of unique number of identified proteins # = keys of %APEX_pre my $number_moleculescell = $avg_number * scalar(keys(%APEX_pre)); # total number of proteins per cell based on avg number of proteins times detected proteins (since coverage may be low) my %STORE_APEX; # hash{geneid} = string"info" open (IN, $file_MSout) or die "Cant open $file_MSout\n"; while ( my $line = ) { next if length(chomp $line) == 0; next if $line =~ /^\#/; my @a = split("\t", $line); die "STRANGE LINE $line\n" unless scalar(@a) == 12; my ($id, $prob, $total_pepts, $protein_degeneracy, $ids_protein_degen, $peptide_charge, $peptide_degen, $peptide_instance, $peptide_contrib, $peptide_prob, $peptide_weight, $annot) = @a ; next unless defined $OI{$id}; # APEX calculation -- my $protein_abundance = &d4($APEX_pre{$id} * $number_moleculescell / $APEX_sum); $STORE_APEX{$id} = "$id\t$OI{$id}\t$prob\t$total_pepts\t$protein_abundance\t$protein_degeneracy\t$ids_protein_degen\t$annot"; } # while ( my $line = ) { close IN; # ====================================================================================== # print everything out print OUT "\# TOTAL MOLECULES $number_moleculescell\t(based on average of\t$avg_number\tmolecules per protein)\n"; print OUT "\# ID\tOI_VALUE\tPROTEINPROPHET_PROBABILITY\tTOTAL_SPECTRAL_COUNTS\tAPEX_PROTEIN_ABUNDANCE\tNUMBER_OF_DEGENERATE_PROTEINS\tIDS_DEGENERATE_PROTEINS\tANNOTATION\n"; foreach my $id ( keys(%STORE_APEX) ) { print OUT $STORE_APEX{$id}, "\n"; } # foreach my close OUT; # ====================================================================================== # SUBROUTINES # ====================================================================================== sub d4 { my ($value) = @_; my $newvalue = (int($value*10000))/10000; return($newvalue); } # sub d4 # ====================================================================================== # ======================================================================================