#!/usr/bin/perl -w

# C.Vogel, UT Austin, TX; April 2008

# input: 
# 	.arf file created by np_peptide_properties.pl
#		lists attributes and rows of protein_id *tab* peptide *tab* properties [separated by *comma*]
# 	.protlst file created by np_parse_ProteinProphet.pl
#		lists proteins observed, spectral counts, contributing peptides
# output:
#	.arff file for WEKA input (TESTING)
#		with proteins filtered according to specified filter criteria (protein-probability; spectral count; peptide-probability)
# 		lists each peptide as observed (1) or not-observed (0) in the .protlst file
#	.names file with Protein IDs and Peptide listed for corresponding .arff file
#	.log with data summary (also contains fractions useful for COST MATRIX)

# ======================================================================================
use strict;
$|=1;

die "\nnp_arf_to_arff_TRAINING.pl  <.arf file with all peptides, properties> <.protlist file with training data information> <out file name> <MIN protein probability> <MIN total spectral count per protein> <MIN peptide probability> <MIN spectral count per peptide> <allow degenerate proteins/peptides [Y N]> \n\n[Note: Set one or all thresholds to 0 and degenerate to Y for least stringent data selection]\n\n" unless ($#ARGV==7);
my $arf = $ARGV[0]; 
my $training = $ARGV[1];
my $out = $ARGV[2];
# filters: 
# 	can be applied on their own or in combination
# 	produce a set of peptides (proteins) identified with high confidence
my $min_protein_prob = $ARGV[3]; # minimum ProteinProphet probability of correct protein identification
my $min_total_spectra = $ARGV[4]; # total contributing spectra observed for protein
my $min_peptide_prob = $ARGV[5]; # minimum PeptideProphet probability (adjusted) of correct peptide identification
my $min_peptide_spectra = $ARGV[6];
my $degen = $ARGV[7]; # allows (or doesn't allow) for ambiguous protein and peptide identifications

`rm -rf $out\_TRAINING\_prprob$min_protein_prob\_prspec$min_total_spectra\_peprob$min_peptide_prob\_pespec$min_peptide_spectra.log`;
open (LOG, ">>$out\_TRAINING\_prprob$min_protein_prob\_prspec$min_total_spectra\_peprob$min_peptide_prob\_pespec$min_peptide_spectra.log");

print LOG "\# PARAMETER SETTINGS for TRAINING DATA collection:\n";
print LOG "\#\tMINIMUM PROTEIN PROBABILITY\t$min_protein_prob\n";
print LOG "\#\tMINIMUM SPECTRAL COUNT per PROTEIN\t$min_total_spectra\n";
print LOG "\#\tMINIMUM PEPTIDE PROBABILITY\t$min_peptide_prob\n";
print LOG "\#\tMINIMUM SPECTRAL COUNT per PEPTIDE\t$min_peptide_spectra\n";
print LOG "\#\tALLOW DEGENERATE PROTEINS or PEPTIDES\t$degen\n";

# ======================================================================================
# read through .protlst (training file)
# collect information on
# 1) observed proteins (passing all filters and have at least one observed peptide)
# 2) observed peptides (passing all filters)
# note that non-observed peptides will be all peptides listed in the .arf file 
# which belong to an observed protein (1) but are not an observed peptide (2)

# format .protlst:
# 0 PROTEIN_ID     1 PROTEINPROPHET_PROBABILITY      2 TOTAL_SPECTRAL_COUNT    3 NUMBER_INDISTINGUISHABLE_PROTEINS       4 IDS_INDISTINGUISHABLE_PROTEINS 
# 5 PEPTIDE_CHARGE       6 NON_DEGENERATE     7 INSTANCES      8 CONTRIBUTING   9 NSP_PROBABILITY 10 WEIGHT 11 PROTEIN_ANNOTATION

print "\#... READING .protlist $training for OBSERVED DATA...\n";

my %OBSERVED_PEPTIDES; # hash{protein} = observed peptide
open (INFO, $training) or die "Can't open $training\n";
while ( my $line = <INFO> ) {
	next if length(chomp $line) == 0;
	next if $line =~ /^\#/;
	
	my ($id, $prot_prob, $total_spectra, $protein_degen, $degen_ids, $peptide_charge, $peptide_degen, $peptide_count, $contributing, $nsp_peptide_prob, $peptide_weight, $protein_annotation) = split("\t", $line);
	next if length(chomp $id) == 0;
	my $peptide = "";
	if ( $peptide_charge =~ /(\w+)\_\d+/ ) {
		$peptide = $1;
	} # if 

	# FILTERS -- this is where the quality of the training set will be determined
	next unless $prot_prob >= $min_protein_prob;
	next unless $nsp_peptide_prob >= $min_peptide_prob;
	next unless $total_spectra >= $min_total_spectra;
	next unless $peptide_count >= $min_peptide_spectra;
	if ( $degen eq "N" )  {
		next if $protein_degen > 1; # number of indistinguishable proteins > 1
		next if $peptide_degen eq "N"; # if non-degenerate peptide (reused in different proteins) yes or no
	} # if  
	
	# print "GOT THROUGH\n$id\t$peptide\n\tprotein prob $prot_prob\n\tpeptide prob\t$nsp_peptide_prob\n\ttotal spect $total_spectra\n\tdegen/non-degen $protein_degen\t$peptide_degen\n";
	
	# get data
	$OBSERVED_PEPTIDES{$id}{$peptide}++;
	
} # while
close INFO;

# summary printout for data collected:
print LOG "\# PROTEINS OBSERVED in TRAINING DATA\t", scalar(keys(%OBSERVED_PEPTIDES)), "\n";
my $pept_cnt = 0;
foreach my $protein ( keys(%OBSERVED_PEPTIDES)) {
	$pept_cnt += scalar( keys(%{$OBSERVED_PEPTIDES{$protein}}) );
} # foreach
print LOG "\# TOTAL OBSERVED PEPTIDES in TRAINING DATA\t$pept_cnt\n";

# ======================================================================================
# first read through .arf
# collecting attribute information 
my @HEADER; # header information (@ lines)
open (IN, $arf) or die "Can't open $arf\n";
while ( my $line = <IN> ) {
	next if length(chomp $line) == 0;
	next if $line =~ /^\#/;
	next unless $line =~ /^\@/; # collect attributes only
	push(@HEADER, $line);
} # 
close IN;

# ======================================================================================
# second read through .arf
# filter for only proteins with at least one observed peptide
# make .arff files

`rm -rf $out\_TRAINING\_prprob$min_protein_prob\_prspec$min_total_spectra\_peprob$min_peptide_prob\_pespec$min_peptide_spectra.arff`;
`rm -rf $out\_TRAINING\_prprob$min_protein_prob\_prspec$min_total_spectra\_peprob$min_peptide_prob\_pespec$min_peptide_spectra.names`;

print "\# ...SCREENING .arf $arf FILE...\n";

open (OUT, ">>$out\_TRAINING\_prprob$min_protein_prob\_prspec$min_total_spectra\_peprob$min_peptide_prob\_pespec$min_peptide_spectra.arff");
open (NAMES, ">>$out\_TRAINING\_prprob$min_protein_prob\_prspec$min_total_spectra\_peprob$min_peptide_prob\_pespec$min_peptide_spectra.names");
open (IN, $arf) or die "";

print OUT "\@relation $out\_TRAINING\_prprob$min_protein_prob\_prspec$min_total_spectra\_peprob$min_peptide_prob\_pespec$min_peptide_spectra.arff\n\n";
print OUT join("\n", @HEADER), "\n";
print OUT "\@attribute MSdetectability {1,0}";
print OUT "\n\n\@data\n\n";

my %PROT_CNT; # proteins in TRAINING ARFF
my %OBS_CNT; # OBSERVED peptides in TRAINING ARFF
my %NOBS_CNT; # NON_OBSERVED peptides in TRAINING ARFF
while ( my $line = <IN> ) {
	next if length(chomp $line) == 0;
	next if $line =~ /^\#/;
	next if $line =~ /^\@/;
	
	my ($id, $pept, $features) = split("\t", $line);
	
	# protein observed and peptide observed and not yet printed
	if ( defined $OBSERVED_PEPTIDES{$id} and defined $OBSERVED_PEPTIDES{$id}{$pept} and not defined $OBS_CNT{$pept}) {
		print NAMES "$id\t$pept\n";
		print OUT "$features,1\n";
		
		$OBS_CNT{$pept}++;
		$PROT_CNT{$id}++;
	} # if
	# protein observed but peptide not-observed and not yet printed
	elsif ( defined $OBSERVED_PEPTIDES{$id} and not defined $OBSERVED_PEPTIDES{$id}{$pept} and not defined $NOBS_CNT{$pept}) {
		print NAMES "$id\t$pept\n";
		print OUT "$features,0\n";

		$NOBS_CNT{$pept}++;
		$PROT_CNT{$id}++;
	} # elsif
	# protein not observed (in data file above)
	else {
		next;
	} # else

} # while ( my $line = <IN> ) {

close IN;
close OUT;
close NAMES;

# print some basic statistics
print LOG "\n\# IN TRAINING ARFF:\n";
print LOG "\# PROTEINS\t", scalar(keys(%PROT_CNT)), "\n";
print LOG "\# OBSERVED PEPTIDES\t", scalar(keys(%OBS_CNT)),"\tFRACTION\t",scalar(keys(%OBS_CNT))/(scalar(keys(%OBS_CNT))+scalar(keys(%NOBS_CNT))),"\n";
print LOG "\# NON-OBSERVED PEPTIDES\t", scalar(keys(%NOBS_CNT)), "\tFRACTION\t",scalar(keys(%NOBS_CNT))/(scalar(keys(%OBS_CNT))+scalar(keys(%NOBS_CNT))),"\n";

print "\#... DONE\n";
close LOG;

# ======================================================================================
# ======================================================================================