#!/usr/bin/perl -w use strict; my $filename_seq = $ARGV[0]; my $usage_mesg = "Usage: perl nfreq.pl "; if( $#ARGV != 0 ) { print STDERR $usage_mesg,"($#ARGV)\n"; exit; } if( not -e $filename_seq ) { print STDERR "$filename_seq does not exist!\n"; print STDERR $usage_mesg,"\n"; exit; } #my $filename_seq = 'Ecoli_genome.txt'; #my $filename_seq = 'Tvolcanium_genome.txt'; #my $filename_seq = 'Mgene1'; #my $filename_seq = 'Mgene2'; #my $filename_seq = 'Mgene3'; my $genome_seq = ''; open(SEQ,$filename_seq); while(my $line = ) { chomp($line); $genome_seq .= $line; } close(SEQ); my $count_GC = 0; my %n1_freq = (); for(my $i=0; $i < length($genome_seq); $i++) { my $tmp_n1 = substr($genome_seq,$i,1); if( $tmp_n1 eq 'G' or $tmp_n1 eq 'C' ) { $count_GC += 1; } if( not exists $n1_freq{$tmp_n1} ) { $n1_freq{$tmp_n1} = 0; } $n1_freq{$tmp_n1} += 1; } my %n2_freq = (); for(my $i=0; $i < length($genome_seq)-1; $i++) { my $tmp_n2 = substr($genome_seq,$i,2); if( not exists $n2_freq{$tmp_n2} ) { $n2_freq{$tmp_n2} = 0; } $n2_freq{$tmp_n2} += 1; } my @genome_n = split('',$genome_seq); my $last_n2 = $genome_n[$#genome_n].$genome_n[0]; print "Filename: ",$filename_seq,"\n"; print "Length: ",length($genome_seq),"\n"; print "%GC contents: ",$count_GC/length($genome_seq),"\n"; print "[Single nucleotide frequency]\n"; foreach my $tmp_n (sort keys %n1_freq) { my $tmp_n1_freq = $n1_freq{$tmp_n}; my $tmp_n1_freq_pct = $tmp_n1_freq/length($genome_seq); print sprintf("Freq. %s: %d (%.3f)",$tmp_n, $tmp_n1_freq, $tmp_n1_freq_pct),"\n"; } print "[Dinucleotide frequency: raw]\n"; foreach my $tmp_label (sort keys %n2_freq) { my $tmp_n2_freq = $n2_freq{$tmp_label}; my $tmp_n2_freq_pct = $tmp_n2_freq/length($genome_seq); my @tmp_n1 = split('',$tmp_label); my $exp_n2_freq = $n1_freq{$tmp_n1[0]}*$n1_freq{$tmp_n1[1]}/length($genome_seq)**2; print sprintf("Freq. %s: %d (%.3f, exp. %.3f)",$tmp_label, $tmp_n2_freq, $tmp_n2_freq_pct, $exp_n2_freq),"\n"; } my %revcomp_map = ('AA+TT' => ['AA','TT'], 'GG+CC' => ['GG','CC'], 'AT' => ['AT'], 'TA' => ['TA'], 'CG' => ['CG'], 'GC' => ['GC'], 'AG+CT' => ['AG','CT'], 'AC+GT' => ['AC','GT'], 'TG+CA' => ['TG','CA'], 'TC+GA' => ['TC','GA']); print "[Dinucleotide frequency: revcomp]\n"; foreach my $tmp_label (sort keys %revcomp_map) { my $tmp_n2_freq = 0; foreach my $tmp_n (@{$revcomp_map{$tmp_label}}) { $tmp_n2_freq += $n2_freq{$tmp_n}; } my $tmp_n2_freq_pct = $tmp_n2_freq/length($genome_seq); print sprintf("Freq. %s: %d (%.3f)",$tmp_label, $tmp_n2_freq, $tmp_n2_freq_pct),"\n"; }