#!/usr/bin/perl -w # quickfock.pl Frequency of Oligomers by Charlie Kim # Purpose: Count oligomer frequency of a genome sequence # Input: Sequence file, oligomer length # Output: Tab-delimited text file of oligomer frequencies # Written July 2, 2001 # Modified September 16, 2002 # Charles C. Kim ########### # MODULES # ########### use strict; use Bio::Seq; use Bio::SeqIO; use Getopt::Std; use IO::File; ######################### # VARIABLES & FILENAMES # ######################### &main_fock; sub main_fock { my $opt_s; my $opt_o; my $opt_l; my $infile; my $outfile; my $oligomerlength; getopts('s:o:l:'); # sequence file, output file, length if ($opt_s) { $infile = $opt_s; } else { print 'Enter your concatenated FASTA sequence filename: '; chomp ($infile=); } unless (-e $infile) { die "$infile not found\n"; } if ($opt_o) { $outfile = $opt_o; if (-e $outfile) { print "$outfile already exists! Overwrite (Y/N)? "; chomp ($_ = ); while (/[^yn]/i) { print 'Y or N, please: '; chomp ($_ = ); } if (/n/i) { die "$outfile not overwritten.\n"; } } } else { print 'Enter an output filename: '; chomp ($outfile=); if (-e $outfile) { print "$outfile already exists! Overwrite (Y/N)? "; chomp ($_ = ); while (/[^yn]/i) { print 'Y or N, please: '; chomp ($_ = ); } if (/n/i) { die "$outfile not overwritten.\n"; } } } if ($opt_l) { if ($opt_l !~ /\d/) { die "Specified length is non-numeric\n"; } $oligomerlength = $opt_l; } else { while () { print 'Enter an oligomer length to count: '; chomp($oligomerlength=); if ($oligomerlength !~ /\d/) { print "Value is non-numeric!\n"; } else {last;} } } ######## # MAIN # ######## if ($oligomerlength >= 9) { print "An oligomer length of $oligomerlength will generate "; print 4 ** $oligomerlength, " combinations,\nwhich could cause "; print "an out of memory error. Proceed? (y/n) "; chomp($_=); if (/y/i) { ; } else { die "Program terminated\n"; } } my %oligos = (); my $in = Bio::SeqIO->new( -file => "$infile", -format => 'Fasta'); my $seqnumber = 1; my $oligocounts = 0; while (my $seq = $in->next_seq() ) { my $sequence = $seq->seq(); # print "$sequence\n"; print "Sequence $seqnumber\n"; my @seqa = split('',$sequence); for (my $c = 0; $c <= $#seqa - $oligomerlength; $c++) { my $oligocount = 0; my $oligo; until ($oligocount == $oligomerlength) { $oligo .= $seqa[$c+$oligocount]; $oligocount++; } $oligos{$oligo}++; # print "$oligo\n"; $oligocounts++; $| = 1; if ($oligocounts % 1000 == 0) { print "\b" x 60; print "Base $oligocounts"; } } $seqnumber++; print "\b" x 60; print "Base $oligocounts\n"; } # Print to temporary output my $tmpf = IO::File->new_tmpfile or die "IO::File->new_tmpfile: $!"; $tmpf->autoflush(1); print $tmpf $seqnumber-1, " sequences analyzed\n"; print $tmpf "$oligocounts total $oligomerlength-mers counted\n"; print $tmpf "$oligomerlength-mer\tNumber\tFrequency\n"; foreach my $key (sort keys %oligos) { print $tmpf "$key\t$oligos{$key}\t", $oligos{$key}/$oligocounts, "\n"; } print "Program complete\n"; # Read & Convert output seek($tmpf, 0, 0); my %posoligos = (); my %negoligos = (); my %totaloligos = (); my %oligokeys = (); &open_writefile(*OUTFILE,'new oligo frequency',$outfile); chomp($_=<$tmpf>); print OUTFILE "$_\n"; chomp($_=<$tmpf>); print OUTFILE "$_ per strand\n\n"; $_=<$tmpf>; my ($oligomer,$garbage1,$garbage2) = split(/\t/,$_); print OUTFILE "$oligomer\tBoth Strands\t\# on \+ Strand\t\# on \- Strand\n"; while(<$tmpf>) { chomp($_); my ($oligo,$posnum,$posfreq) = split(/\t/,$_); my $rcoligo = $oligo; $rcoligo =~ tr/gatcGATC/ctagCTAG/; # N, W, and S excluded, no changes necessary $rcoligo =~ tr/rkbdymvhRKBDYMVH/ymvhrkbdYMVHRKBD/; $rcoligo = reverse($rcoligo); $posoligos{$oligo} = $posnum; $negoligos{$rcoligo} = $posnum; $oligokeys{$oligo} = 1; $oligokeys{$rcoligo} = 1; } foreach my $entry (keys %oligokeys) { if (!$posoligos{$entry}) { $totaloligos{$entry} = $negoligos{$entry}; } elsif (!$negoligos{$entry}) { $totaloligos{$entry} = $posoligos{$entry}; } else {$totaloligos{$entry} = $posoligos{$entry} + $negoligos{$entry}; } } foreach my $entry (sort keys %oligokeys) { print OUTFILE "$entry\t$totaloligos{$entry}\t"; if ($posoligos{$entry}) { print OUTFILE "$posoligos{$entry}\t"; } else { print OUTFILE "0\t"; } if ($negoligos{$entry}) { print OUTFILE "$negoligos{$entry}\n"; } else { print OUTFILE "0\n"; } } } sub open_writefile { my $filehandle = $_[0]; my $filetype = $_[1] if $_[1]; my $filename = $_[2] if $_[2]; unless ($filename) { print "Enter $filetype filename to create: "; chomp ($filename=); } if (-e $filename) { print "$filename already exists! Overwrite (Y/N)? "; chomp ($_ = ); while (/[^yn]/i) { print 'Y or N, please: '; chomp ($_ = ); } if (/n/i) { die "$filename not overwritten.\n"; } else { open($filehandle, ">$filename") or die "Couldn't open $filename\n"; } } else { open($filehandle, ">$filename") or die "Couldn't open $filename\n"; } $filehandle = ''; $filetype = ''; $filename = ''; }