#!/usr/bin/perl -w # ALACK # alack.pl # Copyright 2002 Charlie Kim # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # For a copy of the GNU General Public License, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # This program uses the following modules # Perl/Tk 800.023 Copyright (c) 1995-1996 Nick Ing-Simmons. # http://starbase.neosoft.com/%7Eclaird/comp.lang.perl.tk/ptkFAQ.html # Statistics::Descriptive use strict; use Tk; use Tk::Label; use Tk::Button; use Tk::Checkbutton; use Tk::Frame; use Tk::Entry; use Tk::Radiobutton; use Tk::Scale; use Tk::Scrollbar; use Tk::Text; use Tk::ErrorDialog; use Tk::ProgressBar; use Tk::Balloon; use Statistics::Descriptive; use Getopt::Std; our ($opt_w, $opt_f, $opt_a, $opt_i, $opt_r, $opt_o); getopts('w:f:a:i:r:o:'); my ($wordfile, $fullfile, $analyzedfile, $iter, $raw, $outfile); $wordfile = $opt_w if $opt_w; $fullfile = $opt_f if $opt_f; $analyzedfile = $opt_a if $opt_a; $outfile = $opt_o if $opt_o; if ($opt_i) { $iter = $opt_i; } else { $iter = 1000; } if ($opt_r) { $raw = 1; } else {$raw = 0; } if (!$wordfile || !$fullfile || !$analyzedfile || !$outfile) { &lackgui; } else { &lackcommand($wordfile,$analyzedfile,$fullfile,$iter,$raw,$outfile); } sub lackgui { my $mw = MainWindow->new(-title=>'ALACK 0.1'); my $data = <Photo(-data => $data); $mw->Icon( -image => $image ); $mw->Label(-text=>'Automated Lexical Analysis by Charlie Kim v 0.1', )->pack(); # Input # my $inputfr = $mw->Frame(-relief=>'groove', -borderwidth=>2, )->pack(-side=>'top', -anchor=>'nw', -fill=>'both', ); my $full_label =$inputfr->Label(-text=>'Full Dataset File', )->grid(-row=>1,-column=> 0,-sticky=>'e'); my $full_lab_bal = $mw->Balloon(-state=>'balloon', -background=>'white'); my $fullhelp = "Full filtered dataset. Should be identical to the full dataset used for analysis (e.g. in SAM or Cluster)."; $full_lab_bal->attach($full_label, -balloonmsg => $fullhelp); my $annfile; my $annfile_e = $inputfr->Entry(-state=>'normal', -textvariable=> \$annfile, -width=>30, )->grid(-row=>1,-column=> 1,-sticky=>'w'); $inputfr->Button(-text=>'Browse', -command=> [ \&selectsinglefile, \$annfile_e, \$mw ], -borderwidth=>1, -padx=> 0, -pady=> 0, )->grid(-row=>1,-column=> 2,-sticky=>'w'); my $anal_label = $inputfr->Label(-text=>'Analyzed Dataset File', )->grid(-row=>2,-column=> 0,-sticky=>'e'); my $anal_lab_bal = $mw->Balloon(-state=>'balloon', -background=>'white'); my $analhelp = "The top significant genes generated in an analysis (e.g. SAM or Cluster).\nShould have far fewer entries than the full dataset."; $anal_lab_bal->attach($anal_label, -balloonmsg=>$analhelp); my $datafile; my $datafile_e = $inputfr->Entry(-state=>'normal', -textvariable=> \$datafile, -width=>30, )->grid(-row=>2,-column=> 1,-sticky=>'w'); $inputfr->Button(-text=>'Browse', -command=> [ \&selectsinglefile, \$datafile_e, \$mw ], -borderwidth=>1, -padx=> 0, -pady=> 0, )->grid(-row=>2,-column=> 2,-sticky=>'w'); my $iterations = 1000; my $iter_label = $inputfr->Label(-text=>'Iterations', )->grid(-row=>3,-column=>0,-sticky=>'e'); my $iter_lab_bal = $mw->Balloon(-state=>'balloon', -background=>'white'); my $iterhelp = "The number of random datasets to generate."; $iter_lab_bal->attach($iter_label, -balloonmsg=>$iterhelp); $inputfr->Entry(-state=>'normal', -textvariable=> \$iterations, -width=>10, )->grid(-row=>3,-column=>1,-sticky=>'w'); my $threshold = 2; my $thresh_label = $inputfr->Label(-text=>'Threshold', )->grid(-row=>4,-column=>0,-sticky=>'e'); my $thresh_lab_bal = $mw->Balloon(-state=>'balloon', -background=>'white'); my $thresh_help = "The number of times a word must be found in the analyzed dataset in order to do the analysis."; $thresh_lab_bal->attach($thresh_label, -balloonmsg=>$thresh_help); $inputfr->Entry(-state=>'normal', -textvariable=> \$threshold, -width=>10, )->grid(-row=>4,-column=>1,-sticky=>'w'); # Run # my $percent_done = 0; my $progress = $mw->ProgressBar(-width => 8, -blocks => 100, -gap => 0, -colors => [0, 'darkblue'], -variable => \$percent_done, -borderwidth => 2, -troughcolor => 'grey', )->pack(-fill => 'x', ); my $percent_done2 = 0; my $progress2 = $mw->ProgressBar(-width => 8, -blocks => 100, -gap => 0, -colors => [0, 'red'], -variable => \$percent_done2, -borderwidth => 2, -troughcolor => 'grey', )->pack(-fill => 'x', ); # log window my $log_f = $mw->Frame(-relief=>'groove', -borderwidth=>2, )->pack(-fill=>'both'); my $logwin = $log_f->Scrolled("Text")->pack(-fill=>'both'); $logwin->configure(-height=>10, -width=>40, -wrap=>'none', -state=>'disabled', ); $mw->Button(-text=>"Run LACK", -command=> [ \&lackguimain,\$datafile,\$annfile,\$iterations,\$mw,\$percent_done,\$logwin,\$threshold,\$percent_done2 ], -state=> 'normal', )->pack(); $mw->Button(-text=>"Exit", -command=> sub { exit }, )->pack(); MainLoop; } sub printlog { my $line = shift; my $logwinref = shift; my $logwin = $$logwinref; $logwin->configure(-state=>'normal'); $logwin->insert('end',$line); $logwin->yview(moveto=>1); $logwin->configure(-state=>'disabled'); $logwin->update(); } sub lackguimain { my $datafileref = shift; my $datafile = $$datafileref; my $annfileref = shift; my $annfile = $$annfileref; my $iterationsref = shift; my $iterations = $$iterationsref; my $mwref = shift; my $mw = $$mwref; my $progref = shift; my $logwinref = shift; my $logwin = $$logwinref; my $thresholdref = shift; my $threshold = $$thresholdref; my $percentdone2ref = shift; if (!$datafile) { die "No dataset file specified\n"; return; } if (!$annfile) { die "No annotation file specified\n"; return; } if (!$iterations) { die "Must specify non-zero number of iterations\n"; return; } if ($iterations =~ /\D/) { die "Iterations must be numeric\n"; return; } if ($iterations < 100) { my $answer = $mw->messageBox(-title => 'Warning', -type => 'YesNo', -default => 'no', -message => "This program uses a statistical parameter which assumes many iterations. You have chosen to use only $iterations iterations, which could cause inaccurate statistical results. Do you wish to proceed?"); unless ($answer eq 'yes') { return; } } my %words; open(DATAFILE,$datafile) or die "Couldn't open analyzed dataset file: $datafile\n"; open(ANNLIST,$annfile) or die "Couldn't open full dataset file: $annfile\n"; my $outfile = $mw->getSaveFile(-title=>'Save Results As'); open(OUTFILE,">$outfile") or die "Can't open output file $outfile\n"; while() { chomp; next if !$_; $_ =~ s/\W/ /g; # need to remove punctation: (),"| cause problems # but it also removes hyphens, which may be undesirable my @temp = split(' ', $_); foreach (@temp) { $words{$_}++; } } my @wordlist; foreach (sort keys %words) { next unless $_ =~ /\D/; # gets rid of numbers next if length($_) == 1; next if $_ =~ /^and$/i; next if $_ =~ /^of$/i; next if $_ =~ /^or$/i; next if $_ =~ /^for$/i; next if $_ =~ /^to$/i; next if $_ =~ /^the$/i; next if $_ =~ /^with$/i; next if $_ =~ /^in$/i; push @wordlist, $_ if $words{$_} > $threshold; } my $counter = 1; my %outdata; my %pval; &printlog("Word\tMatches\tP(x<=k)\n", $logwinref); foreach my $word (@wordlist) { $$percentdone2ref = 100 * $counter / ($#wordlist + 1); seek DATAFILE,0,0; my @word_a = ($word); my @dataann; my $datahits = 0; my $datasize = 0; while() { chomp; next if !$_; my $temphit = &countwords(\@word_a,$_); if ($temphit == 2) { $datahits++; } $datasize++; } # print "$word\t$datahits\t$datasize\n"; # remember line numbers of hits in annotation seek ANNLIST,0,0; my $fulllinecount = 0; my %linehits; while() { chomp; next if !$_; my $temphit = &countfullwords(\@word_a,$_,$logwinref); if ($temphit == 2) { $linehits{$fulllinecount} = 1; } $fulllinecount++; } my @statdata; for (my $d = 0; $d < $iterations; $d++) { $$progref = 100 * $d / $iterations; $mw->update unless $$progref < 2; my @int = &randint($fulllinecount,$datasize); my $fullhits = 0; foreach (@int) { if ($linehits{$_}) { $fullhits++; } } push @statdata, $fullhits; } # GENERATE STATS my @stats = &calcdescstat(\@statdata); my $mean = $stats[1]; my $p_less_than_hits; for (my $c = 0; $c < $datahits; $c++) { my $tempval = ($mean ** $c) * exp(-$mean) / factorial($c); $p_less_than_hits += $tempval; } my $p_hits = ($mean ** $datahits) * exp(-$mean) / factorial($datahits); &printlog("$word\t$datahits\t$p_less_than_hits\n", $logwinref); $outdata{$word} = "$datahits\t$datasize\t$mean"; $pval{$word} = $p_less_than_hits; $counter++; } $$progref = 100; $$percentdone2ref = 100; $mw->update; print OUTFILE "Word\tP(x<=k)\tMatches\tSize\tMean\n"; foreach my $word (sort { $pval{$b}<=>$pval{$a} } keys %pval ) { print OUTFILE "$word\t$pval{$word}\t$outdata{$word}\n"; } close OUTFILE; $mw->messageBox(-title => 'Program Finished', -message => 'Processing complete', -type => 'OK'); } sub lackcommand { die "Command-line interface not yet implemented\n"; my ($wordfile,$datafile,$annfile,$iterations,$rawout,$outfile) = @_; if (!$wordfile) { die "No word list file specified\n"; return; } if (!$datafile) { die "No dataset file specified\n"; return; } if (!$annfile) { die "No annotation file specified\n"; return; } if (!$iterations) { die "Must specify non-zero number of iterations\n"; return; } if ($iterations =~ /\D/) { die "Iterations must be numeric\n"; return; } if (!$outfile) { die "No output file specified\n"; return; } print "Reading wordlist ..."; my @wordlist; open(WORDLIST,$wordfile) or die "Couldn't open word file: $wordfile\n"; while() { chomp; push @wordlist, $_; } print " complete\n"; print "Tallying terms in analyzed data file ..."; open(DATAFILE,$datafile) or die "Couldn't open analyzed dataset file: $datafile\n"; my @dataann; my $datahits = 0; my $datasize = 0; while() { chomp; next if !$_; my $temphit = &countwords(\@wordlist,$_); if ($temphit == 2) { $datahits++; # print "datahit: $_\n"; } $datasize++; } # print "datahits: $datahits\t$datasize: $datasize\n"; print " complete\n"; print "Tallying terms in full data file ..."; # remember line numbers of hits in annotation open(ANNLIST,$annfile) or die "Couldn't open full dataset file: $annfile\n"; my $fulllinecount = 0; my %linehits; while() { chomp; next if !$_; my $temphit = &countwords(\@wordlist,$_); # print "line: $fulllinecount\thit: $hits\n"; if ($temphit == 2) { # print "fullhit: $_\n"; $linehits{$fulllinecount} = 1; } $fulllinecount++; } # print "$fulllinecount\n"; print " complete\n"; open(OUTFILE,">$outfile") or die "Can't open output file: $outfile\n"; print OUTFILE "Significant Dataset\n"; print OUTFILE "\tHits\t$datahits\n"; print OUTFILE "\tTotal Dataset\t$datasize\n"; my @statdata; $| = 1; for (my $d = 0; $d < $iterations; $d++) { print "\b" x 60, "Generating and tallying random datasets ..."; print " iteration ", $d+1; my @int = &randint($fulllinecount,$datasize); my $fullhits = 0; foreach (@int) { if ($linehits{$_}) { $fullhits++; } } # print "fullhits: $fullhits\n"; push @statdata, $fullhits; } print " complete\n"; print "Generating stats and output file ..."; # GENERATE STATS my @stats = &calcdescstat(\@statdata); my $mean = $stats[1]; # my $stdev = $stats[2]; my $p_less_than_hits; for (my $c = 0; $c < $datahits; $c++) { my $tempval = ($mean ** $c) * exp(-$mean) / factorial($c); $p_less_than_hits += $tempval; } my $p_hits = ($mean ** $datahits) * exp(-$mean) / factorial($datahits); print OUTFILE "\tp(x <= $datahits)\t", $p_less_than_hits + $p_hits, "\n"; print OUTFILE "\tp(x = $datahits)\t$p_hits\n"; # OUTPUT STATS print OUTFILE "\nRandomized Annotation Set\n"; print OUTFILE "\n\tDistribution Statistics\n"; print OUTFILE "\t\tCount\t$stats[0]\n"; print OUTFILE "\t\tMean\t$stats[1]\n"; print OUTFILE "\t\tMin\t$stats[3]\n"; print OUTFILE "\t\tMax\t$stats[4]\n"; # HISTOGRAM BIN & FREQ OUTPUT my %histotally = (); foreach (@statdata) { $histotally{$_}++; } my $min = $stats[3]; my $max = $stats[4]; print OUTFILE "\n\tHistogram Tally\n"; print OUTFILE "\t\tHits\tFrequency\n"; for (my $c = $min; $c <= $max; $c++) { if ($histotally{$c}) { print OUTFILE "\t\t$c\t$histotally{$c}\n"; } else { print OUTFILE "\t\t$c\t0\n"; } } # RAW COUNTS OUTPUT if ($rawout) { print OUTFILE "\n\tRaw Counts\n"; print OUTFILE "\t\tHits\tTotal Dataset Size\n"; foreach (@statdata) { print OUTFILE "\t\t$_\t$datasize\n"; } } close OUTFILE; print " complete\n"; } sub randint { my @integers = (); my %intcheck; my $c = 0; while ($c <= $_[1]) { my $randint = int(rand($_[0])); next if $intcheck{$randint}; $intcheck{$randint} = 1; push @integers, $randint; $c++; } return @integers; } sub countwords { my $wordlistref = shift; my @wordlist = @{$wordlistref}; my $annotation = shift; my $hitcounter = 0; my $hit = 1; foreach my $word (@wordlist) { if ($annotation =~ /($word)/i) { next unless ($annotation =~ /\b$word/ || $annotation =~ /$word\b/); # print "$word\t$annotation\n"; $hit++; last; } } return $hit; # 1 = no match, 2 = match } sub countfullwords { my $wordlistref = shift; my @wordlist = @{$wordlistref}; my $annotation = shift; my $hitcounter = 0; my $logwinref = shift; my $hit = 1; foreach my $word (@wordlist) { if ($annotation =~ /($word)/i) { next unless ($annotation =~ /\b($word)/ || $annotation =~ /($word)\b/); # print "$word\t$annotation\n"; my $hitreport = "Found term ($1) in line $annotation\n"; # &printlog($hitreport,$logwinref); $hit++; last; } } return $hit; # 1 = no match, 2 = match } sub factorial { my $n = shift; return undef if $n < 0; return 1 if $n == 0; return $n * factorial($n - 1); } sub calcdescstat { my $dataref = shift; my @data = @{$dataref}; my $stat = Statistics::Descriptive::Full->new(); $stat->add_data(@data); my $count = $stat->count(); my $mean = $stat->mean(); my $stdev = $stat->standard_deviation(); my $min = $stat->min(); my $max = $stat->max(); my $p1 = $stat->percentile(1); my $p5 = $stat->percentile(5); my $p95 = $stat->percentile(95); my $p99 = $stat->percentile(99); return ($count,$mean,$stdev,$min,$max,$p1,$p5,$p95,$p99); } sub selectsinglefile { my $entryblankref = shift; my $entryblank = $$entryblankref; my $mwref = shift; my $mw = $$mwref; my $opentypes = [ "{All files} * ", "{PCL files} {.pcl} ", "{Text files} {.txt} ", ]; my $file = $mw->getOpenFile(-filetypes=>$opentypes); if (defined $file and $file ne '') { $entryblank->delete(0, 'end'); $entryblank->insert(0, $file); $entryblank->xview('end'); } }