#!/usr/bin/perl -w # LACK v 3.1 # lack30.pl # Copyright 2002-2005 Charlie Kim # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # For a copy of the GNU General Public License, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # This program uses the following modules # Perl/Tk 800.023 Copyright (c) 1995-1996 Nick Ing-Simmons. # http://starbase.neosoft.com/%7Eclaird/comp.lang.perl.tk/ptkFAQ.html # Statistics::Descriptive use strict; use Tk; use Tk::Label; use Tk::Button; use Tk::Checkbutton; use Tk::Frame; use Tk::Entry; use Tk::Radiobutton; use Tk::Scale; use Tk::Scrollbar; use Tk::Text; use Tk::ErrorDialog; use Tk::ProgressBar; use Tk::Balloon; use Tk::ROText; use Tk::NoteBook; use Statistics::Descriptive; use Getopt::Std; #use bignum; use Math::BigInt; use Math::BigFloat; our ($opt_w, $opt_f, $opt_a, $opt_i, $opt_r, $opt_o); getopts('w:f:a:i:r:o:'); my ($wordfile, $fullfile, $analyzedfile, $iter, $raw, $outfile); $wordfile = $opt_w if $opt_w; $fullfile = $opt_f if $opt_f; $analyzedfile = $opt_a if $opt_a; $outfile = $opt_o if $opt_o; if ($opt_i) { $iter = $opt_i; } else { $iter = 1000; } if ($opt_r) { $raw = 1; } else {$raw = 0; } if (!$wordfile || !$fullfile || !$analyzedfile || !$outfile) { &lackgui; } else { &lackcommand($wordfile,$analyzedfile,$fullfile,$iter,$raw,$outfile); } sub lackgui { my $mw = MainWindow->new(-title=>'LACK 3.1'); my $data = <Photo(-data => $data); $mw->Icon( -image => $image ); $mw->Label(-text=>'Lexical Analysis by Charlie Kim', )->pack(); # Input # my $inputfr = $mw->Frame(-relief=>'groove', -borderwidth=>2, )->pack(-side=>'top', -anchor=>'nw', -fill=>'both', ); my $wordfile_label = $inputfr->Label(-text=>'Wordlist File', )->grid(-row=>0,-column=> 0,-sticky=>'e'); my $wordfile_lab_bal = $mw->Balloon(-state=>'balloon', -background=>'white'); my $wordfilehelp = "A line-delimited list of words to search for.\nExample:\niron\nmanganese\nmagnesium\ncalcium\ndivalent"; $wordfile_lab_bal->attach($wordfile_label, -balloonmsg => $wordfilehelp, ); my $wordfile; my $wordfile_e = $inputfr->Entry(-state=>'normal', -textvariable=> \$wordfile, -width=>30, )->grid(-row=>0,-column=> 1,-sticky=>'w'); $inputfr->Button(-text=>'Browse', -command=> [ \&selectsinglefile, \$wordfile_e , \$mw ], -borderwidth=>1, -padx=> 0, -pady=> 0, )->grid(-row=>0,-column=> 2,-sticky=>'w'); my $full_label =$inputfr->Label(-text=>'Full Dataset File', )->grid(-row=>1,-column=> 0,-sticky=>'e'); my $full_lab_bal = $mw->Balloon(-state=>'balloon', -background=>'white'); my $fullhelp = "Full filtered dataset. Should be identical to the full dataset used for analysis (e.g. in SAM or Cluster)."; $full_lab_bal->attach($full_label, -balloonmsg => $fullhelp); my $annfile; my $annfile_e = $inputfr->Entry(-state=>'normal', -textvariable=> \$annfile, -width=>30, )->grid(-row=>1,-column=> 1,-sticky=>'w'); $inputfr->Button(-text=>'Browse', -command=> [ \&selectsinglefile, \$annfile_e, \$mw ], -borderwidth=>1, -padx=> 0, -pady=> 0, )->grid(-row=>1,-column=> 2,-sticky=>'w'); my $anal_label = $inputfr->Label(-text=>'Significant Genes File', )->grid(-row=>2,-column=> 0,-sticky=>'e'); my $anal_lab_bal = $mw->Balloon(-state=>'balloon', -background=>'white'); my $analhelp = "The top significant genes generated in an analysis (e.g. SAM or Cluster).\nShould have far fewer entries than the full dataset. Also known as the significant genes list (SGL)."; $anal_lab_bal->attach($anal_label, -balloonmsg=>$analhelp); my $datafile; my $datafile_e = $inputfr->Entry(-state=>'normal', -textvariable=> \$datafile, -width=>30, )->grid(-row=>2,-column=> 1,-sticky=>'w'); $inputfr->Button(-text=>'Browse', -command=> [ \&selectsinglefile, \$datafile_e, \$mw ], -borderwidth=>1, -padx=> 0, -pady=> 0, )->grid(-row=>2,-column=> 2,-sticky=>'w'); my $statfr = $mw->Frame(-relief=>'groove', -borderwidth=>2, )->pack(-side=>'top', -anchor=>'nw', -fill=>'both', ); my $stattype = 0; $statfr->Label(-text=>"Statistics")->grid(-row=>5,-column=>0,-sticky=>'e'); my $iter_e; my $raw_c; my $iter_label; my $raw_label; my $binbut = $statfr->Radiobutton(-text=> "Binomial", -variable=> \$stattype, -value=> 0, -borderwidth=>0, -command=> [\&modstate,\$iter_e,\$raw_c,\$stattype,\$iter_label,\$raw_label ], )->grid(-row=>5, -column=>1, -sticky=>'w', ); my $poibut = $statfr->Radiobutton(-text=> "Poisson", -variable=> \$stattype, -value=> 1, -borderwidth=>0, -command=> [\&modstate,\$iter_e,\$raw_c,\$stattype,\$iter_label,\$raw_label ], )->grid(-row=>6, -column=>1, -sticky=>'w', ); my $iterations = 1000; $iter_label = $statfr->Label(-text=>'Iterations', # -foreground=>'SystemDisabledText', )->grid(-row=>10, -column=>1, -sticky=>'e', ); # portability ok if ($^O =~ /mswin/i) { $iter_label->configure(-foreground=>'SystemDisabledText'); } else { $iter_label->configure(-foreground=>'grey'); } my $iter_lab_bal = $mw->Balloon(-state=>'balloon', -background=>'white'); my $iterhelp = "The number of random datasets to generate."; $iter_lab_bal->attach($iter_label, -balloonmsg=>$iterhelp); $iter_e = $statfr->Entry(-state=>'normal', -textvariable=> \$iterations, -width=>10, -state=>'disabled', # -foreground=>'SystemDisabledText', # -background=>'SystemButtonFace', )->grid(-row=>10, -column=>2, -sticky=>'w', ); # portability ok if ($^O =~ /mswin/i) { $iter_e->configure(-foreground=>'SystemDisabledText', -background=>'SystemButtonFace', ); } else { $iter_e->configure(-foreground=>'grey', -background=>'grey', ); } my $rawout = 0; $raw_label = $statfr->Label(-text=>"Raw Counts Output", # -foreground=>'SystemDisabledText', )->grid(-row=>11,-column=>1,-sticky=>'e'); # portability ok if ($^O =~ /mswin/i) { $raw_label->configure(-foreground=>'SystemDisabledText', ); } else { $raw_label->configure(-foreground=>'grey', ); } my $raw_lab_bal = $mw->Balloon(-state=>'balloon', -background=>'white'); my $rawhelp = "Include raw tally values from the random datasets in the output."; $raw_lab_bal->attach($raw_label, -balloonmsg=>$rawhelp); $raw_c = $statfr->Checkbutton(-variable => \$rawout, -state=>'disabled' )->grid(-row=>11,-column=>2,-sticky=>'w'); # Run # my $percent_done = 0; my $progress = $mw->ProgressBar(-width => 8, -blocks => 100, -gap => 0, -colors => [0, 'darkblue'], -variable => \$percent_done, -borderwidth => 2, # -troughcolor => 'SystemButtonFace', )->pack(-fill => 'x', ); #portability ok if ($^O =~ /mswin/i) { $progress->configure(-troughcolor=>'SystemButtonFace', ); } else { $progress->configure(-troughcolor=>'grey', ); } # log window my $log_f = $mw->Frame(-relief=>'groove', -borderwidth=>2, )->pack(-fill=>'both', -expand=>1); my $logwin = $log_f->Scrolled("Text")->pack(-fill=>'both', -expand=>1); $logwin->configure(-height=>15, -width=>50, -wrap=>'none', -state=>'disabled', ); # text options my $textopt_f = $mw->Frame(-relief=>'groove', -borderwidth=>2, )->pack(-fill=>'both'); my $verbose = 0; my $verb_label = $textopt_f->Label(-text=>"Verbose matching", )->grid(-row=>0, -column=>0, -sticky=>'e'); my $verb_lab_bal = $textopt_f->Balloon(-state=>'balloon', -background=>'white'); my $verb_help = "Displays word matches found in the annotation in the results box."; $verb_lab_bal->attach($verb_label, -balloonmsg=>$verb_help); my $verb_c = $textopt_f->Checkbutton(-variable => \$verbose, )->grid(-row=>0, -column=>1, -sticky=>'w'); $textopt_f->Button(-text=>"Save Results", -command=> [ \&saveresults,\$mw,\$logwin ], -state=> 'normal', )->grid(-row=>0, -column=>2, -sticky=>'e'); my $statusb = $mw->ROText(-height=>1, -width=>60, -wrap=>'word', # -background=>'SystemButtonFace', -relief=>'flat', -border=>0, )->pack(-side=>'left', -anchor=>'sw'); #portability ok if ($^O =~ /mswin/i) { $statusb->configure(-background=>'SystemButtonFace'); } else { $statusb->configure(-background=>'grey'); } $statusb->insert('end','Idle'); $mw->Button(-text=>"Run LACK", -command=> [ \&lackguimain,\$wordfile,\$datafile,\$annfile,\$iterations,\$mw,\$percent_done,\$rawout,\$logwin,\$stattype,\$verbose,\$statusb ], -state=> 'normal', )->pack(-pady=>4); $mw->Button(-text=>"Help", -command=> [\&makehelp,\$mw], )->pack(-pady=>4); $mw->Button(-text=>"Exit", -command=> sub { exit }, )->pack(-pady=>2); MainLoop; } sub makehelp { my $mwref = shift; my $mw = $$mwref; my $helpwin = MainWindow->new(-title=>"LACK Help"); $helpwin->focusForce; my $data = <Photo(-data => $data); $helpwin->Icon( -image => $image ); $helpwin->Button(-text=>"Close Help", -command=> sub { $helpwin->destroy; }, )->pack(-pady=>10); my $hf = $helpwin->Frame()->pack(-fill=>'both', -expand=>1, ); my $hn = $hf->NoteBook()->pack(-fill=>'both', -expand=>1, ); my $overview = $hn->add("overview", -label=>"Overview", ); my $files = $hn->add("files", -label=>"File Formats", ); my $options = $hn->add("options", -label=>"Commands and Options", ); my $stats = $hn->add("stats", -label=>"Statistics", ); my $warn = $hn->add("warn", -label=>"Warnings", ); my $add = $hn->add("add", -label=>"Additional Help", ); my $ov_t = $overview->Scrolled('ROText', -wrap=>'word', -scrollbars=>'osoe', )->pack(-fill=>'both', -expand=>1, ); my $file_t = $files->Scrolled('ROText', -wrap=>'word', -scrollbars=>'osoe', )->pack(-fill=>'both', -expand=>1, ); my $opt_t = $options->Scrolled('ROText', -wrap=>'word', -scrollbars=>'osoe', )->pack(-fill=>'both', -expand=>1, ); my $stat_t = $stats->Scrolled('ROText', -wrap=>'word', -scrollbars=>'osoe', )->pack(-fill=>'both', -expand=>1, ); my $warn_t = $warn->Scrolled('ROText', -scrollbars=>'osoe', -wrap=>'word', )->pack(-fill=>'both', -expand=>1, ); my $add_t = $add->Scrolled('ROText', -scrollbars=>'osoe', -wrap=>'word', )->pack(-fill=>'both', -expand=>1, ); my $ov_data = <insert('end',$ov_data); $file_t->insert('end',$file_data); $opt_t->insert('end',$opt_data); $stat_t->insert('end',$stat_data); $warn_t->insert('end',$warn_data); $add_t->insert('end',$add_data); } sub saveresults { my $mwref = shift; my $mw = $$mwref; my $logwinref = shift; # fix portability my $outfile = $mw->getSaveFile(-title=>'Save Results As'); open(OUTFILE,">$outfile") or die "Can't open output file $outfile\n"; print OUTFILE $$logwinref->get("1.0","end"); close OUTFILE; $mw->messageBox(-title => 'Finished', -message => 'Results Saved', -type => 'OK'); } sub modstate { my $iter_ref = shift; my $raw_ref = shift; my $stateref = shift; my $iter_l_ref = shift; my $raw_l_ref = shift; # portability ok if ($^O =~ /mswin/i) { if ($$stateref == 0) { $$iter_ref->configure(-state=>'disabled'); $$iter_ref->configure(-background=>'SystemButtonFace'); $$iter_ref->configure(-foreground=>'SystemDisabledText'); $$raw_ref->configure(-state=>'disabled'); $$iter_l_ref->configure(-foreground=>'SystemDisabledText'); $$raw_l_ref->configure(-foreground=>'SystemDisabledText'); } elsif ($$stateref == 1) { $$iter_ref->configure(-state=>'normal'); $$iter_ref->configure(-background=>'white'); $$iter_ref->configure(-foreground=>'SystemButtonText'); $$raw_ref->configure(-state=>'normal'); $$iter_l_ref->configure(-foreground=>'SystemButtonText'); $$raw_l_ref->configure(-foreground=>'SystemButtonText'); } else {die "Problem with statistics state\n";} } else { if ($$stateref == 0) { $$iter_ref->configure(-state=>'disabled'); $$iter_ref->configure(-background=>'lightgrey'); $$iter_ref->configure(-foreground=>'grey'); $$raw_ref->configure(-state=>'disabled'); $$iter_l_ref->configure(-foreground=>'grey'); $$raw_l_ref->configure(-foreground=>'grey'); } elsif ($$stateref == 1) { $$iter_ref->configure(-state=>'normal'); $$iter_ref->configure(-background=>'white'); $$iter_ref->configure(-foreground=>'black'); $$raw_ref->configure(-state=>'normal'); $$iter_l_ref->configure(-foreground=>'black'); $$raw_l_ref->configure(-foreground=>'black'); } else {die "Problem with statistics state\n";} } } sub printlog { my $line = shift; my $logwinref = shift; my $logwin = $$logwinref; $logwin->configure(-state=>'normal'); $logwin->insert('end',$line); $logwin->yview(moveto=>1); # $logwin->configure(-state=>'disabled'); $logwin->update(); } sub lackguimain { my $wordfileref = shift; my $wordfile = $$wordfileref; my $datafileref = shift; my $datafile = $$datafileref; my $annfileref = shift; my $annfile = $$annfileref; my $iterationsref = shift; my $iterations = $$iterationsref; my $mwref = shift; my $mw = $$mwref; my $progref = shift; my $rawoutref = shift; my $rawout = $$rawoutref; my $logwinref = shift; my $logwin = $$logwinref; my $stattyperef = shift; my $verboseref = shift; my $statusbref = shift; $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Clearing results'); $mw->update; $logwin->configure(-state=>'normal'); $logwin->delete('1.0','end'); $logwin->configure(-state=>'disabled'); $logwin->update(); if ($$stattyperef == 0) { &lackguimain_binomial($wordfileref,$datafileref,$annfileref,$iterationsref,$mwref,$progref,$rawoutref,$logwinref,$verboseref,$statusbref); } elsif ($$stattyperef == 1) { &lackguimain_poisson($wordfileref,$datafileref,$annfileref,$iterationsref,$mwref,$progref,$rawoutref,$logwinref,$verboseref,$statusbref); } else { die "Statistics type not specified or not found\n"; } $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Idle'); $mw->update; $$progref = 0; } sub lackguimain_binomial { my $wordfileref = shift; my $wordfile = $$wordfileref; my $datafileref = shift; my $datafile = $$datafileref; my $annfileref = shift; my $annfile = $$annfileref; my $iterationsref = shift; my $iterations = $$iterationsref; my $mwref = shift; my $mw = $$mwref; my $progref = shift; my $rawoutref = shift; my $rawout = $$rawoutref; my $logwinref = shift; my $logwin = $$logwinref; my $verboseref = shift; my $statusbref = shift; $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Beginning binomial analysis'); $mw->update; if (!$wordfile) { $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Error'); die "No word list file specified\n"; return; } if (!$datafile) { $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Error'); die "No dataset file specified\n"; return; } if (!$annfile) { $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Error'); die "No annotation file specified\n"; return; } $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Reading search terms'); $mw->update; my @wordlist; open(WORDLIST,$wordfile) or die "Couldn't open word file: $wordfile\n"; while() { chomp; $_ =~ s/\r//; next if !$_; push @wordlist, $_; } $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Analyzing SGL'); $mw->update; open(DATAFILE,$datafile) or die "Couldn't open analyzed dataset file: $datafile\n"; my @dataann; my $datahits = 0; my $datasize = 0; while() { chomp; $_ =~ s/\r//; next if !$_; my $temphit = &countwords(\@wordlist,$_); if ($temphit == 2) { $datahits++; # print "datahit: $_\n"; } $datasize++; } # print "datahits: $datahits\t$datasize: $datasize\n"; # remember line numbers of hits in annotation $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Analyzing full dataset'); $mw->update; open(ANNLIST,$annfile) or die "Couldn't open full dataset file: $annfile\n"; my $prelimcount = 0; while() { next if !$_; $prelimcount++; } seek ANNLIST,0,0; my $fulllinecount = 0; my %linehits; my $fullhits = 0; my $prog = $$progref; my $lastprog = 0; while() { chomp; $_ =~ s/\r//; next if !$_; my $temphit = &countfullwords(\@wordlist,$_,$logwinref,$verboseref,$fulllinecount); # print "line: $fulllinecount\thit: $hits\n"; if ($temphit == 2) { # print "fullhit: $_\n"; $linehits{$fulllinecount} = 1; $fullhits++; } $fulllinecount++; $prog = int(100 * $fulllinecount / $prelimcount); # print "$prog\n"; if ($prog % 20 == 0) { $$progref = $prog unless $lastprog == $prog; $mw->update; } $lastprog = $prog; } # print "$fulllinecount\n"; &printlog("\n",$logwinref) if $$verboseref; $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Calculating statistics'); $mw->update; &printlog("Significant Dataset\n",$logwinref); &printlog("\tHits\t$datahits\n",$logwinref); &printlog("\tTotal Dataset\t$datasize\n",$logwinref); my @pval; my @histo; my $n = $datasize; my $X = 0; my $p = Math::BigFloat->new($fullhits); $p->bdiv($fulllinecount); my $q = Math::BigFloat->new('1'); $q->bsub($p); my $cumulative = Math::BigFloat->new('0'); my $massdensity = 0; my $curve_side_flag = 0; while () { $$statusbref->delete('1.0','end'); $$statusbref->insert('end',"Calculating binomial term for $X hits"); $mw->update; my $n_over_n_minus_x = Math::BigInt->new('1'); my $xfact = Math::BigInt->new('1'); for (my $c = $n; $c > $n - $X; $c--) { $n_over_n_minus_x->bmul($c); } for (my $d = 1; $d <= $X; $d++) { $xfact->bmul($d); } my $nCxval = Math::BigFloat->new($n_over_n_minus_x); $nCxval->bdiv($xfact); my $term2 = Math::BigFloat->new($p); $term2->bpow($X); my $term3 = Math::BigFloat->new($q); $term3->bpow($n-$X); my $temp_p = Math::BigFloat->new($nCxval); $temp_p->bmul($term2); $temp_p->bmul($term3); my $temp_histo = $temp_p * 100; # 100 is arbitrary my $temp_p2 = sprintf("%.10f",$temp_p); $temp_p2 = '<0.0000000001' if $temp_p2 == 0; $massdensity = $temp_p2 if $X == $datahits; $cumulative += $temp_p if $X <= $datahits; $temp_histo = &myround($temp_histo); push @pval, $temp_p2; push @histo, $temp_histo; if ($temp_p > 0.01 && $curve_side_flag == 0) { $curve_side_flag++; } last if $temp_p < 0.00001 && $curve_side_flag && $X >= $datahits; # arbitrary cutoff $X++; } $cumulative->ffround(-10); my $invcumul = 1 - $cumulative; $cumulative = $cumulative->bstr; if ($cumulative == 1) { $cumulative = '>0.9999999999'; } if ($massdensity == 0) { $massdensity = '<0.0000000001'; } if ($invcumul == 0) { $invcumul = '<0.0000000001'; } &printlog("\tp(x <= $datahits)\t$cumulative\n",$logwinref); &printlog("\tp(x = $datahits)\t$massdensity\n",$logwinref); &printlog("\tp(x > $datahits)\t$invcumul\n",$logwinref); # OUTPUT STATS &printlog("\nCalculated Binomial Distribution\n",$logwinref); &printlog("\n\tDistribution Statistics\n",$logwinref); my $mean = $n*$p; $mean = sprintf("%.10f",$mean); &printlog("\t\tMean\t$mean\n",$logwinref); # HISTOGRAM BIN & FREQ OUTPUT &printlog("\n\tHistogram Tally\n",$logwinref); &printlog("\t\tHits\tExpected frequency in 100 samples\tp(x = Hits)\n",$logwinref); for (my $c = 0; $c<= $#histo; $c++) { &printlog("\t\t$c\t$histo[$c]\t$pval[$c]\n",$logwinref); } # close OUTFILE; $$progref = 100; $mw->update; $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Analysis complete'); $mw->update; $mw->messageBox(-title => 'Program Finished', -message => 'Processing complete', -type => 'OK'); } sub myround { my $num = shift; my $intnum = int($num); my $remain = $num - $intnum; return $intnum if $remain < 0.5; return $intnum+1; } sub lackguimain_poisson { my $wordfileref = shift; my $wordfile = $$wordfileref; my $datafileref = shift; my $datafile = $$datafileref; my $annfileref = shift; my $annfile = $$annfileref; my $iterationsref = shift; my $iterations = $$iterationsref; my $mwref = shift; my $mw = $$mwref; my $progref = shift; my $rawoutref = shift; my $rawout = $$rawoutref; my $logwinref = shift; my $logwin = $$logwinref; my $verboseref = shift; my $statusbref = shift; my $poissonconfirm = $mw->messageBox(-title => 'Confirm Poisson', -message => 'This version of LACK contains an update that allows processing of very large factorials, allowing use of binomial statistics even for very large datasets. However, Poisson statistics, while slightly less accurate, can be computed faster. Are you sure you wish to use Poisson statistics?', -type => 'YesNo', -default => 'yes'); # print "$poissonconfirm\n"; if ($poissonconfirm =~ /no/i) { return; } $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Beginning Poisson analysis'); $mw->update; if (!$wordfile) { $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Error'); die "No word list file specified\n"; return; } if (!$datafile) { $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Error'); die "No dataset file specified\n"; return; } if (!$annfile) { $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Error'); die "No annotation file specified\n"; return; } if (!$iterations) { $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Error'); die "Must specify non-zero number of iterations\n"; return; } if ($iterations =~ /\D/) { $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Error'); die "Iterations must be numeric\n"; return; } if ($iterations < 100) { my $answer = $mw->messageBox(-title => 'Warning', -type => 'YesNo', -default => 'no', -message => "This program uses a statistical parameter which assumes many iterations. You have chosen to use only $iterations iterations, which could cause inaccurate statistical results. Do you wish to proceed?"); unless ($answer eq 'yes') { $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Error'); return; } } $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Reading search terms'); $mw->update; my @wordlist; open(WORDLIST,$wordfile) or die "Couldn't open word file: $wordfile\n"; while() { chomp; $_ =~ s/\r//; next if !$_; push @wordlist, $_; } $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Analyzing SGL'); $mw->update; open(DATAFILE,$datafile) or die "Couldn't open analyzed dataset file: $datafile\n"; my @dataann; my $datahits = 0; my $datasize = 0; while() { chomp; $_ =~ s/\r//; next if !$_; my $temphit = &countwords(\@wordlist,$_); if ($temphit == 2) { $datahits++; # print "datahit: $_\n"; } $datasize++; } # print "datahits: $datahits\t$datasize: $datasize\n"; $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Analyzing full dataset'); $mw->update; $$progref = 1; # remember line numbers of hits in annotation open(ANNLIST,$annfile) or die "Couldn't open full dataset file: $annfile\n"; my $prelimcount = 0; while() { next if !$_; $prelimcount++; } seek ANNLIST,0,0; my $fulllinecount = 0; my %linehits; my $prog = $$progref; my $lastprog = 0; while() { chomp; $_ =~ s/\r//; next if !$_; my $temphit = &countfullwords(\@wordlist,$_,$logwinref,$verboseref,$fulllinecount); # print "line: $fulllinecount\thit: $hits\n"; if ($temphit == 2) { # print "fullhit: $_\n"; $linehits{$fulllinecount} = 1; } $prog = int(100 * $fulllinecount / $prelimcount); # print "$prog\n"; if ($prog % 20 == 0) { $$progref = $prog unless $prog == $lastprog; $mw->update; } $lastprog = $prog; $fulllinecount++; } # print "$fulllinecount\n"; &printlog("\n",$logwinref) if $$verboseref; # $$progref = 1; # $mw->update; $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Generating random samples'); $mw->update; # my $outfile = $mw->getSaveFile(-title=>'Save Results As'); # open(OUTFILE,">$outfile") or die "Can't open output file $outfile\n"; &printlog("Significant Dataset\n",$logwinref); &printlog("\tHits\t$datahits\n",$logwinref); &printlog("\tTotal Dataset\t$datasize\n",$logwinref); my @statdata; for (my $d = 0; $d < $iterations; $d++) { my @int = &randint($fulllinecount,$datasize); my $fullhits = 0; foreach (@int) { if ($linehits{$_}) { $fullhits++; } } # print "fullhits: $fullhits\n"; push @statdata, $fullhits; } # GENERATE STATS $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Calculating statistics'); $mw->update; my @stats = &calcdescstat(\@statdata); my $mean = $stats[1]; # my $stdev = $stats[2]; my $p_less_than_hits; for (my $c = 0; $c < $datahits; $c++) { my $tempval = ($mean ** $c) * exp(-$mean) / factorial($c); $p_less_than_hits += $tempval; } my $p_hits = ($mean ** $datahits) * exp(-$mean) / factorial($datahits); my $ptotal = $p_less_than_hits + $p_hits; &printlog("\tp(x <= $datahits)\t$ptotal\n",$logwinref); &printlog("\tp(x = $datahits)\t$p_hits\n",$logwinref); # OUTPUT STATS &printlog("\nPoisson Random Sampling\n",$logwinref); &printlog("\n\tDistribution Statistics\n",$logwinref); &printlog("\t\tCount\t$stats[0]\n",$logwinref); &printlog("\t\tMean\t$stats[1]\n",$logwinref); &printlog("\t\tMin\t$stats[3]\n",$logwinref); &printlog("\t\tMax\t$stats[4]\n",$logwinref); # HISTOGRAM BIN & FREQ OUTPUT my %histotally = (); foreach (@statdata) { $histotally{$_}++; } my $min = $stats[3]; my $max = $stats[4]; &printlog("\n\tHistogram Tally\n",$logwinref); &printlog("\t\tHits\tFrequency\n",$logwinref); for (my $c = $min; $c <= $max; $c++) { if ($histotally{$c}) { &printlog("\t\t$c\t$histotally{$c}\n",$logwinref); } else { &printlog("\t\t$c\t0\n",$logwinref); } } # RAW COUNTS OUTPUT if ($rawout) { my $rawout = $mw->getSaveFile(-title=>'Save Raw Poisson Results As'); open(RAWOUT,">$rawout") or die "Can't open output file $outfile\n"; print RAWOUT "Raw Poisson Sample Counts\n"; print RAWOUT "\tHits\tTotal Dataset Size\n"; foreach (@statdata) { print RAWOUT "\t$_\t$datasize\n"; } close RAWOUT; } $$progref = 100; $mw->update; $$statusbref->delete('1.0','end'); $$statusbref->insert('end','Analysis complete'); $mw->update; $mw->messageBox(-title => 'Program Finished', -message => 'Processing complete', -type => 'OK'); } sub lackcommand { my ($wordfile,$datafile,$annfile,$iterations,$rawout,$outfile) = @_; if (!$wordfile) { die "No word list file specified\n"; return; } if (!$datafile) { die "No dataset file specified\n"; return; } if (!$annfile) { die "No annotation file specified\n"; return; } if (!$iterations) { die "Must specify non-zero number of iterations\n"; return; } if ($iterations =~ /\D/) { die "Iterations must be numeric\n"; return; } if (!$outfile) { die "No output file specified\n"; return; } print "Reading wordlist ..."; my @wordlist; open(WORDLIST,$wordfile) or die "Couldn't open word file: $wordfile\n"; while() { chomp; push @wordlist, $_; } print " complete\n"; print "Tallying terms in analyzed data file ..."; open(DATAFILE,$datafile) or die "Couldn't open analyzed dataset file: $datafile\n"; my @dataann; my $datahits = 0; my $datasize = 0; while() { chomp; next if !$_; my $temphit = &countwords(\@wordlist,$_); if ($temphit == 2) { $datahits++; # print "datahit: $_\n"; } $datasize++; } # print "datahits: $datahits\t$datasize: $datasize\n"; print " complete\n"; print "Tallying terms in full data file ..."; # remember line numbers of hits in annotation open(ANNLIST,$annfile) or die "Couldn't open full dataset file: $annfile\n"; my $fulllinecount = 0; my %linehits; while() { chomp; next if !$_; my $temphit = &countwords(\@wordlist,$_); # print "line: $fulllinecount\thit: $hits\n"; if ($temphit == 2) { # print "fullhit: $_\n"; $linehits{$fulllinecount} = 1; } $fulllinecount++; } # print "$fulllinecount\n"; print " complete\n"; open(OUTFILE,">$outfile") or die "Can't open output file: $outfile\n"; print OUTFILE "Significant Dataset\n"; print OUTFILE "\tHits\t$datahits\n"; print OUTFILE "\tTotal Dataset\t$datasize\n"; my @statdata; $| = 1; for (my $d = 0; $d < $iterations; $d++) { print "\b" x 60, "Generating and tallying random datasets ..."; print " iteration ", $d+1; my @int = &randint($fulllinecount,$datasize); my $fullhits = 0; foreach (@int) { if ($linehits{$_}) { $fullhits++; } } # print "fullhits: $fullhits\n"; push @statdata, $fullhits; } print " complete\n"; print "Generating stats and output file ..."; # GENERATE STATS my @stats = &calcdescstat(\@statdata); my $mean = $stats[1]; # my $stdev = $stats[2]; my $p_less_than_hits; for (my $c = 0; $c < $datahits; $c++) { my $tempval = ($mean ** $c) * exp(-$mean) / factorial($c); $p_less_than_hits += $tempval; } my $p_hits = ($mean ** $datahits) * exp(-$mean) / factorial($datahits); print OUTFILE "\tp(x <= $datahits)\t", $p_less_than_hits + $p_hits, "\n"; print OUTFILE "\tp(x = $datahits)\t$p_hits\n"; # OUTPUT STATS print OUTFILE "\nRandomized Annotation Set\n"; print OUTFILE "\n\tDistribution Statistics\n"; print OUTFILE "\t\tCount\t$stats[0]\n"; print OUTFILE "\t\tMean\t$stats[1]\n"; print OUTFILE "\t\tMin\t$stats[3]\n"; print OUTFILE "\t\tMax\t$stats[4]\n"; # HISTOGRAM BIN & FREQ OUTPUT my %histotally = (); foreach (@statdata) { $histotally{$_}++; } my $min = $stats[3]; my $max = $stats[4]; print OUTFILE "\n\tHistogram Tally\n"; print OUTFILE "\t\tHits\tFrequency\n"; for (my $c = $min; $c <= $max; $c++) { if ($histotally{$c}) { print OUTFILE "\t\t$c\t$histotally{$c}\n"; } else { print OUTFILE "\t\t$c\t0\n"; } } # RAW COUNTS OUTPUT if ($rawout) { print OUTFILE "\n\tRaw Counts\n"; print OUTFILE "\t\tHits\tTotal Dataset Size\n"; foreach (@statdata) { print OUTFILE "\t\t$_\t$datasize\n"; } } close OUTFILE; print " complete\n"; } sub randint { my @integers = (); my %intcheck; my $c = 0; while ($c <= $_[1]) { my $randint = int(rand($_[0])); next if $intcheck{$randint}; $intcheck{$randint} = 1; push @integers, $randint; $c++; } return @integers; } sub countwords { my $wordlistref = shift; my @wordlist = @{$wordlistref}; my $annotation = shift; my $hitcounter = 0; my $hit = 1; foreach my $word (@wordlist) { if ($annotation =~ /($word)/i) { next unless ($annotation =~ /\b$word/ || $annotation =~ /$word\b/); # print "$word\t$annotation\n"; $hit++; last; } } return $hit; # 1 = no match, 2 = match } sub countfullwords { my $wordlistref = shift; my @wordlist = @{$wordlistref}; my $annotation = shift; my $hitcounter = 0; my $logwinref = shift; my $verboseref = shift; my $linecount = shift; my $hit = 1; foreach my $word (@wordlist) { if ($annotation =~ /($word)/i) { next unless ($annotation =~ /\b($word)/ || $annotation =~ /($word)\b/); # print "$word\t$annotation\n"; my $hitreport = "Found term ($1) in line $linecount $annotation\n"; &printlog($hitreport,$logwinref) if $$verboseref; $hit++; last; } } return $hit; # 1 = no match, 2 = match } sub factorial { my $n = shift; return undef if $n < 0; return 1 if $n == 0; return $n * factorial($n - 1); } sub calcdescstat { my $dataref = shift; my @data = @{$dataref}; my $stat = Statistics::Descriptive::Full->new(); $stat->add_data(@data); my $count = $stat->count(); my $mean = $stat->mean(); my $stdev = $stat->standard_deviation(); my $min = $stat->min(); my $max = $stat->max(); my $p1 = $stat->percentile(1); my $p5 = $stat->percentile(5); my $p95 = $stat->percentile(95); my $p99 = $stat->percentile(99); return ($count,$mean,$stdev,$min,$max,$p1,$p5,$p95,$p99); } sub selectsinglefile { my $entryblankref = shift; my $entryblank = $$entryblankref; my $mwref = shift; my $mw = $$mwref; my $file; if ($^O =~ /mswin/i) { my $opentypes = [ "{All files} * ", "{PCL files} {.pcl} ", "{Text files} {.txt} ", ]; $file = $mw->getOpenFile(-filetypes=>$opentypes); } else { $file = $mw->getOpenFile(-title=>'Load File'); } if (defined $file and $file ne '') { $entryblank->delete(0, 'end'); $entryblank->insert(0, $file); $entryblank->xview('end'); } }