#!/usr/bin/perl -w # LACK v 4.2 # lack42.pl # Copyright 2002-2005 Charlie Kim # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # For a copy of the GNU General Public License, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # This program uses the following modules # Perl/Tk 800.023 Copyright (c) 1995-1996 Nick Ing-Simmons. # http://starbase.neosoft.com/%7Eclaird/comp.lang.perl.tk/ptkFAQ.html # Statistics::Descriptive # Version History # 4.1 integrated automated wordlist generation # change p-value reporting # single-term list search versus all terms search # 4.3 updated mac newline handling use strict; use Tk; use Tk::ProgressBar; use Tk::Balloon; use Tk::ROText; use Tk::NoteBook; use Tk::LabFrame; use Tk::LabEntry; use Tk::NumEntry; use Statistics::Descriptive; use Getopt::Std; use Math::BigInt; use Math::BigFloat; if (!@ARGV) { &lackgui; } else { our ($opt_w, $opt_f, $opt_a, $opt_t, $opt_s, $opt_i, $opt_r, $opt_c, $opt_v, $opt_n, $opt_x); getopts('w:f:a:s:i:r:c:v:t:'); my ($iter, $raw, $minCount, $maxCount); if ($opt_i) { $iter = $opt_i; } else { $iter = 1000; } if ($opt_r) { $raw = 1; } else { $raw = 0; } if ($opt_n) { $minCount = $opt_n; } else { $minCount = 3; } if ($opt_x) { $maxCount = $opt_x; } else { $maxCount = 100; } die "Usage: lack42.pl [options]\n-w \n-f \n-a \n-t \n\t0=analyze all terms together\n\t1=analyze each term individually\n\t2=analyze each term present in analyzed dataset individually\n-s \n\t0=binomial\n\t1=poisson\n-i \n-r \n-c \n\t0=not\n\t1=case-sensitive\n-v \n\t0=off\n\t1=on\n-n \n-x \n" if (!$opt_w || !$opt_f || !$opt_a || $opt_t eq '' || $opt_s eq '' || $opt_c eq '' || $opt_v eq ''); &lackmain(\$opt_w,\$opt_f,\$opt_a,\$iter,0,0,\$raw,0,\$opt_s,\$opt_v,0,\$opt_t,\$minCount,\$maxCount,\$opt_c,0); } sub lackgui { my $mw = MainWindow->new(-title=>'LACK 4.2'); my $data = <Photo(-data => $data); $mw->Icon( -image => $image ); $mw->Label(-text=>"Lexical Analysis\nCopyright 2002-2005 Charlie Kim", )->pack(); # Input # my $analysisfr = $mw->LabFrame(-label => 'Analysis Type', -labelside => 'acrosstop', -relief=>'groove', -borderwidth=>2, )->pack(-side=>'top', -anchor=>'nw', -fill=>'both', ); my $analysisType = 0; my $wordfile; my $wordfile_e; my $wordfile_label; my $wordfile_but; my $minCount = 3; my $maxCount = 100; my ($minCount_l, $minCount_e, $maxCount_l, $maxCount_e); my $allSearchbut = $analysisfr->Radiobutton(-text=> "Search for all terms (words analyzed together, with a single p-value)", -variable=> \$analysisType, -value=> 0, -borderwidth=>0, -command=> [\&analysisState,\$wordfile_e,\$wordfile_label,\$wordfile_but,\$analysisType,\$minCount_l,\$minCount_e,\$maxCount_l,\$maxCount_e ], )->grid(-row=>0, -column=>0, -sticky=>'w', ); my $singleSearchbut = $analysisfr->Radiobutton(-text=>'Search each term (each word analyzed individually, with separate p-values)', -variable=> \$analysisType, -value=> 1, -borderwidth=>0, -command=> [\&analysisState,\$wordfile_e,\$wordfile_label,\$wordfile_but,\$analysisType,\$minCount_l,\$minCount_e,\$maxCount_l,\$maxCount_e ], )->grid(-row=>1, -column=>0, -sticky=>'w', ); my $autoSearchbut = $analysisfr->Radiobutton(-text=>'Derive search terms from analyzed dataset (SGL)', -variable=> \$analysisType, -value=> 2, -borderwidth=>0, -command=> [\&analysisState,\$wordfile_e,\$wordfile_label,\$wordfile_but,\$analysisType,\$minCount_l,\$minCount_e,\$maxCount_l,\$maxCount_e ], )->grid(-row=>2, -column=>0, -sticky=>'w', ); $minCount_l = $analysisfr->Label(-text=>'Minimum Word Count', -state=>'disabled', )->grid(-row=>3, -column=>0, -sticky=>'e', ); $minCount_e = $analysisfr->NumEntry(-textvariable=> \$minCount, -width=> 5, -state=>'disabled', )->grid(-row=>3, -column=>1, -sticky=>'w', ); $maxCount_l = $analysisfr->Label(-text=>'Maximum Word Count', -state=>'disabled', )->grid(-row=>4, -column=>0, -sticky=>'e', ); $maxCount_e = $analysisfr->NumEntry(-textvariable=> \$maxCount, -width=>5, -state=>'disabled', )->grid(-row=>4, -column=>1, -sticky=>'w', ); my $inputfr = $mw->LabFrame(-label => 'Files', -labelside => 'acrosstop', -relief=>'groove', -borderwidth=>2, )->pack(-side=>'top', -anchor=>'nw', -fill=>'both', ); $wordfile_label = $inputfr->Label(-text=>'Wordlist File', )->grid(-row=>0,-column=> 0,-sticky=>'e'); my $wordfile_lab_bal = $mw->Balloon(-state=>'balloon', -background=>'white'); my $wordfilehelp = "A line-delimited list of words to search for.\nExample:\niron\nmanganese\nmagnesium\ncalcium\ndivalent"; $wordfile_lab_bal->attach($wordfile_label, -balloonmsg => $wordfilehelp, ); $wordfile_e = $inputfr->Entry(-state=>'normal', -textvariable=> \$wordfile, -width=>30, )->grid(-row=>0,-column=> 1,-sticky=>'w'); $wordfile_but = $inputfr->Button(-text=>'Browse', -command=> [ \&selectsinglefile, \$wordfile_e , \$mw ], -borderwidth=>1, -padx=> 0, -pady=> 0, )->grid(-row=>0,-column=> 2,-sticky=>'w'); my $full_label =$inputfr->Label(-text=>'Full Dataset File', )->grid(-row=>1,-column=> 0,-sticky=>'e'); my $full_lab_bal = $mw->Balloon(-state=>'balloon', -background=>'white'); my $fullhelp = "Full dataset. Should be identical to the full dataset used for the original analysis (e.g. in SAM or Cluster)."; $full_lab_bal->attach($full_label, -balloonmsg => $fullhelp); my $fullFile; my $annfile_e = $inputfr->Entry(-state=>'normal', -textvariable=> \$fullFile, -width=>30, )->grid(-row=>1,-column=> 1,-sticky=>'w'); $inputfr->Button(-text=>'Browse', -command=> [ \&selectsinglefile, \$annfile_e, \$mw ], -borderwidth=>1, -padx=> 0, -pady=> 0, )->grid(-row=>1,-column=> 2,-sticky=>'w'); my $anal_label = $inputfr->Label(-text=>'Significant Genes File', )->grid(-row=>2,-column=> 0,-sticky=>'e'); my $anal_lab_bal = $mw->Balloon(-state=>'balloon', -background=>'white'); my $analhelp = "The top significant genes generated in an analysis (e.g. SAM or Cluster).\nShould have far fewer entries than the full dataset. Also known as the significant genes list (SGL)."; $anal_lab_bal->attach($anal_label, -balloonmsg=>$analhelp); my $sglFile; my $datafile_e = $inputfr->Entry(-state=>'normal', -textvariable=> \$sglFile, -width=>30, )->grid(-row=>2,-column=> 1,-sticky=>'w'); $inputfr->Button(-text=>'Browse', -command=> [ \&selectsinglefile, \$datafile_e, \$mw ], -borderwidth=>1, -padx=> 0, -pady=> 0, )->grid(-row=>2,-column=> 2,-sticky=>'w'); my $statfr = $mw->LabFrame(-label => 'Analysis Options', -labelside => 'acrosstop', -relief=>'groove', -borderwidth=>2, )->pack(-side=>'top', -anchor=>'nw', -fill=>'both', ); my $stattype = 0; # $statfr->Label(-text=>"Statistics")->grid(-row=>5,-column=>0,-sticky=>'e'); my $iter_e; my $raw_c; my $iter_label; my $raw_label; my $binbut = $statfr->Radiobutton(-text=> "Binomial", -variable=> \$stattype, -value=> 0, -borderwidth=>0, -command=> [\&statState,\$iter_e,\$raw_c,\$stattype,\$iter_label ], )->grid(-row=>0, -column=>0, -sticky=>'w', ); my $poibut = $statfr->Radiobutton(-text=> "Poisson", -variable=> \$stattype, -value=> 1, -borderwidth=>0, -command=> [\&statState,\$iter_e,\$raw_c,\$stattype,\$iter_label ], )->grid(-row=>1, -column=>0, -sticky=>'w', ); my $iterations = 1000; $iter_label = $statfr->Label(-text=>'Iterations', -state=>'disabled', )->grid(-row=>1, -column=>2, -sticky=>'e', # -padx=>20, ); # portability ok # if ($^O =~ /mswin/i) { # $iter_label->configure(-foreground=>'SystemDisabledText'); # } # else { # $iter_label->configure(-foreground=>'grey'); # } my $iter_lab_bal = $mw->Balloon(-state=>'balloon', -background=>'white'); my $iterhelp = "The number of random datasets to generate in a Poisson analysis."; $iter_lab_bal->attach($iter_label, -balloonmsg=>$iterhelp); $iter_e = $statfr->Entry(-textvariable=> \$iterations, -width=>10, -state=>'disabled', )->grid(-row=>1, -column=>3, -sticky=>'w', ); # portability ok # if ($^O =~ /mswin/i) { # $iter_e->configure(-foreground=>'SystemDisabledText', # -background=>'SystemButtonFace', # ); # } # else { # $iter_e->configure(-foreground=>'grey', # -background=>'grey', # ); # } my $rawout = 0; # $raw_label = $statfr->Label(-text=>'', # )->grid(-row=>1, # -column=>4, # -padx=>20, # -sticky=>'e'); # portability ok # if ($^O =~ /mswin/i) { # $raw_label->configure(-foreground=>'SystemDisabledText', # ); # } # else { # $raw_label->configure(-foreground=>'grey', # ); # } $raw_c = $statfr->Checkbutton(-text => 'Raw Histogram Output', -variable => \$rawout, # -state=>'disabled' )->grid(-row=>2, -column=>0, -sticky=>'w', ); my $raw_lab_bal = $mw->Balloon(-state=>'balloon', -background=>'white'); my $rawhelp = "Include raw histogram bins and frequencies in the output."; $raw_lab_bal->attach($raw_c, -balloonmsg=>$rawhelp); my $caseSensitive = 1; $statfr->Checkbutton(-text => 'Case-sensitive search', -variable => \$caseSensitive, )->grid(-row=>3, -column=>0, -sticky=>'w', ); # Run # my $percent_done = 0; my $progress = $mw->ProgressBar(-width => 8, -blocks => 100, -gap => 0, -colors => [0, 'darkblue'], -variable => \$percent_done, -borderwidth => 2, # -troughcolor => 'SystemButtonFace', )->pack(-fill => 'x', ); #portability ok if ($^O =~ /mswin/i) { $progress->configure(-troughcolor=>'SystemButtonFace', ); } else { $progress->configure(-troughcolor=>'grey', ); } # log window my $log_f = $mw->Frame(-relief=>'groove', -borderwidth=>2, )->pack(-fill=>'both', -expand=>1); my $logwin = $log_f->Scrolled("Text")->pack(-fill=>'both', -expand=>1); $logwin->configure(-height=>5, -width=>70, -wrap=>'none', -state=>'disabled', ); # text options my $textopt_f = $mw->Frame(-relief=>'groove', -borderwidth=>2, )->pack(-fill=>'both'); $textopt_f->Button(-text=>"Clear Results", -command=> [ \&clearresults,\$mw,\$logwin ], -state=> 'normal', )->grid(-row=>0, -column=>0, -sticky=>'e'); my $verbose = 0; my $verb_c = $textopt_f->Checkbutton(-text => "Verbose matching", -variable => \$verbose, )->grid(-row=>0, -column=>1, -sticky=>'w'); my $verb_lab_bal = $textopt_f->Balloon(-state=>'balloon', -background=>'white'); my $verb_help = "Displays word matches found in the full dataset in the results box."; $verb_lab_bal->attach($verb_c, -balloonmsg=>$verb_help); $textopt_f->Button(-text=>"Save Results", -command=> [ \&saveresults,\$mw,\$logwin ], -state=> 'normal', )->grid(-row=>0, -column=>2, -sticky=>'e'); my $statusb; my $interrupt = 0; my $exec_fr = $mw->Frame(#-borderwidth=>2, #-relief=>'groove', )->pack(-side=>'top', -anchor=>'n'); $exec_fr->Button(-text=>"Run LACK", -command=> [ \&lackmain,\$wordfile,\$fullFile,\$sglFile,\$iterations,\$mw,\$percent_done,\$rawout,\$logwin,\$stattype,\$verbose,\$statusb,\$analysisType,\$minCount,\$maxCount,\$caseSensitive,\$interrupt ], -state=> 'normal', )->grid(-row=>0, -column=>2, -padx => 20, -pady => 4, ); $exec_fr->Button(-text=>"Stop", -command=> [\&incrementInterrupt,\$interrupt], )->grid(-row=>0, -column=>1, -padx => 20, -pady => 4, ); $exec_fr->Button(-text=>"Help", -command=> [\&makehelp,\$mw], )->grid(-row=>0, -column=>0, -padx => 20, -pady => 4, ); $exec_fr->Button(-text=>"Exit", -command=> sub { exit }, )->grid(-row=>0, -column=>3, -padx => 20, -pady => 4, ); $statusb = $mw->ROText(-height=>1, -width=>60, -wrap=>'word', # -background=>'SystemButtonFace', -relief=>'flat', -border=>0, )->pack(-side=>'left', -anchor=>'sw'); #portability ok if ($^O =~ /mswin/i) { $statusb->configure(-background=>'SystemButtonFace'); } else { $statusb->configure(-background=>'grey'); } $statusb->insert('end','Idle'); MainLoop; } sub incrementInterrupt { my $interrupt_r = shift; $$interrupt_r = 1; } sub makehelp { my $mwref = shift; my $mw = $$mwref; my $helpwin = MainWindow->new(-title=>"LACK Help"); $helpwin->focusForce; my $data = <Photo(-data => $data); $helpwin->Icon( -image => $image ); $helpwin->Button(-text=>"Close Help", -command=> sub { $helpwin->destroy; }, )->pack(-pady=>10); my $hf = $helpwin->Frame()->pack(-fill=>'both', -expand=>1, ); my $hn = $hf->NoteBook()->pack(-fill=>'both', -expand=>1, ); my $overview = $hn->add("overview", -label=>"Overview", ); my $files = $hn->add("files", -label=>"File Formats", ); my $options = $hn->add("options", -label=>"Commands and Options", ); my $stats = $hn->add("stats", -label=>"Statistics", ); my $warn = $hn->add("warn", -label=>"Warnings", ); my $add = $hn->add("add", -label=>"Additional Help", ); my $ov_t = $overview->Scrolled('ROText', -wrap=>'word', -scrollbars=>'osoe', )->pack(-fill=>'both', -expand=>1, ); my $file_t = $files->Scrolled('ROText', -wrap=>'word', -scrollbars=>'osoe', )->pack(-fill=>'both', -expand=>1, ); my $opt_t = $options->Scrolled('ROText', -wrap=>'word', -scrollbars=>'osoe', )->pack(-fill=>'both', -expand=>1, ); my $stat_t = $stats->Scrolled('ROText', -wrap=>'word', -scrollbars=>'osoe', )->pack(-fill=>'both', -expand=>1, ); my $warn_t = $warn->Scrolled('ROText', -scrollbars=>'osoe', -wrap=>'word', )->pack(-fill=>'both', -expand=>1, ); my $add_t = $add->Scrolled('ROText', -scrollbars=>'osoe', -wrap=>'word', )->pack(-fill=>'both', -expand=>1, ); my $ov_data = < lines and no more than in lines. Statistics ========== See the Statistics tab in the help. Raw Histogram Output =============---==== Turn this on if you wish to generate raw histogram data for analysis using an external statistical method or an external graphing program. Iterations ========== The number of random samples chosen from the full dataset file in a Poisson analysis. Case-sensitive Matching ======================= Leave this option on for case-sensitive matching. Uncheck the box to convert all text to lowercase before conducting the matching. Verbose matching ================ Selecting this option will report word matches in the full dataset. This option can be useful for familiarizing oneself with how the pattern matching algorithm of LACK performs. Save Results ============ Saves the text window results to a text file. Run LACK ======== Executes LACK. Note that all 3 files must be loaded prior to execution. Help ==== Pop-up help window. Exit ==== Exit LACK. Status Window ============= The lower left corner of the interface displays the current status of LACK. EOF my $stat_data = <insert('end',$ov_data); $file_t->insert('end',$file_data); $opt_t->insert('end',$opt_data); $stat_t->insert('end',$stat_data); $warn_t->insert('end',$warn_data); $add_t->insert('end',$add_data); } sub saveresults { my $mwref = shift; my $mw = $$mwref; my $logwinref = shift; # fix portability my $outfile = $mw->getSaveFile(-title=>'Save Results As'); open(OUTFILE,">$outfile") or die "Can't open output file $outfile\n"; print OUTFILE $$logwinref->get("1.0","end"); close OUTFILE; $mw->messageBox(-title => 'Finished', -message => 'Results Saved', -type => 'OK'); } sub clearresults { my $mw_r = shift; my $logwin_r = shift; $$logwin_r->delete('1.0','end'); } sub statState { my $iter_ref = shift; my $raw_ref = shift; my $stateref = shift; my $iter_l_ref = shift; # my $raw_l_ref = shift; # portability ok if ($^O =~ /mswin/i) { if ($$stateref == 0) { $$iter_ref->configure(-state=>'disabled'); # $$iter_ref->configure(-background=>'SystemButtonFace'); # $$iter_ref->configure(-foreground=>'SystemDisabledText'); # $$raw_ref->configure(-state=>'disabled'); # $$iter_l_ref->configure(-foreground=>'SystemDisabledText'); $$iter_l_ref->configure(-state=>'disabled'); # $$raw_l_ref->configure(-foreground=>'SystemDisabledText'); } elsif ($$stateref == 1) { $$iter_ref->configure(-state=>'normal'); # $$iter_ref->configure(-background=>'white'); # $$iter_ref->configure(-foreground=>'SystemButtonText'); # $$raw_ref->configure(-state=>'normal'); # $$iter_l_ref->configure(-foreground=>'SystemButtonText'); $$iter_l_ref->configure(-state=>'normal'); # $$raw_l_ref->configure(-foreground=>'SystemButtonText'); } else {die "Problem with statistics state\n";} } else { if ($$stateref == 0) { $$iter_ref->configure(-state=>'disabled'); $$iter_ref->configure(-background=>'lightgrey'); $$iter_ref->configure(-foreground=>'grey'); # $$raw_ref->configure(-state=>'disabled'); $$iter_l_ref->configure(-foreground=>'grey'); # $$raw_l_ref->configure(-foreground=>'grey'); } elsif ($$stateref == 1) { $$iter_ref->configure(-state=>'normal'); $$iter_ref->configure(-background=>'white'); $$iter_ref->configure(-foreground=>'black'); # $$raw_ref->configure(-state=>'normal'); $$iter_l_ref->configure(-foreground=>'black'); # $$raw_l_ref->configure(-foreground=>'black'); } else {die "Problem with statistics state\n";} } } sub analysisState { my $wordfile_e_ref = shift; my $wordfile_label_ref = shift; my $wordfile_but_ref = shift; my $analysisType_ref = shift; my $minCount_l_ref = shift; my $minCount_e_ref = shift; my $maxCount_l_ref = shift; my $maxCount_e_ref = shift; # portability ok if ($^O =~ /mswin/i) { if ($$analysisType_ref == 2) { $$wordfile_e_ref->configure(-state=>'disabled'); $$wordfile_label_ref->configure(-state=>'disabled'); $$wordfile_but_ref->configure(-state=>'disabled'); $$minCount_e_ref->configure(-state=>'normal'); $$maxCount_e_ref->configure(-state=>'normal'); $$minCount_l_ref->configure(-state=>'normal'); $$maxCount_l_ref->configure(-state=>'normal'); } elsif ($$analysisType_ref == 0 || $$analysisType_ref == 1) { $$wordfile_e_ref->configure(-state=>'normal'); $$wordfile_label_ref->configure(-state=>'normal'); $$wordfile_but_ref->configure(-state=>'normal'); $$minCount_e_ref->configure(-state=>'disabled'); $$maxCount_e_ref->configure(-state=>'disabled'); $$minCount_l_ref->configure(-state=>'disabled'); $$maxCount_l_ref->configure(-state=>'disabled'); } else {die "Problem with analysis type state\n";} } else { if ($$analysisType_ref == 2) { $$wordfile_e_ref->configure(-state=>'disabled'); $$wordfile_label_ref->configure(-state=>'disabled'); $$wordfile_but_ref->configure(-state=>'disabled'); $$minCount_e_ref->configure(-state=>'normal'); $$maxCount_e_ref->configure(-state=>'normal'); $$minCount_l_ref->configure(-state=>'normal'); $$maxCount_l_ref->configure(-state=>'normal'); # $$wordfile_e_ref->configure(-state=>'disabled'); # $$wordfile_e_ref->configure(-background=>'lightgrey'); # $$wordfile_e_ref->configure(-foreground=>'grey'); # $$wordfile_label_ref->configure(-foreground=>'grey'); # $$wordfile_but_ref->configure(-state=>'disabled'); } elsif ($$analysisType_ref == 0 || $$analysisType_ref == 1) { $$wordfile_e_ref->configure(-state=>'normal'); $$wordfile_label_ref->configure(-state=>'normal'); $$wordfile_but_ref->configure(-state=>'normal'); $$minCount_e_ref->configure(-state=>'disabled'); $$maxCount_e_ref->configure(-state=>'disabled'); $$minCount_l_ref->configure(-state=>'disabled'); $$maxCount_l_ref->configure(-state=>'disabled'); # $$wordfile_e_ref->configure(-state=>'normal'); # $$wordfile_e_ref->configure(-background=>'white'); # $$wordfile_e_ref->configure(-foreground=>'black'); # $$wordfile_label_ref->configure(-foreground=>'black'); # $$wordfile_but_ref->configure(-state=>'normal'); } else {die "Problem with analysis type state\n";} } } sub printlog { my $line = shift; my $logwin_r = shift; if ($logwin_r) { $$logwin_r->configure(-state=>'normal'); $$logwin_r->insert('end',$line); $$logwin_r->yview(moveto=>1); #$$logwin_r->configure(-state=>'disabled'); $$logwin_r->update(); } else { print STDOUT "$line\n"; } } sub updateStatus { my ($mw_r, $status_r, $text) = @_; if ($mw_r) { $$status_r->delete('1.0','end'); $$status_r->insert('end',$text); $$mw_r->update; } else { print STDERR "$text\n"; } } sub lackmain { my ($wordFile_r, $fullFile_r, $sglFile_r, $iter_r, $mw_r, $percDone_r, $rawOut_r, $logWin_r, $statType_r, $verbose_r, $statusBar_r, $analysisType_r, $minCount_r, $maxCount_r, $caseSensitive_r, $interrupt_r ) = @_; # print "$$wordFile_r\n$$fullFile_r\n$$sglFile_r\n$$rawOut_r\n$$iter_r\n$$statType_r\n$$verbose_r\n$$analysisType_r\n$$minCount_r\n$$maxCount_r\n$$caseSensitive_r\n"; $$interrupt_r = 0 if $$mw_r; # input tests: file existence, proper data formats, etc. &updateStatus($mw_r,$statusBar_r,'Beginning analysis'); if (!$$wordFile_r && $$analysisType_r != 2) { &updateStatus($mw_r,$statusBar_r,'Error: No word list file specified'); $$mw_r->bell; return; } if (!$$fullFile_r) { &updateStatus($mw_r,$statusBar_r,'Error: No full dataset file specified'); $$mw_r->bell; return; } if (!$$sglFile_r) { &updateStatus($mw_r,$statusBar_r,'Error: No SGL file specified'); $$mw_r->bell; return; } $$minCount_r = 1 unless $$minCount_r; $$maxCount_r = 999999999 unless $$maxCount_r; &updateStatus($mw_r,$statusBar_r,'Reading search terms'); $$percDone_r = 1; # generate wordlist from file or analysis my @words; if ($$analysisType_r == 0 || $$analysisType_r == 1) { open(WORD,$$wordFile_r) or die "Couldn't open $$wordFile_r: $!\n"; while() { chomp; next if !$_; $_ =~ s/\r/\n/g; # this weirdness is for Macs $_ =~ s/\n\n//g; $_ = lc($_) if ($$caseSensitive_r == 0); if (/\n/) { # more Mac weirdness my @lineSplit = split(/\t/); push @words, @lineSplit; } else { push @words, $_; } } close WORD; } elsif ($$analysisType_r == 2) { &retrieveWords(\@words,$minCount_r,$maxCount_r,$sglFile_r,$caseSensitive_r); } else { die "Couldn't identify analysis type $$analysisType_r\n"; } &updateStatus($mw_r,$statusBar_r,'Reading full dataset'); # get annotations for full and sgl my @fullLines; open(FULL,$$fullFile_r) or die "Couldn't open $$fullFile_r: $!\n"; while() { chomp; $_ =~ s/\r/\n/g; # this weirdness is for Macs $_ =~ s/\n\n//g; next if !$_; $_ = lc($_) if ($$caseSensitive_r == 0); if (/\n/) { # more Mac weirdness my @lineSplit = split(/\n/); push @fullLines, @lineSplit; } else { push @fullLines, $_; } } close FULL; &updateStatus($mw_r,$statusBar_r,'Reading SGL dataset'); &printlog("Term\tSGL Size\tSGL Hits\tFull Dataset Size\tFull Dataset Hits\tp-value\n",$logWin_r) unless ($$verbose_r || $$rawOut_r); my @sglLines; open(SGL,$$sglFile_r) or die "Couldn't open $$sglFile_r: $!\n"; while() { chomp; $_ =~ s/\r/\n/g; # Mac newline weirdness $_ =~ s/\n\n//g; next if !$_; $_ = lc($_) if ($$caseSensitive_r == 0); if (/\n/) { my @lineSplit = split(/\n/); push @sglLines, @lineSplit; } else { push @sglLines, $_; } } close SGL; if ($$mw_r && $$interrupt_r) { $$interrupt_r = 0; &updateStatus($mw_r,$statusBar_r,'Analysis interrupted'); $$percDone_r = 0; return; } # run analysis on all words, for analysis type 0 (all terms together) if ($$analysisType_r == 0) { &updateStatus($mw_r,$statusBar_r,'Matching terms in SGL'); my @sglHits = &getSGLhits(\@sglLines,$verbose_r,@words); &updateStatus($mw_r,$statusBar_r,'Matching terms in full dataset'); my @fullHits = &getFullHits(\@fullLines,$verbose_r,$mw_r,$logWin_r,@words); my $sglHitCount = shift @sglHits; my $fullHitCount = shift @fullHits; my $sglSize = $#sglLines + 1; my $fullSize = $#fullLines + 1; if ($$mw_r && $$interrupt_r) { $$interrupt_r = 0; &updateStatus($mw_r,$statusBar_r,'Analysis interrupted'); $$percDone_r = 0; return; } # calc stats &printlog("Term\tSGL Size\tSGL Hits\tFull Dataset Size\tFull Dataset Hits\tp-value\n",$logWin_r) if ($$verbose_r || $$rawOut_r); if ($$statType_r == 0) { &updateStatus($mw_r,$statusBar_r,'Computing binomial statistics'); &printlog("All\t$sglSize\t$sglHitCount\t$fullSize\t$fullHitCount\t(P>=$sglHitCount)=",$logWin_r); &binomial($sglHitCount,$sglSize,$fullHitCount,$fullSize,$mw_r,$statusBar_r,$logWin_r,$rawOut_r,$percDone_r,$interrupt_r); } elsif ($$statType_r == 1) { &updateStatus($mw_r,$statusBar_r,'Computing Poisson statistics'); &printlog("All\t$sglSize\t$sglHitCount\t$fullSize\t$fullHitCount\t(P>=$sglHitCount)=",$logWin_r); &poisson(\@fullHits,$sglHitCount,$sglSize,$fullSize,$iter_r,$mw_r,$statusBar_r,$logWin_r,$rawOut_r,$percDone_r,$interrupt_r); } else { die "Statistics type $$statType_r not known\n"; } } # run analysis on individual words, for analysis types 1 & 2 # (separate terms, or derived terms) if ($$analysisType_r == 1 || $$analysisType_r == 2) { foreach (sort @words) { &updateStatus($mw_r,$statusBar_r,'Matching terms in SGL'); my @sglHits = &getSGLhits(\@sglLines,$verbose_r,$_); &updateStatus($mw_r,$statusBar_r,'Matching terms in full dataset'); my @fullHits = &getFullHits(\@fullLines,$verbose_r,$mw_r,$logWin_r,$_); if ($$mw_r && $$interrupt_r) { $$interrupt_r = 0; &updateStatus($mw_r,$statusBar_r,'Analysis interrupted'); $$percDone_r = 0; return; } my $sglHitCount = shift @sglHits; my $fullHitCount = shift @fullHits; my $sglSize = $#sglLines + 1; my $fullSize = $#fullLines + 1; # calc stats &printlog("Term\tSGL Size\tSGL Hits\tFull Dataset Size\tFull Dataset Hits\tp-value\n",$logWin_r) if ($$verbose_r || $$rawOut_r); if ($$statType_r == 0) { # calculate binomial &updateStatus($mw_r,$statusBar_r,'Computing binomial statistics'); &printlog("$_\t$sglSize\t$sglHitCount\t$fullSize\t$fullHitCount\t(P>=$sglHitCount)=",$logWin_r); &binomial($sglHitCount,$sglSize,$fullHitCount,$fullSize,$mw_r,$statusBar_r,$logWin_r,$rawOut_r,$percDone_r,$interrupt_r); } elsif ($$statType_r == 1) { # calculate poisson &updateStatus($mw_r,$statusBar_r,'Computing Poisson statistics'); &printlog("$_\t$sglSize\t$sglHitCount\t$fullSize\t$fullHitCount\t(P>=$sglHitCount)=",$logWin_r); my $pval = &poisson(\@fullHits,$sglHitCount,$sglSize,$fullSize,$iter_r,$mw_r,$statusBar_r,$logWin_r,$rawOut_r,$percDone_r,$interrupt_r); } else { die "Statistics type $$statType_r not known\n"; } } } if ($$mw_r && $$interrupt_r) { $$interrupt_r = 0; &updateStatus($mw_r,$statusBar_r,'Analysis interrupted'); $$percDone_r = 0; return; } &updateStatus($mw_r,$statusBar_r,'Analysis complete'); $$percDone_r = 100; $$mw_r->bell if $mw_r; } sub retrieveWords { my ($wordArray_r,$minCount_r,$maxCount_r,$sglFile_r,$caseSensitive_r) = @_; my %wordCount; open(SGL,$$sglFile_r) or die "Couldn't open $$sglFile_r: $!\n"; while() { chomp; $_ =~ s/\r/ /g; # to deal with Mac newlines next if !$_; my @line = split(/\b/); foreach (@line) { next if !$_; # empty next if length($_) == 1; # skip single letters next unless /^[a-zA-Z]/; # skip unless first char is alpha # skip uninformative words next if $_ =~ /^an$/i; next if $_ =~ /^and$/i; next if $_ =~ /^at$/i; next if $_ =~ /^for$/i; next if $_ =~ /^in$/i; next if $_ =~ /^of$/i; next if $_ =~ /^on$/i; next if $_ =~ /^the$/i; next if $_ =~ /^to$/i; next if $_ =~ /^with$/i; $wordCount{$_}++; } } close SGL; my %wordCheck; # need to track in case same word sometimes is uc & lc foreach (sort keys %wordCount) { if ($wordCount{$_} >= $$minCount_r && $wordCount{$_} <= $$maxCount_r) { $_ = lc($_) if ($$caseSensitive_r == 0); next if $wordCheck{$_}; $wordCheck{$_}++; push @{$wordArray_r}, $_; } } sort @{$wordArray_r}; } sub getSGLhits { my $sglLines_r = shift; my $verbose_r = shift; my @words = @_; my @sglHits; my $sglHitCount = 0; for my $c (0 .. $#{$sglLines_r}) { my $hit = 0; foreach (@words) { if (${$sglLines_r}[$c] =~ /\b$_/ || ${$sglLines_r}[$c] =~ /$_\b/) { $hit++; $sglHitCount++; last; } } $sglHits[$c] = $hit; } return ($sglHitCount, @sglHits); } sub getFullHits { my $fullLines_r = shift; my $verbose_r = shift; my $mw_r = shift; my $logWin_r = shift; my @words = @_; my @fullHits; my $fullHitCount = 0; for my $c (0 .. $#{$fullLines_r}) { my $hit = 0; $$mw_r->update; foreach (@words) { if (${$fullLines_r}[$c] =~ /\b$_/ || ${$fullLines_r}[$c] =~ /$_\b/) { $hit++; $fullHitCount++; if ($$verbose_r) { &printlog("Found $_ in ${$fullLines_r}[$c]\n",$logWin_r); } last; } } $fullHits[$c] = $hit; } return ($fullHitCount, @fullHits); } sub binomial { my ($sglHitCount,$sglSize,$fullHitCount,$fullSize,$mw_r,$statusBar_r,$logWin_r,$rawOut_r,$percDone_r,$interrupt_r) = @_; my @pvals; my @histo; my $n = $sglSize; my $X = 0; my $p = Math::BigFloat->new($fullHitCount); $p->bdiv($fullSize); my $q = Math::BigFloat->new('1'); $q->bsub($p); my $cumulative = Math::BigFloat->new('0'); my $massdensity = 0; my $curve_side_flag = 0; my %powers; while () { if ($$mw_r && $$interrupt_r) { &updateStatus($mw_r,$statusBar_r,'Analysis interrupted'); return; } &updateStatus($mw_r,$statusBar_r,"Calculating p-value for $X hits"); $$percDone_r = 100*($X+1)/($sglHitCount+10); my $n_over_n_minus_x = Math::BigInt->new('1'); my $xfact = Math::BigInt->new('1'); for (my $c = $n; $c > $n - $X; $c--) { $n_over_n_minus_x->bmul($c); } for (my $d = 1; $d <= $X; $d++) { $xfact->bmul($d); } my $nCxval = Math::BigFloat->new($n_over_n_minus_x); $nCxval->bdiv($xfact); my $term2 = Math::BigFloat->new($p); $term2->bpow($X); if ($$mw_r && $$interrupt_r) { &updateStatus($mw_r,$statusBar_r,'Analysis interrupted'); return; } my $term3 = Math::BigFloat->new($q); # cache factorial values for speed if ($powers{$q}{$n-$X}) { $term3 = Math::BigFloat->new($powers{$q}{$n-$X}); } else { for my $e (2 .. ($n-$X) ) { $term3->bmul($q); $powers{$q}{$e} = Math::BigFloat->new($term3); $$mw_r->update; } } if ($$mw_r && $$interrupt_r) { &updateStatus($mw_r,$statusBar_r,'Analysis interrupted'); return; } my $temp_p = Math::BigFloat->new($nCxval); $temp_p->bmul($term2); $temp_p->bmul($term3); my $temp_histo = $temp_p * 100; # 100 is arbitrary my $temp_p2 = sprintf("%.10f",$temp_p); $temp_p2 = '<0.0000000001' if $temp_p2 == 0; $massdensity = $temp_p2 if $X == $sglHitCount; $cumulative += $temp_p if $X <= $sglHitCount; $temp_histo = &myround($temp_histo); push @pvals, $temp_p2; push @histo, $temp_histo; if ($temp_p > 0.01 && $curve_side_flag == 0) { $curve_side_flag++; } last if $temp_p < 0.00001 && $curve_side_flag && $X >= $sglHitCount; # arbitrary cutoff $X++; } # $cumulative->ffround(-10); # my $invcumul = 1 - $cumulative; # $cumulative = $cumulative->bstr; my $pval = (1-$cumulative) + $massdensity; $pval->ffround(-10); if ($pval == 0) { $pval = '<0.0000000001'; } &printlog("$pval\n",$logWin_r); if ($$rawOut_r) { &printlog("\n\t\tHistogram\n\t\tHits\tExpected frequency in 100 samples from full dataset\tp(x=Hits)\n",$logWin_r); for (my $c = 0; $c<= $#histo; $c++) { &printlog("\t\t$c\t$histo[$c]\t$pvals[$c]\n",$logWin_r); } &printlog("\n",$logWin_r); } $$percDone_r = 100; return; } sub poisson { my ($fullHits_ref,$sglHitCount,$sglSize,$fullSize,$iter_r,$mw_r,$statusBar_r,$logWin_r,$rawOut_r,$percDone_r,$interrupt_r) = @_; my @statdata; for (my $d = 0; $d < $$iter_r; $d++) { if ($$mw_r && $$interrupt_r) { &updateStatus($mw_r,$statusBar_r,'Analysis interrupted'); return; } &updateStatus($mw_r,$statusBar_r,"Calculating iteration $d"); $$percDone_r = 100*($d+1)/($$iter_r*1.01); my @int = &randint($fullSize,$sglSize); $$mw_r->update; my $fullCount = 0; foreach (@int) { if (${$fullHits_ref}[$_]) { $fullCount++; } } push @statdata, $fullCount; } my @stats = &calcdescstat(\@statdata); my $mean = $stats[1]; my $p_less_than_hits = 0; for (my $c = 0; $c < $sglHitCount; $c++) { my $tempval = ($mean ** $c) * exp(-$mean) / &factorial($c); # &updateStatus($mw_r,$statusBar_r,"Calculating iteration $d, p-value for $c of $sglHitCount hits"); $p_less_than_hits += $tempval; } my $pval = 1 - $p_less_than_hits; &printlog("$pval\n",$logWin_r); my %histotally = (); foreach (@statdata) { $histotally{$_}++; } if ($$rawOut_r) { &printlog("\n\t\tHistogram\n\t\tHits\tFrequency\n",$logWin_r); foreach (sort {$a<=>$b} keys %histotally) { &printlog("\t\t$_\t$histotally{$_}\n",$logWin_r); } &printlog("\n",$logWin_r); } $$percDone_r = 100; return; } sub myround { my $num = shift; my $intnum = int($num); my $remain = $num - $intnum; return $intnum if $remain < 0.5; return $intnum+1; } sub randint { my ($fullSize,$sglSize) = @_; my @integers = (); my %intcheck; my $c = 0; while ($c <= $sglSize) { my $randint = int(rand($fullSize)); next if $intcheck{$randint}; $intcheck{$randint} = 1; push @integers, $randint; $c++; } return @integers; } sub factorial { my $n = shift; return undef if $n < 0; return 1 if $n == 0; # my $factorial = Math::BigInt->new('1'); my $factorial = 1; for my $c ( 1 .. $n ) { # $factorial->bmul($c); $factorial *= $c; } return $factorial; } sub calcdescstat { my $dataref = shift; my @data = @{$dataref}; my $stat = Statistics::Descriptive::Full->new(); $stat->add_data(@data); my $count = $stat->count(); my $mean = $stat->mean(); my $stdev = $stat->standard_deviation(); my $min = $stat->min(); my $max = $stat->max(); my $p1 = $stat->percentile(1); my $p5 = $stat->percentile(5); my $p95 = $stat->percentile(95); my $p99 = $stat->percentile(99); return ($count,$mean,$stdev,$min,$max,$p1,$p5,$p95,$p99); } sub selectsinglefile { my $entryblankref = shift; my $entryblank = $$entryblankref; my $mwref = shift; my $mw = $$mwref; my $file; if ($^O =~ /mswin/i) { my $opentypes = [ "{All files} * ", "{PCL files} {.pcl} ", "{Text files} {.txt} ", ]; $file = $mw->getOpenFile(-filetypes=>$opentypes); } else { $file = $mw->getOpenFile(-title=>'Load File'); } if (defined $file and $file ne '') { $entryblank->delete(0, 'end'); $entryblank->insert(0, $file); $entryblank->xview('end'); } }