#!/usr/bin/perl -w # gack3628 removes deprecated functions - only pcl and cdt files are # supported, and binomial smoothing is removed as it was never developed # gack3629 removed unnecessary commented code to reduce size # gack3630 removed unnecessary code (file input types, EPP difference # outputs), implements GD histogram output # gack3631 fix bug that deletes zeros in gck output # fix line reading bug that caught uninitialized values # (due to different line sizes if blank at end) # 2/13/03 use Tk; use Tk::FileSelect; use POSIX; use Tk::ErrorDialog; # Un-comment these for compilation with Perlapp use Tk::Button; use Tk::Checkbutton; use Tk::Frame; use Tk::Entry; use Tk::Radiobutton; use Tk::Scale; use Tk::Scrollbar; use Tk::Text; use Tk::Label; use Tk::LabEntry; use GD; use GD::Graph; use GD::Graph::lines; use Math::Round; my $mw = MainWindow->new; $mw->title("GACK"); $mw->Label(-text=>"Genomotyping Analysis by Charlie Kim")->pack; # MAIN FRAMES # $logfr = $mw->Frame(-relief=>'groove', -borderwidth=>2, )->pack(-side=>'right', -anchor=>'ne', -fill=>'y', ); $inputfr = $mw->Frame(-relief=>'groove', -borderwidth=>2, )->pack(-side=>'top', -anchor=>'nw', -fill=>'both', ); $secondrowfr = $mw->Frame(-borderwidth=>0, )->pack(-side=>'top', -anchor=>'nw', -fill=>'both', ); $rawdatafr = $secondrowfr->Frame(-relief=>'groove', -borderwidth=>2, )->pack(-side=>'left', -anchor=>'nw', -fill=>'both', -expand=> 1, ); $histofr = $secondrowfr->Frame(-relief=>'groove', -label=>'Data Histogram', -borderwidth=>2, )->pack(-side=>'right', -anchor=>'nw', -fill=>'both', ); $smoothfr = $mw->Frame(-relief=>'groove', -label=>'Data Smoothing', -borderwidth=>2, )->pack(-side=>'top', -anchor=>'nw', -fill=>'both', ); $modelfr = $mw->Frame(-relief=>'groove', -label=>'Peak Modeling', -borderwidth=>2, )->pack(-side=>'top', -anchor=>'nw', -fill=>'both', ); $outfilefr = $mw->Frame(-relief=>'groove', -borderwidth=>2, )->pack(-side=>'top', -anchor=>'nw', -fill=>'both', ); # LOG FRAME # $logopt = $logfr->Frame(-relief=>'groove', -borderwidth=>2, )->pack(-fill=>'both', ); $logging = 1; $logopt->Checkbutton(-text=>"Create logfile", -variable=> \$logging, -command=> \&logstatus, )->pack(-side=>'left', -anchor=>'nw', ); $logfile = 'gacklog.txt'; $logfile_e = $logopt->Entry(-textvariable=> \$logfile, )->pack(-side=>'left', -anchor=>'nw', -pady=> 2, ); $logfile_b = $logopt->Button(-text=>'Browse', -command=> \&selectlogfile, -borderwidth=> 1, -padx=> 0, -pady=> 0, )->pack(-side=>'left', -anchor=>'nw', ); $log_f = $logfr->Frame(-relief=>'groove', -borderwidth=>2, )->pack(-fill=>'both'); $logwin = $log_f->Scrolled("Text")->pack(-fill=>'both'); $logwin->configure(-height=>20, -width=>40, -wrap=>'none', -state=>'disabled', ); $logfr->Button(-text=>"Exit", -command=> sub { exit }, )->pack(-side=>'right', ); $rungack_b = $logfr->Button(-text=>"Run GACK", -command=> \&gackmain, -state=> 'normal', )->pack(-side=>'left', # -padx=>20, ); # OPTION FRAMES # # input data # $incol2fr = $inputfr->Frame(#-relief=>'groove', -borderwidth=>2, )->pack(-side=>'left', -fill=>'both', ); $incol3fr = $inputfr->Frame(#-relief=>'groove', -borderwidth=>2, )->pack(-side=>'left', -fill=>'both', ); $indata = 'pclfile'; $pclfile_e = $incol2fr->LabEntry(-label=>'Input pcl or cdt File', -labelPack=> [qw/-side left -anchor w/], -textvariable=> \$pcl_file, -width=>30, -background=>'white', )->pack( -pady=>3, ); $pclfile_b = $incol3fr->Button(-text=>'Browse', -command=> \&selectpclfile, -state=>'normal', -borderwidth=>1, -padx=> 0, -pady=> 0, )->pack( ); # file extensions #$inext = 'txt'; $outext = 'gck'; # binsize $bincol1fr = $histofr->Frame(-borderwidth=>2, )->pack(-side=>'left', -fill=>'both', ); $bincol2fr = $histofr->Frame(-borderwidth=>2, )->pack(-side=>'left', -fill=>'both', ); $binsize = 0.1; $bincol1fr->Label(-text=>'Bin Size', )->pack(-side=>'left', ); $bincol2fr->Scale(-from=>0.01, -to=>0.5, -resolution=>0.01, -variable=> \$binsize, -width=>10, -borderwidth=>1, -orient=>'horizontal', )->pack(-anchor=>'nw'); $rawdataout = 0; $rawdatafr->Checkbutton(-text=>'Generate raw histogram data', -variable=> \$rawdataout, )->pack(-side=>'top', -anchor=>'nw', ); $graphout = 0; $rawdatafr->Checkbutton(-text=>'Generate graphs', -variable=> \$graphout, )->pack(-side=>'top', -anchor=>'nw', ); # smoothing $smoothcol1fr = $smoothfr->Frame(-borderwidth=>2, )->pack(-side=>'left', -fill=>'both', ); $smoothcol2fr = $smoothfr->Frame(-borderwidth=>2, )->pack(-side=>'left', -fill=>'both', ); $smoothtype = 0; $smoothwin = 1; $smoothcol1fr->Radiobutton(-text=>"No Smoothing", -value=>0, -variable=>\$smoothtype, -command=> \&smoothopts, -borderwidth=>0, -pady=> 0, )->pack(-anchor=>'nw', ); $smoothcol1fr->Radiobutton(-text=>"Moving Window Average", -value=>1, -variable=>\$smoothtype, -command=> \&smoothopts, -anchor=>'nw', -borderwidth=>0, -pady=> 0, )->pack(-anchor=>'nw'); $smoothcol2fr->Label(-text=>'', -borderwidth=> 0, )->pack(-anchor=>'nw', -pady=> 3, ); @windowsizes = (3,5,7,9); # moving average window $mawframe = $smoothcol2fr->Frame()->pack(-side=>'top', -anchor=>'nw', ); %maw = (); foreach (@windowsizes) { $maw{$_} = $mawframe->Radiobutton(-text=>$_, -value=>$_, -variable=> \$mawsmoothwin, -state=>'disabled', -borderwidth=>0, -pady=> 0, -anchor=>'nw', )->pack(-side=>'left', -anchor=>'nw', ); } # fake data type $faketype = 1; $modelfr->Radiobutton(-text=>"Positive-Side Mirroring", -value=>0, -variable=>\$faketype, -anchor=>'nw', -borderwidth=>0, -pady=> 0, )->pack(-anchor=>'nw'); $modelfr->Radiobutton(-text=>"Normal Curve", -value=>1, -variable=>\$faketype, -anchor=>'nw', -borderwidth=>0, -pady=> 0, )->pack(-anchor=>'nw'); $EPPout = 1; $EPP_ch = $outfilefr->Checkbutton(-text=> "Graded output", -variable=> \$EPPout, )->grid(-row=>0,-column=>0,-sticky=>'w'); #$outfilefr->Label(-text=> "Sensitivity", # )->grid(-row=>0,-column=>2); $EPPsensi = 50; #$EPP_sc = $outfilefr->Scale(-from=>0, # -to=>100, # -resolution=>1, # -orient=>'horizontal', # -state=>'disabled', # -width=>10, # -length=>200, # -variable=> \$EPPsensi, # )->grid(-row=>0,-column=>1); $binout = 0; $outfilefr->Checkbutton(-text=> "Binary output", -variable=> \$binout, )->grid(-row=>1,-col=>0,-sticky=>'w'); $outfilefr->Label(-text=> "Binary %EPP Cutoff", )->grid(-row=>1,-col=>2); $binoutcut = 50; $outfilefr->Scale(-from=>0, -to=>100, -resolution=>5, -orient=>'horizontal', -width=>10, -variable=> \$binoutcut, )->grid(-row=>1,-column=>1); $triout = 0; $outfilefr->Checkbutton(-text=> "Trinary output", -variable=> \$triout, )->grid(-row=>2,-col=>0,-sticky=>'w'); $outfilefr->Label(-text=> "Trinary %EPP Cutoff 1", )->grid(-row=>2,-col=>2); $outfilefr->Label(-text=> "Trinary %EPP Cutoff 2", )->grid(-row=>3,-col=>2); $trioutcut1 = 0; $trioutcut2 = 100; $outfilefr->Scale(-from=>0, -to=>100, -resolution=>5, -orient=>'horizontal', -width=>10, -variable=> \$trioutcut1, )->grid(-row=>2,-column=>1); $outfilefr->Scale(-from=>0, -to=>100, -resolution=>5, -orient=>'horizontal', -width=>10, -variable=> \$trioutcut2, )->grid(-row=>3,-column=>1); MainLoop; ############### # SUBROUTINES # ############### sub errormsg { my $msg = shift; my $errorwin = $mw->Toplevel(); # print "\a"; $errorwin->bell; $errorwin->Label(-text=> $msg, )->pack(); $errorwin->Button(-text=>'Close', -command=> sub { $errorwin->destroy }, )->pack(); $errorwin->waitWindow(); } sub printlog { my $line = shift; $logwin->configure(-state=>'normal'); $logwin->insert('end',$line); $logwin->yview(moveto=>1); $logwin->configure(-state=>'disabled'); $logwin->update(); if ($logging == 1) { open(LOGFILE,">>$logfile") or die "Logfile problem\n"; print LOGFILE $line; } close LOGFILE; } sub logparam { &printlog("Bin size = $binsize\n"); if ($smoothtype == 0) { &printlog("No data smoothing selected\n"); } elsif ($smoothtype == 1) { &printlog("Moving window average smoothing selected\n"); &printlog("Smoothing window = $mawsmoothwin"); } elsif ($smoothtype == 2) { &printlog("Binomial smoothing selected\n"); &printlog("Smoothing window = $binsmoothwin"); } if ($faketype == 0) { &printlog("Peak model = Right-side mirroring\n"); } elsif ($faketype == 1) { &printlog("Peak model = Normal curve\n"); } } sub selectlogfile { my $file = $mw->getSaveFile(); if (defined $file and $file ne '') { $logfile_e->delete(0, 'end'); $logfile_e->insert(0, $file); $logfile_e->xview('end'); } } sub logstatus { if ($logging == 0) { $logfile_b->configure(-state=>'disabled'); $logfile_e->configure(-state=>'disabled', -background=>'grey', ); } else { $logfile_b->configure(-state=>'normal'); $logfile_e->configure(-state=>'normal', -background=>'white', ); } } sub selectsinglefile { my $file = $mw->getOpenFile(); if (defined $file and $file ne '') { $single_e->delete(0, 'end'); $single_e->insert(0, $file); $single_e->xview('end'); } } sub selectpclfile { my $file = $mw->getOpenFile(); if (defined $file and $file ne '') { $pclfile_e->delete(0, 'end'); $pclfile_e->insert(0, $file); $pclfile_e->xview('end'); } } sub selectdir { use Tk::FileSelect; use Cwd; my $dirref = $mw->FileSelect(-verify=> [qw/-d/] ); my $dir = $dirref->Show; if (defined $dir and $dir ne '') { $batch_e->delete(0, 'end'); $batch_e->insert(0, $dir); $batch_e->xview('end'); } } sub fileopts { if ($indata eq 'single') { $single_e->configure(-state=>'normal', -background=>'white', ); $single_b->configure(-state=>'normal'); $batch_e->configure(-state=>'disabled', -background=>'grey', ); $batch_b->configure(-state=>'disabled'); $pclfile_e->configure(-state=>'disabled', -background=>'grey', ); $pclfile_b->configure(-state=>'disabled'); $inext_e->configure(-state=>'disabled', -background=>'grey', ); $EPP_ch->configure(-state=>'disabled'); $EPP_sc->configure(-state=>'disabled'); } elsif ($indata eq 'batch') { $single_e->configure(-state=>'disabled', -background=>'grey', ); $single_b->configure(-state=>'disabled'); $batch_e->configure(-state=>'normal', -background=>'white', ); $batch_b->configure(-state=>'normal'); $pclfile_e->configure(-state=>'disabled', -background=>'grey', ); $pclfile_b->configure(-state=>'disabled'); $inext_e->configure(-state=>'normal', -background=>'white', ); $EPP_ch->configure(-state=>'disabled'); $EPP_sc->configure(-state=>'disabled'); } elsif ($indata eq 'pclfile') { $single_e->configure(-state=>'disabled', -background=>'grey', ); $single_b->configure(-state=>'disabled'); $batch_e->configure(-state=>'disabled', -background=>'grey', ); $batch_b->configure(-state=>'disabled'); $pclfile_e->configure(-state=>'normal', -background=>'white', ); $pclfile_b->configure(-state=>'normal'); $inext_e->configure(-state=>'disabled', -background=>'grey', ); $EPP_ch->configure(-state=>'normal'); $EPP_sc->configure(-state=>'normal'); } } sub smoothopts { if ($smoothtype == 0) { $smoothwin = 1; $mawsmoothwin = 0; $binsmoothwin = 0; foreach (keys %maw) { $maw{$_}->configure(-state=>'disabled'); } foreach (keys %bin) { $bin{$_}->configure(-state=>'disabled'); } } elsif ($smoothtype == 1) { $binsmoothwin = 0; $mawsmoothwin = 3; foreach (keys %maw) { $maw{$_}->configure(-state=>'normal'); } foreach (keys %bin) { $bin{$_}->configure(-state=>'disabled'); } } elsif ($smoothtype == 2) { $mawsmoothwin = 0; $binsmoothwin = 3; foreach (keys %maw) { $maw{$_}->configure(-state=>'disabled'); } foreach (keys %bin) { $bin{$_}->configure(-state=>'normal'); } } } sub gackmain { $rungack_b->configure(-state=> 'disabled'); if (!$single_file && !$batch_dir && !$pcl_file) { &errormsg('No input file/directory specified'); $rungack_b->configure(-state=> 'normal'); return; } if ( ($indata eq 'single' && !$single_file) || ($indata eq 'batch' && !$batch_dir) || ($indata eq 'pclfile' && !$pcl_file) ) { &errormsg('Improper input file/directory selection'); $rungack_b->configure(-state=> 'normal'); return; } if ($logging == 1 && !$logfile) { &errormsg('Logging enabled, but no logfile specified'); $rungack_b->configure(-state=> 'normal'); return; } if ($smoothtype == 0) { $smoothwin = 1; } elsif ($smoothtype == 1) { $smoothwin = $mawsmoothwin; } elsif ($smoothtype == 2) { $smoothwin = $binsmoothwin; } else { &errormsg('Smoothing type not specified'); $rungack_b->configure(-state=> 'normal'); return; } unless ($outext) { &errormsg('Output file extension not specified'); $rungack_b->configure(-state=> 'normal'); return; } # SINGLE FILE # if ($indata eq 'single') { ; # removed 8/2/02 } # BATCH PROCESS DIRECTORY # elsif ($indata eq 'batch') { ; # removed 8/2/02 } # PCL FILE # elsif ($indata eq 'pclfile') { my $infile = $pcl_file; use File::Basename; my $outdir = dirname($infile); chdir $outdir; my $time = localtime(); &printlog ("$time\n"); &printlog ("Opening $infile\n\n"); unless (-e $infile) { &printlog("Could not find input file $infile\n"); die; } open(INFILE,$infile); %datarefs = &read_multi_data(*INFILE); close INFILE; # %optcut = (); # %checkcut = (); my @EPP_testvals = (0.5, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100); my %EPP_cutoffs = (); my $EPPoutfile = 'EPP.val'; if (-e $EPPoutfile) { $EPPoutfile = $mw->getSaveFile(-initialfile=>'EPP.val', -title=>'EPP output file', ); } open(EPPOUT,">$EPPoutfile") or die "EPP out problem\n"; print EPPOUT "Dataset"; foreach (@EPP_testvals) { print EPPOUT "\t$_"; } print EPPOUT "\n"; close EPPOUT; foreach $key (sort keys %datarefs) { next if $key eq 'NAME_COLUMN_VALUE'; next if $key eq 'ID_COLUMN_VALUE'; next if $key eq $datarefs{'NAME_COLUMN_VALUE'}; next if $key eq $datarefs{'ID_COLUMN_VALUE'}; next if $key =~ /GWEIGHT$/i; next if $key =~ /GID$/i; next if $key eq 'name'; &printlog("Processing $key\n"); my @logdata = @{$datarefs{$key}}; ($min,$max) = &getminmax(@logdata); ($min,$max) = &defineborders($min,$max,$binsize); my %datahist = &makehist($min,$max,$binsize,@logdata); my @data = %datahist; my @smoothdata = &get_smoothed_max($smoothtype,$smoothwin,$binsize,@data); # bin of the peak my $mu = shift @smoothdata; my %smoothdata = @smoothdata; my %fakedata = (); if ($smoothtype == 0) { %fakedata = &makefake($faketype,$mu,%datahist); } elsif ($smoothtype > 0) { %fakedata = &makefake($faketype,$mu,%smoothdata); } # my %diff = &calcdiff(\%datahist,\%fakedata); # my @diff = %diff; # my @smoothdiff = &get_smoothed_max($smoothtype,$smoothwin,$binsize,@diff); # my $optcut = shift @smoothdiff; # my %smoothdiff = @smoothdiff; # EPP: Estimated Probabiliy of Presence %EPP = &calcEPP(\%datahist,\%fakedata); # Graphs of fitted distributions # my $graphout = 1; if ($graphout) { my $graph = GD::Graph::lines->new(600,300); my @xval; my @rawval; my @fakeval; my @eppval; my $ymax = 1; my $bin = 0; foreach (sort {$a<=>$b} keys %EPP) { push @xval, $_; push @rawval, $datahist{$_}; push @fakeval, $fakedata{$_}; push @eppval, $EPP{$_}; $ymax = $datahist{$_} if $datahist{$_} > $ymax; $ymax = $fakedata{$_} if $fakedata{$_} > $ymax; $ymax = $EPP{$_} if $EPP{$_} > $ymax; $bin++; } my @legend = ('Data', 'Estimated Present Genes', 'EPP'); $graph->set_legend(@legend); $ymax += 100 if $ymax % 50 < 50; $ymax = nearest(100,$ymax); my $xskip = int($bin/10); $graph->set( y_max_value => $ymax, x_label_skip => $xskip, legend_placement => 'RC', ); my @datagraph = ([@xval], [@rawval], [@fakeval], [@eppval]); my $gd = $graph->plot(\@datagraph); my $graphfile = $key . '.jpg'; open(GRAPH,">$graphfile") or die $!; binmode GRAPH; print GRAPH $gd->jpeg(100); close GRAPH; } # cutoff calculations foreach (@EPP_testvals) { $EPP_cutoffs{$_} = &calc_EPPbound($_,%EPP); } open(EPPOUT,">>$EPPoutfile") or die "EPP out problem\n"; print EPPOUT "$key"; foreach (sort {$a<=>$b} keys %EPP_cutoffs) { print EPPOUT "\t$EPP_cutoffs{$_}"; } print EPPOUT "\n"; close EPPOUT; my $EPPcutval = 0.5; my $EPP_bound = &calc_EPPbound($EPPcutval,%EPP); my $diff_bound = ($mu + $EPP_bound) / 2; my %boundsmoothdiff = (); foreach (sort {$a<=>$b} keys %smoothdiff) { $boundsmoothdiff{$_} = $smoothdiff{$_} if $_ < $diff_bound; } my @boundcutdata = &get_smoothed_max($smoothtype,$smoothwin,$binsize,%boundsmoothdiff); my $boundcut = shift @boundcutdata; # end of cutoff calculations if ($rawdataout) { my $inprefix = $key; my $histoutfile = $inprefix . '.out'; $overwritecheck = 1; &tk_writefile(*OUTFILE,'output',$histoutfile); if ($overwritecheck == 1) { open(OUTFILE,">$histoutfile") or &errormsg("Couldn't open $histoutfile"); # print OUTFILE "Overall difference peak = $optcut\n"; # print OUTFILE "EPP boundary = $EPP_bound\n"; # print OUTFILE "Bounded Difference Peak = $boundcut\n"; # print OUTFILE "EPP \= Estimated Probability of Presence\n"; if ($smoothtype > 0) { print OUTFILE "Bin\tFrequency\tSmoothedFreq\tFakePeak\tDiff\tSmoothedDiff\t\%EPP\n"; } # else { print OUTFILE "Bin\tFrequency\tFakePeak\tDifference\t\%EPP\n"; } else { print OUTFILE "Bin\tFrequency\tFakePeak\t\%EPP\n"; } foreach $val ( sort {$a<=>$b} keys %datahist ) { if ($smoothtype > 0) { print OUTFILE "$val\t$datahist{$val}\t"; if (exists $smoothdata{$val}) { print OUTFILE "$smoothdata{$val}\t"; } else { print OUTFILE "\t"; } if (exists $fakedata{$val}) { print OUTFILE "$fakedata{$val}\t"; } else { print OUTFILE "\t"; } # if (exists $diff{$val}) { # print OUTFILE "$diff{$val}\t"; # } # else { print OUTFILE "\t"; } if (exists $smoothdiff{$val}) { print OUTFILE "$smoothdiff{$val}\t"; } else { print OUTFILE "\t"; } if (exists $EPP{$val}) { print OUTFILE "$EPP{$val}\n"; } else { print OUTFILE "\n"; } } else { print OUTFILE "$val\t$datahist{$val}\t$fakedata{$val}\t"; print OUTFILE "$EPP{$val}\n"; } } close OUTFILE; } else { &printlog("File overwrite canceled: $histoutfile\n\n"); &errormsg("$histoutfile not overwritten"); $rungack_b->configure(-state=> 'normal'); next; } &printlog("Data written to $histoutfile\n\n"); $rungack_b->configure(-state=> 'normal'); } } if ($EPPout) { # create EPP-based bin output &printlog("Calculating and processing EPP scaled output\n"); &makeEPPout($EPPoutfile,$infile); } if ($binout) { &printlog("Generating binary output\n"); &makebinout($EPPoutfile,$infile); } if ($triout) { &printlog("Generating trinary output\n"); &maketriout($EPPoutfile,$infile); } unlink($EPPoutfile) unless ($EPPout); &printlog("Processing completed for all datasets in $infile\n"); $mw->bell; $rungack_b->configure(-state=> 'normal'); } } sub maketriout { my $EPPoutfile = shift; my $infile = shift; open(EPPVALS,$EPPoutfile) or die "Couldn't open EPPout\n"; chomp($EPPbins=); my @EPPbin = split(/\t/,$EPPbins); my %EPPvals = (); # assign EPP bins to a range from lowbound to highbound - linear scale my $cutoffcol; for (my $c = 0; $c <= $#EPPbin; $c++) { next if $EPPbin[$c] =~ /[a-zA-Z]/; $EPPbin[$c] = 0 if $EPPbin[$c] == 0.5; if ($EPPbin[$c] == $trioutcut1) { $cutoffcol1 = $c; } if ($EPPbin[$c] == $trioutcut2) { $cutoffcol2 = $c; } } if ($cutoffcol1 > $cutoffcol1) { # switch so that col1 < col2 my $temp = $cutoffcol1; $cutoffcol1 = $cutoffcol2; $cutoffcol2 = $temp; } while() { chomp; my @line = split(/\t/); $_ = shift @line; $_ =~ /(c\d+)_/; my $key = $1; $EPPvals{$key}{'low'} = $line[$cutoffcol1-1]; $EPPvals{$key}{'high'} = $line[$cutoffcol2-1]; } close EPPVALS; open(INFILE,$infile) or die "Infile problem with $infile\n"; my $gckoutfile = basename($infile); $gckoutfile =~ s/\.\w{3}$//; $gckoutfile .= '.tgk'; if (-e $gckoutfile) { $gckoutfile = $mw->getSaveFile(-initialfile=> $gckoutfile, -title=>'New PCL File', ); } open(NEWGCKFILE,">$gckoutfile") or die "Can't open output gck file\n"; chomp($header = ); print NEWGCKFILE "$header\n"; @headers = split(/\t/,$header); my $gweight = 0; $gweight++ if $header =~ /GWEIGHT/i; my $gid = 0; $gid++ if $header =~ /^GID/i; my %headercol = (); for (my $d = 0; $d <= $#headers; $d++) { my $colnum = $d + 1; my $key = 'c' . 0 x (3 - length($colnum) ) . $colnum; $headercol{$key} = $headers[$d]; } while() { if (/^EWEIGHT/) { print NEWGCKFILE $_; next; } if (/^AID/) { print NEWGCKFILE $_; next; } chomp; my @line = split(/\t/); print NEWGCKFILE "$line[0]"; for (my $f = 1; $f <= 1+$gweight+$gid; $f++) { next unless exists $line[$f]; # added in v3631 print NEWGCKFILE "\t$line[$f]"; } for (my $e = $gweight+$gid+2; $e <= $#line; $e++) { if ($line[$e] eq '') { print NEWGCKFILE "\t"; next; } my $colnum = $e + 1; my $colref = 'c' . 0 x (3-length($colnum)) . $colnum; my $binval; if ($line[$e] >= $EPPvals{$colref}{'high'} ) { $binval = 1; } elsif ($line[$e] < $EPPvals{$colref}{'low'} ) { $binval = -1; } elsif ($line[$e] >= $EPPvals{$colref}{'low'} && $line[$e] < $EPPvals{$colref}{'high'}) { $binval = 0; } else { $binval = 'problem'; } print NEWGCKFILE "\t$binval"; } print NEWGCKFILE "\n"; } close NEWGCKFILE; close INFILE; } sub makebinout { my $EPPoutfile = shift; my $infile = shift; open(EPPVALS,$EPPoutfile) or die "Couldn't open EPPout\n"; chomp($EPPbins=); my @EPPbin = split(/\t/,$EPPbins); my %EPPvals = (); # assign EPP bins to a range from lowbound to highbound - linear scale my $cutoffcol; for (my $c = 0; $c <= $#EPPbin; $c++) { next if $EPPbin[$c] =~ /[a-zA-Z]/; $EPPbin[$c] = 0 if $EPPbin[$c] == 0.5; if ($EPPbin[$c] == $binoutcut) { $cutoffcol = $c; last; } } while() { chomp; my @line = split(/\t/); $_ = shift @line; $_ =~ /(c\d+)_/; my $key = $1; $EPPvals{$key} = $line[$cutoffcol-1]; } close EPPVALS; open(INFILE,$infile) or die "Infile problem with $infile\n"; my $gckoutfile = basename($infile); $gckoutfile =~ s/\.\w{3}$//; $gckoutfile .= '.bgk'; if (-e $gckoutfile) { $gckoutfile = $mw->getSaveFile(-initialfile=> $gckoutfile, -title=>'New PCL File', ); } open(NEWGCKFILE,">$gckoutfile") or die "Can't open output gck file\n"; chomp($header = ); print NEWGCKFILE "$header\n"; @headers = split(/\t/,$header); my $gweight = 0; $gweight++ if $header =~ /GWEIGHT/i; my $gid = 0; $gid++ if $header =~ /^GID/i; my %headercol = (); for (my $d = 0; $d <= $#headers; $d++) { my $colnum = $d + 1; my $key = 'c' . 0 x (3 - length($colnum) ) . $colnum; $headercol{$key} = $headers[$d]; } while() { if (/^EWEIGHT/) { print NEWGCKFILE $_; next; } if (/^AID/) { print NEWGCKFILE $_; next; } chomp; my @line = split(/\t/); print NEWGCKFILE "$line[0]"; for (my $f = 1; $f <= 1+$gweight+$gid; $f++) { next unless exists $line[$f]; # added in v3631 print NEWGCKFILE "\t$line[$f]"; } for (my $e = $gweight+$gid+2; $e <= $#line; $e++) { if ($line[$e] eq '') { print NEWGCKFILE "\t"; next; } my $colnum = $e + 1; my $colref = 'c' . 0 x (3-length($colnum)) . $colnum; my $binval; if ($line[$e] >= $EPPvals{$colref}) { $binval = 1; } elsif ($line[$e] < $EPPvals{$colref}) { $binval = 0; } else { $binval = 'problem'; } print NEWGCKFILE "\t$binval"; } print NEWGCKFILE "\n"; } close NEWGCKFILE; close INFILE; } sub makeEPPout { my $EPPoutfile = shift; my $infile = shift; open(EPPVALS,$EPPoutfile) or die "Couldn't open EPPout\n"; chomp($EPPbins=); my @EPPbin = split(/\t/,$EPPbins); my %EPPvals = (); shift @EPPbin; # assign EPP bins to a range from lowbound to highbound - linear scale my $sensi = $EPPsensi / 100; my $lowbound = 0 - $sensi; my $highbound = $lowbound + 1; for (my $c = 0; $c <= $#EPPbin; $c++) { $EPPbin[$c] = 0 if $EPPbin[$c] == 0.5; $EPPbin[$c] = $lowbound + ( ($EPPbin[$c] / 100) * ( $highbound - $lowbound ) ); } while() { chomp; my @line = split(/\t/); $_ = shift @line; $_ =~ /(c\d+)_/; my $key = $1; for (my $c = 0; $c <= $#line; $c++) { $EPPvals{$key}{$EPPbin[$c]} = $line[$c]; } } close EPPVALS; open(INFILE,$infile) or die "Infile problem with $infile\n"; my $gckoutfile = basename($infile); $gckoutfile =~ s/\.\w{3}$//; $gckoutfile .= '.gck'; if (-e $gckoutfile) { $gckoutfile = $mw->getSaveFile(-initialfile=> $gckoutfile, -title=>'New PCL File', ); } open(NEWGCKFILE,">$gckoutfile") or die "Can't open output gck file\n"; chomp($header = ); print NEWGCKFILE "$header\n"; @headers = split(/\t/,$header); my $gweight = 0; $gweight++ if $header =~ /GWEIGHT/i; my $gid = 0; $gid++ if $header =~ /^GID/i; my %headercol = (); for (my $d = 0; $d <= $#headers; $d++) { my $colnum = $d + 1; my $key = 'c' . 0 x (3 - length($colnum) ) . $colnum; $headercol{$key} = $headers[$d]; } while() { if (/^EWEIGHT/) { print NEWGCKFILE $_; next; } if (/^AID/) { print NEWGCKFILE $_; next; } chomp; my @line = split(/\t/); print NEWGCKFILE "$line[0]"; for (my $f = 1; $f <= 1+$gweight+$gid; $f++) { next unless exists $line[$f]; # added in v3631 print NEWGCKFILE "\t$line[$f]"; } for (my $e = $gweight+$gid+2; $e <= $#line; $e++) { my $binval = $highbound; # if (!$line[$e]) { v 3631 bugfix if ($line[$e] eq '') { print NEWGCKFILE "\t"; next; } my $colnum = $e + 1; my $colref = 'c' . 0 x (3-length($colnum)) . $colnum; foreach $val (sort {$b<=>$a} keys %{$EPPvals{$colref}} ) { if ($line[$e] > $EPPvals{$colref}{$val}) { last; } else { $binval = $val; next; } } print NEWGCKFILE "\t$binval"; } print NEWGCKFILE "\n"; } close NEWGCKFILE; close INFILE; } sub check_for_falsecut { my $cut = shift; my $diffref = shift; my $check = 0; foreach $key (sort {$a<=>$b} keys %{$diffref}) { if ($key < $cut && $$diffref{$key} < 0) { $check++; } } return $check; } sub calc_EPPbound { my $bound = shift; my %EPPdata = @_; my $EPPcut = ''; my $interpolated_EPP = ''; foreach $key (sort {$a<=>$b} keys %EPPdata) { if ($EPPdata{$key} < $bound) { $EPPcut = $key; } else { # v3.627 fix if ($EPPcut eq '') { foreach $key (sort {$a<=>$b} keys %EPPdata) { $interpolated_EPP = $key; last; }; last; } # my $x1 = $EPPcut; my $y1 = $EPPdata{$EPPcut}; my $x2 = $key; my $y2 = $EPPdata{$key}; $interpolated_EPP = &calc_interpolate($bound,$x1,$y1,$x2,$y2); last; } } return $interpolated_EPP; # return $EPPcut; } sub calcEPP { my $dataref = shift; my $fakeref = shift; my %EPP = (); foreach $key (sort {$a<=>$b} keys %{$dataref}) { if ( !(exists $$dataref{$key}) || !(exists $$fakeref{$key}) ) { next; } if ($$dataref{$key} == 0) { $EPP{$key} = 0; } else { my $EPP = $$fakeref{$key} / $$dataref{$key}; $EPP{$key} = 100 * $EPP; } } return %EPP; } sub calcdiff { my $dataref = shift; my $fakeref = shift; my %diff = (); foreach $key (sort {$a<=>$b} keys %{$dataref}) { if ( !(exists $$dataref{$key}) || !(exists $$fakeref{$key}) ) { next; } my $difference = $$dataref{$key} - $$fakeref{$key}; $diff{$key} = $difference; } return %diff; } sub makefake { my $type = shift; my $mid = shift; my %data = @_; my %fakedata = (); if ($type == 0) { # right-side mirroring foreach $key (sort {$a<=>$b} keys %data) { if ($key > $mid) { $fakedata{$key} = $data{$key}; $negkey = $mid - ($key-$mid); $fakedata{$negkey} = $data{$key}; } elsif ($key == $mid) { $fakedata{$key} = $data{$key}; } } foreach $key (keys %data) { $fakedata{$key} = 0 unless $fakedata{$key}; } return %fakedata; } elsif ($type == 1) { # normal distribution mapping my $halfmax = $data{$mid} / 2; my $mu = ''; my $stdev = ''; ($mu,$stdev) = &get_interpolated($halfmax,%data); # this max factor normalizes the normal curve's height # to match the height of the real dataset my $largestval = 0; foreach $key (sort {$a<=>$b} keys %data) { $fakedata{$key} = &calc_norm_dens_fx($key,$mu,$stdev,1); if ($fakedata{$key} > $largestval) { $largestval = $fakedata{$key}; } } # print "mid: $mid\n"; my $maxfactor = 0; $maxfactor = $data{$mid} / $largestval; # print "datamid: $data{$mid} largest: $largestval maxfact: $maxfactor\n"; foreach $key (keys %data) { $fakedata{$key} *= $maxfactor; } return %fakedata; } } sub calc_norm_dens_fx { my $x = shift; my $mu = shift; my $sigma = shift; my $max_factor = shift; my $term1 = 0 - ((($x - $mu)**2 )/ ( 2 * ($sigma)**2)); my $term2 = $sigma * (sqrt(2 * 3.141592654) ); my $calc_val = $max_factor * (exp($term1)) / $term2; return $calc_val; } sub get_interpolated { # linear interpolation of the 50% max values my $halfval = shift; my %data = @_; my @binvals = (); my $lastval = -1; my $lastbin = -1; foreach $bin (sort {$a<=>$b} keys %data) { if ($lastval == -1) { $lastval = $data{$bin}; $lastbin = $bin; next; } if ($halfval <= $data{$bin} && $halfval >= $lastval) { my $intbin = &calc_interpolate($halfval,$lastbin,$lastval,$bin,$data{$bin}); push @binvals, $intbin; $lastval = $data{$bin}; $lastbin = $bin; next; } if ($halfval >= $data{$bin} && $halfval <= $lastval) { my $intbin = &calc_interpolate($halfval,$lastbin,$lastval,$bin,$data{$bin}); push @binvals, $intbin; $lastval = $data{$bin}; $lastbin = $bin; next; } $lastval = $data{$bin}; $lastbin = $bin; } my $min = 0; my $max = 0; ($min,$max) = &getminmax(@binvals); my $mid = ($min+$max)/2; my $sd = &calc_stdev($min,$max,$mid,$halfval); return ($mid,$sd); } sub calc_stdev { my $min = shift; my $max = shift; my $mid = shift; my $y_50 = shift; my $norm_x_at_50pct = sqrt( -2 * log(0.5) ); my $sd = ($mid-$min) / $norm_x_at_50pct; return $sd; } sub calc_interpolate { my $y_int = shift; my $x1 = shift; my $y1 = shift; my $x2 = shift; my $y2 = shift; if ($y2 == $y1) { # average x values if the y is equal return ($x1 + $x2)/2; } my $x_int = ( (($y_int-$y1)/($y2-$y1))*($x2-$x1) ) + $x1; return $x_int; } sub get_smoothed_max { my $smoothtype = shift; my $smoothwin = shift; my $binsize = shift; my %data = @_; my @bin = (); my @y_val = (); my $largest_avg = 0; my $largest_bin = 0; my @smootheddata = (); foreach $entry (sort {$a<=>$b} keys %data) { push @bin, $entry; push @y_val, $data{$entry}; } if ($smoothtype == 0) { my @multiple_peaks = (); # stores bin numbers for (my $d = 0; $d <= $#y_val; $d++) { push @smootheddata, $bin[$d]; push @smootheddata, $y_val[$d]; if ($y_val[$d] > $largest_avg) { $largest_avg = $y_val[$d]; $largest_bin = $bin[$d]; @multiple_peaks = (); } elsif ($y_val[$d] == $largest_avg) { push @multiple_peaks,$bin[$d]; } } if (@multiple_peaks) { my $sum = 0; &printlog("Multiple peaks detected\n"); foreach $val (@multiple_peaks) { $sum += $val; } $largest_bin = $sum / ($#multiple_peaks+1); #finds avg bin of mult peak } $largest_bin = &round_to_lowerbin($largest_bin, $binsize); return ($largest_bin, @smootheddata); } elsif ($smoothtype == 1) { my $window = ($smoothwin - 1) / 2; # number of values on each side of main value to eval my $maxbin = 0; for (my $c = $window; $c <= $#y_val - $window; $c++) { my $avg = 0; for (my $d = $c-$window; $d <= $c+$window; $d++) { $avg += $y_val[$d]; } $avg /= $smoothwin; push @smootheddata, $bin[$c]; push @smootheddata, $avg; if ($avg > $largest_avg) { $largest_bin = $c; $largest_avg = $avg; } } return ($bin[$largest_bin], @smootheddata); } elsif ($smoothtype == 2) { die "Binomial smoothing not yet implemented\n"; } else { die "No smoothing type specified\n"; } } sub makehist { my $min = shift; my $max = shift; my $binsize = shift; my @data = @_; my %datahash = (); $min /= $binsize; # these bizarre calculations are used $max /= $binsize; # to account for rounding errors for (my $c = int($min); $c < int($max); $c++) { my $bin = $c * $binsize; $datahash{$bin} = 0; } foreach $val (@data) { $newval = &round_to_lowerbin($val,$binsize); # print "$val\t$newval\n"; $datahash{$newval}++; } return %datahash; } sub round_to_lowerbin { my $val = shift; my $bin = shift; my $newval = $val/$bin; $newval += 0.0001; # this factor accounts for rounding errors in # val/bin division $newval = floor($newval); $newval *= $bin; # print "val: $val\tnewval: $newval\n"; return $newval; } sub defineborders { my $min = shift; my $max = shift; my $bin = shift; my $minval = int($min/$bin); $minval--; my $newmin = $minval * $bin; my $maxval = int($max/$bin); $maxval++; my $newmax = $maxval * $bin; return ($newmin,$newmax); } sub getminmax { my @data = @_; my $min = $data[0]; my $max = $data[0]; foreach $val (@data) { $min = $val if $val < $min; $max = $val if $val > $max; } return ($min,$max); } sub read_data { my $file = shift; my @data = (); while(<$file>) { next if /SCUTID/; next if /NAME/; next if /LOG_RAT2N_MEDIAN/; next if /LOG_RAT2N_MEAN/; chomp; $_ =~ s/\r//; # deal with DOS text my @line = split(/\t/); next if !$line[1]; push @data, $line[1]; } return @data; } sub read_multi_data { my $file = shift; my %data = (); chomp($_=<$file>); my @header = split(/\t/); my $gweight = 0; $gweight++ if $_ =~ /GWEIGHT/i; my $gid = 0; $gid++ if $header[0] =~ /^GID$/i; $data{'ID_COLUMN_VALUE'} = $header[0+$gid]; $data{'NAME_COLUMN_VALUE'} = $header[1+$gid]; # check headers for uniqueness and non-alphanumeric values for (my $d = 2+$gweight ; $d <= $#header; $d++) { $header[$d] =~ s/ /_/g; $header[$d] =~ s/[^0-9a-zA-Z_-]//g; my $column = $d+1; # - 1 - $gweight; my $coldigits = '0' x (3 - length($column) ) . $column; my $headername = 'c' . $coldigits . '_' . $header[$d]; $headername = lc($headername); $header[$d] = $headername; } while(<$file>) { chomp; $_ =~ s/\r//; # deal with DOS text my @line = split(/\t/); next if $line[0] =~ /^EWEIGHT/i; next if $line[0] =~ /^AID/i; for (my $c = 0; $c <= $#line; $c++) { next if !$line[$c]; push @{$data{$header[$c]}}, $line[$c]; } } foreach $val (@header) { # check for empty columns unless ($data{$val}) { &printlog("No values detected in column \"$val\"\n"); undef($data{$val}); } } return %data; } sub tk_writefile { my $filehandle = $_[0]; my $filetype = $_[1] if $_[1]; my $filename = $_[2] if $_[2]; if (-e $filename) { my $overwrite = $mw->Toplevel(); $overwrite->Label(-text=>"$filename already exists! Proceed with overwrite\?", )->pack(); $overwrite->Button(-text=>"OK", -command=> sub { $overwritecheck = 1; $overwrite->destroy() }, )->pack(-side=>'left'); $overwrite->Button(-text=>"Cancel", -command=> sub { $overwritecheck = 0; $overwrite->destroy() }, )->pack(-side=>'right'); $overwrite->waitWindow(); } else { open($filehandle, ">$filename") or die "Couldn't open $filename\n"; } $filehandle = ''; $filetype = ''; $filename = ''; }