#!/usr/bin/perl -w # compare microarray expression data to expression signature # (DINA) # adapt code for web use. # add command line arguments # (NOAM) # 20081106 # Use regular expression to replace illegal characters in supplied column names # (NOAM) # 20081118 # Added nstore for easy sharing across multiple platforms (NOAM) # 20090224 use strict; use Storable qw(nstore store retrieve);; use Getopt::Long; # command-line arguments #prototypes (tell the perl compiler the number and type of parameters) sub getArgs(); sub openFile($); sub readExp($$); sub PrintProgress($$); ###########################GLOBAL############################################### #globals (required for getArgs()) my %args = (); # hash for storing the arguments. # the switches are the keys for the hash # please fill in yes if to build new signature, the file name from which to build the signature,the filename which #is supposed to be compared, and the name of hte coloumn to be compared # in the file to compare ( the coloumn of the Gfold). # If yes causes recomputation of the signatures. # Recommended: no (use --rec switch to set to "yes" from command line) my $recomputeSig = "no"; # Signature filename my $filename_sig = "allAnovas.csv"; # Comparison file name my $filename_to_compare = "./ANOVAinsects.csv"; my $exp_file = "./exp.csv"; my $output_file = "./out.csv"; # get and validate arguments if (!getArgs()) { print "Use $0 --help to get help\n"; exit; }; # use HTML output (change to 0 when running from command line) my $HTML = 0; my @siggenes; #array of the genes that will comprise the signature # Max. P-Value (as is or with FDR) my $maxpval = 0.05; # Sets the size of the signature my $NUM_OF_GENES_IN_SIG = 1000; # Cancels pvalue filtering in signature generation my $DONT_CONSIDER_PVALUES_IN_SIG = 0; # Cancels pvalue filtering in correlation computation my $DONT_CONSIDER_PVALUES_IN_CALC = 0; #if you change this value, set $recomputeSig=yes my $FDR_CORRECTION = 0; # Use FDR? - should be 0 if DONT_CONSIDER_PVALUES_IN_CALC is 1 # The names of GFOLD columns to be compared my @cols_to_compare = (); #my @cols_to_compare = ("F_MJ_30","F_MJ_60","F_MJ_180","F_ACC_30","F_ACC_60","F_ACC_180","F_ABA_30","F_ABA_60","F_ABA_180","F_IAA_30","F_IAA_60","F_IAA_180","F_GA3_30","F_GA3_60","F_GA3_180","F_zeatin_30","F_zeatin_60","F_zeatin_180","F_brassino_30","F_brassino_60","F_brassino_180","F_SA_180"); # Respective P-Value columns my @pvalcols = (); #my @pvalcols=("P_MJ_30","P_MJ_60","P_MJ_180","P_ACC_30","P_ACC_60","P_ACC_180","P_ABA_30","P_ABA_60","P_ABA_180","P_IAA_30","P_IAA_60","P_IAA_180","P_GA3_30","P_GA3_60","P_GA3_180","P_zeatin_30","P_zeatin_60","P_zeatin_180","P_brassino_30","P_brassino_60","P_brassino_180","P_SA_180"); #my @cols_to_compare =("GFoldChange(nahG vs. control)","GFoldChange(ein2 vs. control)","GFoldChange(coi1 vs. control)"); #my @pvalcols=("p-value(Abrassic vs. control)","p-value(Focciden vs. control)","p-value(Mpersica vs. control)","p-value(Prapae vs. control)","p-value(avrPstDC vs. control)"); #my @cols_to_compare =("GFoldChange(Abrassic vs. control)","GFoldChange(Focciden vs. control)","GFoldChange(Mpersica vs. control)","GFoldChange(Prapae vs. control)","GFoldChange(avrPstDC vs. control)"); #my @pvalcols=("p-value(wt * wounded vs wt * control)","p-value(mut * wounded vs wt * wounded)"); #my @pvalcols=("p-value(wt * wounded vs wt * control)"); #my @cols_to_compare =("GFoldChange(wt * wounded vs wt * control)"); # Names of the experiments respectively (will appear in the results and log files) my @experiment_names = (); #my @experiment_names=("MJ30", "MJ60", "MJ180","ACC30","ACC60","ACC180","ABA30","ABA60","ABA180","IAA30","IAA60","IAA180","GA3 3h","GA3 6h","GA3 9h","zeatin30","zeatin60","zeatin180","brassino30","brassino60","brassino180","SA180"); #my @experiment_names =("systemicWtVscontrolWt","systemicTlk1VsSystemicWt"); #my @experiment_names =("nahG","ein2","coi1"); #my @experiment_names =("Abrassic","Focciden","Mpersica","Prapae","avrPstDC"); #my @experiment_names =("WT_WOUNDED"); ###########################GLOBAL############################################### # detect line ending type # store old input record separator my $old_input_rec_sep = $/; # read the first 2K of the experiments list file (it should be the smaller of # the two uploaded files). To read the entire file use "local $/;" (*EXP) = openFile($exp_file); my $file = do { local $/ = \2048; }; close(*EXP); # search for end of line character if($file =~ /\r\n/) { # dos $/ = "\r\n"; } elsif($file =~ /\n\r/) { # some mac $/ = "\n\r"; } elsif($file =~ /\r/) { # weird other option $/ = "\r"; } else { # unix $/ = "\n"; } if (!(readExp($exp_file, $filename_to_compare))) { exit -1; } # create temp sorted file my $sorted = $filename_to_compare . "sorted"; my ($srtEntry, @sortedCSV); my $tmpEntry; # create sorted file (*SRT) = openFile(">$sorted"); # open original CSV file (*CSV) = openFile($filename_to_compare); # read and print first line my $srtLine = ; print SRT $srtLine; # sort the rest of the entries @sortedCSV = sort (); close(*CSV); # print sorted entries foreach $srtEntry (@sortedCSV) { # remove empty lines if (length($srtEntry) > 2) { $tmpEntry = $srtEntry; $tmpEntry =~ s/,+//; print SRT $srtEntry if (length($tmpEntry) > 2); } } close(*SRT); $filename_to_compare = $sorted; ######for random####### #my @cols_to_compare =(); #my @pvalcols=(); #for (my $k = 0;$k < 30; $k++) #{ # $cols_to_compare[$k] = "g_SHUFFLE_$k"; # $pvalcols[$k] = "p_SHUFFLE_$k"; # $experiment_names[$k] = "SHUFFLE_$k"; #} ################################### my $col_to_compare_name; my $pvalcol; my $experiment_name; my $i=0; # create output file (*OUT) = openFile(">$output_file"); # Print the headers #print "--,MJ30, MJ60, MJ180, ACC30, ACC60, ACC180, ABA30, ABA60, ABA180, IAA30, IAA60, IAA180, gib3h, gib6h, gib9h,zeatin30, zeatin60, zeatin180, brassino30, brassino60, brassino180,SA180\n"; print OUT "--,KO_apx_0h,sig_size,KO_Apx_15m,sig_size,KO_Apx_30m,sig_size,KO_Apx_90m,sig_size,KO_Apx_3h,sig_size,KO_Apx_6h,sig_size,KO_Apx_24h,sig_size,flu_30m,sig_size,flu_1h,sig_size,flu_2h,sig_size,KD_Sod,sig_size,fnr1,sig_size,MV_30m,sig_size,MV_1h,sig_size,MV_3h,sig_size,MV_6h,sig_size,MV_12h,sig_size,MV_24h,sig_size,Rotenone_3h,sig_size,Rotenone_12h,sig_size,AS_Aox,sig_size,TDNA_Aox,sig_size,TDNA_Aox_MLD,sig_size,AT,sig_size,Cat_0,sig_size,Cat_3h,sig_size,Cat_8h,sig_size,Ozone,sig_size,H2O2,sig_size,\n"; print STDOUT "Starting\n" if !($HTML); my $prog = 0.0; my $total = (scalar @experiment_names) * 29.0; # no. of experiments * no. of sig $|++; # immediate flush for my $col (@cols_to_compare) { $col_to_compare_name = $col; $pvalcol = $pvalcols[$i]; $experiment_name = $experiment_names[$i]; printf STDOUT "Progress: %3d %%\r", 0 if !($HTML); my @temparr = CheckHormone("F_KO_apx_0h", "P_KO_apx_0h", "KO_apx_0h", $recomputeSig); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_KO_apx_15min", "P_KO_apx_15min", "KO_apx_15min", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_KO_apx_30min", "P_KO_apx_30min", "KO_apx_30min", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_KO_apx_90min", "P_KO_apx_90min", "KO_apx_90min", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_KO_apx_3h", "P_KO_apx_3h", "KO_apx_3h", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_KO_apx_6h", "P_KO_apx_6h", "KO_apx_6h", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_KO_apx_24h", "P_KO_apx_24h", "KO_apx_24h", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_flu_30min", "P_flu_30min", "flu30min", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_flu_1h", "P_flu_1h", "flu1h", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_flu_2h", "P_flu_2h", "flu2h", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_KD_SOD", "P_KD_SOD", "KD_SOD", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_FNR1", "P_FNR1", "FNR1", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_MV_30min", "P_MV_30min", "MV30min", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_MV_1h", "P_MV_1h", "MV1h", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_MV_3h", "P_MV_3h", "MV3h", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_MV_6h", "P_MV_6h", "MV6h", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_MV_12h", "P_MV_12h", "MV12h", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_MV_24h", "P_MV_24h", "MV24h", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_rotenone_3h", "P_rotenone_3h", "rotenone3h", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_rotenone_12h", "P_rotenone_12h", "rotenone12h", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_AOXas", "P_AOXas", "AOXas", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_AOX_TDNA", "P_AOX_TDNA", "AOX_TDNA", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_AOX_MLD", "P_AOX_MLD", "AOX_MLD", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_AT", "P_AT", "AT", $recomputeSig)); PrintProgress(++$prog, $total); push (@temparr, CheckHormone("F_CAT", "P_CAT", "CAT0", $recomputeSig)); PrintProgress(++$prog, $total); push (@temparr, CheckHormone("F_CAT_3h", "P_CAT_3h", "CAT3", $recomputeSig)); PrintProgress(++$prog, $total); push (@temparr, CheckHormone("F_CAT_8h", "P_CAT_8h", "CAT8", $recomputeSig)); PrintProgress(++$prog, $total); push (@temparr, CheckHormone("F_ozone", "P_ozone", "ozone", $recomputeSig)); PrintProgress(++$prog, $total); push(@temparr, CheckHormone("F_H2O2", "P_H2O2", "H2O2", $recomputeSig)); PrintProgress(++$prog, $total); print OUT $experiment_name . "," . join(",", @temparr) . "\n"; $i++; $recomputeSig = "no"; } print STDOUT "\nFinished\n" if !($HTML); $|--; # restore old input record separator $/ = $old_input_rec_sep; close(*OUT); # selete sorted file unlink($sorted); exit; sub PrintProgress($$) { my($prg, $ttl) = @_; my $percent = (($prg)/$ttl)*100; if(!($HTML)) { printf STDOUT "Progress: %3d %%\r", $percent; } else { printf STDOUT "
%3d %%
\n", $percent; # Now, output a new 'bar', forcing its width # to 3 times the percent, since we have # defined the percent bar to be at # 300 pixels wide. print STDOUT "
\n"; } } ########################################################################## # readExp($exp_file, $filename_to_compare) # gets and validates the user's parameters # requires the global variables: # @cols_to_compare, @pvalcols, and @experiment_names sub readExp($$) { my ($exp_file, $input_file) =@_; my ($caption, @caption_items); my ($line, $f, $p, $e); my (@col_captions, @found); my $search; (*INP) = openFile($input_file); $caption = ; # read first line of the input file. It should contain # the fold change and p-values captions # strip off all trailing white spaces $caption =~ s/\s+$//; @caption_items = split(/,/,$caption); # the first 2 columns must be Probeset ID and locus # there has to be a 3rd column, Gene Title, when recalculating the signature if(($caption_items[0] ne "Probeset ID") or ($caption_items[1] ne "locus")) { close(*INP); print STDERR "No caption\n"; return 0; } close(*INP); (*EXP) = openFile($exp_file); # a csv file listing the experiments that # would be compared to the signatures. For example: # F,P,E # F_MJ_30,P_MJ_30,MJ_30 # F_MJ_60,P_MJ_60,MJ_60 # F_MJ_180,P_MJ_180,MJ_180 # The first column (F) is the label of the fold change data # P lists the labels of the p-value columns # and E is a title given by the user to the compared experiment # read first line and check that it contains capital F, P and E $line = ; chomp($line); # strip off all trailing white spaces $line =~ s/\s+$//; ($f, $p, $e) = split(/,/, $line); if(($f ne "F") or ($p ne "P") or ($e ne "E")) { close(*EXP); print STDERR "No header\n"; return 0; } while($line = ) { chomp($line); # strip off all trailing white spaces $line =~ s/\s+$//; @col_captions = split(/,/,$line); if(scalar(@col_captions) == 3) { # check that the fold change and p value captions are listed in # the input file $search = $col_captions[0]; $search =~ s/([\\\|\(\)\[\{\^\$\*\+\?\.])/\\$1/g; if(@found = grep(/^$search$/,@caption_items)) { push(@cols_to_compare, $col_captions[0]); } else { close(*EXP); print STDERR "Column not found\n"; return 0; } $search = $col_captions[1]; $search =~ s/([\\\|\(\)\[\{\^\$\*\+\?\.])/\\$1/g; if(@found = grep(/^$search$/,@caption_items)) { push(@pvalcols, $col_captions[1]); } else { close(*EXP); print STDERR "2nd column not found\n"; return 0; } push(@experiment_names,$col_captions[2]); } # end if } # end while close(*EXP); } #gives a grade between -1 to 1 according to the extent thah the hormone response # is similar to the stress response sub CheckHormone { # name of the coloumn of the GFold of the relevant hormone # name of the coloumn of the pval of the relevant hormone # file extention used to save the computed signature # to calculate again? my($horcol, $horpvalcol, $fileext, $compute_sig) = @_; if($compute_sig eq "yes") { my @data = ParseFile($filename_sig); @data = SortDataBy(\@data, $horcol); @siggenes = ChooseSigGenes(\@data, "Probeset ID", "Gene Title", "locus", $horpvalcol, $horcol); nstore(\@siggenes,'signatures/siggenes.' . $fileext . '.db'); } else { my $siggenes_ref = retrieve('signatures/siggenes.' . $fileext . '.db') || die "Can't open signature file"; @siggenes = @$siggenes_ref; } my @input = ParseFile($filename_to_compare); my @inputsig = ExtractSig(\@input,\@siggenes, "Probeset ID"); my ($grade, $count) = CalcCorrelation(\@siggenes, \@inputsig, $horcol,$col_to_compare_name, "Probeset ID", $fileext); return ($grade, $count); } sub CalcCorrelation { my ($siggenes_ref, $inputsig_ref, $sigcol, $inputcol, $idcol, $fileext) = @_; my @siggenes = @$siggenes_ref; my @inputsig = @$inputsig_ref; my $sigtest; my $ID = $idcol; my $GENE_TITLE = "Gene Title"; my $LOCUS = "locus"; my $hormongrade = 0; my $i =0; my $dotproduct=0; my $length =0; my $proj_length = 0; my $countsiggenes =0; my $countfinal=0; my $pval_index = 1; my $total_insig_size = $#inputsig; my $file_sig_log_name = "logs/${experiment_name}-${fileext}.csv"; #start of FDR if($FDR_CORRECTION) { my @sorted_pval_input_sig = sort { my %a = %$a; my %b = %$b; $a{$pvalcol} <=> $b{$pvalcol} } @inputsig; # FDR correction for each p val of the input file for my $row (@sorted_pval_input_sig) { $$row{$pvalcol}*=($total_insig_size/$pval_index); $pval_index++; } @inputsig = sort { my %a = %$a; my %b = %$b; $a{$idcol} cmp $b{$idcol} } @sorted_pval_input_sig; } #end of FDR open FILE, ">$file_sig_log_name" or die "Failed to open logfile"; for my $row (@siggenes) { if (defined($inputsig[$i]->{$idcol})) { if($row->{$idcol} eq $inputsig[$i]->{$idcol}) { my $sigval = $row->{$sigcol}; my $inputval = $inputsig[$i]->{$inputcol}; my $inputpval = $inputsig[$i]->{$pvalcol}; $countsiggenes++; #throw away values from input sig. that have pvalue greater than max allowed if(($DONT_CONSIDER_PVALUES_IN_CALC)||$inputpval < $maxpval) { $dotproduct += $sigval * $inputval; # v*u (dot product) $length += $sigval**2; # |v|**2 $proj_length += $inputval**2; # |u|**2 $countfinal++; # Add taken genes to the log file $row->{$LOCUS}||="--"; print FILE "$row->{$ID}, $row->{$LOCUS}, $row->{$GENE_TITLE}\n"; } $i++; } } else { $i++; } # end if defined } close FILE; # print "\n"."signature size ". "$countsiggenes"; #print "\n"."final number of genes compared "."$countfinal"; if (($countfinal!=0) && ($length!=0) && ($proj_length!=0)) { $length=sqrt($length); # |v| $proj_length = sqrt($proj_length); # |u| #Compute cos of the angle between u and v $hormongrade = ($dotproduct/$length)/$proj_length; # (v*u)/(|v||u|) #To get the angle take acos($hormongrade) } else { $hormongrade = "N/A"; } return ($hormongrade, $countfinal); } sub ExtractSig { my($ref1, $ref2, $colname) = @_; my @data = @$ref1; my @sig = @$ref2; my @inputsig; my $i=0; while (my $item = shift(@sig)) { for my $row (@data) { if ($item->{$colname} eq $row->{$colname}) { $inputsig[$i] = $row; last; } } $i++; } return @inputsig; } #parse the file. #we store the file in the following data structure: # ARRAY[HASH{TITLE->VALUE}] sub ParseFile { my ($filename) = @_; open (ANOVAFILE, $filename) || die "\nFile not found $filename\n"; my $caption; my @data; my @caption_items; my $line; my $line_num = 0; $caption =; @caption_items = split(/,/,$caption); chomp(@caption_items); while ($line = ) { # strip off all trailing white spaces $line =~ s/\s+$//; my @line_items = split(/,/,$line); chomp (@line_items); my %rowItems; my $i = 0; foreach my $capt (@caption_items) { # strip off all trailing white spaces $line_items[$i] =~ s/\s+$//; $rowItems{$capt} = $line_items[$i]; $i++; } $data[$line_num++] = \%rowItems; } close(ANOVAFILE); return @data; } sub SortDataBy { my($dataref, $colname) = @_; my @arr = @$dataref; my @sortedarr; @sortedarr = sort { my %a = %$a; my %b = %$b; abs($a{$colname}) <=> abs($b{$colname}) } @arr; @arr = reverse @sortedarr; return @arr; } sub ChooseSigGenes { my($dataref, @cols) = @_; my @data = @$dataref; my $pvalcol = $cols[$#cols-1]; my $idcol = $cols[0]; my @arr; my $i=0; for my $row (@data) { if($i>=$NUM_OF_GENES_IN_SIG) #break if we have enough genes { last; } # Check pvalues only of defined. if(($DONT_CONSIDER_PVALUES_IN_SIG) || ($row->{$pvalcol} < $maxpval)) { my %temphash; for my $col (@cols) { $temphash{$col} = $row->{$col}; } $arr[$i] = \%temphash; $i++; } } if($FDR_CORRECTION) { return sort { my %a = %$a; my %b = %$b; $a{$idcol} cmp $b{$idcol} } @arr; } else { return @arr; } } ########################################################################## # getArgs() # gets and validates the user's parameters sub getArgs() { my $getopts; my $sHelpMsg = "patternanalyzer.pl --input file used for comparison \n" . " --out output file\n" . " --exp instruction file\n" . " --sig signature file\n" . " --rec recompute signature\n\n" . " --input,\n" . " -i \t Specifies the location of the file that contains the\n" . "\t genes data.\n" . " --out,\n" . " -o \t Specifies the output file name and path.\n" . " --exp,\n" . " -e \t Specifies the experiment list file name and path.\n" . " [--sig,\n" . " -s] \t Specifies the signature file name and path.\n" . " [--rec | --norec]\n" . " \t Recompute signature(default: NO).\n"; # get command line arguments: # --input array data file used for comparison # --out output file name and path # [--rec | --norec] recompute signature? YES or NO (default: NO) $getopts = GetOptions(\%args, "help", "rec!", "input=s", "sig=s", "exp=s", "out=s"); if(!$getopts) { print "$sHelpMsg"; return 0; } if($args{help}) { print "$sHelpMsg"; exit(0); } if (defined $args{rec}) { $recomputeSig = $args{rec} ? "yes" : "no"; } else { $recomputeSig = "no"; } if($args{input}) { #input $filename_to_compare = $args{input}; # array data } else { return 0; } if($args{out}) { # output file $output_file = $args{out}; } else { return 0; } if($args{exp}) { # experiment file $exp_file = $args{exp}; } else { return 0; } if($args{sig}) { $filename_sig = $args{sig}; # signature file } else { $filename_sig = "allAnovas.csv"; } return 1; } ########################################################################## # openFile($fileName) # open the file $fileName and return a file handle sub openFile($) { my ($fileName) = @_; my $fh; unless(open($fh, $fileName)) { if($fileName =~ /^>/) { print STDERR "Failed to create file $fileName\n"; } else { print STDERR "Failed to open file $fileName\n"; } exit -1; } return $fh; }