5adcf6bc2904690de7b7b30a83ec8a7a0996abe9
galt
  Tue Aug 21 00:01:25 2018 -0700
changing cse subdomain to soe

diff --git src/utils/qa/gc-stats.pl src/utils/qa/gc-stats.pl
index b4fa475..fdb4ab0 100755
--- src/utils/qa/gc-stats.pl
+++ src/utils/qa/gc-stats.pl
@@ -1,457 +1,457 @@
 #!/usr/local/bin/perl5
 
 # Program: gc-stats.pl
 # Purpose: parses output of gene-check script into summary statistics
-# Test Protocal: http://genecats.cse.ucsc.edu/qa/test-protocols/gene-check.protocol.html
-# Summary results: http://genecats.cse.ucsc.edu/qa/test-results/gene-check.results.html
+# Test Protocal: http://genecats.soe.ucsc.edu/qa/test-protocols/gene-check.protocol.html
+# Summary results: http://genecats.soe.ucsc.edu/qa/test-results/gene-check.results.html
 # Author: Jennifer Jackson
 # Int Date: 2005-11-14
 # Rev Date: 2005-11-XX
 
 
 $documentation = "See test protocol documentation
-        http://genecats.cse.ucsc.edu/qa/test-protocols/gene-check.protocol.html";
+        http://genecats.soe.ucsc.edu/qa/test-protocols/gene-check.protocol.html";
 
 if ($#ARGV != 2) { die
 "\nUSAGE: $0 <genePreds_table> <gc-details> <gc-summary>
         Parses output of gene-check script into summary statistics
         Output <gc-stats> and <gc-report> are generated by program 
         STDERR is sent to screen
         <genePreds_table> file name format: pred_table.db.server.YYMMDD
 	<gc-details>      file name format: pred_table.db.server.YYMMDD.gc-details
         <gc-summary>      file name format: pred_table.db.server.YYMMDD.gc-summary
         <gc-stats>        file name format: pred_table.db.server.YYMMDD.gc-stats
         <gc-report>       file name format: pred_table.db.server.YYMMDD.gc-report
 	\nWhen complete, do the following to save your results ......	
 	1)Save the <gc-report> file in report directory:
         /usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.results 
         2)Add the html contents of <gc-stats> into results file:
 	/usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.results.html
         3)Keep working files until QA complete in work directory, then delete:
         /cluster/store9/qa/gene-check
         $documentation\n\n";}
 
 # State the program has started
 print STDERR "PROCESSING: $0 program started, verifing input files\n";
 
 # get input file names     
 $in_preds   = $ARGV[0];
 $in_detail  = $ARGV[1];
 $in_summ    = $ARGV[2];
 
 # attempt to open and die if cannot be found/read
 open(PREDS,"$in_preds")   || die "ERROR:  Cannot open file $in_preds\n";
 open(DETAIL,"$in_detail") || die "ERROR:  Cannot open file $in_detail\n";
 open(SUMM,"$in_summ")     || die "ERROR:  Cannot open file $in_summ\n";
 
 # verify file names of infiles
 # die if database & server are not the same between all three
 # warn if dates are different
 
 # check genePreds_table input file name format
 (@name_pred) = split(/\./,$in_preds);
 if ($#name_pred != 3) { 
   die "ERROR:  $in_preds file name not formatted correctly
         <genePreds_table> file name format: pred_table.db.server.YYMMDD 
 	$documentation\n"; 
 }
 $pred_table  = $name_pred[0];
 $pred_db     = $name_pred[1];
 $pred_server = $name_pred[2]; 
 $pred_date   = $name_pred[3]; 
 
 # check gc-details input file name format
 (@name_detail) = split(/\./,$in_detail);
 if ($#name_detail != 4) {
   die "ERROR:  $in_detail file name not formatted correctly
         <gc-details> file name format: pred_table.db.server.YYMMDD.gc-details 
 	$documentation\n"; 
 }
 $detail_table  = $name_detail[0];
 $detail_file   = $name_detail[4];
 if ($detail_file ne "gc-details") {
   die "ERROR:  $in_detail file name not formatted correctly or in wrong order
         USAGE:  $0 <genePreds_table> <gc-details> <gc-summary>
         <gc-details> file name format: pred_table.db.server.YYMMDD.gc-details
         $documentation\n";
 }
 
 # check gc-summary input file name format
 (@name_summ) = split(/\./,$in_summ);
 if ($#name_summ != 4) {
   die "ERROR:  $in_summ file name not formatted correctly
        	<gc-summary> file name format: pred_table.db.server.YYMMDD.gc-summary 
 	$documentation\n"; 
 }
 $summ_table  = $name_summ[0];
 $summ_file   = $name_summ[4];
 if ($summ_file ne "gc-summary") {
   die "ERROR:  $in_summ file name not formatted correctly or in wrong order
 	USAGE:  $0 <genePreds_table> <gc-details> <gc-summary>
         <gc-summary> file name format: pred_table.db.server.YYMMDD.gc-summary
         $documentation\n";
 }
 
 # State that files are readable
 print STDERR "PROCESSING: input files identified and readable\n"; 
 
 # check that pred_table name is same on all files
 if (("$pred_table" eq "$detail_table") && ("$detail_table" eq "$summ_table")) {
    print STDERR "PROCESSING: genePreds_table same between all input files
 PROCESSING: finished checking format; statistics being generated\n";
 }  else {
    die "ERROR:  All files must be from same genePreds_table
         <genePreds_table> file name format: pred_table.db.server.YYMMDD
         <gc-details>      file name format: pred_table.db.server.YYMMDD.gc-details
         <gc-summary>      file name format: pred_table.db.server.YYMMDD.gc-summary
 	$documentation\n";
 }
 
 # open outfiles for writing results
 $stat_file = "$pred_table.$pred_db.$pred_server.$pred_date.gc-stats";
 $report_file  = "$pred_table.$pred_db.$pred_server.$pred_date.gc-report";
 open STAT, ">$stat_file";
 open REPORT,  ">$report_file";
 print STAT   "$0 gc-stats file for $in_preds\n\n";
 print STAT   "Paste the html formated results below into file:\n"; 
 print STAT   "/usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.results.html\n\n";
 print REPORT "$0 gc-report file for $in_preds\n\n";
 print REPORT "Save file at /usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.reports\n\n";
 
 
 # in preds, count up ids & preds per id
 $preds_id_all  = 0;
 $preds_id_uniq = 0;
 $preds_id_max  = 1;
 $preds_id_name = "none";
 while(<PREDS>) {
  chomp;
  $preds_id_all++; 
  (@content_preds) = split(/\t/);
 
  if (exists $preds_count_id{$content_preds[0]}) {
     $preds_count_id{$content_preds[0]}++;
     if ($preds_count_id{$content_preds[0]} > $preds_id_max) {
 	$preds_id_max  = $preds_count_id{$content_preds[0]};
 	$preds_id_name = $content_preds[0];
     }
  } else {
     $preds_count_id{$content_preds[0]} = 1;
     $preds_id_uniq++;
  }
 
 }
 
 print REPORT "$in_preds contains gene prediction table from database
 	$preds_id_uniq unique sequences have $preds_id_all gene predictions
 	the sequence $preds_id_name has the most gene predictions: $preds_id_max predictions\n\n";
 
 # in summary, count up ids & uniq preds & errors per uniq preds
 $summ_id_uniq = 0;
 $summ_id_max = 1;
 $summ_id_name = "none";
 $summ_longid_all = 0;
 $summ_longid_causes_max = 1;
 $summ_longid_causes_name = "none";
 while(<SUMM>) {
  chomp;
  $summ_longid_all++;
 # print STDERR "longid $summ_longid_all \n";
  unless ($summ_longid_all <= 2) {
  ($summ_id,$chr,$chrs,$chre,$strand,$stat,$frame,$start,$stop,$orfStop,$cdsGap,$utrGap,$cdsSplice,$utrSplice,$causes) = split(/\t/);
  $summ_longid = "$summ_id.$chr.$chrs.$chre";
 
  (@array_causes) = split(/\,/,$causes);
 
  $summ_causes_longid{$summ_longid} = ($#array_causes + 1);
 
     if ($summ_causes_longid{$summ_longid} > $summ_longid_causes_max) {
         $summ_longid_causes_max  = $summ_causes_longid{$summ_longid};
         $summ_longid_causes_name = $summ_longid;
     }
  if (exists $summ_count_id{$summ_id}) {
     $summ_count_id{$summ_id}++;
    if ($summ_count_id{$summ_id} > $summ_id_max) {
        $summ_id_max  = $summ_count_id{$summ_id};
        $summ_id_name = $summ_id;
    }
  } else {
     $summ_count_id{$summ_id} = 1;
     $summ_id_uniq++;
  }
  }
 }
 
 $prob_preds_all = ($summ_longid_all - 2);
 
 print REPORT "$in_summ contains sequences with 1 or more gene-check exceptions
 	$summ_id_uniq unique sequences have $prob_preds_all predictions with exceptions
 	the sequence $summ_id_name has the most predictions with exceptions: $summ_id_max predictions
 	the prediction $summ_longid_causes_name has the most exception types: $summ_longid_causes_max types\n\n";
 
 # compare total predictions with those that have exceptions by id
 # %summ_count_id contains the ids with # predictions with exceptions
 # %preds_count_id contains the ids with # all predictions
 $all_pass = 0;
 $all_fail = 0;
 $some_pass_fail = 0;
 
 foreach $common_id (keys %preds_count_id) {
 
  if (exists $summ_count_id{$common_id}) {
 
   if ($summ_count_id{$common_id} == $preds_count_id{$common_id}) { $all_fail++; }
   if ($summ_count_id{$common_id} <  $preds_count_id{$common_id}) { $some_pass_fail++; }
  
  } else {
    $all_pass++;
  }
 }
 
 
 
 # in details, count up uniq ids and types errors per id
 # counting variables for total errors of each type and
 # number of unique IDs associated have each exception in gc-stat
 # total number of each type of error in file in gc-report
 $detail_id_uniq   = 0;
 $detail_lines_all = 0;
 $detail_gap_all   = 0;
 $detail_gap_id    = 0;
 $detail_badCdsSplice_all = 0;
 $detail_badCdsSplice_id = 0;
 $detail_badUtrSplice_all = 0;
 $detail_badUtrSplice_id = 0;
 $detail_noCDS_all = 0;
 $detail_noCDS_id = 0;
 $detail_noStart_all = 0;
 $detail_noStart_id = 0;
 $detail_noStop_all = 0;
 $detail_noStop_id = 0;
 $detail_inFrameStop_all = 0;
 $detail_inFrameStop_id = 0;
 $detail_frameDiscontig_all = 0;
 $detail_frameDiscontig_id = 0;
 $detail_frameMismatch_all = 0;
 $detail_frameMismatch_id = 0;
 $detail_badFrame_all = 0;
 $detail_badFrame_id = 0;
 $detail_other_all = 0;
 $detail_other_id = 0;
 
 
 # parse file and run through error type fields
 
 while(<DETAIL>) {
  chomp;
   $detail_lines_all++;
   unless ($detail_lines_all < 2) {
    ($detail_id,$type,$info,$exception,$coords) = split(/\t/);
   }
   unless (exists $detail_id_uniq_count{$detail_id}) {
     $detail_id_uniq_count{$detail_id} = "1";
     $detail_id_uniq++;
   } 
 
  if ($type eq "gap") { 
    $detail_gap_all++;
    unless (exists $detail_gap_uniq_count{$detail_id}) {
     $detail_gap_uniq_count{$detail_id} = "1";  
     $detail_gap_id++;
    }
  }
 
  elsif ($type eq "badCdsSplice") {
    $detail_badCdsSplice_all++;
    unless (exists $detail_badCdsSplice_uniq_count{$detail_id}) {
     $detail_badCdsSplice_uniq_count{$detail_id} = "1";
     $detail_badCdsSplice_id++;
    }
  }
 
  elsif ($type eq "badUtrSplice") {
    $detail_badUtrSplice_all++;
    unless (exists $detail_badUtrSplice_uniq_count{$detail_id}) {
     $detail_badUtrSplice_uniq_count{$detail_id} = "1";
     $detail_badUtrSplice_id++;
    }
  }
 
  elsif ($type eq "noCDS") {
    $detail_noCDS_all++;
    unless (exists $detail_noCDS_uniq_count{$detail_id}) {
     $detail_noCDS_uniq_count{$detail_id} = "1";
     $detail_noCDS_id++;
    }
  }
 
  elsif ($type eq "noStart") {
    $detail_noStart_all++;
    unless (exists $detail_noStart_uniq_count{$detail_id}) {
     $detail_noStart_uniq_count{$detail_id} = "1";
     $detail_noStart_id++;
    }
  }
 
  elsif ($type eq "noStop") {
    $detail_noStop_all++;
    unless (exists $detail_noStop_uniq_count{$detail_id}) {
     $detail_noStop_uniq_count{$detail_id} = "1";
     $detail_noStop_id++;
    }
  }
 
  elsif ($type eq "inFrameStop") {
    $detail_inFrameStop_all++;
    unless (exists $detail_inFrameStop_uniq_count{$detail_id}) {
     $detail_inFrameStop_uniq_count{$detail_id} = "1";
     $detail_inFrameStop_id++;
    }
  }
 
  elsif ($type eq "frameDiscontig") {
    $detail_frameDiscontig_all++;
    unless (exists $detail_frameDiscontig_uniq_count{$detail_id}) {
     $detail_frameDiscontig_uniq_count{$detail_id} = "1";
     $detail_frameDiscontig_id++;
    }
  }
 
  elsif ($type eq "frameMismatch") {
    $detail_frameMismatch_all++;
    unless (exists $detail_frameMismatch_uniq_count{$detail_id}) {
     $detail_frameMismatch_uniq_count{$detail_id} = "1";
     $detail_frameMismatch_id++;
    }
  }
 
  elsif ($type eq "badFrame") {
    $detail_badFrame_all++;
    unless (exists $detail_badFrame_uniq_count{$detail_id}) {
     $detail_badFrame_uniq_count{$detail_id} = "1";
     $detail_badFrame_id++;
    }
  }
 
  else {
    $detail_other_all++;
    unless (exists $detail_other_uniq_count{$detail_id}) {
     $detail_other_uniq_count{$detail_id} = "1";
     $detail_other_id++;
    }
  }
 
 
 }
 # summarize seqs w/ exceptions into percentages of total seqs w/ preds
 $detail_errors_all = ($detail_lines_all - 2);
 $detail_gap_pct = (($detail_gap_id/$preds_id_uniq) * 100);
 $detail_badCdsSplice_pct = (($detail_badCdsSplice_id/$preds_id_uniq) * 100);
 $detail_badUtrSplice_pct = (($detail_badUtrSplice_id/$preds_id_uniq) * 100);
 $detail_noCDS_pct = (($detail_noCDS_id/$preds_id_uniq) * 100);
 $detail_noStart_pct = (($detail_noStart_id/$preds_id_uniq) * 100);
 $detail_noStop_pct = (($detail_noStop_id/$preds_id_uniq) * 100);
 $detail_inFrameStop_pct = (($detail_inFrameStop_id/$preds_id_uniq) * 100);
 $detail_frameDiscontig_pct = (($detail_frameDiscontig_id/$preds_id_uniq) * 100);
 $detail_frameMismatch_pct = (($detail_frameMismatch_id/$preds_id_uniq) * 100);
 $detail_badFrame_pct = (($detail_badFrame_id/$preds_id_uniq) * 100);
 $detail_other_pct = (($detail_other_id/$preds_id_uniq) * 100);
 
 # summarize preds w/ exceptions into percentage of total preds
 $prob_preds_all_pct = (($prob_preds_all/$preds_id_all) * 100);
 
 # summarize seqs w/ % w/o exceptions into percentage of total unique seqs w/ preds
 $all_pass_pct = (($all_pass/$preds_id_uniq) * 100);
 $all_fail_pct = (($all_fail/$preds_id_uniq) * 100);
 $some_pass_fail_pct = (($some_pass_fail/$preds_id_uniq) * 100);
 
 print REPORT "$in_detail contains details for exceptions, including coordinates
 	one exception per line; sequences can appear on multiple lines
 	use exception key or sequence name to grep details from file
 	$detail_id_uniq unique sequences with prediction exceptions
 	$detail_lines_all total gene prediction exceptions\n
 Gene-check exception summary statistics based on <gc-details> file content
 exception key: <total> <unique by sequence> <pct of unique sequences in genePreds file> 
 ----------------------------------------------------------------------\n";
 
 printf REPORT "gap:\t\t$detail_gap_all\t$detail_gap_id\t%3.1f %\n", "$detail_gap_pct";
 printf REPORT "badCdsSplice:\t$detail_badCdsSplice_all\t$detail_badCdsSplice_id\t%3.1f %\n", "$detail_badCdsSplice_pct";
 printf REPORT "badUtrSplice:\t$detail_badUtrSplice_all\t$detail_badUtrSplice_id\t%3.1f %\n", "$detail_badUtrSplice_pct";
 printf REPORT "noCDS:\t\t$detail_noCDS_all\t$detail_noCDS_id\t%3.1f %\n", "$detail_noCDS_pct";
 printf REPORT "noStart:\t$detail_noStart_all\t$detail_noStart_id\t%3.1f %\n", "$detail_noStart_pct";
 printf REPORT "noStop:\t\t$detail_noStop_all\t$detail_noStop_id\t%3.1f %\n", "$detail_noStop_pct";
 printf REPORT "inFrameStop:\t$detail_inFrameStop_all\t$detail_inFrameStop_id\t%3.1f %\n", "$detail_inFrameStop_pct";
 printf REPORT "frameDiscontig:\t$detail_frameDiscontig_all\t$detail_frameDiscontig_id\t%3.1f %\n", "$detail_frameDiscontig_pct";
 printf REPORT "frameMismatch:\t$detail_frameMismatch_all\t$detail_frameMismatch_id\t%3.1f %\n", "$detail_frameMismatch_pct";
 printf REPORT "badFrame:\t$detail_badFrame_all\t$detail_badFrame_id\t%3.1f %\n", "$detail_badFrame_pct";
 printf REPORT "other:\t\t$detail_other_all\t$detail_other_id\t%3.1f %\n", "$detail_other_pct";
 
 print REPORT "\n\nGlobal summary statistics for $pred_table gene prediction track
 -------------------------------------------------------------------\n";
 printf REPORT "Pct predictions with one or more gene-check exceptions: %3.1f %\n", "$prob_preds_all_pct";
 printf REPORT "Pct unique sequences with all predictions having no exceptions: %3.1f %\n", "$all_pass_pct";
 printf REPORT "Pct unique sequences with all predictions having 1 or more exceptions: %3.1f %\n", "$all_fail_pct"; 
 printf REPORT "Pct unique sequences with mixed predictions (some have exceptions, some do not): %3.1f %\n", "$some_pass_fail_pct";
 
 print REPORT "\n\n\nEND OF REPORT\n\n";
 
 print STDERR "PROCESSING: outfile <gc-report> finalized\n";
 
 # <gc-stats> output file variables
 # 1  db.genePred	$pred_db.$pred_table
 # 2  date		$pred_date
 # 3  server		$pred_server
 # 4  uniqSeqs		$preds_id_uniq
 # 5  allPreds		$preds_id_all
 # 6  probPreds		$prob_preds_all,$prob_preds_all_pct
 # 7  allPass		$all_pass,$all_pass_pct
 # 8  allFail		$all_fail,$all_fail_pct
 # 8  somePassFail	$some_pass_fail,$some_pass_fail_pct
 # 10 gap		$detail_gap_id,$detail_gap_pct
 # 11 badCdsSplice	$detail_badCdsSplice_id,$detail_badCdsSplice_pct
 # 12 badUtrSplice	$detail_badUtrSplice_id,$detail_badUtrSplice_pct
 # 13 noCDS		$detail_noCDS_id,$detail_noCDS_pct
 # 14 noStart		$detail_noStart_id,$detail_noStart_pct
 # 15 noStop		$detail_noStop_id,$detail_noStop_pct
 # 16 inFrameStop	$detail_inFrameStop_id,$detail_inFrameStop_pct
 # 17 frameDiscontig	$detail_frameDiscontig_id,$detail_frameDiscontig_pct
 # 18 frameMismatch 	$detail_frameMismatch_id,$detail_frameMismatch_pct
 # 19 badFrame		$detail_badFrame_id,$detail_badFrame_pct
 # 20 other		$detail_other_id,$detail_other_pct
 
 
 # write out html format output to <gc-stats>
 
 
 print  STAT "\n\n";
 print  STAT "<tr>\n";
 print  STAT "<td>$pred_db.$pred_table</td>\n";
 print  STAT "<td>$pred_date</td>\n";
 print  STAT "<td>$pred_server</td>\n";
 print  STAT "<td>$preds_id_uniq</td>\n";
 print  STAT "<td>$preds_id_all</td>\n";
 printf STAT "<td>$prob_preds_all(%3.1f%)</td>\n", "$prob_preds_all_pct";
 printf STAT "<td>$all_pass(%3.1f%)</td>\n", "$all_pass_pct";
 printf STAT "<td>$all_fail(%3.1f%)</td>\n", "$all_fail_pct";
 printf STAT "<td>$some_pass_fail(%3.1f%)</td>\n", "$some_pass_fail_pct";
 printf STAT "<td>$detail_gap_id(%3.1f%)</td>\n", "$detail_gap_pct";
 printf STAT "<td>$detail_badCdsSplice_id(%3.1f%)</td>\n", "$detail_badCdsSplice_pct";
 printf STAT "<td>$detail_badUtrSplice_id(%3.1f%)</td>\n", "$detail_badUtrSplice_pct";
 printf STAT "<td>$detail_noCDS_id(%3.1f%)</td>\n", "$detail_noCDS_pct";
 printf STAT "<td>$detail_noStart_id(%3.1f%)</td>\n", "$detail_noStart_pct";
 printf STAT "<td>$detail_noStop_id(%3.1f%)</td>\n", "$detail_noStop_pct";
 printf STAT "<td>$detail_inFrameStop_id(%3.1f%)</td>\n", "$detail_inFrameStop_pct";
 printf STAT "<td>$detail_frameDiscontig_id(%3.1f%)</td>\n", "$detail_frameDiscontig_pct";
 printf STAT "<td>$detail_frameMismatch_id(%3.1f%)</td>\n", "$detail_frameMismatch_pct";
 printf STAT "<td>$detail_badFrame_id(%3.1f%)</td>\n", "$detail_badFrame_pct";
 printf STAT "<td>$detail_other_id(%3.1f%)</td>\n", "$detail_other_pct";
 print  STAT "</tr>\n";
 print  STAT "\n\n\nEND OF REPORT\n\n";
 
 
 
 
 print STDERR "PROCESSING: outfile <gc-stats> finalized\n";
 print STDERR "PROCESSING: $0 program finished witout errors!\n";