src/utils/qa/gc-stats.pl 1.2
1.2 2010/05/21 23:52:35 vanessa
changed the path names for the htdoc reorganization to htdoc-genecats and htdoc-hgdownload and changed the appropriate urls
Index: src/utils/qa/gc-stats.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/qa/gc-stats.pl,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/utils/qa/gc-stats.pl 17 Mar 2005 23:58:30 -0000 1.1
+++ src/utils/qa/gc-stats.pl 21 May 2010 23:52:35 -0000 1.2
@@ -1,457 +1,457 @@
#!/usr/local/bin/perl5
# Program: gc-stats.pl
# Purpose: parses output of gene-check script into summary statistics
-# Test Protocal: http://hgwdev.cse.ucsc.edu/qa/test-protocols/gene-check.protocol.html
-# Summary results: http://hgwdev.cse.ucsc.edu/qa/test-results/gene-check.results.html
+# Test Protocal: http://genecats.cse.ucsc.edu/qa/test-protocols/gene-check.protocol.html
+# Summary results: http://genecats.cse.ucsc.edu/qa/test-results/gene-check.results.html
# Author: Jennifer Jackson
# Int Date: 2005-11-14
# Rev Date: 2005-11-XX
$documentation = "See test protocol documentation
- http://hgwdev.cse.ucsc.edu/qa/test-protocols/gene-check.protocol.html";
+ http://genecats.cse.ucsc.edu/qa/test-protocols/gene-check.protocol.html";
if ($#ARGV != 2) { die
"\nUSAGE: $0 <genePreds_table> <gc-details> <gc-summary>
Parses output of gene-check script into summary statistics
Output <gc-stats> and <gc-report> are generated by program
STDERR is sent to screen
<genePreds_table> file name format: pred_table.db.server.YYMMDD
<gc-details> file name format: pred_table.db.server.YYMMDD.gc-details
<gc-summary> file name format: pred_table.db.server.YYMMDD.gc-summary
<gc-stats> file name format: pred_table.db.server.YYMMDD.gc-stats
<gc-report> file name format: pred_table.db.server.YYMMDD.gc-report
\nWhen complete, do the following to save your results ......
1)Save the <gc-report> file in report directory:
- /usr/local/apache/htdocs/qa/test-results/gc-stats.results
+ /usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.results
2)Add the html contents of <gc-stats> into results file:
- /usr/local/apache/htdocs/qa/test-results/gc-stats.results.html
+ /usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.results.html
3)Keep working files until QA complete in work directory, then delete:
/cluster/store9/qa/gene-check
$documentation\n\n";}
# State the program has started
print STDERR "PROCESSING: $0 program started, verifing input files\n";
# get input file names
$in_preds = $ARGV[0];
$in_detail = $ARGV[1];
$in_summ = $ARGV[2];
# attempt to open and die if cannot be found/read
open(PREDS,"$in_preds") || die "ERROR: Cannot open file $in_preds\n";
open(DETAIL,"$in_detail") || die "ERROR: Cannot open file $in_detail\n";
open(SUMM,"$in_summ") || die "ERROR: Cannot open file $in_summ\n";
# verify file names of infiles
# die if database & server are not the same between all three
# warn if dates are different
# check genePreds_table input file name format
(@name_pred) = split(/\./,$in_preds);
if ($#name_pred != 3) {
die "ERROR: $in_preds file name not formatted correctly
<genePreds_table> file name format: pred_table.db.server.YYMMDD
$documentation\n";
}
$pred_table = $name_pred[0];
$pred_db = $name_pred[1];
$pred_server = $name_pred[2];
$pred_date = $name_pred[3];
# check gc-details input file name format
(@name_detail) = split(/\./,$in_detail);
if ($#name_detail != 4) {
die "ERROR: $in_detail file name not formatted correctly
<gc-details> file name format: pred_table.db.server.YYMMDD.gc-details
$documentation\n";
}
$detail_table = $name_detail[0];
$detail_file = $name_detail[4];
if ($detail_file ne "gc-details") {
die "ERROR: $in_detail file name not formatted correctly or in wrong order
USAGE: $0 <genePreds_table> <gc-details> <gc-summary>
<gc-details> file name format: pred_table.db.server.YYMMDD.gc-details
$documentation\n";
}
# check gc-summary input file name format
(@name_summ) = split(/\./,$in_summ);
if ($#name_summ != 4) {
die "ERROR: $in_summ file name not formatted correctly
<gc-summary> file name format: pred_table.db.server.YYMMDD.gc-summary
$documentation\n";
}
$summ_table = $name_summ[0];
$summ_file = $name_summ[4];
if ($summ_file ne "gc-summary") {
die "ERROR: $in_summ file name not formatted correctly or in wrong order
USAGE: $0 <genePreds_table> <gc-details> <gc-summary>
<gc-summary> file name format: pred_table.db.server.YYMMDD.gc-summary
$documentation\n";
}
# State that files are readable
print STDERR "PROCESSING: input files identified and readable\n";
# check that pred_table name is same on all files
if (("$pred_table" eq "$detail_table") && ("$detail_table" eq "$summ_table")) {
print STDERR "PROCESSING: genePreds_table same between all input files
PROCESSING: finished checking format; statistics being generated\n";
} else {
die "ERROR: All files must be from same genePreds_table
<genePreds_table> file name format: pred_table.db.server.YYMMDD
<gc-details> file name format: pred_table.db.server.YYMMDD.gc-details
<gc-summary> file name format: pred_table.db.server.YYMMDD.gc-summary
$documentation\n";
}
# open outfiles for writing results
$stat_file = "$pred_table.$pred_db.$pred_server.$pred_date.gc-stats";
$report_file = "$pred_table.$pred_db.$pred_server.$pred_date.gc-report";
open STAT, ">$stat_file";
open REPORT, ">$report_file";
print STAT "$0 gc-stats file for $in_preds\n\n";
print STAT "Paste the html formated results below into file:\n";
-print STAT "/usr/local/apache/htdocs/qa/test-results/gc-stats.results.html\n\n";
+print STAT "/usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.results.html\n\n";
print REPORT "$0 gc-report file for $in_preds\n\n";
-print REPORT "Save file at /usr/local/apache/htdocs/qa/test-results/gc-stats.reports\n\n";
+print REPORT "Save file at /usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.reports\n\n";
# in preds, count up ids & preds per id
$preds_id_all = 0;
$preds_id_uniq = 0;
$preds_id_max = 1;
$preds_id_name = "none";
while(<PREDS>) {
chomp;
$preds_id_all++;
(@content_preds) = split(/\t/);
if (exists $preds_count_id{$content_preds[0]}) {
$preds_count_id{$content_preds[0]}++;
if ($preds_count_id{$content_preds[0]} > $preds_id_max) {
$preds_id_max = $preds_count_id{$content_preds[0]};
$preds_id_name = $content_preds[0];
}
} else {
$preds_count_id{$content_preds[0]} = 1;
$preds_id_uniq++;
}
}
print REPORT "$in_preds contains gene prediction table from database
$preds_id_uniq unique sequences have $preds_id_all gene predictions
the sequence $preds_id_name has the most gene predictions: $preds_id_max predictions\n\n";
# in summary, count up ids & uniq preds & errors per uniq preds
$summ_id_uniq = 0;
$summ_id_max = 1;
$summ_id_name = "none";
$summ_longid_all = 0;
$summ_longid_causes_max = 1;
$summ_longid_causes_name = "none";
while(<SUMM>) {
chomp;
$summ_longid_all++;
# print STDERR "longid $summ_longid_all \n";
unless ($summ_longid_all <= 2) {
($summ_id,$chr,$chrs,$chre,$strand,$stat,$frame,$start,$stop,$orfStop,$cdsGap,$utrGap,$cdsSplice,$utrSplice,$causes) = split(/\t/);
$summ_longid = "$summ_id.$chr.$chrs.$chre";
(@array_causes) = split(/\,/,$causes);
$summ_causes_longid{$summ_longid} = ($#array_causes + 1);
if ($summ_causes_longid{$summ_longid} > $summ_longid_causes_max) {
$summ_longid_causes_max = $summ_causes_longid{$summ_longid};
$summ_longid_causes_name = $summ_longid;
}
if (exists $summ_count_id{$summ_id}) {
$summ_count_id{$summ_id}++;
if ($summ_count_id{$summ_id} > $summ_id_max) {
$summ_id_max = $summ_count_id{$summ_id};
$summ_id_name = $summ_id;
}
} else {
$summ_count_id{$summ_id} = 1;
$summ_id_uniq++;
}
}
}
$prob_preds_all = ($summ_longid_all - 2);
print REPORT "$in_summ contains sequences with 1 or more gene-check exceptions
$summ_id_uniq unique sequences have $prob_preds_all predictions with exceptions
the sequence $summ_id_name has the most predictions with exceptions: $summ_id_max predictions
the prediction $summ_longid_causes_name has the most exception types: $summ_longid_causes_max types\n\n";
# compare total predictions with those that have exceptions by id
# %summ_count_id contains the ids with # predictions with exceptions
# %preds_count_id contains the ids with # all predictions
$all_pass = 0;
$all_fail = 0;
$some_pass_fail = 0;
foreach $common_id (keys %preds_count_id) {
if (exists $summ_count_id{$common_id}) {
if ($summ_count_id{$common_id} == $preds_count_id{$common_id}) { $all_fail++; }
if ($summ_count_id{$common_id} < $preds_count_id{$common_id}) { $some_pass_fail++; }
} else {
$all_pass++;
}
}
# in details, count up uniq ids and types errors per id
# counting variables for total errors of each type and
# number of unique IDs associated have each exception in gc-stat
# total number of each type of error in file in gc-report
$detail_id_uniq = 0;
$detail_lines_all = 0;
$detail_gap_all = 0;
$detail_gap_id = 0;
$detail_badCdsSplice_all = 0;
$detail_badCdsSplice_id = 0;
$detail_badUtrSplice_all = 0;
$detail_badUtrSplice_id = 0;
$detail_noCDS_all = 0;
$detail_noCDS_id = 0;
$detail_noStart_all = 0;
$detail_noStart_id = 0;
$detail_noStop_all = 0;
$detail_noStop_id = 0;
$detail_inFrameStop_all = 0;
$detail_inFrameStop_id = 0;
$detail_frameDiscontig_all = 0;
$detail_frameDiscontig_id = 0;
$detail_frameMismatch_all = 0;
$detail_frameMismatch_id = 0;
$detail_badFrame_all = 0;
$detail_badFrame_id = 0;
$detail_other_all = 0;
$detail_other_id = 0;
# parse file and run through error type fields
while(<DETAIL>) {
chomp;
$detail_lines_all++;
unless ($detail_lines_all < 2) {
($detail_id,$type,$info,$exception,$coords) = split(/\t/);
}
unless (exists $detail_id_uniq_count{$detail_id}) {
$detail_id_uniq_count{$detail_id} = "1";
$detail_id_uniq++;
}
if ($type eq "gap") {
$detail_gap_all++;
unless (exists $detail_gap_uniq_count{$detail_id}) {
$detail_gap_uniq_count{$detail_id} = "1";
$detail_gap_id++;
}
}
elsif ($type eq "badCdsSplice") {
$detail_badCdsSplice_all++;
unless (exists $detail_badCdsSplice_uniq_count{$detail_id}) {
$detail_badCdsSplice_uniq_count{$detail_id} = "1";
$detail_badCdsSplice_id++;
}
}
elsif ($type eq "badUtrSplice") {
$detail_badUtrSplice_all++;
unless (exists $detail_badUtrSplice_uniq_count{$detail_id}) {
$detail_badUtrSplice_uniq_count{$detail_id} = "1";
$detail_badUtrSplice_id++;
}
}
elsif ($type eq "noCDS") {
$detail_noCDS_all++;
unless (exists $detail_noCDS_uniq_count{$detail_id}) {
$detail_noCDS_uniq_count{$detail_id} = "1";
$detail_noCDS_id++;
}
}
elsif ($type eq "noStart") {
$detail_noStart_all++;
unless (exists $detail_noStart_uniq_count{$detail_id}) {
$detail_noStart_uniq_count{$detail_id} = "1";
$detail_noStart_id++;
}
}
elsif ($type eq "noStop") {
$detail_noStop_all++;
unless (exists $detail_noStop_uniq_count{$detail_id}) {
$detail_noStop_uniq_count{$detail_id} = "1";
$detail_noStop_id++;
}
}
elsif ($type eq "inFrameStop") {
$detail_inFrameStop_all++;
unless (exists $detail_inFrameStop_uniq_count{$detail_id}) {
$detail_inFrameStop_uniq_count{$detail_id} = "1";
$detail_inFrameStop_id++;
}
}
elsif ($type eq "frameDiscontig") {
$detail_frameDiscontig_all++;
unless (exists $detail_frameDiscontig_uniq_count{$detail_id}) {
$detail_frameDiscontig_uniq_count{$detail_id} = "1";
$detail_frameDiscontig_id++;
}
}
elsif ($type eq "frameMismatch") {
$detail_frameMismatch_all++;
unless (exists $detail_frameMismatch_uniq_count{$detail_id}) {
$detail_frameMismatch_uniq_count{$detail_id} = "1";
$detail_frameMismatch_id++;
}
}
elsif ($type eq "badFrame") {
$detail_badFrame_all++;
unless (exists $detail_badFrame_uniq_count{$detail_id}) {
$detail_badFrame_uniq_count{$detail_id} = "1";
$detail_badFrame_id++;
}
}
else {
$detail_other_all++;
unless (exists $detail_other_uniq_count{$detail_id}) {
$detail_other_uniq_count{$detail_id} = "1";
$detail_other_id++;
}
}
}
# summarize seqs w/ exceptions into percentages of total seqs w/ preds
$detail_errors_all = ($detail_lines_all - 2);
$detail_gap_pct = (($detail_gap_id/$preds_id_uniq) * 100);
$detail_badCdsSplice_pct = (($detail_badCdsSplice_id/$preds_id_uniq) * 100);
$detail_badUtrSplice_pct = (($detail_badUtrSplice_id/$preds_id_uniq) * 100);
$detail_noCDS_pct = (($detail_noCDS_id/$preds_id_uniq) * 100);
$detail_noStart_pct = (($detail_noStart_id/$preds_id_uniq) * 100);
$detail_noStop_pct = (($detail_noStop_id/$preds_id_uniq) * 100);
$detail_inFrameStop_pct = (($detail_inFrameStop_id/$preds_id_uniq) * 100);
$detail_frameDiscontig_pct = (($detail_frameDiscontig_id/$preds_id_uniq) * 100);
$detail_frameMismatch_pct = (($detail_frameMismatch_id/$preds_id_uniq) * 100);
$detail_badFrame_pct = (($detail_badFrame_id/$preds_id_uniq) * 100);
$detail_other_pct = (($detail_other_id/$preds_id_uniq) * 100);
# summarize preds w/ exceptions into percentage of total preds
$prob_preds_all_pct = (($prob_preds_all/$preds_id_all) * 100);
# summarize seqs w/ % w/o exceptions into percentage of total unique seqs w/ preds
$all_pass_pct = (($all_pass/$preds_id_uniq) * 100);
$all_fail_pct = (($all_fail/$preds_id_uniq) * 100);
$some_pass_fail_pct = (($some_pass_fail/$preds_id_uniq) * 100);
print REPORT "$in_detail contains details for exceptions, including coordinates
one exception per line; sequences can appear on multiple lines
use exception key or sequence name to grep details from file
$detail_id_uniq unique sequences with prediction exceptions
$detail_lines_all total gene prediction exceptions\n
Gene-check exception summary statistics based on <gc-details> file content
exception key: <total> <unique by sequence> <pct of unique sequences in genePreds file>
----------------------------------------------------------------------\n";
printf REPORT "gap:\t\t$detail_gap_all\t$detail_gap_id\t%3.1f %\n", "$detail_gap_pct";
printf REPORT "badCdsSplice:\t$detail_badCdsSplice_all\t$detail_badCdsSplice_id\t%3.1f %\n", "$detail_badCdsSplice_pct";
printf REPORT "badUtrSplice:\t$detail_badUtrSplice_all\t$detail_badUtrSplice_id\t%3.1f %\n", "$detail_badUtrSplice_pct";
printf REPORT "noCDS:\t\t$detail_noCDS_all\t$detail_noCDS_id\t%3.1f %\n", "$detail_noCDS_pct";
printf REPORT "noStart:\t$detail_noStart_all\t$detail_noStart_id\t%3.1f %\n", "$detail_noStart_pct";
printf REPORT "noStop:\t\t$detail_noStop_all\t$detail_noStop_id\t%3.1f %\n", "$detail_noStop_pct";
printf REPORT "inFrameStop:\t$detail_inFrameStop_all\t$detail_inFrameStop_id\t%3.1f %\n", "$detail_inFrameStop_pct";
printf REPORT "frameDiscontig:\t$detail_frameDiscontig_all\t$detail_frameDiscontig_id\t%3.1f %\n", "$detail_frameDiscontig_pct";
printf REPORT "frameMismatch:\t$detail_frameMismatch_all\t$detail_frameMismatch_id\t%3.1f %\n", "$detail_frameMismatch_pct";
printf REPORT "badFrame:\t$detail_badFrame_all\t$detail_badFrame_id\t%3.1f %\n", "$detail_badFrame_pct";
printf REPORT "other:\t\t$detail_other_all\t$detail_other_id\t%3.1f %\n", "$detail_other_pct";
print REPORT "\n\nGlobal summary statistics for $pred_table gene prediction track
-------------------------------------------------------------------\n";
printf REPORT "Pct predictions with one or more gene-check exceptions: %3.1f %\n", "$prob_preds_all_pct";
printf REPORT "Pct unique sequences with all predictions having no exceptions: %3.1f %\n", "$all_pass_pct";
printf REPORT "Pct unique sequences with all predictions having 1 or more exceptions: %3.1f %\n", "$all_fail_pct";
printf REPORT "Pct unique sequences with mixed predictions (some have exceptions, some do not): %3.1f %\n", "$some_pass_fail_pct";
print REPORT "\n\n\nEND OF REPORT\n\n";
print STDERR "PROCESSING: outfile <gc-report> finalized\n";
# <gc-stats> output file variables
# 1 db.genePred $pred_db.$pred_table
# 2 date $pred_date
# 3 server $pred_server
# 4 uniqSeqs $preds_id_uniq
# 5 allPreds $preds_id_all
# 6 probPreds $prob_preds_all,$prob_preds_all_pct
# 7 allPass $all_pass,$all_pass_pct
# 8 allFail $all_fail,$all_fail_pct
# 8 somePassFail $some_pass_fail,$some_pass_fail_pct
# 10 gap $detail_gap_id,$detail_gap_pct
# 11 badCdsSplice $detail_badCdsSplice_id,$detail_badCdsSplice_pct
# 12 badUtrSplice $detail_badUtrSplice_id,$detail_badUtrSplice_pct
# 13 noCDS $detail_noCDS_id,$detail_noCDS_pct
# 14 noStart $detail_noStart_id,$detail_noStart_pct
# 15 noStop $detail_noStop_id,$detail_noStop_pct
# 16 inFrameStop $detail_inFrameStop_id,$detail_inFrameStop_pct
# 17 frameDiscontig $detail_frameDiscontig_id,$detail_frameDiscontig_pct
# 18 frameMismatch $detail_frameMismatch_id,$detail_frameMismatch_pct
# 19 badFrame $detail_badFrame_id,$detail_badFrame_pct
# 20 other $detail_other_id,$detail_other_pct
# write out html format output to <gc-stats>
print STAT "\n\n";
print STAT "<tr>\n";
print STAT "<td>$pred_db.$pred_table</td>\n";
print STAT "<td>$pred_date</td>\n";
print STAT "<td>$pred_server</td>\n";
print STAT "<td>$preds_id_uniq</td>\n";
print STAT "<td>$preds_id_all</td>\n";
printf STAT "<td>$prob_preds_all(%3.1f%)</td>\n", "$prob_preds_all_pct";
printf STAT "<td>$all_pass(%3.1f%)</td>\n", "$all_pass_pct";
printf STAT "<td>$all_fail(%3.1f%)</td>\n", "$all_fail_pct";
printf STAT "<td>$some_pass_fail(%3.1f%)</td>\n", "$some_pass_fail_pct";
printf STAT "<td>$detail_gap_id(%3.1f%)</td>\n", "$detail_gap_pct";
printf STAT "<td>$detail_badCdsSplice_id(%3.1f%)</td>\n", "$detail_badCdsSplice_pct";
printf STAT "<td>$detail_badUtrSplice_id(%3.1f%)</td>\n", "$detail_badUtrSplice_pct";
printf STAT "<td>$detail_noCDS_id(%3.1f%)</td>\n", "$detail_noCDS_pct";
printf STAT "<td>$detail_noStart_id(%3.1f%)</td>\n", "$detail_noStart_pct";
printf STAT "<td>$detail_noStop_id(%3.1f%)</td>\n", "$detail_noStop_pct";
printf STAT "<td>$detail_inFrameStop_id(%3.1f%)</td>\n", "$detail_inFrameStop_pct";
printf STAT "<td>$detail_frameDiscontig_id(%3.1f%)</td>\n", "$detail_frameDiscontig_pct";
printf STAT "<td>$detail_frameMismatch_id(%3.1f%)</td>\n", "$detail_frameMismatch_pct";
printf STAT "<td>$detail_badFrame_id(%3.1f%)</td>\n", "$detail_badFrame_pct";
printf STAT "<td>$detail_other_id(%3.1f%)</td>\n", "$detail_other_pct";
print STAT "</tr>\n";
print STAT "\n\n\nEND OF REPORT\n\n";
print STDERR "PROCESSING: outfile <gc-stats> finalized\n";
print STDERR "PROCESSING: $0 program finished witout errors!\n";