5adcf6bc2904690de7b7b30a83ec8a7a0996abe9 galt Tue Aug 21 00:01:25 2018 -0700 changing cse subdomain to soe diff --git src/utils/qa/gc-stats.pl src/utils/qa/gc-stats.pl index b4fa475..fdb4ab0 100755 --- src/utils/qa/gc-stats.pl +++ src/utils/qa/gc-stats.pl @@ -1,457 +1,457 @@ #!/usr/local/bin/perl5 # Program: gc-stats.pl # Purpose: parses output of gene-check script into summary statistics -# Test Protocal: http://genecats.cse.ucsc.edu/qa/test-protocols/gene-check.protocol.html -# Summary results: http://genecats.cse.ucsc.edu/qa/test-results/gene-check.results.html +# Test Protocal: http://genecats.soe.ucsc.edu/qa/test-protocols/gene-check.protocol.html +# Summary results: http://genecats.soe.ucsc.edu/qa/test-results/gene-check.results.html # Author: Jennifer Jackson # Int Date: 2005-11-14 # Rev Date: 2005-11-XX $documentation = "See test protocol documentation - http://genecats.cse.ucsc.edu/qa/test-protocols/gene-check.protocol.html"; + http://genecats.soe.ucsc.edu/qa/test-protocols/gene-check.protocol.html"; if ($#ARGV != 2) { die "\nUSAGE: $0 <genePreds_table> <gc-details> <gc-summary> Parses output of gene-check script into summary statistics Output <gc-stats> and <gc-report> are generated by program STDERR is sent to screen <genePreds_table> file name format: pred_table.db.server.YYMMDD <gc-details> file name format: pred_table.db.server.YYMMDD.gc-details <gc-summary> file name format: pred_table.db.server.YYMMDD.gc-summary <gc-stats> file name format: pred_table.db.server.YYMMDD.gc-stats <gc-report> file name format: pred_table.db.server.YYMMDD.gc-report \nWhen complete, do the following to save your results ...... 1)Save the <gc-report> file in report directory: /usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.results 2)Add the html contents of <gc-stats> into results file: /usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.results.html 3)Keep working files until QA complete in work directory, then delete: /cluster/store9/qa/gene-check $documentation\n\n";} # State the program has started print STDERR "PROCESSING: $0 program started, verifing input files\n"; # get input file names $in_preds = $ARGV[0]; $in_detail = $ARGV[1]; $in_summ = $ARGV[2]; # attempt to open and die if cannot be found/read open(PREDS,"$in_preds") || die "ERROR: Cannot open file $in_preds\n"; open(DETAIL,"$in_detail") || die "ERROR: Cannot open file $in_detail\n"; open(SUMM,"$in_summ") || die "ERROR: Cannot open file $in_summ\n"; # verify file names of infiles # die if database & server are not the same between all three # warn if dates are different # check genePreds_table input file name format (@name_pred) = split(/\./,$in_preds); if ($#name_pred != 3) { die "ERROR: $in_preds file name not formatted correctly <genePreds_table> file name format: pred_table.db.server.YYMMDD $documentation\n"; } $pred_table = $name_pred[0]; $pred_db = $name_pred[1]; $pred_server = $name_pred[2]; $pred_date = $name_pred[3]; # check gc-details input file name format (@name_detail) = split(/\./,$in_detail); if ($#name_detail != 4) { die "ERROR: $in_detail file name not formatted correctly <gc-details> file name format: pred_table.db.server.YYMMDD.gc-details $documentation\n"; } $detail_table = $name_detail[0]; $detail_file = $name_detail[4]; if ($detail_file ne "gc-details") { die "ERROR: $in_detail file name not formatted correctly or in wrong order USAGE: $0 <genePreds_table> <gc-details> <gc-summary> <gc-details> file name format: pred_table.db.server.YYMMDD.gc-details $documentation\n"; } # check gc-summary input file name format (@name_summ) = split(/\./,$in_summ); if ($#name_summ != 4) { die "ERROR: $in_summ file name not formatted correctly <gc-summary> file name format: pred_table.db.server.YYMMDD.gc-summary $documentation\n"; } $summ_table = $name_summ[0]; $summ_file = $name_summ[4]; if ($summ_file ne "gc-summary") { die "ERROR: $in_summ file name not formatted correctly or in wrong order USAGE: $0 <genePreds_table> <gc-details> <gc-summary> <gc-summary> file name format: pred_table.db.server.YYMMDD.gc-summary $documentation\n"; } # State that files are readable print STDERR "PROCESSING: input files identified and readable\n"; # check that pred_table name is same on all files if (("$pred_table" eq "$detail_table") && ("$detail_table" eq "$summ_table")) { print STDERR "PROCESSING: genePreds_table same between all input files PROCESSING: finished checking format; statistics being generated\n"; } else { die "ERROR: All files must be from same genePreds_table <genePreds_table> file name format: pred_table.db.server.YYMMDD <gc-details> file name format: pred_table.db.server.YYMMDD.gc-details <gc-summary> file name format: pred_table.db.server.YYMMDD.gc-summary $documentation\n"; } # open outfiles for writing results $stat_file = "$pred_table.$pred_db.$pred_server.$pred_date.gc-stats"; $report_file = "$pred_table.$pred_db.$pred_server.$pred_date.gc-report"; open STAT, ">$stat_file"; open REPORT, ">$report_file"; print STAT "$0 gc-stats file for $in_preds\n\n"; print STAT "Paste the html formated results below into file:\n"; print STAT "/usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.results.html\n\n"; print REPORT "$0 gc-report file for $in_preds\n\n"; print REPORT "Save file at /usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.reports\n\n"; # in preds, count up ids & preds per id $preds_id_all = 0; $preds_id_uniq = 0; $preds_id_max = 1; $preds_id_name = "none"; while(<PREDS>) { chomp; $preds_id_all++; (@content_preds) = split(/\t/); if (exists $preds_count_id{$content_preds[0]}) { $preds_count_id{$content_preds[0]}++; if ($preds_count_id{$content_preds[0]} > $preds_id_max) { $preds_id_max = $preds_count_id{$content_preds[0]}; $preds_id_name = $content_preds[0]; } } else { $preds_count_id{$content_preds[0]} = 1; $preds_id_uniq++; } } print REPORT "$in_preds contains gene prediction table from database $preds_id_uniq unique sequences have $preds_id_all gene predictions the sequence $preds_id_name has the most gene predictions: $preds_id_max predictions\n\n"; # in summary, count up ids & uniq preds & errors per uniq preds $summ_id_uniq = 0; $summ_id_max = 1; $summ_id_name = "none"; $summ_longid_all = 0; $summ_longid_causes_max = 1; $summ_longid_causes_name = "none"; while(<SUMM>) { chomp; $summ_longid_all++; # print STDERR "longid $summ_longid_all \n"; unless ($summ_longid_all <= 2) { ($summ_id,$chr,$chrs,$chre,$strand,$stat,$frame,$start,$stop,$orfStop,$cdsGap,$utrGap,$cdsSplice,$utrSplice,$causes) = split(/\t/); $summ_longid = "$summ_id.$chr.$chrs.$chre"; (@array_causes) = split(/\,/,$causes); $summ_causes_longid{$summ_longid} = ($#array_causes + 1); if ($summ_causes_longid{$summ_longid} > $summ_longid_causes_max) { $summ_longid_causes_max = $summ_causes_longid{$summ_longid}; $summ_longid_causes_name = $summ_longid; } if (exists $summ_count_id{$summ_id}) { $summ_count_id{$summ_id}++; if ($summ_count_id{$summ_id} > $summ_id_max) { $summ_id_max = $summ_count_id{$summ_id}; $summ_id_name = $summ_id; } } else { $summ_count_id{$summ_id} = 1; $summ_id_uniq++; } } } $prob_preds_all = ($summ_longid_all - 2); print REPORT "$in_summ contains sequences with 1 or more gene-check exceptions $summ_id_uniq unique sequences have $prob_preds_all predictions with exceptions the sequence $summ_id_name has the most predictions with exceptions: $summ_id_max predictions the prediction $summ_longid_causes_name has the most exception types: $summ_longid_causes_max types\n\n"; # compare total predictions with those that have exceptions by id # %summ_count_id contains the ids with # predictions with exceptions # %preds_count_id contains the ids with # all predictions $all_pass = 0; $all_fail = 0; $some_pass_fail = 0; foreach $common_id (keys %preds_count_id) { if (exists $summ_count_id{$common_id}) { if ($summ_count_id{$common_id} == $preds_count_id{$common_id}) { $all_fail++; } if ($summ_count_id{$common_id} < $preds_count_id{$common_id}) { $some_pass_fail++; } } else { $all_pass++; } } # in details, count up uniq ids and types errors per id # counting variables for total errors of each type and # number of unique IDs associated have each exception in gc-stat # total number of each type of error in file in gc-report $detail_id_uniq = 0; $detail_lines_all = 0; $detail_gap_all = 0; $detail_gap_id = 0; $detail_badCdsSplice_all = 0; $detail_badCdsSplice_id = 0; $detail_badUtrSplice_all = 0; $detail_badUtrSplice_id = 0; $detail_noCDS_all = 0; $detail_noCDS_id = 0; $detail_noStart_all = 0; $detail_noStart_id = 0; $detail_noStop_all = 0; $detail_noStop_id = 0; $detail_inFrameStop_all = 0; $detail_inFrameStop_id = 0; $detail_frameDiscontig_all = 0; $detail_frameDiscontig_id = 0; $detail_frameMismatch_all = 0; $detail_frameMismatch_id = 0; $detail_badFrame_all = 0; $detail_badFrame_id = 0; $detail_other_all = 0; $detail_other_id = 0; # parse file and run through error type fields while(<DETAIL>) { chomp; $detail_lines_all++; unless ($detail_lines_all < 2) { ($detail_id,$type,$info,$exception,$coords) = split(/\t/); } unless (exists $detail_id_uniq_count{$detail_id}) { $detail_id_uniq_count{$detail_id} = "1"; $detail_id_uniq++; } if ($type eq "gap") { $detail_gap_all++; unless (exists $detail_gap_uniq_count{$detail_id}) { $detail_gap_uniq_count{$detail_id} = "1"; $detail_gap_id++; } } elsif ($type eq "badCdsSplice") { $detail_badCdsSplice_all++; unless (exists $detail_badCdsSplice_uniq_count{$detail_id}) { $detail_badCdsSplice_uniq_count{$detail_id} = "1"; $detail_badCdsSplice_id++; } } elsif ($type eq "badUtrSplice") { $detail_badUtrSplice_all++; unless (exists $detail_badUtrSplice_uniq_count{$detail_id}) { $detail_badUtrSplice_uniq_count{$detail_id} = "1"; $detail_badUtrSplice_id++; } } elsif ($type eq "noCDS") { $detail_noCDS_all++; unless (exists $detail_noCDS_uniq_count{$detail_id}) { $detail_noCDS_uniq_count{$detail_id} = "1"; $detail_noCDS_id++; } } elsif ($type eq "noStart") { $detail_noStart_all++; unless (exists $detail_noStart_uniq_count{$detail_id}) { $detail_noStart_uniq_count{$detail_id} = "1"; $detail_noStart_id++; } } elsif ($type eq "noStop") { $detail_noStop_all++; unless (exists $detail_noStop_uniq_count{$detail_id}) { $detail_noStop_uniq_count{$detail_id} = "1"; $detail_noStop_id++; } } elsif ($type eq "inFrameStop") { $detail_inFrameStop_all++; unless (exists $detail_inFrameStop_uniq_count{$detail_id}) { $detail_inFrameStop_uniq_count{$detail_id} = "1"; $detail_inFrameStop_id++; } } elsif ($type eq "frameDiscontig") { $detail_frameDiscontig_all++; unless (exists $detail_frameDiscontig_uniq_count{$detail_id}) { $detail_frameDiscontig_uniq_count{$detail_id} = "1"; $detail_frameDiscontig_id++; } } elsif ($type eq "frameMismatch") { $detail_frameMismatch_all++; unless (exists $detail_frameMismatch_uniq_count{$detail_id}) { $detail_frameMismatch_uniq_count{$detail_id} = "1"; $detail_frameMismatch_id++; } } elsif ($type eq "badFrame") { $detail_badFrame_all++; unless (exists $detail_badFrame_uniq_count{$detail_id}) { $detail_badFrame_uniq_count{$detail_id} = "1"; $detail_badFrame_id++; } } else { $detail_other_all++; unless (exists $detail_other_uniq_count{$detail_id}) { $detail_other_uniq_count{$detail_id} = "1"; $detail_other_id++; } } } # summarize seqs w/ exceptions into percentages of total seqs w/ preds $detail_errors_all = ($detail_lines_all - 2); $detail_gap_pct = (($detail_gap_id/$preds_id_uniq) * 100); $detail_badCdsSplice_pct = (($detail_badCdsSplice_id/$preds_id_uniq) * 100); $detail_badUtrSplice_pct = (($detail_badUtrSplice_id/$preds_id_uniq) * 100); $detail_noCDS_pct = (($detail_noCDS_id/$preds_id_uniq) * 100); $detail_noStart_pct = (($detail_noStart_id/$preds_id_uniq) * 100); $detail_noStop_pct = (($detail_noStop_id/$preds_id_uniq) * 100); $detail_inFrameStop_pct = (($detail_inFrameStop_id/$preds_id_uniq) * 100); $detail_frameDiscontig_pct = (($detail_frameDiscontig_id/$preds_id_uniq) * 100); $detail_frameMismatch_pct = (($detail_frameMismatch_id/$preds_id_uniq) * 100); $detail_badFrame_pct = (($detail_badFrame_id/$preds_id_uniq) * 100); $detail_other_pct = (($detail_other_id/$preds_id_uniq) * 100); # summarize preds w/ exceptions into percentage of total preds $prob_preds_all_pct = (($prob_preds_all/$preds_id_all) * 100); # summarize seqs w/ % w/o exceptions into percentage of total unique seqs w/ preds $all_pass_pct = (($all_pass/$preds_id_uniq) * 100); $all_fail_pct = (($all_fail/$preds_id_uniq) * 100); $some_pass_fail_pct = (($some_pass_fail/$preds_id_uniq) * 100); print REPORT "$in_detail contains details for exceptions, including coordinates one exception per line; sequences can appear on multiple lines use exception key or sequence name to grep details from file $detail_id_uniq unique sequences with prediction exceptions $detail_lines_all total gene prediction exceptions\n Gene-check exception summary statistics based on <gc-details> file content exception key: <total> <unique by sequence> <pct of unique sequences in genePreds file> ----------------------------------------------------------------------\n"; printf REPORT "gap:\t\t$detail_gap_all\t$detail_gap_id\t%3.1f %\n", "$detail_gap_pct"; printf REPORT "badCdsSplice:\t$detail_badCdsSplice_all\t$detail_badCdsSplice_id\t%3.1f %\n", "$detail_badCdsSplice_pct"; printf REPORT "badUtrSplice:\t$detail_badUtrSplice_all\t$detail_badUtrSplice_id\t%3.1f %\n", "$detail_badUtrSplice_pct"; printf REPORT "noCDS:\t\t$detail_noCDS_all\t$detail_noCDS_id\t%3.1f %\n", "$detail_noCDS_pct"; printf REPORT "noStart:\t$detail_noStart_all\t$detail_noStart_id\t%3.1f %\n", "$detail_noStart_pct"; printf REPORT "noStop:\t\t$detail_noStop_all\t$detail_noStop_id\t%3.1f %\n", "$detail_noStop_pct"; printf REPORT "inFrameStop:\t$detail_inFrameStop_all\t$detail_inFrameStop_id\t%3.1f %\n", "$detail_inFrameStop_pct"; printf REPORT "frameDiscontig:\t$detail_frameDiscontig_all\t$detail_frameDiscontig_id\t%3.1f %\n", "$detail_frameDiscontig_pct"; printf REPORT "frameMismatch:\t$detail_frameMismatch_all\t$detail_frameMismatch_id\t%3.1f %\n", "$detail_frameMismatch_pct"; printf REPORT "badFrame:\t$detail_badFrame_all\t$detail_badFrame_id\t%3.1f %\n", "$detail_badFrame_pct"; printf REPORT "other:\t\t$detail_other_all\t$detail_other_id\t%3.1f %\n", "$detail_other_pct"; print REPORT "\n\nGlobal summary statistics for $pred_table gene prediction track -------------------------------------------------------------------\n"; printf REPORT "Pct predictions with one or more gene-check exceptions: %3.1f %\n", "$prob_preds_all_pct"; printf REPORT "Pct unique sequences with all predictions having no exceptions: %3.1f %\n", "$all_pass_pct"; printf REPORT "Pct unique sequences with all predictions having 1 or more exceptions: %3.1f %\n", "$all_fail_pct"; printf REPORT "Pct unique sequences with mixed predictions (some have exceptions, some do not): %3.1f %\n", "$some_pass_fail_pct"; print REPORT "\n\n\nEND OF REPORT\n\n"; print STDERR "PROCESSING: outfile <gc-report> finalized\n"; # <gc-stats> output file variables # 1 db.genePred $pred_db.$pred_table # 2 date $pred_date # 3 server $pred_server # 4 uniqSeqs $preds_id_uniq # 5 allPreds $preds_id_all # 6 probPreds $prob_preds_all,$prob_preds_all_pct # 7 allPass $all_pass,$all_pass_pct # 8 allFail $all_fail,$all_fail_pct # 8 somePassFail $some_pass_fail,$some_pass_fail_pct # 10 gap $detail_gap_id,$detail_gap_pct # 11 badCdsSplice $detail_badCdsSplice_id,$detail_badCdsSplice_pct # 12 badUtrSplice $detail_badUtrSplice_id,$detail_badUtrSplice_pct # 13 noCDS $detail_noCDS_id,$detail_noCDS_pct # 14 noStart $detail_noStart_id,$detail_noStart_pct # 15 noStop $detail_noStop_id,$detail_noStop_pct # 16 inFrameStop $detail_inFrameStop_id,$detail_inFrameStop_pct # 17 frameDiscontig $detail_frameDiscontig_id,$detail_frameDiscontig_pct # 18 frameMismatch $detail_frameMismatch_id,$detail_frameMismatch_pct # 19 badFrame $detail_badFrame_id,$detail_badFrame_pct # 20 other $detail_other_id,$detail_other_pct # write out html format output to <gc-stats> print STAT "\n\n"; print STAT "<tr>\n"; print STAT "<td>$pred_db.$pred_table</td>\n"; print STAT "<td>$pred_date</td>\n"; print STAT "<td>$pred_server</td>\n"; print STAT "<td>$preds_id_uniq</td>\n"; print STAT "<td>$preds_id_all</td>\n"; printf STAT "<td>$prob_preds_all(%3.1f%)</td>\n", "$prob_preds_all_pct"; printf STAT "<td>$all_pass(%3.1f%)</td>\n", "$all_pass_pct"; printf STAT "<td>$all_fail(%3.1f%)</td>\n", "$all_fail_pct"; printf STAT "<td>$some_pass_fail(%3.1f%)</td>\n", "$some_pass_fail_pct"; printf STAT "<td>$detail_gap_id(%3.1f%)</td>\n", "$detail_gap_pct"; printf STAT "<td>$detail_badCdsSplice_id(%3.1f%)</td>\n", "$detail_badCdsSplice_pct"; printf STAT "<td>$detail_badUtrSplice_id(%3.1f%)</td>\n", "$detail_badUtrSplice_pct"; printf STAT "<td>$detail_noCDS_id(%3.1f%)</td>\n", "$detail_noCDS_pct"; printf STAT "<td>$detail_noStart_id(%3.1f%)</td>\n", "$detail_noStart_pct"; printf STAT "<td>$detail_noStop_id(%3.1f%)</td>\n", "$detail_noStop_pct"; printf STAT "<td>$detail_inFrameStop_id(%3.1f%)</td>\n", "$detail_inFrameStop_pct"; printf STAT "<td>$detail_frameDiscontig_id(%3.1f%)</td>\n", "$detail_frameDiscontig_pct"; printf STAT "<td>$detail_frameMismatch_id(%3.1f%)</td>\n", "$detail_frameMismatch_pct"; printf STAT "<td>$detail_badFrame_id(%3.1f%)</td>\n", "$detail_badFrame_pct"; printf STAT "<td>$detail_other_id(%3.1f%)</td>\n", "$detail_other_pct"; print STAT "</tr>\n"; print STAT "\n\n\nEND OF REPORT\n\n"; print STDERR "PROCESSING: outfile <gc-stats> finalized\n"; print STDERR "PROCESSING: $0 program finished witout errors!\n";