src/utils/qa/knownGene.csh 1.16
1.16 2010/03/14 20:34:00 pauline
Fixed some typos, an unmatched quote and clarified description statement.
Index: src/utils/qa/knownGene.csh
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/qa/knownGene.csh,v
retrieving revision 1.15
retrieving revision 1.16
diff -b -B -U 1000000 -r1.15 -r1.16
--- src/utils/qa/knownGene.csh 14 Oct 2009 17:20:03 -0000 1.15
+++ src/utils/qa/knownGene.csh 14 Mar 2010 20:34:00 -0000 1.16
@@ -1,896 +1,898 @@
#!/bin/tcsh
source `which qaConfig.csh`
# to do: check that trackDb.hgGene is "on"
# to do: see that cgapBiocDesc has unique entries
# --------------------------------------------
-# run KGGeneCheck.java on dev:
+# run HGGeneCheck.java on dev:
#### echo
#### echo " --------------------------------------------"
####
#### echo "server hgwdev.cse.ucsc.edu /
#### machine hgwdev.cse.ucsc.edu /
#### quick false /
#### dbSpec $db /
#### table all /
#### zoomCount 4" > $db.props
###
-#### echo "run KGGeneCheck: \
-#### nohup nice HGGeneCheck $db.props > & $db.KGrobot.out & "
+#### echo "run HGGeneCheck: \
+#### nohup nice HGGeneCheck $db.props > & $db.HGrobot.out & "
#### echo
-echo " --------------------------------------------"
+#echo " --------------------------------------------"
###############################################
#
# 03-09-04
# Robert Kuhn
#
-# Runs through the checks on Known Genes.
-# Requires KG java run in Trackchecking first.
-# -----------
-# run HGGeneCheck.java on dev
-# /cluster/home/heather/TrackChecking/KGGeneCheck.java
+# This script runs through a number of checks on the Known Genes tables.
+# You need to run HGGeneCheck first. Also you need to have 3 files
+# (kgTables, gsTables, and pbTables) that list the tables involved one
+# directory above the directory where you choose to run knownGene.csh.
+#
+# run HGGeneCheck.java on dev:
+# nohup nice HGGeneCheck $db.props > & $db.HGrobot.out &
#
###############################################
set db=""
set oldDb=""
set trackName="kg"
set table="knownGene"
if ($#argv == 0 || $#argv > 2) then
# no command line args
echo
echo " runs test suite on Known Genes track."
echo " suggest KG java run in Trackchecking first."
echo
echo " usage: database [oldDb] (if fresh assembly for existing organism)"
echo
exit
else
set db=$argv[1]
endif
if ($#argv == 2) then
set oldDb=$argv[2]
else
set oldDb=$db
endif
echo "using $db "
echo "using $oldDb for beta db "
set curr_dir=$cwd
# get sets of KG, GS, PB tables from master list:
findKgTableSet.csh $db
# count items in list
echo "found the following KG tables:"
wc -l $db.{kg,gs,pb}Tables
cat $db.{kg,gs,pb}Tables | sort -u > $db.allTables
echo "will have some overlap. uniq tables = `cat $db.allTables | sort -u | wc -l`"
# find new tables: in pushQ but NOT in a kgTables list
echo
echo "find new tables: in pushQ but NOT in a kgTables list"
hgsql -h mysqlbeta -Ne "SELECT tbls FROM pushQ WHERE tbls LIKE '%knownGene%' \
AND dbs = '"$db"'" qapushq | sed "s/\\n/\n/g" | egrep -r . > $db.pushqKgList
dos2unix $db.pushqKgList >& /dev/null
commTrio.csh $db.allTables $db.pushqKgList
if ( `wc -l $db.allTables.Only | awk '{print $1}'` != 0 ) then
echo "these tables may be in separate tracks? ($db.allTables.Only) :"
cat $db.allTables.Only
endif
echo
# --------------------------------------------
echo
echo "compare new genes to old set (uniq):"
echo "not useful for first releasse of KG3-type track"
if ($db != $oldDb) then
echo " comparing to table in $oldDb on beta"
echo
endif
hgsql -N -e "SELECT name from knownGene" $db | sort | uniq > $db.KG.name.dev
hgsql -N -h $sqlbeta -e "SELECT name from knownGene" $oldDb | sort |uniq > $oldDb.KG.name.beta.uniq
comm -23 $db.KG.name.dev $oldDb.KG.name.beta.uniq > $db.KG.name.devOnly
comm -13 $db.KG.name.dev $oldDb.KG.name.beta.uniq > $oldDb.KG.name.betaOnly
comm -12 $db.KG.name.dev $oldDb.KG.name.beta.uniq > $db.KG.name.common
wc -l $db.KG.name.devOnly $oldDb.KG.name.betaOnly $db.KG.name.common | grep -v total
echo
echo "check some new ones by hand:"
head -20 $db.KG.name.devOnly | tail -3
echo
echo "check some old ones by hand on beta:"
head -20 $oldDb.KG.name.betaOnly | tail -3
echo
echo "strip version number and compare again:"
awk -F. '{print $1}' $db.KG.name.devOnly > $db.KG.name.devOnly.noVersion
awk -F. '{print $1}' $oldDb.KG.name.betaOnly > $oldDb.KG.name.betaOnly.noVersion
commTrio.csh $db.KG.name.devOnly.noVersion $oldDb.KG.name.betaOnly.noVersion
# --------------------------------------------
# check alignID for unique:
echo
echo "checking alignID for unique:"
hgsql -N -e "SELECT alignID FROM knownGene" $db | sort > $db.KG.alignID.sort
uniq $db.KG.alignID.sort > $db.KG.alignID.uniq
comm -12 $db.KG.alignID.sort $db.KG.alignID.uniq > $db.KG.alignID.common
echo "expect all lines to be in common file: all alignIDs are unique"
echo "if any are in sort file more than once:"
comm -23 $db.KG.alignID.sort $db.KG.alignID.uniq > $db.KG.alignID.dupes
wc -l $db*alignID* | grep -v "total"
echo "three duplicate alignIDs (may be blank):"
head -3 $db.KG.alignID.dupes
echo
# --------------------------------------------
# check strand:
echo "-------------------------------------------------"
echo
echo "counting strands:"
set totKG=`hgsql -N -e 'SELECT COUNT(*) FROM knownGene' $db`
set tot=`hgsql -N -e "SELECT COUNT(*) FROM knownGene" $db`
set plus=`hgsql -N -e 'SELECT COUNT(*) FROM knownGene WHERE strand = "+"' $db`
set minus=`hgsql -N -e 'SELECT COUNT(*) FROM knownGene WHERE strand = "-"' $db`
set null=`hgsql -N -e 'SELECT COUNT(*) FROM knownGene WHERE strand = ""' $db`
set total = `expr $null + $minus + $plus`
echo "null = $null"
echo "plus = $plus"
echo "minus = $minus"
echo "total = $total"
echo "rows = $totKG"
echo
# -------------------------------------------------
# look for start and end order:
echo "-------------------------------------------------"
echo
echo "look for start and end order:"
rm -f txFile
set var1=`hgsql -N -e "SELECT COUNT(*) FROM knownGene WHERE txStart >= txEnd" $db`
set var2=`hgsql -N -e "SELECT COUNT(*) FROM knownGene WHERE txStart > cdsStart" $db`
set var3=`hgsql -N -e "SELECT COUNT(*) FROM knownGene WHERE cdsEnd > txEnd" $db`
set var4=`hgsql -N -e "SELECT COUNT(*) FROM knownGene WHERE txStart = cdsStart" $db`
set var5=`hgsql -N -e "SELECT COUNT(*) FROM knownGene WHERE cdsEnd = txEnd" $db`
set var6=`hgsql -N -e "SELECT COUNT(*) FROM knownGene WHERE txStart < 0" $db`
set var7=`hgsql -N -e "SELECT COUNT(*) FROM knownGene WHERE cdsStart < 0" $db`
echo
echo "txStart >= txEnd: "$var1
echo "txStart > cdsStart: "$var2
echo "cdsEnd > txEnd: "$var3
echo "txStart = cdsStart: "$var4" (means no 5'-UTR if + strand)"
echo "cdsEnd = txEnd: "$var5" (means no 3'-UTR if + strand)"
echo "negative txStart: "$var6
echo "negative cdsStart: "$var7
echo
echo "-------------------------------------------------"
echo
echo "check if browser show no 5', 3'-UTR where predicted:"
echo "no 5'-UTR:"
hgsql -e "SELECT name, chrom, txStart, txEnd FROM knownGene \
WHERE txStart = cdsStart LIMIT 3" $db
echo
echo "no 3'-UTR:"
hgsql -e "SELECT name, chrom, txStart, txEnd FROM knownGene \
WHERE txEnd = cdsEnd LIMIT 3" $db
# -------------------------------------------------
# check ends:
echo "-------------------------------------------------"
# looking for off the end of each chrom (look for all positive values)
# looking for off-end of each chrom (look for all positive values)
echo
echo "looking for off the end of each chrom \
(look for all positive values):"
checkOffend.csh $db $table
echo
# -------------------------------------------------
# check to see if there are genes on all chroms.
echo "-------------------------------------------------"
echo " check to see if there are genes on all chroms:"
echo
hgsql -N -e "SELECT ci.chrom, COUNT(*) AS genes \
FROM chromInfo ci, knownGene kg WHERE ci.chrom=kg.chrom \
AND ci.chrom NOT LIKE '%random' \
GROUP BY ci.chrom ORDER BY chrom" $db
hgsql -N -e "SELECT ci.chrom, COUNT(*) AS genes \
FROM chromInfo ci, knownGene kg WHERE ci.chrom=kg.chrom \
AND ci.chrom LIKE '%random' \
GROUP BY ci.chrom ORDER BY chrom" $db
echo
echo "-------------------------------------------------"
echo " check coverage: "
echo
checkCoverage.csh $db $table
echo
runBits.csh $db knownGene
echo
runJoiner.csh $db knownGene
# -------------------------------------------------
# check exon sizes:
#
#
# # edit the list /cluster/home/heather/jdbc/data/tablelist
# echo
# echo "knownGene" > /cluster/home/heather/jdbc/data/KG
#
# # and run /cluster/home/heather/jdbc/ExonSize.java:
# cd /cluster/home/heather/jdbc/
# source javaenv
# java ExonSize $db data/KG | egrep -v "items found|-----|Querying" > $curr_dir/$db.exonExtremes
# cd $curr_dir
#
# echo
# echo "number of very large or negative exons:"
# grep "exon" $db.exonExtremes | wc -l
# # echo "five lines are kruft (filtered above now with grep -v"
# echo
# echo "example of very large or negative exons:"
# head -5 $db.exonExtremes
# echo
# -------------------------------------------------
# count exons
echo
echo "count exons"
echo "genes with the most exons:"
hgsql -t -e "SELECT name, exonCount FROM knownGene ORDER BY exonCount DESC limit 5" $db > $db.manyExons
cat $db.manyExons
echo
# -------------------------------------------------
# look for unique names
echo
echo "look for unique names:"
hgsql -N -e "SELECT name FROM knownGene" $db | sort \
> $db.KG.name.sort
uniq $db.KG.name.sort > $db.KG.name.uniq
set kgSort=`wc -l $db.KG.name.sort`
set kgUniq=`wc -l $db.KG.name.uniq`
set diff=`comm -23 $db.KG.name.sort $db.KG.name.uniq \
| wc -l`
echo "sort: "$kgSort
echo "uniq: "$kgUniq
echo "diff: "$diff
echo
# ------------------------------------
# todo: get list of genes and their geneSymbol and count up how
# many have the same name. look at outliers. what follows here doesn't
# work anymore as KG.name is now a unique field.
# todo: analyze knownCanonical clusters
# # delete lines with single hit " 1[tab]"
# # -- and count lines remaining (genes used more than once):
#
# echo
# sort $db.KG.name.sort | uniq -c | sort -nr > $db.KG.name.uniq.most
#
# echo "number of genes that align more than once:"
# sed -e "/ 1 /d" $db.KG.name.uniq.most | wc -l
# echo
#
# echo "gene occurance profile"
# echo
# textHistogram -col=1 $db.KG.name.uniq.most
# rm file
#
# echo
# echo "genes that align the most times:"
# head $db.KG.name.uniq.most
# echo
# -------------------------------------------------
# examine overlapping genes ???
# .../jdbc/Overlap.java
# how ??? There are many
# -------------------------------------------------
# look for same number of rows in kgXref, knownGenePep, knownGeneMrna
echo
echo "-------------------------------------------------"
echo "look for same number of rows in kgXref, knownGenePep, knownGeneMrna:"
echo "kgXref"
set xrefNum=`hgsql -N -e "SELECT COUNT(*) FROM kgXref" $db`
set xrefNumUniq=`hgsql -N -e "SELECT COUNT(DISTINCT(kgID)) FROM kgXref" $db`
echo "$xrefNum $xrefNumUniq (uniq)"
echo "knownGenePep"
set kgPep=`hgsql -N -e "SELECT COUNT(*) FROM knownGenePep" $db`
echo $kgPep
echo "knownGeneMrna"
set kgMrna=`hgsql -N -e "SELECT COUNT(*) FROM knownGeneMrna" $db`
echo $kgMrna
echo "should match uniqs in KG, except for fewer Peps, now that we include non-coding:"
wc -l $db.KG.name.uniq
echo
echo "dupes in kgXref.kgID field:"
if ($xrefNum != $kgPep) then
hgsql -e "SELECT kgID, COUNT(*) as number FROM kgXref GROUP BY kgID \
ORDER BY number DESC LIMIT 5 " $db
endif
echo "should be no dupes anymore"
# -------------------------------------------------
# check that names in knownGene match names in knownGenePep, knownGeneMrna
# check that names in knownGene match kgID in kgXref
# check that names in knownGene match uniq kgID in kgProtAlias
echo
hgsql -N -e "SELECT name FROM knownGene" $db | sort | uniq > $db.KG.name.uniq
hgsql -N -e "SELECT name FROM knownGenePep" $db | sort | uniq > $db.KGpep.name.uniq
hgsql -N -e "SELECT name FROM knownGeneMrna" $db | sort | uniq > $db.KGrna.name.uniq
hgsql -N -e "SELECT kgID FROM kgXref" $db | sort | uniq > $db.kgXref.kgID.uniq
hgsql -N -e "SELECT kgID FROM kgProtAlias" $db | sort | uniq > $db.kgProtAlias.kgID.uniq
echo "-------------------------------------------------"
echo "check that names in knownGene match names in knownGenePep, knownGeneMrna:"
diff $db.KG.name.uniq $db.KGpep.name.uniq | wc -l
diff $db.KG.name.uniq $db.KGrna.name.uniq | wc -l
echo "expect zero. upper number is number of genes without pep"
echo
echo "-------------------------------------------------"
echo "check that names in knownGene match kgID in kgXref and kgProtAlias:"
# diff $db.KG.name.uniq $db.kgXref.kgID.uniq | wc -l
# diff $db.KG.name.uniq $db.kgProtAlias.kgID.uniq | wc -l
echo "uniq to KG.name (not in kgXref.kgID):"
comm -23 $db.KG.name.uniq $db.kgXref.kgID.uniq | wc -l
echo "uniq to kgXref.kgID: (not in KG.name):"
comm -13 $db.KG.name.uniq $db.kgXref.kgID.uniq | wc -l
echo "uniq to KG.name (not in kgProtAlias.kgID):"
comm -23 $db.KG.name.uniq $db.kgProtAlias.kgID.uniq | wc -l
echo "uniq to kgProtAlias.kgID: (not in KG.name):"
comm -13 $db.KG.name.uniq $db.kgProtAlias.kgID.uniq | wc -l
echo "expect zero"
echo "as of KGII, duplicate kgXref.kgID are ok. will be corrected in KGIII."
echo
echo "number of non-coding genes in kgTxInfo:"
hgsql -e ' SELECT category, COUNT(*) AS number FROM kgTxInfo GROUP BY category \
ORDER BY number DESC' $db
echo "total non-coding:"
hgsql -e 'SELECT COUNT(*) FROM kgTxInfo WHERE category NOT LIKE "coding"' $db
# -------------------------------------------------
# knownGenePep, knownGeneMrna -- check numbers of entries vs old version
# knownGenePep, knownGeneMrna -- check min and max lengths of seq column
echo
echo "-------------------------------------------------"
echo "knownGenePep, knownGeneMrna -- check numbers of entries vs old version:"
if ($db != $oldDb) then
echo " comparing to table in $oldDb on beta"
echo
endif
echo "dev first"
echo
echo "Pep"
hgsql -N -e "SELECT COUNT(*) FROM knownGenePep" $db
hgsql -h $sqlbeta -N -e "SELECT COUNT(*) FROM knownGenePep" $oldDb
echo
echo "Mrna"
hgsql -N -e "SELECT COUNT(*) FROM knownGeneMrna" $db
hgsql -h $sqlbeta -N -e "SELECT COUNT(*) FROM knownGeneMrna" $oldDb
echo "should be some new ones"
echo
echo "knownGenePep, knownGeneMrna -- check min and max lengths of seq column:"
echo "dev, format: {MIN, MAX}Mrna, {MIN, MAX}Pep:"
echo "----"
hgsql -N -e "SELECT MIN(LENGTH(seq)) FROM knownGeneMrna" $db
hgsql -N -e "SELECT MAX(LENGTH(seq)) FROM knownGeneMrna" $db
hgsql -N -e "SELECT MIN(LENGTH(seq)) FROM knownGenePep" $db
hgsql -N -e "SELECT MAX(LENGTH(seq)) FROM knownGenePep" $db
echo
echo "beta, format: {MIN, MAX}Mrna, {MIN, MAX}Pep:"
echo "----"
hgsql -h $sqlbeta -N -e "SELECT MIN(LENGTH(seq)) FROM knownGeneMrna" $oldDb
hgsql -h $sqlbeta -N -e "SELECT MAX(LENGTH(seq)) FROM knownGeneMrna" $oldDb
hgsql -h $sqlbeta -N -e "SELECT MIN(LENGTH(seq)) FROM knownGenePep" $oldDb
hgsql -h $sqlbeta -N -e "SELECT MAX(LENGTH(seq)) FROM knownGenePep" $oldDb
echo "sometimes the same on dev as beta: no new proteins at extremes "
echo
echo "-------------------------------------------------"
# kgXref -- mrna generally equals kgID
echo "kgXref -- mrna generally equals kgID"
echo "count:"
hgsql -N -e "SELECT COUNT(*) FROM kgXref" $db
echo "count mrna != kgID:"
hgsql -N -e "SELECT COUNT(*) FROM kgXref WHERE mrna != kgID" $db
echo "expect zero"
echo
echo "-------------------------------------------------"
# kgXref -- empty fields
# should give zero (no empty fields):
echo "refseq column can be empty (protAcc is, too, then)"
hgsql -N -e 'SELECT COUNT(*) FROM kgXref WHERE refseq = "" ' $db
hgsql -N -e 'SELECT COUNT(*) FROM kgXref WHERE refseq = "" AND protAcc = "" ' $db
hgsql -N -e 'SELECT COUNT(*) FROM kgXref WHERE refseq = "" AND protAcc != "" ' $db
echo "expect last value to be zero"
echo
# check all tables for null + zero entries:
echo
echo "-------------------------------------------------"
echo "check all tables for null + zero entries:"
cat kgTables gsTables pbTables | sort -u > kgTablesAll
foreach table (`cat kgTablesAll`)
hgsql -N -e "DESC $table" $db | gawk '{print $1}' > ${table}Cols
echo $table
echo "============="
set totRows=`hgsql -N -e 'SELECT COUNT(*) FROM '$table'' $db`
echo "total rows = "$totRows
rm -f $table.nullzero.totals
echo "null + zero entries:"
foreach col (`cat ${table}Cols`)
set cnt=`hgsql -N -e 'SELECT COUNT(*) FROM '$table' WHERE '$col='""' $db`
echo "$col\t$cnt"
end
echo
end
echo
# check all tables for null entries:
echo
foreach table (`cat kgTablesAll`)
hgsql -N -e "DESC $table" $db | gawk '{print $1}' > ${table}Cols
echo $table
echo "============="
set totRows=`hgsql -N -e 'SELECT COUNT(*) FROM '$table'' $db`
echo "total rows = "$totRows
rm -f $table.null.totals
echo "null entries:"
foreach col (`cat ${table}Cols`)
set cnt=`hgsql -N -e "SELECT COUNT(*) FROM $table WHERE $col IS NULL" $db`
echo "$col\t$cnt"
end
echo
end
echo
echo
echo "-------------------------------------------------"
echo "check all tables for null + zero entries:"
echo "some, but not all of protAcc column will be empty:"
echo "kgXref.protAcc empty:"
hgsql -N -e 'SELECT COUNT(*) FROM kgXref WHERE protAcc = "" ' $db
echo "kgXref.protAcc not empty:"
hgsql -N -e 'SELECT COUNT(*) FROM kgXref WHERE protAcc != "" ' $db
echo
# -------------------------------------------------
# kgXref -- can have duplicate values in spID
echo "-------------------------------------------------"
echo "kgXref -- can have duplicate values in spID"
set tot=`hgsql -N -e "SELECT COUNT(*) FROM kgXref" $db`
hgsql -N -e "SELECT spID FROM kgXref" $db | sort | uniq > $db.kgXref.spID.uniq
set uniq=`hgsql -N -e "SELECT spID FROM kgXref" $db | sort | uniq | wc -l`
set diff = `expr $tot - $uniq`
echo "tot = "$tot
echo "uniq = "$uniq
echo "dupes = "$diff
echo
# -------------------------------------------------
# kgXref -- can position-search on spID:
# make a java robot
echo "-------------------------------------------------"
echo "kgXref -- can position-search on spID:"
echo "check manually"
hgsql -N -e "SELECT spID FROM kgXref ORDER BY spID DESC limit 2" $db
echo
# -------------------------------------------------
# kgAlias -- check kgAlias.alias for uniq. search position box with some
echo "-------------------------------------------------"
echo "kgAlias -- check kgAlias.alias for uniq. search position box with some."
echo
echo "kgAlias count:"
hgsql -N -e "SELECT COUNT(*) FROM kgAlias" $db
echo "uniq:"
hgsql -N -e "SELECT alias FROM kgAlias" $db | sort | uniq > $db.kgAlias.alias.uniq
wc -l $db.kgAlias.alias.uniq
echo "expect more uniq than KG names, but many dupes"
echo
echo "get some records"
hgsql -e "SELECT alias, kgID FROM kgAlias ORDER BY alias limit 5" $db
echo
# -------------------------------------------------
# kgAlias -- uniq IDs from KG can have one or more matches
echo "-------------------------------------------------"
echo "kgAlias -- uniq IDs from KG can have one or more matches"
hgsql -N -e "SELECT name FROM knownGene" $db | sort | uniq > $db.KG.name.uniq
wc -l $db.KG.name.uniq
hgsql -N -e "SELECT kgID FROM kgAlias" $db | sort | uniq > $db.kgAlias.kgID.uniq
comm -23 $db.KG.name.uniq $db.kgAlias.kgID.uniq | wc -l
comm -13 $db.KG.name.uniq $db.kgAlias.kgID.uniq | wc -l
comm -12 $db.KG.name.uniq $db.kgAlias.kgID.uniq | wc -l
echo "expect 0, 0, all"
echo
# -------------------------------------------------
# kgAlias -- kgAlias uniq count should be less than KG.uniq
echo
echo "-------------------------------------------------"
echo "kgAlias -- kgAlias uniq count should be less than or equal to KG.uniq"
hgsql -N -e "SELECT kgID FROM kgAlias" $db | sort | uniq | wc -l
hgsql -N -e "SELECT name FROM knownGene" $db | sort | uniq | wc -l
hgsql -N -e "SELECT kgID FROM kgAlias" $db | sort | uniq > $db.kgAlias.kgID.uniq
diff $db.KG.name.uniq $db.kgAlias.kgID.uniq | wc -l
echo "expect zero"
echo
# -------------------------------------------------
# kgAlias -- expect multiple rows with same kgID value
echo
echo "-------------------------------------------------"
echo "kgAlias -- expect multiple rows with same kgID value"
set tot=`hgsql -N -e "SELECT COUNT(*) FROM kgAlias" $db`
set uniq=`hgsql -N -e "SELECT kgID FROM kgAlias" $db | sort | uniq | wc -l`
set diff = `expr $tot - $uniq`
echo "tot = "$tot
echo "uniq = "$uniq
echo "dupes = "$diff
echo
# -------------------------------------------------
# cgapAlias -- check that at least some cgapAlias.alias
# entries are in knownGene.name
echo
echo "-------------------------------------------------"
echo "cgapAlias -- check that at least some cgapAlias.alias \
entries are in knownGene.name"
hgsql -N -e "SELECT name FROM knownGene" $db | sort | uniq > $db.KG.name.uniq
hgsql -N -e "SELECT alias FROM cgapAlias" $db | sort | uniq > $db.cgapAlias.alias.uniq
wc -l $db.cgapAlias.alias.uniq | grep -v total
comm -23 $db.KG.name.uniq $db.cgapAlias.alias.uniq > $db.KG.notin.cgap
comm -12 $db.KG.name.uniq $db.cgapAlias.alias.uniq > $db.KG.in.cgap
echo
wc -l $db*KG*cgap*
echo "expect total be number of KG"
echo
# -------------------------------------------------
# keggPathway, keggMapDesc -- validate
# automate this w/ httpunit.
echo
echo "-------------------------------------------------"
echo "keggPathway, keggMapDesc -- validate"
echo
hgsql -e "SELECT * FROM keggPathway limit 5" $db > $db.kegg
cat $db.kegg
echo
# get the mapID for these 5:
echo "check some by hand, using kgID -- should have kegg section \
at bottom of KG page"
foreach mapID (`cat $db.kegg | gawk '{print $3}'`)
hgsql -N -e 'SELECT * FROM keggMapDesc WHERE mapID = "'$mapID'"' $db
end
echo
# hgsql -e "SELECT * FROM keggMapDesc limit 5" $db
echo
# ooooooooooooooo add logic for the following:
# -------------------------------------------------
# check KG sort
# confirm the new table has the same rows as the old
# (sorted but not uniqed)
echo
echo "-------------------------------------------------"
echo "checking for KG position sorting:"
hgsql -N -e "SELECT chrom FROM chromInfo" $db > $db.chroms
foreach chrom (`cat $db.chroms`)
hgsql -N -e 'SELECT txStart FROM knownGene WHERE chrom = "$chrom"' $db > $db.$chrom
sort $db.$chrom > $db.$chrom.sort
set sortDiff=`diff $db.$chrom $db.$chrom.sort | wc -l`
if ($sortDiff > 0) then
echo ""$chrom
endif
end
echo "prints chrom name of any unsorted chrom. expect none."
echo
# -------------------------------------------------
# check if KG is sorted by chrom:
# oooooooo fix sort
echo
# ~/bin/sortChromStart.csh $db knownGene
echo
# -------------------------------------------------
# start Comparative tests
# -------------------------------------------------
echo "-------------------------------------------------"
echo " Comparative tests"
echo "-------------------------------------------------"
# ------------
# for all tables in list: (csh version):
echo
echo "-------------------------------------------------"
echo "compare description of all tables with previous"
if ($db != $oldDb) then
echo " comparing to table in $oldDb on beta"
echo
endif
foreach table (`cat kgTablesAll`)
hgsql -h $sqlbeta -N -e "DESCRIBE $table" $oldDb > $db.beta.$table.desc
hgsql -N -e "DESCRIBE $table" $db > $db.dev.$table.desc
echo $table
diff $db.dev.$table.desc $db.beta.$table.desc
echo
end
echo
# find a way to count "<" and ">" lines in a file
# "sed effort below didn't work
# hgsql -N -e 'SELECT name FROM knownGeneLink WHERE name = "NM_198864"' $db
# found: therefore "<" in a diff means it is in the first file named
# hgsql -h $sqlbeta -N -e 'SELECT name FROM knownGeneLink \
# WHERE name = "NM_198864"' $db
# not found: therefore "<" in a diff means it is in the first file named
###############################
# didn't work:
# diff $db.dev.kgLink.sort $db.beta.kgLink.sort | sed -e '/>/'
###############################
# didn't work:
echo
echo "-------------------------------------------------"
echo "check rowcounts against beta"
echo "old version, beta, listed first"
if ($db != $oldDb) then
echo " comparing to table in $oldDb on beta"
echo
endif
foreach table (`cat kgTablesAll`)
set old=`hgsql -h $sqlbeta -N -e "SELECT COUNT(*) FROM $table" $oldDb`
set new=`hgsql -N -e "SELECT COUNT(*) FROM $table" $db`
set newRows=`expr $new - $old`
set percent = 0
if ($newRows == 0) then
echo
echo "$table : same number of rows\n"
else
if ($newRows < 0) then
set negRows=`echo $newRows | gawk '{print (-1) * $1}'`
echo "$table has $negRows fewer rows"
else
echo "$table has $newRows more rows"
endif
echo " old: $old"
echo " new: $new"
if ($old != 0) then
set percent=`echo $newRows $old | gawk '{printf "%.2f\n", $1/$2*100}'`
# attempt to put a "+" in fornt of positive numbers. except they are strings.
if ($newRows > 0) then
set percent="+"$percent
endif
echo " diff: $newRows ("$percent"%)"
endif
echo
endif
end
echo
###############################
# -------------------------------------------------
# kgAlias -- check diffs from former (two-field table):
echo
echo "kgAlias -- check diffs from former (two- field table):"
if ($db != $oldDb) then
echo " comparing to table in $oldDb on beta"
echo
endif
hgsql -N -e "SELECT * FROM kgAlias" $db | sort > $db.dev.kgAlias.sort
hgsql -h $sqlbeta -N -e "SELECT * FROM kgAlias" $oldDb | sort > $oldDb.beta.kgAlias.sort
wc -l *kgAlias.sort | grep -v total
echo
# -------------------------------------------------
# some things on protocol not yet implemented:
#
# comparative to previous asembly. knownGene:
# compare gene size. examine changes clarger than 10K
# compare exonCount. examine changes larger than 5
#
# comparative to previous assembly. kgXref
# use KGXrefCheck.java (only compares old assembly to new now. won't do new assembly update)
# use KGXrefDiff.java
#
# --------------------------------------------
# check bioCyc tables:
echo
echo "-------------------------------------------------"
echo check bioCyc tables, if available:
echo
set biocyc=`cat {$trackName}TablesAll | grep "bioCyc" | wc -l`
if ($biocyc > 0) then
echo "bioCycPathway"
hgsql -N -e "SELECT COUNT(DISTINCT(kgID)) FROM bioCycPathway" $db
hgsql -e "SELECT * FROM bioCycMapDesc LIMIT 5" $db
echo
echo "bioCycMapDesc"
hgsql -N -e "SELECT COUNT(DISTINCT(mapID)) FROM bioCycMapDesc" $db
hgsql -e "SELECT * FROM bioCycPathway LIMIT 5" $db
echo
else
echo "no bioCyc* tables in this assembly"
endif
echo
echo "flesh this out with checks on number of genes. are any uniq, etc."
echo "pipe a column into awk and get the proper other values"
echo
echo "add support for check previous assembly as parameter if not a rebuild"
# -------------------------------------------------
echo "-------------------------------------------------"
echo "check for different types of knownAlt"
echo
hgsql -e "SELECT DISTINCT name, COUNT(*) AS number FROM knownAlt \
GROUP BY name ORDER BY number DESC" $db
echo
# maybe get a link for each type?
# -------------------------------------------------
#
# further development:
# -------------
# see ooooo above.
# comparatives: compare gene size. examine changes larger than 10k bases
# comparatives: compare exonCount. examine changes larger than 5
# use .../jdbc/KGXrefCheck
# --finds old db entries that do not match in new at kgID, spID or refseq.
# check kgAlias for added, deleted and changes aliases
#
# -------------------------------------------------
#
#
echo " -------------------------------------------------"
echo "check out proteinsYYMMDD tables : hugo, etc. see push protocols"
echo " set gdbPdb after tables pushed "
echo " -------------------------------------------------"
echo
#
echo
echo " -------------------------------------------------"
echo " When the gene set is stable, notify Tim Strom (TimStrom@gsf.de)"
echo " so he can change his ExonPrimer - if a human or mouse assembly"
echo " -------------------------------------------------"
echo
echo
echo " -------------------------------------------------"
-echo " Once the new UCSC Genes is released on RR,
- update .../htdocs/knownGeneLists.html"
+echo " Once the new UCSC Genes is released on RR, "
+echo " update .../htdocs/knownGeneLists.html"
echo " -------------------------------------------------"
echo "\n here are files for /usr/local/apache/htdocs/knownGeneList/$db"
echo
ls -l /usr/local/apache/htdocs/knownGeneList/$db
echo
echo
echo " -------------------------------------------------"
echo " now go do Family Browser"
echo " -------------------------------------------------"
echo
echo " then run bigPush.csh ??"
# clean up unneeded files:
rm -f *desc
rm -f *Cols
rm -f $db.chr*
# "clear out empty files:
~/bin/dumpEmpty.csh . >& /dev/null
echo
echo end
echo