src/utils/qa/knownGene.csh 1.16

1.16 2010/03/14 20:34:00 pauline
Fixed some typos, an unmatched quote and clarified description statement.
Index: src/utils/qa/knownGene.csh
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/qa/knownGene.csh,v
retrieving revision 1.15
retrieving revision 1.16
diff -b -B -U 1000000 -r1.15 -r1.16
--- src/utils/qa/knownGene.csh	14 Oct 2009 17:20:03 -0000	1.15
+++ src/utils/qa/knownGene.csh	14 Mar 2010 20:34:00 -0000	1.16
@@ -1,896 +1,898 @@
 #!/bin/tcsh
 source `which qaConfig.csh`
 
 # to do:  check that trackDb.hgGene is "on"
 # to do:  see that cgapBiocDesc has unique entries
 
 # --------------------------------------------
-#  run KGGeneCheck.java on dev:
+#  run HGGeneCheck.java on dev:
 
 #### echo
 #### echo " --------------------------------------------"
 ####
 #### echo "server hgwdev.cse.ucsc.edu /
 #### machine hgwdev.cse.ucsc.edu /
 #### quick false /
 #### dbSpec $db /
 #### table all /
 #### zoomCount 4" > $db.props
 ###
-#### echo "run KGGeneCheck: \
-####   nohup nice HGGeneCheck $db.props > & $db.KGrobot.out & "
+#### echo "run HGGeneCheck: \
+####   nohup nice HGGeneCheck $db.props > & $db.HGrobot.out & "
 #### echo
-echo " --------------------------------------------"
+#echo " --------------------------------------------"
 
 ###############################################
 # 
 #  03-09-04
 #  Robert Kuhn
 #
-#  Runs through the checks on Known Genes.  
-#  Requires KG java run in Trackchecking first.
-#  -----------
-#  run HGGeneCheck.java on dev 
-#  /cluster/home/heather/TrackChecking/KGGeneCheck.java
+#  This script runs through a number of checks on the Known Genes tables.  
+#  You need to run HGGeneCheck first. Also you need to have 3 files 
+#  (kgTables, gsTables, and pbTables) that list the tables involved one
+#  directory above the directory where you choose to run knownGene.csh.
+#  
+#  run HGGeneCheck.java on dev: 
+#  nohup nice HGGeneCheck $db.props > & $db.HGrobot.out &
 # 
 ###############################################
 
 set db=""
 set oldDb=""
 set trackName="kg"
 set table="knownGene"
 
 
 if ($#argv == 0 || $#argv > 2) then
   # no command line args
   echo
   echo "  runs test suite on Known Genes track."
   echo "  suggest KG java run in Trackchecking first."
   echo
   echo "    usage:  database [oldDb] (if fresh assembly for existing organism)"
   echo
   exit
 else
   set db=$argv[1]
 endif
 
 if ($#argv == 2) then
   set oldDb=$argv[2]
 else
   set oldDb=$db
 endif
 
 
 echo "using $db "
 echo "using $oldDb for beta db "
 set curr_dir=$cwd
 
 #  get sets of KG, GS, PB tables from master list:
 findKgTableSet.csh $db 
 
 # count items in list
 echo "found the following KG tables:"
 wc -l $db.{kg,gs,pb}Tables
 cat $db.{kg,gs,pb}Tables | sort -u > $db.allTables
 echo "will have some overlap. uniq tables = `cat $db.allTables | sort -u | wc -l`"
 
 # find new tables: in pushQ but NOT in a kgTables list
 echo
 echo "find new tables: in pushQ but NOT in a kgTables list"
 hgsql -h mysqlbeta -Ne "SELECT tbls FROM pushQ WHERE tbls LIKE '%knownGene%' \
   AND dbs = '"$db"'" qapushq | sed "s/\\n/\n/g" | egrep -r . > $db.pushqKgList
 dos2unix $db.pushqKgList >& /dev/null
 commTrio.csh $db.allTables $db.pushqKgList 
 
 if ( `wc -l $db.allTables.Only | awk '{print $1}'` != 0 ) then
   echo "these tables may be in separate tracks? ($db.allTables.Only) :"
   cat $db.allTables.Only
 endif 
 echo
 
 # --------------------------------------------
 
 echo
 echo "compare new genes to old set (uniq):"
 echo "not useful for first releasse of KG3-type track"
 if ($db != $oldDb) then
   echo "  comparing to table in $oldDb on beta"
   echo
 endif
 hgsql -N -e "SELECT name from knownGene" $db | sort | uniq > $db.KG.name.dev
 hgsql -N -h $sqlbeta -e "SELECT name from knownGene" $oldDb | sort |uniq > $oldDb.KG.name.beta.uniq
 comm -23 $db.KG.name.dev $oldDb.KG.name.beta.uniq > $db.KG.name.devOnly
 comm -13 $db.KG.name.dev $oldDb.KG.name.beta.uniq > $oldDb.KG.name.betaOnly
 comm -12 $db.KG.name.dev $oldDb.KG.name.beta.uniq > $db.KG.name.common
 wc -l  $db.KG.name.devOnly $oldDb.KG.name.betaOnly $db.KG.name.common | grep -v total 
 
 echo
 echo "check some new ones by hand:"
 head -20 $db.KG.name.devOnly | tail -3
 echo
 echo "check some old ones by hand on beta:"
 head -20 $oldDb.KG.name.betaOnly | tail -3
 echo
 
 echo "strip version number and compare again:"
 awk -F. '{print $1}' $db.KG.name.devOnly     > $db.KG.name.devOnly.noVersion 
 awk -F. '{print $1}' $oldDb.KG.name.betaOnly > $oldDb.KG.name.betaOnly.noVersion
 commTrio.csh  $db.KG.name.devOnly.noVersion $oldDb.KG.name.betaOnly.noVersion
 
 # --------------------------------------------
 # check alignID for unique:
 
 echo
 echo "checking alignID for unique:"
 hgsql -N -e "SELECT alignID FROM knownGene" $db | sort > $db.KG.alignID.sort
 uniq  $db.KG.alignID.sort > $db.KG.alignID.uniq
 comm -12  $db.KG.alignID.sort $db.KG.alignID.uniq > $db.KG.alignID.common
 echo "expect all lines to be in common file:  all alignIDs are unique"
 
 echo "if any are in sort file more than once:"
 comm -23  $db.KG.alignID.sort $db.KG.alignID.uniq > $db.KG.alignID.dupes 
 wc -l $db*alignID* | grep -v "total"
 echo "three duplicate alignIDs (may be blank):"
 head -3 $db.KG.alignID.dupes 
 echo
 
 
 # --------------------------------------------
 # check strand:
 
 echo "-------------------------------------------------"
 echo
 echo "counting strands:"
 set totKG=`hgsql -N -e  'SELECT COUNT(*) FROM knownGene' $db`
 set tot=`hgsql -N -e "SELECT COUNT(*) FROM knownGene" $db` 
 set plus=`hgsql -N -e 'SELECT COUNT(*) FROM knownGene WHERE strand = "+"' $db` 
 set minus=`hgsql -N -e 'SELECT COUNT(*) FROM knownGene WHERE strand = "-"' $db`
 set null=`hgsql -N -e 'SELECT COUNT(*) FROM knownGene WHERE strand = ""' $db`
 
 set total = `expr $null + $minus + $plus`
 
 echo "null  = $null"
 echo "plus  = $plus"
 echo "minus = $minus"
 echo "total = $total"
 echo "rows  = $totKG"
 echo
 
 
 # -------------------------------------------------
 # look for start and end order:
 
 echo "-------------------------------------------------"
 echo
 echo "look for start and end order:"
 rm -f txFile
 set var1=`hgsql -N -e "SELECT COUNT(*) FROM knownGene WHERE txStart >= txEnd" $db` 
 set var2=`hgsql -N -e "SELECT COUNT(*) FROM knownGene WHERE txStart > cdsStart" $db`
 set var3=`hgsql -N -e "SELECT COUNT(*) FROM knownGene WHERE cdsEnd > txEnd" $db`
 set var4=`hgsql -N -e "SELECT COUNT(*) FROM knownGene WHERE txStart = cdsStart" $db`
 set var5=`hgsql -N -e "SELECT COUNT(*) FROM knownGene WHERE cdsEnd = txEnd" $db` 
 set var6=`hgsql -N -e "SELECT COUNT(*) FROM knownGene WHERE txStart < 0" $db` 
 set var7=`hgsql -N -e "SELECT COUNT(*) FROM knownGene WHERE cdsStart < 0" $db` 
 
 echo
 echo "txStart >= txEnd:   "$var1
 echo "txStart > cdsStart: "$var2
 echo "cdsEnd > txEnd:     "$var3
 echo "txStart = cdsStart: "$var4" (means no 5'-UTR if + strand)"
 echo "cdsEnd = txEnd:     "$var5" (means no 3'-UTR if + strand)"
 echo "negative txStart:   "$var6
 echo "negative cdsStart:  "$var7
 echo
 
 echo "-------------------------------------------------"
 echo
 echo "check if browser show no 5', 3'-UTR where predicted:"
 echo "no 5'-UTR:"
 hgsql -e "SELECT name, chrom, txStart, txEnd FROM knownGene \
          WHERE txStart = cdsStart LIMIT 3" $db
 echo
 echo "no 3'-UTR:"
 hgsql -e "SELECT name, chrom, txStart, txEnd FROM knownGene \
          WHERE txEnd = cdsEnd LIMIT 3" $db
 
 
 # -------------------------------------------------
 # check ends:
 
 echo "-------------------------------------------------"
 # looking for off the end of each chrom (look for all positive values)
 # looking for off-end of each chrom (look for all positive values)
 
 echo
 echo "looking for off the end of each chrom \
        (look for all positive values):"
 
 checkOffend.csh $db $table
 echo
 
 
 # -------------------------------------------------
 # check to see if there are genes on all chroms.
 
 
 echo "-------------------------------------------------"
 echo " check to see if there are genes on all chroms:"
 echo
 hgsql -N -e "SELECT ci.chrom, COUNT(*) AS genes \
   FROM chromInfo ci, knownGene kg WHERE ci.chrom=kg.chrom \
   AND ci.chrom NOT LIKE '%random' \
   GROUP BY ci.chrom ORDER BY chrom" $db
   
 hgsql -N -e "SELECT ci.chrom, COUNT(*) AS genes \
   FROM chromInfo ci, knownGene kg WHERE ci.chrom=kg.chrom \
   AND ci.chrom LIKE '%random' \
   GROUP BY ci.chrom ORDER BY chrom" $db 
 echo
 
 
 
 echo "-------------------------------------------------"
 echo " check coverage: "
 echo
 checkCoverage.csh $db $table
 
 echo
 runBits.csh $db knownGene
 echo
 runJoiner.csh $db knownGene
 
 # -------------------------------------------------
 # check exon sizes:
 # 
 # 
 # # edit the list /cluster/home/heather/jdbc/data/tablelist
 # echo
 # echo "knownGene" > /cluster/home/heather/jdbc/data/KG
 # 
 # # and run /cluster/home/heather/jdbc/ExonSize.java:
 # cd /cluster/home/heather/jdbc/
 # source javaenv
 # java ExonSize $db data/KG | egrep -v "items found|-----|Querying" > $curr_dir/$db.exonExtremes
 # cd $curr_dir 
 # 
 # echo
 # echo "number of very large or negative exons:"
 # grep "exon" $db.exonExtremes | wc -l
 # # echo "five lines are kruft (filtered above now with grep -v"
 # echo
 # echo "example of very large or negative exons:"
 # head -5 $db.exonExtremes
 # echo
 
 # -------------------------------------------------
 # count exons
 
 echo
 echo "count exons"
 echo "genes with the most exons:"
 hgsql -t -e "SELECT name, exonCount FROM knownGene ORDER BY exonCount DESC limit 5" $db > $db.manyExons
 cat $db.manyExons
 echo
 
 # -------------------------------------------------
 # look for unique names 
 
 echo
 echo "look for unique names:" 
 hgsql -N -e "SELECT name FROM knownGene" $db | sort \
   > $db.KG.name.sort
 uniq $db.KG.name.sort > $db.KG.name.uniq 
 
 set kgSort=`wc -l $db.KG.name.sort`
 set kgUniq=`wc -l $db.KG.name.uniq`
 set diff=`comm -23 $db.KG.name.sort $db.KG.name.uniq \
     | wc -l`
 echo "sort: "$kgSort
 echo "uniq: "$kgUniq
 echo "diff: "$diff
 echo
 
 
 # ------------------------------------
 
 # todo: get list of genes and their geneSymbol and count up how
 # many have the same name.  look at outliers.  what follows here doesn't
 # work anymore as KG.name is now a unique field.
 # todo: analyze knownCanonical clusters
 
 
 # # delete lines with single hit "   1[tab]" 
 # # -- and count lines remaining (genes used more than once):
 # 
 # echo
 # sort  $db.KG.name.sort | uniq -c | sort -nr > $db.KG.name.uniq.most
 # 
 # echo "number of genes that align more than once:"
 # sed -e "/  1 /d" $db.KG.name.uniq.most | wc -l
 # echo
 # 
 # echo "gene occurance profile"
 # echo
 # textHistogram -col=1 $db.KG.name.uniq.most
 # rm file
 # 
 # echo
 # echo "genes that align the most times:"
 # head $db.KG.name.uniq.most
 # echo
 
 
 
 # -------------------------------------------------
 # examine overlapping genes ???
 #   .../jdbc/Overlap.java
 #   how ???  There are many
 # -------------------------------------------------
 # look for same number of rows in kgXref, knownGenePep, knownGeneMrna
 
 echo
 echo "-------------------------------------------------"
 echo "look for same number of rows in kgXref, knownGenePep, knownGeneMrna:"
 echo "kgXref"
 set xrefNum=`hgsql -N -e "SELECT COUNT(*) FROM kgXref" $db`
 set xrefNumUniq=`hgsql -N -e "SELECT COUNT(DISTINCT(kgID)) FROM kgXref" $db`
 echo "$xrefNum   $xrefNumUniq (uniq)"
 echo "knownGenePep"
 set kgPep=`hgsql -N -e "SELECT COUNT(*) FROM knownGenePep" $db`
 echo $kgPep
 echo "knownGeneMrna"
 set kgMrna=`hgsql -N -e "SELECT COUNT(*) FROM knownGeneMrna" $db`
 echo $kgMrna
 echo "should match uniqs in KG, except for fewer Peps, now that we include non-coding:"
 wc -l $db.KG.name.uniq
 echo
 
 echo "dupes in kgXref.kgID field:"
 if ($xrefNum != $kgPep) then
   hgsql -e "SELECT kgID, COUNT(*) as number FROM kgXref GROUP BY kgID \
     ORDER BY number DESC LIMIT 5 " $db
 endif
 echo "should be no dupes anymore"
 
 # -------------------------------------------------
 # check that names in knownGene match names in knownGenePep, knownGeneMrna
 # check that names in knownGene match kgID in kgXref
 # check that names in knownGene match uniq kgID in kgProtAlias
 
 echo
 
 hgsql -N -e "SELECT name FROM knownGene" $db | sort | uniq > $db.KG.name.uniq
 hgsql -N -e "SELECT name FROM knownGenePep" $db | sort | uniq > $db.KGpep.name.uniq
 hgsql -N -e "SELECT name FROM knownGeneMrna" $db | sort | uniq > $db.KGrna.name.uniq
 hgsql -N -e "SELECT kgID FROM kgXref" $db | sort | uniq > $db.kgXref.kgID.uniq
 hgsql -N -e "SELECT kgID FROM kgProtAlias" $db | sort  | uniq > $db.kgProtAlias.kgID.uniq
 
 
 
 echo "-------------------------------------------------"
 echo "check that names in knownGene match names in knownGenePep, knownGeneMrna:"
 diff $db.KG.name.uniq $db.KGpep.name.uniq | wc -l
 diff $db.KG.name.uniq $db.KGrna.name.uniq | wc -l
 echo "expect zero. upper number is number of genes without pep"
 echo
 
 echo "-------------------------------------------------"
 echo "check that names in knownGene match kgID in kgXref and kgProtAlias:"
 # diff $db.KG.name.uniq $db.kgXref.kgID.uniq | wc -l
 # diff $db.KG.name.uniq $db.kgProtAlias.kgID.uniq | wc -l
 
 echo "uniq to KG.name (not in kgXref.kgID):"
 comm -23 $db.KG.name.uniq $db.kgXref.kgID.uniq | wc -l
 echo "uniq to kgXref.kgID: (not in KG.name):"
 comm -13 $db.KG.name.uniq $db.kgXref.kgID.uniq | wc -l
 
 echo "uniq to KG.name (not in kgProtAlias.kgID):"
 comm -23 $db.KG.name.uniq $db.kgProtAlias.kgID.uniq | wc -l
 echo "uniq to kgProtAlias.kgID: (not in KG.name):"
 comm -13 $db.KG.name.uniq $db.kgProtAlias.kgID.uniq | wc -l
 
 echo "expect zero"
 echo "as of KGII, duplicate kgXref.kgID are ok.  will be corrected in KGIII."
 echo
 
 echo "number of non-coding genes in kgTxInfo:"
 hgsql -e ' SELECT category, COUNT(*) AS number FROM kgTxInfo GROUP BY category \
   ORDER BY number DESC' $db
 echo "total non-coding:"
 hgsql -e 'SELECT COUNT(*) FROM kgTxInfo WHERE category NOT LIKE "coding"' $db
 
 
 
 # -------------------------------------------------
 # knownGenePep, knownGeneMrna -- check numbers of entries vs old version
 # knownGenePep, knownGeneMrna -- check min and max lengths of seq column
 
 echo
 echo "-------------------------------------------------"
 echo "knownGenePep, knownGeneMrna -- check numbers of entries vs old version:"
 if ($db != $oldDb) then
   echo "  comparing to table in $oldDb on beta"
   echo
 endif
 echo "dev first"
 echo
 echo "Pep"
 hgsql -N -e "SELECT COUNT(*) FROM knownGenePep" $db
 hgsql -h $sqlbeta -N -e "SELECT COUNT(*) FROM knownGenePep" $oldDb
 echo
 echo "Mrna"
 hgsql -N -e "SELECT COUNT(*) FROM knownGeneMrna" $db
 hgsql -h $sqlbeta -N -e "SELECT COUNT(*) FROM knownGeneMrna" $oldDb
 echo "should be some new ones"
 echo
 
 
 echo "knownGenePep, knownGeneMrna -- check min and max lengths of seq column:"
 echo "dev, format: {MIN, MAX}Mrna, {MIN, MAX}Pep:"
 echo "----"
  hgsql -N -e "SELECT MIN(LENGTH(seq)) FROM knownGeneMrna" $db
  hgsql -N -e "SELECT MAX(LENGTH(seq)) FROM knownGeneMrna" $db
  hgsql -N -e "SELECT MIN(LENGTH(seq)) FROM knownGenePep" $db
  hgsql -N -e "SELECT MAX(LENGTH(seq)) FROM knownGenePep" $db
 
 echo
 echo "beta, format: {MIN, MAX}Mrna, {MIN, MAX}Pep:"
 echo "----"
  hgsql -h $sqlbeta -N -e "SELECT MIN(LENGTH(seq)) FROM knownGeneMrna" $oldDb
  hgsql -h $sqlbeta -N -e "SELECT MAX(LENGTH(seq)) FROM knownGeneMrna" $oldDb
  hgsql -h $sqlbeta -N -e "SELECT MIN(LENGTH(seq)) FROM knownGenePep" $oldDb
  hgsql -h $sqlbeta -N -e "SELECT MAX(LENGTH(seq)) FROM knownGenePep" $oldDb
 echo "sometimes the same on dev as beta:  no new proteins at extremes "
 echo
 
 echo  "-------------------------------------------------"
 # kgXref -- mrna generally equals kgID
 
 echo "kgXref -- mrna generally equals kgID"
 echo "count:"
 hgsql -N -e "SELECT COUNT(*) FROM kgXref" $db
 echo "count mrna != kgID:"
 hgsql -N -e "SELECT COUNT(*) FROM kgXref WHERE mrna != kgID" $db
 echo "expect zero"
 echo
 
 
 echo  "-------------------------------------------------"
 # kgXref -- empty fields
 # should give zero (no empty fields):
 
 echo "refseq column can be empty (protAcc is, too, then)"
 hgsql -N -e 'SELECT COUNT(*) FROM kgXref WHERE refseq = "" ' $db
 hgsql -N -e 'SELECT COUNT(*) FROM kgXref WHERE refseq = "" AND protAcc = "" ' $db
 hgsql -N -e 'SELECT COUNT(*) FROM kgXref WHERE refseq = "" AND protAcc != "" ' $db
 echo "expect last value to be zero"
 echo
 
 
 # check all tables for null + zero entries:
 
 echo
 echo  "-------------------------------------------------"
 echo "check all tables for null + zero entries:"
 cat kgTables gsTables pbTables | sort -u > kgTablesAll
 foreach table (`cat kgTablesAll`)
   hgsql -N -e "DESC $table" $db | gawk '{print $1}' > ${table}Cols
   echo $table
   echo "============="
   set totRows=`hgsql -N -e  'SELECT COUNT(*) FROM '$table'' $db`
   echo "total rows = "$totRows
   rm -f $table.nullzero.totals
   echo "null + zero entries:"
   foreach col (`cat ${table}Cols`)
     set cnt=`hgsql -N -e  'SELECT COUNT(*) FROM '$table' WHERE '$col='""' $db`
     echo "$col\t$cnt"
   end
   echo
 end
 echo
 
 
 # check all tables for null entries:
 
 echo
 foreach table (`cat kgTablesAll`)
   hgsql -N -e "DESC $table" $db | gawk '{print $1}' > ${table}Cols
   echo $table
   echo "============="
   set totRows=`hgsql -N -e  'SELECT COUNT(*) FROM '$table'' $db`
   echo "total rows = "$totRows
   rm -f $table.null.totals
   echo "null entries:"
   foreach col (`cat ${table}Cols`)
     set cnt=`hgsql -N -e  "SELECT COUNT(*) FROM $table WHERE $col IS NULL" $db`
     echo "$col\t$cnt"
   end
   echo
 end
 echo
 
 
 echo
 echo  "-------------------------------------------------"
 echo "check all tables for null + zero entries:"
 echo "some, but not all  of protAcc column will be empty:"
 echo "kgXref.protAcc empty:"
 hgsql -N -e 'SELECT COUNT(*) FROM kgXref WHERE protAcc = "" ' $db
 echo "kgXref.protAcc not empty:"
 hgsql -N -e 'SELECT COUNT(*) FROM kgXref WHERE protAcc != "" ' $db
 echo
 
 
 # -------------------------------------------------
 # kgXref -- can have duplicate values in spID
 
 
 echo  "-------------------------------------------------"
 echo "kgXref -- can have duplicate values in spID"
 set tot=`hgsql -N -e "SELECT COUNT(*) FROM kgXref" $db` 
 hgsql -N -e "SELECT spID FROM kgXref" $db | sort | uniq > $db.kgXref.spID.uniq
 set uniq=`hgsql -N -e "SELECT spID FROM kgXref" $db | sort | uniq | wc -l`
 set diff = `expr $tot - $uniq`
 echo "tot   = "$tot
 echo "uniq  = "$uniq
 echo "dupes = "$diff
 echo
 
 
 # -------------------------------------------------
 # kgXref -- can position-search on spID:   
 #   make a java robot
 
 echo  "-------------------------------------------------"
 echo "kgXref -- can position-search on spID:"
 echo "check manually"
 hgsql -N -e "SELECT spID FROM kgXref ORDER BY spID DESC limit 2" $db
 echo
 
 
 # -------------------------------------------------
 # kgAlias -- check kgAlias.alias for uniq.  search position box with some
 
 echo  "-------------------------------------------------"
 echo "kgAlias -- check kgAlias.alias for uniq.  search position box with some."
 echo
 echo "kgAlias count:"
 hgsql -N -e "SELECT COUNT(*) FROM kgAlias" $db 
 echo "uniq:"
 hgsql -N -e "SELECT alias FROM kgAlias" $db | sort | uniq > $db.kgAlias.alias.uniq
 wc -l $db.kgAlias.alias.uniq
 echo "expect more uniq than KG names, but many dupes"
 echo
 echo "get some records"
 hgsql -e "SELECT alias, kgID FROM kgAlias ORDER BY alias limit 5" $db 
 echo
 
 
 # -------------------------------------------------
 # kgAlias -- uniq IDs from KG can have one or more matches
 
 echo  "-------------------------------------------------"
 echo "kgAlias -- uniq IDs from KG can have one or more matches"
 hgsql -N -e "SELECT name FROM knownGene" $db | sort | uniq > $db.KG.name.uniq
 wc -l  $db.KG.name.uniq
 hgsql -N -e "SELECT kgID FROM kgAlias" $db | sort | uniq > $db.kgAlias.kgID.uniq
 
 comm -23 $db.KG.name.uniq $db.kgAlias.kgID.uniq | wc -l
 comm -13 $db.KG.name.uniq $db.kgAlias.kgID.uniq | wc -l
 comm -12 $db.KG.name.uniq $db.kgAlias.kgID.uniq | wc -l
 echo "expect 0, 0, all"
 echo
 
 
 # -------------------------------------------------
 # kgAlias -- kgAlias uniq count should be less than KG.uniq
 
 echo
 echo  "-------------------------------------------------"
 echo "kgAlias -- kgAlias uniq count should be less than or equal to KG.uniq"
 hgsql -N -e "SELECT kgID FROM kgAlias" $db | sort | uniq | wc -l
 hgsql -N -e "SELECT name FROM knownGene" $db | sort | uniq | wc -l
 
 hgsql -N -e "SELECT kgID FROM kgAlias" $db | sort | uniq  > $db.kgAlias.kgID.uniq
 diff $db.KG.name.uniq $db.kgAlias.kgID.uniq | wc -l
 echo "expect zero"
 echo
 
 
 # -------------------------------------------------
 # kgAlias -- expect multiple rows with same kgID value
 
 echo
 echo  "-------------------------------------------------"
 echo "kgAlias -- expect multiple rows with same kgID value"
 set tot=`hgsql -N -e "SELECT COUNT(*) FROM kgAlias" $db`
 set uniq=`hgsql -N -e "SELECT kgID FROM kgAlias" $db | sort | uniq | wc -l`
 set diff = `expr $tot - $uniq`
 echo "tot   = "$tot
 echo "uniq  = "$uniq
 echo "dupes = "$diff
 echo
 
 # -------------------------------------------------
 # cgapAlias -- check that at least some cgapAlias.alias 
 #    entries are in knownGene.name
 
 echo
 echo  "-------------------------------------------------"
 echo "cgapAlias -- check that at least some cgapAlias.alias \
   entries are in knownGene.name"
 hgsql -N -e "SELECT name FROM knownGene" $db | sort | uniq > $db.KG.name.uniq
 hgsql -N -e "SELECT alias FROM cgapAlias" $db | sort | uniq > $db.cgapAlias.alias.uniq
 wc -l $db.cgapAlias.alias.uniq | grep -v total
 
 comm -23 $db.KG.name.uniq $db.cgapAlias.alias.uniq  > $db.KG.notin.cgap
 comm -12 $db.KG.name.uniq $db.cgapAlias.alias.uniq  > $db.KG.in.cgap
 echo 
 wc -l $db*KG*cgap* 
 echo  "expect total be number of KG"
 echo
 
 
 # -------------------------------------------------
 # keggPathway, keggMapDesc -- validate
 #  automate this w/ httpunit.  
 
 echo
 echo  "-------------------------------------------------"
 echo "keggPathway, keggMapDesc -- validate"
 echo
 hgsql -e "SELECT * FROM keggPathway limit 5" $db > $db.kegg
 cat $db.kegg
 
 echo
 # get the mapID for these 5:
 echo "check some by hand, using kgID -- should have kegg section \
        at bottom of KG page"
 
 foreach mapID (`cat $db.kegg | gawk '{print $3}'`)
   hgsql -N -e 'SELECT * FROM keggMapDesc WHERE mapID = "'$mapID'"' $db
 end
    
 
 echo
 # hgsql -e "SELECT * FROM keggMapDesc limit 5" $db
 echo
 
 
 # ooooooooooooooo  add logic for the following:
 # -------------------------------------------------
 # check KG sort
 # confirm the new table has the same rows as the old
 #  (sorted but not uniqed)
 
 echo
 echo  "-------------------------------------------------"
 echo "checking for KG position sorting:"
 hgsql -N -e "SELECT chrom FROM chromInfo" $db > $db.chroms
 foreach chrom (`cat $db.chroms`)
   hgsql -N -e 'SELECT txStart FROM knownGene WHERE chrom = "$chrom"' $db > $db.$chrom
   sort $db.$chrom > $db.$chrom.sort
   set sortDiff=`diff $db.$chrom $db.$chrom.sort | wc -l`
   if ($sortDiff > 0) then
     echo ""$chrom
   endif
 end
 echo "prints chrom name of any unsorted chrom.  expect none."
 echo
 
 # -------------------------------------------------
 # check if KG is sorted by chrom:
 
 # oooooooo fix sort
 
 echo
 # ~/bin/sortChromStart.csh $db knownGene
 echo
 
 
 # -------------------------------------------------
 # start Comparative tests
 # -------------------------------------------------
 
 
 echo "-------------------------------------------------"
 echo "          Comparative tests"
 echo "-------------------------------------------------"
 
 
 # ------------
 # for all tables in list: (csh version):
 
 echo 
 echo  "-------------------------------------------------"
 echo "compare description of all tables with previous"
 if ($db != $oldDb) then
   echo "  comparing to table in $oldDb on beta"
   echo
 endif
 foreach table (`cat kgTablesAll`)
   hgsql -h $sqlbeta -N  -e "DESCRIBE $table" $oldDb >  $db.beta.$table.desc
   hgsql -N  -e "DESCRIBE $table" $db > $db.dev.$table.desc
   echo $table
   diff $db.dev.$table.desc $db.beta.$table.desc
   echo
 end
 echo
 
 # find a way to count "<" and ">" lines in a file  
 # "sed effort below didn't work
 
 # hgsql -N -e 'SELECT name FROM knownGeneLink WHERE name = "NM_198864"' $db 
 # found: therefore "<" in a diff means it is in the first file named
 # hgsql -h $sqlbeta -N -e 'SELECT name FROM knownGeneLink \
 #       WHERE name = "NM_198864"' $db 
 # not found: therefore "<" in a diff means it is in the first file named
 
 ###############################
 # didn't work:
 # diff $db.dev.kgLink.sort $db.beta.kgLink.sort | sed -e '/>/' 
 ###############################
 # didn't work:
 
 
 echo
 echo  "-------------------------------------------------"
 echo "check rowcounts against beta"
 echo "old version, beta,  listed first"
 if ($db != $oldDb) then
   echo "  comparing to table in $oldDb on beta"
   echo
 endif
 foreach table (`cat kgTablesAll`)
   set old=`hgsql -h $sqlbeta -N  -e "SELECT COUNT(*) FROM $table" $oldDb`
   set new=`hgsql -N  -e "SELECT COUNT(*) FROM $table" $db`
   set newRows=`expr $new - $old`
   set percent = 0
   if ($newRows == 0) then
     echo
     echo "$table : same number of rows\n"
   else
     if ($newRows < 0) then
       set negRows=`echo $newRows | gawk '{print (-1) * $1}'`
       echo "$table has $negRows fewer rows"
     else
       echo "$table has $newRows more rows"
     endif
     echo "   old:  $old"
     echo "   new:  $new"
     if ($old != 0) then
       set percent=`echo $newRows $old | gawk '{printf "%.2f\n", $1/$2*100}'`
       # attempt to put a "+" in fornt of positive numbers.  except they are strings.
       if ($newRows > 0) then
         set percent="+"$percent
       endif
       echo "   diff: $newRows ("$percent"%)"
     endif
     echo
   endif
 end
 echo
 
 
 ###############################
 
 # -------------------------------------------------
 # kgAlias -- check diffs from former (two-field table):
 
 echo
 echo "kgAlias -- check diffs from former (two- field table):"
 if ($db != $oldDb) then
   echo "  comparing to table in $oldDb on beta"
   echo
 endif
 hgsql -N -e "SELECT * FROM kgAlias" $db | sort > $db.dev.kgAlias.sort
 hgsql -h $sqlbeta -N -e "SELECT * FROM kgAlias" $oldDb | sort > $oldDb.beta.kgAlias.sort
 wc -l *kgAlias.sort | grep -v total
 echo
 
 # -------------------------------------------------
 # some things on protocol not yet implemented:
 # 
 # comparative to previous asembly.  knownGene:
 #   compare gene size.  examine changes clarger than 10K
 #   compare exonCount.  examine changes larger than 5
 # 
 # comparative to previous assembly.  kgXref
 #   use KGXrefCheck.java (only compares old assembly to new now.  won't do new assembly update)
 #   use KGXrefDiff.java  
 # 
 
 # --------------------------------------------
 #  check bioCyc tables:
 
 echo
 echo  "-------------------------------------------------"
 echo  check bioCyc tables, if available:
 echo
 set biocyc=`cat {$trackName}TablesAll | grep "bioCyc" | wc -l`
 if ($biocyc > 0) then
   echo "bioCycPathway"
   hgsql -N -e "SELECT COUNT(DISTINCT(kgID)) FROM bioCycPathway" $db
   hgsql -e "SELECT * FROM bioCycMapDesc LIMIT 5" $db
 
   echo
   echo "bioCycMapDesc"
   hgsql -N -e "SELECT COUNT(DISTINCT(mapID)) FROM bioCycMapDesc" $db
   hgsql -e "SELECT * FROM bioCycPathway LIMIT 5" $db
   echo
 else
   echo "no bioCyc* tables in this assembly"
 endif
 echo
 echo "flesh this out with checks on number of genes.  are any uniq, etc."
 echo "pipe a column into awk and get the proper other values"
 echo
 echo "add support for check previous assembly as parameter if not a rebuild"
 
 # -------------------------------------------------
 echo "-------------------------------------------------"
 echo "check for different types of knownAlt"
 echo
 hgsql -e "SELECT DISTINCT name, COUNT(*) AS number FROM knownAlt \
   GROUP BY name ORDER BY number DESC" $db
 echo
  # maybe get a link for each type?
 
 
 
 # -------------------------------------------------
 # 
 # further development:
 #      -------------
 # see ooooo above.
 # comparatives:  compare gene size. examine changes larger than 10k bases
 # comparatives:  compare exonCount.  examine changes larger than 5 
 # use .../jdbc/KGXrefCheck 
 #     --finds old db entries that do not match in new at kgID, spID or refseq.
 # check kgAlias for added, deleted and changes aliases
 # 
 # -------------------------------------------------
 # 
 # 
 
 echo  " -------------------------------------------------"
 echo  "check out proteinsYYMMDD tables :  hugo, etc.  see push protocols"
 echo  "        set gdbPdb after tables pushed           "
 echo  " -------------------------------------------------"
 echo
 # 
 echo
 echo " -------------------------------------------------"
 echo "    When the gene set is stable, notify  Tim Strom (TimStrom@gsf.de)"
 echo "    so he can change his ExonPrimer - if a human or mouse assembly"
 echo " -------------------------------------------------"
 echo
 
 echo
 echo " -------------------------------------------------"
-echo "    Once the new UCSC Genes is released on RR, 
-          update .../htdocs/knownGeneLists.html"
+echo "    Once the new UCSC Genes is released on RR, "
+echo "         update .../htdocs/knownGeneLists.html"
 echo " -------------------------------------------------"
 echo "\n here are files for /usr/local/apache/htdocs/knownGeneList/$db"
 echo
 ls -l /usr/local/apache/htdocs/knownGeneList/$db
 echo
 
 echo
 echo " -------------------------------------------------"
 echo "            now go do Family Browser" 
 echo " -------------------------------------------------"
 echo
 echo "  then run bigPush.csh ??"
 
 # clean up unneeded files:
 rm -f *desc
 rm -f *Cols
 rm -f $db.chr*
 
 # "clear out empty files:
 ~/bin/dumpEmpty.csh . >& /dev/null
 
 echo
 echo end
 echo