src/utils/qa/qaEnsGenes.csh 1.20
1.20 2009/03/21 00:29:25 rhead
Changed host hgwbeta to host hgofbeta.
Index: src/utils/qa/qaEnsGenes.csh
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/qa/qaEnsGenes.csh,v
retrieving revision 1.19
retrieving revision 1.20
diff -b -B -U 1000000 -r1.19 -r1.20
--- src/utils/qa/qaEnsGenes.csh 2 Feb 2009 19:35:38 -0000 1.19
+++ src/utils/qa/qaEnsGenes.csh 21 Mar 2009 00:29:25 -0000 1.20
@@ -1,255 +1,255 @@
#!/bin/tcsh
###############################################
#
# 03-18-08
# Ann Zweig
#
# Runs through the usual checks for Ensembl
# Gene updates.
#
###############################################
set runOn=''
set ver=0
set dbs=''
set db=''
set betaList=''
if ($#argv != 2 ) then
echo
echo " runs test suite for ensGene track update"
echo " run this script before you push the new tables to beta"
echo " (makes lots of files: run in junk directory)"
echo " (it's best to direct output and errors to a file: '>&')"
echo
echo " usage: ensGeneVersionNumber (db | all)"
echo " ensGeneVersionNumber = Ensembl's version, e.g. 49"
echo " choose one database (db) or all dbs with ensGenes tracks (all)"
echo
exit 1
else
set ver=$argv[1]
set runOn=$argv[2]
endif
# run only from hgwdev
if ( "$HOST" != "hgwdev" ) then
echo "\nERROR: you must run this script on hgwdev!\n"
exit 1
endif
# check input
if ( $ver <= 45 || $ver >= 100 ) then
echo "\nERROR: you must enter a valid version number!\n"
exit 1
endif
# get rid of files, if they are around
rm -f xxDbList$$ xxNotActive$$ xxNotOnBeta$$
# figure out which assemblies already have an ensGene track on beta
set betaList=`getAssemblies.csh ensGene | egrep -v 'get' | egrep -v 'ensGene'`
# figure out which databases we're running it on
if ( 'all' == $runOn ) then
set dbs=`hgsql -Ne "SELECT db FROM trackVersion WHERE version = '$ver' \
and name='ensGene' ORDER BY db" hgFixed | sort -u`
echo "\nThe following databases were updated on hgwdev to ensGenes v$ver :"
echo "$dbs\n"
if ( "" == "$dbs" ) then
echo "\nERROR: there is no update available for version number $ver\n"
exit 1
else # updates for this ensGene version exist
foreach db ($dbs)
- set onBeta=`hgsql -h hgwbeta -Ne "SELECT name FROM dbDb \
+ set onBeta=`hgsql -h hgofbeta -Ne "SELECT name FROM dbDb \
WHERE name = '$db' AND active = 1" hgcentralbeta`
if ( "" == "$onBeta" ) then
echo $db >> xxNotActive$$
else
set hasTrack=''
set hasTrack=`echo $betaList | egrep -wo $db`
if ( "$db" != "$hasTrack" ) then
echo $db >> xxNotOnBeta$$
echo $db >> xxDbList$$
else
echo $db >> xxDbList$$
endif
endif
end
# print out all results
if ( -e xxNotActive$$ ) then
echo "\nOf that list, the following databases are not active on beta"
echo "so the tests in this script will not be run on them:"
cat xxNotActive$$
endif
if ( -e xxNotOnBeta$$ ) then
echo "\nOf that list, the following databases do not have an ensGenes track"
echo "on hgwbeta (however, the tests in this script will still be run on them)."
echo "You might consider releasing an ensGenes track for these databases:"
cat xxNotOnBeta$$
endif
set dbs=`cat xxDbList$$`
endif
else # running on one database only (don't check, just run)
echo "\nRunning script for Ensebml Genes v$ver on this assembly:\n"
set dbs=$runOn
echo $dbs
endif
# a huge loop through all databases
foreach db ($dbs)
echo "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
echo "~~~~~~~~~ $db ~~~~~~~~~~~~"
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
# find out if this is a new Ensembl Genes track (or an update)
set ensTrack=`echo $betaList | egrep -wo $db`
echo "\n\n----------------------"
echo "compare new (dev) and old (beta) ens* tables"
echo "this shows the counts of rows unique to dev, unique to beta, and"
echo "present on both. you should be suspicious if there are big differences"
compareWholeColumn.csh $db ensGene name
compareWholeColumn.csh $db ensPep name
compareWholeColumn.csh $db ensGtp transcript
echo
echo "\n\n----------------------"
echo "check a few of the new additions to the ensGene table"
echo "(be sure to click all the way out to Ensembl website)"
echo "\ncheck these in $db browser on hgwdev:"
head -2 $db.ensGene.name.devOnly
tail -2 $db.ensGene.name.devOnly
# only do this test if the ensGene track already exists on beta
if ( $db == $ensTrack ) then
echo "\n\n----------------------"
echo "check a few of the deleted items from the ensGene table"
echo "(make sure they have also been dropped from Ensembl website)"
echo "\ncheck these in $db browser on hgwbeta:"
head -2 $db.ensGene.name.betaOnly
tail -2 $db.ensGene.name.betaOnly
endif
echo "\n\n----------------------"
echo "these are full sets of corresponding rows from the three tables:"
echo "ensGene <-> ensPep <-> ensGtp on hgwdev"
echo "\ncheck these two genes (and their peptides) on hgwdev for '$db':"
hgsql -e "SELECT * FROM ensGene, ensPep, ensGtp WHERE \
ensGene.name = ensPep.name AND ensGene.name = ensGtp.transcript LIMIT 2\G" $db
echo "\n\n----------------------"
echo "run genePredCheck on the ensGene table. if there a failure here,"
echo "then something is seriously wrong with the ensGene table."
echo "MarkD can help you figure out exactly what's wrong."
echo "\ngenePredCheck results for $db.ensGene on hgwdev:"
genePredCheck -db=$db ensGene
echo "\n\n----------------------"
echo "find out which chroms the genes are on (for both dev and beta)."
echo "look for unusually small or large numbers here (or big differences)."
# don't run this on scaffold assemblies
set numChroms=`hgsql -Ne "SELECT COUNT(*) FROM chromInfo" $db`
if ( $numChroms < 100 ) then
if ( $db == $ensTrack ) then
countPerChrom.csh $db ensGene $db hgwbeta
else
countPerChrom.csh $db ensGene $db
endif
else
echo "$db is a scaffold assembly: skipping countPerChrom"
endif
echo
echo "\n\n----------------------"
echo "featureBits for new (dev) and old (beta) tables"
echo "\nfeatureBits $db ensGene (on hgwdev):"
featureBits $db ensGene
echo "featureBits $db -countGaps ensGene (on hgwdev):"
featureBits $db -countGaps ensGene
echo "featureBits $db -countGaps ensGene gap (on hgwdev):"
featureBits $db -countGaps ensGene gap
if ( $db == $ensTrack ) then
echo "\nfeatureBits $db ensGene (on hgwbeta):"
ssh hgwbeta featureBits $db ensGene
echo "featureBits $db -countGaps ensGene (on hgwbeta):"
ssh hgwbeta featureBits $db -countGaps ensGene
echo "featureBits $db -countGaps ensGene gap (on hgwbeta):"
ssh hgwbeta featureBits $db -countGaps ensGene gap
endif
echo
echo "\n\n----------------------"
echo "check that the ensGene track is sorted by chrom:"
echo "positionalTblCheck -verbose=2 $db ensGene\n"
positionalTblCheck -verbose=2 $db ensGene
echo
echo "\n\n----------------------"
echo "run Joiner Check. look for errors in the following two lines only:"
echo "ensPep.name and ensGtp.transcript"
echo "\nrunning joinerCheck for $db on ensemblTranscriptId:"
joinerCheck -keys -database=$db -identifier=ensemblTranscriptId ~/kent/src/hg/makeDb/schema/all.joiner
echo "\n\n----------------------"
echo "ensGene names typically begin with 'ENS'. if there is a number other"
echo "than 0, then there are ensGenes that do not begin with 'ENS'."
echo "check them out on the Ensembl website."
echo "\nnumber of ensGenes that do not begin with 'ENS' in '$db':"
set num=`hgsql -Ne "SELECT COUNT(*) FROM ensGene WHERE name \
NOT LIKE 'ENS%'" $db`
echo $num
if ( 0 != $num ) then
echo "instead of 'ENS', the ensGenes in this table look like this:"
hgsql -Ne "SELECT name FROM ensGene WHERE name NOT LIKE 'ENS%' LIMIT 3" $db
endif
echo "\n\n----------------------"
echo "A few tracks have another table or two associated with them. For"
echo "example, when Ensembl uses different scaffold names than we do, there"
echo "should be a translation table called: ensembleGeneScaffold. This"
echo "table supports a separate track called: Ensembl Assembly."
echo "Assemblies with a UCSC Gene track should also have a table called:"
echo "knownToEnsembl. Here's what this assembly has:"
echo
hgsql -Ne "SHOW TABLES LIKE 'ensemblGeneScaffold'" $db
hgsql -Ne "SHOW TABLES LIKE 'knownToEnsembl'" $db
end # huge loop through each database
# remember the hgFixed.trackVersion table
echo "\n\n----------------------"
echo "Don't forget to also push (to beta and then to the RR) the appropriate"
echo "rows from the trackVersion table in the hgFixed database."
echo "These rows allow the correct version number to be displayed in hgTrackUi."
echo "\nEdit the following pseudo-SQL statement to include only those"
echo "assemblies you will be pushing:"
echo 'hgsql -Ne "SELECT * FROM trackVersion WHERE version = ensGeneUpdateVersion AND db like 'dbsYouWillPush'" hgFixed'
# make sure the date column has been updated
echo "\n\n----------------------"
echo "the dateReference column in the hgFixed.trackVersion table"
echo "should say 'current' for your database (or all):"
hgsql -Ne "SELECT db, dateReference FROM trackVersion WHERE version = $ver AND name = 'ensGene'" hgFixed
# check that the corresponding upstream MAF files have been updated
echo "\n\n----------------------"
echo "In conjunction with an Ensembl Gene update, some upstream MAF files need"
echo "to be rebuilt. Specifically those for: ornAna1, fr2, gasAcu1, oryLat2"
echo "Check for them here (look for new dates) there should be 3 for each db:"
echo "hgwdev:/data/apache/htdocs/goldenPath/<db>/multiz*way/maf/ensGene.upstream?000.maf.gz"
echo
# clean up
rm -f xxDbList$$ xxNotActive$$ xxNotOnBeta$$
echo "\nthe end.\n"
exit 0