d4d7d316841c2aeced33a4aaa088dd40741e942d rhead Fri Aug 24 17:21:27 2012 -0700 Changed the check for genomeClade so that it matches the whole word from the dbDb.genome field instead of using mysql LIKE. The LIKE match was causing problems for pairs of organisms with similar names, such as tree shrew and shrew, or guinea pig and pig. diff --git src/utils/qa/checkMetaData.csh src/utils/qa/checkMetaData.csh index 0188c87..d4e91f3 100755 --- src/utils/qa/checkMetaData.csh +++ src/utils/qa/checkMetaData.csh @@ -1,223 +1,215 @@ #!/bin/tcsh source `which qaConfig.csh` ############################################### # # 08-25-04 # Robert Kuhn # # checks the metadata for a new assembly. # ############################################### set db="" set dbTrunc="" set metatables="dbDb blatServers defaultDb gdbPdb genomeClade liftOverChain" if ( $#argv < 1 || $#argv > 3 ) then # no command line args echo echo " checks the metadata for a new assembly." echo echo " usage: database [machine1 machine2]" echo ' (defaults to dev and beta, or use "RR".)' echo exit else set db=$argv[1] set dbTrunc=`echo $db | sed -e "s/[0-9]*//"` endif # set defaults set mach1="hgwdev" set mach2="hgwbeta" set centdb1="hgcentraltest" set centdb2="hgcentralbeta" set host1="" set host2="-h $sqlbeta" # check if assembly database exists on dev. set orgCheck=`hgsql -N -e 'SELECT COUNT(*) FROM dbDb WHERE name = "'$db'"' \ hgcentraltest` if ( $orgCheck == 0 ) then echo echo " $db is not a valid genome database." echo exit endif # set machines, if given on command line. 
if ( $#argv == 2 ) then echo "\n please use two machine names.\n" echo "${0}:" $0 exit 1 endif if ( $#argv == 3 ) then set mach1=$argv[2] set mach2=$argv[3] endif # set machines to dev where needed if ( $mach1 == "hgwdev" ) then set centdb1="hgcentraltest" set host1="" endif if ( $mach2 == "hgwdev" ) then set centdb2="hgcentraltest" set host2="" endif # set machines to beta where needed if ( $mach1 == "hgwbeta" ) then set centdb1="hgcentralbeta" set host1="-h $sqlbeta" endif if ( $mach2 == "hgwbeta" ) then set centdb2="hgcentralbeta" set host2="-h $sqlbeta" endif if ( $mach1 == "RR" || $mach1 == "rr" ) then set centdb1="hgcentral" set host1="-h $sqlrr" endif if ( $mach2 == "RR" || $mach2 == "rr" ) then set centdb2="hgcentral" set host2="-h $sqlrr" endif # set machines to RR where needed if hgw# format used set covered="hgwdev hgwbeta rr RR" echo $covered | grep -wq "$mach1" if ( $status ) then checkMachineName.csh $mach1 if ( $status ) then exit 1 else set centdb1="hgcentral" set host1="-h $sqlrr" endif endif echo $covered | grep -wq "$mach2" if ( $status ) then checkMachineName.csh $mach2 if ( $status ) then exit 1 else set centdb2="hgcentral" set host2="-h $sqlrr" endif endif # echo # echo "host1 = $host1" # echo "centdb1= $centdb1" # echo # echo "host2 = $host2" # echo "centdb2= $centdb2" echo # make file extension for output set out1=`echo $centdb1 | sed -e "s/-h //"` set out2=`echo $centdb2 | sed -e "s/-h //"` # echo "out1 = $out1" # echo "out2 = $out2" # ---------------------------------------------------- # compare metadata echo "database = $db" echo # check dbDb set metatable="dbDb" hgsql $host1 -Ne 'SELECT * FROM dbDb WHERE name = "'$db'"' $centdb1 \ > $metatable.$db.$out1 hgsql $host2 -Ne 'SELECT * FROM dbDb WHERE name = "'$db'"' $centdb2 \ > $metatable.$db.$out2 # check blatServers set metatable="blatServers" hgsql $host1 -Ne 'SELECT * FROM blatServers WHERE db = "'$db'"' $centdb1 | sort \ > $metatable.$db.$out1 hgsql $host2 -Ne 'SELECT * FROM 
blatServers WHERE db = "'$db'"' $centdb2 | sort \ > $metatable.$db.$out2 # check defaultDb set metatable="defaultDb" hgsql $host1 -Ne 'SELECT * FROM defaultDb WHERE name LIKE "'$dbTrunc'%"' \ $centdb1 > $metatable.$db.$out1 hgsql $host2 -Ne 'SELECT * FROM defaultDb WHERE name LIKE "'$dbTrunc'%"' \ $centdb2 > $metatable.$db.$out2 # check gdbPdb set metatable="gdbPdb" hgsql $host1 -Ne 'SELECT * FROM gdbPdb WHERE genomeDb = "'$db'"' $centdb1 \ > $metatable.$db.$out1 hgsql $host2 -Ne 'SELECT * FROM gdbPdb WHERE genomeDb = "'$db'"' $centdb2 \ > $metatable.$db.$out2 # check liftOverChain set metatable="liftOverChain" hgsql $host1 -Ne 'SELECT * FROM liftOverChain WHERE fromDb = "'$db'" \ or toDb = "'$db'"' $centdb1 | sort \ > $metatable.$db.$out1 hgsql $host2 -Ne 'SELECT * FROM liftOverChain WHERE fromDb = "'$db'" \ or toDb = "'$db'"' $centdb2 | sort \ > $metatable.$db.$out2 # check genomeClade # get genome name for the assembly to query genomeClade table. set genome=`hgsql -N -e 'SELECT genome FROM dbDb WHERE name = "'$db'"' \ hgcentraltest` -# pull out last word of the find, if in the format "G. species" -# and use LIKE to query genomeClade. 
-set secondWord=`echo $genome | gawk -F" " '{print $2}'` -if ( $secondWord != "" ) then - set genome=$secondWord -endif - set metatable="genomeClade" # get lookup for clade check -# filter out "/" when it appears in genome name - to avoid e.g, Dog/Human -hgsql $host1 -Ne 'SELECT * FROM genomeClade WHERE genome LIKE "%'$genome'"' \ - $centdb1 | grep -v "/" | sort > $metatable.$db.$out1 -hgsql $host2 -Ne 'SELECT * FROM genomeClade WHERE genome LIKE "%'$genome'"' \ - $centdb2 | grep -v "/" | sort > $metatable.$db.$out2 +hgsql $host1 -Ne "SELECT * FROM genomeClade WHERE genome='$genome'" \ + $centdb1 | sort > $metatable.$db.$out1 +hgsql $host2 -Ne "SELECT * FROM genomeClade WHERE genome='$genome'" \ + $centdb2 | sort > $metatable.$db.$out2 set metatable="" # compare and print results # should replace with commTiro.csh, but for now simply sorting genomeClade above foreach table ( `echo $metatables` ) comm -23 $table.$db.$out1 $table.$db.$out2 > $table.$db.${out1}Only comm -13 $table.$db.$out1 $table.$db.$out2 > $table.$db.${out2}Only comm -12 $table.$db.$out1 $table.$db.$out2 > $table.$db.common wc -l $table.$db.${out1}Only $table.$db.${out2}Only $table.$db.common \ | gawk '{ printf("%3d %-45s\n", $1, $2) }' \ | grep -v "total" echo end