src/hg/utils/automation/synBlastp.csh 1.4
1.4 2009/09/26 21:22:51 kent
Working around mysql 5 bug.
Index: src/hg/utils/automation/synBlastp.csh
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/synBlastp.csh,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/utils/automation/synBlastp.csh 13 Feb 2007 01:37:29 -0000 1.3
+++ src/hg/utils/automation/synBlastp.csh 26 Sep 2009 21:22:51 -0000 1.4
@@ -1,146 +1,147 @@
#!/bin/tcsh
# $Id$
##########################################################################
#
# synBlastp.csh - Help filter out unwanted paralogs from xxBlastTab
# using chains for synteny (Galt 2007-01-10)
#
# Background: The xxBlastTab tables are made with a simple blastall
# (blastp with -b 1) which chooses the best match. Unfortunately this
# means that if there is no proper match it will still pick something
# even though it's probably not orthologous. This is especially a problem
# in organisms like rat knownGene which has only 30% gene coverage.
# The strategy here is to filter our xxBlastTab using synteny mappings from
# the chains. This is done by simply taking $db.kg and using /gbdb/$db chains
# and pslMap to lift the genes to the target xx assembly. Then hgMapToGene
# will find which of those mapped ids have good overlap with xx.knownGene.
# The final mapping is then created by doing an inner join between
# the traditional xxBlastTab and the mapping table produced above.
# Then simply drop the old table and rename the new table.
#
# We are starting with xxBlastTab tables already built in the usual way with
# blastall/blastp, probably with doHgNearBlastp.pl script.
# check which host we are on
if ( "$HOST" != "hgwdev" ) then
echo "error: you must run this script on hgwdev!"
exit 1
endif
# check for required params db and otherDb
if ( "$1" == "" ) then
echo "error: you must specify assembly db and other db on commandline, e.g."
echo "synBlastp.csh hg18 rn4"
exit 1
endif
if ( "$2" == "" ) then
echo "error: you must specify assembly db and other db on commandline, e.g."
echo "synBlastp.csh hg18 rn4"
exit 1
endif
set db = "$1"
set otherDb = "$2"
# check for required lift file in /gbdb/$db
set otherDbUp1st = `echo "$otherDb" | gawk '{print toupper(substr($1,1,1)) substr($1,2,length($1)-1)}'`
set lift = "/gbdb/$db/liftOver/${db}To${otherDbUp1st}.over.chain.gz"
if ( ! -e $lift ) then
echo "error: $lift not found. You must specify assembly db and other db on commandline, e.g."
echo "synBlastp.csh hg18 rn4"
exit 1
endif
#check for required tables $db.xxBlastTab and db.knownGene and otherDb.knownGene
set otherOrg = `echo "$otherDb" | sed 's/[0123456789]*//g'`
set xxBlastTab = "${otherOrg}BlastTab"
# db.xxBlastTab exists?
set sql = "show tables like '$xxBlastTab'"
set result = `hgsql $db -BN -e "$sql"`
if ($result != "$xxBlastTab") then
echo "error: table $db.$xxBlastTab not found."
exit 1
endif
# db.knownGene exists?
set sql = "show tables like 'knownGene'"
set result = `hgsql $db -BN -e "$sql"`
if ($result != "knownGene") then
echo "error: table $db.knownGene not found."
exit 1
endif
# otherDb.knownGene exists?
set sql = "show tables like 'knownGene'"
set result = `hgsql $otherDb -BN -e "$sql"`
if ($result != "knownGene") then
echo "error: table $otherDb.knownGene not found."
exit 1
endif
#debug
echo "db=$db"
echo "otherDb=$otherDb"
#echo "lift=$lift"
#echo "otherDbUp1st=$otherDbUp1st"
#echo "otherOrg=$otherOrg"
#echo "xxBlastTab=$xxBlastTab"
# --- general setup ---
cd ${HOME}
if (-d synBlastpTemp) then
rm -fr synBlastpTemp
endif
mkdir synBlastpTemp
cd synBlastpTemp
# --- pre-clean old tables from any previous failure ---
hgsql $otherDb -BN -e "drop table temp${db}KgPslMapped if exists" >& /dev/null
hgsql $otherDb -BN -e "drop table temp${otherDb}kgTo${db}kg if exists" >& /dev/null
hgsql $db -BN -e "drop table ${xxBlastTab}Temp if exists" >& /dev/null
# --- filter $db.xxBlastTab based on pslMapped $db.kg hgMapToGene'd over to $otherDb.kg ---
echo "genePredToFakePsl:"
genePredToFakePsl $db knownGene $db.kg.psl $db.kg.cds
echo "pslMap via $lift :"
zcat $lift | pslMap -chainMapFile -swapMap $db.kg.psl stdin stdout \
| sort -k 14,14 -k 16,16n > $db.$otherDb.kg.psl
echo "hgLoadPsl:"
hgLoadPsl $otherDb $db.$otherDb.kg.psl -table=temp${db}KgPslMapped
echo "hgMapToGene:"
hgMapToGene -all $otherDb -type=psl -verbose=0 temp${db}KgPslMapped knownGene temp${otherDb}kgTo${db}kg
echo "$db.${xxBlastTab}:"
# original for comparison:
set sql = "select count(distinct query) from ${xxBlastTab}"; echo "old number of unique query values:"; hgsql $db -BN -e "$sql"
set sql = "select count(distinct target) from ${xxBlastTab}"; echo "old number of unique target values"; hgsql $db -BN -e "$sql"
# drop rows that do not have a match in the psl-mapped otherDb table.
-hgsql $db -BN -e "delete ${xxBlastTab} from ${xxBlastTab} a left join ${otherDb}.temp${otherDb}kgTo${db}kg b on (a.query = b.value and a.target = b.name) where b.value is NULL"
+set yyBlastTab = ${otherDb}.temp${otherDb}kgTo${db}kg
+hgsql $db -BN -e "delete ${xxBlastTab} from ${xxBlastTab} left join ${yyBlastTab} on (${xxBlastTab}.query = ${yyBlastTab}.value and ${xxBlastTab}.target = ${yyBlastTab}.name) where ${yyBlastTab}.value is NULL"
set sql = "select count(distinct query) from ${xxBlastTab}"; echo "new number of unique query values:"; hgsql $db -BN -e "$sql"
set sql = "select count(distinct target) from ${xxBlastTab}"; echo "new number of unique target values"; hgsql $db -BN -e "$sql"
#cleanup:
hgsql $otherDb -BN -e "drop table temp${db}KgPslMapped"
hgsql $otherDb -BN -e "drop table temp${otherDb}kgTo${db}kg"
rm $db.kg.psl $db.kg.cds $db.$otherDb.kg.psl
cd ..
rmdir synBlastpTemp