9160ba77561829b8e213058680623b63d35262dc
braney
  Fri Sep 3 10:40:47 2021 -0700
get synBlast to work with bigBed

diff --git src/hg/utils/automation/synBlastp.csh src/hg/utils/automation/synBlastp.csh
index 7a04062..d7627e4 100755
--- src/hg/utils/automation/synBlastp.csh
+++ src/hg/utils/automation/synBlastp.csh
@@ -1,160 +1,160 @@
 #!/bin/tcsh
 # $Id: synBlastp.csh,v 1.5 2009/09/27 03:32:29 galt Exp $
 ##########################################################################
 #
 #  synBlastp.csh - Help filter out unwanted paralogs from xxBlastTab 
 #    using chains for synteny  (Galt 2007-01-10)
 #
 # Background: The xxBlastTab tables are made with a simple blastall 
 # (blastp with -b 1) which chooses the best match.  Unfortunately this
 # means that if there is no proper match it will still pick something
 # even though it's probably not orthologous. This is especially a problem
 # in organisms like rat knownGene which has only 30% gene coverage. 
 # The strategy here is to filter our xxBlastTab using synteny mappings from
 # the chains. This is done by simply taking $db.kg and using /gbdb/$db chains
 # and pslMap to lift the genes to the target xx assembly.  Then hgMapToGene
 # will find which of those mapped ids have good overlap with xx.knownGene.
 # The final mapping is then created by doing an inner join between 
 # the traditional xxBlastTab and the mapping table produced above
 # and deleting the appropriate records.
 #
 # We are starting with xxBlastTab tables already built in the usual way with
 # blastall/blastp, probably with doHgNearBlastp.pl script.
 
 # check which host we are on
 if ( "$HOST" != "hgwdev" ) then
  echo "error: you must run this script on hgwdev!"
  exit 1
 endif
 
 # check for required params db and otherDb
 if ( "$1" == "" ) then
  echo "error: you must specify assembly db and other db on commandline, e.g."
  echo "synBlastp.csh hg18 rn4"
  exit 1
 endif
 
 if ( "$2" == "" ) then
  echo "error: you must specify assembly db and other db on commandline, e.g."
  echo "synBlastp.csh hg18 rn4"
  exit 1
 endif
 
 set db = "$1"
 set otherDb = "$2"
 set geneTable = "knownGene"
 set otherGeneTable = "knownGene"
 
 if ( "$3" != "" ) then
     if ( "$4" == "" ) then
 	 echo "error: if you specify geneTable, you need to specify otherGeneTable too, e.g."
 	 echo "synBlastp.csh hg18 rn4 knownGene rgdGene2"
 	 exit 1
     endif
  set geneTable = $3
  set otherGeneTable = $4
 endif
 
 # check for required lift file in /gbdb/$db
 set otherDbUp1st = `echo "$otherDb" | gawk '{print toupper(substr($1,1,1)) substr($1,2,length($1)-1)}'`
 set lift = "/gbdb/$db/liftOver/${db}To${otherDbUp1st}.over.chain.gz"
 if ( ! -e $lift ) then
  echo "error: $lift not found. You must specify assembly db and other db on commandline, e.g."
  echo "synBlastp.csh hg18 rn4"
  exit 1
 endif
 
 
 #check for required tables $db.xxBlastTab and db.$geneTable and otherDb.$otherGeneTable
 
 set otherOrg = `echo "$otherDb" | sed 's/[0123456789]*//g'`
 
 set xxBlastTab = "${otherOrg}BlastTab"
 
 # db.xxBlastTab exists?
 set sql = "show tables like '$xxBlastTab'"
 set result = `hgsql $db -BN -e "$sql"`
 if ($result != "$xxBlastTab") then
  echo "error: table $db.$xxBlastTab not found."
  exit 1
 endif
 
 # db.$geneTable exists?
 set sql = "show tables like '$geneTable'"
 set result = `hgsql $db -BN -e "$sql"`
 if ($result != "$geneTable") then
  echo "error: table $db.$geneTable not found."
  exit 1
 endif
 
 # otherDb.$otherGeneTable exists?
 set sql = "show tables like '$otherGeneTable'"
 set result = `hgsql $otherDb -BN -e "$sql"`
 if ($result != "$otherGeneTable") then
  echo "error: table $otherDb.$otherGeneTable not found."
  exit 1
 endif
 
 
 #debug
 echo "db=$db"
 echo "geneTable=$geneTable"
 echo "otherDb=$otherDb"
 echo "otherGeneTable=$otherGeneTable"
 #echo "lift=$lift"
 #echo "otherDbUp1st=$otherDbUp1st"
 #echo "otherOrg=$otherOrg"
 #echo "xxBlastTab=$xxBlastTab"
 
 # --- general setup ---
 
 cd ${HOME}
 if (-d synBlastpTemp) then
     rm -fr synBlastpTemp
 endif
 mkdir synBlastpTemp
 cd synBlastpTemp
 
 # --- pre-clean old tables from any previous failure ---
 hgsql $otherDb -BN -e "drop table temp${db}KgPslMapped if exists" >& /dev/null
 hgsql $otherDb -BN -e "drop table temp${otherDb}kgTo${db}kg if exists" >& /dev/null
 hgsql $db -BN -e "drop table ${xxBlastTab}Temp if exists" >& /dev/null
 
 
 # --- filter $db.xxBlastTab based on pslMapped $db.kg hgMapToGene'd over to $otherDb.kg ---
 
 echo "genePredToFakePsl:"
 genePredToFakePsl $db $geneTable $db.kg.psl $db.kg.cds
 
 echo "pslMap via $lift :"
 zcat $lift | pslMap -chainMapFile -swapMap $db.kg.psl stdin stdout \
 |  sort -k 14,14 -k 16,16n > $db.$otherDb.kg.psl
 
 echo "hgLoadPsl:"
 hgLoadPsl -clientLoad $otherDb $db.$otherDb.kg.psl -table=temp${db}KgPslMapped
 
 echo "hgMapToGene:"
-hgMapToGene -all $otherDb -type=psl -verbose=0 temp${db}KgPslMapped $otherGeneTable temp${otherDb}kgTo${db}kg
+hgMapToGene  -geneTableType=genePred  -all $otherDb -type=psl -verbose=0 temp${db}KgPslMapped $otherGeneTable temp${otherDb}kgTo${db}kg
 
 echo "$db.${xxBlastTab}:"
 
 # original for comparison:
 set sql = "select count(distinct query) from ${xxBlastTab}"; echo "old number of unique query values:"; hgsql $db -BN -e "$sql"
 set sql = "select count(distinct target) from ${xxBlastTab}"; echo "old number of unique target values"; hgsql $db -BN -e "$sql"
 
 # drop rows that do not have a match in the psl-mapped otherDb table.
 hgsql $db -BN -e "delete a from ${xxBlastTab} a left join ${otherDb}.temp${otherDb}kgTo${db}kg b on (a.query = b.value and a.target = b.name) where b.value is NULL"
 
 set sql = "select count(distinct query) from ${xxBlastTab}"; echo "new number of unique query values:"; hgsql $db -BN -e "$sql"
 set sql = "select count(distinct target) from ${xxBlastTab}"; echo "new number of unique target values"; hgsql $db -BN -e "$sql"
 
 #cleanup:
 
 hgsql $otherDb -BN -e "drop table temp${db}KgPslMapped"
 hgsql $otherDb -BN -e "drop table temp${otherDb}kgTo${db}kg"
 
 rm $db.kg.psl $db.kg.cds $db.$otherDb.kg.psl
 cd ..
 rmdir synBlastpTemp