4cba91d2140fe4cc6bb2748d9009201fbcae577d braney Tue Dec 14 12:46:17 2021 -0800 update hg19 kgXref, kgAlias, the search index files, and the hgTracks bigBed with newer gene symbols diff --git src/hg/makeDb/doc/ucscGenes/hg19.ucscGenes14.csh src/hg/makeDb/doc/ucscGenes/hg19.ucscGenes14.csh index 810cbb7..9ed0f41 100755 --- src/hg/makeDb/doc/ucscGenes/hg19.ucscGenes14.csh +++ src/hg/makeDb/doc/ucscGenes/hg19.ucscGenes14.csh @@ -1,1614 +1,1649 @@ #!/bin/tcsh -efx # :vim nowrap # for emacs: -*- mode: sh; -*- # This describes how to make the UCSC genes on hg19, though # hopefully by editing the variables that follow immediately # this will work on other databases too. # NOTE: synBlastp has changed its parameters since this release! # # Prerequisites # Before executing this script, rebuild the swissprot ,proteins, and go databases. # SHOULD REBUILD GO DATABASE BEFORE DOING THIS!!! # Directories set genomes = /hive/data/genomes set dir = $genomes/hg19/bed/ucsc.14.3 set scratchDir = /hive/scratch set testingDir = $scratchDir/ucscGenes # Databases set db = hg19 set Db = hg19 set oldDb = hg19 set xdb = mm10 set Xdb = Mm10 set ydb = canFam3 set zdb = rheMac2 set spDb = sp121210 set pbDb = proteins121210 set ratDb = rn4 set RatDb = Rn4 set fishDb = danRer7 set flyDb = dm3 set wormDb = ce6 set yeastDb = sacCer3 # The net alignment for the closely-related species indicated in $xdb set xdbNetDir = $genomes/$db/bed/lastz.${xdb}/axtChain # Blast tables set rnBlastTab = rnBlastTab if ($db =~ hg* ) then set blastTab = hgBlastTab set xBlastTab = mmBlastTab endif if ($db =~ mm* ) then set blastTab = mmBlastTab set xBlastTab = hgBlastTab endif # If rebuilding on an existing assembly make tempDb some bogus name like tmpFoo2, otherwise # make tempDb same as db. 
set tempPrefix = "tmp" set tmpSuffix = "Foo14" set tempDb = ${tempPrefix}${tmpSuffix} set bioCycTempDb = tmpBioCyc${tmpSuffix} # Table for SNPs set snpTable = snp137 # Public version number set lastVer = 6 set curVer = 7 # Database to rebuild visiGene text from. Should include recent mouse and human # but not the one you're rebuilding if you're rebuilding. (Use tempDb instead). set vgTextDbs = (mm8 mm9 hg18 hg19 $tempDb) # Proteins in various species set tempFa = $dir/ucscGenes.faa set xdbFa = $genomes/$xdb/bed/ucsc.13.1/ucscGenes.faa set ratFa = $genomes/$ratDb/bed/blastpRgdGene2/rgdGene2Pep.faa set fishFa = $genomes/$fishDb/bed/blastp/ensembl.faa set flyFa = $genomes/$flyDb/bed/hgNearBlastp/100806/$flyDb.flyBasePep.faa set wormFa = $genomes/$wormDb/bed/blastp/wormPep190.faa set yeastFa = $genomes/$yeastDb/bed/sgdAnnotations/blastTab/sacCer3.sgd.faa # Other files needed set bioCycPathways = /hive/data/outside/bioCyc/120801/1.7/data/pathways.col set bioCycGenes = /hive/data/outside/bioCyc/120801/1.7/data/genes.col set rfam = /hive/data/outside/Rfam/111130 # Tracks set multiz = multiz46way # NCBI Taxon 10090 for mouse, 9606 for human set taxon = 9606 # Previous gene set set oldGeneDir = $genomes/hg19/bed/ucsc.13 set oldGeneBed = $oldGeneDir/ucscGenes.bed # Machines set dbHost = hgwdev set ramFarm = swarm set cpuFarm = swarm # Code base set kent = ~/kent # Create initial dir mkdir -p $dir cd $dir # Get Genbank info txGenbankData $db # creates the files: # mgcStatus.tab mrna.psl refPep.fa refSeq.psl # mrna.fa mrna.ra refSeq.fa refSeq.ra # process RA Files txReadRa mrna.ra refSeq.ra . # creates the files: # cds.tab refSeqStatus.tab mrnaSize.tab refToPep.tab # refPepStatus.tab exceptions.tab accVer.tab # Get some other info from the database. Best to get it about # the same time so it is synced with other data. Takes 4 seconds. 
# Get distinct polyA-tail sizes per mRNA accession; subColumn rewrites the
# accession column through accVer.tab so ids carry versions.  Accessions with
# no polyA info land in sizePolyA.miss.
hgsql -N $db -e 'select distinct name,sizePolyA from mrnaOrientInfo' | \
	subColumn -miss=sizePolyA.miss 1 stdin accVer.tab sizePolyA.tab
# creates the files:
# sizePolyA.miss sizePolyA.tab

# Get CCDS for human (or else just an empty file)
if ( `hgsql -N $db -e "show tables;" | grep -E -c "ccdsGene|chromInfo"` == 2) then
    hgsql -N $db -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ccdsGene" | genePredToBed stdin ccds.bed
    hgsql -N $db \
	-e "select mrnaAcc,ccds from ccdsInfo where srcDb='N'" > refToCcds.tab
else
    echo -n "" > ccds.bed
    echo -n "" > refToCcds.tab
endif

# Get tRNA for human (or else just an empty file)
if ($db =~ hg* && \
    `hgsql -N $db -e "show tables;" | grep -E -c "tRNAs|chromInfo"` == 2) then
    hgsql -N $db -e "select chrom,chromStart,chromEnd,name,score,strand,chromStart,chromEnd,"0","1",chromEnd-chromStart,"0" from tRNAs" > trna.bed
    bedToPsl $genomes/$db/chrom.sizes trna.bed trna.psl
else
    echo -n "" > trna.bed
    echo -n "" > trna.psl
    # BUGFIX: this branch previously also truncated refToCcds.tab, clobbering
    # the CCDS mapping written by the independent CCDS block above (an obvious
    # copy-paste of that block's else-arm).  The CCDS if/else always creates
    # refToCcds.tab itself, so this branch must only create the tRNA files.
endif

# Get the blocks in this genome that are syntenic to the $xdb genome
netFilter -syn $xdbNetDir/${db}.${xdb}.net.gz > ${db}.${xdb}.syn.net
netToBed -maxGap=0 ${db}.${xdb}.syn.net ${db}.${xdb}.syn.bed

# Get the Rfams that overlap with blocks that are syntenic to $Xdb, and filter out
# any duplicate Rfam blocks.  In some cases, where there are two related Rfam models,
# the same genomic region can get two distinct hits.  For our purposes, we only
# want one of those hits, because we're looking for regions that might be genes and
# don't care as much about the subclass of gene.
cat ${rfam}/${db}/Rfam.bed |sort -k1,1 -k2,2n > rfam.sorted.bed
bedRemoveOverlap rfam.sorted.bed rfam.distinctHits.bed
bedIntersect -aHitAny rfam.distinctHits.bed ${db}.${xdb}.syn.bed rfam.syntenic.bed
bedToPsl $genomes/$db/chrom.sizes rfam.syntenic.bed rfam.syntenic.psl

# Create directories full of alignments split by chromosome.
mkdir -p est refSeq mrna pslSplitOnTarget refSeq.psl refSeq pslSplitOnTarget mrna.psl mrna bedSplitOnChrom ccds.bed ccds foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) if (! -e refSeq/$c.psl) then echo creating empty refSeq/$c.psl echo -n "" >refSeq/$c.psl endif if (! -e mrna/$c.psl) then echo creating empty mrna/$c.psl echo -n "" >mrna/$c.psl endif hgGetAnn -noMatchOk $db intronEst $c est/$c.psl if (! -e est/$c.psl) then echo creating empty est/$c.psl echo -n "" >est/$c.psl endif if (! -e ccds/$c.bed) then echo creating empty ccds/$c.bed echo -n "" >ccds/$c.bed endif end # Get list of accessions that are associated with antibodies from database. # This will be a good list but not 100% complete. Cluster these to get # four or five antibody heavy regions. Later we'll weed out input that # falls primarily in these regions, and, include the regions themselves # as special genes. Takes 40 seconds txAbFragFind $db antibodyAccs # Found 28999 descriptions that match pslCat mrna/*.psl -nohead | fgrep -w -f antibodyAccs > antibody.psl clusterPsl -prefix=ab.ab.antibody. antibody.psl antibody.cluster if ($db =~ mm*) then echo chr12 > abChrom.lst echo chr16 >> abChrom.lst echo chr6 >> abChrom.lst fgrep -w -f abChrom.lst antibody.cluster | cut -f 1-12 | bedOrBlocks stdin antibody.bed else awk '$13 > 100' antibody.cluster | cut -f 1-12 > antibody.bed endif # Convert psls to bed, saving mapping info and weeding antibodies. 
Takes 2.5 min foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) if (-s refSeq/$c.psl) then txPslToBed refSeq/$c.psl -noFixStrand -cds=cds.tab $genomes/$db/$db.2bit refSeq/$c.bed -unusual=refSeq/$c.unusual else echo "creating empty refSeq/$c.bed" touch refSeq/$c.bed endif if (-s mrna/$c.psl) then txPslToBed mrna/$c.psl -cds=cds.tab $genomes/$db/$db.2bit \ stdout -unusual=mrna/$c.unusual \ | bedWeedOverlapping antibody.bed maxOverlap=0.5 stdin mrna/$c.bed else echo "creating empty mrna/$c.bed" touch mrna/$c.bed endif if (-s est/$c.psl) then txPslToBed est/$c.psl $genomes/$db/$db.2bit stdout \ | bedWeedOverlapping antibody.bed maxOverlap=0.3 stdin est/$c.bed else echo "creating empty est/$c.bed" touch est/$c.bed endif end # Create mrna splicing graphs. Takes 10 seconds. mkdir -p bedToGraph foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) txBedToGraph -prefix=$c. ccds/$c.bed ccds refSeq/$c.bed refSeq mrna/$c.bed mrna \ bedToGraph/$c.txg end # Create est splicing graphs. Takes 6 minutes. foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) txBedToGraph -prefix=e$c. est/$c.bed est est/$c.txg end # Create an evidence weight file cat > trim.weights <<end refSeq 100 ccds 50 mrna 2 txOrtho 1 exoniphy 1 est 1 end # Make evidence file for EST graph edges supported by at least 2 # ests. Takes about 30 seconds. foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) txgGoodEdges est/$c.txg trim.weights 2 est est/$c.edges end # Setup other species dir mkdir -p $dir/$xdb cd $dir/$xdb # Get other species mrna including ESTs. 
Takes about three minutes mkdir -p refSeq mrna est foreach c (`awk '{print $1;}' $genomes/$xdb/chrom.sizes`) echo $c hgGetAnn -noMatchOk $xdb refSeqAli $c stdout | txPslToBed stdin $genomes/$xdb/$xdb.2bit refSeq/$c.bed hgGetAnn -noMatchOk $xdb mrna $c stdout | txPslToBed stdin $genomes/$xdb/$xdb.2bit mrna/$c.bed hgGetAnn -noMatchOk $xdb intronEst $c stdout | txPslToBed stdin $genomes/$xdb/$xdb.2bit est/$c.bed end # Create other species splicing graphs. Takes about five minutes. cd $dir/$xdb rm -f other.txg foreach c (`awk '{print $1;}' $genomes/$xdb/chrom.sizes`) echo $c txBedToGraph refSeq/$c.bed refSeq mrna/$c.bed mrna est/$c.bed est stdout >> other.txg end # Clean up all but final other.txg rm -r est mrna refSeq # Unpack chains and nets, apply synteny filter and split by chromosome # Takes 5 minutes. Make up phony empty nets for ones that are empty after # synteny filter. cd $dir/$xdb chainSplit chains $genomes/$db/bed/lastz.$xdb/axtChain/$db.$xdb.all.chain.gz netFilter -syn $genomes/$db/bed/lastz.$xdb/axtChain/$db.$xdb.net.gz \ | netSplit stdin nets cd nets foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) if (! -e $c.net) then echo -n > $c.net endif end cd ../chains foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) if (! 
-e $c.chain) then echo -n > $c.chain endif end # Make txOrtho directory and a para spec file cd $dir mkdir -p txOrtho/edges cd txOrtho echo '#LOOP' > template echo './runTxOrtho $(path1) '"$xdb $db" >> template echo '#ENDLOOP' >> template cat << '_EOF_' > runTxOrtho #!/bin/csh -ef set inAgx = ../bedToGraph/$1.txg set inChain = ../$2/chains/$1.chain set inNet = ../$2/nets/$1.net set otherTxg = ../$2/other.txg set tmpDir = /scratch/tmp/$3 set workDir = $tmpDir/${1}_${2} mkdir -p $tmpDir mkdir -p $workDir txOrtho $inAgx $inChain $inNet $otherTxg $workDir/$1.edges mv $workDir/$1.edges edges/$1.edges rmdir $workDir rmdir --ignore-fail-on-non-empty $tmpDir '_EOF_' # << happy emacs chmod +x runTxOrtho touch toDoList cd $dir/bedToGraph foreach f (*.txg) set c=$f:r if ( -s $f ) then echo $c >> ../txOrtho/toDoList else echo "warning creating empty $c.edges result" touch ../txOrtho/edges/$c.edges endif end cd .. # Do txOrtho parasol run on high RAM cluster ssh $ramFarm "cd $dir/txOrtho; gensub2 toDoList single template jobList" ssh $ramFarm "cd $dir/txOrtho; para make jobList" ssh $ramFarm "cd $dir/txOrtho; para time > run.time" cat txOrtho/run.time #Completed: 145 of 145 jobs #CPU time in finished jobs: 4142s 69.03m 1.15h 0.05d 0.000 y #IO & Wait Time: 2381s 39.69m 0.66h 0.03d 0.000 y #Average job time: 45s 0.75m 0.01h 0.00d #Longest finished job: 196s 3.27m 0.05h 0.00d #Submission to last job: 205s 3.42m 0.06h 0.00d # Filter out some duplicate edges. These are legitimate from txOrtho's point # of view, since they represent two different mouse edges both supporting # a human edge. However, from the human point of view we want only one # support from mouse orthology. Just takes a second. # TODO: assuming true is mouse is target. cd $dir/txOrtho mkdir -p uniqEdges foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) cut -f 1-9 edges/$c.edges | sort | uniq > uniqEdges/$c.edges end cd .. 
# # testing suggestion: uncomment below # mkdir -p $testingDir # compareModifiedFileSizes.csh $oldGeneDir . # cut -f-3,5,6,8 txOrtho/uniqEdges/chr22.edges >$testingDir/chr22.subset.edges.new # cut -f-3,5,6,8 $oldGeneDir/txOrtho/uniqEdges/chr22.edges \ # >$testingDir/chr22.subset.edges.old # checkRandomLinesExist.py -s $testingDir/chr22.subset.edges.old \ # -d $testingDir/chr22.subset.edges.new # Clean up chains and nets since they are big cd $dir rm -r $xdb/chains $xdb/nets # Get exonophy. Takes about 4 seconds. hgsql -N $db -e "select chrom, txStart, txEnd, name, "1", strand from exoniphy order by chrom, txStart;" \ > exoniphy.bed bedToTxEdges exoniphy.bed exoniphy.edges # Add evidence from ests, orthologous other species transcripts, and exoniphy # Takes 36 seconds. mkdir -p graphWithEvidence foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) echo adding evidence for $c if ( -s bedToGraph/$c.txg ) then txgAddEvidence -chrom=$c bedToGraph/$c.txg exoniphy.edges exoniphy stdout \ | txgAddEvidence stdin txOrtho/uniqEdges/$c.edges txOrtho stdout \ | txgAddEvidence stdin est/$c.edges est graphWithEvidence/$c.txg else touch graphWithEvidence/$c.txg endif end # # special testing suggestion: uncomment below # compareModifiedFileSizes.csh $oldGeneDir . # cut -f1-3,5 graphWithEvidence/chr22.txg > $testingDir/chr22.graph.bounds.new # cut -f1-3,5 $oldGeneDir/graphWithEvidence/chr22.txg > $testingDir/chr22.graph.bounds.old # checkRandomLinesExist.py -s $testingDir/chr22.graph.bounds.old \ # -d $testingDir/chr22.graph.bounds.new # # Do txWalk - takes 32 seconds (mostly loading the mrnaSize.tab again and # again...) mkdir -p txWalk if ($db =~ mm*) then set sef = 0.6 else set sef = 0.75 endif foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) txWalk graphWithEvidence/$c.txg trim.weights 3 txWalk/$c.bed \ -evidence=txWalk/$c.ev -sizes=mrnaSize.tab -defrag=0.25 \ -singleExonFactor=$sef end # Make a file that lists the various categories of alt-splicing we see. 
# Do this by making and analysing splicing graphs of just the transcripts
# that have passed our process so far.  The txgAnalyze program occasionally
# will make a duplicate, which is the reason for the sort/uniq run.
# Takes 7 seconds.
cat txWalk/*.bed > txWalk.bed
txBedToGraph txWalk.bed txWalk txWalk.txg
txgAnalyze txWalk.txg $genomes/$db/$db.2bit stdout | sort | uniq > altSplice.bed

# Get txWalk transcript sequences.  This takes about ten minutes
time twoBitToFa $genomes/$db/$db.2bit -bed=txWalk.bed txWalk.fa
# 726.490u 508.167s 21:12.30 97.0% 0+0k 0+0io 0pf+0w
# Split the transcript fasta into ~200 chunks for the cluster blat runs below.
rm -rf txFaSplit
mkdir -p txFaSplit
faSplit sequence txWalk.fa 200 txFaSplit/

#
# A good time for testing.  Uncomment the lines (not all at once) to
# compare the line count for files just built in the current version
# and the previous version
#
# compareModifiedFileSizes.csh $oldGeneDir .
#
# Check that most of the old alt events are still there
# checkRandomLinesExist.py -d $oldGeneDir/altSplice.bed -s ./altSplice.bed
#
# check that most of the old txWalk bed entries overlap some new entry
# bedIntersect -aHitAny txWalk.bed $oldGeneDir/txWalk.bed \
#	$testingDir/txWalk.intersect.bed
# wc $testingDir/txWalk.intersect.bed
#

# Fetch human protein set and table that describes if curated or not.
# Takes about a minute hgsql -N $spDb -e \ "select p.acc, p.val from protein p, accToTaxon x where x.taxon=$taxon and p.acc=x.acc" \ | awk '{print ">" $1;print $2}' >uniProt.fa faSize uniProt.fa # 54766916 bases (43127082 N's 11639834 real 11639834 upper 0 lower) in 148175 sequences in 1 files hgsql -N $spDb -e "select i.acc,i.isCurated from info i,accToTaxon x where x.taxon=$taxon and i.acc=x.acc" > uniCurated.tab mkdir -p blat/rna/raw echo '#LOOP' > blat/rna/template echo './runTxBlats $(path1) '"$db"' $(root1) {check out line+ raw/ref_$(root1).psl}' >> blat/rna/template echo '#ENDLOOP' >> blat/rna/template cat << '_EOF_' > blat/rna/runTxBlats #!/bin/csh -ef set ooc = /scratch/data/$2/11.ooc set target = ../../txFaSplit/$1 set out1 = raw/mrna_$3.psl set out2 = raw/ref_$3.psl set tmpDir = /scratch/tmp/$2/ucscGenes/rna set workDir = $tmpDir/$3 mkdir -p $tmpDir rm -rf $workDir mkdir $workDir blat -ooc=$ooc -minIdentity=95 $target ../../mrna.fa $workDir/mrna_$3.psl blat -ooc=$ooc -minIdentity=97 $target ../../refSeq.fa $workDir/ref_$3.psl mv $workDir/mrna_$3.psl raw/mrna_$3.psl mv $workDir/ref_$3.psl raw/ref_$3.psl rmdir $workDir rmdir --ignore-fail-on-non-empty $tmpDir '_EOF_' # << happy emacs chmod +x blat/rna/runTxBlats cd txFaSplit ls -1 *.fa > ../blat/rna/toDoList cd .. ssh $cpuFarm "cd $dir/blat/rna; gensub2 toDoList single template jobList" ssh $cpuFarm "cd $dir/blat/rna; para make jobList" ssh $cpuFarm "cd $dir/blat/rna; para time > run.time" cat blat/rna/run.time #Completed: 197 of 197 jobs #CPU time in finished jobs: 59847s 997.45m 16.62h 0.69d 0.002 y #IO & Wait Time: 5535s 92.25m 1.54h 0.06d 0.000 y #Average job time: 332s 5.53m 0.09h 0.00d #Longest finished job: 3142s 52.37m 0.87h 0.04d #Submission to last job: 3152s 52.53m 0.88h 0.04d # Set up blat jobs for proteins vs. 
translated txWalk transcripts cd $dir mkdir -p blat/protein/raw echo '#LOOP' > blat/protein/template echo './runTxBlats $(path1) '"$db"' $(root1) {check out line+ raw/ref_$(root1).psl}' >> blat/protein/template echo '#ENDLOOP' >> blat/protein/template cat << '_EOF_' > blat/protein/runTxBlats #!/bin/csh -ef set ooc = /scratch/data/$2/11.ooc set target = ../../txFaSplit/$1 set out1 = uni_$3.psl set out2 = ref_$3.psl set tmpDir = /scratch/tmp/$2/ucscGenes/prot set workDir = $tmpDir/$3 mkdir -p $tmpDir rm -rf $workDir mkdir $workDir blat -t=dnax -q=prot -minIdentity=90 $target ../../uniProt.fa $workDir/$out1 blat -t=dnax -q=prot -minIdentity=90 $target ../../refPep.fa $workDir/$out2 mv $workDir/$out1 raw/$out1 mv $workDir/$out2 raw/$out2 rmdir $workDir rmdir --ignore-fail-on-non-empty $tmpDir '_EOF_' # << happy emacs chmod +x blat/protein/runTxBlats cd txFaSplit ls -1 *.fa > ../blat/protein/toDoList cd .. # Run protein/transcript blat job on cluster ssh $cpuFarm "cd $dir/blat/protein; gensub2 toDoList single template jobList" ssh $cpuFarm "cd $dir/blat/protein; para make jobList" ssh $cpuFarm "cd $dir/blat/protein; para time > run.time" cat blat/protein/run.time # Completed: 197 of 197 jobs # CPU time in finished jobs: 38292s 638.21m 10.64h 0.44d 0.001 y # IO & Wait Time: 3177s 52.94m 0.88h 0.04d 0.000 y # Average job time: 211s 3.51m 0.06h 0.00d # Longest finished job: 701s 11.68m 0.19h 0.01d # Submission to last job: 795s 13.25m 0.22h 0.01d # Sort and select best alignments. Remove raw files for space. Takes a couple # of hours. Use pslReps not pslCdnaFilter because need -noIntrons flag, # and also working on protein as well as rna alignments. The thresholds # for the proteins in particular are quite loose, which is ok because # they will be weighted against each other. We lose some of the refSeq # mappings at tighter thresholds. 
cd $dir/blat pslCat -nohead rna/raw/ref*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -minAli=0.90 -nearTop=0.005 stdin rna/refSeq.psl /dev/null pslCat -nohead rna/raw/mrna*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -minAli=0.90 -nearTop=0.005 stdin rna/mrna.psl /dev/null pslCat -nohead protein/raw/ref*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -nearTop=0.02 -ignoreSize -minAli=0.85 stdin protein/refSeq.psl /dev/null pslCat -nohead protein/raw/uni*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -nearTop=0.02 -minAli=0.85 stdin protein/uniProt.psl /dev/null rm -r protein/raw cd $dir # Get parts of multiple alignments corresponding to transcripts. # Takes a couple of hours. Could do this is a small cluster job # to speed it up. echo $db $xdb $ydb $zdb > ourOrgs.txt foreach c (`cut -f1 $genomes/$db/chrom.sizes`) echo "doing chrom $c" if (-s txWalk/$c.bed ) then mafFrags $db $multiz txWalk/$c.bed stdout -bed12 -meFirst \ | mafSpeciesSubset stdin ourOrgs.txt txWalk/$c.maf -keepFirst endif end cd $dir # Create and populate directory with various CDS evidence mkdir -p cdsEvidence cat txWalk/*.maf | txCdsPredict txWalk.fa -nmd=txWalk.bed -maf=stdin cdsEvidence/txCdsPredict.tce txCdsEvFromRna refSeq.fa cds.tab blat/rna/refSeq.psl txWalk.fa \ cdsEvidence/refSeqTx.tce -refStatus=refSeqStatus.tab \ -unmapped=cdsEvidence/refSeqTx.unmapped -exceptions=exceptions.tab txCdsEvFromRna mrna.fa cds.tab blat/rna/mrna.psl txWalk.fa \ cdsEvidence/mrnaTx.tce -mgcStatus=mgcStatus.tab \ -unmapped=cdsEvidence/mrna.unmapped txCdsEvFromProtein refPep.fa blat/protein/refSeq.psl txWalk.fa \ cdsEvidence/refSeqProt.tce -refStatus=refPepStatus.tab \ -unmapped=cdsEvidence/refSeqProt.unmapped \ -exceptions=exceptions.tab -refToPep=refToPep.tab \ -dodgeStop=3 -minCoverage=0.3 txCdsEvFromProtein uniProt.fa blat/protein/uniProt.psl txWalk.fa \ cdsEvidence/uniProt.tce -uniStatus=uniCurated.tab \ -unmapped=cdsEvidence/uniProt.unmapped -source=trembl txCdsEvFromBed ccds.bed ccds txWalk.bed 
../../$db.2bit cdsEvidence/ccds.tce cat cdsEvidence/*.tce | sort > unweighted.tce # Merge back in antibodies, and add the small, noncoding genes that shouldn't go # through txWalk because their gene boundaries should not change much. Before # adding them, weed out anything that overlaps a txWalk transcript to avoid # getting duplicate transcripts. bedWeedOverlapping txWalk.bed rfam.syntenic.bed rfam.weeded.bed bedWeedOverlapping txWalk.bed trna.bed trna.weeded.bed cat txWalk.bed antibody.bed trna.weeded.bed rfam.weeded.bed > abWalk.bed sequenceForBed -db=$db -bedIn=antibody.bed -fastaOut=stdout -upCase -keepName > antibody.fa sequenceForBed -db=$db -bedIn=trna.weeded.bed -fastaOut=stdout -upCase -keepName \ > trna.weeded.fa sequenceForBed -db=$db -bedIn=rfam.weeded.bed -fastaOut=stdout -upCase -keepName \ > rfam.weeded.fa cat txWalk.fa antibody.fa trna.weeded.fa rfam.weeded.fa > abWalk.fa # Pick ORFs, make genes cat refToPep.tab refToCcds.tab | \ txCdsPick abWalk.bed unweighted.tce stdin pick.tce pick.picks \ -exceptionsIn=exceptions.tab \ -exceptionsOut=abWalk.exceptions txCdsToGene abWalk.bed abWalk.fa pick.tce pick.gtf pick.fa \ -bedOut=pick.bed -exceptions=abWalk.exceptions #size of block (7379) #0 not multiple of 3 in chr16.2590.1.NM_052892.3 (source ccds CCDS59275.1) #size of block (659) #0 not multiple of 3 in chr2.304.1.NM_001145051.2 (source ccds CCDS56117.1) #size of block (1319) #0 not multiple of 3 in chr7.1104.1.NM_133457.2 (source ccds CCDS59504.1) # Create gene info table. Takes 8 seconds cat mrna/*.unusual refSeq/*.unusual | awk '$5=="flip" {print $6;}' > all.flip cat mrna/*.psl refSeq/*.psl trna.psl rfam.syntenic.psl \ | txInfoAssemble pick.bed pick.tce cdsEvidence/txCdsPredict.tce \ altSplice.bed abWalk.exceptions sizePolyA.tab stdin all.flip prelim.info # Cluster purely based on CDS (in same frame). Takes 1 second txCdsCluster pick.bed pick.cluster # Flag suspicious CDS regions, and add this to info file. Weed out bad CDS. # Map CDS to gene set. 
Takes 10 seconds. Might want to reconsider using # txCdsWeed here. txCdsSuspect pick.bed txWalk.txg pick.cluster prelim.info pick.suspect pick.info txCdsWeed pick.tce pick.info weededCds.tce weededCds.info # Weeded 5048 of 68742 cds txCdsToGene abWalk.bed abWalk.fa weededCds.tce weededCds.gtf weededCds.faa \ -bedOut=weededCds.bed -exceptions=abWalk.exceptions \ -tweaked=weededCds.tweaked # size of block (7379) #0 not multiple of 3 in chr16.2590.1.NM_052892.3 (source ccds CCDS59275.1) # size of block (659) #0 not multiple of 3 in chr2.304.1.NM_001145051.2 (source ccds CCDS56117.1) # size of block (1319) #0 not multiple of 3 in chr7.1104.1.NM_133457.2 (source ccds CCDS59504.1) # After txCdsToGene, the transcripts in weededCds.bed might be # slightly different from those in abWalk.bed. Make a new sequence # file, weeded.fa, to replace abWalk.fa. sequenceForBed -db=$db -bedIn=weededCds.bed -fastaOut=weeded.fa \ -upCase -keepName # Separate out transcripts into coding and 4 uncoding categories. # Generate new gene set that weeds out the junkiest. Takes 9 seconds. txGeneSeparateNoncoding weededCds.bed weededCds.info \ coding.bed nearCoding.bed nearCodingJunk.bed antisense.bed uncoding.bed separated.info # coding 50303, codingJunk 13388, nearCoding 7350, junk 6052, antisense 1217, noncoding 10702 awk '$2 != "nearCodingJunk"' separated.info > weeded.info awk '$2 == "nearCodingJunk" {print $1}' separated.info > weeds.lst cat coding.bed nearCoding.bed antisense.bed uncoding.bed | sort -k1,1 -k2,3n >weeded.bed # Make up a little alignment file for the ones that got tweaked. 
# Build lists of the tweaked RefSeq transcripts (NM_ accessions) and their
# proteins (NP_, via refToPep.tab), then re-blat proteins against the tweaked
# transcript sequences.  NOTE: 'sed -r' is a GNU extension (portable form: -E).
sed -r 's/.*NM_//' weededCds.tweaked | awk '{printf("NM_%s\n", $1);}' > tweakedNm.lst
fgrep -f tweakedNm.lst refToPep.tab | cut -f 2 > tweakedNp.lst
fgrep -f weededCds.tweaked weededCds.bed > tweakedNm.bed
sequenceForBed -db=$db -bedIn=tweakedNm.bed -fastaOut=tweakedNm.fa -upCase -keepName
faSomeRecords refPep.fa tweakedNp.lst tweakedNp.fa
blat -q=prot -t=dnax -noHead tweakedNm.fa tweakedNp.fa stdout | sort -k 10 > tweaked.psl

# Make an alignment file for refSeqs that swaps in the tweaked ones
weedLines weededCds.tweaked blat/protein/refSeq.psl refTweaked.psl
cat tweaked.psl >> refTweaked.psl

# Make precursor to kgProtMap table.  This is a psl file that maps just the CDS of the gene
# to the genome.
txGeneCdsMap weeded.bed weeded.info pick.picks refTweaked.psl \
	refToPep.tab $genomes/$db/chrom.sizes cdsToRna.psl \
	rnaToGenome.psl
# Missed 55 of 35808 refSeq protein mappings. A small number of RefSeqs just map to genome in the UTR.
pslMap cdsToRna.psl rnaToGenome.psl cdsToGenome.psl

# this section is for mapping between assemblies
# map mm9 knownGene to mm10
#cd $dir
#set lift = "/gbdb/$oldDb/liftOver/${oldDb}To${Db}.over.chain.gz"
#genePredToFakePsl $oldDb knownGene $oldDb.kg.psl $oldDb.kg.cds

# only keep those id's that uniquely map to mm10
#zcat $lift | pslMap -chainMapFile -swapMap $oldDb.kg.psl stdin stdout | pslCDnaFilter -uniqueMapped stdin stdout | sort -k 14,14 -k 16,16n | pslToBed -cds=$oldDb.kg.cds stdin $oldDb.$db.kg.bed
# drop nonUnique: 112 285
#TODO: figure out better way to deal with txLastId
#set oldGeneBed=$oldDb.$db.kg.bed

# Trial run of accession assignment to inspect old->new mapping categories.
# NOTE(review): 'foo' presumably backs up the txLastId counter before
# txGeneAccession advances it — confirm before rerunning.
cp ~kent/src/hg/txGene/txGeneAccession/txLastId foo
txGeneAccession $oldGeneBed ~kent/src/hg/txGene/txGeneAccession/txLastId \
	weeded.bed txToAcc.tab oldToNew.tab
tawk '{print $4}' oldToNew.tab | sort | uniq -c
#   5556 compatible
#  74657 exact
#    709 lost
#   2747 new

# check to make sure we don't have any dups.  These two numbers should
# be the same.
# Count unique vs. total unversioned accessions; equal counts mean no dups.
awk '{print $2}' txToAcc.tab | sed 's/\..*//' | sort -u | wc -l
# 82959
awk '{print $2}' txToAcc.tab | sed 's/\..*//' | sort | wc -l
# 82960
# NOTE(review): the two recorded counts above differ by one, so this run DID
# have one duplicated accession despite the "should be the same" check —
# investigate before relying on txToAcc.tab.

# Break down the "lost" transcripts from the old set into coding/non-coding
# (genePred columns 6/7 are cdsStart/cdsEnd; equal means non-coding).
echo "select * from knownGene" | hgsql $db | sort > $db.knownGene.gp
grep lost oldToNew.tab | awk '{print $2}' | sort > lost.txt
join lost.txt $db.knownGene.gp > $db.lost.gp
awk '{if ($7 == $6) print}' $db.lost.gp | wc -l
# non-coding 241
awk '{if ($7 != $6) print}' $db.lost.gp | wc -l
# coding 468

# Assign permanent accessions to each transcript, and make up a number
# of our files with this accession in place of the temporary IDs we've been
# using.  Takes 4 seconds
cd $dir
txGeneAccession $oldGeneBed ~kent/src/hg/txGene/txGeneAccession/txLastId \
	weeded.bed txToAcc.tab oldToNew.tab
subColumn 4 weeded.bed txToAcc.tab ucscGenes.bed
subColumn 1 weeded.info txToAcc.tab ucscGenes.info
weedLines weeds.lst pick.picks stdout | subColumn 1 stdin txToAcc.tab ucscBadUniprot.picks
subColumn 4 coding.bed txToAcc.tab ucscCoding.bed
subColumn 4 nearCoding.bed txToAcc.tab ucscNearCoding.bed
subColumn 4 antisense.bed txToAcc.tab ucscAntisense.bed
subColumn 4 uncoding.bed txToAcc.tab ucscUncoding.bed
subColumn 10 cdsToGenome.psl txToAcc.tab ucscProtMap.psl
cat txWalk/*.ev | weedLines weeds.lst stdin stdout | subColumn 1 stdin txToAcc.tab ucscGenes.ev

# Make files with (a) representative protein and mrna accessions, and (b) the protein
# and mrna sequences representing direct transcription/translation of the bed blocks.
# The representative proteins and mrnas will be taken from RefSeq for the RefSeq ones,
# and derived from our transcripts for the rest.
# Load these sequences into database.  Takes 17 seconds.
txGeneProtAndRna weeded.bed weeded.info weeded.fa weededCds.faa \
	refSeq.fa refToPep.tab refPep.fa txToAcc.tab ucscGenes.fa ucscGenes.faa \
	ucscGenesTx.fa ucscGenesTx.faa

# Generate ucscGene/uniprot blat run.
mkdir -p $dir/blat/uniprotVsUcsc cd $dir/blat/uniprotVsUcsc mkdir -p raw rm -rf ucscFaaSplit mkdir ucscFaaSplit faSplit sequence ../../ucscGenes.faa 100 ucscFaaSplit/uc ls -1 ucscFaaSplit/uc*.fa > toDoList echo '#LOOP' > template echo './runBlats $(path1) '"$db"' $(root1) {check out line+ raw/uni_$(root1).psl}' >> template echo '#ENDLOOP' >> template cat << '_EOF_' > runBlats #!/bin/csh -ef set out1 = uni_$3.psl set tmpDir = /scratch/tmp/$2/ucscGenes/uniprotVsUcsc set workDir = $tmpDir/$3 mkdir -p $tmpDir mkdir $workDir blat -prot -minIdentity=90 $1 ../../uniProt.fa $workDir/$out1 mv $workDir/$out1 raw/$out1 rmdir $workDir rmdir --ignore-fail-on-non-empty $tmpDir '_EOF_' # << happy emacs chmod +x runBlats gensub2 toDoList single template jobList # Run protein/transcript blat job on cluster ssh $cpuFarm "cd $dir/blat/uniprotVsUcsc; para make jobList" ssh $cpuFarm "cd $dir/blat/uniprotVsUcsc; para time > run.time" cat run.time # Completed: 97 of 97 jobs # CPU time in finished jobs: 4379s 72.98m 1.22h 0.05d 0.000 y # IO & Wait Time: 473s 7.89m 0.13h 0.01d 0.000 y # Average job time: 50s 0.83m 0.01h 0.00d # Longest finished job: 503s 8.38m 0.14h 0.01d # Submission to last job: 514s 8.57m 0.14h 0.01d pslCat raw/*.psl > ../../ucscVsUniprot.psl rm -r raw # Fixup UniProt links in picks file. This is a little circuitious. In the future may # avoid using the picks file for the protein link, and rely on protein/protein blat # to do the assignment. The current use of the uniProt protein/ucscGenes mrna alignments # and the whole trip through evidence and picks files is largely historical from when # there was a different CDS selection process. cd $dir txCdsRedoUniprotPicks ucscBadUniprot.picks ucscVsUniprot.psl uniCurated.tab ucscGenes.picks # Cluster the coding and the uncoding sets, and make up canonical and # isoforms tables. Takes 3 seconds. 
# Cluster coding transcripts by CDS, and the noncoding/antisense sets by
# splicing graph, then derive canonical/isoform assignments.
txCdsCluster ucscCoding.bed coding.cluster
txBedToGraph ucscUncoding.bed uncoding uncoding.txg -prefix=non
txBedToGraph ucscAntisense.bed antisense antisense.txg -prefix=anti
cat uncoding.txg antisense.txg > senseAnti.txg
txGeneCanonical coding.cluster ucscGenes.info senseAnti.txg ucscGenes.bed ucscNearCoding.bed \
	canonical.tab isoforms.tab txCluster.tab

# Make up final splicing graph just containing our genes, and final alt splice
# table.
txBedToGraph ucscGenes.bed ucscGenes ucscGenes.txg
txgAnalyze ucscGenes.txg $genomes/$db/$db.2bit stdout | sort | uniq > ucscSplice.bed

#####################################################################################
# Now the gene set is built.  Time to start loading it into the database,
# and generating all the many tables that go on top of known Genes.
# We do this initially in a temporary database.

# Protect databases from overwriting
# The grep below matches both $tempDb and the always-present 'mysql' database,
# so a count of 2 means $tempDb already exists.
if ( $tempDb =~ "${tempPrefix}*" ) then
    if ( 2 == `hgsql -N -e "show databases;" mysql | grep -E -c -e "$tempDb|"'^mysql$'` ) then
	echo "tempDb: '$tempDb' already exists, it should not"
	exit 255
    else
	echo "creating tempDb: '$tempDb'"
	# Create temporary database with a few small tables from main database
	hgsqladmin create $tempDb
	hgsqldump $db chromInfo | hgsql $tempDb
	hgsqldump $db trackDb_$user | hgsql $tempDb
    endif
else
    # NOTE(review): this whole branch ends in an unconditional 'exit 255', so a
    # tempDb whose name does not start with $tempPrefix is ALWAYS rejected —
    # the chromInfo/trackDb existence check just above that exit can never let
    # execution continue and looks like dead/leftover code.  Confirm whether a
    # pre-populated non-tmp database was ever meant to be allowed through.
    echo "tempDb does not match $tempPrefix"
    hgsql -N $tempDb -e "show tables;" | grep -E -c "chromInfo|trackDb_$user"
    if (2 != `hgsql -N $tempDb -e "show tables;" | grep -E -c "chromInfo|trackDb_$user"` ) then
	echo "do not find tables chromInfo and trackDb_$user in database '$tempDb'"
	exit 255
    endif
    echo "tempDb setting: '$tempDb' should not be an existing db"
    exit 255
endif

# Make up knownGenes table, adding uniProt ID.  Load into database.
# Takes 3 seconds.
txGeneFromBed ucscGenes.bed ucscGenes.picks ucscGenes.faa uniProt.fa refPep.fa ucscGenes.gp hgLoadSqlTab $tempDb knownGene $kent/src/hg/lib/knownGene.sql ucscGenes.gp hgLoadBed $tempDb knownAlt ucscSplice.bed # Load in isoforms, canonical, and gene sequence tables hgLoadSqlTab $tempDb knownIsoforms $kent/src/hg/lib/knownIsoforms.sql isoforms.tab hgLoadSqlTab $tempDb knownCanonical $kent/src/hg/lib/knownCanonical.sql canonical.tab hgPepPred $tempDb generic knownGenePep ucscGenes.faa hgPepPred $tempDb generic knownGeneMrna ucscGenes.fa hgPepPred $tempDb generic knownGeneTxPep ucscGenesTx.faa hgPepPred $tempDb generic knownGeneTxMrna ucscGenesTx.fa # Create a bunch of knownToXxx tables according to alignment overlap. # Takes about 3 minutes: cd $dir bedIntersect -minCoverage=1 ucscGenes.bed antibody.bed abGenes.bed cat abGenes.bed |awk '{ print $4}' > abGenes.txt hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db ensGene knownGene knownToEnsembl hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db refGene knownGene knownToRefSeq hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" $db > refToLl.txt hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db refGene knownGene knownToLocusLink -lookup=refToLl.txt # Make up kgXref table. Takes about 3 minutes. time txGeneXref $db $tempDb $spDb ucscGenes.gp ucscGenes.info ucscGenes.picks ucscGenes.ev ucscGenes.xref # 5.078u 4.871s 1:45.41 9.4% 0+0k 0+0io 0pf+0w hgLoadSqlTab $tempDb kgXref $kent/src/hg/lib/kgXref.sql ucscGenes.xref # Update knownToRefSeq to make it consistent with ucscGenes.xref. Prior to # this update, knownToRefSeq contains links to the RefSeq transcript that it # most overlaps. This is a preliminary mapping. txGeneXref generates # more reliable RefSeq associations, mostly from the CDS picks hgsql $tempDb -e "update knownToRefSeq kr, kgXref kx set kr.value = kx.refseq where kr.name = kx.kgID and length(kx.refseq) > 0" # add NR_... RefSeq IDs into kgXref table. 
# Non-coding RefSeqs (NR_ accessions) have no refseq entry yet; copy them in
# from the mRNA column.
hgsql $tempDb \
    -e 'update kgXref set refseq=mRNA where mRNA like "NR_%" and refseq=""'

# Make up and load kgColor table. Takes about a minute.
txGeneColor $spDb ucscGenes.info ucscGenes.picks ucscGenes.color
hgLoadSqlTab $tempDb kgColor $kent/src/hg/lib/kgColor.sql ucscGenes.color

# Load up kgTxInfo table. Takes 0.3 second
hgLoadSqlTab $tempDb kgTxInfo $kent/src/hg/lib/txInfo.sql ucscGenes.info

# Make up alias tables and load them. Takes a minute or so.
txGeneAlias $db $spDb ucscGenes.xref ucscGenes.ev oldToNew.tab foo.alias foo.protAlias
sort foo.alias | uniq > ucscGenes.alias
sort foo.protAlias | uniq > ucscGenes.protAlias
rm foo.alias foo.protAlias
hgLoadSqlTab $tempDb kgAlias $kent/src/hg/lib/kgAlias.sql ucscGenes.alias
hgLoadSqlTab $tempDb kgProtAlias $kent/src/hg/lib/kgProtAlias.sql ucscGenes.protAlias

# Load up kgProtMap2 table that says where exons are in terms of CDS
hgLoadPsl $tempDb ucscProtMap.psl -table=kgProtMap2

# Create a bunch of knownToXxx tables. Takes about 3 minutes:
hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db allenBrainAli -type=psl knownGene knownToAllenBrain
hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'

# Create knownToTreefam table.
mkdir -p $dir/treeFam
cd $dir/treeFam
wget "http://www.treefam.org/static/download/treefam_family_data.tar.gz"
tar xfz treefam_family_data.tar.gz
cd treefam_family_data
# Scrape Ensembl protein IDs out of the per-family fasta files, pairing each
# ENSP accession with the family file it came from.
# NOTE(review): tawk is the kent-tools tab-delimited awk wrapper — confirm it
# is on PATH before re-running.
egrep -a ">ENSP[0-9]+" * | cut -d/ -f2 | tr -d : | sed -e 's/.aa.fasta//' |sed -e 's/.cds.fasta//' | grep -v accession | grep -v ^\> | tr '>' '\t' | tawk '{print $2,$1}' > ../ensToTreefam.tab
cd ..
# Join UCSC-to-Ensembl-transcript with Ensembl-transcript-to-protein to get a
# UCSC-id to TreeFam-family mapping.
# hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db ensGene knownGene knownToEnsembl -noLoad
#awk '{print $2,$1}' ../knownToEnsembl.tab | sort | uniq > ensTransUcsc.tab
hgsql $db -e "select value,name from knownToEnsembl" | sort | uniq > ensTransUcsc.tab
echo "select transcript,protein from ensGtp" | hgsql hg19 | sort | uniq | awk '{if (NF==2) print}' > ensTransProt.tab
join ensTransUcsc.tab ensTransProt.tab | awk '{if (NF==3)print $3, $2}' | sort | uniq > ensProtToUc.tab
join ensProtToUc.tab ensToTreefam.tab | sort -u | awk 'BEGIN {OFS="\t"} {print $2,$3}' | sort -u > knownToTreefam.tab
hgLoadSqlTab $tempDb knownToTreefam $kent/src/hg/lib/knownTo.sql knownToTreefam.tab
#endif

# Human-only microarray and HPRD mappings.
if ($db =~ hg*) then
    hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db HInvGeneMrna knownGene knownToHInv
    hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyU133Plus2 knownGene knownToU133Plus2
    hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyU133 knownGene knownToU133
    hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyU95 knownGene knownToU95
    knownToHprd $tempDb $genomes/$db/p2p/hprd/FLAT_FILES/HPRD_ID_MAPPINGS.txt
    # Antibody parts pollute the mapping; strip them out.
    hgsql $tempDb -e "delete k from knownToHprd k, kgXref x where k.name = x.kgID and x.geneSymbol = 'abParts'"
endif

# Expression-distance tables for the Gene Sorter (human arrays).
if ($db =~ hg*) then
    time hgExpDistance $tempDb hgFixed.gnfHumanU95MedianRatio \
        hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95
    time hgExpDistance $tempDb hgFixed.gnfHumanAtlas2MedianRatio \
        hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
        -lookup=knownToGnfAtlas2
endif

# Mouse-only equivalents of the above (no-op on an hg* build).
if ($db =~ mm*) then
    hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyGnf1m knownGene knownToGnf1m
    hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
    hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyU74 knownGene knownToU74
    hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyMOE430 knownGene knownToMOE430
    hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyMOE430 -prefix=A: knownGene knownToMOE430A
    hgExpDistance $tempDb $db.affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74
    hgExpDistance $tempDb $db.affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74
    hgExpDistance $tempDb $db.affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74
    hgExpDistance $tempDb hgFixed.gnfMouseAtlas2MedianRatio \
        hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m
endif

# Update visiGene stuff
knownToVisiGene $tempDb -probesDb=$db
hgsql $tempDb -e "delete k from knownToVisiGene k, kgXref x where k.name = x.kgID and x.geneSymbol = 'abParts'"

vgGetText /usr/local/apache/cgi-bin/visiGeneData/visiGene.text $vgTextDbs
# probe has 26611 rows
# gene has 20413 rows
# imageProbe has 125765 rows

# This shouldn't be here me thinks
cd /usr/local/apache/cgi-bin/visiGeneData
ixIxx visiGene.text visiGene.ix visiGene.ixx
cd $dir
#end me thinking

# Create Human P2P protein-interaction Gene Sorter columns
if ($db =~ hg*) then
    hgLoadNetDist $genomes/$db/p2p/hprd/hprd.pathLengths $tempDb humanHprdP2P \
        -sqlRemap="select distinct value, name from knownToHprd"
    hgLoadNetDist $genomes/$db/p2p/vidal/humanVidal.pathLengths $tempDb humanVidalP2P \
        -sqlRemap="select distinct locusLinkID, kgID from $db.refLink,kgXref where $db.refLink.mrnaAcc = kgXref.mRNA"
    hgLoadNetDist $genomes/$db/p2p/wanker/humanWanker.pathLengths $tempDb humanWankerP2P \
        -sqlRemap="select distinct locusLinkID, kgID from $db.refLink,kgXref where $db.refLink.mrnaAcc = kgXref.mRNA"
endif

# Run nice Perl script to make all protein blast runs for
# Gene Sorter and Known Genes details page.  Takes about
# 45 minutes to run.
rm -rf $dir/hgNearBlastp
mkdir $dir/hgNearBlastp
cd $dir/hgNearBlastp
cat << _EOF_ > config.ra
# Latest human vs. other Gene Sorter orgs:
# mouse, rat, zebrafish, worm, yeast, fly
targetGenesetPrefix known
targetDb $tempDb
queryDbs $xdb $ratDb $fishDb $flyDb $wormDb $yeastDb
${tempDb}Fa $tempFa
${xdb}Fa $xdbFa
${ratDb}Fa $ratFa
${fishDb}Fa $fishFa
${flyDb}Fa $flyFa
${wormDb}Fa $wormFa
${yeastDb}Fa $yeastFa
buildDir $dir/hgNearBlastp
scratchDir $scratchDir/brHgNearBlastp
_EOF_
rm -rf $scratchDir/brHgNearBlastp
doHgNearBlastp.pl -noLoad -clusterHub=swarm -distrHost=hgwdev -dbHost=hgwdev -workhorse=hgwdev config.ra |& tee do.log

# *** All done!
# *** -noLoad was specified -- you can run this script manually to load mm10 tables:
#        run.mm10.mm10/loadPairwise.csh
#
# *** -noLoad was specified -- you can run these scripts manually to load mm10 tables:
#        run.mm10.hg19/loadPairwise.csh
#        run.mm10.rn5/loadPairwise.csh
#        run.mm10.danRer7/loadPairwise.csh
#        run.mm10.dm3/loadPairwise.csh
#        run.mm10.ce10/loadPairwise.csh
#        run.mm10.sacCer3/loadPairwise.csh
#
# *** -noLoad was specified -- you can run these scripts manually to load mmBlastTab in query databases:
#        run.hg19.mm10/loadPairwise.csh
#        run.rn5.mm10/loadPairwise.csh
#        run.danRer7.mm10/loadPairwise.csh
#        run.dm3.mm10/loadPairwise.csh
#        run.ce10.mm10/loadPairwise.csh
#        run.sacCer3.mm10/loadPairwise.csh

# Load self
cd $dir/hgNearBlastp/run.$tempDb.$tempDb
./loadPairwise.csh

# Load mouse and rat
cd $dir/hgNearBlastp/run.$tempDb.$xdb
hgLoadBlastTab $tempDb $xBlastTab -maxPer=1 out/*.tab
cd $dir/hgNearBlastp/run.$tempDb.$ratDb
hgLoadBlastTab $tempDb $rnBlastTab -maxPer=1 out/*.tab

# Remove non-syntenic hits for mouse and rat
# Takes a few minutes
# synBlastp.csh reads the liftOver chains via /gbdb, so link the real chains
# under the temp db name first.
mkdir -p /gbdb/$tempDb/liftOver
rm -f /gbdb/$tempDb/liftOver/${tempDb}To$RatDb.over.chain.gz /gbdb/$tempDb/liftOver/${tempDb}To$Xdb.over.chain.gz
ln -s $genomes/$db/bed/liftOver/${db}To$RatDb.over.chain.gz \
    /gbdb/$tempDb/liftOver/${tempDb}To$RatDb.over.chain.gz
ln -s $genomes/$db/bed/liftOver/${db}To${Xdb}.over.chain.gz \
    /gbdb/$tempDb/liftOver/${tempDb}To$Xdb.over.chain.gz

# delete non-syntenic genes from rat and mouse blastp tables
cd $dir/hgNearBlastp
synBlastp.csh $tempDb $xdb
# old number of unique query values: 61500
# old number of unique target values 22297
# new number of unique query values: 47295
# new number of unique target values 18227

# TODO: why such a big difference! Are the synteny checks done?

hgsql -e "select count(*) from mmBlastTab\G" $oldDb | tail -n +2
# count(*): 70589
hgsql -e "select count(*) from mmBlastTab\G" $tempDb | tail -n +2
# count(*): 45806

synBlastp.csh $tempDb $ratDb knownGene rgdGene2
# old number of unique query values: 54724
# old number of unique target values 10134
# new number of unique query values: 23165
# new number of unique target values 6421

hgsql -e "select count(*) from rnBlastTab\G" $oldDb | tail -n +2
# count(*): 27747
hgsql -e "select count(*) from rnBlastTab\G" $tempDb | tail -n +2
# count(*): 22397

# Make reciprocal best subset for the blastp pairs that are too
# Far for synteny to help

# Us vs. fish
cd $dir/hgNearBlastp
set aToB = run.$tempDb.$fishDb
set bToA = run.$fishDb.$tempDb
cat $aToB/out/*.tab > $aToB/all.tab
cat $bToA/out/*.tab > $bToA/all.tab
blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab
hgLoadBlastTab $tempDb drBlastTab $aToB/recipBest.tab

# TODO: what's this doing here?
# hgLoadBlastTab $fishDb tfBlastTab $bToA/recipBest.tab

hgsql -e "select count(*) from drBlastTab\G" $oldDb | tail -n +2
# count(*): 13085
hgsql -e "select count(*) from drBlastTab\G" $tempDb | tail -n +2
# count(*): 13022

# Us vs. fly
cd $dir/hgNearBlastp
set aToB = run.$tempDb.$flyDb
set bToA = run.$flyDb.$tempDb
cat $aToB/out/*.tab > $aToB/all.tab
cat $bToA/out/*.tab > $bToA/all.tab
blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab
hgLoadBlastTab $tempDb dmBlastTab $aToB/recipBest.tab

# why is this here? TODO
# hgLoadBlastTab $flyDb tfBlastTab $bToA/recipBest.tab

hgsql -e "select count(*) from dmBlastTab\G" $oldDb | tail -n +2
# count(*): 5970
hgsql -e "select count(*) from dmBlastTab\G" $tempDb | tail -n +2
# count(*): 5976

# Us vs. worm
cd $dir/hgNearBlastp
set aToB = run.$tempDb.$wormDb
set bToA = run.$wormDb.$tempDb
cat $aToB/out/*.tab > $aToB/all.tab
cat $bToA/out/*.tab > $bToA/all.tab
blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab
hgLoadBlastTab $tempDb ceBlastTab $aToB/recipBest.tab

# why is this here? TODO
#hgLoadBlastTab $wormDb tfBlastTab $bToA/recipBest.tab

hgsql -e "select count(*) from ceBlastTab\G" $oldDb | tail -n +2
# count(*): 4949
hgsql -e "select count(*) from ceBlastTab\G" $tempDb | tail -n +2
# count(*): 4947

# Us vs. yeast
cd $dir/hgNearBlastp
set aToB = run.$tempDb.$yeastDb
set bToA = run.$yeastDb.$tempDb
cat $aToB/out/*.tab > $aToB/all.tab
cat $bToA/out/*.tab > $bToA/all.tab
blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab
hgLoadBlastTab $tempDb scBlastTab $aToB/recipBest.tab

# why is this here? TODO
#hgLoadBlastTab $yeastDb tfBlastTab $bToA/recipBest.tab

hgsql -e "select count(*) from scBlastTab\G" $oldDb | tail -n +2
# count(*): 2364
hgsql -e "select count(*) from scBlastTab\G" $tempDb | tail -n +2
# count(*): 2366

# Clean up
cd $dir/hgNearBlastp
cat run.$tempDb.$tempDb/out/*.tab | gzip -c > run.$tempDb.$tempDb/all.tab.gz
gzip run.*/all.tab

# make knownToWikipedia
# NOTE(review): kg6ToKg7.tab and oldKnownToWikipedia.tab are assumed to be
# present in this directory already — confirm their provenance before re-running.
mkdir -p $dir/wikipedia
cd $dir/wikipedia
join kg6ToKg7.tab oldKnownToWikipedia.tab | awk 'BEGIN {OFS="\t"} {print $2,$3}' | sort | uniq > knownToWikipedia.tab
hgLoadSqlTab $tempDb knownToWikipedia $kent/src/hg/lib/knownToWikipedia.sql knownToWikipedia.tab

# MAKE FOLDUTR TABLES
# First set up directory structure and extract UTR sequence on hgwdev
cd $dir
mkdir -p rnaStruct
cd rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
# these commands take some significant time
utrFa -nibPath=$genomes/$db/nib $tempDb knownGene utr3 utr3/utr.fa
utrFa -nibPath=$genomes/$db/nib $tempDb knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
faSplit sequence utr3/utr.fa 10000 utr3/split/s
faSplit sequence utr5/utr.fa 10000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat << '_EOF_' > template
#LOOP
rnaFoldBig split/$(path1) fold
#ENDLOOP
'_EOF_'
gensub2 in.lst single template jobList
cp template ../utr5
cd ../utr5
gensub2 in.lst single template jobList

# Do cluster runs for UTRs
ssh $cpuFarm "cd $dir/rnaStruct/utr3; para make jobList"
ssh $cpuFarm "cd $dir/rnaStruct/utr5; para make jobList"

# Load database
cd $dir/rnaStruct/utr5
hgLoadRnaFold $tempDb foldUtr5 fold
cd ../utr3
hgLoadRnaFold -warnEmpty $tempDb foldUtr3 fold

# Clean up
rm -r split fold err batch.bak
cd ../utr5
rm -r split fold err batch.bak

# Make pfam run.  Actual cluster run is about 6 hours.
# Fetch the current Pfam-A HMM library, split the protein set, and run
# hmmsearch over the cluster.
mkdir -p /hive/data/outside/pfam/Pfam31.0
cd /hive/data/outside/pfam/Pfam31.0
wget ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz
gunzip Pfam-A.hmm.gz
#set pfamScratch = $scratchDir/pfamBR
#ssh $cpuFarm mkdir -p $pfamScratch
#ssh $cpuFarm cp /hive/data/outside/pfam/Pfam26.0/Pfam-A.hmm $pfamScratch

mkdir -p $dir/pfam
cd $dir/pfam
rm -rf splitProt
mkdir splitProt
faSplit sequence $dir/ucscGenes.faa 10000 splitProt/
mkdir -p result
ls -1 splitProt > prot.list
# /hive/data/outside/pfam/hmmpfam -E 0.1 /hive/data/outside/pfam/Pfam26.0/Pfam-A.hmm
# NOTE(review): hmmsearch binary comes from the Pfam29.0 tree but the HMM
# library is Pfam31.0 — apparently intentional, but worth confirming.
cat << '_EOF_' > doPfam
#!/bin/csh -ef
/hive/data/outside/pfam/Pfam29.0/PfamScan/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmsearch --domtblout /scratch/tmp/pfam.$2.pf --noali -o /dev/null -E 0.1 /hive/data/outside/pfam/Pfam31.0/Pfam-A.hmm splitProt/$1
mv /scratch/tmp/pfam.$2.pf $3
'_EOF_'
    # << happy emacs
chmod +x doPfam
cat << '_EOF_' > template
#LOOP
./doPfam $(path1) $(root1) {check out line+ result/$(root1).pf}
#ENDLOOP
'_EOF_'
gensub2 prot.list single template jobList

ssh $cpuFarm "cd $dir/pfam; para make jobList"
ssh $cpuFarm "cd $dir/pfam; para time > run.time"
cat run.time

# Completed: 9670 of 9670 jobs
# CPU time in finished jobs:    2470667s   41177.79m   686.30h   28.60d  0.078 y
# IO & Wait Time:                659066s   10984.43m   183.07h    7.63d  0.021 y
# Average job time:                 324s       5.39m     0.09h    0.00d
# Longest finished job:             663s      11.05m     0.18h    0.01d
# Submission to last job:         49263s     821.05m    13.68h    0.57d

# Make up pfamDesc.tab by converting pfam to a ra file first
cat << '_EOF_' > makePfamRa.awk
/^NAME/ {print}
/^ACC/ {print}
/^DESC/ {print}
/^TC/ {print $1,$3; printf("\n");}
'_EOF_'
awk -f makePfamRa.awk /hive/data/outside/pfam/Pfam31.0/Pfam-A.hmm > pfamDesc.ra
raToTab -cols=ACC,NAME,DESC,TC pfamDesc.ra stdout | awk -F '\t' '{printf("%s\t%s\t%s\t%g\n", $1, $2, $3, $4);}' | sort > pfamDesc.tab

# Convert output to tab-separated file.
# Collect the per-job domain-table outputs, keep hits scoring above the
# trusted cutoff (field 9 from pfamDesc), and load knownToPfam/pfamDesc.
cd $dir/pfam
catDir result | sed '/^#/d' > allResults.tab
awk 'BEGIN {OFS="\t"} { print $5,$1,$18-1,$19,$4,$14}' allResults.tab | sort > allUcscPfam.tab
join -t $'\t' -j 1 allUcscPfam.tab pfamDesc.tab | tawk '{if ($6 > $9) print $2, $3, $4, $5, $6, $1}' > ucscPfam.tab
join -t $'\t' -j 1 allUcscPfam.tab pfamDesc.tab | tawk '{print $2, $3, $4, $5 "/" $6 "/" $9, $6, $1}' | sort > sortAllUcscPfam.tab
cd $dir

# Convert output to knownToPfam table
tawk '{print $1, gensub(/\.[0-9]+/, "", "g", $6)}' pfam/ucscPfam.tab | sort -u > knownToPfam.tab
hgLoadSqlTab -notOnServer $tempDb knownToPfam $kent/src/hg/lib/knownTo.sql knownToPfam.tab
tawk '{print gensub(/\.[0-9]+/, "", "g", $1), $2, $3}' pfam/pfamDesc.tab| hgLoadSqlTab -notOnServer $tempDb pfamDesc $kent/src/hg/lib/pfamDesc.sql stdin

# Map domain coordinates (protein space, amino acids) through the transcript
# alignments into genome space and load the ucscGenePfam track.
cd $dir/pfam
genePredToFakePsl hg19 knownGene knownGene.psl cdsOut.tab
sort cdsOut.tab | sed 's/\.\./ /' > sortCdsOut.tab
sort ucscPfam.tab> sortPfam.tab
awk '{print $10, $11}' knownGene.psl > gene.sizes
join sortCdsOut.tab sortAllUcscPfam.tab | awk '{print $1, $2 - 1 + 3 * $4, $2 - 1 + 3 * $5, $6}' | bedToPsl gene.sizes stdin allDomainToGene.psl
join sortCdsOut.tab sortPfam.tab | awk '{print $1, $2 - 1 + 3 * $4, $2 - 1 + 3 * $5, $6}' | bedToPsl gene.sizes stdin domainToGene.psl
pslMap allDomainToGene.psl knownGene.psl stdout | pslToBed stdin stdout | bedOrBlocks -useName stdin allDomainToGenome.bed
pslMap domainToGene.psl knownGene.psl stdout | pslToBed stdin stdout | bedOrBlocks -useName stdin domainToGenome.bed
hgLoadBed $tempDb ucscGenePfam domainToGenome.bed

# Do scop run. Takes about 6 hours
# First get pfam global HMMs into /hive/data/outside/scop/scop.hmm
# used existing ones...TODO
mkdir -p $dir/scop
cd $dir/scop
rm -rf result
mkdir result
ls -1 ../pfam/splitProt > prot.list
cat << '_EOF_' > doScop
#!/bin/csh -ef
/hive/data/outside/pfam/hmmpfam -E 0.1 /hive/data/outside/scop/scop.hmm \
	../pfam/splitProt/$1 > /scratch/tmp/$2.pf
mv /scratch/tmp/$2.pf $3
'_EOF_'
    # << happy emacs
chmod +x doScop
cat << '_EOF_' > template
#LOOP
doScop $(path1) $(root1) {check out line+ result/$(root1).pf}
#ENDLOOP
'_EOF_'
    # << happy emacs
gensub2 prot.list single template jobList

ssh $cpuFarm "cd $dir/scop; para make jobList"
ssh $cpuFarm "cd $dir/scop; para time > run.time"
cat run.time
# Completed: 9670 of 9670 jobs
# CPU time in finished jobs:    3970738s   66178.96m  1102.98h   45.96d  0.126 y
# IO & Wait Time:               5306721s   88445.36m  1474.09h   61.42d  0.168 y
# Average job time:                 959s      15.99m     0.27h    0.01d
# Longest finished job:            4512s      75.20m     1.25h    0.05d
# Submission to last job:         23325s     388.75m     6.48h    0.27

# Convert scop output to tab-separated files
cd $dir
catDir scop/result | \
	hmmPfamToTab -eValCol -scoreCol stdin scopPlusScore.tab
scopCollapse scopPlusScore.tab /hive/data/outside/scop/model.tab ucscScop.tab \
	scopDesc.tab knownToSuper.tab
hgLoadSqlTab $tempDb knownToSuper $kent/src/hg/lib/knownToSuper.sql knownToSuper.tab
hgsql $tempDb -e "delete k from knownToSuper k, kgXref x where k.gene = x.kgID and x.geneSymbol = 'abParts'"
hgLoadSqlTab $tempDb scopDesc $kent/src/hg/lib/scopDesc.sql scopDesc.tab
hgLoadSqlTab $tempDb ucscScop $kent/src/hg/lib/ucscScop.sql ucscScop.tab

# Regenerate ccdsKgMap table
$kent/src/hg/makeDb/genbank/bin/x86_64/mkCcdsGeneMap -db=$tempDb -loadDb $db.ccdsGene knownGene ccdsKgMap

cd $dir
# Map old to new mapping
#set oldGeneBed=$oldDb.$db.kg.bed
txGeneExplainUpdate2 $oldGeneBed ucscGenes.bed kgOldToNew.tab
hgLoadSqlTab $tempDb kg${lastVer}ToKg${curVer} $kent/src/hg/lib/kg1ToKg2.sql kgOldToNew.tab

# add kg${lastVer}ToKg${curVer} to all.joiner !!!!

# Build kgSpAlias table, which combines content of both kgAlias and kgProtAlias tables.
hgsql $tempDb -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' > kgSpAlias_0.tmp
hgsql $tempDb -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >> kgSpAlias_0.tmp
cat kgSpAlias_0.tmp|sort -u > kgSpAlias.tab
rm kgSpAlias_0.tmp
hgLoadSqlTab $tempDb kgSpAlias $kent/src/hg/lib/kgSpAlias.sql kgSpAlias.tab

# Do BioCyc Pathways build
mkdir -p $dir/bioCyc
cd $dir/bioCyc
grep -v '^#' $bioCycPathways > pathways.tab
grep -v '^#' $bioCycGenes > genes.tab
kgBioCyc1 genes.tab pathways.tab $tempDb bioCycPathway.tab bioCycMapDesc.tab
hgLoadSqlTab $tempDb bioCycPathway $kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
hgLoadSqlTab $tempDb bioCycMapDesc $kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab

# Do KEGG Pathways build (borrowing Fan Hsu's strategy from hg19.txt)
mkdir -p $dir/kegg
cd $dir/kegg

# Make the keggMapDesc table, which maps KEGG pathway IDs to descriptive names
# NOTE(review): map_title.tab is copied from the previous build rather than
# re-downloaded (the wget is commented out) — confirm that is still desired.
cp /cluster/data/hg19/bed/ucsc.13/kegg/map_title.tab .
# wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab
cat map_title.tab | sed -e 's/\t/\thsa\t/' > j.tmp
cut -f 2 j.tmp >j.hsa
cut -f 1,3 j.tmp >j.1
paste j.hsa j.1 |sed -e 's/\t//' > keggMapDesc.tab
rm j.hsa j.1 j.tmp
hgLoadSqlTab $tempDb keggMapDesc $kent/src/hg/lib/keggMapDesc.sql keggMapDesc.tab

# Following in two-step process, build/load a table that maps UCSC Gene IDs
# to LocusLink IDs and to KEGG pathways.  First, make a table that maps
# LocusLink IDs to KEGG pathways from the downloaded data.  Store it temporarily
# in the keggPathway table, overloading the schema.
cp /cluster/data/hg19/bed/ucsc.13/kegg/hsa_pathway.list .
cat hsa_pathway.list| sed -e 's/path://'|sed -e 's/:/\t/' > j.tmp
hgLoadSqlTab $tempDb keggPathway $kent/src/hg/lib/keggPathway.sql j.tmp

# Next, use the temporary contents of the keggPathway table to join with
# knownToLocusLink, creating the real content of the keggPathway table.
# Load this data, erasing the old temporary content
hgsql $tempDb -B -N -e \
    'select distinct name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' \
    >keggPathway.tab
hgLoadSqlTab $tempDb \
    keggPathway $kent/src/hg/lib/keggPathway.sql keggPathway.tab

# Finally, update the knownToKeggEntrez table from the keggPathway table.
hgsql $tempDb -B -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' \
    |sort -u| sed -e 's/\t+\t/+/' > knownToKeggEntrez.tab
hgLoadSqlTab $tempDb knownToKeggEntrez $kent/src/hg/lib/knownToKeggEntrez.sql \
    knownToKeggEntrez.tab
hgsql $tempDb -e "delete k from knownToKeggEntrez k, kgXref x where k.name = x.kgID and x.geneSymbol = 'abParts'"

# Make spMrna table (useful still?)
cd $dir
hgsql $db -N -e "select spDisplayID,kgID from kgXref where spDisplayID != ''" > spMrna.tab;
hgLoadSqlTab $tempDb spMrna $kent/src/hg/lib/spMrna.sql spMrna.tab

# Do CGAP tables
mkdir -p $dir/cgap
cd $dir/cgap
wget --timestamping -O Hs_GeneData.dat "ftp://ftp1.nci.nih.gov/pub/CGAP/Hs_GeneData.dat"
hgCGAP Hs_GeneData.dat
cat cgapSEQUENCE.tab cgapSYMBOL.tab cgapALIAS.tab|sort -u > cgapAlias.tab
hgLoadSqlTab $tempDb cgapAlias $kent/src/hg/lib/cgapAlias.sql ./cgapAlias.tab
hgLoadSqlTab $tempDb cgapBiocPathway $kent/src/hg/lib/cgapBiocPathway.sql ./cgapBIOCARTA.tab
cat cgapBIOCARTAdesc.tab|sort -u > cgapBIOCARTAdescSorted.tab
hgLoadSqlTab $tempDb cgapBiocDesc $kent/src/hg/lib/cgapBiocDesc.sql cgapBIOCARTAdescSorted.tab

cd $dir

# Make PCR target for UCSC Genes, Part 1.
# 1. Get a set of IDs that consist of the UCSC Gene accession concatenated with the
#    gene symbol, e.g. uc010nxr.1__DDX11L1
hgsql $tempDb -N -e 'select kgId,geneSymbol from kgXref' \
    | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
    > idSub.txt
# 2. Get a file of per-transcript fasta sequences that contain the sequences of each
#    UCSC Genes transcript, with this new ID in the place of the UCSC Genes accession.
#    Convert that file to TwoBit format and soft-link it into /gbdb/hg19/targetDb/
subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
sequenceForBed -keepName -db=$db -bedIn=ucscGenesIdSubbed.bed -fastaOut=stdout \
    | faToTwoBit stdin kgTargetSeq${curVer}.2bit
mkdir -p /gbdb/$db/targetDb/
rm -f /gbdb/$db/targetDb/kgTargetSeq${curVer}.2bit
ln -s $dir/kgTargetSeq${curVer}.2bit /gbdb/$db/targetDb/
# Load the table kgTargetAli, which shows where in the genome these targets are.
cut -f 1-10 ucscGenes.gp | genePredToFakePsl $tempDb stdin kgTargetAli.psl /dev/null
hgLoadPsl $tempDb kgTargetAli.psl

#
# At this point we should save a list of the tables in tempDb!!!
echo "show tables" | hgsql $tempDb > tablesInKnownGene.lst

# NOW SWAP IN TABLES FROM TEMP DATABASE TO MAIN DATABASE.
# You'll need superuser powers for this step.....

cd $dir
# Drop tempDb history table and chromInfo, we don't want to swap them in!
hgsql -e "drop table history" $tempDb
hgsql -e "drop table chromInfo" $tempDb

# Write (but do not run) the swap script; it is executed manually with sudo.
cat << _EOF_ > moveTablesIntoPlace
# Save old known genes and kgXref tables
sudo ~kent/bin/copyMysqlTable $db knownGene $tempDb knownGeneOld$lastVer
sudo ~kent/bin/copyMysqlTable $db kgXref $tempDb kgXrefOld$lastVer

# Create backup database
hgsqladmin create ${db}BackupBraney2

# Swap in new tables, moving old tables to backup database.
sudo ~kent/bin/swapInMysqlTempDb $tempDb $db ${db}BackupBraney2
_EOF_

# Update database links.
sudo rm /var/lib/mysql/uniProt
sudo ln -s /var/lib/mysql/$spDb /var/lib/mysql/uniProt
sudo rm /var/lib/mysql/proteome
sudo ln -s /var/lib/mysql/$pbDb /var/lib/mysql/proteome
hgsqladmin flush-tables

# Make full text index.  Takes a minute or so.  After this the genome browser
# tracks display will work including the position search.  The genes details
# page, gene sorter, and proteome browser still need more tables.
mkdir -p $dir/index
cd $dir/index
hgKgGetText $db knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
rm -f /gbdb/$db/knownGene.ix /gbdb/$db/knownGene.ixx
ln -s $dir/index/knownGene.ix /gbdb/$db/knownGene.ix
ln -s $dir/index/knownGene.ixx /gbdb/$db/knownGene.ixx

# 3. Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
# /gbdb/$db/targetDb/kgTargetSeq${curVer}.2bit

# 4. On hgwdev, insert new records into blatServers and targetDb, using the
# host (field 2) and port (field 3) specified by cluster-admin.  Identify the
# blatServer by the keyword "$db"Kg with the version number appended
hgsql hgcentraltest -e \
    'INSERT into blatServers values ("hg19Kgv14", "blat4d", 17847, 0, 1);'
hgsql hgcentraltest -e \
    'INSERT into targetDb values("hg19Kgv14", "UCSC Genes", \
        "hg19", "kgTargetAli", "", "", \
        "/gbdb/hg19/targetDb/kgTargetSeq7.2bit", 1, now(), "");'

cd $dir

#
# Finally, need to wait until after testing, but update databases in other organisms
# with blastTabs

# Load blastTabs
cd $dir/hgNearBlastp
hgLoadBlastTab $xdb $blastTab run.$xdb.$tempDb/out/*.tab
hgLoadBlastTab $ratDb $blastTab run.$ratDb.$tempDb/out/*.tab
hgLoadBlastTab $flyDb $blastTab run.$flyDb.$tempDb/recipBest.tab
hgLoadBlastTab $wormDb $blastTab run.$wormDb.$tempDb/recipBest.tab
hgLoadBlastTab $yeastDb $blastTab run.$yeastDb.$tempDb/recipBest.tab

# Do synteny on mouse/human/rat
synBlastp.csh $xdb $db
# old number of unique query values: 43103
# old number of unique target values 22704
# new number of unique query values: 34612
# new number of unique target values 19733

synBlastp.csh $ratDb $db rgdGene2 knownGene
# old number of unique query values: 11206
# old number of unique target values 10796
# new number of unique query values: 6741
# new number of unique target values 6925

# need to generate multiz downloads
#/usr/local/apache/htdocs-hgdownload/goldenPath/hg19/multiz46way/alignments/knownCanonical.exonAA.fa.gz
#/usr/local/apache/htdocs-hgdownload/goldenPath/hg19/multiz46way/alignments/knownCanonical.exonNuc.fa.gz
#/usr/local/apache/htdocs-hgdownload/goldenPath/hg19/multiz46way/alignments/knownGene.exonAA.fa.gz
#/usr/local/apache/htdocs-hgdownload/goldenPath/hg19/multiz46way/alignments/knownGene.exonNuc.fa.gz
#/usr/local/apache/htdocs-hgdownload/goldenPath/hg19/multiz46way/alignments/md5sum.txt

# copy over malacards table from hg38
# Done on Tue Dec 8 00:20:47 CET 2015 max
# mysqldump hg38 malacards | mysql hg19

echo
echo "see the bottom of the script for details about knownToWikipedia"
echo

# Clean up
rm -r run.*/out

# Last step in setting up isPCR: after the new UCSC Genes with the new Known Gene isPcr
# is released, take down the old isPcr gfServer

#######################
### The following is the process Brian Lee used to pull out only
#   the genes from knownToLocusLink for which there are Wikipedia articles.
### get the full knownToLocusLinkTable
# hgsql -Ne 'select value from knownToLocusLink' hg19 | sort -u >> knToLocusLink
### query Wikipedia for each to if there is an article
# for i in $(cat knToLocusLink); do lynx -dump "http://genewiki.sulab.org/map/wiki/"$i | grep -m 1 "no results" >trash ; echo $? $i | grep "1 "| awk '{print $2}'>> workingLinks; done
### pull out all isoforms that have permitted LocusLinkIds
# for i in $(cat workingLinks); do hgsql -Ne 'select * from knownToLocusLink where value like "'$i'"' hg19 >> knownToWikipediaNew; done
### then load the table as knownToWikipedia using the knowToLocusLink INDICES.
## braney's knownToWikipedia logic
# maybe rescrape wikipedia following instructions in doc/wikipediaScrape.txt
# FIX: use mkdir -p — $dir/wikipedia was already created earlier in this
# script; a plain mkdir on an existing directory fails and, under the
# "#!/bin/tcsh -efx" shebang, would abort the whole run.
mkdir -p $dir/wikipedia
cd $dir/wikipedia
# NOTE(review): the $'\t' ANSI-C quoting used with join below is bash syntax,
# not tcsh; these commands were presumably run interactively in bash.
hgsql hg19 -e "select geneSymbol,name from knownGene g, kgXref x where g.name=x.kgId " | sort > hg19.symbolToId.txt
join -t $'\t' /hive/groups/browser/wikipediaScrape/symbolToPage.txt hg19.symbolToId.txt | tawk '{print $3,$2}' | sort | uniq > hg19.idToPage.txt
hgLoadSqlTab hg19 knownToWikipedia $HOME/kent/src/hg/lib/knownTo.sql hg19.idToPage.txt

# make bigKnownGene.bb
set genomes = /hive/data/genomes
set dir = $genomes/hg19/bed/ucsc.14.3
cd $dir
makeBigKnown hg19
rm -f /gbdb/hg19/knownGene.bb
ln -s `pwd`/hg19.knownGene.bb /gbdb/hg19/knownGene.bb

# upgrade old gene symbols
# Map each UCSC transcript to its GENCODE V38lift37 transcript, compare the
# old kgXref gene symbol with the current GENCODE symbol, and rebuild the
# kgAlias, kgXref, search-index, and bigBed files with the newer symbols.
# FIX: mkdir -p so a re-run does not abort when newSyms already exists.
mkdir -p $dir/newSyms
cd $dir/newSyms
hgMapToGene hg19 wgEncodeGencodeCompV38lift37 knownGene map -noLoad
hgsql hg19 -Ne "select name,geneSymbol from knownGene, kgXref where knownGene.name=kgXref.kgId" | sort > ucToSym.txt
hgsql hg19 -Ne "select name,geneName from wgEncodeGencodeCompV38lift37, wgEncodeGencodeAttrsV38lift37 where name=transcriptId" | sort > ensToSym.txt
# NOTE(review): ucToEns.txt (UCSC id -> GENCODE transcript) is not generated
# by any command above — it presumably comes from the hgMapToGene output file
# produced by the "map -noLoad" step; confirm and add the missing sort/rename
# before re-running.
join -t$'\t' ucToEns.txt ucToSym.txt | sort -k 2 > ucEnsSymbol.txt
join -t$'\t' -1 2 -2 1 ucEnsSymbol.txt ens* | tawk '{print $2,$1, $3, $4}' | sort > mapping.txt
# mapping.txt: ucscId, gencodeTx, oldSymbol, newSymbol.  Rows where the
# symbol changed drive everything below.
tawk '$3 != $4 {print $1, $4}' mapping.txt | sort -u > newMaps.txt
tawk '$3 != $4 {print $1}' mapping.txt | sort -u > changedIds.txt
cat ../ucscGenes.alias newMaps.txt | sort > newKgAlias.txt
hgLoadSqlTab hg19 kgAlias $HOME/kent/src/hg/lib/kgAlias.sql newKgAlias.txt

# now do kgXref
# '/' would terminate the sed-style gsub patterns built below, so map it to '.'
tawk '$3 != $4 {print $1, $3, $4}' mapping.txt | sort -u | tr '/' '.' > idOldNew.txt
sort ../ucscGenes.xref | join -t$'\t' changedIds.txt /dev/stdin -v 2 > unchangedXref.tab
sort ../ucscGenes.xref | join -t$'\t' changedIds.txt /dev/stdin > toChangeXref.tab

# only change gene symbols in field 5, and 8 on lines where the id has been identified above
# NOTE(review): bash syntax again (IFS=$'\t', while read); also the old/new
# symbols are interpolated into the tawk program, so symbols containing
# regex metacharacters would misbehave — acceptable for this one-off, but
# worth checking the diff of newKgXref.tab before loading.
IFS=$'\t'; while read id old new; do grep "^$id" toChangeXref.tab | tawk "{gsub(\"$old\", \"$new\", \$5);gsub(\"$old\", \"$new\", \$8); print}" ; done < idOldNew.txt > beenChangedXref.txt

cat unchangedXref.tab beenChangedXref.txt | sort > newKgXref.tab
hgLoadSqlTab hg19 kgXref $HOME/kent/src/hg/lib/kgXref.sql newKgXref.tab

# Rebuild the full-text search index with the new symbols appended per line.
# NOTE(review): ../knownGene.text resolves to $dir/knownGene.text, but the
# index build earlier wrote $dir/index/knownGene.text — confirm the file was
# copied up a level before this was run.
sort ../knownGene.text > oldKnownGene.text
join -t$'\t' -a 1 oldKnownGene.text newMaps.txt > newKnownGene.txt
ixIxx newKnownGene.txt knownGene.ix knownGene.ixx -maxWordLength=63
rm -f /gbdb/hg19/knownGene.ix /gbdb/hg19/knownGene.ixx
ln -s `pwd`/knownGene.ix /gbdb/hg19/knownGene.ix
ln -s `pwd`/knownGene.ixx /gbdb/hg19/knownGene.ixx

# Regenerate the hgTracks bigBed with the new symbols.
# NOTE(review): invoked as ./makeBigKnown here but as plain makeBigKnown
# above — confirm which copy of the script is intended.
./makeBigKnown hg19
rm -f /gbdb/hg19/knownGene.bb
ln -s `pwd`/hg19.knownGene.bb /gbdb/hg19/knownGene.bb