4321aabd0d2819f88564b50b0cd26edf82e69502 braney Wed Aug 14 10:19:58 2019 -0700 add a note to rename the kgTargetSeq file to match the name in blatServers next time around. diff --git src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes15.csh src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes15.csh index c70de0b..1f12c8e 100755 --- src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes15.csh +++ src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes15.csh @@ -1,1603 +1,1607 @@ #!/bin/tcsh -efx # :vim nowrap # for emacs: -*- mode: sh; -*- # This describes how to make the UCSC genes on hg38, though # hopefully by editing the variables that follow immediately # this will work on other databases too. # # Prerequisites # Before executing this script, rebuild the swissprot ,proteins, and go databases. # Directories set genomes = /hive/data/genomes set dir = $genomes/hg38/bed/ucsc.15.1 set scratchDir = /hive/users/braney/scratch set testingDir = $scratchDir/ucscGenes # Databases set db = hg38 set Db = Hg38 set oldDb = hg19 set xdb = mm10 set Xdb = Mm10 set ydb = canFam3 set zdb = rheMac3 set spDb = sp140122 set pbDb = proteins140122 set ratDb = rn4 set RatDb = Rn4 set fishDb = danRer7 set flyDb = dm3 set wormDb = ce6 set yeastDb = sacCer3 # The net alignment for the closely-related species indicated in $xdb set xdbNetDir = $genomes/$db/bed/lastz.${xdb}/axtChain # Blast tables set rnBlastTab = rnBlastTab if ($db =~ hg* ) then set blastTab = hgBlastTab set xBlastTab = mmBlastTab endif if ($db =~ mm* ) then set blastTab = mmBlastTab set xBlastTab = hgBlastTab endif # If rebuilding on an existing assembly make tempDb some bogus name like tmpFoo2, otherwise # make tempDb same as db. #set tempPrefix = "tmp" #set tmpSuffix = "Foo14" #set tempDb = ${tempPrefix}${tmpSuffix} set tempDb = hg38 #set bioCycTempDb = tmpBioCyc${tmpSuffix} # Table for SNPs #set snpTable = snp137 # Public version number set lastVer = 7 set curVer = 8 # Proteins in various species set tempFa = $dir/ucscGenes.faa set xdbFa = $genomes/$xdb/bed/ucsc.13.1/ucscGenes.faa set ratFa = $genomes/$ratDb/bed/blastpRgdGene2/rgdGene2Pep.faa set fishFa = $genomes/$fishDb/bed/blastp/ensembl.faa set flyFa = $genomes/$flyDb/bed/hgNearBlastp/100806/$flyDb.flyBasePep.faa set wormFa = $genomes/$wormDb/bed/blastp/wormPep190.faa set yeastFa = $genomes/$yeastDb/bed/sgdAnnotations/blastTab/sacCer3.sgd.faa # Other files needed # First register with BioCyc to download their HumanCyc database # The site will email you the URL for download. 
Beware, they supply # a URL to a directory chock a block full of data, almost 7 Gb, # you only need one file mkdir /hive/data/outside/bioCyc/140219 cd /hive/data/outside/bioCyc/140219 mkdir download cd download wget --timestamping --no-directories --recursive --user=biocyc-flatfiles --password=data-20541 "http://brg.ai.sri.com/ecocyc/dist/flatfiles-52983746/human.tar.gz" tar xvzf human.tar.gz set bioCycDir = /hive/data/outside/bioCyc/140219/download/17.5/data # liftOver hg19 version of Rfam to hg38 pslMap -swapMap -chainMapFile /hive/data/genomes/hg19/bed/ucsc.14.3/rfam.syntenic.psl \ /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz rfam.lifted.psl pslToBed rfam.lifted.psl rfam.lifted.bed # Tracks set multiz = multiz4way # NCBI Taxon 10090 for mouse, 9606 for human set taxon = 9606 # Previous gene set set oldGeneDir = $genomes/hg19/bed/ucsc.14.3 set oldGeneBed = $oldGeneDir/ucscGenes.bed # Machines set dbHost = hgwdev set ramFarm = ku set cpuFarm = ku # Code base set kent = ~/kent # Create initial dir mkdir -p $dir cd $dir # Get Genbank info txGenbankData $db # creates the files: # mgcStatus.tab mrna.psl refPep.fa refSeq.psl # mrna.fa mrna.ra refSeq.fa refSeq.ra # process RA Files txReadRa mrna.ra refSeq.ra . # creates the files: # cds.tab refSeqStatus.tab mrnaSize.tab refToPep.tab # refPepStatus.tab exceptions.tab accVer.tab # Get some other info from the database. Best to get it about # the same time so it is synced with other data. Takes 4 seconds. hgsql -N $db -e 'select distinct name,sizePolyA from mrnaOrientInfo' | \ subColumn -miss=sizePolyA.miss 1 stdin accVer.tab sizePolyA.tab # creates the files: # sizePolyA.miss sizePolyA.tab # didn't do this yet # # { # Get CCDS for human (or else just an empty file) if ( `hgsql -N $db -e "show tables;" | grep -E -c "ccdsGene|chromInfo"` == 2) then hgsql -N $db -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ccdsGene" | genePredToBed stdin ccds.bed hgsql -N $db \ -e "select mrnaAcc,ccds from ccdsInfo where srcDb='N'" > refToCcds.tab else echo -n "" > ccds.bed echo -n "" > refToCcds.tab endif # Get tRNA for human (or else just an empty file) if ($db =~ hg* && \ `hgsql -N $db -e "show tables;" | grep -E -c "tRNAs|chromInfo"` == 2) then hgsql -N $db -e "select chrom,chromStart,chromEnd,name,score,strand,chromStart,chromEnd,"0","1",chromEnd-chromStart,"0" from tRNAs" > trna.bed bedToPsl $genomes/$db/chrom.sizes trna.bed trna.psl else echo -n "" > trna.bed echo -n "" > trna.psl endif # Get the blocks in this genome that are syntenic to the $xdb genome netFilter -syn $xdbNetDir/${db}.${xdb}.net.gz > ${db}.${xdb}.syn.net netToBed -maxGap=0 ${db}.${xdb}.syn.net ${db}.${xdb}.syn.bed # Get the Rfams that overlap with blocks that are syntenic to $Xdb, and filter out # any duplicate Rfam blocks. In some cases, where there are two related Rfam models, # the same genomic region can get two distinct hits. For our purposes, we only # want one of those hits, because we're looking for regions that might be genes and # don't care as much about the subclass of gene. cat rfam.lifted.bed |sort -k1,1 -k2,2n > rfam.sorted.bed bedRemoveOverlap rfam.sorted.bed rfam.distinctHits.bed bedIntersect -aHitAny rfam.distinctHits.bed ${db}.${xdb}.syn.bed rfam.syntenic.bed bedToPsl $genomes/$db/chrom.sizes rfam.syntenic.bed rfam.syntenic.psl # Create directories full of alignments split by chromosome. 
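# (Added note, not in the original procedure: pslSplitOnTarget below writes one .psl
# per target sequence, and the foreach loop that follows backfills empty placeholder
# files so later per-chromosome steps never hit a missing file.  Optional sanity
# check once the loop has run; uncomment to use:
# ls refSeq/*.psl | wc -l
# wc -l < $genomes/$db/chrom.sizes
# The two counts should match.)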
cd $dir mkdir -p est refSeq mrna pslSplitOnTarget refSeq.psl refSeq pslSplitOnTarget mrna.psl -maxTargetCount=2000 mrna # no ccds # bedSplitOnChrom ccds.bed ccds foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) if (! -e refSeq/$c.psl) then echo creating empty refSeq/$c.psl echo -n "" >refSeq/$c.psl endif if (! -e mrna/$c.psl) then echo creating empty mrna/$c.psl echo -n "" >mrna/$c.psl endif hgGetAnn -noMatchOk $db intronEst $c est/$c.psl if (! -e est/$c.psl) then echo creating empty est/$c.psl echo -n "" >est/$c.psl endif if (! -e ccds/$c.bed) then echo creating empty ccds/$c.bed echo -n "" >ccds/$c.bed endif end # Get list of accessions that are associated with antibodies from database. # This will be a good list but not 100% complete. Cluster these to get # four or five antibody heavy regions. Later we'll weed out input that # falls primarily in these regions, and, include the regions themselves # as special genes. Takes 40 seconds txAbFragFind $db antibodyAccs # Found 30601 descriptions that match # Matched 31149 of 19563720 accessions to antibody criteria pslCat mrna/*.psl -nohead | fgrep -w -f antibodyAccs > antibody.psl clusterPsl -prefix=ab.ab.antibody. antibody.psl antibody.cluster if ($db =~ mm*) then echo chr12 > abChrom.lst echo chr16 >> abChrom.lst echo chr6 >> abChrom.lst fgrep -w -f abChrom.lst antibody.cluster | cut -f 1-12 | bedOrBlocks stdin antibody.bed else awk '$13 > 100' antibody.cluster | cut -f 1-12 > antibody.bed endif # Convert psls to bed, saving mapping info and weeding antibodies. Takes 2.5 min foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) if (-s refSeq/$c.psl) then txPslToBed refSeq/$c.psl -noFixStrand -cds=cds.tab $genomes/$db/$db.2bit refSeq/$c.bed -unusual=refSeq/$c.unusual else echo "creating empty refSeq/$c.bed" touch refSeq/$c.bed endif if (-s mrna/$c.psl) then txPslToBed mrna/$c.psl -cds=cds.tab $genomes/$db/$db.2bit \ stdout -unusual=mrna/$c.unusual \ | bedWeedOverlapping antibody.bed maxOverlap=0.5 stdin mrna/$c.bed else echo "creating empty mrna/$c.bed" touch mrna/$c.bed endif if (-s est/$c.psl) then txPslToBed est/$c.psl $genomes/$db/$db.2bit stdout \ | bedWeedOverlapping antibody.bed maxOverlap=0.3 stdin est/$c.bed else echo "creating empty est/$c.bed" touch est/$c.bed endif end # Create mrna splicing graphs. Takes 10 seconds. mkdir -p bedToGraph foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) txBedToGraph -prefix=$c. ccds/$c.bed ccds refSeq/$c.bed refSeq mrna/$c.bed mrna \ bedToGraph/$c.txg end # Create est splicing graphs. Takes 6 minutes. foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) txBedToGraph -prefix=e$c. est/$c.bed est est/$c.txg end # Create an evidence weight file cat > trim.weights <<end refSeq 100 ccds 50 mrna 2 txOrtho 1 exoniphy 1 est 1 end # Make evidence file for EST graph edges supported by at least 2 # ests. Takes about 30 seconds. foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) txgGoodEdges est/$c.txg trim.weights 2 est est/$c.edges end # Setup other species dir mkdir -p $dir/$xdb cd $dir/$xdb # Get other species mrna including ESTs. Takes about three minutes mkdir -p refSeq mrna est foreach c (`awk '{print $1;}' $genomes/$xdb/chrom.sizes`) echo $c hgGetAnn -noMatchOk $xdb refSeqAli $c stdout | txPslToBed stdin $genomes/$xdb/$xdb.2bit refSeq/$c.bed hgGetAnn -noMatchOk $xdb mrna $c stdout | txPslToBed stdin $genomes/$xdb/$xdb.2bit mrna/$c.bed hgGetAnn -noMatchOk $xdb intronEst $c stdout | txPslToBed stdin $genomes/$xdb/$xdb.2bit est/$c.bed end # Create other species splicing graphs. 
Takes about five minutes. cd $dir/$xdb rm -f other.txg foreach c (`awk '{print $1;}' $genomes/$xdb/chrom.sizes`) echo $c txBedToGraph refSeq/$c.bed refSeq mrna/$c.bed mrna est/$c.bed est stdout >> other.txg end # Clean up all but final other.txg rm -r est mrna refSeq # Unpack chains and nets, apply synteny filter and split by chromosome # Takes 5 minutes. Make up phony empty nets for ones that are empty after # synteny filter. cd $dir/$xdb chainSplit chains $genomes/$db/bed/lastz.$xdb/axtChain/$db.$xdb.all.chain.gz netFilter -syn $genomes/$db/bed/lastz.$xdb/axtChain/$db.$xdb.net.gz \ | netSplit stdin nets cd nets foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) if (! -e $c.net) then echo -n > $c.net endif end cd ../chains foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) if (! -e $c.chain) then echo -n > $c.chain endif end # Make txOrtho directory and a para spec file cd $dir mkdir -p txOrtho/edges cd txOrtho echo '#LOOP' > template echo './runTxOrtho $(path1) '"$xdb $db" >> template echo '#ENDLOOP' >> template cat << '_EOF_' > runTxOrtho #!/bin/csh -ef set inAgx = ../bedToGraph/$1.txg set inChain = ../$2/chains/$1.chain set inNet = ../$2/nets/$1.net set otherTxg = ../$2/other.txg set tmpDir = /scratch/tmp/$3 set workDir = $tmpDir/${1}_${2} mkdir -p $tmpDir mkdir -p $workDir txOrtho $inAgx $inChain $inNet $otherTxg $workDir/$1.edges mv $workDir/$1.edges edges/$1.edges rmdir $workDir rmdir --ignore-fail-on-non-empty $tmpDir '_EOF_' # << happy emacs chmod +x runTxOrtho touch toDoList cd $dir/bedToGraph foreach f (*.txg) set c=$f:r if ( -s $f ) then echo $c >> ../txOrtho/toDoList else echo "warning creating empty $c.edges result" touch ../txOrtho/edges/$c.edges endif end cd .. # Do txOrtho parasol run on high RAM cluster ssh $ramFarm "cd $dir/txOrtho; gensub2 toDoList single template jobList" ssh $ramFarm "cd $dir/txOrtho; para make jobList" ssh $ramFarm "cd $dir/txOrtho; para time > run.time" cat txOrtho/run.time # Completed: 1811 of 1811 jobs # CPU time in finished jobs: 31324s 522.06m 8.70h 0.36d 0.001 y # IO & Wait Time: 9887s 164.79m 2.75h 0.11d 0.000 y # Average job time: 23s 0.38m 0.01h 0.00d # Longest finished job: 316s 5.27m 0.09h 0.00d # Submission to last job: 70626s 1177.10m 19.62h 0.82d # # Filter out some duplicate edges. These are legitimate from txOrtho's point # of view, since they represent two different mouse edges both supporting # a human edge. However, from the human point of view we want only one # support from mouse orthology. Just takes a second. # TODO: assuming true if mouse is target. cd $dir/txOrtho mkdir -p uniqEdges foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) cut -f 1-9 edges/$c.edges | sort | uniq > uniqEdges/$c.edges end cd .. # # testing suggestion: uncomment below # mkdir -p $testingDir # compareModifiedFileSizes.csh $oldGeneDir . # cut -f-3,5,6,8 txOrtho/uniqEdges/chr22.edges >$testingDir/chr22.subset.edges.new # cut -f-3,5,6,8 $oldGeneDir/txOrtho/uniqEdges/chr22.edges \ # >$testingDir/chr22.subset.edges.old # checkRandomLinesExist.py -s $testingDir/chr22.subset.edges.old \ # -d $testingDir/chr22.subset.edges.new # Clean up chains and nets since they are big cd $dir rm -r $xdb/chains $xdb/nets # Get exoniphy. Takes about 4 seconds. hgsql -N $db -e "select chrom, txStart, txEnd, name, "1", strand from exoniphy order by chrom, txStart;" > exoniphy.bed bedToTxEdges exoniphy.bed exoniphy.edges # Add evidence from ests, orthologous other species transcripts, and exoniphy # Takes 36 seconds. 
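# (Added note, not in the original procedure: each txgAddEvidence call in the pipe
# below stamps graph edges with one named evidence source, so the three chained calls
# layer exoniphy, txOrtho and est support onto the mrna/refSeq graphs in turn.
# Optional spot check after the loop; uncomment to use:
# ls graphWithEvidence/*.txg | wc -l
# This should equal the number of sequences in chrom.sizes.)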
mkdir -p graphWithEvidence foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) echo adding evidence for $c if ( -s bedToGraph/$c.txg ) then txgAddEvidence -chrom=$c bedToGraph/$c.txg exoniphy.edges exoniphy stdout \ | txgAddEvidence stdin txOrtho/uniqEdges/$c.edges txOrtho stdout \ | txgAddEvidence stdin est/$c.edges est graphWithEvidence/$c.txg else touch graphWithEvidence/$c.txg endif end # # special testing suggestion: uncomment below # compareModifiedFileSizes.csh $oldGeneDir . # cut -f1-3,5 graphWithEvidence/chr22.txg > $testingDir/chr22.graph.bounds.new # cut -f1-3,5 $oldGeneDir/graphWithEvidence/chr22.txg > $testingDir/chr22.graph.bounds.old # checkRandomLinesExist.py -s $testingDir/chr22.graph.bounds.old \ # -d $testingDir/chr22.graph.bounds.new # # Do txWalk - takes 32 seconds (mostly loading the mrnaSize.tab again and # again...) mkdir -p txWalk if ($db =~ mm*) then set sef = 0.6 else set sef = 0.75 endif foreach c (`awk '{print $1;}' $genomes/$db/chrom.sizes`) txWalk graphWithEvidence/$c.txg trim.weights 3 txWalk/$c.bed \ -evidence=txWalk/$c.ev -sizes=mrnaSize.tab -defrag=0.25 \ -singleExonFactor=$sef end # Make a file that lists the various categories of alt-splicing we see. # Do this by making and analysing splicing graphs of just the transcripts # that have passed our process so far. The txgAnalyze program occasionally # will make a duplicate, which is the reason for the sort/uniq run. # Takes 7 seconds. cat txWalk/*.bed > txWalk.bed txBedToGraph txWalk.bed txWalk txWalk.txg txgAnalyze txWalk.txg $genomes/$db/$db.2bit stdout | sort | uniq > altSplice.bed # Get txWalk transcript sequences. This takes about ten minutes time twoBitToFa $genomes/$db/$db.2bit -bed=txWalk.bed txWalk.fa # 151.150u 373.867s 21:25.82 40.8% 0+0k 0+0io 0pf+0w rm -rf txFaSplit mkdir -p txFaSplit faSplit sequence txWalk.fa 200 txFaSplit/ # # A good time for testing. Uncomment the lines (not all at once) to # compare the line count for files just built in the current version # and the previous version # # compareModifiedFileSizes.csh $oldGeneDir . # # Check that most of the old alt events are still there # checkRandomLinesExist.py -d $oldGeneDir/altSplice.bed -s ./altSplice.bed # # check that most of the old txWalk bed entries overlap some new entry # bedIntersect -aHitAny txWalk.bed $oldGeneDir/txWalk.bed \ # $testingDir/txWalk.intersect.bed # wc $testingDir/txWalk.intersect.bed # # Fetch human protein set and table that describes if curated or not.
# Takes about a minute hgsql -N $spDb -e \ "select p.acc, p.val from protein p, accToTaxon x where x.taxon=$taxon and p.acc=x.acc" \ | awk '{print ">" $1;print $2}' >uniProt.fa faSize uniProt.fa # 55039707 bases (43318105 N's 11721602 real 11721602 upper 0 lower) in 155102 sequences in 1 files hgsql -N $spDb -e "select i.acc,i.isCurated from info i,accToTaxon x where x.taxon=$taxon and i.acc=x.acc" > uniCurated.tab mkdir -p blat/rna/raw echo '#LOOP' > blat/rna/template echo './runTxBlats $(path1) '"$db"' $(root1) {check out line+ raw/ref_$(root1).psl}' >> blat/rna/template echo '#ENDLOOP' >> blat/rna/template cat << '_EOF_' > blat/rna/runTxBlats #!/bin/csh -ef set ooc = /scratch/data/$2/hg38.11.ooc set target = ../../txFaSplit/$1 set out1 = raw/mrna_$3.psl set out2 = raw/ref_$3.psl set tmpDir = /scratch/tmp/$2/ucscGenes/rna set workDir = $tmpDir/$3 mkdir -p $tmpDir rm -rf $workDir mkdir $workDir blat -ooc=$ooc -minIdentity=95 $target ../../mrna.fa $workDir/mrna_$3.psl blat -ooc=$ooc -minIdentity=97 $target ../../refSeq.fa $workDir/ref_$3.psl mv $workDir/mrna_$3.psl raw/mrna_$3.psl mv $workDir/ref_$3.psl raw/ref_$3.psl rmdir $workDir rmdir --ignore-fail-on-non-empty $tmpDir '_EOF_' # << happy emacs chmod +x blat/rna/runTxBlats cd txFaSplit ls -1 *.fa > ../blat/rna/toDoList cd .. ssh $cpuFarm "cd $dir/blat/rna; gensub2 toDoList single template jobList" ssh $cpuFarm "cd $dir/blat/rna; para make jobList" ssh $cpuFarm "cd $dir/blat/rna; para time > run.time" cat blat/rna/run.time # Completed: 196 of 196 jobs # CPU time in finished jobs: 50761s 846.01m 14.10h 0.59d 0.002 y # IO & Wait Time: 18931s 315.52m 5.26h 0.22d 0.001 y # Average job time: 356s 5.93m 0.10h 0.00d # Longest finished job: 2529s 42.15m 0.70h 0.03d # Submission to last job: 2712s 45.20m 0.75h 0.03d # Set up blat jobs for proteins vs. translated txWalk transcripts cd $dir mkdir -p blat/protein/raw echo '#LOOP' > blat/protein/template echo './runTxBlats $(path1) '"$db"' $(root1) {check out line+ raw/ref_$(root1).psl}' >> blat/protein/template echo '#ENDLOOP' >> blat/protein/template cat << '_EOF_' > blat/protein/runTxBlats #!/bin/csh -ef set ooc = /scratch/data/$2/$2.11.ooc set target = ../../txFaSplit/$1 set out1 = uni_$3.psl set out2 = ref_$3.psl set tmpDir = /scratch/tmp/$2/ucscGenes/prot set workDir = $tmpDir/$3 mkdir -p $tmpDir rm -rf $workDir mkdir $workDir blat -t=dnax -q=prot -minIdentity=90 $target ../../uniProt.fa $workDir/$out1 blat -t=dnax -q=prot -minIdentity=90 $target ../../refPep.fa $workDir/$out2 mv $workDir/$out1 raw/$out1 mv $workDir/$out2 raw/$out2 rmdir $workDir rmdir --ignore-fail-on-non-empty $tmpDir '_EOF_' # << happy emacs chmod +x blat/protein/runTxBlats cd txFaSplit ls -1 *.fa > ../blat/protein/toDoList cd .. # Run protein/transcript blat job on cluster ssh $cpuFarm "cd $dir/blat/protein; gensub2 toDoList single template jobList" ssh $cpuFarm "cd $dir/blat/protein; para make jobList" ssh $cpuFarm "cd $dir/blat/protein; para time > run.time" cat blat/protein/run.time # Completed: 196 of 196 jobs # CPU time in finished jobs: 33638s 560.64m 9.34h 0.39d 0.001 y # IO & Wait Time: 29077s 484.61m 8.08h 0.34d 0.001 y # Average job time: 320s 5.33m 0.09h 0.00d # Longest finished job: 694s 11.57m 0.19h 0.01d # Submission to last job: 1534s 25.57m 0.43h 0.02d # Sort and select best alignments. Remove raw files for space. Takes a couple # of hours. Use pslReps not pslCdnaFilter because need -noIntrons flag, # and also working on protein as well as rna alignments. 
The thresholds # for the proteins in particular are quite loose, which is ok because # they will be weighted against each other. We lose some of the refSeq # mappings at tighter thresholds. cd $dir/blat pslCat -nohead rna/raw/ref*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -minAli=0.90 -nearTop=0.005 stdin rna/refSeq.psl /dev/null pslCat -nohead rna/raw/mrna*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -minAli=0.90 -nearTop=0.005 stdin rna/mrna.psl /dev/null pslCat -nohead protein/raw/ref*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -nearTop=0.02 -ignoreSize -minAli=0.85 stdin protein/refSeq.psl /dev/null pslCat -nohead protein/raw/uni*.psl | sort -k 10 | \ pslReps -noIntrons -nohead -nearTop=0.02 -minAli=0.85 stdin protein/uniProt.psl /dev/null rm -r protein/raw cd $dir # Get parts of multiple alignments corresponding to transcripts. # Takes a couple of hours. Could do this in a small cluster job # to speed it up. echo $db $xdb $ydb $zdb > ourOrgs.txt foreach c (`cut -f1 $genomes/$db/chrom.sizes`) echo "doing chrom $c" if (-s txWalk/$c.bed ) then mafFrags $db $multiz txWalk/$c.bed stdout -bed12 -meFirst | mafSpeciesSubset stdin ourOrgs.txt txWalk/$c.maf -keepFirst endif end cd $dir # Create and populate directory with various CDS evidence mkdir -p cdsEvidence cat txWalk/*.maf | txCdsPredict txWalk.fa -nmd=txWalk.bed -maf=stdin cdsEvidence/txCdsPredict.tce txCdsEvFromRna refSeq.fa cds.tab blat/rna/refSeq.psl txWalk.fa \ cdsEvidence/refSeqTx.tce -refStatus=refSeqStatus.tab \ -unmapped=cdsEvidence/refSeqTx.unmapped -exceptions=exceptions.tab txCdsEvFromRna mrna.fa cds.tab blat/rna/mrna.psl txWalk.fa \ cdsEvidence/mrnaTx.tce -mgcStatus=mgcStatus.tab \ -unmapped=cdsEvidence/mrna.unmapped txCdsEvFromProtein refPep.fa blat/protein/refSeq.psl txWalk.fa \ cdsEvidence/refSeqProt.tce -refStatus=refPepStatus.tab \ -unmapped=cdsEvidence/refSeqProt.unmapped \ -exceptions=exceptions.tab -refToPep=refToPep.tab \ -dodgeStop=3 -minCoverage=0.3 txCdsEvFromProtein uniProt.fa blat/protein/uniProt.psl txWalk.fa \ cdsEvidence/uniProt.tce -uniStatus=uniCurated.tab \ -unmapped=cdsEvidence/uniProt.unmapped -source=trembl txCdsEvFromBed ccds.bed ccds txWalk.bed ../../$db.2bit cdsEvidence/ccds.tce cat cdsEvidence/*.tce | sort > unweighted.tce # Merge back in antibodies, and add the small, noncoding genes that shouldn't go # through txWalk because their gene boundaries should not change much. Before # adding them, weed out anything that overlaps a txWalk transcript to avoid # getting duplicate transcripts. bedWeedOverlapping txWalk.bed rfam.syntenic.bed rfam.weeded.bed bedWeedOverlapping txWalk.bed trna.bed trna.weeded.bed cat txWalk.bed antibody.bed trna.weeded.bed rfam.weeded.bed > abWalk.bed sequenceForBed -db=$db -bedIn=antibody.bed -fastaOut=stdout -upCase -keepName > antibody.fa sequenceForBed -db=$db -bedIn=trna.weeded.bed -fastaOut=stdout -upCase -keepName \ > trna.weeded.fa sequenceForBed -db=$db -bedIn=rfam.weeded.bed -fastaOut=stdout -upCase -keepName \ > rfam.weeded.fa cat txWalk.fa antibody.fa trna.weeded.fa rfam.weeded.fa > abWalk.fa # Pick ORFs, make genes #cat refToPep.tab refToCcds.tab | \ cat refToPep.tab | \ txCdsPick abWalk.bed unweighted.tce stdin pick.tce pick.picks \ -exceptionsIn=exceptions.tab \ -exceptionsOut=abWalk.exceptions txCdsToGene abWalk.bed abWalk.fa pick.tce pick.gtf pick.fa \ -bedOut=pick.bed -exceptions=abWalk.exceptions # Create gene info table.
Takes 8 seconds cat mrna/*.unusual refSeq/*.unusual | awk '$5=="flip" {print $6;}' > all.flip cat mrna/*.psl refSeq/*.psl trna.psl rfam.syntenic.psl \ | txInfoAssemble pick.bed pick.tce cdsEvidence/txCdsPredict.tce \ altSplice.bed abWalk.exceptions sizePolyA.tab stdin all.flip prelim.info # Cluster purely based on CDS (in same frame). Takes 1 second txCdsCluster pick.bed pick.cluster # Flag suspicious CDS regions, and add this to info file. Weed out bad CDS. # Map CDS to gene set. Takes 10 seconds. Might want to reconsider using # txCdsWeed here. txCdsSuspect pick.bed txWalk.txg pick.cluster prelim.info pick.suspect pick.info txCdsWeed pick.tce pick.info weededCds.tce weededCds.info # Weeded 5234 of 73003 cds txCdsToGene abWalk.bed abWalk.fa weededCds.tce weededCds.gtf weededCds.faa \ -bedOut=weededCds.bed -exceptions=abWalk.exceptions \ -tweaked=weededCds.tweaked # After txCdsToGene, the transcripts in weededCds.bed might be # slightly different from those in abWalk.bed. Make a new sequence # file, weeded.fa, to replace abWalk.fa. sequenceForBed -db=$db -bedIn=weededCds.bed -fastaOut=weeded.fa \ -upCase -keepName # Separate out transcripts into coding and 4 uncoding categories. # Generate new gene set that weeds out the junkiest. Takes 9 seconds. txGeneSeparateNoncoding weededCds.bed weededCds.info \ coding.bed nearCoding.bed nearCodingJunk.bed antisense.bed uncoding.bed separated.info # coding 53453, codingJunk 14316, nearCoding 8196, junk 6452, antisense 1626, noncoding 26587 awk '$2 != "nearCodingJunk"' separated.info > weeded.info awk '$2 == "nearCodingJunk" {print $1}' separated.info > weeds.lst cat coding.bed nearCoding.bed antisense.bed uncoding.bed | sort -k1,1 -k2,3n >weeded.bed # Make up a little alignment file for the ones that got tweaked. sed -r 's/.*NM_//' weededCds.tweaked | awk '{printf("NM_%s\n", $1);}' > tweakedNm.lst fgrep -f tweakedNm.lst refToPep.tab | cut -f 2 > tweakedNp.lst fgrep -f weededCds.tweaked weededCds.bed > tweakedNm.bed sequenceForBed -db=$db -bedIn=tweakedNm.bed -fastaOut=tweakedNm.fa -upCase -keepName faSomeRecords refPep.fa tweakedNp.lst tweakedNp.fa blat -q=prot -t=dnax -noHead tweakedNm.fa tweakedNp.fa stdout | sort -k 10 > tweaked.psl # Make an alignment file for refSeqs that swaps in the tweaked ones weedLines weededCds.tweaked blat/protein/refSeq.psl refTweaked.psl cat tweaked.psl >> refTweaked.psl # Make precursor to kgProtMap table. This is a psl file that maps just the CDS of the gene # to the genome. # NOTE: had to edit refToPep.tab to change NM_001127457.2 to NM_001127457.1 # NOTE: had to edit refToPep.tab to change NM_004716.3 to NM_004716.2 # NOTE: had to edit refToPep.tab to change NM_003791.3 to NM_003791.2 # NOTE: had to edit refToPep.tab to change NM_002594.4 to NM_002594.3 # NOTE: had to edit refToPep.tab to change NM_001201529.2 to NM_001201529.1 txGeneCdsMap weeded.bed weeded.info pick.picks refTweaked.psl \ refToPep.tab $genomes/$db/chrom.sizes cdsToRna.psl \ rnaToGenome.psl # Missed 337 of 40856 refSeq protein mappings. A small number of RefSeqs just map to genome in the UTR. 
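# (Added note, not in the original procedure: the pslMap call below composes the two
# alignments just made, mapping CDS-to-transcript through transcript-to-genome, and
# yields the CDS-to-genome psl that later, after accession substitution, is loaded as
# kgProtMap2.  Optional integrity check, assuming pslCheck is installed alongside the
# other kent command-line tools; uncomment to use:
# pslCheck cdsToGenome.psl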
pslMap cdsToRna.psl rnaToGenome.psl cdsToGenome.psl # this section is for mapping between assemblies # map hg19 knownGene to hg38 cd $dir set lift = "/gbdb/$oldDb/liftOver/${oldDb}To${Db}.over.chain.gz" genePredToFakePsl $oldDb knownGene $oldDb.kg.psl $oldDb.kg.cds # only keep those IDs that uniquely map to hg38 zcat $lift | pslMap -chainMapFile -swapMap $oldDb.kg.psl stdin stdout | pslCDnaFilter -uniqueMapped stdin stdout | sort -k 14,14 -k 16,16n | pslToBed -cds=$oldDb.kg.cds stdin $oldDb.$db.kg.bed # drop nonUnique: 376 1074 # changed uc010nxr.1 to uc001aab.3 because there were two uc010nxr in hg19 #TODO: figure out better way to deal with txLastId set oldGeneBed=$oldDb.$db.kg.bed cp ~kent/src/hg/txGene/txGeneAccession/txLastId saveLastId # cp saveLastId ~kent/src/hg/txGene/txGeneAccession/txLastId txGeneAccession -test $oldGeneBed ~kent/src/hg/txGene/txGeneAccession/txLastId \ weeded.bed txToAcc.tab oldToNew.tab tawk '{print $4}' oldToNew.tab | sort | uniq -c # 8879 compatible # 69691 exact # 4012 lost # 25608 new # check to make sure we don't have any dups. These two numbers should # be the same. awk '{print $2}' txToAcc.tab | sed 's/\..*//' | sort -u | wc -l # 104178 awk '{print $2}' txToAcc.tab | sed 's/\..*//' | sort | wc -l # 104178 # this should be the current db instead of oldDb if not the first release echo "select * from knownGene" | hgsql $oldDb | sort > $oldDb.knownGene.gp grep lost oldToNew.tab | awk '{print $2}' | sort > lost.txt join lost.txt $oldDb.knownGene.gp > $oldDb.lost.gp awk '{if ($7 == $6) print}' $oldDb.lost.gp | wc -l # non-coding 1234 awk '{if ($7 != $6) print}' $oldDb.lost.gp | wc -l # coding 2778 # Assign permanent accessions to each transcript, and make up a number # of our files with this accession in place of the temporary IDs we've been # using. Takes 4 seconds cd $dir txGeneAccession $oldGeneBed ~kent/src/hg/txGene/txGeneAccession/txLastId \ weeded.bed txToAcc.tab oldToNew.tab subColumn 4 weeded.bed txToAcc.tab ucscGenes.bed subColumn 1 weeded.info txToAcc.tab ucscGenes.info weedLines weeds.lst pick.picks stdout | subColumn 1 stdin txToAcc.tab ucscBadUniprot.picks subColumn 4 coding.bed txToAcc.tab ucscCoding.bed subColumn 4 nearCoding.bed txToAcc.tab ucscNearCoding.bed subColumn 4 antisense.bed txToAcc.tab ucscAntisense.bed subColumn 4 uncoding.bed txToAcc.tab ucscUncoding.bed subColumn 10 cdsToGenome.psl txToAcc.tab ucscProtMap.psl cat txWalk/*.ev | weedLines weeds.lst stdin stdout | subColumn 1 stdin txToAcc.tab ucscGenes.ev # Make files with (a) representative protein and mrna accessions, and (b) the protein # and mrna sequences representing direct transcription/translation of the bed blocks. # The representative proteins and mrnas will be taken from RefSeq for the RefSeq ones, # and derived from our transcripts for the rest. # Load these sequences into database. Takes 17 seconds. txGeneProtAndRna weeded.bed weeded.info weeded.fa weededCds.faa \ refSeq.fa refToPep.tab refPep.fa txToAcc.tab ucscGenes.fa ucscGenes.faa \ ucscGenesTx.fa ucscGenesTx.faa # Generate ucscGene/uniprot blat run.
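# (Added note, not in the original procedure: this run follows the same cluster
# pattern as the earlier rna and protein blats: faSplit the query set, blat each
# chunk, collect the raw psl output.  The difference is that it is protein against
# protein, so plain "blat -prot" with no ooc file is used.  Optional check after the
# run, before the raw directory is removed; uncomment to use:
# wc -l < toDoList
# ls raw/*.psl | wc -l
# The two counts should match.)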
mkdir -p $dir/blat/uniprotVsUcsc cd $dir/blat/uniprotVsUcsc mkdir -p raw rm -rf ucscFaaSplit mkdir ucscFaaSplit faSplit sequence ../../ucscGenes.faa 100 ucscFaaSplit/uc ls -1 ucscFaaSplit/uc*.fa > toDoList echo '#LOOP' > template echo './runBlats $(path1) '"$db"' $(root1) {check out line+ raw/uni_$(root1).psl}' >> template echo '#ENDLOOP' >> template cat << '_EOF_' > runBlats #!/bin/csh -ef set out1 = uni_$3.psl set tmpDir = /scratch/tmp/$2/ucscGenes/uniprotVsUcsc set workDir = $tmpDir/$3 mkdir -p $tmpDir mkdir $workDir blat -prot -minIdentity=90 $1 ../../uniProt.fa $workDir/$out1 mv $workDir/$out1 raw/$out1 rmdir $workDir rmdir --ignore-fail-on-non-empty $tmpDir '_EOF_' # << happy emacs chmod +x runBlats gensub2 toDoList single template jobList # Run protein/transcript blat job on cluster ssh $cpuFarm "cd $dir/blat/uniprotVsUcsc; para make jobList" ssh $cpuFarm "cd $dir/blat/uniprotVsUcsc; para time > run.time" cat run.time #Completed: 97 of 97 jobs #CPU time in finished jobs: 4806s 80.09m 1.33h 0.06d 0.000 y #IO & Wait Time: 280s 4.67m 0.08h 0.00d 0.000 y #Average job time: 52s 0.87m 0.01h 0.00d #Longest finished job: 455s 7.58m 0.13h 0.01d #Submission to last job: 501s 8.35m 0.14h 0.01d pslCat raw/*.psl > ../../ucscVsUniprot.psl rm -r raw # Fix up UniProt links in picks file. This is a little circuitous. In the future we may # avoid using the picks file for the protein link, and rely on protein/protein blat # to do the assignment. The current use of the uniProt protein/ucscGenes mrna alignments # and the whole trip through evidence and picks files is largely historical from when # there was a different CDS selection process. cd $dir txCdsRedoUniprotPicks ucscBadUniprot.picks ucscVsUniprot.psl uniCurated.tab ucscGenes.picks # Cluster the coding and the uncoding sets, and make up canonical and # isoforms tables. Takes 3 seconds. txCdsCluster ucscCoding.bed coding.cluster txBedToGraph ucscUncoding.bed uncoding uncoding.txg -prefix=non txBedToGraph ucscAntisense.bed antisense antisense.txg -prefix=anti cat uncoding.txg antisense.txg > senseAnti.txg txGeneCanonical coding.cluster ucscGenes.info senseAnti.txg ucscGenes.bed ucscNearCoding.bed \ canonical.tab isoforms.tab txCluster.tab # Make up final splicing graph just containing our genes, and final alt splice # table. txBedToGraph ucscGenes.bed ucscGenes ucscGenes.txg txgAnalyze ucscGenes.txg $genomes/$db/$db.2bit stdout | sort | uniq > ucscSplice.bed ##################################################################################### # Now the gene set is built. Time to start loading it into the database, # and generating all the many tables that go on top of known Genes. # We do this initially in a temporary database.
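# (Added note, not in the original procedure: the general intent of the tempDb
# machinery below is to build every knownGene-related table in a scratch database and
# swap it into the public database only at the moveTablesIntoPlace step near the end.
# In this particular run tempDb is simply set to hg38 above, so tables are loaded in
# place.  A quick way to watch progress either way; uncomment to use:
# echo "show tables" | hgsql $tempDb | wc -l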
# Protect databases from overwriting if ( $tempDb =~ "${tempPrefix}*" ) then if ( 2 == `hgsql -N -e "show databases;" mysql | grep -E -c -e "$tempDb|"'^mysql$'` ) then echo "tempDb: '$tempDb' already exists, it should not" exit 255 else echo "creating tempDb: '$tempDb'" # Create temporary database with a few small tables from main database hgsqladmin create $tempDb hgsqldump $db chromInfo | hgsql $tempDb hgsqldump $db trackDb_$user | hgsql $tempDb endif else echo "tempDb does not match $tempPrefix" hgsql -N $tempDb -e "show tables;" | grep -E -c "chromInfo|trackDb_$user" if (2 != `hgsql -N $tempDb -e "show tables;" | grep -E -c "chromInfo|trackDb_$user"` ) then echo "do not find tables chromInfo and trackDb_$user in database '$tempDb'" exit 255 endif echo "tempDb setting: '$tempDb' should not be an existing db" exit 255 endif # Make up knownGenes table, adding uniProt ID. Load into database. # Takes 3 seconds. txGeneFromBed ucscGenes.bed ucscGenes.picks ucscGenes.faa uniProt.fa refPep.fa ucscGenes.gp hgLoadSqlTab -notOnServer $tempDb knownGene $kent/src/hg/lib/knownGene.sql ucscGenes.gp hgLoadBed $tempDb knownAlt ucscSplice.bed # Load in isoforms, canonical, and gene sequence tables hgLoadSqlTab -notOnServer $tempDb knownIsoforms $kent/src/hg/lib/knownIsoforms.sql isoforms.tab hgLoadSqlTab -notOnServer $tempDb knownCanonical $kent/src/hg/lib/knownCanonical.sql canonical.tab hgPepPred $tempDb generic knownGenePep ucscGenes.faa hgPepPred $tempDb generic knownGeneMrna ucscGenes.fa hgPepPred $tempDb generic knownGeneTxPep ucscGenesTx.faa hgPepPred $tempDb generic knownGeneTxMrna ucscGenesTx.fa # Create a bunch of knownToXxx tables according to alignment overlap. # Takes about 3 minutes: cd $dir bedIntersect -minCoverage=1 ucscGenes.bed antibody.bed abGenes.bed cat abGenes.bed |awk '{ print $4}' > abGenes.txt # no ensembl # hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db ensGene knownGene knownToEnsembl hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db wgEncodeGencodeCompV20 knownGene knownToGencodeV20 hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db refGene knownGene knownToRefSeq hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" $db > refToLl.txt hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db refGene knownGene knownToLocusLink -lookup=refToLl.txt # Make up kgXref table. Takes about 3 minutes. time txGeneXref $db $tempDb $spDb ucscGenes.gp ucscGenes.info ucscGenes.picks ucscGenes.ev ucscGenes.xref # 5.078u 4.871s 1:45.41 9.4% 0+0k 0+0io 0pf+0w hgLoadSqlTab -notOnServer $tempDb kgXref $kent/src/hg/lib/kgXref.sql ucscGenes.xref # Update knownToRefSeq to make it consistent with ucscGenes.xref. Prior to # this update, knownToRefSeq contains links to the RefSeq transcript that it # most overlaps. This is a preliminary mapping. txGeneXref generates # more reliable RefSeq associations, mostly from the CDS picks hgsql $tempDb -e "update knownToRefSeq kr, kgXref kx set kr.value = kx.refseq where kr.name = kx.kgID and length(kx.refseq) > 0" # add NR_... RefSeq IDs into kgXref table. hgsql $tempDb \ -e 'update kgXref set refseq=mRNA where mRNA like "NR_%" and refseq=""' # Make up and load kgColor table. Takes about a minute. txGeneColor $spDb ucscGenes.info ucscGenes.picks ucscGenes.color hgLoadSqlTab -notOnServer $tempDb kgColor $kent/src/hg/lib/kgColor.sql ucscGenes.color # Load up kgTxInfo table. Takes 0.3 second hgLoadSqlTab -notOnServer $tempDb kgTxInfo $kent/src/hg/lib/txInfo.sql ucscGenes.info # Make up alias tables and load them. Takes a minute or so. 
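# (Added note, not in the original procedure: txGeneAlias emits one alias per line per
# transcript, covering gene symbols, previous UCSC accessions from oldToNew.tab, and
# mRNA/protein accessions; the sort and uniq below collapse duplicates before the
# kgAlias and kgProtAlias loads.  Optional check after the loads; uncomment to use:
# hgsql $tempDb -N -e 'select count(*) from kgAlias'
# hgsql $tempDb -N -e 'select count(*) from kgProtAlias'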
txGeneAlias $db $spDb ucscGenes.xref ucscGenes.ev oldToNew.tab foo.alias foo.protAlias sort foo.alias | uniq > ucscGenes.alias sort foo.protAlias | uniq > ucscGenes.protAlias rm foo.alias foo.protAlias hgLoadSqlTab -notOnServer $tempDb kgAlias $kent/src/hg/lib/kgAlias.sql ucscGenes.alias hgLoadSqlTab -notOnServer $tempDb kgProtAlias $kent/src/hg/lib/kgProtAlias.sql ucscGenes.protAlias # Load up kgProtMap2 table that says where exons are in terms of CDS hgLoadPsl $tempDb ucscProtMap.psl -table=kgProtMap2 # Create a bunch of knownToXxx tables. Takes about 3 minutes: # TODO: no allenBrainAli # hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db allenBrainAli -type=psl knownGene knownToAllenBrain hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' # TODO: Create knownToTreefam table. mkdir -p $dir/treeFam cd $dir/treeFam wget "http://www.treefam.org/static/download/treefam_family_data.tar.gz" tar xfz treefam_family_data.tar.gz cd treefam_family_data egrep -a ">ENSP[0-9]+" * | cut -d/ -f2 | tr -d : | sed -e 's/.aa.fasta//' |sed -e 's/.cds.fasta//' | grep -v accession | grep -v ^\> | tr '>' '\t' | tawk '{print $2,$1}' > ../ensToTreefam.tab cd .. # hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db ensGene knownGene knownToEnsembl -noLoad #awk '{print $2,$1}' ../knownToEnsembl.tab | sort | uniq > ensTransUcsc.tab hgsql $db -e "select value,name from knownToEnsembl" | sort | uniq > ensTransUcsc.tab echo "select transcript,protein from ensGtp" | hgsql hg38 | sort | uniq | awk '{if (NF==2) print}' > ensTransProt.tab join ensTransUcsc.tab ensTransProt.tab | awk '{if (NF==3)print $3, $2}' | sort | uniq > ensProtToUc.tab join ensProtToUc.tab ensToTreefam.tab | sort -u | awk 'BEGIN {OFS="\t"} {print $2,$3}' | sort -u > knownToTreefam.tab hgLoadSqlTab $tempDb knownToTreefam $kent/src/hg/lib/knownTo.sql knownToTreefam.tab #end not done if ($db =~ hg*) then # TODO #hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db HInvGeneMrna knownGene knownToHInv #hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyU133Plus2 knownGene knownToU133Plus2 #hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyU133 knownGene knownToU133 #hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyU95 knownGene knownToU95 mkdir hprd cd hprd wget "http://www.hprd.org/edownload/HPRD_FLAT_FILES_041310" tar xvf HPRD_FLAT_FILES_041310.tar.gz knownToHprd $tempDb FLAT_FILES_072010/HPRD_ID_MAPPINGS.txt hgsql $tempDb -e "delete k from knownToHprd k, kgXref x where k.name = x.kgID and x.geneSymbol = 'abParts'" endif if ($db =~ hg*) then time hgExpDistance $tempDb hgFixed.gnfHumanU95MedianRatio \ hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 time hgExpDistance $tempDb hgFixed.gnfHumanAtlas2MedianRatio \ hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnfAtlas2 endif if ($db =~ mm*) then hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyGnf1m knownGene knownToGnf1m hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyU74 knownGene knownToU74 hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyMOE430 knownGene knownToMOE430 hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyMOE430 -prefix=A: knownGene knownToMOE430A hgExpDistance $tempDb $db.affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74 hgExpDistance $tempDb $db.affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74 hgExpDistance $tempDb 
$db.affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74 hgExpDistance $tempDb hgFixed.gnfMouseAtlas2MedianRatio \ hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m endif # Update visiGene stuff knownToVisiGene $tempDb -probesDb=$db hgsql $tempDb -e "delete k from knownToVisiGene k, kgXref x where k.name = x.kgID and x.geneSymbol = 'abParts'" # Create Human P2P protein-interaction Gene Sorter columns if ($db =~ hg*) then hgLoadNetDist $genomes/hg19/p2p/hprd/hprd.pathLengths $tempDb humanHprdP2P \ -sqlRemap="select distinct value, name from knownToHprd" hgLoadNetDist $genomes/hg19/p2p/vidal/humanVidal.pathLengths $tempDb humanVidalP2P \ -sqlRemap="select distinct locusLinkID, kgID from $db.refLink,kgXref where $db.refLink.mrnaAcc = kgXref.mRNA" hgLoadNetDist $genomes/hg19/p2p/wanker/humanWanker.pathLengths $tempDb humanWankerP2P -sqlRemap="select distinct locusLinkID, kgID from $db.refLink,kgXref where $db.refLink.mrnaAcc = kgXref.mRNA" endif # Run nice Perl script to make all protein blast runs for # Gene Sorter and Known Genes details page. Takes about # 45 minutes to run. rm -rf $dir/hgNearBlastp mkdir $dir/hgNearBlastp cd $dir/hgNearBlastp cat << _EOF_ > config.ra # Latest human vs. other Gene Sorter orgs: # mouse, rat, zebrafish, worm, yeast, fly targetGenesetPrefix known targetDb $tempDb queryDbs $xdb $ratDb $fishDb $flyDb $wormDb $yeastDb ${tempDb}Fa $tempFa ${xdb}Fa $xdbFa ${ratDb}Fa $ratFa ${fishDb}Fa $fishFa ${flyDb}Fa $flyFa ${wormDb}Fa $wormFa ${yeastDb}Fa $yeastFa buildDir $dir/hgNearBlastp scratchDir $scratchDir/brHgNearBlastp _EOF_ rm -rf $scratchDir/brHgNearBlastp doHgNearBlastp.pl -noLoad -clusterHub=ku -distrHost=hgwdev -dbHost=hgwdev -workhorse=hgwdev config.ra |& tee do.log # Load self cd $dir/hgNearBlastp/run.$tempDb.$tempDb ./loadPairwise.csh # Load mouse and rat cd $dir/hgNearBlastp/run.$tempDb.$xdb hgLoadBlastTab $tempDb $xBlastTab -maxPer=1 out/*.tab cd $dir/hgNearBlastp/run.$tempDb.$ratDb hgLoadBlastTab $tempDb $rnBlastTab -maxPer=1 out/*.tab # Remove non-syntenic hits for mouse and rat # Takes a few minutes # stopped here... need hg38 to rn4 mkdir -p /gbdb/$tempDb/liftOver rm -f /gbdb/$tempDb/liftOver/${tempDb}To$RatDb.over.chain.gz /gbdb/$tempDb/liftOver/${tempDb}To$Xdb.over.chain.gz ln -s $genomes/$db/bed/liftOver/${db}To$RatDb.over.chain.gz \ /gbdb/$tempDb/liftOver/${tempDb}To$RatDb.over.chain.gz ln -s $genomes/$db/bed/liftOver/${db}To${Xdb}.over.chain.gz \ /gbdb/$tempDb/liftOver/${tempDb}To$Xdb.over.chain.gz # delete non-syntenic genes from rat and mouse blastp tables cd $dir/hgNearBlastp synBlastp.csh $tempDb $xdb # old number of unique query values: 65755 # old number of unique target values 22343 # new number of unique query values: 60984 # new number of unique target values 21844 # hgsql -e "select count(*) from mmBlastTab\G" $oldDb | tail -n +2 # count(*): 72823 hgsql -e "select count(*) from mmBlastTab\G" $tempDb | tail -n +2 # count(*): 60984 synBlastp.csh $tempDb $ratDb knownGene rgdGene2 # old number of unique query values: 58734 # old number of unique target values 10138 # new number of unique query values: 31066 # new number of unique target values 7689 hgsql -e "select count(*) from rnBlastTab\G" $oldDb | tail -n +2 # count(*): 28372 hgsql -e "select count(*) from rnBlastTab\G" $tempDb | tail -n +2 # count(*): 31066 # Make reciprocal best subset for the blastp pairs that are too # Far for synteny to help # Us vs. 
fish cd $dir/hgNearBlastp set aToB = run.$tempDb.$fishDb set bToA = run.$fishDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb drBlastTab $aToB/recipBest.tab # TODO: what's this doing here? # hgLoadBlastTab $fishDb tfBlastTab $bToA/recipBest.tab hgsql -e "select count(*) from drBlastTab\G" $oldDb | tail -n +2 # count(*): 13111 hgsql -e "select count(*) from drBlastTab\G" $tempDb | tail -n +2 # count(*): 13031 # Us vs. fly cd $dir/hgNearBlastp set aToB = run.$tempDb.$flyDb set bToA = run.$flyDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb dmBlastTab $aToB/recipBest.tab hgsql -e "select count(*) from dmBlastTab\G" $oldDb | tail -n +2 # count(*): 5975 hgsql -e "select count(*) from dmBlastTab\G" $tempDb | tail -n +2 # count(*): 5974 # Us vs. worm cd $dir/hgNearBlastp set aToB = run.$tempDb.$wormDb set bToA = run.$wormDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb ceBlastTab $aToB/recipBest.tab hgsql -e "select count(*) from ceBlastTab\G" $oldDb | tail -n +2 # count(*): 4948 hgsql -e "select count(*) from ceBlastTab\G" $tempDb | tail -n +2 # count(*): 4950 # Us vs. yeast cd $dir/hgNearBlastp set aToB = run.$tempDb.$yeastDb set bToA = run.$yeastDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb scBlastTab $aToB/recipBest.tab hgsql -e "select count(*) from scBlastTab\G" $oldDb | tail -n +2 # count(*): 2365 hgsql -e "select count(*) from scBlastTab\G" $tempDb | tail -n +2 # count(*): 2366 # Clean up cd $dir/hgNearBlastp cat run.$tempDb.$tempDb/out/*.tab | gzip -c > run.$tempDb.$tempDb/all.tab.gz gzip run.*/all.tab # make knownToMalacards mkdir -p $dir/malacards cd $dir/malacards # grab files that b0b came up with somehow cp /hive/users/kuhn/tracks/malacards/malacardsUcscGenesExport.csv . 
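# (Added note, not in the original procedure: the pipeline below takes the first CSV
# column, which holds gene symbols that have a MalaCards entry, joins it against a
# geneSymbol-to-kgId map pulled from kgXref, and loads the resulting kgId/symbol
# pairs with the generic knownTo.sql schema.  Optional check; uncomment to use:
# wc -l malacardExists.txt knownToMalacards.tab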
awk 'BEGIN {FS=","} {print $1}' malacardsUcscGenesExport.csv | sort -u > malacardExists.txt hgsql -e "select geneSymbol,kgId from kgXref" --skip-column-names hg38 | awk '{if (NF == 2) print}' | sort > geneSymbolToKgId.txt join malacardExists.txt geneSymbolToKgId.txt | awk 'BEGIN {OFS="\t"} {print $2,$1}' | sort > knownToMalacards.tab hgLoadSqlTab -notOnServer $tempDb knownToMalacards $kent/src/hg/lib/knownTo.sql knownToMalacards.tab # make knownToLynx mkdir -p $dir/lynx cd $dir/lynx wget "http://lynx.ci.uchicago.edu/downloads/LYNX_GENES.tab" awk '{print $2}' LYNX_GENES.tab | sort > lynxExists.txt hgsql -e "select geneSymbol,kgId from kgXref" --skip-column-names hg38 | awk '{if (NF == 2) print}' | sort > geneSymbolToKgId.txt join lynxExists.txt geneSymbolToKgId.txt | awk 'BEGIN {OFS="\t"} {print $2,$1}' | sort > knownToLynx.tab hgLoadSqlTab -notOnServer $tempDb knownToLynx $kent/src/hg/lib/knownTo.sql knownToLynx.tab # make knownToNextProt mkdir -p $dir/nextProt cd $dir/nextProt wget "ftp://ftp.nextprot.org/pub/current_release/ac_lists/nextprot_ac_list_all.txt" awk '{print $0, $0}' nextprot_ac_list_all.txt | sed 's/NX_//' | sort > displayIdToNextProt.txt hgsql -e "select spID,kgId from kgXref" --skip-column-names hg38 | awk '{if (NF == 2) print}' | sort > displayIdToKgId.txt join displayIdToKgId.txt displayIdToNextProt.txt | awk 'BEGIN {OFS="\t"} {print $2,$3}' > knownToNextProt.tab hgLoadSqlTab -notOnServer $tempDb knownToNextProt $kent/src/hg/lib/knownTo.sql knownToNextProt.tab # make knownToWikipedia mkdir -p $dir/wikipedia cd $dir/wikipedia hgsql hg19 -e "select * from knownToWikipedia" | tail -n +2 | sort > oldKnownToWikipedia.tab hgsql hg38 -e "select oldId,newId from kg7ToKg8" | tail -n +2 | sort > kg7ToKg8.tab join kg7ToKg8.tab oldKnownToWikipedia.tab | awk 'BEGIN {OFS="\t"} {print $2,$3}' | sort | uniq | awk '{if ((NF == 2) && (haveSeen[$1] == 0)) print;haveSeen[$1]=1}' > knownToWikipedia.tab hgLoadSqlTab $tempDb knownToWikipedia $kent/src/hg/lib/knownToWikipedia.sql knownToWikipedia.tab # MAKE FOLDUTR TABLES # First set up directory structure and extract UTR sequence on hgwdev cd $dir mkdir -p rnaStruct cd rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold # these commands take some significant time utrFa $tempDb knownGene utr3 utr3/utr.fa utrFa $tempDb knownGene utr5 utr5/utr.fa # Split up files and make files that define job. faSplit sequence utr3/utr.fa 10000 utr3/split/s faSplit sequence utr5/utr.fa 10000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat << '_EOF_' > template #LOOP rnaFoldBig split/$(path1) fold #ENDLOOP '_EOF_' gensub2 in.lst single template jobList cp template ../utr5 cd ../utr5 gensub2 in.lst single template jobList # Do cluster runs for UTRs ssh $cpuFarm "cd $dir/rnaStruct/utr3; para make jobList" ssh $cpuFarm "cd $dir/rnaStruct/utr5; para make jobList" # Load database cd $dir/rnaStruct/utr5 hgLoadRnaFold $tempDb foldUtr5 fold cd ../utr3 hgLoadRnaFold -warnEmpty $tempDb foldUtr3 fold # Clean up rm -r split fold err batch.bak cd ../utr5 rm -r split fold err batch.bak # Make pfam run. Actual cluster run is about 6 hours. 
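# (Added note, not in the original procedure: the Pfam run below follows the usual
# cluster pattern: faSplit ucscGenes.faa into roughly 10000 chunks, run hmmsearch on
# each chunk against Pfam-A.hmm with --domtblout, then filter the collected domain
# hits by E-value.  Optional check that the Pfam-A download is complete before
# launching the run; uncomment to use:
# grep -c '^NAME' /hive/data/outside/pfam/Pfam27.0/Pfam-A.hmm
# This prints the number of models in the library, one NAME line per model.)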
mkdir -p /hive/data/outside/pfam/Pfam27.0 cd /hive/data/outside/pfam/Pfam27.0 wget ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz gunzip Pfam-A.hmm.gz #set pfamScratch = $scratchDir/pfamBR #ssh $cpuFarm mkdir -p $pfamScratch #ssh $cpuFarm cp /hive/data/outside/pfam/Pfam26.0/Pfam-A.hmm $pfamScratch mkdir -p $dir/pfam cd $dir/pfam rm -rf splitProt mkdir splitProt faSplit sequence $dir/ucscGenes.faa 10000 splitProt/ mkdir -p result ls -1 splitProt > prot.list # /hive/data/outside/pfam/hmmpfam -E 0.1 /hive/data/outside/pfam/current/Pfam_fs \ cat << '_EOF_' > doPfam #!/bin/csh -ef /hive/data/outside/pfam/Pfam27.0/PfamScan/hmmer-3.1b1/src/hmmsearch --domtblout /scratch/tmp/pfam.$2.pf --noali -o /dev/null -E 0.1 /hive/data/outside/pfam/Pfam27.0/Pfam-A.hmm splitProt/$1 mv /scratch/tmp/pfam.$2.pf $3 '_EOF_' # << happy emacs chmod +x doPfam cat << '_EOF_' > template #LOOP doPfam $(path1) $(root1) {check out line+ result/$(root1).pf} #ENDLOOP '_EOF_' gensub2 prot.list single template jobList ssh $cpuFarm "cd $dir/pfam; para make jobList" ssh $cpuFarm "cd $dir/pfam; para time > run.time" cat run.time # Completed: 9667 of 9667 jobs # CPU time in finished jobs: 2335576s 38926.27m 648.77h 27.03d 0.074 y # IO & Wait Time: 676819s 11280.31m 188.01h 7.83d 0.021 y # Average job time: 312s 5.19m 0.09h 0.00d # Longest finished job: 484s 8.07m 0.13h 0.01d # Submission to last job: 47429s 790.48m 13.17h 0.55d # Make up pfamDesc.tab by converting pfam to a ra file first cat << '_EOF_' > makePfamRa.awk /^NAME/ {print} /^ACC/ {print} /^DESC/ {print; printf("\n");} '_EOF_' awk -f makePfamRa.awk /hive/data/outside/pfam/Pfam27.0/Pfam-A.hmm > pfamDesc.ra raToTab -cols=ACC,NAME,DESC pfamDesc.ra stdout | awk -F '\t' '{printf("%s\t%s\t%s\n", gensub(/\.[0-9]+/, "", "g", $1), $2, $3);}' > pfamDesc.tab # Convert output to tab-separated file. cd $dir/pfam catDir result | sed '/^#/d' | awk 'BEGIN {OFS="\t"} {if ($7 < 0.0001) print $1,$18-1,$19,$4,$7}' | sort > ucscPfam.tab cd $dir # Convert output to knownToPfam table awk '{printf("%s\t%s\n", $2, gensub(/\.[0-9]+/, "", "g", $1));}' \ pfam/pfamDesc.tab > sub.tab cut -f 1,4 pfam/ucscPfam.tab | subColumn 2 stdin sub.tab stdout | sort -u > knownToPfam.tab rm -f sub.tab hgLoadSqlTab -notOnServer $tempDb knownToPfam $kent/src/hg/lib/knownTo.sql knownToPfam.tab hgLoadSqlTab -notOnServer $tempDb pfamDesc $kent/src/hg/lib/pfamDesc.sql pfam/pfamDesc.tab hgsql $tempDb -e "delete k from knownToPfam k, kgXref x where k.name = x.kgID and x.geneSymbol = 'abParts'" cd $dir/pfam genePredToFakePsl hg38 knownGene knownGene.psl cdsOut.tab sort cdsOut.tab | sed 's/\.\./ /' > sortCdsOut.tab sort ucscPfam.tab> sortPfam.tab awk '{print $10, $11}' knownGene.psl > gene.sizes join sortCdsOut.tab sortPfam.tab | awk '{print $1, $2 + 3 * $4, $2 + 3 * $5, $6}' | bedToPsl gene.sizes stdin domainToGene.psl pslMap domainToGene.psl knownGene.psl stdout | sort | uniq | pslToBed stdin domainToGenome.bed hgLoadBed $tempDb ucscGenePfam domainToGenome.bed # Do scop run. 
Takes about 6 hours mkdir /hive/data/outside/scop/1.75 cd /hive/data/outside/scop/1.75 wget "ftp://license:SlithyToves@supfam.org/models/hmmlib_1.75.gz" gunzip hmmlib_1.75.gz wget "ftp://license:SlithyToves@supfam.org/models/model.tab.gz" gunzip model.tab.gz mkdir -p $dir/scop cd $dir/scop rm -rf result mkdir result ls -1 ../pfam/splitProt > prot.list cat << '_EOF_' > doScop #!/bin/csh -ef /hive/data/outside/pfam/Pfam27.0/PfamScan/hmmer-3.1b1/src/hmmsearch --domtblout /scratch/tmp/scop.$2.pf --noali -o /dev/null -E 0.1 /hive/data/outside/scop/1.75/hmmlib_1.75 ../pfam/splitProt/$1 mv /scratch/tmp/scop.$2.pf $3 '_EOF_' # << happy emacs chmod +x doScop cat << '_EOF_' > template #LOOP doScop $(path1) $(root1) {check out line+ result/$(root1).pf} #ENDLOOP '_EOF_' # << happy emacs gensub2 prot.list single template jobList ssh $cpuFarm "cd $dir/scop; para make jobList" ssh $cpuFarm "cd $dir/scop; para time > run.time" cat run.time # Completed: 9667 of 9667 jobs # CPU time in finished jobs: 2398927s 39982.12m 666.37h 27.77d 0.076 y # IO & Wait Time: 1142244s 19037.40m 317.29h 13.22d 0.036 y # Average job time: 366s 6.11m 0.10h 0.00d # Longest finished job: 585s 9.75m 0.16h 0.01d # Submission to last job: 38784s 646.40m 10.77h 0.45d # # Convert scop output to tab-separated files cd $dir catDir scop/result | sed '/^#/d' | awk 'BEGIN {OFS="\t"} {if ($7 <= 0.0001) print $1,$18-1,$19,$4, $7,$8}' | sort > scopPlusScore.tab sort -k 2 /hive/data/outside/scop/1.75/model.tab > scop.model.tab scopCollapse scopPlusScore.tab scop.model.tab ucscScop.tab \ scopDesc.tab knownToSuper.tab hgLoadSqlTab -notOnServer $tempDb scopDesc $kent/src/hg/lib/scopDesc.sql scopDesc.tab hgLoadSqlTab $tempDb knownToSuper $kent/src/hg/lib/knownToSuper.sql knownToSuper.tab hgsql $tempDb -e "delete k from knownToSuper k, kgXref x where k.gene = x.kgID and x.geneSymbol = 'abParts'" hgLoadSqlTab $tempDb ucscScop $kent/src/hg/lib/ucscScop.sql ucscScop.tab # Regenerate ccdsKgMap table # TODO: didn't do $kent/src/hg/makeDb/genbank/bin/x86_64/mkCcdsGeneMap -db=$tempDb -loadDb $db.ccdsGene knownGene ccdsKgMap cd $dir # Map old to new mapping set oldGeneBed=$oldDb.$db.kg.bed txGeneExplainUpdate2 $oldGeneBed ucscGenes.bed kgOldToNew.tab hgLoadSqlTab -notOnServer $tempDb kg${lastVer}ToKg${curVer} $kent/src/hg/lib/kg1ToKg2.sql kgOldToNew.tab # add kg${lastVer}ToKg${curVer} to all.joiner !!!! # Build kgSpAlias table, which combines content of both kgAlias and kgProtAlias tables. 
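# (Added note, not in the original procedure: the two queries below each emit
# kgID/spID/alias triples, one joined through kgAlias and the other through
# kgProtAlias, and the sort -u merges them into the single kgSpAlias table.
# Optional check after the load; uncomment to use:
# hgsql $tempDb -N -e 'select count(*) from kgSpAlias'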
hgsql $tempDb -N -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' > kgSpAlias_0.tmp hgsql $tempDb -N -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >> kgSpAlias_0.tmp cat kgSpAlias_0.tmp|sort -u > kgSpAlias.tab rm kgSpAlias_0.tmp hgLoadSqlTab -notOnServer $tempDb kgSpAlias $kent/src/hg/lib/kgSpAlias.sql kgSpAlias.tab # Do BioCyc Pathways build mkdir $dir/bioCyc cd $dir/bioCyc grep -E -v "^#" $bioCycDir/genes.col > genes.tab grep -E -v "^#" $bioCycDir/pathways.col | awk -F'\t' '{if (140 == NF) { printf "%s\t\t\n", $0; } else { print $0}}' > pathways.tab kgBioCyc1 -noEnsembl genes.tab pathways.tab hg38 bioCycPathway.tab bioCycMapDesc.tab hgLoadSqlTab $tempDb bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab hgLoadSqlTab $tempDb bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab # Do KEGG Pathways build (borrowing Fan Hsu's strategy from hg38.txt) mkdir -p $dir/kegg cd $dir/kegg # Make the keggMapDesc table, which maps KEGG pathway IDs to descriptive names cp /cluster/data/hg19/bed/ucsc.13/kegg/map_title.tab . # wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab cat map_title.tab | sed -e 's/\t/\thsa\t/' > j.tmp cut -f 2 j.tmp >j.hsa cut -f 1,3 j.tmp >j.1 paste j.hsa j.1 |sed -e 's/\t//' > keggMapDesc.tab rm j.hsa j.1 j.tmp hgLoadSqlTab -notOnServer $tempDb keggMapDesc $kent/src/hg/lib/keggMapDesc.sql keggMapDesc.tab # In the following two-step process, build/load a table that maps UCSC Gene IDs # to LocusLink IDs and to KEGG pathways. First, make a table that maps # LocusLink IDs to KEGG pathways from the downloaded data. Store it temporarily # in the keggPathway table, overloading the schema. cp /cluster/data/hg19/bed/ucsc.14.3/kegg/hsa_pathway.list . cat hsa_pathway.list| sed -e 's/path://'|sed -e 's/:/\t/' > j.tmp hgLoadSqlTab -notOnServer $tempDb keggPathway $kent/src/hg/lib/keggPathway.sql j.tmp # Next, use the temporary contents of the keggPathway table to join with # knownToLocusLink, creating the real content of the keggPathway table. # Load this data, erasing the old temporary content hgsql $tempDb -B -N -e 'select distinct name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' > keggPathway.tab hgLoadSqlTab -notOnServer $tempDb \ keggPathway $kent/src/hg/lib/keggPathway.sql keggPathway.tab # Finally, update the knownToKeggEntrez table from the keggPathway table.
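# (Added note, not in the original procedure: the query below pulls kgId, mapID and
# locusID out of the joined keggPathway table, and the sed collapses the middle
# columns so the third field comes out in mapID+locusID form.  Optional check after
# the load; uncomment to use:
# hgsql $tempDb -N -e 'select count(*) from knownToKeggEntrez'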
hgsql $tempDb -B -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' |sort -u| sed -e 's/\t+\t/+/' > knownToKeggEntrez.tab hgLoadSqlTab -notOnServer $tempDb knownToKeggEntrez $kent/src/hg/lib/knownToKeggEntrez.sql knownToKeggEntrez.tab hgsql $tempDb -e "delete k from knownToKeggEntrez k, kgXref x where k.name = x.kgID and x.geneSymbol = 'abParts'" # Make spMrna table cd $dir hgsql $db -N -e "select spDisplayID,kgID from kgXref where spDisplayID != ''" > spMrna.tab; hgLoadSqlTab $tempDb spMrna $kent/src/hg/lib/spMrna.sql spMrna.tab # Do CGAP tables mkdir -p $dir/cgap cd $dir/cgap wget --timestamping -O Hs_GeneData.dat "ftp://ftp1.nci.nih.gov/pub/CGAP/Hs_GeneData.dat" hgCGAP Hs_GeneData.dat cat cgapSEQUENCE.tab cgapSYMBOL.tab cgapALIAS.tab|sort -u > cgapAlias.tab hgLoadSqlTab -notOnServer $tempDb cgapAlias $kent/src/hg/lib/cgapAlias.sql ./cgapAlias.tab hgLoadSqlTab -notOnServer $tempDb cgapBiocPathway $kent/src/hg/lib/cgapBiocPathway.sql ./cgapBIOCARTA.tab cat cgapBIOCARTAdesc.tab|sort -u > cgapBIOCARTAdescSorted.tab hgLoadSqlTab -notOnServer $tempDb cgapBiocDesc $kent/src/hg/lib/cgapBiocDesc.sql cgapBIOCARTAdescSorted.tab cd $dir # Make PCR target for UCSC Genes, Part 1. # 1. Get a set of IDs that consist of the UCSC Gene accession concatenated with the # gene symbol, e.g. uc010nxr.1__DDX11L1 hgsql $tempDb -N -e 'select kgId,geneSymbol from kgXref' \ | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \ > idSub.txt # 2. Get a file of per-transcript fasta sequences that contain the sequences of each UCSC Genes transcript, with this new ID in the place of the UCSC Genes accession. Convert that file to TwoBit format and soft-link it into /gbdb/hg38/targetDb/ +### NEXT TIME use same name in blatServers table as file name!!! subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed sequenceForBed -keepName -db=$db -bedIn=ucscGenesIdSubbed.bed -fastaOut=stdout | faToTwoBit stdin kgTargetSeq${curVer}.2bit mkdir -p /gbdb/$db/targetDb/ rm -f /gbdb/$db/targetDb/kgTargetSeq${curVer}.2bit ln -s $dir/kgTargetSeq${curVer}.2bit /gbdb/$db/targetDb/ # Load the table kgTargetAli, which shows where in the genome these targets are. cut -f 1-10 ucscGenes.gp | genePredToFakePsl $tempDb stdin kgTargetAli.psl /dev/null hgLoadPsl $tempDb kgTargetAli.psl # # At this point we should save a list of the tables in tempDb!!! echo "show tables" | hgsql $tempDb > tablesInKnownGene.lst # NOW SWAP IN TABLES FROM TEMP DATABASE TO MAIN DATABASE. # You'll need superuser powers for this step..... cd $dir # Drop tempDb history table and chromInfo, we don't want to swap them in! hgsql -e "drop table history" $tempDb hgsql -e "drop table chromInfo" $tempDb cat << _EOF_ > moveTablesIntoPlace # Save old known genes and kgXref tables sudo ~kent/bin/copyMysqlTable $db knownGene $tempDb knownGeneOld$lastVer sudo ~kent/bin/copyMysqlTable $db kgXref $tempDb kgXrefOld$lastVer # Create backup database hgsqladmin create ${db}BackupBraney2 # Swap in new tables, moving old tables to backup database. sudo ~kent/bin/swapInMysqlTempDb $tempDb $db ${db}BackupBraney2 _EOF_ # Update database links. sudo rm /var/lib/mysql/uniProt sudo ln -s /var/lib/mysql/$spDb /var/lib/mysql/uniProt sudo rm /var/lib/mysql/proteome sudo ln -s /var/lib/mysql/$pbDb /var/lib/mysql/proteome hgsqladmin flush-tables # Make full text index. Takes a minute or so. After this the genome browser # tracks display will work including the position search. The genes details # page, gene sorter, and proteome browser still need more tables. 
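# (Added note, not in the original procedure: hgKgGetText gathers the searchable text
# for every gene into knownGene.text, and ixIxx builds the index (.ix) plus its
# second-level offset file (.ixx) that position search reads through the /gbdb
# symlinks made below.  Optional check once the symlinks are in place; uncomment:
# ls -lL /gbdb/$db/knownGene.ix /gbdb/$db/knownGene.ixx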
mkdir -p $dir/index cd $dir/index hgKgGetText $db knownGene.text ixIxx knownGene.text knownGene.ix knownGene.ixx rm -f /gbdb/$db/knownGene.ix /gbdb/$db/knownGene.ixx ln -s $dir/index/knownGene.ix /gbdb/$db/knownGene.ix ln -s $dir/index/knownGene.ixx /gbdb/$db/knownGene.ixx +### NEXT TIME use same name in blatServers table as file name!!! # 3. Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on # /gbdb/$db/targetDb/kgTargetSeq${curVer}.2bit +### NEXT TIME use same name in blatServers table as file name!!! + # 4. On hgwdev, insert new records into blatServers and targetDb, using the # host (field 2) and port (field 3) specified by cluster-admin. Identify the # blatServer by the keyword "$db"Kg with the version number appended hgsql hgcentraltest -e \ 'INSERT into blatServers values ("hg38Kgv15", "blat4b", 17787, 0, 1);' hgsql hgcentraltest -e \ 'INSERT into targetDb values("hg38Kgv15", "UCSC Genes", \ "hg38", "kgTargetAli", "", "", \ "/gbdb/hg38/targetDb/kgTargetSeq8.2bit", 1, now(), "");' # ## ## WRAP-UP # # add database to the db's in kent/src/hg/visiGene/vgGetText cd $dir # # Finally, need to wait until after testing, but update databases in other organisms # with blastTabs # Load blastTabs cd $dir/hgNearBlastp hgLoadBlastTab $xdb $blastTab run.$xdb.$tempDb/out/*.tab hgLoadBlastTab $ratDb $blastTab run.$ratDb.$tempDb/out/*.tab hgLoadBlastTab $flyDb $blastTab run.$flyDb.$tempDb/recipBest.tab hgLoadBlastTab $wormDb $blastTab run.$wormDb.$tempDb/recipBest.tab hgLoadBlastTab $yeastDb $blastTab run.$yeastDb.$tempDb/recipBest.tab # Do synteny on mouse/human/rat synBlastp.csh $xdb $db # old number of unique query values: 43110 # old number of unique target values 22769 # new number of unique query values: 35140 # new number of unique target values 20138 synBlastp.csh $ratDb $db rgdGene2 knownGene #old number of unique query values: 11205 #old number of unique target values 10791 #new number of unique query values: 7854 #new number of unique target values 7935 # need to generate multiz downloads #/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz46way/alignments/knownCanonical.exonAA.fa.gz #/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz46way/alignments/knownCanonical.exonNuc.fa.gz #/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz46way/alignments/knownGene.exonAA.fa.gz #/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz46way/alignments/knownGene.exonNuc.fa.gz #/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz46way/alignments/md5sum.txt echo echo "see the bottom of the script for details about knownToWikipedia" echo # Clean up rm -r run.*/out # Last step in setting up isPCR: after the new UCSC Genes with the new Known Gene isPcr # is released, take down the old isPcr gfServer ####################### ### The following is the process Brian Lee used to pull out only # the genes from knownToLocusLink for which there are Wikipedia articles. ### get the full knownToLocusLink table # hgsql -Ne 'select value from knownToLocusLink' hg38 | sort -u >> knToLocusLink ### query Wikipedia for each to see if there is an article # for i in $(cat knToLocusLink); do lynx -dump "http://genewiki.sulab.org/map/wiki/"$i | grep -m 1 "no results" >trash ; echo $?
$i | grep "1 "| awk '{print $2}'>> workingLinks; done ### pull out all isoforms that have permitted LocusLinkIds # for i in $(cat workingLinks); do hgsql -Ne 'select * from knownToLocusLink where value like "'$i'"' hg38 >> knownToWikipediaNew; done ### then load the table as knownToWikipedia using the knownToLocusLink INDICES.
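# (A sketch of that final load, not part of the original notes: assuming the
# two-column output collected in knownToWikipediaNew is what the table should hold,
# it could be loaded the same way knownToWikipedia.tab was loaded earlier in this
# script:
# hgLoadSqlTab $tempDb knownToWikipedia $kent/src/hg/lib/knownToWikipedia.sql knownToWikipediaNew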