aa80d65a1312c44a1c41488390f324f76beb8984
Merge parents c04d210 3a4325f
max
  Tue Apr 5 04:35:24 2022 -0700
Adding Jarvis makedoc. Merge branch 'master' of hgwdev.gi.ucsc.edu:/data/git/kent

Conflicts:
src/hg/makeDb/doc/hg19.txt

diff --cc src/hg/makeDb/doc/hg19.txt
index c55544d,2066151..9fd19c6
--- src/hg/makeDb/doc/hg19.txt
+++ src/hg/makeDb/doc/hg19.txt
@@@ -1,34773 -1,34816 +1,34823 @@@
  # for emacs: -*- mode: sh; -*-
  
  # This file describes how we made the browser database on
  # NCBI build 37 (February 2009 freeze) aka:
  #	GRCh37 - Genome Reference Consortium Human Reference 37
  #	Assembly Accession: GCA_000001405.1
  
  #	"$Id: hg19.txt,v 1.118 2010/06/10 16:34:40 chinhli Exp $";
  
  #############################################################################
  
  # NOTE FOR NEXT HUMAN ASSEMBLY (2009-07-29 - Brooke): hg19 contains the wrong
  # sequence for chrM. The accession NC_001807 was replaced in GenBank with
  # NC_012920, with the note: "This sequence was removed since the accepted
  # reference sequence for the Homo sapiens mitochondrion is the rCRS/Mitomap
  # sequence, which is now available as the record NC_012920".
  # Also, from http://www.mitomap.org/mitoseq.html:
  # "IMPORTANT:  Do not use NC_001807 as "the rCRS" as it is an African
  # (Yoruban) sequence with over 40 variant nucleotides from the rCRS. As of
  # July 8, 2009 it has been removed from GenBank as a reference sequence but
  # may be found, if needed, as AF347015, one of 53 African sequences deposited
  # in Genbank by Ingman et al in 2001."
  # Use NC_012920 for the chrM sequence for the next build!
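    #	(a sketch, not part of the original build log: the two accessions
    #	differ in length -- NC_001807 is 16571 bases, the rCRS NC_012920 is
    #	16569 -- so once a build's 2bit exists, a length check shows which
    #	sequence it carries)
    twoBitToFa hg19.2bit:chrM stdout | faSize stdin
    #	hg19 reports 16571 bases, i.e. the old NC_001807 sequence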
  
  # Download sequence (DONE - 2009-02-04 - Hiram)
      mkdir -p /hive/data/genomes/hg19/download
      cd /hive/data/genomes/hg19/download
      mkdir -p assembled_chromosomes
      wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
          --directory-prefix=assembled_chromosomes \
          -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
  ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/assembled_chromosomes
  
      mkdir -p alternate_loci
  for N in 1 2 3 4 5 6 7 8 9
  do
  wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
      --directory-prefix=alternate_loci \
          -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
  ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/ALT_REF_LOCI_${N}
  done
  
      mkdir -p unlocalized_scaffolds
      wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
          --directory-prefix=unlocalized_scaffolds \
  	    -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
  ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unlocalized_scaffolds
  
      mkdir -p unplaced_scaffolds
      wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
          --directory-prefix=unplaced_scaffolds \
  	    -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
  ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unplaced_scaffolds
  
      mkdir -p placed_scaffolds
      wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
          --directory-prefix=placed_scaffolds \
  	    -nH --ftp-user=anonymous --ftp-password=hiram@soe.ucsc.edu \
  ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/placed_scaffolds
  
      mkdir ucscChr
      cd ucscChr
      for F in ../assembled_chromosomes/FASTA/chr*.fa
  do
      C=`basename $F`
      C=${C/.fa}
      echo -n "${C} "
      H=`head -1 "${F}"`
      chrN=`echo $H | sed -e "s/.*Homo sapiens chromosome /chr/; s/, .*//"`
      A=`echo $H | sed -e "s/. Homo.*//; s/.*gb.//"`
      echo $chrN $A
      grep -v "^#" ../assembled_chromosomes/AGP/${chrN}.comp.agp \
          | sed -e "s/^${A}/${chrN}/" > ${chrN}.agp
      echo ">${chrN}" > ${chrN}.fa
      grep -v "^>" ../assembled_chromosomes/FASTA/${chrN}.fa >> ${chrN}.fa
  done
  
      rm -f scaffolds.agp
      find ../alternate_loci -type f | grep ".agp$" | while read F
  do
      grep "^GL" $F | sed -e \
  "s/^GL000250.1/chr6_apd_hap1/" -e \
  "s/^GL000251.1/chr6_cox_hap2/" -e \
  "s/^GL000252.1/chr6_dbb_hap3/" -e \
  "s/^GL000253.1/chr6_mann_hap4/" -e \
  "s/^GL000254.1/chr6_mcf_hap5/" -e \
  "s/^GL000255.1/chr6_qbl_hap6/" -e \
  "s/^GL000256.1/chr6_ssto_hap7/" -e \
  "s/^GL000257.1/chr4_ctg9_hap1/" -e \
  "s/^GL000258.1/chr17_ctg5_hap1/"
  done > scaffolds.agp
  
      find ../unlocalized_scaffolds -type f | grep ".agp$" \
  | while read F
  do
      C=`basename ${F}`
      C=${C/.unlocalized.scaf.agp}
      grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/${C}_gl\1_random/"
  done >> scaffolds.agp
  
      find ../unplaced_scaffolds -type f | grep ".agp$" \
  | while read F
  do
      grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/chrUn_gl\1/"
  done >> scaffolds.agp
  
      rm -f scaffolds.fa
      find ../alternate_loci -type f | grep ".fa$" | while read F
  do
      sed -e \
  "s/>.*GL000250.*/>chr6_apd_hap1/" -e \
  "s/>.*GL000251.*/>chr6_cox_hap2/" -e \
  "s/>.*GL000252.*/>chr6_dbb_hap3/" -e \
  "s/>.*GL000253.*/>chr6_mann_hap4/" -e \
  "s/>.*GL000254.*/>chr6_mcf_hap5/" -e \
  "s/>.*GL000255.*/>chr6_qbl_hap6/" -e \
  "s/>.*GL000256.*/>chr6_ssto_hap6/" -e \
  "s/>.*GL000257.*/>chr4_ctg9_hap1/" -e \
  "s/>.*GL000258.*/>chr17_ctg5_hap1/" ${F}
  done > scaffolds.fa
  
      find ../unlocalized_scaffolds -type f | grep ".fa$" | while read F
  do
      sed -e \
  "s/^>.*GL\([0-9]*\).* chromosome \([0-9]*\).*/>chr\2_gl\1_random/" ${F}
  done >> scaffolds.fa
  
      find ../unplaced_scaffolds -type f | grep ".fa$" | while read F
  do
      sed -e "s/.*\(GL[0-9]*\).*/\1/; s/GL/>chrUn_gl/" $F
  done >> scaffolds.fa
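    #	(sketch, not in the original log) checkAgpAndFa can confirm that
    #	each renamed AGP still agrees with its renamed FASTA, e.g. for one
    #	chromosome:
    checkAgpAndFa chr1.agp chr1.fa
    #	it reports agreement or the first point of disagreement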
  
  
  ############################################################################
  ## Create database (DONE - 2009-03-04 - Hiram)
      cd /hive/data/genomes/hg19
      cat << '_EOF_' > hg19.config.ra
  # Config parameters for makeGenomeDb.pl:
  db hg19
  scientificName Homo sapiens
  commonName Human
  assemblyDate Feb. 2009
  assemblyLabel GRCh37 Genome Reference Consortium Human Reference 37 (GCA_000001405.1)
  orderKey 14
  mitoAcc NC_001807
  fastaFiles /hive/data/genomes/hg19/download/ucscChr/*.fa
  agpFiles /hive/data/genomes/hg19/download/ucscChr/*.agp
  # qualFiles /dev/null
  dbDbSpeciesDir human
  taxId	9606
  '_EOF_'
      # << happy emacs
  
      time makeGenomeDb.pl hg19.config.ra > makeGenomeDb.log 2>&1
      #	real    14m8.958s
       featureBits -countGaps hg19 gap
      #	239845127 bases of 3137161264 (7.645%) in intersection
      featureBits -noRandom -noHap -countGaps hg19 gap
      #	234344806 bases of 3095693983 (7.570%) in intersection
    #	verify featureBits is properly ignoring haps and randoms:
      egrep -v "_" chrom.sizes | awk '{sum+=$2;print sum,$0}'
      #	3095693983 chrM 16571
      #	same total as in featureBits
  
      #	much later on, discovered that we needed a chrM definition in the
      #	agp files, added by hand to hg19/M/chrM.agp and hg19/hg19.agp the line:
  # chrM    1       16571   1       F       NC001807        1       16571   +
      #	the spaces there are tabs
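    #	(sketch) the equivalent non-interactive edit, with literal tabs:
    printf "chrM\t1\t16571\t1\tF\tNC001807\t1\t16571\t+\n" \
	>> /hive/data/genomes/hg19/M/chrM.agp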
  
  ############################################################################
  # running repeat masker (DONE - 2009-03-05 - Hiram)
      screen # use screen to manage this day-long job
      mkdir /hive/data/genomes/hg19/bed/repeatMasker
      cd /hive/data/genomes/hg19/bed/repeatMasker
      time doRepeatMasker.pl -bigClusterHub=swarm -buildDir=`pwd` hg19 \
  	> do.log 2>&1
      #	real    525m23.521s
      cat faSize.rmsk.txt
      #	3137161264 bases (239850802 N's 2897310462 real 1431585691
      #	upper 1465724771 lower) in 93 sequences in 1 files
      #	%46.72 masked total, %50.59 masked real
      featureBits -countGaps hg19 rmsk
      #	1465724774 bases of 3137161264 (46.721%) in intersection
      #	this is odd, 3 bases more in featureBits than were masked ?
      #	check it out, make a bed file from the featureBits:
      featureBits -countGaps -bed=rmsk.bed hg19 rmsk
      #	went down a sequence of intersections with this idea, but could
      #	not get it resolved.  It appears there are 75 bases in the rmsk
      #	table that were not masked in the 2bit file ?
      #	Later on, realized that featureBits does not count lower case N's
      #	in the "lower" category, but only in the N's category.
  
      #	trying a non-split table:
      hgsql -e "show tables;" hg19 | grep _rmsk | while read T
  do
      hgsql -e "drop table ${T};" hg19
  done
      hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.fa.out
  bad rep range [4385, 4384] line 1348605 of hg19.fa.out
  bad rep range [5563, 5562] line 1563988 of hg19.fa.out
  bad rep range [4539, 4538] line 3111186 of hg19.fa.out
      #	featureBits still reports 1465724774 bases in rmsk table
      #	cleaning the hg19.fa.out file:
      cp hg19.fa.out hg19.clean.out
      # edit hg19.clean.out and remove the three lines:
  # 1467  20.7  1.2 17.6  chr14     35056767 35056794 (72292746) +  L1ME1          LINE/L1               4385 4384 (1761) 1120962
  # 1943  23.8  5.0 12.6  chr15     65775909 65775924 (36755468) +  L1MC4          LINE/L1               5563 5562 (2480) 1299299
  # 2463  25.1  5.0 11.6  chr3      121291056 121291083 (76731347) +  L1M3           LINE/L1               4539 4538 (1608) 2589267
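    #	(alternative sketch: sed addresses refer to input line numbers, so
    #	the same three lines can be removed without a hand edit)
    sed -e '1348605d;1563988d;3111186d' hg19.fa.out > hg19.clean.out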
  
      #	reload the table
      hgsql -e "drop table rmsk;" hg19
      hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.clean.out
  
      #	try masking with this clean file:
      twoBitMask /hive/data/genomes/hg19/hg19.unmasked.2bit hg19.clean.out \
  	hg19.clean.2bit
      twoBitToFa hg19.clean.2bit stdout | faSize stdin > faSize.clean.txt
      cat faSize.clean.txt
      #	this gives the lower by 75 bases result:
      #	3137161264 bases (239850802 N's 2897310462 real 1431585763 upper
      #	1465724699 lower) in 93 sequences in 1 files
      #	%46.72 masked total, %50.59 masked real
      featureBits -countGaps hg19 rmsk
      #	1465724774 bases of 3137161264 (46.721%) in intersection
    #	is the countGaps interfering?
      featureBits hg19 rmsk
      #	1465724774 bases of 2897316137 (50.589%) in intersection
    #	nope, let's see what the .out file has:
      grep chr hg19.clean.out | sed -e "s/^  *//" | awk '{print $5,$6-1,$7}' \
  	| sort -k1,1 -k2,2n > hg19.clean.out.bed
      featureBits -countGaps hg19 hg19.clean.out.bed
      #	1465724774 bases of 3137161264 (46.721%) in intersection
      #	is it perhaps not masking N's ?
      twoBitToFa hg19.clean.2bit stdout | grep n | less
      #	that does find some lower case n's, find all N's:
      findMotif -strand=+ -motif=gattaca -verbose=4 hg19.clean.2bit \
  	2> findMotif.out
      grep "^#GAP" findMotif.out | sed -e "s/#GAP //" > nLocations.bed
      #	which cover:
      featureBits -countGaps hg19 nLocations.bed
      #	251299071 bases of 3137161264 (8.010%) in intersection
      #	overlapping rmsk business with these N locations:
      featureBits -countGaps hg19 hg19.clean.out.bed nLocations.bed
      #	6494740 bases of 3137161264 (0.207%) in intersection
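    #	(sketch) that overlap can be cross-checked by counting the lower
    #	case n's in the masked sequence directly:
    twoBitToFa hg19.clean.2bit stdout | grep -v "^>" | tr -cd 'n' | wc -c
    #	expect a count matching the intersection above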
      #	and overlapping with gap:
      featureBits -countGaps hg19 gap nLocations.bed
      #	239845127 bases of 3137161264 (7.645%) in intersection
  
  ############################################################################
  # running TRF simple repeats (DONE - 2009-03-05 - Hiram)
      screen # use screen to manage this day-long job
      mkdir /hive/data/genomes/hg19/bed/simpleRepeat
      cd /hive/data/genomes/hg19/bed/simpleRepeat
      time doSimpleRepeat.pl -bigClusterHub=pk -workhorse=hgwdev \
  	-smallClusterHub=pk -buildDir=`pwd` hg19 > do.log 2>&1
      #	real    33m25.815s
  
      twoBitMask bed/repeatMasker/hg19.clean.2bit \
  	-add bed/simpleRepeat/trfMask.bed hg19.2bit
      twoBitToFa hg19.2bit stdout | faSize stdin > faSize.hg19.2bit.txt
  # 3137161264 bases (239850802 N's 2897310462 real 1430387259 upper
  # 1466923203 lower) in 93 sequences in 1 files
  # %46.76 masked total, %50.63 masked real
  
  ############################################################################
  #	prepare cluster data (DONE - 2009-03-06 - Hiram)
      cd /hive/data/genomes/hg19
      rm /gbdb/hg19/hg19.2bit
      ln -s `pwd`/hg19.2bit /gbdb/hg19/hg19.2bit
  
      time blat hg19.2bit \
  	/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
      #	Wrote 30675 overused 11-mers to 11.ooc
      #	real    3m11.302s
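    #	later blat runs use that file to skip over-represented 11-mers,
    #	e.g. (illustrative only, query.fa is hypothetical):
    blat /scratch/data/hg19/hg19.2bit query.fa query.psl \
	-ooc=/scratch/data/hg19/11.ooc -tileSize=11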
  
      mkdir /hive/data/staging/data/hg19
      cp -p hg19.2bit /hive/data/staging/data/hg19
      cp -p 11.ooc /hive/data/staging/data/hg19
      cp -p chrom.sizes /hive/data/staging/data/hg19
  
      mkdir separateChrs
      cd separateChrs
      grep -v "_" ../chrom.sizes | awk '{print $1}' | while read C
  do
      twoBitToFa -seq="${C}" ../hg19.2bit stdout
  done | faToTwoBit stdin hg19.chrOnly.2bit
      twoBitInfo hg19.chrOnly.2bit stdout | sort -k2,2nr > chrOnly.chrom.sizes
  
      grep "_hap" ../chrom.sizes | awk '{print $1}' | while read C
  do
      twoBitToFa -seq="${C}" ../hg19.2bit stdout
  done | faToTwoBit stdin hg19.hapOnly.2bit
      twoBitInfo hg19.hapOnly.2bit stdout | sort -k2,2nr > hapOnly.chrom.sizes
  
      grep "_" ../chrom.sizes | grep -v "_hap" | awk '{print $1}' | while read C
  do
      twoBitToFa -seq="${C}" ../hg19.2bit stdout
  done | faToTwoBit stdin hg19.scaffolds.2bit
      twoBitInfo hg19.scaffolds.2bit stdout | sort -k2,2nr > scaffolds.chrom.sizes
  
      cp -p *.2bit *.sizes /hive/data/staging/data/hg19
  
      # ask admin to sync this directory: /hive/data/staging/data/hg19/
      #	to the kluster nodes /scratch/data/hg19/
  
  ############################################################################
  # running cpgIsland business (DONE - 2009-03-06 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/cpgIsland
      cd /hive/data/genomes/hg19/bed/cpgIsland
      cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
      cd hg3rdParty/cpgIslands
      # comment out the following two lines if it compiles cleanly
      # some day  (there were some other fixups too, adding include lines)
      sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c
      mv tmp.c cpg_lh.c
      make
      cd ../../
      ln -s hg3rdParty/cpgIslands/cpglh.exe
      mkdir -p hardMaskedFa
      cut -f1 ../../chrom.sizes | while read C
  do
      echo ${C}
      twoBitToFa ../../hg19.2bit:$C stdout \
  	| maskOutFa stdin hard hardMaskedFa/${C}.fa
  done
  
      cut -f1 ../../chrom.sizes > chr.list
      cat << '_EOF_' > template
  #LOOP
  ./runOne $(root1) {check out line results/$(root1).cpg}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      cat << '_EOF_' > runOne
  #!/bin/csh -fe
  ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
  mv /scratch/tmp/$1.$$ $2
  '_EOF_'
    # << happy emacs
    chmod +x runOne
  
    mkdir -p results
    gensub2 chr.list single template jobList
      para create jobList
      para try
      para check ... etc
      para time
  # Completed: 93 of 93 jobs
  # CPU time in finished jobs:        172s       2.86m     0.05h    0.00d  0.000 y
  # IO & Wait Time:                  1748s      29.14m     0.49h    0.02d  0.000 y
  # Average job time:                  21s       0.34m     0.01h    0.00d
  # Longest finished job:              34s       0.57m     0.01h    0.00d
  # Submission to last job:            83s       1.38m     0.02h    0.00d
  
      # Transform cpglh output to bed +
      catDir results | awk '{
  $2 = $2 - 1;
  width = $3 - $2;
  printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
         $1, $2, $3, $5,$6, width,
         $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
  }' > cpgIsland.bed
  
      cd /hive/data/genomes/hg19/bed/cpgIsland
      hgLoadBed hg19 cpgIslandExt -tab \
        -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
  
  # Reading cpgIsland.bed
  # Loaded 28226 elements of size 10
  # Sorted
  # Saving bed.tab
  # Loading hg19
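    #	(sketch, not in the original log) verify the loaded items stay
    #	within chromosome bounds:
    checkTableCoords hg19 cpgIslandExt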
  
  ############################################################################
  # create lift file on unBridged gaps for genbank splits (2009-03-09 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/gap
      cd /hive/data/genomes/hg19/bed/gap
      gapToLift hg19 hg19.unBridged.lift -bedFile=unBridged.lift.bed
      cp -p hg19.unBridged.lift ../../jkStuff
      cp -p hg19.unBridged.lift /hive/data/staging/data/hg19
  
  ############################################################################
  # AUTO UPDATE GENBANK RUN  (DONE - 2009-03-07,13 - Hiram)
      # align with latest genbank process.
      cd ~/kent/src/hg/makeDb/genbank
      cvsup
      # edit etc/genbank.conf to add hg19 just after hg18
  
  # hg19 - GRCh37 - Genome Reference Consortium Human Reference 37
  #       Assembly Accession: GCA_000001405.1
  hg19.serverGenome = /hive/data/genomes/hg19/hg19.2bit
  hg19.clusterGenome = /scratch/data/hg19/hg19.2bit
  hg19.ooc = /scratch/data/hg19/11.ooc
  hg19.lift = /hive/data/genomes/hg19/jkStuff/hg19.unBridged.lift
  hg19.hapRegions = /hive/data/genomes/hg19/jkStuff/hg19.haplotypes.psl
  hg19.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
  hg19.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
  hg19.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
  hg19.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
  hg19.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter}
  hg19.genbank.est.xeno.pslCDnaFilter   = ${finished.genbank.est.xeno.pslCDnaFilter}
  hg19.genbank.est.xeno.load = yes
  hg19.refseq.mrna.xeno.load  = yes
  hg19.refseq.mrna.xeno.loadDesc = yes
  hg19.mgc = yes
  hg19.orfeome = yes
  hg19.downloadDir = hg19
  hg19.ccds.ncbiBuild = 37.1
  hg19.upstreamGeneTbl = refGene
  hg19.upstreamMaf = multiz46way /hive/data/genomes/hg19/bed/multiz46way/species.list
  hg19.genbank.mrna.blatTargetDb = yes
  hg19.perChromTables = no
  
      cvs ci -m "Added hg19." etc/genbank.conf
      # update /cluster/data/genbank/:
      make etc-update
  
      ssh genbank
      screen		#	use a screen to manage this job
      cd /cluster/data/genbank
      time nice -n +19 bin/gbAlignStep -initial hg19 &
      #	logFile: var/build/logs/2009.03.10-20:28:44.hg19.initalign.log
      #	real    2761m13.680s
      #	that ran on the swarm with little interference and no problems
  
      # load database when finished
      ssh hgwdev
      screen	# use screen to manage this long running command
      cd /cluster/data/genbank
      time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad hg19 &
      # logFile: var/dbload/hgwdev/logs/2009.03.12-21:10:02.dbload.log
      #	real    369m11.941s
  
      # enable daily alignment and update of hgwdev (DONE - 2009-02-24 - Hiram)
      cd ~/kent/src/hg/makeDb/genbank
      cvsup
      # add hg19 to:
          etc/align.dbs
          etc/hgwdev.dbs
      cvs ci -m "Added hg19 - Human - GRCh37" etc/align.dbs etc/hgwdev.dbs
      make etc-update
  
  #########################################################################
  #  BLATSERVERS ENTRY (DONE - 2009-03-09 - Hiram)
  #	After getting a blat server assigned by the Blat Server Gods,
      ssh hgwdev
  
      hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
  	VALUES ("hg19", "blat13", "17778", "1", "0"); \
  	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
  	VALUES ("hg19", "blat13", "17779", "0", "1");' \
  	    hgcentraltest
      #	test it with some sequence
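    #	e.g. (a sketch, assuming the server was started on
    #	/scratch/data/hg19/hg19.2bit; test.fa is any small DNA fasta;
    #	17779 is the untranslated isTrans=0 server from the insert above):
    gfClient blat13 17779 /scratch/data/hg19 test.fa test.psl -nohead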
  
  ############################################################################
  # Making download files (DONE - 2009-03-13 - Hiram)
      cd /hive/data/genomes/hg19
      makeDownloads.pl -allowMissedTrfs -noChromRoot hg19 \
  	> downloads.log 2>&1
  ############################################################################
  # Venter1 chain, net experiment (DONE - Hiram - 2009-03-15)
  doBlastzChainNet.pl `pwd`/DEF \
          -stop=partition -bigClusterHub=swarm \
          -smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
          -workhorse=hgwdev -fileServer=hgwdev > partition.log 2>&1
  
  doBlastzChainNet.pl `pwd`/DEF \
          -continue=blastz -stop=blastz -bigClusterHub=swarm \
          -smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
          -workhorse=hgwdev -fileServer=hgwdev > blastz.log 2>&1
  
  doBlastzChainNet.pl `pwd`/DEF \
          -continue=cat -stop=net -bigClusterHub=swarm \
          -smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
          -workhorse=hgwdev -fileServer=hgwdev > net.log 2>&1
    #	real    163m28.438s
  
      # to load, run it in debug, then check the load script
  doBlastzChainNet.pl `pwd`/DEF \
  	-noLoadChainSplit -continue=load -stop=load -bigClusterHub=swarm \
  	-debug -smallClusterHub=swarm -chainMinScore=1000 \
  	-chainLinearGap=medium \
  	-workhorse=hgwdev -fileServer=hgwdev > load.log 2>&1
  
      # and create a synNet for multiz, run in debug, and examine script
      #	to make sure it works correctly
  doBlastzChainNet.pl `pwd`/DEF \
  	-syntenicNet -continue=syntenicNet -stop=syntenicNet \
  	-debug -bigClusterHub=swarm \
  	-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
  	-workhorse=hgwdev -fileServer=hgwdev > synNet.log 2>&1
      #	real    31m11.216s
  
  ############################################################################
  # reset position to chr6 haplotype situation
      hgsql -e \
  'update dbDb set defaultPos="chr6:28343766-33555363" where name="hg19";' \
  	hgcentraltest
  
  # reset to a smaller range (2009-04-24 - Brooke)
  # this is the SOD1 gene, implicated in Lou Gehrig's disease.
  
      hgsql -e \
  'update dbDb set defaultPos="chr21:33,031,597-33,041,570" where name="hg19";' \
          hgcentraltest
  
  ############################################################################
  # Self Lastz run (DONE - 2009-03-19 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
      cd /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
    cat << '_EOF_' > DEF
  # human vs human
  BLASTZ=lastz
# maximum M allowed with lastz is only 254
  BLASTZ_M=254
  # lastz does not like the O= and E= lines in the matrix file
  #       this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
  BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
  # and place those items here
  BLASTZ_O=600
  BLASTZ_E=150
  # other parameters from hg18 vs venter1 lastz on advice from Webb
  BLASTZ_K=10000
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: Human Hg19
  SEQ2_DIR=/scratch/data/hg19/hg19.2bit
  SEQ2_LEN=/scratch/data/hg19/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      screen # use screen to manage this long-running job
      time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
  	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
  	-workhorse=hgwdev \
  	-stop=net -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
      #	cluster difficulties, finished manually, then:
      time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
  	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
  	-continue=cat -workhorse=hgwdev \
  	-stop=net -smallClusterHub=pk -bigClusterHub=swarm > cat.log 2>&1 &
  
      time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
  	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
  	-continue=load -debug -workhorse=hgwdev \
  	-stop=load -smallClusterHub=pk -bigClusterHub=swarm > load.debug.log 2>&1 &
      #	that indicates it would do:
      hgLoadChain -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
      #	adding -normScore
      hgLoadChain -normScore -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
  
    # a user asked about axtNet files, so run the net step with -ignoreSelf:
      time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
  	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
  	-ignoreSelf -continue=net -workhorse=hgwdev \
  	-stop=net -smallClusterHub=encodek -bigClusterHub=swarm > net.log 2>&1 &
      #	about 8m 17s
      cd /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19/axtChain
      netClass -verbose=0 -noAr noClass.net hg19 hg19 hg19.hg19.net
      gzip hg19.hg19.net
      cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/vsSelf
      ln -s \
  /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19/axtChain/hg19.hg19.net.gz .
      # fixup README.txt and md5sum.txt files
      md5sum hg19.hg19.net.gz >> md5sum.txt
      # Brian wants to see the track:
      netFilter -minGap=10 hg19.hg19.net.gz \
  	| hgLoadNet -verbose=0 hg19 netSelf stdin
  
  ############################################################################
  # Chimp Lastz run (DONE - 2009-03-19 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
      cd /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
    cat << '_EOF_' > DEF
  # human vs chimp
  BLASTZ=lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  # lastz does not like the O= and E= lines in the matrix file
  #       this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
  BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
  # and place those items here
  BLASTZ_O=600
  BLASTZ_E=150
  # other parameters from panTro2 vs hg18 lastz on advice from Webb
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: Chimp PanTro2
  SEQ2_DIR=/scratch/data/panTro2/panTro2.2bit
  SEQ2_LEN=/scratch/data/panTro2/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      screen # use screen to manage this long-running job
      time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
      #	real    173m22.880s
      #	cluster problems, continuing after lastz done:
      time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=cat \
  	-stop=net -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
  	> net.log 2>&1 &
      #	real    81m20.209s
      #	continuing with the load and adding syntenicNet
      time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=load \
  	-syntenicNet -noLoadChainSplit -chainMinScore=5000 \
  	-chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
  	> load.log 2>&1 &
      #	real    47m17.871s
      cat fb.hg19.chainPanTro2Link.txt
      #	2747983350 bases of 2897316137 (94.846%) in intersection
  
      #	running the swap - DONE - 2009-05-24
      ssh swarm
      mkdir /hive/data/genomes/panTro2/bed/blastz.hg19.swap
      cd /hive/data/genomes/panTro2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	-swap /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \
  	> swap.log 2>&1 &
      #	real    723m41.377s
      cat fb.panTro2.chainHg19Link.txt
      #	2761343871 bases of 2909485072 (94.908%) in intersection
  
  ############################################################################
  # Creating the pushQ entry (DONE - 2009-03-20 - Hiram)
      mkdir /hive/data/genomes/hg19/pushQ
      cd /hive/data/genomes/hg19/pushQ
      makePushQSql.pl hg19 > hg19.pushQ.sql 2> make.err
      # many complaints about the chain and net tables from the haplotype
      #	experiments, and this table:
      #	orfeomeGenes
      #	which is probably in genbank, and these usual ones:
      #	hg19 does not have seq
      #	hg19 does not have extFile
  
  ############################################################################
  # Determine PAR region of X and Y (DONE - 2009-03-20 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/parRegion
      cd /hive/data/genomes/hg19/bed/parRegion
      awk '$5 != "N"' ../../X/chrX.agp | awk '{print $6}' | sort > chrX.cloneList
      awk '$5 != "N"' ../../Y/chrY.agp | awk '{print $6}' | sort > chrY.cloneList
      comm -12 chrX.cloneList chrY.cloneList > chrXY.par.clone.list
      cat chrXY.par.clone.list \
  	| while read C; do grep "${C}" ../../X/chrX.agp; done \
  	| sort -k1,1 -k2,2n >> chrX.par.region.agp
      cat chrXY.par.clone.list \
  	| while read C; do grep "${C}" ../../Y/chrY.agp; done \
  	| sort -k1,1 -k2,2n >> chrY.par.region.agp
      awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrY.par.region.agp \
  	> chrY.par.region.bed
      awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrX.par.region.agp \
  	> chrX.par.region.bed
      #	use those bed files in custom tracks on hg19 to verify that they
      #	are two continuous regions with only gaps between these items
      #	these location extents are: (zero relative)
      #	chrX 60000 2722842
      #	chrX 154906585 155260560
      #	chrY 10000 2649520
      #	chrY 59034049 59363566
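    #	(sketch, using the extents above) the first region on each
    #	chromosome is PAR1, the second PAR2; combined into one tab
    #	separated bed for a custom track:
    cat << '_EOF_' > parRegions.bed
chrX	60000	2722842	PAR1
chrX	154906585	155260560	PAR2
chrY	10000	2649520	PAR1
chrY	59034049	59363566	PAR2
'_EOF_'
    # << happy emacs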
  
  ############################################################################
  # Gorilla Lastz run (DONE - 2009-03-21,05-13 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
      cd /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
    cat << '_EOF_' > DEF
  # human vs gorilla
  BLASTZ=lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  # lastz does not like the O= and E= lines in the matrix file
  #       this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
  BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
  # and place those items here
  BLASTZ_O=600
  BLASTZ_E=150
  # other parameters from panTro2 vs hg18 lastz on advice from Webb
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=100000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: Gorilla gorGor1
  SEQ2_DIR=/scratch/data/gorGor1/gorGor1.2bit
  SEQ2_LEN=/scratch/data/gorGor1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=300
  SEQ2_LAP=0
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      screen # use screen to manage this long-running job
      time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
  	> do.log 2>&1 &
      cat fb.hg19.chainGorGor1Link.txt
      #	1723432141 bases of 2897316137 (59.484%) in intersection
      doRecipBest.pl -buildDir=`pwd` hg19 gorGor1 > rbest.log 2>&1
  
  ############################################################################
  # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR LASTZ (DONE - 2009-04-02 - Hiram)
      ssh pk
      mkdir /hive/data/genomes/hg19/bed/linSpecRep
      cd /hive/data/genomes/hg19/bed/linSpecRep
      #	create individual .out files from the master record in ../repeatMasker
      mkdir splitOut
      cat << '_EOF_' > split.csh
  #!/bin/csh -fe
  set C = $1
  head -3 ../repeatMasker/hg19.clean.out > splitOut/${C}.out
  grep "${C} " ../repeatMasker/hg19.clean.out >> splitOut/${C}.out
  '_EOF_'
    # << happy emacs
    chmod +x split.csh
  
      cat << '_EOF_' > template
  #LOOP
  split.csh $(root1) {check out line+ splitOut/$(root1).out}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      cut -f1 ../../chrom.sizes > chrom.list
      gensub2 chrom.list single template jobList
      para create jobList
      para try ... check ... push ... etc...
  # Completed: 93 of 93 jobs
  # CPU time in finished jobs:        127s       2.12m     0.04h    0.00d  0.000 y
  # IO & Wait Time:                 17154s     285.90m     4.76h    0.20d  0.001 y
  # Average job time:                 186s       3.10m     0.05h    0.00d
  # Longest finished job:             224s       3.73m     0.06h    0.00d
  # Submission to last job:           280s       4.67m     0.08h    0.00d
  
      #	now, we can date and process each of those .out files
    #	this really should be a single creation of notInOthers, since
    #	the four different results all turn out to be identical;
    #	the notInMouse set becomes notInOthers below and the others are removed.
      mkdir dateRepeats
      cd dateRepeats
      cat << '_EOF_' > mkLSR
  #!/bin/csh -fe
  rm -f $1.out_mus-musculus_rattus_canis-familiaris_bos-taurus
  ln -s ../splitOut/$1.out .
  /scratch/data/RepeatMasker/DateRepeats \
      $1.out -query human -comp mouse -comp rat -comp dog -comp cow
  rm $1.out
  mkdir -p ../notInMouse ../notInRat ../notInDog ../notInCow
  /cluster/bin/scripts/extractRepeats 1 $1.out_mus*-taurus \
  	> ../notInMouse/$1.out.spec
  /cluster/bin/scripts/extractRepeats 2 $1.out_mus*-taurus \
  	> ../notInRat/$1.out.spec
  /cluster/bin/scripts/extractRepeats 3 $1.out_mus*-taurus \
  	> ../notInDog/$1.out.spec
  /cluster/bin/scripts/extractRepeats 4 $1.out_mus*-taurus \
  	> ../notInCow/$1.out.spec
  '_EOF_'
      #	<< happy emacs
      chmod +x mkLSR
  
      cat << '_EOF_' > template
  #LOOP
  ./mkLSR $(path1) {check out line+ $(path1).out_mus-musculus_rattus_canis-familiaris_bos-taurus}
  #ENDLOOP
  '_EOF_'
      #	<< happy emacs
  
      gensub2 ../chrom.list single template jobList
      para try ... check ... push ... etc...
      para time
  # Completed: 93 of 93 jobs
  # CPU time in finished jobs:       2441s      40.69m     0.68h    0.03d  0.000 y
  # IO & Wait Time:                   332s       5.53m     0.09h    0.00d  0.000 y
  # Average job time:                  30s       0.50m     0.01h    0.00d
  # Longest finished job:             125s       2.08m     0.03h    0.00d
  # Submission to last job:           454s       7.57m     0.13h    0.01d
  
  
      #	these four types of out.spec results all turn out to be identical
      #	To check identical
      cd /hive/data/genomes/hg19/bed/linSpecRep
      find . -name "*.out.spec" | \
  	while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \
  	| sort -k1,1n | sort -t"/" -k3,3 | sed -e "s#./notIn.*/##" \
  	| sort | uniq -c | less
      #	You will see they are all a count of 4
      #	Set them up on scratch data and get to all the kluster nodes:
      mkdir /hive/data/staging/data/hg19/lineageSpecificRepeats
      cd notInMouse
      rsync -a --progress ./ /hive/data/staging/data/hg19/lineageSpecificRepeats
      cd ..
      mv notInMouse notInOthers
      #	do not need to keep all of these
      rm -fr notInRat notInDog notInCow
  
      # We also need the nibs for blastz runs with lineage specific repeats
      mkdir /hive/data/genomes/hg19/bed/nibs
      cd /hive/data/genomes/hg19/bed/nibs
      cut -f1 ../../chrom.sizes | while read C
  do
      twoBitToFa -seq=${C} ../../hg19.2bit stdout \
  	| faToNib -softMask stdin ${C}.nib
      echo "${C} done"
  done
      mkdir /hive/data/staging/data/hg19/nib
      rsync -a --progress ./ /hive/data/staging/data/hg19/nib
  
      # Ask cluster-admin to sync /scratch/ filesystem to kluster nodes
  
  #############################################################################
  # create gc5Base download file (DONE - 2009-04-24 - Hiram)
      cd /hive/data/genomes/hg19/bed/gc5Base
      hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 hg19 \
          /cluster/data/hg19/hg19.2bit | gzip -c > hg19.gc5Base.txt.gz
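    #	(sketch) the same wiggle ascii can be turned into a loadable
    #	wig/wib pair with wigEncode:
    zcat hg19.gc5Base.txt.gz | wigEncode stdin gc5Base.wig gc5Base.wib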
  
  #############################################################################
  # Physical Map Contigs - ctgPos (DONE - 2009-04-23 - Hiram) (Alt. haplotypes added 4/12/10 angie)
      mkdir /hive/data/genomes/hg19/bed/ctgPos
      cd /hive/data/genomes/hg19/bed/ctgPos
      cat << '_EOF_' > mkCtgPos.sh
  AGP="/hive/data/genomes/hg19/download/assembled_chromosomes/AGP"
  export AGP
  for F in `(cd ${AGP}; ls chr*.agp | grep -v ".comp.agp")`
  do
      C=${F/.agp/}
      grep "^CM" "${AGP}/${F}" | awk '$5 != "N"' | awk '
  {
  printf "%s\t%d\t%s\t%d\t%d\n", $6, $8-$7+1, "'${C}'", $2-1+$7-1, $2-1+$8
  }
  '
  done
  '_EOF_'
      # << happy emacs
      chmod +x mkCtgPos.sh
      ./mkCtgPos.sh > ctgPos.tab
  
      cat << '_EOF_' > mkRanCtgPos.sh
  AGP="/hive/data/genomes/hg19/download/unlocalized_scaffolds/AGP"
  export AGP
  for F in `(cd ${AGP}; ls chr*.agp)`
  do
      C=${F/.unlocalized.scaf.agp/}
      c=${C/chr/}
      export C c
      grep "^GL" "${AGP}/${F}" | awk '$5 != "N"' | awk '
  BEGIN {
      ctgName=""
      ctgStart=0
      ctgEnd=0
      chrom="'${c}'"
      ctgNameLower=""
  }
  {
  if (match(ctgName,$1)) {
      ctgEnd = $3
  } else {
      if (length(ctgName) > 0) {
          size=ctgEnd - ctgStart
  printf "%s\t%d\tchr%s_%s_random\t%d\t%d\n", ctgName, size, chrom, ctgNameLower,
  ctgStart, ctgEnd
      }
      ctgStart = $2 - 1
      ctgEnd = $3
      ctgName = $1
      ctgNameLower = tolower($1)
      sub(".1$","",ctgNameLower)
  }
  }
  END {
  size=ctgEnd - ctgStart
  printf "%s\t%d\tchr%s_%s_random\t%d\t%d\n", ctgName, size, chrom, ctgNameLower,
  ctgStart, ctgEnd
  }
  '
  done
  '_EOF_'
      # << happy emacs
      chmod +x mkRanCtgPos.sh
      ./mkRanCtgPos.sh >> ctgPos.tab
  
      #	fetch .sql definition from hg18
      chmod 777 .
      hgsqldump --all -c --tab=. hg18 ctgPos
      # Don't confuse us w/hg18 data:
      rm ctgPos.txt
      chmod 775 .
      hgsql hg19 < ctgPos.sql
      hgsql -e 'load data local infile "ctgPos.tab" into table ctgPos;' hg19
  
      # 4/12/10 (angie): add the alt loci:
      perl -we 'while (<>) { \
                  next if (/^#/); chomp; @w = split; \
                  $w[0] = lc($w[0]); $w[0] =~ s/^hs//; $w[0] =~ s/_mhc_/_/;  $w[0] =~ s/_ctg1$//; \
                  $w[0] =~ s/_apd$/_apd_hap1/; $w[0] =~ s/_cox$/_cox_hap2/; \
                  $w[0] =~ s/_dbb$/_dbb_hap3/; $w[0] =~ s/_mann$/_mann_hap4/; \
                  $w[0] =~ s/_mcf$/_mcf_hap5/; $w[0] =~ s/_qbl$/_qbl_hap6/; \
                  $w[0] =~ s/_ssto$/_ssto_hap7/; \
                  $w[0] =~ s/_1(_ctg\d)/${1}_hap1/; \
                  if ($w[0] eq "chr6_cox_hap2" && $w[8] == 4873745) { $w[8] = 4795371; } # yep, inconsistent \
                  print join("\t", $w[1], $w[8], $w[0], 0, $w[8]) . "\n"; }' \
      /hive/data/genomes/hg19/download/alternate_loci/*/placed_scaffolds/alt_locus_scaf2primary.pos \
        >> ctgPos.tab
      sort -k 3,3 -k4n,4n ctgPos.tab \
      | hgLoadSqlTab hg19 ctgPos ctgPos.sql stdin
      # TODO: tell NCBI alternate_loci/ALT_REF_LOCI_2/placed_scaffolds/alt_locus_scaf2primary.pos
      # has size inconsistent w/AGP, FASTA
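    #	(sketch) quick sanity on the reloaded table:
    hgsql -N -e 'select count(*) from ctgPos;' hg19
    hgsql -N -e 'select chrom,count(*) from ctgPos group by chrom;' hg19 | tail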
  
  
  #############################################################################
  # CLONE ENDS - first step for BACEND/CytoBand tracks
  #	(DONE - 2009-04-28 - Hiram)
      mkdir -p /hive/data/genomes/hg19/bed/cloneend/ncbi
      cd /hive/data/genomes/hg19/bed/cloneend/ncbi
  
      wget --timestamping \
  'ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/9606_clone_ends*.mfa.gz'
      wget --timestamping \
  'ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/9606_clone_info*.txt.gz'
  
      cd /hive/data/genomes/hg19/bed/cloneend
      # seems like the *.mfa files were split just for convenience
      # concatenate
  
      for F in ncbi/*.mfa.gz
  do
      zcat "${F}"
      echo "${F}" 1>&2
  done | gzip > all.mfa.gz
      #	that 1>&2 echos to stderr so you can see the file name and not
      #	interfere with the pipe stdout output to gzip
  
      # Convert the title line of the all.mfa file
      zcat all.mfa.gz \
  	| sed -e "s#^>gi.[0-9]*.gb.#>#; s#^>gi.[0-9]*.emb.#>#; s#\.[0-9]|.*##" \
  	    | gzip > cloneEnds.fa.gz
  
    #	equivalently, via the convert.pl script:
    zcat all.mfa.gz | ./convert.pl | gzip > cloneEnds.fa.gz
  
      #	make sure nothing got broken:
      faSize all.mfa.gz
  # 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
  # in 833173 sequences in 1 files
  
      faSize cloneEnds.fa.gz
  # 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
  # in 833173 sequences in 1 files
  
      #	identical numbers
      #	you can also carefully check the names:
      zcat all.mfa.gz | grep "^>" | awk -F'|' '{print $4}' \
  	| sed -e "s/\.[0-9]$//" | sort > mfa.names
      #	should be the same as:
      zcat cloneEnds.fa.gz | grep "^>" | sed -e "s/>//" | sort > clone.names
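    #	(sketch) comm -3 prints names unique to either file; no output
    #	means the two sets are identical:
    comm -3 mfa.names clone.names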
  
  
      # concatenate the text files, too
      bash
      for F in ncbi/*.txt.gz
  do
      zcat "${F}"
      echo "${F}" 1>&2
  done | gzip > all.txt.gz
  
      # generate cloneEndPairs.txt and cloneEndSingles.txt
      zcat all.txt.gz >all.txt
      $HOME/kent/src/hg/utils/cloneEndParse.pl all.txt
  
      #	Reading in end info
      #	Writing out pair info
      #	Writing out singleton info
      #	302264 pairs and 203094 singles
      #	examined all the clone names and all the bac end names in these two
      #	files and compared with business from all.txt to make sure we properly
      #	classified all of them correctly.  We had 833,173 clone sequences,
      #	and 501,135 bac end names
  
      #	faSplit does not function correctly if given a .gz source file
      #	AND, we need the unzipped file for sequence loading below
      gunzip cloneEnds.fa.gz
      # split
      mkdir splitdir
      cd splitdir
      faSplit sequence ../cloneEnds.fa 100 cloneEnds
      #	Check to ensure no breakage:
      cat *.fa | faSize stdin
  # 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
  # in 833173 sequences in 1 files
      #	same numbers as before
  
      # load sequences
      ssh hgwdev
      mkdir /gbdb/hg19/cloneend
      cd /gbdb/hg19/cloneend
        ln -s /hive/data/genomes/hg19/bed/cloneend/cloneEnds.fa .
      cd /tmp
      hgLoadSeq hg19 /gbdb/hg19/cloneend/cloneEnds.fa
      #  Advisory lock created
      # Creating .tab file
      # Adding /gbdb/hg19/cloneend/cloneEnds.fa
      # 833173 sequences
      # Updating seq table
      # Advisory lock has been released
      # All done
  
  ##############################################################################
  # BACEND SEQUENCE ALIGNMENTS (DONE - 2009-04-28,05-20 - Hiram)
      mkdir -p /hive/data/genomes/hg19/bed/bacends/run.blat
      cd /hive/data/genomes/hg19/bed/bacends/run.blat
      #	going to run separate runs for the golden path sequence vs. the
      #	randoms, haplotypes, chrUn and chrM
      partitionSequence.pl 5000000 20000 /scratch/data/hg19/hg19.2bit \
  	/scratch/data/hg19/chrom.sizes 100 -xdir xdir.sh -lstDir tParts \
  	| egrep -v "tParts|random|_hap|chrUn" \
  	| sed -e "s/.*2bit://; s/:/./" > hg19.list
      ls -1S /hive/data/genomes/hg19/bed/cloneend/splitdir/cloneEnds*.fa \
  	> bacEnds.list
  
      ssh swarm
      cd /hive/data/genomes/hg19/bed/bacends/run.blat
  
      cat > template << '_EOF_'
  #LOOP
  runOne.csh $(file1) $(path2) {check out line+ psl/$(root1)/$(file1).$(root2).psl}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
      cat > runOne.csh << '_EOF_'
  #!/bin/csh -fe
  
  set target = $1
  set query = $2
  set result = $3
  set partSpec = `echo $target | sed -e "s/\./:/"`
  set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
  set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
  set range = `echo $start $end | awk '{print $2-$1}'`
  set dir = $result:h
  set chr = `echo $target | sed -e "s/\..*//"`
  set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
  set tmpFile = `echo $result | sed -e "s#psl/$chr/#/scratch/tmp/#; s/.psl//"`
  
  # echo $tmpFile
  # echo "chr: $chr $start $end -> size: $chrSize, range: $range"
  /bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
  /bin/mkdir -p $dir
  /cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
          /scratch/data/hg19/hg19.2bit:$partSpec $query $tmpFile.psl
  rm -f $result
  liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
  rm -f $tmpFile.lift $tmpFile.psl
  '_EOF_'
    # << happy emacs
    chmod +x runOne.csh
  
      gensub2 hg19.list bacEnds.list template jobList
      para create jobList
  # 62034 jobs in batch
      # these jobs run quickly, limit them to 250 at a time
      para try, check, -maxJob=250 push, etc ...
  # Completed: 62034 of 62034 jobs
  # CPU time in finished jobs:     506023s    8433.72m   140.56h    5.86d  0.016 y
  # IO & Wait Time:                175853s    2930.88m    48.85h    2.04d  0.006 y
  # Average job time:                  11s       0.18m     0.00h    0.00d
  # Longest finished job:             752s      12.53m     0.21h    0.01d
  # Submission to last job:          3533s      58.88m     0.98h    0.04d
  
      #	combine the alignments
      time pslSort dirs raw.psl temp psl/chr*
      #	62034 files in 24 dirs
      #	Got 62034 files 249 files per mid file
      #	real    81m2.820s
  
      #	-rw-rw-r--  1 13410334441 Apr 29 12:00 raw.psl
      # cleanup
      rmdir temp
  
      time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
                  raw.psl  bacEnds.psl /dev/null > pslReps.out 2>&1 &
      #	real    5m55.990s
      #	Processed 106254032 alignments
      #	-rw-rw-r--  1   372734361 Apr 29 12:56 bacEnds.psl
  
  
      wc -l bacEnds.psl
      #	2852977 bacEnds.psl
  
      time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
  	-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
  	-mismatch -verbose bacEnds.psl \
  	/cluster/data/hg19/bed/cloneend/cloneEndPairs.txt \
  	all_bacends bacEnds
      #	Reading pair file
      #	Reading psl file
      #	Creating Pairs
      #	Writing to files
      #	real    0m18.851s
      #	this creates the files:
      #	-rw-rw-r--  1    21178741 Apr 29 13:00 bacEnds.pairs
      #	-rw-rw-r--  1     5250873 Apr 29 13:00 bacEnds.orphan
      #	-rw-rw-r--  1      738045 Apr 29 13:00 bacEnds.short
      #	-rw-rw-r--  1      463560 Apr 29 13:00 bacEnds.slop
      #	-rw-rw-r--  1      146369 Apr 29 13:00 bacEnds.mismatch
      #	-rw-rw-r--  1        3528 Apr 29 13:00 bacEnds.long
  
      # filter and sort
      awk '$5 >= 300' bacEnds.pairs | sort -k1,1 -k2,2n > bacEndPairs.bed
      awk '$5 >= 300' bacEnds.slop bacEnds.short bacEnds.long \
  	bacEnds.mismatch bacEnds.orphan | sort -k1,1 -k2,2n > bacEndPairsBad.bed
  
      extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
  	bacEndPairsBad.bed | headRest 2 stdin | sort -k14,14 -k16,16n \
  	    > bacEndPairs.load.psl
  
  ############################################################################
  # BACEND Randoms SEQUENCE ALIGNMENTS (DONE - 2009-04-28,05-20 - Hiram)
      mkdir -p /hive/data/genomes/hg19/bed/bacends/run.randoms
      cd /hive/data/genomes/hg19/bed/bacends/run.randoms
      #	this separate run for the randoms, haplotypes, chrUn and chrM
      partitionSequence.pl 5000000 20000 /scratch/data/hg19/hg19.2bit \
  	/scratch/data/hg19/chrom.sizes 100 -xdir xdir.sh -lstDir tParts \
  	| egrep "random|_hap|chrUn" \
  	| sed -e "s/.*2bit://; s/:/./" > random.list
      cat tParts/*.lst | sed -e "s/.*2bit://; s/:/./" >> random.list
  
      ls -1S /hive/data/genomes/hg19/bed/cloneend/splitdir/cloneEnds*.fa \
  	> bacEnds.list
  
      ssh swarm
      cd /hive/data/genomes/hg19/bed/bacends/run.randoms
      gensub2 random.list bacEnds.list ../run.blat/template jobList
      # very similar runOne.csh script as above, but it doesn't need to do
      #	the lift
      cat > runOne.csh << '_EOF_'
  #!/bin/csh -fe
  
  set target = $1
  set query = $2
  set result = $3
  set partSpec = `echo $target | sed -e "s/\./:/"`
  set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
  set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
  set range = `echo $start $end | awk '{print $2-$1}'`
  set dir = $result:h
  set chr = `echo $target | sed -e "s/\..*//"`
  set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
  set tmpFile = `echo $result | sed -e "s#psl/$chr/#/scratch/tmp/#; s/.psl//"`
  
  # echo $tmpFile
  # echo "chr: $chr $start $end -> size: $chrSize, range: $range"
  /bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
  /bin/mkdir -p $dir
  /cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
          /scratch/data/hg19/hg19.2bit:$partSpec $query $tmpFile.psl
  rm -f $result
  mv $tmpFile.psl $result
rm -f $tmpFile.lift
  '_EOF_'
    # << happy emacs
    chmod +x runOne.csh
  
      # these jobs run fast, do not let too many of them run
      para -maxJob=100 try...check...push
      para time
  # Completed: 6762 of 6762 jobs
  # CPU time in finished jobs:      20357s     339.29m     5.65h    0.24d  0.001 y
  # IO & Wait Time:                 17839s     297.31m     4.96h    0.21d  0.001 y
  # Average job time:                   6s       0.09m     0.00h    0.00d
  # Longest finished job:             261s       4.35m     0.07h    0.00d
  # Submission to last job:           508s       8.47m     0.14h    0.01d
  
      time pslSort dirs raw.psl temp psl/chr*
      #	6762 files in 69 dirs
      #	Got 6762 files 82 files per mid file
      #	real    6m37.177s
  
      time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
                  raw.psl randomEnds.psl randomReps.psr > pslReps.out 2>&1 &
      #	real    0m5.761s
      #	Processed 1254273 alignments
  
      # cleanup
      rmdir temp
  
      wc -l randomEnds.psl
      #	367567 randomEnds.psl
  
      time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
  	-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
  	-mismatch -verbose randomEnds.psl \
  	/cluster/data/hg19/bed/cloneend/cloneEndPairs.txt \
  	all_bacends bacEnds
      #	Reading pair file
      #	Reading psl file
      #	Creating Pairs
      #	Writing to files
      #	real    0m11.221s
      #	this creates the files:
      #	-rw-rw-r--  1         0 Apr 29 14:53 bacEnds.slop
      #	-rw-rw-r--  1         0 Apr 29 14:53 bacEnds.short
      #	-rw-rw-r--  1         0 Apr 29 14:53 bacEnds.mismatch
      #	-rw-rw-r--  1         0 Apr 29 14:53 bacEnds.long
      #	-rw-rw-r--  1    141836 Apr 29 14:53 bacEnds.pairs
      #	-rw-rw-r--  1    649907 Apr 29 14:53 bacEnds.orphan
  
  ##############################################################################
  # BacEnds track - both results loaded together (DONE - 2009-04-29 - Hiram)
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/bacends
      # filter and sort
      awk '$5 >= 300' run.blat/bacEnds.pairs run.randoms/bacEnds.pairs \
  	| sort -k1,1 -k2,2n > bacEndPairs.bed
      awk '$5 >= 300' run.blat/bacEnds.slop run.blat/bacEnds.short \
  	run.blat/bacEnds.long run.blat/bacEnds.mismatch \
  	run.blat/bacEnds.orphan run.randoms/bacEnds.slop \
  	run.randoms/bacEnds.short run.randoms/bacEnds.long \
  	run.randoms/bacEnds.mismatch run.randoms/bacEnds.orphan \
  	    | sort -k1,1 -k2,2n > bacEndPairsBad.bed
  
      head -5 run.blat/bacEnds.psl > bacEnds.psl
      headRest 5 run.blat/bacEnds.psl > t.psl
      headRest 5 run.randoms/randomEnds.psl >> t.psl
      sort -k14,14 -k16,16n t.psl >> bacEnds.psl
      extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
  	bacEndPairsBad.bed | headRest 2 stdin | sort -k14,14 -k16,16n \
  	    > bacEnds.load.psl
  
  
      #	load them into the database
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/bacends
      #	CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
      awk '{print $4}' bacEndPairs.bed | grep " "
      awk '{print $5}' bacEndPairs.bed | sort | uniq -c
      #	result should be the scores, no extraneous strings:
      #	156984 1000
      #	   195 300
      #	   316 375
      #	   297 500
      #	  1476 750
      #	edit the file and fix it if it has a bad name.
      hgLoadBed -notItemRgb hg19 bacEndPairs bacEndPairs.bed \
                   -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
      #	Loaded 208922 elements of size 11
      # note - this track isn't pushed to RR, just used for assembly QA
      hgLoadBed -notItemRgb hg19 bacEndPairsBad bacEndPairsBad.bed \
                   -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
      #	Loaded 79004 elements of size 11
      #hgLoadPsl hg18 -nobin -table=all_bacends bacEnds.load.psl
      # NOTE: truncates file to 0 if -nobin is used
      hgLoadPsl hg19 -table=all_bacends bacEnds.load.psl
      # one complaint, there appears to be a bogus insert count in one
      #	of the blat results:
  # < 585   797     67      0       3       2       -63     9       79188   +      AQ743980 852     42      846     chr19_gl000208_random   92689   4045    84100  11       14,124,84,496,53,6,20,28,28,10,4,       42,56,180,200,696,750,756,776,804,832,842,      4045,5767,7086,83449,83946,83999,84006,84027,84056,84085,84096,
  Became:
  # > 585   797     67      0       3       2       0       9       79188   +	 AQ743980 852     42      846     chr19_gl000208_random   92689   4045	84100  11       14,124,84,496,53,6,20,28,28,10,4,	42,56,180,200,696,750,756,776,804,832,842,	4045,5767,7086,83449,83946,83999,84006,84027,84056,84085,84096,
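    #	(sketch) such rows can be found in the load file directly; without
    #	the bin column, qBaseInsert and tBaseInsert are fields 6 and 8:
    awk '$6 < 0 || $8 < 0' bacEnds.load.psl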
  
      hgsql -N -e "select count(*) from all_bacends;" hg19
      #	 2289275
      hgsql -N -e "select count(*) from all_bacends;" hg18
      #	1727387
      hgsql -N -e "select count(*) from all_bacends;" hg17
      #	 1729146
  
      nice featureBits hg19 all_bacends
  # 230917362 bases of 2897316137 (7.970%) in intersection
      nice featureBits hg18 all_bacends
# 227770876 bases of 2881515245 (7.905%) in intersection
      nice featureBits hg17 all_bacends
  # 225763317 bases of 2866216770 (7.877%) in intersection
  
      nice featureBits hg19 bacEndPairs
  # 236889607 bases of 2897316137 (8.176%) in intersection
      nice featureBits hg18 bacEndPairs
  # 162690030 bases of 2881515245 (5.646%) in intersection
      nice featureBits hg17 bacEndPairs
  # 162099487 bases of 2866216770 (5.656%) in intersection
  
      nice featureBits hg19 bacEndPairsBad
  # 38344094 bases of 2897316137 (1.323%) in intersection
      nice featureBits hg18 bacEndPairsBad
  # 37326990 bases of 2881515245 (1.295%) in intersection
      nice featureBits hg17 bacEndPairsBad
  # 37437558 bases of 2866216770 (1.306%) in intersection
  
  ############################################################################
  # STS MARKERS (DONE - 2009-04-30 - 2009-05-06 - Hiram)
      mkdir /hive/data/outside/ncbi/sts.2009-04
      cd /hive/data/outside/ncbi
      ln -s sts.2009-04 sts.11
      cd /hive/data/outside/ncbi/sts.2009-04
      wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
      wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
      wget --timestamping ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
      gunzip sts.gz
      mv sts dbSTS.fa
  
      #	these items are copied in from the previous builds
      cp -p /cluster/data/ncbi/sts.10/all.STS.fa ./all.STS.fa.prev
      cp -p /cluster/data/ncbi/sts.10/stsInfo2.bed ./stsInfo2.bed.prev
      #	edit stsInfo2.bed.prev for a
      #	manual fixup of error that is in the hg18 bed file, replace
      #	the line for AFM067XA9 to fix bogus long list of aliases to be:
  # 22788^IAFM067XA9^I1^IZ66598^I1^IGDB:1221611,^I5^I067XA9,GDB:1221611,W202,Z66598,SWSS2303^I69047^I0^I^ITCTTGGGGTTTAATTGCTTT^ICTTTGCCACAATCTTACACA^I149^IHomo sapiens^I1^I2^I6453,6454,^I0^I^I^I^I0^I0^I^I^I0^I0^IAFM067XA9^Ichr7^I145^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0
      #	as taken directly out of the hg18.stsInfo2 table which was fixed
      #	by Bob and Archana
  
      # Convert the title line of the dbSTS.fa file
      #	Verify that column 3 only contains gb emb dbj
      grep "^>" dbSTS.fa | awk -F'|' '{print $3}' | sort | uniq -c
  #   39124 dbj
  #   57375 emb
  # 1212541 gb
      #	if that is true, this sed will work:
      cat dbSTS.fa \
  	| sed -e "s#^>gi.[0-9]*.gb.#>#; s#^>gi.[0-9]*.emb.#>#; s#^>gi.[0-9]*.dbj.#>#; s#\.[0-9]|.*##" \
  	    > UniSTS.convert.fa
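    #	e.g. a title line such as:
    #	    >gi|1234567|gb|G12345.1|G12345 sts sequence
    #	becomes simply:
    #	    >G12345
    #	(the gi number and accession above are illustrative)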
  
      # get accessions
      grep ">" UniSTS.convert.fa | sed -e "s/^>//" | sort > UniSTS.acc
      #	head and tail that to ensure names are reasonable, odd names would
      #	show up at the beginning or end
      wc -l UniSTS.acc
      #	1309040 UniSTS.acc
  
      # NOTE: updateStsInfo creates new stsInfo2.bed, all.primers,
      #   all.STS.fa, stsAlias.bed files
  
      updateStsInfo -verbose=1 -gb=UniSTS.acc stsInfo2.bed.prev all.STS.fa.prev \
  	UniSTS.sts UniSTS.aliases UniSTS.convert.fa new
  
      #	verify the number of aliases is reasonable:
      awk '{print $3}' new.alias | sort | uniq -c | sort -rn | less
      #	50 D7S831
      #	34 CHLC.GATA2B06.465
      #	24 CHLC.GATA11E11
      #	23 AFM276ZF5
      #	23 AFM273YH9
      #	22 SHGC-133043
      #	... etc ...
      #	verify there are no unusually long or short lines:
      awk '{printf "%d\n", length($0)}' new.info | sort -n | head -3
      #	143
      #	144
      #	144
      awk '{printf "%d\n", length($0)}' new.info | sort -n | tail -3
      #	552
      #	553
      #	644
      # check for null in the new files:
      grep -i null new.*
      #	if the new files look good, they can become the set to use:
      mv new.info stsInfo2.bed
      mv new.primers all.primers
      mv new.alias stsAlias.bed
      mv new.fa all.STS.fa
  
      # get list of all STS id's in the fasta file
      sed -n 's/^>\([0-9][0-9]*\) .*/\1/p' all.STS.fa | sort -n >  all.STS.id
      wc -l all.STS.id
      # 100520 total sequences
      # in hg18 this was: 93698 total sequences
      $HOME/kent/src/hg/stsMarkers/convertPrimerToFA all.primers > all.primers.fa
      # check that fasta file for unusual length sequences:
      faSize all.primers.fa
  # 97815329 bases (83677626 N's 14137703 real 14137703 upper 0 lower) in 317592 sequences in 1 files
  # Total size: mean 308.0 sd 279.3 min 40 (dbSTS_144) max 30000 (dbSTS_156892) median 244
  
    # Copy stsInfo2.bed and stsAlias.bed to data directory because
      # these will be loaded into the database later
      mkdir -p /hive/data/genomes/hg19/bed/sts
      cp -p stsInfo2.bed /hive/data/genomes/hg19/bed/sts/
      cp -p stsAlias.bed /hive/data/genomes/hg19/bed/sts/
  
      # Create sts sequence alignments
      mkdir /hive/data/genomes/hg19/bed/sts/split
  
      faSplit sequence all.STS.fa 100 /hive/data/genomes/hg19/bed/sts/split/sts
  
      ssh swarm
      mkdir /hive/data/genomes/hg19/bed/sts/run
      cd /hive/data/genomes/hg19/bed/sts/run
  
      #	going to run separate runs for the golden path sequence vs. the
      #	randoms, haplotypes, chrUn and chrM
    #	40,000,000 chunk sizes, 20,000 overlap
      partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
  	/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
  	| egrep -v "tParts|random|_hap|chrUn" \
  	| sed -e "s/.*2bit://;" > hg19.list
      ls -1S ../split > sts.list
  
      cat > template << '_EOF_'
  #LOOP
  runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
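    #	gensub2 expands $(file1) to each entry of hg19.list and $(root2)
    #	to the basename of each entry of sts.list; the {check out line+}
    #	clause has para verify each result file exists with at least one
    #	line before calling the job done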
  
      cat > runOne.csh << '_EOF_'
  #!/bin/csh -fe
  
  set partSpec = $1
  set query = $2.fa
  set result = $3
  set tmpFile = "/scratch/tmp/$1.$2"
  set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
  set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
  set range = `echo $start $end | awk '{print $2-$1}'`
  set chr = `echo $partSpec | sed -e "s/:.*//"`
  set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
  /bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
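#	the five tab-separated fields above form a liftUp lift spec:
#	offset  oldName  oldSize  newName  newSize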
  /bin/mkdir -p psl/$partSpec
  /bin/rm -f $tmpFile
  /cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
      /scratch/data/hg19/hg19.2bit:$partSpec \
  	../split/${query} -stepSize=5 $tmpFile.psl
  /bin/rm -f $result
  /cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
  # rm -f $tmpFile.lift $tmpFile.psl
  '_EOF_'
      # << happy emacs
      chmod +x runOne.csh
  
      gensub2 hg19.list sts.list template jobList
      #	these jobs run quickly, allow only 100 at a time
      para -maxJob=100 create jobList
  # 8367 jobs in batch
      para try ... check ... push ... etc
  # Completed: 8366 of 8366 jobs
  # CPU time in finished jobs:      89744s    1495.74m    24.93h    1.04d  0.003 y
  # IO & Wait Time:                 25467s     424.44m     7.07h    0.29d  0.001 y
  # Average job time:                  14s       0.23m     0.00h    0.00d
  # Longest finished job:              53s       0.88m     0.01h    0.00d
  # Submission to last job:          1592s      26.53m     0.44h    0.02d
  
      #	and, run the randoms as a separate run:
      mkdir /hive/data/genomes/hg19/bed/sts/run.randoms
      cd /hive/data/genomes/hg19/bed/sts/run.randoms
      partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
  	/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
  	| egrep "tParts|random|_hap|chrUn"
      cat tParts/* | sed -e "s/.*2bit://;" > hg19.list
      ls -1S ../split > sts.list
      cat > template << '_EOF_'
  #LOOP
  runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      cat > runOne.csh << '_EOF_'
  #!/bin/csh -fe
  
  set partSpec = $1
  set query = $2.fa
  set result = $3
  set tmpFile = "/scratch/tmp/$1.$2"
  /bin/mkdir -p psl/$partSpec
  /bin/rm -f $tmpFile
  /cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
      /scratch/data/hg19/hg19.2bit:$partSpec \
  	../split/${query} -stepSize=5 $tmpFile.psl
  /bin/rm -f $result
  mv $tmpFile.psl $result
  /bin/rm -f $tmpFile.psl
  '_EOF_'
      # << happy emacs
      chmod +x runOne.csh
  
      gensub2 hg19.list sts.list template jobList
      #	these jobs run quickly, allow only 100 at a time
      para -maxJob=100 create jobList
  # 6486 jobs in batch
      para try ... check ... push ... etc
  # Completed: 6486 of 6486 jobs
  # CPU time in finished jobs:       2206s      36.77m     0.61h    0.03d  0.000 y
  # IO & Wait Time:                 16505s     275.08m     4.58h    0.19d  0.001 y
  # Average job time:                   3s       0.05m     0.00h    0.00d
  # Longest finished job:              21s       0.35m     0.01h    0.00d
  # Submission to last job:           601s      10.02m     0.17h    0.01d
  
      # Compile sts sequence results
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/sts/run
      time pslSort dirs raw.psl temp psl/chr*
      #	8366 files in 89 dirs
      #	Got 8366 files 91 files per mid file
      #	real    8m50.714s
      #	-rw-rw-r--  1 810438277 May  1 11:45 raw.psl
      cd /hive/data/genomes/hg19/bed/sts/run.randoms
      time pslSort dirs raw.psl temp psl/chr*
      #	6486 files in 69 dirs
      #	Got 6486 files 81 files per mid file
      #	real    1m42.120s
      #	-rw-rw-r--  1 18378188 May  1 11:52 raw.psl
  
      rmdir temp
      cd /hive/data/genomes/hg19/bed/sts
      cat run*/raw.psl | egrep -v "^$|^psLayout|^match|^ |^-" \
  	| pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons stdin \
  	stsMarkers.psl /dev/null
      #	Processed 7412166 alignments
      #	-rw-rw-r-- 1 12031760 May  1 11:57 stsMarkers.psl
  
      $HOME/kent/src/hg/stsMarkers/extractPslInfo -h stsMarkers.psl
      # creates stsMarkers.psl.initial
      #	-rw-rw-r-- 1  4485053 May  1 12:06 stsMarkers.psl.initial
      wc -l stsMarkers.psl.initial
      #	101338  stsMarkers.psl.initial
      #	this command needs a chrom_names file to work correctly with this
      #	new style of layout for hg19:
      cd /hive/data/genomes/hg19
      cut -f1 chrom.sizes | sed -e "s/chr//" > chrom_names
      cd /hive/data/genomes/hg19/bed/sts
  
      $HOME/kent/src/hg/stsMarkers/findAccession.pl -agp stsMarkers.psl.initial \
  	/cluster/data/hg19
      wc -l stsMarkers.psl.initial.acc
      #	101338  stsMarkers.psl.initial.acc
  
      sort -k4,4n stsMarkers.psl.initial.acc > stsMarkers.final
  
      # determine found markers (4th field in file)
      cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
      wc -l stsMarkers.found
      #	96472 stsMarkers.found
      #	out of 100520 total sequences from:
      wc -l /hive/data/outside/ncbi/sts.2009-04/all.STS.id
      #	There are lots of duplicates:
      wc -l stsMarkers.final
      #	101338 stsMarkers.final
    #	And some alignments are completely haywire; count those spanning
    #	under 1 kb:
      awk '$3-$2 < 1001' stsMarkers.final | wc -l
      #	98382
      #	filter out markers that are too long
      awk '$3-$2 < 1001' stsMarkers.final > stsMarkers.1K.size.filtered
  
      #  alignment of primers
      ssh swarm
      cd /hive/data/outside/ncbi/sts.2009-04
      awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
  	    all.primers > all.primers.ispcr
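    #	keeps only primer pairs whose lines contain nothing but ACGT,
    #	digits, dashes and tabs, and whose two primer sequences are each
    #	longer than 10 bases; output is isPcr input format:
    #	name<tab>forwardPrimer<tab>reversePrimer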
      mkdir primerAlign
      cd primerAlign
      mkdir split
      cd split
      split -l 5000 ../../all.primers.ispcr primer_
      ls > ../primer.list
  
      cd ..
      #	we need a 10.ooc file for this business
      time blat /scratch/data/hg19/hg19.2bit \
  	/dev/null /dev/null -tileSize=10 -makeOoc=10.ooc -repMatch=1024
  # Wrote 146902 overused 10-mers to 10.ooc
  # real    19m16.758s
  
      # separate runs for whole genome vs. randoms
      mkdir run
      cd run
      partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
  	/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
  	| egrep -v "tParts|random|_hap|chrUn" \
  	| sed -e "s/.*2bit://;" > hg19.list
      cat > runOne.csh << '_EOF_'
  #!/bin/csh -fe
  
  set partSpec = $1
  set primer = ../split/$2
  set result = $3
  set tmpFile = "/scratch/tmp/$1.$2"
  set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
  set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
  set range = `echo $start $end | awk '{print $2-$1}'`
  set chr = `echo $partSpec | sed -e "s/:.*//"`
  set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
  /bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
  /bin/mkdir -p psl/$partSpec
  /bin/rm -f $tmpFile.psl
  /cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
      -ooc=/hive/data/outside/ncbi/sts.2009-04/primerAlign/10.ooc -stepSize=5 \
  	/scratch/data/hg19/hg19.2bit:$partSpec $primer $tmpFile.psl
  /bin/rm -f $result
  /cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
  rm -f $tmpFile.lift $tmpFile.psl
  '_EOF_'
      # << happy emacs
      chmod +x runOne.csh
  
      cat > template << '_EOF_'
  #LOOP
  runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      gensub2 hg19.list ../primer.list template jobList
      para create jobList
  # 5696 jobs in batch
      para try ... check ... push ... etc
  # Completed: 5696 of 5696 jobs
  # CPU time in finished jobs:     203899s    3398.32m    56.64h    2.36d  0.006 y
  # IO & Wait Time:                 22049s     367.48m     6.12h    0.26d  0.001 y
  # Average job time:                  40s       0.66m     0.01h    0.00d
  # Longest finished job:            5314s      88.57m     1.48h    0.06d
  # Submission to last job:          5418s      90.30m     1.50h    0.06d
  # Estimated complete:                 0s       0.00m     0.00h    0.00d
  
      #	sort and filter the results
      cd psl
      pslSort dirs raw.psl temp chr*
      #	5696 files in 89 dirs
      #	Got 5696 files 75 files per mid file
      #	-rw-rw-r-- 1 456802973 May  4 13:32 raw.psl
      cd ..
      mkdir filter
      pslQuickFilter -minMatch=26 -maxMismatch=5 \
          -maxTinsert=5000 -verbose psl/ filter/
      #	-rw-rw-r-- 1 50302564 May  4 13:35 raw.psl
  
      #	And, for the randoms
      mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/runRandoms
      cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/runRandoms
  
      partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
  	/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
  	| egrep "tParts|random|_hap|chrUn" \
  	| sed -e "s/.*2bit://;" > hg19.list
      cat tParts/* | sed -e "s/.*2bit://;" > hg19.list
      cat tParts/* > hg19.list
  
      cat > runOne.csh << '_EOF_'
  #!/bin/csh -fe
  
  set partSpec = $1
  set primer = ../split/$2
  set result = $3
  set tmpFile = "/scratch/tmp/$1.$2"
  /bin/mkdir -p psl/$partSpec
  /bin/rm -f $tmpFile.psl
  /cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
      -ooc=/hive/data/outside/ncbi/sts.2009-04/primerAlign/10.ooc -stepSize=5 \
  	/scratch/data/hg19/hg19.2bit:$partSpec $primer $tmpFile.psl
  /bin/rm -f $result
  mv $tmpFile.psl $result
  '_EOF_'
      # << happy emacs
      chmod +x runOne.csh
  
      #	can not use line+ check here, many of them are empty
      cat > template << '_EOF_'
  #LOOP
  runOne.csh $(file1) $(root2) {check out line psl/$(file1)/$(root2).psl}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      gensub2 hg19.list ../primer.list template jobList
      #	they run quickly, limit to 100
      para -maxJob=100 create jobList
      para try ... check ... push ... etc
  # Completed: 4416 of 4416 jobs
  # CPU time in finished jobs:       1746s      29.09m     0.48h    0.02d  0.000 y
  # IO & Wait Time:                 11407s     190.12m     3.17h    0.13d  0.000 y
  # Average job time:                   3s       0.05m     0.00h    0.00d
  # Longest finished job:               8s       0.13m     0.00h    0.00d
  # Submission to last job:           147s       2.45m     0.04h    0.00d
  
      #	sort and filter the results
      cd psl
      pslSort dirs raw.psl temp chr*
      #	4416 files in 69 dirs
      #	Got 4416 files 66 files per mid file
      rmdir temp
      #	-rw-rw-r-- 1 9066053 May  4 13:31 raw.psl
  
      #	putting the two runs together
      mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/psl
      cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/psl
      ln -s ../run/filter/raw.psl run.psl
      ln -s ../runRandoms/filter/raw.psl runRandoms.psl
      #	-rw-rw-r-- 1 50302564 May  4 13:35 run.psl
      #	-rw-rw-r-- 1   825973 May  4 13:35 runRandoms.psl
      cd ..
      pslSort dirs primers.psl temp psl
      #	2 files in 1 dirs
      #	Got 2 files 1 files per mid file
      #	-rw-rw-r-- 1 51128110 May  4 13:39 primers.psl
      wc -l primers.psl
      #	448107 primers.psl
      rmdir temp
      pslFilterPrimers primers.psl ../all.primers primers.filter.psl
    # creates primers.filter.psl.notfound.primers
      wc -l primers*
      #	237962 primers.filter.psl
      #	97191 primers.filter.psl.notfound.primers
  
      #	see if ePCR can find some of these notfound
      ssh swarm
      mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/epcr
      cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/epcr
  
      mkdir split
      cd split
      split -l 5000 ../../primers.filter.psl.notfound.primers  primers_
      cd ..
      ls -1S split > primers.lst
      partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
  	/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
  	| grep -v tParts | sed -e "s/.*2bit://;" > hg19.list
      cat tParts/* | sed -e "s/.*2bit://;" >> hg19.list
  
      cat > runOne.csh << '_EOF_'
  #!/bin/csh -fe
  
  set partSpec = $1
  set primer = split/$2
  set result = $3
  set tmpFile = "/scratch/tmp/$1.$2"
  set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
  set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
  set range = `echo $start $end | awk '{print $2-$1}'`
  set chr = `echo $partSpec | sed -e "s/:.*//"`
  set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
  /bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
  /bin/mkdir -p epcr/$partSpec
  /bin/rm -f $tmpFile.psl
  twoBitToFa /scratch/data/hg19/hg19.2bit:$partSpec $tmpFile.fa
  /cluster/bin/scripts/runEpcr64 $primer $tmpFile.fa $tmpFile.epcr
  /bin/rm -f $result
  /bin/mv $tmpFile.epcr $result
  rm -f $tmpFile.fa $tmpFile.lift $tmpFile.psl $tmpFile.*
  '_EOF_'
      # << happy emacs
      chmod +x runOne.csh
  
      cat > template << '_EOF_'
  #LOOP
  runOne.csh $(file1) $(root2) {check out line epcr/$(file1)/$(root2).epcr}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      gensub2 hg19.list primers.lst template jobList
      para create jobList
  	# 3160 jobs
      para try ... check ... push ... etc ...
  # Completed: 3160 of 3160 jobs
  # CPU time in finished jobs:      86253s    1437.54m    23.96h    1.00d  0.003 y
  # IO & Wait Time:                 11196s     186.61m     3.11h    0.13d  0.000 y
  # Average job time:                  31s       0.51m     0.01h    0.00d
  # Longest finished job:              89s       1.48m     0.02h    0.00d
  # Submission to last job:           237s       3.95m     0.07h    0.00d
  
      find ./epcr -type f | xargs cat > all.epcr
      wc -l all.epcr
      #	797286 all.epcr
      # convert the coordinates from the partitionSequence.pl to a lift file
      awk '{print $1}' all.epcr | sort -u > hg19.partSpec.txt
      $HOME/kent/src/hg/stsMarkers/liftFromSpec.pl hg19 hg19.partSpec.txt \
  	> all.epcr.lift
      cat all.epcr | sed -e "s/\.\./ /; s/  */\t/g" \
  	| liftUp -type=.bed stdout all.epcr.lift error stdin \
  	| awk '
  {
  printf "%s %d..%d %d %d\n", $1, $2, $3, $4, $5
  }
  ' > all.epcr.lifted
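    #	the pipeline above lifts the partitioned coordinates back to
    #	chromosome coordinates, then the awk restores ePCR's native
    #	"chrom start..end" output format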
  
    /cluster/home/hiram/bin/x86_64/pslFilterPrimers -epcr=all.epcr.lifted \
	-verbose=1 ../primers.psl ../../all.primers epcr.primers.psl
      #	this took a long time, many hours
  # -rw-rw-r--   1  2785254 May  5 17:28 epcr.not.found
  # -rw-rw-r--   1 27343510 May  5 17:28 epcr.primers.psl
  # -rw-rw-r--   1  1616885 May  5 17:28 epcr.primers.psl.notfound.primers
  
    time $HOME/kent/src/hg/stsMarkers/epcrToPsl epcr.not.found \
	../../all.primers /hive/data/genomes/hg19
      #	real    69m38.444s
      #	-rw-rw-r--   1        0 May  6 14:18 epcr.not.found.nomatch
      #	-rw-rw-r--   1  8369138 May  6 15:26 epcr.not.found.psl
  
      #	combining everything together now
      cd /hive/data/outside/ncbi/sts.2009-04/primerAlign
  
      sort -u primers.filter.psl epcr/epcr.primers.psl epcr/epcr.not.found.psl \
                  | sort -k15,15 -k17,17n > primers.final.psl
      wc -l primers.final.psl
      #	310705 primers.final.psl
  
      time $HOME/kent/src/hg/stsMarkers/fixPrimersQueryGaps.pl \
          ../all.primers primers.final.psl > primers.final.fix.psl
      #	real    0m19.580s
      wc -l primers.final.fix.psl
      #	310705 primers.final.fix.psl
  
      # Extract relevant info, make alignments unique, and create final file to
      #	be merged with full sequence alignments
      $HOME/kent/src/hg/stsMarkers/extractPslInfo -h primers.final.fix.psl
      #	real    0m15.303s
      #	-rw-rw-r-- 1 15660447 May  6 15:44 primers.final.fix.psl.initial
      wc -l primers.final.fix.psl.initial
      #	308210 primers.final.fix.psl.initial
      $HOME/kent/src/hg/stsMarkers/findAccession.pl -agp \
  	primers.final.fix.psl.initial /hive/data/genomes/hg19
      wc -l primers.final.fix.psl.initial.acc
      #	308210 primers.final.fix.psl.initial.acc
  
      $HOME/kent/src/hg/stsMarkers/getStsId ../stsInfo2.bed \
  	primers.final.fix.psl.initial.acc | sort -k 4n > primers.final
      wc -l primers.final
      # 308210 primers.final
      #	There doesn't appear to be any use for this primers.ids list
      #	except for curiosity.  Check the head and tail of this list to
      #	verify no garbage is in here.  There should just be numbers.
      awk '{print $4}' primers.final | sort -n | uniq > primers.ids
      wc -l primers.ids
      #	290961 primers.ids
  
      # Merge primer and sequence files to create final bed file
      # Merge (combineSeqPrimerPos) takes about an hour to run
      cd /hive/data/genomes/hg19/bed/sts
      time $HOME/kent/src/hg/stsMarkers/combineSeqPrimerPos stsMarkers.final \
  	/hive/data/outside/ncbi/sts.2009-04/primerAlign/primers.final
      #	real    0m12.310s
      #	-rw-rw-r-- 1 15222346 May  6 15:55 stsMarkers_pos.rdb
      wc -l stsMarkers_pos.rdb
      #	315308 stsMarkers_pos.rdb
  
      time /cluster/bin/scripts/createSTSbed \
  	/hive/data/outside/ncbi/sts.2009-04/stsInfo2.bed  \
  	stsMarkers_pos.rdb > stsMap.bed
      #	real    0m31.886s
      #	-rw-rw-r-- 1 38244880 May  6 16:25 stsMap.bed
      wc -l stsMap.bed
      #	305914 stsMap.bed
  
      # Set up sequence files
      ssh hgwdev
      mkdir /gbdb/hg19/sts.11/
      ln -s /hive/data/outside/ncbi/sts.11/all.STS.fa \
  	/gbdb/hg19/sts.11/all.STS.fa
      ln -s /hive/data/outside/ncbi/sts.11/all.primers.fa \
          /gbdb/hg19/sts.11/all.primers.fa
  
      # Load all files
      cd /hive/data/genomes/hg19/bed/sts
      hgLoadSeq hg19 /gbdb/hg19/sts.11/all.STS.fa /gbdb/hg19/sts.11/all.primers.fa
      #	Creating seq.tab file
      #	Adding /gbdb/hg19/sts.11/all.STS.fa
      #	100520 sequences
      #	Adding /gbdb/hg19/sts.11/all.primers.fa
      #	317592 sequences
      #	Updating seq table
      #	Advisory lock has been released
      #	All done
  
  
      hgsql hg19 < $HOME/kent/src/hg/lib/stsInfo2.sql
      hgsql hg19 < $HOME/kent/src/hg/lib/stsAlias.sql
      #	these files already exist here from previous operations
      # cp -p /hive/data/outside/ncbi/sts.11/{stsInfo2.bed,stsAlias.bed} .
      hgsql hg19 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
      hgsql hg19 -e 'load data local infile "stsAlias.bed" into table stsAlias'
      #	a couple minutes for each load above
      #	filter the stsMap.bed to eliminate items longer than 5,000 bases,
      #	takes out about 850:
      awk '$3-$2 < 5001' stsMap.bed | sort -k1,1 -k2,2n \
  	> stsMap.filtered.5000.bed
  
      hgLoadBed -notItemRgb -noBin -tab \
  	-sqlTable=$HOME/kent/src/hg/lib/stsMap.sql hg19 stsMap \
  	    stsMap.filtered.5000.bed
      #	Loaded 305064 elements of size 28
  
      ln -s \
  /hive/data/outside/ncbi/sts.2009-04/primerAlign/primers.final.fix.psl \
  	primers.psl
  
      hgLoadPsl -nobin -table=all_sts_primer hg19 primers.psl
      hgLoadPsl -nobin -table=all_sts_seq hg19 stsMarkers.psl
  
  ##############################################################################
  # FISH CLONES (WORKING - 2009-04-29 - Hiram)
  # The STS Marker and BAC End Pairs tracks must be completed prior to
  # creating this track.
  
      mkdir /hive/data/outside/ncbi/fishClones/fishClones.2009-04/
      cd /hive/data/outside/ncbi/fishClones/fishClones.2009-04/
  
  # Download information from NCBI
          # point browser at:
  #   http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
  # change "Sequence tag:" to "placed on contig"
          # change "Show details on sequence-tag" to "yes"
          # change "Download or Display" to "Download table for UNIX"
          # press Submit - save as
  # /hive/data/outside/ncbi/fishClones/fishClones.2009-04/hbrc.txt
      chmod 664 /hive/data/outside/ncbi/fishClones/fishClones.2009-04/hbrc.txt
  
  #	Unfortunately the format of this hbrc file has changed since
  #	last time.  The columns have been rearranged, and one important
  #	column is missing, the contig information.  So, let's see if we
  #	can recover the original format by putting this together with
  #	some other things we have here.
      $HOME/kent/src/hg/fishClones/fixup.hbrc.pl hbrc.txt \
  	/hive/data/genomes/hg19/bed/fishClones/seq_clone.pmd > fixed.hbrc.txt \
  	    2> dbg
  XXX - need to get this seq_clone.pmd from NCBI, maybe Paul Kitts
      #	the seq_clone.pmd file was obtained via email from Wonhee Jang
      #	jang at ncbi.nlm.nih.gov - I have asked for clarification where
      #	such a file can be fetched without resorting to email.
  
  # Get current clone/accession information
      wget --timestamping ftp://ftp.ncbi.nih.gov/repository/clone/reports/clac.out
    #	also listed at: http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
  
  # Create initial Fish Clones bed file
      ssh kkstore02
      mkdir /hive/data/genomes/hg19/bed/fishClones
      cd /hive/data/genomes/hg19/bed/fishClones
  
      # Copy previous sts info from fhcrc
      cp -p /hive/data/genomes/hg18/bed/fishClones/fhcrc.sts .
      #	This fhcrc.sts listing doesn't change.  It is merely a listing
      #	of aliases that remain in effect.
  
    #	Create cl_acc_gi_len file from cloneend information:
      grep -v "^#" /hive/data/genomes/hg19/bed/cloneend/all.txt \
      | awk '{gsub(".[0-9]*$", "", $2);
  	printf "%s\t%s\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$8}' > cl_acc_gi_len
  
      hgsql -N \
  	-e "select chrom,chromStart,chromEnd,contig from ctgPos;" hg19 \
  	| sort -k1,1 -k2,2n > ctgPos.bed
      hgsql -N \
  -e "select chrom,chromStart,chromEnd,frag,0,strand from gold;" hg19 \
  	| sort -k1,1 -k2,2n > gold.bed
      hgsql -N \
  -e "select tName,tStart,tEnd,qName,0,strand from all_bacends;" hg19 \
  	| sort -k1,1 -k2,2n > all_bacends.bed
      hgsql -N \
  -e "select chrom,chromStart,chromEnd,name,score,strand from bacEndPairs;" hg19 \
  	| sort -k1,1 -k2,2n > bacEndPairs.bed
  
  
  
      ssh hgwdev
      #	have to be on hgwdev for this since it is going to read from the
      #	database.  Had to work on this program to get it past what is
      #	evidently a bad entry in hbrc.fixed where columns of information
      #	are missing for one clone in particular
      time fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg19 \
  	/hive/data/genomes/hg19/bed/ncbiCytoBand/contig/fixed.hbrc.txt \
  	/hive/data/outside/ncbi/fishClones/fishClones.2009-04/clac.out \
           ./cl_acc_gi_len \
  	/hive/data/genomes/hg19/bed/bacends/bacEnds.load.psl \
              fishClones
      #	real    2m4.708s
  # Reading Fish Clones file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/hbrc.fixed
  # reading fishInfo file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt
  # Reading Clone/Acc (clac.out) file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/clac.out
  # Reading BAC Ends file ./cl_acc_gi_len
  # Reading BAC Ends psl file /hive/data/genomes/hg19/bed/bacends/bacEnds.lifted.psl
  # Reading additional STS Marker links fhcrc.sts
  # Determining good positions
  #	findClonePos: determining positions of fish clones
  # Writing output file
  # ERROR: at line # 170, no cytoband info for chrX:104048913-104206974
  # RP11-79L11
  # ERROR: at line # 171, no cytoband info for chrX:104048913-104206974
  # RP11-79L11
  
      # Load the track
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/fishClones
      hgLoadBed -notItemRgb -noBin -tab \
          -sqlTable=$HOME/kent/src/hg/lib/fishClones.sql \
  	hg19 fishClones fishClones.bed
      #	Loaded 9461 elements of size 16
  
  ##############################################################################
  # CytoBands from Wonhee Jang at NCBI (DONE - 2009-06-10 - Hiram)
  
      mkdir /hive/data/genomes/hg19/bed/ncbiCytoBand
      cd /hive/data/genomes/hg19/bed/ncbiCytoBand
      #	received the following files via email:
      ls -ogrt
  # -rw-rw-r-- 1 187930 Jun 10 13:53 ideogram
  # -rw-rw-r-- 1 672327 Jun  8 09:55 fish.markers.bed
  
      #	created cytobands.bed from the ideogram file with:
      cat << '_EOF_' > ideoToCytoBand.pl
  #!/usr/bin/env perl
  
  use strict;
  use warnings;
  
  open (FH,"<ideogram") or die "can not read ideogram";
  
  while (my $line = <FH>) {
      next if $line =~ m/^#/;
      chomp $line;
      my ($chr, $arm, $location, $a, $b, $start, $end, $stain) =
          split('\s+',$line);
      next if ($location =~ m/[a-z]$/);
    $stain =~ s/\r//g;	# strip any stray carriage return from the stain field
      $start -= 1 if ($start == 1);
      printf "chr%s\t%d\t%d\t%s%s\t%s\n", $chr, $start, $end, $arm, $location,
          $stain;
  }
  
  close (FH);
  '_EOF_'
      # << happy emacs
      chmod +x ideoToCytoBand.pl
      ./ideoToCytoBand.pl > cytobands.bed
  
      hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBand.sql \
          hg19 cytoBand cytobands.bed
  
      hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBandIdeo.sql \
          hg19 cytoBandIdeo cytobands.bed
      #	checking coverage:
      featureBits -noRandom -noHap -countGaps hg19 cytoBand
      #	3095677412 bases of 3095693983 (99.999%) in intersection
      #	that is everything except chrM:
      echo 3095693983-3095677412 | bc -q
      #	16571
  
  ##########################################################################
  # CYTOBANDIDEO update -  (DONE - 2013-02-27 - kuhn)
  # adding rows for chroms with no cytology
  # this is just for navigation/orientation on those chroms
  
      set db=hg19
      set sql=~/kent/src/hg/lib/cytoBandIdeo.sql
      # make backup of existing table
      hgsql -e "CREATE TABLE cytoBandIdeoCopy SELECT * FROM cytoBandIdeo" $db
      # dump existing table
      hgsql -N -e "SELECT * FROM cytoBandIdeo" $db > $db.cytoBandIdeo
  
      # find chroms already covered
      hgsql -N -e 'SELECT chrom FROM cytoBandIdeo' $db \
         | sort -u > $db.coveredNames
      # make cytoBand records for chroms not already covered
      hgsql -N -e 'SELECT chrom, size FROM chromInfo' $db \
        | grep -wvf $db.coveredNames \
        | awk '{print $1"\t0\t"$2"\t\tgneg"}' > $db.cytoBandNew
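    # each new row spans a whole chromosome with an empty name and a
    # gneg stain, e.g. (using the hg19 size of this haplotype chrom):
    # chr17_ctg5_hap1	0	1680828		gneg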
      # check
      wc -l $db.*
      # combine and sort
      cat $db.cytoBandNew $db.cytoBandIdeo > $db.cytoBandIdeoFull
      bedSort $db.cytoBandIdeoFull $db.cytoBandIdeoFull
    # replace existing table
      hgsql -e "DROP TABLE cytoBandIdeo" $db
      hgLoadSqlTab $db cytoBandIdeo $sql $db.cytoBandIdeoFull
      # check and then drop copy
  
  ##############################################################################
  # UCSC to Ensembl chr name mapping (DONE - 2009-05-08 - Hiram)
      #	new names as of Ensembl version 57, see below
      mkdir /hive/data/genomes/hg19/ensembl
      cd /hive/data/genomes/hg19/ensembl
      wget --timestamping \
  	'ftp://ftp.ensembl.org/pub/pre/homo_sapiens/GRCh37/dna/*'
    #	do not need the repeat masker sequence (although it would be
    #	interesting to measure how it compares)
      rm -f *.dna_rm.*
      #	fortunately we have the same sizes as Ensembl for everything
      #	(except the haplotypes) and the sizes are unique for each sequence
      #	so we can relate the names via their sizes
      mkdir /hive/data/genomes/hg19/bed/ucscToEnsembl
      cd /hive/data/genomes/hg19/bed/ucscToEnsembl
      #	the toplevel file is a duplicate of everything else
      ls /hive/data/genomes/hg19/ensembl/*.fa.gz | grep -v toplevel \
  	| while read F
  do
      zcat "${F}"
  done | faCount stdin > faCount.txt
  
      cat << '_EOF_' > relateUcscEnsembl.pl
  #!/usr/bin/env perl
  
  use strict;
  use warnings;
  
  my %ucscChrs;   # key is size, value is UCSC chr name
  
  open (FH,"<../../chrom.sizes") or die "can not read ../../chrom.sizes";
  while (my $line = <FH>) {
      chomp $line;
      my ($chr, $size) = split('\s+', $line);
      die "'$line\n'duplicate size in ../chrom.sizes" if (exists($ucscChrs{$size})
  );
      $ucscChrs{$size} = $chr;
  }
  close (FH);
  
  my %ensemblChrs;        # key is size, value is Ensembl chr name
  
  open (FH,"<faCount.txt") or die "can not read faCount.txt";
  while (my $line = <FH>) {
      next if ($line =~ m/#/);
      next if ($line =~ m/total/);
      chomp $line;
      my ($chr, $size, $rest) = split('\s+', $line, 3);
      die "'$line\n'duplicate size in faCount.txt" if (exists($ensemblChrs{$size})
  );
      $ensemblChrs{$size} = $chr;
  }
  close (FH);
  
  my %usedUcscChrs;
  my %usedEnsemblChrs;
  my %ensemblTranslate; # key is Ensembl name, value is UCSC size
  foreach my $size (keys %ucscChrs) {
      if (exists($ensemblChrs{$size})) {
          $usedUcscChrs{$size} = $ucscChrs{$size};
          $usedEnsemblChrs{$size} = $ensemblChrs{$size};
          printf "%s\t%s\t%d\n", $ucscChrs{$size}, $ensemblChrs{$size}, $size;
      } else {
          my $ucscName = $ucscChrs{$size};
          my $ensemblName = "unens";
          if ($ucscName =~ m/^chr6/) {
              $ucscName =~ s/_hap.//;
              $ucscName =~ s/chr6_/chr6_mhc_/;
              $ensemblName = "HS" . uc($ucscName);
          } elsif ($ucscName =~ m/^chr17_/ || $ucscName =~ m/^chr4_/) {
              $ucscName =~ s/_.*/_1/;
              $ensemblName = "HS" . uc($ucscName);
          } elsif ($ucscName =~ m/^chrM/) {
              print "# no translation for chrM\n";
          } else {
              die "unens UCSC chr name: $ucscName";
          }
          printf "# ucsc $ucscChrs{$size} -> $ensemblName\n";
          $ensemblTranslate{$ensemblName} = $size;
      }
  }
  
  foreach my $size (keys %ensemblChrs) {
      if (!exists($usedEnsemblChrs{$size})) {
          my $ensemblName = $ensemblChrs{$size};
          if (! exists($ensemblTranslate{$ensemblName})) {
              die "can not translate Ensembl name $ensemblName";
          } else {
              my $ucscSize = $ensemblTranslate{$ensemblName};
              printf "%s\t%s\t%d\t%d\n", $ucscChrs{$ucscSize}, $ensemblChrs{$size}
  , $ucscSize, $size;
          }
      }
  }
  
  printf "chrM\tMT\n";
  '_EOF_'
      # << happy emacs
      chmod +x relateUcscEnsembl.pl
  
      ./relateUcscEnsembl.pl  2>&1 | grep -v "^#" \
  	| awk '{printf "%s\t%s\n", $1, $2}' | sort > ucscToEnsembl.tab
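    #	spot check the result; pairs look like:
    #	chr1	1
    #	chrM	MT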
  
      cat << '_EOF_' > ucscToEnsembl.sql
  # UCSC to Ensembl chr name translation
  CREATE TABLE ucscToEnsembl (
      ucsc varchar(255) not null,        # UCSC chromosome name
      ensembl varchar(255) not null,     # Ensembl chromosome name
                #Indices
      PRIMARY KEY(ucsc(21))
  );
  '_EOF_'
  
      hgsql hg19 < ucscToEnsembl.sql
      hgsql hg19 \
  -e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl'
  
      awk '{printf "%s\t%d\n", $2, -$1}' ../../jkStuff/ensGene.haplotype.lift \
  	> ensemblLift.tab
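    #	the offsets in ensGene.haplotype.lift run in the opposite
    #	direction, so negating them above presumably yields the value to
    #	add to a UCSC position to obtain the Ensembl coordinate, per the
    #	ensemblLift table comment below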
  
      cat << '_EOF_' > ensemblLift.sql
  # UCSC offset to Ensembl coordinates
  CREATE TABLE ensemblLift (
      chrom varchar(255) not null,      # Ensembl chromosome name
      offset int unsigned not null,     # offset to add to UCSC position
                #Indices
      PRIMARY KEY(chrom(15))
  );
  '_EOF_'
  
      hgsql hg19 < ensemblLift.sql
      hgsql hg19 \
  -e 'LOAD DATA LOCAL INFILE "ensemblLift.tab" INTO TABLE ensemblLift'
  
  ##############################################################################
  # LASTZ MOUSE Mm9 (DONE - 2009-05-13 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
      cd /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
  
      cat << '_EOF_' > DEF
  # human vs mouse
  BLASTZ_ABRIDGE_REPEATS=1
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_SMSK=/scratch/data/hg19/linSpecRep/lineageSpecificRepeats
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Mouse Mm9
  SEQ2_DIR=/scratch/data/mm9/nib
  SEQ2_SMSK=/scratch/data/mm9/notInOthers
  SEQ2_LEN=/scratch/data/mm9/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
      cat fb.hg19.chainMm9Link.txt
      #	1022734273 bases of 2897316137 (35.299%) in intersection
  
      #	and the swap
      mkdir /hive/data/genomes/mm9/bed/blastz.hg19.swap
      cd /hive/data/genomes/mm9/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzMm9.2009-05-13/DEF \
  	-swap -noLoadChainSplit -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
      #	real    131m58.763s
      cat fb.mm9.chainHg19Link.txt
      #	1013880568 bases of 2620346127 (38.693%) in intersection
  
  #########################################################################
  # LASTZ Dog CanFam2 (DONE - 2009-05-13 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
      cd /hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
  
      cat << '_EOF_' > DEF
  # human vs dog
  BLASTZ_ABRIDGE_REPEATS=1
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_SMSK=/scratch/data/hg19/linSpecRep/lineageSpecificRepeats
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Dog CanFam2
  SEQ2_DIR=/scratch/data/canFam2/nib
  SEQ2_LEN=/scratch/data/canFam2/chrom.sizes
  SEQ2_SMSK=/scratch/scratch/data/canFam2/linSpecRep.notInHuman
  SEQ2_IN_CONTIGS=0
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
      cat fb.hg19.chainCanFam2Link.txt
      #	1532073507 bases of 2897316137 (52.879%) in intersection
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/canFam2/bed/blastz.hg19.swap
      cd /hive/data/genomes/canFam2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13/DEF \
  	-noLoadChainSplit -swap \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
      #	real    200m17.158s
      cat fb.canFam2.chainHg19Link.txt
      #	1480018167 bases of 2384996543 (62.055%) in intersection
  #########################################################################
  # LASTZ Chicken GalGal3 (DONE - 2009-05-13 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
      cd /hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
  
      cat << '_EOF_' > DEF
  # human vs chicken
  # Specific settings for chicken (per Webb email to Brian Raney)
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=10000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  BLASTZ_ABRIDGE_REPEATS=1
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
  SEQ2_DIR=/scratch/data/galGal3/nib
  SEQ2_LEN=/scratch/data/galGal3/chrom.sizes
  SEQ2_SMSK=/scratch/data/galGal3/linSpecRep
  SEQ2_CHUNK=200000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet \
  	-noLoadChainSplit \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
      cat fb.hg19.chainGalGal3Link.txt
      #	104053179 bases of 2897316137 (3.591%) in intersection
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/galGal3/bed/blastz.hg19.swap
      cd /hive/data/genomes/galGal3/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13/DEF \
  	-swap \
  	-noLoadChainSplit \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
      #	real    16m45.090s
      cat fb.galGal3.chainHg19Link.txt
      #	91605899 bases of 1042591351 (8.786%) in intersection
  
  #########################################################################
  # LASTZ Macaca Mulatta RheMac2 (DONE - 2009-05-13 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
      cd /hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
  
      cat << '_EOF_' > DEF
  # human vs macaca mulatta
  BLASTZ=lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  # and place those items here
  BLASTZ_O=600
  BLASTZ_E=150
  # other parameters from panTro2 vs hg18 lastz on advice from Webb
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: Macaca Mulatta RheMac2
  SEQ2_DIR=/scratch/data/rheMac2/rheMac2.2bit
  SEQ2_LEN=/scratch/data/rheMac2/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    760m22.810s
      cat fb.hg19.chainRheMac2Link.txt
      #	2397361211 bases of 2897316137 (82.744%) in intersection
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/rheMac2/bed/blastz.hg19.swap
      cd /hive/data/genomes/rheMac2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13/DEF \
  	-swap \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> swap.log 2>&1 &
      #	real    83m51.483s
      cat fb.rheMac2.chainHg19Link.txt
      #	2313806886 bases of 2646704109 (87.422%) in intersection
  #########################################################################
  # LASTZ Rat Rn4 (DONE - 2009-05-13 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
      cd /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
  
      cat << '_EOF_' > DEF
  # human vs rat
  BLASTZ_ABRIDGE_REPEATS=1
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Rat Rn4
  SEQ2_DIR=/scratch/data/rn4/nib
  SEQ2_SMSK=/scratch/data/rn4/linSpecRep.notInHuman
  SEQ2_LEN=/scratch/data/rn4/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet -noLoadChainSplit \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
      #	real    314m18.227s
      cat fb.hg19.chainRn4Link.txt
      #	952605822 bases of 2897316137 (32.879%) in intersection
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/rn4/bed/blastz.hg19.swap
      cd /hive/data/genomes/rn4/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzRn4.2009-05-13/DEF \
  	-swap -noLoadChainSplit \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
      #	real    188m0.163s
      cat fb.rn4.chainHg19Link.txt
      #	947862300 bases of 2571531505 (36.860%) in intersection
  ##############################################################################
  # LASTZ Orangutan PonAbe2 (DONE - 2009-05-13 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
      cd /hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
  
      cat << '_EOF_' > DEF
  # human vs orangutan
  BLASTZ=lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  # and place those items here
  BLASTZ_O=600
  BLASTZ_E=150
  # other parameters from panTro2 vs hg18 lastz on advice from Webb
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
# QUERY: Orangutan PonAbe2
  SEQ2_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
  SEQ2_LEN=/scratch/data/ponAbe2/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	> do.log 2>&1 &
      cat fb.hg19.chainPonAbe2Link.txt
      #	2646687531 bases of 2897316137 (91.350%) in intersection
  
      # better to have reciprocal best for this one since it is low coverage:
      cd /hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
      time doRecipBest.pl hg19 ponAbe2 -buildDir=`pwd` -workhorse=hgwdev \
  	> best.log 2>&1
      #   real    159m4.934s
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/ponAbe2/bed/blastz.hg19.swap
      cd /hive/data/genomes/ponAbe2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13/DEF \
  	-swap \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	> swap.log 2>&1 &
      #	real    124m3.610s
      cat fb.ponAbe2.chainHg19Link.txt
      #	2772351468 bases of 3093572278 (89.617%) in intersection
  ##############################################################################
  # LASTZ Lamprey PetMar1 (DONE - 2009-05-14 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
      cd /hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
  
      cat << '_EOF_' > DEF
  # Human vs. Lamprey
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=100000000
  SEQ1_LAP=10000
SEQ1_LIMIT=5
  
  # QUERY: Lamprey petMar1
  SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
  SEQ2_LEN=/scratch/data/petMar1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=300
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-qRepeats=windowmaskerSdust \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
  	> do.log 2>&1 &
      #	real    113m20.116s
      cat fb.hg19.chainPetMar1Link.txt
      #	31347143 bases of 2897316137 (1.082%) in intersection
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/petMar1/bed/blastz.hg19.swap
      cd /hive/data/genomes/petMar1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14/DEF \
  	-qRepeats=windowmaskerSdust \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
  	-swap > swap.log 2>&1 &
      #	real    59m14.813s
      cat fb.petMar1.chainHg19Link.txt
      #	26615001 bases of 831696438 (3.200%) in intersection
  ##############################################################################
  # LASTZ Fugu Fr2 (DONE - 2009-05-14 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
      cd /hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
  
      cat << '_EOF_' > DEF
  # Human vs. Fugu
  # Try "human-fugu" (more distant, less repeat-killed than mammal) params
  # +M=50:
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Fugu fr2
#       Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates
  SEQ2_DIR=/scratch/data/fr2/fr2.2bit
  SEQ2_LEN=/hive/data/genomes/fr2/chrom.sizes
  SEQ2_CTGDIR=/hive/data/genomes/fr2/noUn/fr2.scaffolds.2bit
  SEQ2_CTGLEN=/hive/data/genomes/fr2/noUn/fr2.scaffolds.sizes
  SEQ2_LIFT=/hive/data/genomes/fr2/jkStuff/liftAll.lft
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=30
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-qRepeats=windowmaskerSdust \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
  	> do.log 2>&1 &
      #	real    5797m9.288s
      #	had a small problem finishing the fundamental batch run, continuing:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-continue=cat -qRepeats=windowmaskerSdust \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
  	> cat.log 2>&1 &
      cat fb.hg19.chainFr2Link.txt
      #	49309456 bases of 2897316137 (1.702%) in intersection
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/fr2/bed/blastz.hg19.swap
      cd /hive/data/genomes/fr2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzFr2.2009-05-14/DEF \
  	-qRepeats=windowmaskerSdust \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
  	-swap > swap.log 2>&1 &
      #	real    25m8.491s
      cat fb.fr2.chainHg19Link.txt
      #	42984130 bases of 393312790 (10.929%) in intersection
  
  ##############################################################################
  # LASTZ Tetraodon TetNig1 (DONE - 2009-05-14 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
      cd /hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
  
      cat << '_EOF_' > DEF
  # human vs tetraodon
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Tetraodon TetNig1 - single chunk big enough to run entire genome
  SEQ2_DIR=/scratch/data/tetNig1/tetNig1.2bit
  SEQ2_LEN=/hive/data/genomes/tetNig1/chrom.sizes
  SEQ2_CHUNK=410000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
  	-verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	> do.log 2>&1 &
      #	real    166m19.745s
      cat fb.hg19.chainTetNig1Link.txt
      #	58038079 bases of 2897316137 (2.003%) in intersection
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/tetNig1/bed/blastz.hg19.swap
      cd /hive/data/genomes/tetNig1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	-swap > swap.log 2>&1 &
      #	real    29m20.968s
      cat fb.tetNig1.chainHg19Link.txt
      #	49453375 bases of 342403326 (14.443%) in intersection
  
  ##############################################################################
  # LASTZ Stickleback GasAcu1 (DONE - 2009-05-14 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
      cd /hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
  
      cat << '_EOF_' > DEF
  # Human vs. Stickleback
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
# QUERY: Stickleback gasAcu1
  SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit
  SEQ2_LEN=/hive/data/genomes/gasAcu1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
  	-verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	> do.log 2>&1 &
      #	real    174m40.659s
      cat fb.hg19.chainGasAcu1Link.txt
      #	55509003 bases of 2897316137 (1.916%) in intersection
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/gasAcu1/bed/blastz.hg19.swap
      cd /hive/data/genomes/gasAcu1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	-swap > swap.log 2>&1 &
      #	real    29m41.433s
      cat fb.gasAcu1.chainHg19Link.txt
      #	49909819 bases of 446627861 (11.175%) in intersection
  ##############################################################################
  # LASTZ Marmoset CalJac1 (DONE - 2009-05-14,22 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
      cd /hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
  
      cat << '_EOF_' > DEF
  # human vs. marmoset
  BLASTZ=lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  # and place those items here
  BLASTZ_O=600
  BLASTZ_E=150
  # other parameters from panTro2 vs hg18 lastz on advice from Webb
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Marmoset (calJac1)
  SEQ2_DIR=/scratch/data/calJac1/calJac1.2bit
  SEQ2_LEN=/scratch/data/calJac1/chrom.sizes
  SEQ2_LIMIT=200
  SEQ2_CHUNK=30000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    214m16.294s
      cat fb.hg19.chainCalJac1Link.txt
      #	2053025318 bases of 2897316137 (70.860%) in intersection
      time doRecipBest.pl -buildDir=`pwd` hg19 calJac1 > rbest.log 2>&1 &
      #	real    97m17.207s
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/calJac1/bed/blastz.hg19.swap
      cd /hive/data/genomes/calJac1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14/DEF \
  	-swap \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	> swap.log 2>&1 &
      #	real    162m52.189s
      cat fb.calJac1.chainHg19Link.txt
      #	2105959656 bases of 2929139385 (71.897%) in intersection
  
  #########################################################################
  # LASTZ Tarsier TarSyr1 (DONE - 2009-05-14,30 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
      cd /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
  
      cat << '_EOF_' > DEF
  # Human vs. Tarsier
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=200000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Tarsier
  SEQ2_DIR=/scratch/data/tarSyr1/tarSyr1.2bit
  SEQ2_LEN=/scratch/data/tarSyr1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=50
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
  	-verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    1724m48.032s
    #	needed to load the chain table manually, due to
    #	MySQL error 1114: The table 'chainTarSyr1Link' is full
      cd /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14/axtChain
      wc -l *.tab
      #	 21882142 chain.tab
      #	165017606 link.tab
      #	186899748 total
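    #	the line-length histogram below estimates bytes per row in
    #	link.tab, used to pick avg_row_length for the manual CREATE TABLE: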
      awk '{print length($0)}' link.tab | sort | uniq -c | less
        4 23
        9 24
       27 25
      105 26
      767 27
     1401 28
     5020 29
     8472 30
    24390 31
   117666 32
   264774 33
   776095 34
  1632393 35
  2672187 36
  7125988 37
  16831901 38
  34905113 39
  45218159 40
  31570706 41
  13746548 42
  5868689 43
  2460114 44
  1118556 45
   420826 46
   106674 47
    36770 48
    40719 49
    36955 50
    19389 51
     5571 52
     1557 53
       61 54
  
      time nice -n +19 hgsql -e "DROP TABLE chainTarSyr1Link;" hg19
  
      cat << '_EOF_' | hgsql hg19
      CREATE TABLE chainTarSyr1Link (
        bin smallint(5) unsigned NOT NULL default 0,
        tName varchar(255) NOT NULL default '',
        tStart int(10) unsigned NOT NULL default 0,
        tEnd int(10) unsigned NOT NULL default 0,
        qStart int(10) unsigned NOT NULL default 0,
        chainId int(10) unsigned NOT NULL default 0,
        KEY tName (tName(16),bin),
        KEY chainId (chainId)
      ) ENGINE=MyISAM max_rows=166000000 avg_row_length=42 pack_keys=1 CHARSET=latin1;
  '_EOF_'
      # << happy emacs
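    #	(MyISAM's default data pointer size caps table size, hence the
    #	error 1114 above; declaring max_rows and avg_row_length makes
    #	MySQL allocate a pointer large enough for the ~165 million rows)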
  
      time nice -n +19 hgsql -e \
        "load data local infile \"link.tab\" into table chainTarSyr1Link;" hg19
      #	real    157m0.230s
    #	then running the rest of loadUp.csh after the hgLoadChain
      #	real    26m8.263s
      cat fb.hg19.chainTarSyr1Link.txt
      #	1385797066 bases of 2897316137 (47.830%) in intersection
      #	Continuing:
      time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
  	-continue=download -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	> download.log 2>&1 &
      #	real    48m6.573s
      #	ran the script on swarm to recover after hive outages
      time doRecipBest.pl -buildDir=`pwd` hg19 tarSyr1 > rbest.log 2>&1 &
      #	real    404m0.201s
      time doRecipBest.pl -continue=download -buildDir=`pwd` \
  	hg19 tarSyr1 > rbest.download.log 2>&1 &
  
      # swap DONE - 2013-07-03 - Hiram
      mkdir /hive/data/genomes/tarSyr1/bed/blastz.hg19.swap
      cd /hive/data/genomes/tarSyr1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14/DEF \
  	-swap -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> swap.log 2>&1 &
      #  real    1280m20.578s
  
      cat fb.tarSyr1.chainHg19Link.txt
      #  1529248348 bases of 2768536343 (55.237%) in intersection
  
      cd /hive/data/genomes/tarSyr1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Bushbaby OtoGar1 (DONE - 2009-05-14,22 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
      cd /hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
  
      cat << '_EOF_' > DEF
  # Human vs. Bushbaby
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=200000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Bushbaby otoGar1 - single chunk big enough to run largest scaffold
  SEQ2_DIR=/scratch/data/otoGar1/otoGar1.rmsk.2bit
  SEQ2_LEN=/hive/data/genomes/otoGar1/chrom.sizes
  SEQ2_LIMIT=200
  SEQ2_CHUNK=30000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
  	-verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    762m56.055s
      cat fb.hg19.chainOtoGar1Link.txt
      #	1264492372 bases of 2897316137 (43.644%) in intersection
      time doRecipBest.pl -buildDir=`pwd` hg19 otoGar1 > rbest.log 2>&1 &
      #	real    271m39.925s
  
  #########################################################################
  # LASTZ Mouse lemur MicMur1 (DONE - 2009-05-14,26 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
      cd /hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
  
      cat << '_EOF_' > DEF
  # Human vs. Mouse lemur
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=200000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Mouse lemur
  SEQ2_DIR=/hive/data/genomes/micMur1/bed/repeatMasker/micMur1.rmsk.2bit
  SEQ2_LEN=/hive/data/genomes/micMur1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=100
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
  	-verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	> do.log 2>&1 &
      #	real    5429m52.082s
    #	there was one unusually long-running job having trouble
      #	continuing after finishing the lastz run manually:
      time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
  	-continue=cat -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	> cat.log 2>&1 &
      #	real    388m25.032s
      cat fb.hg19.chainMicMur1Link.txt
      #	1347792207 bases of 2897316137 (46.519%) in intersection
      time doRecipBest.pl -buildDir=`pwd` hg19 micMur1 > rbest.log 2>&1
      #	about 4h30m
  
      # and for the swap (DONE - 2013-03-26 - Hiram)
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14/DEF \
          -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=encodek \
          > swap.log 2>&1 &
      #   Elapsed time: 186m47s
      cat fb.micMur1.chainHg19Link.txt
      #   1318212266 bases of 1852394361 (71.163%) in intersection
  
  #########################################################################
  # LASTZ Baboon PapAnu2 (DONE - 2013-08-19 braney)
      mkdir /hive/data/genomes/hg19/bed/lastzPapAnu2.2013-08-19
      cd /hive/data/genomes/hg19/bed/lastzPapAnu2.2013-08-19
  
      cat << '_EOF_' > DEF
  # human vs baboon
  BLASTZ=lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  # and place those items here
  BLASTZ_O=600
  BLASTZ_E=150
  # other parameters from panTro2 vs hg18 lastz on advice from Webb
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/hive/data/genomes/hg19/hg19.2bit
  SEQ1_LEN=/hive/data/genomes/hg19/chrom.sizes
  SEQ1_CHUNK=100000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: Baboon papAnu2
  SEQ2_DIR=/hive/data/genomes/papAnu2/papAnu2.2bit
  SEQ2_LEN=/hive/data/genomes/papAnu2/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=300
  SEQ2_LAP=0
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzPapAnu2.2013-08-19
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
  	> do.log 2>&1 &
  
      # real    162m10.321s
  
      cat fb.hg19.chainPapAnu2Link.txt
      # 2467659115 bases of 2897316137 (85.171%) in intersection
  
      cd /hive/data/genomes/hg19/bed
      ln -s lastzPapAnu2.2013-08-19 lastz.papAnu2
      cd lastz.papAnu2
  
      time doRecipBest.pl -buildDir=`pwd` hg19 papAnu2 > rbest.log 2>&1 &
      #	real    182m0.276s
  
      #	running the swap - DONE - 2014-07-25
      mkdir /hive/data/genomes/papAnu2/bed/blastz.hg19.swap
      cd /hive/data/genomes/papAnu2/bed/blastz.hg19.swap
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	-swap /hive/data/genomes/hg19/bed/lastzPapAnu2.2013-08-19/DEF \
  	-syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
  	> swap.log 2>&1
      #  real    99m36.864s
  
      cat fb.papAnu2.chainHg19Link.txt
      #  2410008689 bases of 2893250291 (83.298%) in intersection
  
  #########################################################################
  # LASTZ Baboon PapHam1 (DONE - 2009-05-20,22 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
      cd /hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
  
      cat << '_EOF_' > DEF
  # human vs baboon
  BLASTZ=lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  # and place those items here
  BLASTZ_O=600
  BLASTZ_E=150
  # other parameters from panTro2 vs hg18 lastz on advice from Webb
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=100000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: Baboon papHam1
  SEQ2_DIR=/scratch/data/papHam1/papHam1.2bit
  SEQ2_LEN=/scratch/data/papHam1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=300
  SEQ2_LAP=0
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
    #	forgot that the synNet was not needed here; use recip best instead, as below
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      cat fb.hg19.chainPapHam1Link.txt
      #	2399269031 bases of 2897316137 (82.810%) in intersection
      cd /hive/data/genomes/hg19/bed
      ln -s lastzPapHam1.2009-05-20 lastz.papHam1
  
      time doRecipBest.pl -buildDir=`pwd` hg19 papHam1 > rbest.log 2>&1
      #	real    182m0.276s
  
      #	running the swap - DONE - 2013-07-01
      mkdir /hive/data/genomes/papHam1/bed/blastz.hg19.swap
      cd /hive/data/genomes/papHam1/bed/blastz.hg19.swap
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	-swap /hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20/DEF \
  	-syntenicNet \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> swap.log 2>&1 &
      #  real    534m25.137s
  
      cat fb.papHam1.chainHg19Link.txt
      #  2353204907 bases of 2741849051 (85.825%) in intersection
  
      cd /hive/data/genomes/papHam1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # SGP GENES (DONE - 2009-05-22 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/sgpGene
      cd /hive/data/genomes/hg19/bed/sgpGene
      mkdir download
      cd download
  for C in `cut -f1 ../../../chrom.sizes`
  do
      echo $C
      wget --timestamping \
  http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902_x_mm9/SGP/${C}.gtf
      wget --timestamping \
  http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902_x_mm9/SGP/${C}.prot
  done
  
      cd ..
      cat download/*.gtf | ldHgGene -gtf -genePredExt hg19 sgpGene stdin
  
      #	Read 33994 transcripts in 291782 lines in 1 files
      #	33994 groups 85 seqs 1 sources 3 feature types
      #	33994 gene predictions
      nice -n +19 featureBits -enrichment hg19 refGene:CDS sgpGene
  # refGene:CDS 1.181%, sgpGene 1.295%, both 1.011%, cover 85.59%, enrich 66.08x
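    #	(reading that featureBits line: cover = both/refGene:CDS =
    #	1.011/1.181 ~ 85.59%, and enrich = cover/sgpGene = 85.59/1.295
    #	~ 66.08x)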
  
  ###########################################################################
  # GENEID GENE PREDICTIONS (DONE - 2009-05-22 - Hiram)
      ssh hgwdev
      mkdir /hive/data/genomes/hg19/bed/geneid
      cd /hive/data/genomes/hg19/bed/geneid
      mkdir download
      cd download
      for C in `cut -f1 ../../../chrom.sizes`
      do
  	echo $C
   wget --timestamping \
  http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902/geneid_v1.3/${C}.gtf
      wget --timestamping \
  http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902/geneid_v1.3/${C}.prot
      done
  
      cd ..
      cat download/*.gtf | ldHgGene -gtf -genePredExt hg19 geneid stdin
      #	Read 33428 transcripts in 277332 lines in 1 files
      #	33428 groups 92 seqs 1 sources 3 feature types
      #	33428 gene predictions
  
  ##########################################################################
  ## 4-Way Multiz for UCSC Genes construction (DONE - 2009-05-22 - Hiram)
      ssh hgwdev
      mkdir /hive/data/genomes/hg19/bed/multiz4way
      cd /hive/data/genomes/hg19/bed/multiz4way
  
      #	extract our 4 organisms from the 44-way on hg18:
      ln -s /hive/data/genomes/hg18/bed/multiz44way/44way.4d.nh ./44way.nh
  
      /cluster/bin/phast/tree_doctor \
  	--prune-all-but hg18,mm9,canFam2,rheMac2 44way.nh \
  	| sed -e "s/hg18/hg19/" > 4way.nh
  
      #	this looks like:
      cat 4way.nh
  (((hg19:0.032973,rheMac2:0.036199):0.109706,mm9:0.352605):0.020666,canFam2:0.193569);
  
  
      #	Use this specification in the phyloGif tool:
      #	http://genome.ucsc.edu/cgi-bin/phyloGif
      #	to obtain a gif image for htdocs/images/phylo/hg19_4way.gif
  
      /cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt
      #	Use this output to create the table below
      grep -y hg19 4way.distances.txt | sort -k3,3n
  #
  #	If you can fill in all the numbers in this table, you are ready for
  #	the multiple alignment procedure
  #
  #                         featureBits chainLink measures
  #                                        chainHg19Link   chain    linearGap
  #    distance                      on hg19    on other   minScore
  #  1  0.069172 - rhesus rheMac2 (% 82.744) (% xx.xxx)       5000     medium
  #  2  0.356914 - dog canFam2    (% 52.879) (% xx.xxx)       3000     medium
  #  3  0.495284 - mouse mm9      (% 35.299) (% 38.693)       3000     medium
  
      #	using the syntenic nets
      cd /cluster/data/hg19/bed/multiz4way
      mkdir mafLinks
      cd mafLinks
      mkdir rheMac2 canFam2 mm9
  
      cd mm9
      ln -s ../../../lastz.mm9/mafSynNet/*.maf.gz .
      cd ../canFam2
      ln -s ../../../lastz.canFam2/mafSynNet/*.maf.gz .
      cd ../rheMac2
      ln -s ../../../lastz.rheMac2/mafSynNet/*.maf.gz .
  
      #	determine what is the newest version of multiz and use that
      cd /hive/data/genomes/hg19/bed/multiz4way
      mkdir penn
      cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn
      cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn
      cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn
  
      # the autoMultiz cluster run
      ssh swarm
      cd /hive/data/genomes/hg19/bed/multiz4way
  
      # create species list and stripped down tree for autoMZ
      sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
  	4way.nh > tmp.nh
      echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh
      sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
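    #	given the 4way.nh above, these should come out as:
    #	tree.nh:	(((hg19 rheMac2) mm9) canFam2)
    #	species.lst:	hg19 rheMac2 mm9 canFam2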
  
      mkdir run maf
      cd run
  
      #	NOTE: you need to set the db and multiz dirname properly in this script
      cat > autoMultiz << '_EOF_'
  #!/bin/csh -ef
  set db = hg19
  set c = $1
  set maf = $2
  set binDir = /hive/data/genomes/hg19/bed/multiz4way/penn
  set tmp = /scratch/tmp/$db/multiz.$c
  set pairs = /hive/data/genomes/hg19/bed/multiz4way/mafLinks
  rm -fr $tmp
  mkdir -p $tmp
  cp ../{tree.nh,species.lst} $tmp
  pushd $tmp
  foreach s (`cat species.lst`)
      set in = $pairs/$s/$c.maf
      set out = $db.$s.sing.maf
      if ($s == $db) then
  	continue
      endif
      if (-e $in.gz) then
  	zcat $in.gz > $out
      else if (-e $in) then
  	cp $in $out
      else
  	echo "##maf version=1 scoring=autoMZ" > $out
      endif
  end
  set path = ($binDir $path); rehash
  $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
  popd
  cp $tmp/$c.maf $maf
  rm -fr $tmp
  '_EOF_'
      # << happy emacs
      chmod +x autoMultiz
  
  cat  << '_EOF_' > template
  #LOOP
  ./autoMultiz $(root1) {check out line+ /hive/data/genomes/hg19/bed/multiz4way/maf/$(root1).maf}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
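    #	gensub2 substitutes $(root1) with each name from chrom.lst, so a
    #	generated jobList line should look like:
    #	./autoMultiz chr1 {check out line+ /hive/data/genomes/hg19/bed/multiz4way/maf/chr1.maf}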
  
      cut -f1 /cluster/data/hg19/chrom.sizes > chrom.lst
      gensub2 chrom.lst single template jobList
      para create jobList
      # 93 jobs
      para try ... check ... push ... etc ...
  # Completed: 93 of 93 jobs
  # CPU time in finished jobs:      24282s     404.70m     6.75h    0.28d  0.001 y
  # IO & Wait Time:                  2362s      39.36m     0.66h    0.03d  0.000 y
  # Average job time:                 286s       4.77m     0.08h    0.00d
  # Longest finished job:            2235s      37.25m     0.62h    0.03d
  # Submission to last job:          2241s      37.35m     0.62h    0.03d
  
      #	combine results into a single file for loading and gbdb reference
      cd /hive/data/genomes/hg19/bed/multiz4way
      time nice -n +19 catDir maf > multiz4way.maf
      #	real    3m27.561s
  
    #	makes an 8.5 Gb file:
      #	-rw-rw-r-- 1 9026080732 May 22 11:11 multiz4way.maf
  
      # Load into database
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/multiz4way
      mkdir /gbdb/hg19/multiz4way
      ln -s /hive/data/genomes/hg19/bed/multiz4way/multiz4way.maf \
  	/gbdb/hg19/multiz4way
      #	the hgLoadMaf generates huge tmp files, locate them in /scratch/tmp/
      cd /scratch/tmp
      time nice -n +19 hgLoadMaf hg19 multiz4way
      #	real    5m31.883s
      #	Loaded 5788627 mafs in 1 files from /gbdb/hg19/multiz4way
  
      cd /hive/data/genomes/hg19/bed/multiz4way
      time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
  	-maxSize=50000 hg19 multiz4waySummary multiz4way.maf
      #	Created 1238721 summary blocks from 11959676 components
      #	and 5788627 mafs from multiz4way.maf
      #	real    6m33.936s
  
  #########################################################################
  # LASTZ Medaka OryLat2 (DONE - 2009-05-22 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
      cd /hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
  
      cat << '_EOF_' > DEF
  # Human vs. Medaka
  # typical parameters for a genome that is distant from human
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Medaka oryLat2 (40M chunks covers the largest chroms in one gulp)
  SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit
  SEQ2_LEN=/hive/data/genomes/oryLat2/chrom.sizes
  SEQ2_CHUNK=40000000
  SEQ2_LIMIT=200
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-qRepeats=windowmaskerSdust \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	> do.log 2>&1 &
      #	real    124m5.298s
      cat fb.hg19.chainOryLat2Link.txt
      #	53571737 bases of 2897316137 (1.849%) in intersection
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/oryLat2/bed/blastz.hg19.swap
      cd /hive/data/genomes/oryLat2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22/DEF \
  	-qRepeats=windowmaskerSdust \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	-swap > swap.log 2>&1 &
      #	real    28m35.174s
      cat fb.oryLat2.chainHg19Link.txt
      #	46961818 bases of 700386597 (6.705%) in intersection
  ##############################################################################
  # LASTZ Opossum MonDom5 (DONE - 2009-05-23,29 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
      cd /hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
  
      cat << '_EOF_' > DEF
  # human vs. opossum
  # settings for more distant organism alignments
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Opossum monDom5
  SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit
  SEQ2_LEN=/hive/data/genomes/monDom5/chrom.sizes
  SEQ2_CHUNK=30000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	One job took a long time to complete, had to run it manually on
      #	swarm:
  # /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
  #	/scratch/data/hg19/hg19.2bit:chr19:50000000-59128983 \
  #	/scratch/data/monDom5/monDom5.2bit:chr4:390000000-420000000 \
  #	../DEF \
  #	../psl/hg19.2bit:chr19:50000000-59128983/hg19.2bit:chr19:50000000-59128983_monDom5.2bit:chr4:390000000-420000000.psl
      #	took about 48 hours, continuing:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-continue=cat > cat.log 2>&1 &
      #	real    1508m18.471s ==	about 25h08m
      cat fb.hg19.chainMonDom5Link.txt
      #	415997117 bases of 2897316137 (14.358%) in intersection
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
      #	real    20m29.049s
  
      mkdir /hive/data/genomes/monDom5/bed/blastz.hg19.swap
      cd /hive/data/genomes/monDom5/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-swap -syntenicNet > swap.log 2>&1 &
      #	real    297m13.041s
      cat fb.monDom5.chainHg19Link.txt
      #	406727849 bases of 3501660299 (11.615%) in intersection
  
  ##############################################################################
  # LASTZ Armadillo DasNov2 (DONE - 2009-05-23,28 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
      cd /hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
  
      cat << '_EOF_' > DEF
  # Human vs. Armadillo
  BLASTZ_M=50
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Armadillo
  SEQ2_DIR=/scratch/data/dasNov2/dasNov2.2bit
  SEQ2_LEN=/scratch/data/dasNov2/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=100
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	> do.log 2>&1 &
      #	finished the lastz run manually after hive maintenance outages
      #	then, continuing:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	-continue=cat > cat.log 2>&1 &
      #	real    458m11.304s
      cat fb.hg19.chainDasNov2Link.txt
      #	971847303 bases of 2897316137 (33.543%) in intersection
      time nice -n +19 doRecipBest.pl -buildDir=`pwd` hg19 dasNov2 \
  	> rbest.log 2>&1
      #	time about 6h30m
  
  ##############################################################################
  # LASTZ Rock Hyrax ProCap1 (DONE - 2009-05-23,26 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
      cd /hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
  
      cat << '_EOF_' > DEF
  # Human vs. Rock Hyrax
  BLASTZ_M=50
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Rock Hyrax
  SEQ2_DIR=/scratch/data/proCap1/proCap1.2bit
  SEQ2_LEN=/scratch/data/proCap1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=100
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	> do.log 2>&1 &
  # Completed: 997438 of 997438 jobs
  # CPU time in finished jobs:   32830587s  547176.45m  9119.61h  379.98d  1.041 y
  # IO & Wait Time:               9549484s  159158.07m  2652.63h  110.53d  0.303 y
  # Average job time:                  42s       0.71m     0.01h    0.00d
  # Longest finished job:            1953s      32.55m     0.54h    0.02d
  # Submission to last job:         67216s    1120.27m    18.67h    0.78d
      #	finished lastz run manually, then continuing:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-continue=cat > cat.log 2>&1 &
      #	real    369m1.678s
      cat fb.hg19.chainProCap1Link.txt
      #	894221652 bases of 2897316137 (30.864%) in intersection
      time nice -n +19 doRecipBest.pl -buildDir=`pwd` hg19 proCap1 \
  	> rbest.log 2>&1
      #	real    251m59.549s
  
  ##############################################################################
  # LASTZ Zebra Finch TaeGut1 (DONE - 2009-05-26 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
      cd /hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
  
      cat << '_EOF_' > DEF
  # human vs Zebra Finch
  # distant from Human settings
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=10000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Zebra Finch taeGut1 - single chunk big enough to run entire chrom
  SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit
  SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes
  SEQ2_CTGDIR=/hive/data/genomes/taeGut1/taeGut1.blastz.2bit
  SEQ2_CTGLEN=/hive/data/genomes/taeGut1/taeGut1.blastz.sizes
  SEQ2_LIFT=/hive/data/genomes/taeGut1/jkStuff/liftAll.lft
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=100
  
  BASE=/hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-qRepeats=windowmaskerSdust > do.log 2>&1 &
      cat fb.hg19.chainTaeGut1Link.txt
      #	real    192m48.479s
      #	101295490 bases of 2897316137 (3.496%) in intersection
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet -noLoadChainSplit -chainMinScore=5000 \
  	-chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-continue=syntenicNet -qRepeats=windowmaskerSdust > synNet.log 2>&1 &
      #	real    4m10.261s
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/taeGut1/bed/blastz.hg19.swap
      cd /hive/data/genomes/taeGut1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26/DEF \
  	-swap -noLoadChainSplit -chainMinScore=5000 \
  	-chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-qRepeats=windowmaskerSdust > swap.log 2>&1 &
    #	real    16m45.080s
      cat fb.taeGut1.chainHg19Link.txt
      #	95320369 bases of 1222864691 (7.795%) in intersection
  
  ##############################################################################
  # LASTZ Lizard AnoCar1 (DONE - 2009-05-30,31 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
      cd /hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
  
      cat << '_EOF_' > DEF
  # human vs lizard
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Lizard anoCar1
  SEQ2_DIR=/scratch/data/anoCar1/anoCar1.2bit
  SEQ2_LEN=/scratch/data/anoCar1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=50
  
  BASE=/hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-qRepeats=windowmaskerSdust > do.log 2>&1 &
      #	real    168m32.016s
      cat fb.hg19.chainAnoCar1Link.txt
      #	104045950 bases of 2897316137 (3.591%) in intersection
      time doRecipBest.pl -buildDir=`pwd` hg19 anoCar1 > rbest.log 2>&1
      #	real    45m58.001s
  
      #	running syntenic Net 2009-08-27 - Hiram
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-continue=syntenicNet -syntenicNet \
  	-qRepeats=windowmaskerSdust > syntenicNet.log 2>&1 &
      #	real    6m13.304s
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/anoCar1/bed/blastz.hg19.swap
      cd /hive/data/genomes/anoCar1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-swap -qRepeats=windowmaskerSdust > swap.log 2>&1 &
      #	real    34m55.857s
      cat fb.anoCar1.chainHg19Link.txt
      #	89608316 bases of 1741478929 (5.146%) in intersection
  ##############################################################################
  # LASTZ X. tropicalis XenTro2 (DONE - 2009-05-26 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
      cd /hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
  
      cat << '_EOF_' > DEF
  # human vs X. tropicalis
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
# QUERY: X. tropicalis xenTro2
  SEQ2_DIR=/scratch/data/xenTro2/xenTro2.2bit
  SEQ2_LEN=/scratch/data/xenTro2/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=100
  
  BASE=/hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    1129m11.568s
      #	finished the lastz run manually after hive difficulties, continuing:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-continue=cat > cat.log 2>&1 &
      #	time about 1h30m
      cat fb.hg19.chainXenTro2Link.txt
      #	92015242 bases of 2897316137 (3.176%) in intersection
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/xenTro2/bed/blastz.hg19.swap
      cd /hive/data/genomes/xenTro2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-swap > swap.log 2>&1 &
      #	real    130m53.860s
      cat fb.xenTro2.chainHg19Link.txt
      #	92070065 bases of 1359412157 (6.773%) in intersection
  
  ##############################################################################
  # LASTZ Zebrafish DanRer5 (DONE - 2009-05-26 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
      cd /hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
  
      cat << '_EOF_' > DEF
# human vs. zebrafish
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Zebrafish danRer5
  SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
  SEQ2_LEN=/scratch/data/danRer5/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=40
  
  BASE=/hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    311m39.817s
      cat fb.hg19.chainDanRer5Link.txt
      #	74229561 bases of 2897316137 (2.562%) in intersection
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/danRer5/bed/blastz.hg19.swap
      cd /hive/data/genomes/danRer5/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-swap > swap.log 2>&1 &
      #	real    26m54.605s
      cat fb.danRer5.chainHg19Link.txt
      #	73852780 bases of 1435609608 (5.144%) in intersection
  
  ##############################################################################
  # LASTZ Platypus OrnAna1 (DONE - 2009-05-26 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
      cd /hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
  
      cat << '_EOF_' > DEF
  # human vs platypus
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Platypus ornAna1
  SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit
  SEQ2_LEN=/scratch/data/ornAna1/chrom.sizes
  SEQ2_CHUNK=40000000
  SEQ2_LIMIT=400
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    572m18.808s
      cat fb.hg19.chainOrnAna1Link.txt
      #	220977689 bases of 2897316137 (7.627%) in intersection
      time doRecipBest.pl -buildDir=`pwd` hg19 ornAna1 > rbest.log 2>&1
      #	time about 1h32m
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/ornAna1/bed/blastz.hg19.swap
      cd /hive/data/genomes/ornAna1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26/DEF \
  	-swap -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> swap.log 2>&1 &
      #	real    146m52.638s
      cat fb.ornAna1.chainHg19Link.txt
      #	207415519 bases of 1842236818 (11.259%) in intersection
  
  ##############################################################################
  # LASTZ Elephant LoxAfr2 (DONE - 2009-05-27,29 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
      cd /hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
  
      cat << '_EOF_' > DEF
  # Human vs. Elephant
  BLASTZ_M=50
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Elephant
  SEQ2_DIR=/scratch/data/loxAfr2/loxAfr2.2bit
  SEQ2_LEN=/scratch/data/loxAfr2/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=300
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      # time about 3h23m
      cat fb.hg19.chainLoxAfr2Link.txt
      #	1018502258 bases of 2897316137 (35.153%) in intersection
  
      time doRecipBest.pl -buildDir=`pwd` hg19 loxAfr2 > rbest.log 2>&1
      #	real    322m37.502s
  
  ##############################################################################
  # LASTZ Tenrec EchTel1 (DONE - 2009-05-27 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
      cd /hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
  
      cat << '_EOF_' > DEF
  # Human vs. Tenrec
  BLASTZ_M=50
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Tenrec
  SEQ2_DIR=/scratch/data/echTel1/echTel1.2bit
  SEQ2_LEN=/scratch/data/echTel1/chrom.sizes
  SEQ2_CHUNK=30000000
  SEQ2_LIMIT=400
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    1153m34.595s
      cat fb.hg19.chainEchTel1Link.txt
      #	669856841 bases of 2897316137 (23.120%) in intersection
      time doRecipBest.pl -buildDir=`pwd` hg19 echTel1 > rbest.log 2>&1
      # time about 7h13m
  
  ##############################################################################
  # LASTZ Tree Shrew TupBel1 (DONE - 2009-05-27,06-02 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
      cd /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
  
      cat << '_EOF_' > DEF
  # Human vs. Tree Shrew
  BLASTZ_M=50
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Tree Shrew
  SEQ2_DIR=/scratch/data/tupBel1/tupBel1.2bit
  SEQ2_LEN=/scratch/data/tupBel1/chrom.sizes
  SEQ2_CHUNK=30000000
  SEQ2_LIMIT=400
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
  	> do.log 2>&1 &
      #	real 811m54.095s
      #	having trouble with pk, finished manually
      #	XXX there is one job that is taking forever ...
      #	finished it in pieces on swarm in a few minutes, like this:
      mkdir /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27/run.blastz/lastJob
      cd /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27/run.blastz/lastJob
  #!/bin/sh
  
  S=100000000
  E=101010000
  export S E
  for I in 0 1 2 3 4 5 6 7 8 9
  do
  echo $S $E
  /usr/bin/time -p /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
  /scratch/data/hg19/nib/chr1.nib:chr1:${S}-${E} ../qParts/part019.lst \
  ../../DEF psl/chr1.nib:chr1:${S}-${E}_part019.lst.psl
  nextS=`echo $S | awk '{printf "%d", $1 + 1000000}'`
  nextE=`echo $E | awk '{printf "%d", $1 + 1000000}'`
  S=$nextS
  E=$nextE
  done
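    #	the loop splits the stuck 10Mb chunk (chr1:100000000-110010000)
    #	into ten 1,010,000 base windows stepped by 1,000,000, keeping the
    #	10,000 base SEQ1_LAP overlap between adjacent pieces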
  
      grep -h "^#" psl/chr* | sort -u > result.psl
      grep -h -v "^#" psl/chr* | sort -k14,14 -k16,16n >> result.psl
      cp -p result.psl \
  ../../psl/chr1.nib:chr1:100000000-110010000/chr1.nib:chr1:100000000-110010000_part019.lst.psl
  
      #	then, continuing:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
  	-continue=cat > cat.log 2>&1 &
      #	real    212m22.707s
      time doRecipBest.pl -buildDir=`pwd` hg19 tupBel1 > rbest.log 2>&1
      #	time about 4h22m
  
      # and for the swap (DONE - 2013-07-01 - Hiram)
      mkdir /hive/data/genomes/tupBel1/bed/blastz.hg19.swap
      cd /hive/data/genomes/tupBel1/bed/blastz.hg19.swap
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27/DEF \
  	-swap -chainMinScore=3000 -chainLinearGap=medium \
          -qRepeats=windowmaskerSdust \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> swap.log 2>&1 &
      #  real    176m50.225s
  
      cat fb.tupBel1.chainHg19Link.txt
      #  1058143640 bases of 2137225476 (49.510%) in intersection
  
      cd /hive/data/genomes/tupBel1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  ##############################################################################
  # LASTZ Shrew SorAra1 (DONE - 2009-05-28,30 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
      cd /hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
  
      cat << '_EOF_' > DEF
  # Human vs. Shrew
  BLASTZ_M=50
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Shrew
  SEQ2_DIR=/scratch/data/sorAra1/sorAra1.2bit
  SEQ2_LEN=/scratch/data/sorAra1/chrom.sizes
  SEQ2_CHUNK=30000000
  SEQ2_LIMIT=400
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	time about 23h26m
      cat fb.hg19.chainSorAra1Link.txt
      #	572519288 bases of 2897316137 (19.760%) in intersection
  
      time doRecipBest.pl -buildDir=`pwd` hg19 sorAra1 > rbest.log 2>&1
      #	real    251m20.055s
  
  ##############################################################################
  # LASTZ Rabbit OryCun1 (DONE - 2009-05-28,30 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
      cd /hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
  
      cat << '_EOF_' > DEF
  # Human vs. Rabbit
  BLASTZ_M=50
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Rabbit
  SEQ2_DIR=/scratch/data/oryCun1/oryCun1.2bit
  SEQ2_LEN=/scratch/data/oryCun1/chrom.sizes
  SEQ2_CHUNK=30000000
  SEQ2_LIMIT=400
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	time about 23h09m
      cat fb.hg19.chainOryCun1Link.txt
      #	975693323 bases of 2897316137 (33.676%) in intersection
  
      time doRecipBest.pl -buildDir=`pwd` hg19 oryCun1 > rbest.log 2>&1
      #	real    318m1.142s
  
  ##############################################################################
  # LASTZ Hedgehog EriEur1 (DONE - 2009-05-28,30 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
      cd /hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
  
      cat << '_EOF_' > DEF
  # Human vs. Hedgehog
  BLASTZ_M=50
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Hedgehog
  SEQ2_DIR=/scratch/data/eriEur1/eriEur1.2bit
  SEQ2_LEN=/scratch/data/eriEur1/chrom.sizes
  SEQ2_CHUNK=40000000
  SEQ2_LIMIT=500
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	> do.log 2>&1 &
      #	real    2043m33.198s
      cat fb.hg19.chainEriEur1Link.txt
      #	560965051 bases of 2897316137 (19.362%) in intersection
      time doRecipBest.pl -buildDir=`pwd` hg19 eriEur1 > rbest.log 2>&1
      #	real    350m17.737s
  
  ##############################################################################
  # LASTZ Pika OchPri2 (DONE - 2009-05-29,30 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
      cd /hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
  
      cat << '_EOF_' > DEF
  # Human vs. Pika
  BLASTZ_M=50
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Pika
  SEQ2_DIR=/scratch/data/ochPri2/ochPri2.2bit
  SEQ2_LEN=/scratch/data/ochPri2/chrom.sizes
  SEQ2_CHUNK=40000000
  SEQ2_LIMIT=400
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    393m42.569s
      cat fb.hg19.chainOchPri2Link.txt
      #	804516397 bases of 2897316137 (27.768%) in intersection
      time doRecipBest.pl -buildDir=`pwd` hg19 ochPri2 > rbest.log 2>&1
      #	real    224m47.979s
  
  ##############################################################################
  # LASTZ Kangaroo Rat DipOrd1 (DONE - 2009-05-29,30 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
      cd /hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
  
      cat << '_EOF_' > DEF
  # Human vs. Kangaroo Rat
  BLASTZ_M=50
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Kangaroo Rat
  SEQ2_DIR=/scratch/data/dipOrd1/dipOrd1.2bit
  SEQ2_LEN=/scratch/data/dipOrd1/chrom.sizes
  SEQ2_CHUNK=30000000
  SEQ2_LIMIT=300
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    688m47.595s
      time doRecipBest.pl -buildDir=`pwd` hg19 dipOrd1 > rbest.log 2>&1
      #	real    140m42.014s
  
  ##############################################################################
  # LIFTOVER TO Hg18 (DONE - 2009-06-04 - Hiram )
      mkdir /hive/data/genomes/hg19/bed/blat.hg18.2009-06-04
      cd /hive/data/genomes/hg19/bed/blat.hg18.2009-06-04
      # -debug run to create run dir, preview scripts...
      #	verifies files can be found
      doSameSpeciesLiftOver.pl -debug hg19 hg18
      # Real run:
      time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
  	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
  	 hg19 hg18 > do.log 2>&1
      #	real    115m26.071s
  
  #############################################################################
  # BLASTZ/CHAIN/NET/ETC 11 GENOMES TO HG19 (DONE, Andy 2009-06-06)
  ssh hgwdev
  cd /hive/data/genomes/hg19/bed
  mkdir lastz{SpeTri1,FelCat3,CavPor3,BosTau4,PteVam1,EquCab2,VicPac1,MyoLuc1,TurTru1,ChoHof1}.2009-06-04
  ln -s lastzSpeTri1.2009-06-04 lastz.speTri1
  ln -s lastzFelCat3.2009-06-04 lastz.felCat3
  ln -s lastzCavPor3.2009-06-04 lastz.cavPor3
  ln -s lastzBosTau4.2009-06-04 lastz.bosTau4
  ln -s lastzPteVam1.2009-06-04 lastz.pteVam1
  ln -s lastzEquCab2.2009-06-04 lastz.equCab2
  ln -s lastzVicPac1.2009-06-04 lastz.vicPac1
  ln -s lastzMyoLuc1.2009-06-04 lastz.myoLuc1
  ln -s lastzTurTru1.2009-06-04 lastz.turTru1
  ln -s lastzChoHof1.2009-06-04 lastz.choHof1
  cat > lastz.speTri1/DEF << 'EOF'
  # human vs squirrel
  
  # TARGET: human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: squirrel speTri1
  SEQ2_DIR=/hive/data/genomes/speTri1/speTri1.2bit
  SEQ2_LEN=/hive/data/genomes/speTri1/chrom.sizes
  SEQ2_LIMIT=100
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastz.speTri1
  TMPDIR=/scratch/tmp
  EOF
  
sed 's/speTri1/felCat3/g; s/squirrel/cat/;' lastz.speTri1/DEF | \
     sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/; s/SEQ2_LIMIT=1/SEQ2_LIMIT=3/' \
     > lastz.felCat3/DEF
sed 's/speTri1/cavPor3/g; s/squirrel/guinea pig/;' lastz.speTri1/DEF | \
     sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/' | \
     sed 's/hive\/data\/genomes\/cavPor3/scratch\/data\/cavPor3/' \
     > lastz.cavPor3/DEF
sed 's/speTri1/bosTau4/g; s/squirrel/cow/;' lastz.speTri1/DEF | \
     sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/; s/SEQ2_LIMIT=1/SEQ2_LIMIT=3/' \
     > lastz.bosTau4/DEF
sed 's/speTri1/pteVam1/g; s/squirrel/megabat/;' lastz.speTri1/DEF | \
     sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/; s/SEQ2_LIMIT=1/SEQ2_LIMIT=2/' \
     > lastz.pteVam1/DEF
  sed 's/cavPor3/equCab2/g; s/guinea pig/horse/' lastz.cavPor3/DEF | \
     sed 's/SEQ2_LIMIT=1/SEQ2_LIMIT=3/' > lastz.equCab2/DEF
  sed 's/equCab2/vicPac1/g; s/horse/alpaca/' lastz.equCab2/DEF > lastz.vicPac1/DEF
  sed 's/pteVam1/myoLuc1/g; s/megabat/microbat/' lastz.pteVam1/DEF | \
     sed 's/SEQ2_LIMIT=3/SEQ2_LIMIT=2/' > lastz.myoLuc1/DEF
  sed 's/equCab2/turTru1/g; s/horse/dolphin/' lastz.equCab2/DEF | \
     sed 's/SEQ2_LIMIT=3/SEQ2_LIMIT=2/' > lastz.turTru1/DEF
sed 's/equCab2/choHof1/g; s/horse/sloth/' lastz.equCab2/DEF > lastz.choHof1/DEF
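# the sed edits above clone the speTri1 DEF per species, bumping
# SEQ1_CHUNK 10000000 -> 20000000 and scaling SEQ2_LIMIT; e.g. the
# felCat3 DEF should come out with SEQ1_CHUNK=20000000, SEQ2_LIMIT=300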
  
  cd andy/
  for db in speTri1 felCat3 cavPor3 bosTau4 pteVam1 equCab2 vicPac1 myoLuc1 turTru1 choHof1; do
      ln -s ../lastz.${db}/DEF ${db}.DEF
  done
  
  screen -S speTri1
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium speTri1.DEF >& speTri1.do.log
  # [detach screen]
  #real    2059m30.699s
  
  screen -S felCat3
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium felCat3.DEF >& felCat3.do.log
  # [detach screen]
  #real    1574m47.522s
  
  screen -S bosTau4
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium bosTau4.DEF >& bosTau4.do.log
  # [detach screen]
  #real    1474m54.655s
  
  screen -S pteVam1
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
  -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium pteVam1.DEF >& pteVam1.do.log
  # [detach screen]
  #real    1168m33.923s
  
  screen -S equCab2
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium -syntenicNet equCab2.DEF >& equCab2.do.log
  # [detach screen]
  #real    1662m56.158s
  # (included syntenic net)
  
  screen -S vicPac1
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium vicPac1.DEF >& vicPac1.do.log
  # [detach screen]
  #real    1495m48.173s
  
  screen -S turTru1
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium turTru1.DEF >& turTru1.do.log
  # [detach screen]
  #real    1079m17.234s
  
  screen -S choHof1
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
    -chainMinScore=3000 -chainLinearGap=medium choHof1.DEF >& choHof1.do.log
  # [detach screen]
#real    1310m49.287s (script and cluster run stopped about halfway through;
# pk was too slow... remaining jobs started on swarm)
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium -continue=cat \
    choHof1.DEF >& choHof1.doAfterBlastz.log
  #real    257m32.701s
  
  screen -S cavPor3
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
    -workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \
    -smallClusterHub=memk -bigClusterHub=pk cavPor3.DEF >& cavPor3.do.log
  # [detach screen]
  #real    1370m5.258s
  # TROUBLE!  got to the 'load' step and failed.  This one needs a special
  # chain table and chainLink table to get loaded.
  cd ../lastz.cavPor3/axtChain/
  # figure out number of rows and average length
  wc -l *.tab
  #   27186468 chain.tab
  #  240602108 link.tab
  randomLines link.tab 10000000 stdout | awk '{print length($0)}' | sort | uniq -c
  randomLines chain.tab 1000000 stdout | awk '{print length($0)}' | sort | uniq -c
  # about 43 average length for the chainLink and 100 for the chain
  sed "s/hgLoadChain.*/hgsqldump hg19 chainSpeTri1Link --no-data --skip-comments | sed \'s\/SpeTri1\/CavPor3\/; s\/TYPE=MyISAM\/ENGINE=MyISAM max_rows=241000000 avg_row_length=43 pack_keys=1 CHARSET=latin1\/\' | hgsql hg19 \n\
  hgsqldump hg19 chainSpeTri1 --no-data --skip-comments | sed \'s\/SpeTri1\/CavPor3\/; s\/TYPE=MyISAM\/ENGINE=MyISAM max_rows=27200000 avg_row_length=100 pack_keys=1 CHARSET=latin1\/\' | hgsql hg19 \n\
  hgsql hg19 -e \"load data local infile \'chain.tab\' into table chainCavPor3\"\n\
  hgsql hg19 -e \"load data local infile \'link.tab\' into table chainCavPor3Link\"\n\
  hgsql hg19 -e \"INSERT into history (ix, startId, endId, who, what, modTime, errata) VALUES(NULL,0,0,\'aamp\',\'Loaded 27186468 chains into cavPor3 chain table manually\', NOW(), NULL)\"\
  /" loadUp.csh > manualLoadUp.csh
  chmod +x manualLoadUp.csh
  time nice -n +19 ./manualLoadUp.csh
  # [detach screen]
  #real    584m4.093s
  cd ../../andy/
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
    -workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \
    -smallClusterHub=memk -bigClusterHub=swarm -continue=download \
    cavPor3.DEF >& cavPor3.doAfterLoad.log
  #real    5m45.122s
  
  # syntenic nets
  
  screen -r bosTau4
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \
    -continue=syntenicNet bosTau4.DEF >& bosTau4.syn.log
  #real    31m48.545s
  
  # reciprocal best choHof1 and cavPor3
  screen -r choHof1
  time nice -n +19 doRecipBest.pl -buildDir=/hive/data/genomes/hg19/bed/lastz.choHof1 \
    -workhorse=hgwdev hg19 choHof1 >& choHof1.doRecip.log
  #real    367m52.993s
  
  screen -r cavPor3
  time nice -n +19 doRecipBest.pl -buildDir=/hive/data/genomes/hg19/bed/lastz.cavPor3 \
    -workhorse=hgwdev hg19 cavPor3 >& cavPor3.doRecip.log
  #real    123m3.795s
  
  # reciprocal best small six genome memk run
  
  screen -S recipRun
  mkdir recipRun
  cd recipRun/
  cat > gsub << 'EOF'
  #LOOP
  ./doRecip.sh $(path1)
  #ENDLOOP
EOF
  cat > doRecip.sh << 'EOF'
  #!/bin/csh -ef
  set db = $1
  /cluster/bin/scripts/doRecipBest.pl -workhorse=`uname -n` -stop=recipBest -buildDir=/hive/data/genomes/hg19/bed/lastz.$db hg19 $db >& $db.recipBest.log
EOF
  chmod +x doRecip.sh
  cat > db.lst << 'EOF'
  speTri1
  vicPac1
  myoLuc1
  turTru1
  pteVam1
  felCat3
  EOF
  ssh memk
  cd /hive/data/genomes/hg19/bed/andy/recipRun
  gensub2 db.lst single gsub jobList
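  # (gensub2 substitutes each line of db.lst for $(path1) in the gsub
  # template, so the resulting jobList is simply, for illustration:
  #     ./doRecip.sh speTri1
  #     ./doRecip.sh vicPac1
  #     ... one line per database in db.lst)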
  para create jobList
  para push
  # finished overnight
  exit # to hgwdev
  for log in *.recipBest.log; do
    db=${log%.recipBest.log};
    echo $db;
    doRecipBest.pl -workhorse=hgwdev -continue=download \
      -buildDir=/hive/data/genomes/hg19/bed/lastz.$db \
       hg19 $db >& $db.recipBestDownload.log;
  done
  
  # swaps for equCab2, felCat3, bosTau4, cavPor3
  
  cd /hive/data/genomes/hg19/bed/andy
  screen -r equCab2
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=kkr14u01 \
    -chainMinScore=3000 -chainLinearGap=medium -swap equCab2.DEF >& equCab2.doSwap.log
  # [detach screen]
  #real    486m35.206s
  
  screen -r felCat3
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=kkr14u02 \
    -chainMinScore=3000 -chainLinearGap=medium -swap felCat3.DEF >& felCat3.doSwap.log
  # [detach screen]
  #real    463m5.257s
  
  screen -r bosTau4
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=kkr14u03 \
    -chainMinScore=3000 -chainLinearGap=medium -swap bosTau4.DEF >& bosTau4.doSwap.log
  # [detach screen]
  #real    391m40.132s
  
  screen -r cavPor3
  time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=hgwdev \
    -chainMinScore=3000 -chainLinearGap=medium -swap cavPor3.DEF >& cavPor3.doSwap.log
  # [detach screen]
  #real    192m39.792s
  
  ##########################################################################
  # lastz Megabat/pteVam1 swap (DONE - 2013-07-05 - Hiram)
      # original alignment
      cd /hive/data/genomes/hg19/bed/lastzPteVam1.2009-06-04
      cat fb.hg19.chainPteVam1Link.txt
      #  1315799940 bases of 2897316137 (45.414%) in intersection
  
      # and this swap:
      mkdir /hive/data/genomes/pteVam1/bed/blastz.hg19.swap
      cd /hive/data/genomes/pteVam1/bed/blastz.hg19.swap
  
      time doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzPteVam1.2009-06-04/DEF \
  	-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=encodek \
  	-swap > swap.log 2>&1
      #   real    131m15.526s
  
      cat fb.pteVam1.chainHg19Link.txt
      #  1281818029 bases of 1839436660 (69.685%) in intersection
  
      cd /hive/data/genomes/pteVam1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  ##########################################################################
  # lastz Sloth/choHof1 swap (DONE - 2013-07-05 - Hiram)
      # original alignment
      cd /hive/data/genomes/hg19/bed/lastzChoHof1.2009-06-04
      cat fb.hg19.chainChoHof1Link.txt
      #  996000365 bases of 2897316137 (34.377%) in intersection
  
      # and this swap:
      mkdir /hive/data/genomes/choHof1/bed/blastz.hg19.swap
      cd /hive/data/genomes/choHof1/bed/blastz.hg19.swap
  
      time doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzChoHof1.2009-06-04/DEF \
  	-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=encodek \
  	-swap > swap.log 2>&1
      #  real    844m59.111s
  
      cat fb.choHof1.chainHg19Link.txt
      #   1013601302 bases of 2060419685 (49.194%) in intersection
  
      cd /hive/data/genomes/choHof1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  ##########################################################################
  # lastz Rock hyrax/proCap1 swap (DONE - 2013-07-05 - Hiram)
      # original alignment
      cd /hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
      cat fb.hg19.chainProCap1Link.txt
      #  894221652 bases of 2897316137 (30.864%) in intersection
  
      # and this swap:
      mkdir /hive/data/genomes/proCap1/bed/blastz.hg19.swap
      cd /hive/data/genomes/proCap1/bed/blastz.hg19.swap
  
      time doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23/DEF \
  	-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-swap > swap.log 2>&1
      #   real    346m15.361s
  
      cat fb.proCap1.chainHg19Link.txt
      #   865681626 bases of 2407847681 (35.953%) in intersection
  
      cd /hive/data/genomes/proCap1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  ##########################################################################
  # lastz Kangaroo rat/dipOrd1 swap (DONE - 2013-07-05 - Hiram)
      # original alignment
      cd /hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
      cat fb.hg19.chainDipOrd1Link.txt
      #  786938637 bases of 2897316137 (27.161%) in intersection
  
      # and this swap:
      mkdir /hive/data/genomes/dipOrd1/bed/blastz.hg19.swap
      cd /hive/data/genomes/dipOrd1/bed/blastz.hg19.swap
  
      time doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29/DEF \
  	-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-swap > swap.log 2>&1
      # about 2 hours
      time doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29/DEF \
  	-continue=download -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-swap > download.log 2>&1
      #  real    27m35.952s
  
      cat fb.dipOrd1.chainHg19Link.txt
      #  776300419 bases of 1844961421 (42.077%) in intersection
  
      cd /hive/data/genomes/dipOrd1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  ##########################################################################
  # LASTZ Venter's Poodle canFamPoodle1 (DONE - 2009-06-05,10 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzCanFamPoodle1.2009-06-05
      cd /hive/data/genomes/hg19/bed/lastzCanFamPoodle1.2009-06-05
  
      cat << '_EOF_' > DEF
  # human vs Venter's poodle
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
# QUERY: Venter's Poodle canFamPoodle1
  SEQ2_DIR=/scratch/data/canFamPoodle1/canFamPoodle1.2bit
  SEQ2_LEN=/scratch/data/canFamPoodle1/chrom.sizes
  SEQ2_CHUNK=40000000
  SEQ2_LAP=0
  SEQ2_LIMIT=600
  
  BASE=/hive/data/genomes/hg19/bed/lastzCanFamPoodle1.2009-06-05
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl \
          -verbose=2 \
          `pwd`/DEF \
          -noDbNameCheck -noLoadChainSplit \
          -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
          -chainMinScore=3000 -chainLinearGap=medium
      #	real    5162m58.743s
      cat fb.hg19.chainCanFamPoodle1Link.txt
      #	898034247 bases of 2897316137 (30.995%) in intersection
      #	the original canFam2 measured:
      #	1532073507 bases of 2897316137 (52.879%) in intersection
  
      time nice -n +19 doRecipBest.pl -buildDir=`pwd` \
  	hg19 canFamPoodle1 > rbest.log 2>&1 &
      #	real    811m27.965s
  
  ##############################################################################
  ## 46-Way Multiz (DONE - 2009-06-09,2009-11-10 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/multiz46way
      cd /hive/data/genomes/hg19/bed/multiz46way
  
      #	starting with the 46way tree created from 44 way tree
      cat << '_EOF_' > 46way.nh
  (((((((((((((((((
  ((hg19:0.006591,panTro2:0.006639):0.002184,gorGor1:0.009411):0.009942,
  ponAbe2:0.018342):0.014256,rheMac2:0.036199):0.021496,papHam1:0.04):0.02,
  calJac1:0.066389):0.056911,tarSyr1:0.135169):0.011307,
  (micMur1:0.091452,otoGar1:0.128984):0.035463):0.015304,
  tupBel1:0.183583):0.004688,(((((mm9:0.083220,rn4:0.090564):0.196605,
  dipOrd1:0.209532):0.022555,cavPor3:0.223415):0.009828,
  speTri1:0.146894):0.025042,
  (oryCun2:0.116009,ochPri2:0.198295):0.100037):0.015355):0.020666,
  (((vicPac1:0.105252,(turTru1:0.064182,bosTau4:0.121911):0.025111):0.039691,
  ((equCab2:0.107726,(felCat3:0.097971,canFam2:0.100888):0.049486):0.006252,
  (myoLuc1:0.141155,pteVam1:0.111787):0.033187):0.004179):0.011699,
  (eriEur1:0.220580,sorAra1:0.266859):0.056117):0.021065):0.023276,
  (((loxAfr3:0.083775,proCap1:0.152633):0.026190,echTel1:0.240221):0.049905,
  (dasNov2:0.115179,choHof1:0.096272):0.052373):0.006713):0.132748,
  macEug1:0.3):0.1,
  monDom5:0.325899):0.072430,ornAna1:0.453916):0.109903,
  ((galGal3:0.166386,taeGut1:0.170717):0.199763,
  anoCar1:0.509545):0.108130):0.166150,xenTro2:0.852482):0.300396,
  (((tetNig2:0.224774,fr2:0.205294):0.191836,
  (gasAcu1:0.313967,oryLat2:0.478451):0.058404):0.322824,
  danRer6:0.731166):0.155214):0.511293,petMar1:0.511293);
  '_EOF_'
      # << happy emacs
  
      #	Use this specification in the phyloGif tool:
      #	http://genome.ucsc.edu/cgi-bin/phyloGif
      #	to obtain a gif image for htdocs/images/phylo/hg19_46way.gif
  
      /cluster/bin/phast/all_dists 46way.nh > 46way.distances.txt
      #	Use this output to create the table below, with this perl script:
      cat << '_EOF_' > sizeStats.pl
  #!/usr/bin/env perl
  
  use strict;
  use warnings;
  
  open (FH, "grep -y hg19 46way.distances.txt | sort -k3,3n|") or
          die "can not read 46way.distances.txt";
  
  my $count = 0;
  while (my $line = <FH>) {
      chomp $line;
      my ($hg19, $D, $dist) = split('\s+', $line);
      my $chain = "chain" . ucfirst($D);
      my $B="/hive/data/genomes/hg19/bed/lastz.$D/fb.hg19." .
          $chain . "Link.txt";
      my $chainLinkMeasure =
          `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
      chomp $chainLinkMeasure;
      $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
      $chainLinkMeasure =~ s/\%//;
      my $swapFile="/hive/data/genomes/${D}/bed/blastz.hg19.swap/fb.${D}.chainHg19Link.txt";
      my $swapMeasure = "N/A";
      if ( -s $swapFile ) {
  	$swapMeasure =
  	    `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
  	chomp $swapMeasure;
  	$swapMeasure = 0.0 if (length($swapMeasure) < 1);
  	$swapMeasure =~ s/\%//;
      }
      my $orgName=
      `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
      chomp $orgName;
      if (length($orgName) < 1) {
          $orgName="N/A";
      }
      ++$count;
      if ($swapMeasure eq "N/A") {
  	printf "# %02d  %.4f - %s %s\t(%% %.3f) (%s)\n", $count, $dist,
  	    $orgName, $D, $chainLinkMeasure, $swapMeasure
      } else {
  	printf "# %02d  %.4f - %s %s\t(%% %.3f) (%% %.3f)\n", $count, $dist,
  	    $orgName, $D, $chainLinkMeasure, $swapMeasure
      }
  }
  close (FH);
  '_EOF_'
      # << happy emacs
      chmod +x ./sizeStats.pl
      ./sizeStats.pl
  #
  #	If you can fill in all the numbers in this table, you are ready for
  #	the multiple alignment procedure
  #
#                  featureBits chainLink measures
#    NN  distance - organism db	(% on hg19) (% on other, from the swap;
#                                             N/A if no swap was run)
  # 01  0.0132 - Chimp panTro2    (% 94.846) (% 94.908)
  # 02  0.0182 - Gorilla gorGor1  (% 59.484) (N/A)
  # 03  0.0371 - Orangutan ponAbe2        (% 91.350) (% 89.617)
  # 04  0.0692 - Rhesus rheMac2   (% 82.744) (% 87.422)
  # 05  0.0945 - Baboon papHam1   (% 82.810) (N/A)
  # 06  0.1409 - Marmoset calJac1 (% 70.860) (% 71.897)
  # 07  0.2665 - Tarsier tarSyr1  (% 47.830) (N/A)
  # 08  0.2696 - Mouse lemur micMur1      (% 46.519) (N/A)
  # 09  0.3071 - Bushbaby otoGar1 (% 43.644) (N/A)
  # 10  0.3343 - Horse equCab2    (% 57.050) (% 66.774)
  # 11  0.3416 - TreeShrew tupBel1        (% 36.156) (N/A)
  # 12  0.3451 - Dolphin turTru1  (% 48.398) (N/A)
  # 13  0.3500 - Squirrel speTri1 (% 35.713) (N/A)
  # 14  0.3611 - Alpaca vicPac1   (% 39.399) (N/A)
  # 15  0.3620 - Sloth choHof1    (% 34.377) (N/A)
  # 16  0.3653 - Megabat pteVam1  (% 45.414) (N/A)
  # 17  0.3732 - Elephant loxAfr3 (% 46.636) (% 42.430)
  # 18  0.3740 - Cat felCat3      (% 35.713) (% 61.104)
  # 19  0.3769 - Dog canFam2      (% 52.879) (% 62.055)
  # 20  0.3809 - Armadillo dasNov2        (% 33.543) (N/A)
# 21  0.3941 - Rabbit oryCun2   (% 44.317) (% 58.405)
  # 22  0.3946 - Microbat myoLuc1 (% 33.174) (N/A)
  # 23  0.4028 - Cow bosTau4      (% 46.506) (% 50.297)
  # 24  0.4363 - Guinea Pig cavPor3       (% 43.680) (N/A)
  # 25  0.4421 - Rock hyrax proCap1       (% 30.864) (N/A)
  # 26  0.4450 - Kangaroo rat dipOrd1     (% 27.161) (N/A)
  # 27  0.4764 - Pika ochPri2     (% 27.768) (N/A)
  # 28  0.4811 - Hedgehog eriEur1 (% 19.362) (N/A)
  # 29  0.5035 - Tenrec echTel1   (% 23.120) (N/A)
  # 30  0.5153 - Mouse mm9        (% 35.299) (% 38.693)
  # 31  0.5226 - Rat rn4  (% 32.879) (% 36.860)
  # 32  0.5274 - Shrew sorAra1    (% 19.760) (N/A)
  # 33  0.6394 - Wallaby macEug1  (% 6.011) (N/A)
  # 34  0.7653 - Opossum monDom5  (% 14.358) (N/A)
  # 35  0.9657 - Platypus ornAna1 (% 7.627) (% 11.259)
  # 36  1.0960 - Chicken galGal3  (% 3.591) (% 8.786)
  # 37  1.1003 - Zebra finch taeGut1      (% 3.496) (% 7.795)
  # 38  1.2394 - Lizard anoCar1   (% 3.591) (% 5.146)
  # 39  1.6403 - X. tropicalis xenTro2    (% 3.176) (% 6.773)
  # 40  1.9387 - Stickleback gasAcu1      (% 1.916) (% 11.175)
  # 41  1.9634 - Fugu fr2 (% 1.702) (% 10.929)
  # 42  1.9746 - Zebrafish danRer6        (% 3.051) (% 6.399)
  # 43  1.9829 - Tetraodon tetNig2        (% 1.712) (% 14.194)
  # 44  2.1031 - Medaka oryLat2   (% 1.849) (% 6.705)
  # 45  2.1108 - Lamprey petMar1  (% 1.082) (% 3.200)
  
      # create species list and stripped down tree for autoMZ
      sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
  	46way.nh > tmp.nh
      echo `cat tmp.nh` > tree-commas.nh
      echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
      sed 's/[()]//g; s/,/ /g' tree.nh > species.list
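    #	for illustration, a fragment like "(hg19:0.006591,panTro2:0.006639)"
    #	loses its branch lengths in tree-commas.nh: "(hg19,panTro2)";
    #	tree.nh then separates names with spaces: "(hg19 panTro2)";
    #	and species.list drops the punctuation entirely: "hg19 panTro2"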
  
      cd /hive/data/genomes/hg19/bed/multiz46way
      #	bash shell syntax here ...
      export H=/hive/data/genomes/hg19/bed
      mkdir mafLinks
      for G in `sed -e "s/hg19 //" species.list`
      do
  	mkdir mafLinks/$G
  	if [ -s ${H}/lastz.${G}/mafRBestNet/chr1.maf.gz ]; then
  	    echo "$G - recipBest"
  	    ln -s ${H}/lastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G
  	else
  	    if [ -s ${H}/lastz.${G}/mafSynNet/chr1.maf.gz ]; then
  		echo "$G - synNet"
  		ln -s ${H}/lastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G
  	    else
  		if [ -s ${H}/lastz.${G}/mafNet/chr1.maf.gz ]; then
  		    echo "$G - mafNet"
  		    ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G
  		else
  		    echo "missing directory lastz.${G}/*Net"
  		fi
  	    fi
  	fi
      done
  
      #	verify the alignment type is correct:
      for D in `cat /hive/users/hiram/bigWayHg19/ordered.list`
  do
      ls -l mafLinks/$D/chr1.maf.gz | awk '{print $NF}'
  done
      #	compare to the list at:
      #	http://genomewiki.ucsc.edu/index.php/Hg19_Genome_size_statistics
  
      #	need to split these things up into smaller pieces for
      #	efficient kluster run.
      cd /hive/data/genomes/hg19/bed/multiz46way
      mkdir mafSplit
      cd mafSplit
      #	mafSplitPos splits on gaps or repeat areas that will not have
      #	any chains, approx 5 Mbp intervals, gaps at least 10,000
    mafSplitPos -minGap=10000 hg19 5 stdout | sort -u \
  	| sort -k1,1 -k2,2n > mafSplit.bed
      #	There is a splitRegions.pl script here (copied from previous 44way)
      #	that can create a custom track from this mafSplit.bed file.
      #	Take a look at that in the browser and see if it looks OK,
      #	check the number of sections on each chrom to verify none are
      #	too large.  Despite the claim above, it does appear that some
      #	areas are split where actual chains exist.
  
      #	run a small kluster job to split them all
      ssh memk
      cd /hive/data/genomes/hg19/bed/multiz46way/mafSplit
      cat << '_EOF_' > runOne
  #!/bin/csh -ef
  set G = $1
  set C = $2
  mkdir -p $G
  pushd $G > /dev/null
  if ( -s ../../mafLinks/${G}/${C}.maf.gz ) then
      rm -f hg19_${C}.*.maf
      mafSplit ../mafSplit.bed hg19_ ../../mafLinks/${G}/${C}.maf.gz
      gzip hg19_${C}.*.maf
  else
      touch hg19_${C}.00.maf
      gzip hg19_${C}.00.maf
  endif
  popd > /dev/null
  '_EOF_'
      # << happy emacs
      chmod +x runOne
  
      cat << '_EOF_' > template
  #LOOP
  runOne $(root1) $(root2) {check out line $(root1)/hg19_$(root2).00.maf}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      for G in `sed -e "s/hg19 //" ../species.list`
  do
      echo $G
  done > species.list
      cut -f 1 ../../../chrom.sizes > chr.list
  
      gensub2 species.list chr.list template jobList
      para -ram=8g create jobList
      para try ... check ... push ... etc...
  # Completed: 4185 of 4185 jobs
  # CPU time in finished jobs:      25547s     425.78m     7.10h    0.30d  0.001 y
  # IO & Wait Time:                268664s    4477.73m    74.63h    3.11d  0.009 y
  # Average job time:                  70s       1.17m     0.02h    0.00d
  # Longest finished job:            1234s      20.57m     0.34h    0.01d
  # Submission to last job:          3048s      50.80m     0.85h    0.04d
  
      # the autoMultiz cluster run
      ssh swarm
      cd /hive/data/genomes/hg19/bed/multiz46way/
  
      mkdir splitRun
      cd splitRun
      mkdir maf run
      cd run
      mkdir penn
      cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn
      cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn
      cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn
  
      #	set the db and pairs directories here
      cat > autoMultiz.csh << '_EOF_'
  #!/bin/csh -ef
  set db = hg19
  set c = $1
  set result = $2
  set run = `/bin/pwd`
  set tmp = /scratch/tmp/$db/multiz.$c
  set pairs = /hive/data/genomes/hg19/bed/multiz46way/mafSplit
  /bin/rm -fr $tmp
  /bin/mkdir -p $tmp
  /bin/cp -p ../../tree.nh ../../species.list $tmp
  pushd $tmp > /dev/null
  foreach s (`/bin/sed -e "s/ $db//" species.list`)
      set in = $pairs/$s/$c.maf
      set out = $db.$s.sing.maf
      if (-e $in.gz) then
          /bin/zcat $in.gz > $out
  	if (! -s $out) then
  	    echo "##maf version=1 scoring=autoMZ" > $out
  	endif
      else if (-e $in) then
          /bin/ln -s $in $out
      else
          echo "##maf version=1 scoring=autoMZ" > $out
      endif
  end
  set path = ($run/penn $path); rehash
  $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf \
  	> /dev/null
  popd > /dev/null
  /bin/rm -f $result
  /bin/cp -p $tmp/$c.maf $result
  /bin/rm -fr $tmp
  /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
  '_EOF_'
  # << happy emacs
      chmod +x autoMultiz.csh
  
      cat  << '_EOF_' > template
  #LOOP
  ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/hg19/bed/multiz46way/splitRun/maf/$(root1).maf}
  #ENDLOOP
  '_EOF_'
  # << happy emacs
  
      find ../../mafSplit -type f | grep hg19_ | xargs -L 1 basename \
  	| sed -e "s/.gz//" | sort -u > chr.part.list
      gensub2 chr.part.list single template jobList
      para -ram=8g create jobList
    #	initial run experience suggests some of the big jobs reach 8 Gb
    #	of memory usage, so tell parasol to limit the number of jobs per
    #	node to avoid thrashing
      para -ram=8g try
      para -ram=8g push
  # Completed: 504 of 504 jobs
  # CPU time in finished jobs:    1342039s   22367.32m   372.79h   15.53d  0.043 y
  # IO & Wait Time:                 63835s    1063.91m    17.73h    0.74d  0.002 y
  # Average job time:                2789s      46.49m     0.77h    0.03d
  # Longest finished job:           12625s     210.42m     3.51h    0.15d
  # Submission to last job:         15300s     255.00m     4.25h    0.18d
  
      # put the split maf results back together into a single maf file
      #	eliminate duplicate comments
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/multiz46way/splitRun
      mkdir ../maf
    #	the sed edits take out partitioning name information from the comments
    #	so the multiple parts will condense to a smaller number of lines
    #	this takes almost 2 hours, resulting in a bit over 150 Gb,
    #	almost all chrom files over 1 Gb, up to almost 10 Gb for chr2
    #	HOWEVER, maintaining these comments is actually not necessary,
    #	as they are lost during the mafAddIRows step anyway
  
    cat << '_EOF_' > runOne
  #!/bin/csh -fe
  set C = $1
  if ( -s ../maf/${C}.maf.gz ) then
      rm -f ../maf/${C}.maf.gz
  endif
  head -q -n 1 maf/hg19_${C}.*.maf | sort -u > ../maf/${C}.maf
  grep -h "^#" maf/hg19_${C}.*.maf | egrep -v "maf version=1|eof maf" | \
      sed -e "s#${C}.[0-9][0-9]*#${C}#g; s#_MZ_[^ ]* # #g;" \
          | sort -u >> ../maf/${C}.maf
  grep -h -v "^#" `ls maf/hg19_${C}.*.maf | sort -t. -k2,2n` >> ../maf/${C}.maf
  tail -q -n 1 maf/hg19_${C}.*.maf | sort -u >> ../maf/${C}.maf
  '_EOF_'
      # << happy emacs
      chmod +x runOne
  
    cat << '_EOF_' > template
  #LOOP
  runOne $(root1) {check out exists+ ../maf/$(root1).maf}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      cut -f1 ../../../chrom.sizes > chr.list
      ssh encodek
      cd /hive/data/genomes/hg19/bed/multiz46way/splitRun
      gensub2 chr.list single template jobList
      para create jobList
      para try ... check ... push ... etc ...
  # Completed: 92 of 93 jobs
  # Crashed: 1 jobs
  # CPU time in finished jobs:        412s       6.86m     0.11h    0.00d  0.000 y
  # IO & Wait Time:                 21187s     353.12m     5.89h    0.25d  0.001 y
  # Average job time:                 235s       3.91m     0.07h    0.00d
  # Longest finished job:            1529s      25.48m     0.42h    0.02d
  # Submission to last job:          1542s      25.70m     0.43h    0.02d
  
    #	one of the results is completely empty, so the grep for results failed:
    #	../maf/chrUn_gl000226.maf has only header comments, no alignments
  
      # load tables for a look
      ssh hgwdev
      mkdir -p /gbdb/hg19/multiz46way/maf
      cd /hive/data/genomes/hg19/bed/multiz46way/maf
      ln -s `pwd`/*.maf /gbdb/hg19/multiz46way/maf
  
      # this generates an immense multiz46way.tab file in the directory
      #	where it is running.  Best to run this over in scratch.
      cd /data/tmp
      time nice -n +19 hgLoadMaf \
  	-pathPrefix=/gbdb/hg19/multiz46way/maf hg19 multiz46way
      #	Loaded 33558634 mafs in 93 files from /gbdb/hg19/multiz46way/maf
      #	real    512m8.053s
  
      # load summary table
      time nice -n +19 cat /gbdb/hg19/multiz46way/maf/*.maf \
  	| $HOME/bin/$MACHTYPE/hgLoadMafSummary hg19 -minSize=30000 -verbose=2 \
  		-mergeGap=1500 -maxSize=200000  multiz46waySummary stdin
      #	real    92m30.700s
  # flushSummaryBlocks: output 45 blocks
  # Created 8766427 summary blocks from 645238409 components and
  #	33558634 mafs from stdin
  # blocks too small to be used: 29456
  # Loading into hg19 table multiz46waySummary...
  
      # Gap Annotation
      # prepare bed files with gap info
      mkdir /hive/data/genomes/hg19/bed/multiz46way/anno
      cd /hive/data/genomes/hg19/bed/multiz46way/anno
      mkdir maf run
  
      #	most of these will already exist from previous multiple alignments
      #	remove the echo from in front of the twoBitInfo command to get them
      #	to run if this loop appears to be correct
      for DB in `cat ../species.list`
  do
      CDIR="/hive/data/genomes/${DB}"
      if [ ! -f ${CDIR}/${DB}.N.bed ]; then
  	echo "creating ${DB}.N.bed"
  	echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
      else
  	ls -og ${CDIR}/${DB}.N.bed
      fi
  done
  
      cd run
      rm -f nBeds sizes
      for DB in `sed -e "s/hg19 //" ../../species.list`
  do
      echo "${DB} "
      ln -s  /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
      echo ${DB}.bed  >> nBeds
      ln -s  /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
      echo ${DB}.len  >> sizes
  done
  
      #	the annotation step requires large memory, run on memk nodes
      ssh memk
      cd /hive/data/genomes/hg19/bed/multiz46way/anno/run
      ls ../../maf | sed -e "s/.maf//" > chr.list
      cat << '_EOF_' > template
  #LOOP
  ./anno.csh $(root1) {check out line+ ../maf/$(root1).maf}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      cat << '_EOF_' > anno.csh
  #!/bin/csh -fe
  
  set inMaf = ../../maf/$1.maf
  set outMaf = ../maf/$1.maf
  rm -f $outMaf
  mafAddIRows -nBeds=nBeds $inMaf /hive/data/genomes/hg19/hg19.2bit $outMaf
  '_EOF_'
      # << happy emacs
      chmod +x anno.csh
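    #	mafAddIRows adds "i" (irow) status lines to each maf block, using
    #	hg19.2bit plus the *.N.bed files to classify the gap between blocks
    #	for each species (e.g. unaligned sequence vs. missing data / N's),
    #	which the browser uses to draw those gaps differently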
  
      gensub2 chr.list single template jobList
      para -ram=30g create jobList
      #	specify lots of ram to get one job per node
      para -ram=30g push
      #	
  # Completed: 93 of 93 jobs
  # CPU time in finished jobs:      10371s     172.85m     2.88h    0.12d  0.000 y
  # IO & Wait Time:                  3365s      56.09m     0.93h    0.04d  0.000 y
  # Average job time:                 148s       2.46m     0.04h    0.00d
  # Longest finished job:            1153s      19.22m     0.32h    0.01d
  # Submission to last job:          7402s     123.37m     2.06h    0.09d
  
      ssh hgwdev
      rm -fr /gbdb/hg19/multiz46way/maf
      mkdir /gbdb/hg19/multiz46way/maf
      cd /hive/data/genomes/hg19/bed/multiz46way/anno/maf
      ln -s `pwd`/*.maf /gbdb/hg19/multiz46way/maf/
      #	by loading this into the table multiz46way, it will replace the
      #	previously loaded table with the unannotated mafs
      #	huge temp files are made, do them on local disk
      cd /data/tmp
      time nice -n +19 hgLoadMaf \
  	-pathPrefix=/gbdb/hg19/multiz46way/maf hg19 multiz46way
      #	real    113m11.709s
      #	Loaded 33612571 mafs in 93 files from /gbdb/hg19/multiz46way/maf
  
      time nice -n +19 cat /gbdb/hg19/multiz46way/maf/*.maf \
  	| hgLoadMafSummary hg19 -minSize=30000 -mergeGap=1500 \
                   -maxSize=200000  multiz46waySummary stdin
      #	with the quality annotated mafs, and mem interference on hgwdev:
      #	Created 8514381 summary blocks from 600504256 components \
      #	and 33320838 mafs from stdin
      #	real    169m56.936s
  
      #	with the Irow annotations after the multiz fix:
      #	Created 8514380 summary blocks from 600499937
      #		components and 33298894 mafs from stdin
      #	real    184m42.893s
      #	user    70m44.431s
      #	sys     8m7.970s
  
      #	Created 8514078 summary blocks from 604683213 components
      #	and 35125649 mafs from stdin
      #	real    130m55.115s
      #	user    71m37.409s
      #	sys     8m5.110s
  
      #	by loading this into the table multiz46waySummary, it will replace
      #	the previously loaded table with the unannotated mafs
      #	remove the multiz46way*.tab files in this /data/tmp directory
  # -rw-rw-r--   1 1949221892 Nov 15 14:04 multiz46way.tab
  # -rw-rw-r--   1  417994189 Nov 15 20:57 multiz46waySummary.tab
      wc -l multiz46way*.tab
      #	33964377 multiz46way.tab
      #	 8514078 multiz46waySummary.tab
      #	42478455 total
      rm multiz46way*.tab
  
      # create some downloads
      mkdir -p /hive/data/genomes/hg19/bed/multiz46way/download/maf
      cd /hive/data/genomes/hg19/bed/multiz46way/download/maf
      time cp -p ../../anno/maf/chr*.maf .
      #	real    72m46.514s
      #	user    0m1.293s
      #	sys     5m15.981s
    time gzip --rsyncable *.maf
      #	real    185m37.884s
      #	user    179m51.161s
      #	sys     3m48.016s
      time md5sum *.gz > md5sum.txt
      #	real    3m59.009s
      #	user    1m19.338s
      #	sys     0m18.976s
  
  ##############################################################################
  # LASTZ Sea Hare aplCal1 (STARTING - 2009-06-08 - Galt)
  # To Do #1813 remove aplCal1 <-> hg19 chain/nets (2011-10-07 Chin)
  # However, only tables and download files are physically drop/removed
  # All data created stay on hive.
      mkdir /hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08
      cd /hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08
  
      cat << '_EOF_' > DEF
  # Human vs. Sea Hare
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=100000000
  SEQ1_LAP=10000
  SEQ2_LIMIT=5
  
  # QUERY: Sea Hare aplCal1
  SEQ2_DIR=/scratch/data/aplCal1/aplCal1.2bit
  SEQ2_LEN=/scratch/data/aplCal1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=300
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      #   (NOTE I SHOULD NOT HAVE USED  -qRepeats=windowmaskerSdust)
      screen
      time nice +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-qRepeats=windowmaskerSdust \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
  	>& do.log &
      #	real about one hour but one job hung
  
      # resuming from failure
      # edited loadUp.csh, commenting out the first completed step
      # and removing the unneeded -qRepeats=windowmaskerSdust
      # from the next step, now run it to complete the load step.
      /hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08/axtChain/loadUp.csh \
          >& continue-loadUp.log&
  
      # continue from step 'download'
      time nice +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
          -continue download \
          >& continue-download.log &
  
      cat fb.hg19.chainAplCal1Link.txt
      #   19675762 bases of 2897316137 (0.679%) in intersection
  
      #	running the swap - DONE - 2009-06-02
      #   (NOTE I SHOULD NOT HAVE USED  -qRepeats=windowmaskerSdust)
      mkdir /hive/data/genomes/aplCal1/bed/blastz.hg19.swap
      cd /hive/data/genomes/aplCal1/bed/blastz.hg19.swap
      time nice +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08/DEF \
  	-qRepeats=windowmaskerSdust \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
  	-swap >& swap.log &
      #	real  time not long
  
      # resuming from failure
      # edited loadUp.csh, commenting out the first completed step
      # and removing the unneeded -tRepeats=windowmaskerSdust
      # from the next step, now run it to complete the load step.
      /hive/data/genomes/aplCal1/bed/blastz.hg19.swap/axtChain/loadUp.csh \
          >& continue-loadUp.log&
  
      time nice +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
          -continue download \
  	-swap >& continue-download.log &
  
      cat fb.aplCal1.chainHg19Link.txt
      #   14163455 bases of 619228098 (2.287%) in intersection
  
  #########################################################################
  # EXONIPHY Hg19, lifted from hg18 (DONE - 2009-06-19 - Hiram)
#	needed for ucscGenes11 building
      # create a syntenic liftOver chain file
      cd /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06
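    #	pipeline overview: netSyntenic adds synteny annotations to the
    #	noClass net, netFilter -syn keeps only the syntenic subset,
    #	netChainSubset pulls out the chains used by that net, and
    #	chainStitchId rejoins chain fragments sharing an id into single
    #	chains suitable for liftOver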
      time nice -n +19 netSyntenic run.chain/hg18.hg19.noClass.net.gz stdout \
  	| netFilter -syn stdin | netChainSubset -verbose=0 stdin \
  		run.chain/hg18.hg19.all.chain.gz stdout \
  	| chainStitchId stdin stdout | gzip -c > hg18.hg19.syn.chain.gz
      #	memory usage 55492608, utime 3 s/100, stime 3
      #	real    2m35.613s
  
      #	real    5m55.575s
      #	slightly smaller than the ordinary liftOver chain file:
  # -rw-rw-r-- 1 137245 Mar  6 17:37 hg18ToHg19.over.chain.gz
  # -rw-rw-r-- 1  96115 Jun 19 14:30 hg18.hg19.syn.chain.gz
  
      # exoniphyHg19.gp is prepared as follows
      mkdir /cluster/data/hg19/bed/exoniphy
      cd /cluster/data/hg19/bed/exoniphy
      hgsql hg18 -e "select * from exoniphy" -N > exoniphyHg18.gp
      time nice -n +19 liftOver -genePred exoniphyHg18.gp \
        /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06/hg18.hg19.syn.chain.gz \
  	    exoniphyHg19.gp unmapped
      wc -l *
      #	178162 exoniphyHg18.gp
      #	178109 exoniphyHg19.gp
      #	   106 unmapped
  
      mkdir dump
      cd dump
      hgsqldump --all -c --tab=. hg18 exoniphy
      cd ..
      chmod 775 dump
      hgsql hg19 < dump/exoniphy.sql
      hgsql hg19 \
  -e "load data local infile \"exoniphyHg19.gp\" into table exoniphy;"
      nice -n +19 featureBits hg19 exoniphy
      #	27421336 bases of 2897316137 (0.946%) in intersection
      nice -n +19 featureBits hg18 exoniphy
      #	27475705 bases of 2881515245 (0.954%) in intersection
  
  #########################################################################
  # BIOCYCTABLES NEEDED BY hgGene (DONE - 2009-06-22 - Hiram)
  
  # First register with BioCyc to download their HumanCyc database
# The site will email you the URL for download.  Beware: they supply
#	a URL to a directory chock-a-block full of data, almost 7 Gb;
#	you only need one file
  
      mkdir /hive/data/outside/bioCyc/090623
      cd /hive/data/outside/bioCyc/090623
  
      mkdir download
      cd download
      wget --timestamping --no-directories --recursive \
  	"http://bioinformatics.ai.sri.com/ecocyc/dist/flatfiles-52983746/humancyc-flatfiles.tar.Z"
      tar xvzf humancyc-flatfiles.tar.Z
  
      mkdir /hive/data/genomes/hg19/bed/bioCyc
      cd /hive/data/genomes/hg19/bed/bioCyc
      #	clean the headers from these files
      grep -E -v "^#|^UNIQUE-ID" /hive/data/outside/bioCyc/090623/genes.col \
  	> genes.tab
      #	this file isn't consistent in its number of columns
      grep -E -v "^#|^UNIQUE-ID" /hive/data/outside/bioCyc/090623/pathways.col \
  | awk -F'\t' '{if (140 == NF) { printf "%s\t\t\n", $0; } else { print $0}}' \
  	> pathways.tab
  
      hgsql hg19 -e 'create database bioCyc090623'
  
      hgLoadSqlTab bioCyc090623 genes ~/src/hg/lib/bioCycGenes.sql ./genes.tab
      hgLoadSqlTab bioCyc090623 pathways ~/src/hg/lib/bioCycPathways.sql ./pathways.tab
  
  # Create bioCycMapDesc.tab
      hgsql bioCyc090623 -N \
  	-e 'select UNIQUE_ID, NAME from pathways' | sort -u >  bioCycMapDesc.tab
  XXX see alternative below
  
      #	this kgBioCyc0 thing needs kgXref and other UCSC gene tables to work
  # Create bioCycPathway.tab
      kgBioCyc0 bioCyc090623 hg19 hg19
  
      hgLoadSqlTab hg19 bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
      hgLoadSqlTab hg19 bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
  
  XXX maybe instead do this in the gene build procedure
      # from the UCSC genes build procedure
  # Do BioCyc Pathways build
      mkdir $dir/bioCyc
      cd $dir/bioCyc
      grep -v '^#' $bioCycPathways > pathways.tab
      grep -v '^#' $bioCycGenes > genes.tab
      kgBioCyc1 genes.tab pathways.tab $db bioCycPathway.tab bioCycMapDesc.tab
      hgLoadSqlTab $tempDb bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
      hgLoadSqlTab $tempDb bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
  
  ##############################################################################
  nscanGene (2009-06-22 markd)
     # nscanGene track from WUSTL
     cd /cluster/data/hg19/bed/nscan
     wget http://mblab.wustl.edu/~jeltje/hg19_tracks/hg19.updated.gtf
     wget http://mblab.wustl.edu/~jeltje/hg19_tracks/hg19.readme
     wget -r -np -l 1 http://mblab.wustl.edu/~jeltje/hg19_tracks/hg19_proteins
     bzip2 hg19.updated.gtf hg19_proteins/*.fa
  
     # load track
     gtfToGenePred -genePredExt hg19.updated.gtf.bz2 stdout| hgLoadGenePred -genePredExt hg19 nscanGene stdin
     bzcat hg19_proteins/chr*.fa.bz2 | hgPepPred hg19 generic nscanPep stdin
     rm *.tab
  
     # validate same number of transcripts and peptides are loaded
     hgsql -Ne 'select count(*) from nscanGene' hg19
     hgsql -Ne 'select count(*) from nscanPep' hg19
  
     # validate search expression
   hgsql -Ne 'select name from nscanGene' hg19 | egrep -v -e '^chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)?$' |wc -l
  
  #########################################################################
  # Phylogenetic tree from 46-way for chrX  (DONE - 2009-10-26 - Hiram)
  # 	We need two trees, one for chrX only, and a second for all other chroms
      mkdir /hive/data/genomes/hg19/bed/multiz46way/4dX
      cd /hive/data/genomes/hg19/bed/multiz46way/4dX
  
      hgsql hg19 -Ne \
      "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA' and refGene.chrom='chrX'" \
  	| cut -f 2-20 > refSeqReviewed.gp
      wc -l refSeqReviewed.gp
      # 727 refSeqReviewed.gp
      genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
      wc -l refSeqReviewedNR.gp
    # 401 refSeqReviewedNR.gp
  
      ssh memk
      mkdir /hive/data/genomes/hg19/bed/multiz46way/4dX/run
      cd /hive/data/genomes/hg19/bed/multiz46way/4dX/run
      mkdir ../mfa
  
# whole chrom mafs version, using the new memory-efficient version of
# phast, from Melissa Hubisz at Cornell
#	mjhubisz at gmail.com
  
      cat << '_EOF_' > 4dX.csh
  #!/bin/csh -fe
  set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
  set r = "/hive/data/genomes/hg19/bed/multiz46way"
  set c = $1
  set infile = $r/maf/$2
  set outfile = $3
  cd /scratch/tmp
  # 'clean' maf
  perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
  awk -v C=$c '$2 == C {print}' $r/4dX/refSeqReviewedNR.gp > $c.gp
  set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
  $PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss
  $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4dX/$outfile
  rm -f $c.gp $c.maf $c.ss
  '_EOF_'
      # << happy emacs
      chmod +x 4dX.csh
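    #	the perl substitution trims each maf source name down to the db name,
    #	e.g. (illustration) "s hg19.chrX 0 38 + 155270560 ATG..." becomes
    #	"s hg19 0 38 + 155270560 ATG...", so msa_view sees one name per
    #	species, matching the names in species.list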
  
      ls -1S /hive/data/genomes/hg19/bed/multiz46way/maf/chrX.maf | \
          egrep -E -v "chrM|chrUn|random|_hap" | sed -e "s#.*multiz46way/maf/##" \
  	> maf.list
  
      cat << '_EOF_' > template
  #LOOP
  4dX.csh $(root1) $(path1) {check out line+ mfa/$(root1).mfa}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      gensub2 maf.list single template stdout | tac > jobList
      #	run this one job on hgwdev, takes a few minutes:
      ./4dX.csh chrX chrX.maf mfa/chrX.mfa
      #	not sure what these warnings are about:
  # WARNING: ignoring out-of-range feature
  # chrX    genepred        CDS     1       -1      .       +       2       transcript_id "NM_000475"
  # WARNING: ignoring out-of-range feature
  # chrX    genepred        CDS     1       -1      .       +       2       transcript_id "NM_005365.2"
  
      # combine mfa files
      cd ..
      sed -e "s/ /,/g" ../species.list > species.lst
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
  	--aggregate `cat species.lst` mfa/*.mfa | sed s/"> "/">"/ > 4dX.chrX.mfa
  
  XXXX ! 2010-12-29 - There is an error in the awk below.
  XXXX It ends up only working on the first file mfa/chr1.mfa
  XXXX with the result in placentals.mfa only from chr1.mfa
      sed -e 's/,macEug1.*//' species.lst > placentals.lst
      awk '
  BEGIN { good = 1 }
  {
      if (match($0, "^> macEug1")) { good = 0 }
      if (good) {print}
  }
  ' mfa/*.mfa > placentals.mfa
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
  	--aggregate `cat placentals.lst` placentals.mfa | sed s/"> "/">"/ \
  	> 4dX.placentals.mfa
  
  XXXX ! 2010-12-29 - There is an error in the awk below.
  XXXX It ends up only working on the first file mfa/chr1.mfa
  XXXX with the result in primates.mfa only from chr1.mfa
      sed -e 's/,tupBel1.*//' species.lst > primates.lst
      awk '
  BEGIN { good = 1 }
  {
      if (match($0, "^> tupBel1")) { good = 0 }
      if (good) {print}
  }
  ' mfa/*.mfa > primates.mfa
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
  	--aggregate `cat primates.lst` primates.mfa | sed -e "s/> />/" \
  	> 4dX.primates.mfa
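    #	As the XXXX notes above explain, "good" is set only in BEGIN and is
    #	never reset, so after the first macEug1 (or tupBel1) header every
    #	subsequent mfa/*.mfa file is skipped entirely.  A per-file reset
    #	would behave as intended; a sketch (not what was run for this build):
    #	awk '
    #	FNR == 1     { good = 1 }   # reset at the start of each input file
    #	/^> macEug1/ { good = 0 }   # stop copying at the first non-placental
    #	good         { print }
    #	' mfa/*.mfa > placentals.mfa
    #	(match "^> tupBel1" instead when building primates.mfa)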
  
      # use phyloFit to create tree model (output is phyloFit.mod)
      time /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
  	--EM --precision MED --msa-format FASTA --subst-mod REV \
  	--tree ../tree-commas.nh 4dX.chrX.mfa
    #	real    0m54.139s
      mv phyloFit.mod phyloFit.chrX.mod
  
      grep TREE phyloFit.chrX.mod | sed 's/TREE\:\ //' > tree_4d.chrX.46way.nh
  
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/tree_doctor \
          --no-branchlen --prune-all-but=`cat primates.lst` ../tree-commas.nh \
                  > tree_commas.primates.nh
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/tree_doctor \
          --no-branchlen --prune-all-but=`cat placentals.lst` ../tree-commas.nh \
                  > tree_commas.placentals.nh
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
  	--EM --precision MED --msa-format FASTA --subst-mod REV \
  	--tree tree_commas.primates.nh 4dX.primates.mfa
      mv phyloFit.mod phyloFit.chrX.primates.mod
      grep TREE phyloFit.chrX.primates.mod | sed 's/TREE\:\ //' \
  	> tree_4d.chrX.primates.46way.nh
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
  	--EM --precision MED --msa-format FASTA --subst-mod REV \
  	--tree tree_commas.placentals.nh 4dX.placentals.mfa
      mv phyloFit.mod phyloFit.chrX.placentals.mod
      grep TREE phyloFit.chrX.placentals.mod | sed 's/TREE\:\ //' \
  	> tree_4d.chrX.placentals.46way.nh
  
  #########################################################################
  # Phylogenetic tree from 46-way for non-chrX  (DONE - 2009-10-27 - Hiram)
  # 	We need two trees, one for chrX only, and a second for all other chroms
      mkdir /hive/data/genomes/hg19/bed/multiz46way/4dNoX
      cd /hive/data/genomes/hg19/bed/multiz46way/4dNoX
  
      hgsql hg19 -Ne \
      "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA'" \
  	| cut -f 2-20 | egrep -E -v "chrM|chrUn|random|_hap|chrX" \
  	> refSeqReviewed.gp
      wc -l refSeqReviewed.gp
      # 12977 refSeqReviewed.gp
      genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
      wc -l refSeqReviewedNR.gp
    # 7252 refSeqReviewedNR.gp
  
      ssh memk
      mkdir /hive/data/genomes/hg19/bed/multiz46way/4dNoX/run
      cd /hive/data/genomes/hg19/bed/multiz46way/4dNoX/run
      mkdir ../mfa
  
# whole chrom mafs version, using the new memory-efficient version of
# phast, from Melissa Hubisz at Cornell
#	mjhubisz at gmail.com
  
      cat << '_EOF_' > 4dNoX.csh
  #!/bin/csh -fe
  set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
  set r = "/hive/data/genomes/hg19/bed/multiz46way"
  set c = $1
  set infile = $r/maf/$2
  set outfile = $3
  cd /scratch/tmp
  # 'clean' maf
  perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
  awk -v C=$c '$2 == C {print}' $r/4dNoX/refSeqReviewedNR.gp > $c.gp
  set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
  $PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss
  $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4dNoX/$outfile
  rm -f $c.gp $c.maf $c.ss
  '_EOF_'
      # << happy emacs
      chmod +x 4dNoX.csh
  
      ls -1S /hive/data/genomes/hg19/bed/multiz46way/maf/chr*.maf | \
          egrep -E -v "chrM|chrUn|random|_hap|chrX" \
  	| sed -e "s#.*multiz46way/maf/##" \
  	> maf.list
  
      cat << '_EOF_' > template
  #LOOP
  4dNoX.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      gensub2 maf.list single template stdout | tac > jobList
      para try ... check ... push ... etc
      para time
  # Completed: 23 of 23 jobs
  # CPU time in finished jobs:       9032s     150.53m     2.51h    0.10d  0.000 y
  # IO & Wait Time:                   672s      11.21m     0.19h    0.01d  0.000 y
  # Average job time:                 422s       7.03m     0.12h    0.00d
  # Longest finished job:             860s      14.33m     0.24h    0.01d
  # Submission to last job:          1210s      20.17m     0.34h    0.01d
  
      # combine mfa files
      cd ..
      sed -e "s/ /,/g" ../species.list > species.lst
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
  	--aggregate `cat species.lst` mfa/*.mfa | sed s/"> "/">"/ \
  	> 4dNoX.all.mfa
  
      sed -e 's/,macEug1.*//' species.lst > placentals.lst
      awk '
  BEGIN { good = 1 }
  {
      if (match($0, "^> macEug1")) { good = 0 }
      if (good) {print}
  }
  ' mfa/*.mfa > placentals.mfa
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
  	--aggregate `cat placentals.lst` placentals.mfa | sed s/"> "/">"/ \
  	> 4dNoX.placentals.mfa
  
      sed -e 's/,tupBel1.*//' species.lst > primates.lst
      awk '
  BEGIN { good = 1 }
  {
      if (match($0, "^> tupBel1")) { good = 0 }
      if (good) {print}
  }
  ' mfa/*.mfa > primates.mfa
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
  	--aggregate `cat primates.lst` primates.mfa | sed -e "s/> />/" \
  	> 4dNoX.primates.mfa
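    #	note: the same single-file caveat flagged with XXXX in the 4dX
    #	section applies to the two awk filters above; the FNR==1 per-file
    #	reset sketched there would be needed here as well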
  
  
      # use phyloFit to create tree model (output is phyloFit.mod)
      time /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
  	--EM --precision MED --msa-format FASTA --subst-mod REV \
  	--tree ../tree-commas.nh 4dNoX.all.mfa
      #	about 40 minutes
      mv phyloFit.mod phyloFit.NoChrX.mod
  
    grep TREE phyloFit.NoChrX.mod | sed 's/TREE\:\ //' > tree_4d.NoChrX.46way.nh
  
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/tree_doctor \
          --no-branchlen --prune-all-but=`cat primates.lst` ../tree-commas.nh \
                  > tree_commas.primates.nh
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/tree_doctor \
          --no-branchlen --prune-all-but=`cat placentals.lst` ../tree-commas.nh \
                  > tree_commas.placentals.nh
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
  	--EM --precision MED --msa-format FASTA --subst-mod REV \
  	--tree tree_commas.primates.nh 4dNoX.primates.mfa
      mv phyloFit.mod phyloFit.NoChrX.primates.mod
      grep TREE phyloFit.NoChrX.primates.mod | sed 's/TREE\:\ //' \
  	> tree_4d.NoChrX.primates.46way.nh
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
  	--EM --precision MED --msa-format FASTA --subst-mod REV \
  	--tree tree_commas.placentals.nh 4dNoX.placentals.mfa
      mv phyloFit.mod phyloFit.NoChrX.placentals.mod
      grep TREE phyloFit.NoChrX.placentals.mod | sed 's/TREE\:\ //' \
  	> tree_4d.NoChrX.placentals.46way.nh
  
  #########################################################################
  # Phylogenetic tree from 46-way  (DONE - 2009-06-25,07-07 - Hiram)
  #	This was an early first time experiment.  All this was redone
  #	above for chrX only and non-chrX trees
  
      # Extract 4-fold degenerate sites based on
      # of RefSeq Reviewed, coding
      mkdir /hive/data/genomes/hg19/bed/multiz46way/4d
      cd /hive/data/genomes/hg19/bed/multiz46way/4d
  
      hgsql hg19 -Ne \
      "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA'" | cut -f 2-20 \
  	> refSeqReviewed.gp
      wc -l refSeqReviewed.gp
      # 14077 refSeqReviewed.gp
      genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
      wc -l refSeqReviewedNR.gp
      # 7951 refSeqReviewedNR.gp
  
      ssh memk
      mkdir /hive/data/genomes/hg19/bed/multiz46way/4d/run
      cd /hive/data/genomes/hg19/bed/multiz46way/4d/run
      mkdir ../mfa
  
# whole chrom mafs version, using the new memory-efficient version of
# phast, from Melissa Hubisz at Cornell
#	mjhubisz at gmail.com
      cat << '_EOF_' > 4d.csh
  #!/bin/csh -fe
  set r = "/hive/data/genomes/hg19/bed/multiz46way"
  set c = $1
  set infile = $r/maf/$2
  set outfile = $3
  cd /scratch/tmp
  # 'clean' maf
  perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
  awk -v C=$c '$2 == C {print}' $r/4d/refSeqReviewedNR.gp > $c.gp
  set PHASTBIN=/cluster/bin/phast.2008-12-18
  $PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss
  $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/$outfile
  rm -f $c.gp $c.maf $c.ss
  '_EOF_'
      # << happy emacs
      chmod +x 4d.csh
  
      ls -1S /hive/data/genomes/hg19/bed/multiz46way/maf/*.maf | \
          egrep -E -v "chrM|chrUn|random|_hap" | sed -e "s#.*multiz46way/maf/##" \
  	> maf.list
  
      cat << '_EOF_' > template
  #LOOP
  4d.csh $(root1) {check in line+ $(path1)} {check out line+ mfa/$(root1).mfa}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      gensub2 maf.list single template stdout | tac > jobList
      rm -fr /cluster/data/hg19/bed/multiz46way/4d/mfa
      mkdir /cluster/data/hg19/bed/multiz46way/4d/mfa
      para create jobList
      para try
      para check
      para push
  
      # combine mfa files
      cd ..
      sed -e "s/ /,/g" ../species.list > species.lst
      /cluster/bin/phast/msa_view --aggregate `cat species.lst` mfa/*.mfa | \
          sed s/"> "/">"/ > 4d.all.mfa
  
      sed -e 's/,macEug1.*//' species.lst > placentals.lst
      #  XXX this didn't work
      /cluster/bin/phast/msa_view --aggregate `cat placentals.lst` mfa/*.mfa | \
          sed s/"> "/">"/ > 4d.placentals.mfa
  
      # use phyloFit to create tree model (output is phyloFit.mod)
      set PHASTBIN=/cluster/bin/phast.2008-12-18
      time $PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA \
  	--subst-mod REV --tree ../tree-commas.nh 4d.all.mfa
      #	real    111m23.119s
      mv phyloFit.mod phyloFit.all.mod
      grep TREE phyloFit.all.mod | sed 's/TREE\:\ //' > tree_4d.46way.nh
  
      sed -e 's/.*,choHof1,//' species.lst > notPlacentals.list
  
      $PHASTBIN/tree_doctor \
          --prune=`cat notPlacentals.list` \
                  tree_4d.46way.nh > tree_4d.46way.placental.nh
  
  #############################################################################
  # phastCons 46-way (DONE - 2009-09-21,2009-11-10 - Hiram)
      #	was unable to split the full chrom MAF files, now working on the
      #	maf files as they were split up during multiz
  
      # split 46way mafs into 10M chunks and generate sufficient statistics
      # files for # phastCons
      ssh swarm
      mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split
      cd /hive/data/genomes/hg19/bed/multiz46way/mafSplit
      ./splitRegions.pl mafSplit.bed > \
  	/hive/data/genomes/hg19/bed/multiz46way/cons/msa.split/region.list
      mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/ss
      mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split/2009-10-19
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split/2009-10-19
      mkdir ss
  
      cat << '_EOF_' > doSplit.csh
  #!/bin/csh -ef
  set c = $1
  set MAF = /hive/data/genomes/hg19/bed/multiz46way/splitRun/maf/hg19_$c.maf
  set WINDOWS = /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split/2009-10-19/ss/$c
  set WC = `cat $MAF | wc -l`
  set NL = `grep "^#" $MAF | wc -l`
  if ( -s $2 ) then
      exit 0
  endif
  if ( -s $2.running ) then
      exit 0
  endif
  
  date >> $2.running
  
  rm -fr $WINDOWS
  mkdir $WINDOWS
  pushd $WINDOWS > /dev/null
  if ( $WC != $NL ) then
  /cluster/bin/phast.build/cornellCVS/phast.2009-10-19/bin/msa_split \
      $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
  endif
  popd > /dev/null
  date >> $2
  rm -f $2.running
  '_EOF_'
      # << happy emacs
      chmod +x doSplit.csh
  
      cat << '_EOF_' > template
  #LOOP
  doSplit.csh $(root1) {check out line+ $(root1).done}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      #	do the easy ones first to see some immediate results
      ls -1S -r ../../../splitRun/maf | sed -e "s/.maf//; s/hg19_//" > maf.list
  
      gensub2 maf.list single template jobList
      para -ram=32g create jobList
      para try ... check ... etc
  # Completed: 503 of 504 jobs
  # Crashed: 1 jobs
  # CPU time in finished jobs:      14171s     236.18m     3.94h    0.16d  0.000 y
  # IO & Wait Time:                188193s    3136.55m    52.28h    2.18d  0.006 y
  # Average job time:                 402s       6.71m     0.11h    0.00d
  # Longest finished job:            1597s      26.62m     0.44h    0.02d
  # Submission to last job:          2586s      43.10m     0.72h    0.03d
      #	the one crashed job is hg19_chr18_gl000207_random.00.maf
  
      #	XXX - this did not work
      #	this takes a really long time.  memk was down to 2 usable
      #	machines - got it finished manually on a combination of hgwdevnew CPUs
      #	and other machines
  
      # Estimate phastCons parameters
      #	experimented with this as a parasol job on hgwdevnew to try a number
      #	of SS files.  With a command of:
  
  /cluster/bin/phast/x86_64/phyloFit -i SS ${SS} \
  --tree "(((((((((((((((((hg19,panTro2),gorGor1),ponAbe2),rheMac2),calJac1),tarSyr1),(micMur1,otoGar1)),tupBel1),(((((mm9,rn4),dipOrd1),cavPor3),speTri1),(oryCun1,ochPri2))),(((vicPac1,(turTru1,bosTau4)),((equCab2,(felCat3,canFam2)),(myoLuc1,pteVam1))),(eriEur1,sorAra1))),(((loxAfr2,proCap1),echTel1),(dasNov2,choHof1))),monDom4),ornAna1),((galGal3,taeGut1),anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat2)),danRer5)),petMar1)" \
  --out-root=$OUT/starting_tree
  
      #	running over the input files ../ss/*/*.ss results to
  #.../genomes/hg19/bed/multiz46way/cons/startingTree/result/*/starting-tree.mod
  
      # add up the C and G:
      find ./result -type f | xargs ls -rt | while read F
  do
      D=`dirname $F`
      echo -n `basename $D`" - "
      grep BACKGROUND ${F} | awk '{printf "%0.3f\n", $3 + $4;}'
  done
      #	counting number of species seen in the maf file:
      find ./result -type f | xargs ls -rt | while read F
  do
      D=`dirname $F`
      echo -n `basename $D`" - "
      grep TREE $F | sed -e \
  "s/TREE: //; s/(//g; s/)//g; s/[0-9].[0-9][0-9][0-9][0-9][0-9][0-9]//g; s/://g"  | tr ',' '\n' | wc -l
  done
  
      # Run phastCons
      #	This job is I/O intensive in its output files, beware where this
      #	takes place or do not run too many at once.
      ssh swarm
      mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/run.cons
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/run.cons
  
      #	there are going to be several different phastCons runs using
      #	this same script.  They trigger off of the current working directory
      #	$cwd:t which is the "grp" in this script.  It is one of:
      #	all primates placentals
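    #	(illustrative: when run from .../cons/all, $cwd:t expands to "all",
    #	so the script below picks up all.mod / all.chrX.mod)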
  
      cat << '_EOF_' > doPhast.csh
  #!/bin/csh -fe
  set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
  set c = $1
  set cX = $1:r
  set f = $2
  set len = $3
  set cov = $4
  set rho = $5
  set grp = $cwd:t
  set cons = /hive/data/genomes/hg19/bed/multiz46way/cons
  set tmp = $cons/tmp/$f
  mkdir -p $tmp
  set ssSrc = $cons
  set useGrp = "$grp.mod"
  if ( $cX == "chrX" ) then
      set useGrp = "$grp.chrX.mod"
  endif
  if (-s $cons/$grp/$grp.non-inf) then
    ln -s $cons/$grp/$grp.mod $tmp
    ln -s $cons/$grp/$grp.chrX.mod $tmp
    ln -s $cons/$grp/$grp.non-inf $tmp
    ln -s $ssSrc/msa.split/2009-10-21/ss/$c/$f.ss $tmp
  else
    ln -s $ssSrc/msa.split/2009-10-21/ss/$c/$f.ss $tmp
    ln -s $cons/$grp/$grp.mod $tmp
    ln -s $cons/$grp/$grp.chrX.mod $tmp
  endif
  pushd $tmp > /dev/null
  if (-s $grp.non-inf) then
    $PHASTBIN/phastCons $f.ss $useGrp \
      --rho $rho --expected-length $len --target-coverage $cov --quiet \
      --not-informative `cat $grp.non-inf` \
      --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
  else
    $PHASTBIN/phastCons $f.ss $useGrp \
      --rho $rho --expected-length $len --target-coverage $cov --quiet \
      --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
  endif
  popd > /dev/null
  mkdir -p pp/$c bed/$c
  sleep 4
  touch pp/$c bed/$c
  rm -f pp/$c/$f.pp
  rm -f bed/$c/$f.bed
  mv $tmp/$f.pp pp/$c
  mv $tmp/$f.bed bed/$c
  rm -fr $tmp
  '_EOF_'
      # << happy emacs
      chmod a+x doPhast.csh
  
      #	this template will serve for all runs
      #	root1 == chrom name, file1 == ss file name without .ss suffix
      cat << '_EOF_' > template
  #LOOP
  ../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      ls -1S ../msa.split/2009-10-21/ss/chr*/chr* | sed -e "s/.ss$//" > ss.list
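    #	each jobList line generated from this list by gensub2 (below) will
    #	look roughly like this, with illustrative coordinates:
    #	../run.cons/doPhast.csh chr1.00 chr1.00.123-456 45 0.3 0.3 {check out line+ pp/chr1.00/chr1.00.123-456.pp}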
  
      # Create parasol batch and run it
      # run for all species
      cd /hive/data/genomes/hg19/bed/multiz46way/cons
      mkdir -p all
      cd all
    #	Using the two different .mod tree files
      cp -p ../../4dNoX/phyloFit.NoChrX.mod ./all.mod
      cp -p ../../4dX/phyloFit.chrX.mod ./all.chrX.mod
  
      gensub2 ../run.cons/ss.list single ../run.cons/template jobList
      para -ram=8g create jobList
      para try ... check ... push ... etc.
  # Completed: 581 of 581 jobs
  # CPU time in finished jobs:      41877s     697.95m    11.63h    0.48d  0.001 y
  # IO & Wait Time:                 39172s     652.87m    10.88h    0.45d  0.001 y
  # Average job time:                 139s       2.32m     0.04h    0.00d
  # Longest finished job:             329s       5.48m     0.09h    0.00d
  # Submission to last job:          2240s      37.33m     0.62h    0.03d
  
      # create Most Conserved track
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
      cut -f1 ../../../../chrom.sizes | while read C
  do
      ls -d bed/${C}.[0-9][0-9] 2> /dev/null | while read D
      do
          cat ${D}/${C}*.bed
      done | sort -k1,1 -k2,2n \
      | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
  done > tmpMostConserved.bed
  /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed
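    #	(lodToBedScore rescales the lod=N values into the standard
    #	0-1000 BED score column)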
  
      # load into database
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
      time nice -n +19 hgLoadBed hg19 phastConsElements46way mostConserved.bed
      #	Loaded 5163775 elements of size 6
      #	real     1m44.439s
  
      # Try for 5% overall cov, and 70% CDS cov
      featureBits hg19 -enrichment refGene:cds phastConsElements46way
      #	--rho 0.3 --expected-length 45 --target-coverage 0.3
      #	refGene:cds 1.187%, phastConsElements46way 5.065%,
      #	both 0.884%, cover 74.46%, enrich 14.70x
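    #	reading those numbers: cover = both/refGene:cds
    #	(0.884/1.187 ~ 74.5% of coding bases covered); enrich =
    #	cover/track coverage (74.46/5.065 ~ 14.7x)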
  
    # Create merged posterior probability file and wiggle track data files
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
      mkdir downloads
      cat << '_EOF_' > phastCat.sh
  #!/bin/sh
  
  mkdir -p downloads
  cut -f1 ../../../../chrom.sizes | while read C
  do
      echo -n "${C} ... working ... "
      ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
      do
          cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
      done | gzip > downloads/${C}.phastCons46way.wigFix.gz
      echo "done"
  done
  '_EOF_'
      #	<< happy emacs
      chmod +x phastCat.sh
      time nice -n +19 ./phastCat.sh
      #	real    30m2.623s
  
      #	encode those files into wiggle data
      zcat downloads/*.wigFix.gz \
  	| wigEncode stdin phastCons46way.wig phastCons46way.wib
      #	Converted stdin, upper limit 1.00, lower limit 0.00
      #	real    18m37.881s
      du -hsc *.wi?
      #	2.7G    phastCons46way.wib
      #	271M    phastCons46way.wig
      #	3.0G    total
  
      #	encode into a bigWig file:
      #	(warning wigToBigWig process grows to about 36 Gb)
      #	in bash, to avoid the 32 Gb memory limit:
  sizeG=188743680
  export sizeG
  ulimit -d $sizeG
  ulimit -v $sizeG
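    #	(ulimit -d and -v take 1 Kb blocks: 188743680 Kb = 180 Gb,
    #	comfortably above the ~36 Gb this process needs)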
      zcat downloads/*.wigFix.gz \
          | wigToBigWig stdin ../../../../chrom.sizes phastCons46way.bw
      #	real    52m36.142s
  # -rw-rw-r--   1 21667535139 Oct 20 13:59 phastCons46way.bw
      mkdir /gbdb/hg19/bbi
      ln -s `pwd`/phastCons46way.bw /gbdb/hg19/bbi
    #	if you wanted to use the bigWig file, load the bigWig table:
      hgsql hg19 -e 'drop table if exists phastCons46way; \
              create table phastCons46way (fileName varchar(255) not null); \
              insert into phastCons46way values
  	("/gbdb/hg19/bbi/phastCons46way.bw");'
  
      # Load gbdb and database with wiggle.
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
      ln -s `pwd`/phastCons46way.wib /gbdb/hg19/multiz46way/phastCons46way.wib
      time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
  	phastCons46way phastCons46way.wig
      #	real    1m45.381s
  
      wigTableStats.sh hg19 phastCons46way
  # db.table      min max mean count sumData
  # hg19.phastCons46way     0 1 0.103653 2845303719 2.94924e+08
  #	stdDev viewLimits
  #	0.230184 viewLimits=0:1
  
      #  Create histogram to get an overview of all the data
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
      time nice -n +19 hgWiggle -doHistogram -db=hg19 \
  	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
  	    phastCons46way > histogram.data 2>&1
      #	real    7m37.212s
  
      #	create plot of histogram:
  
      cat << '_EOF_' | gnuplot > histo.png
  set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
  set size 1.4, 0.8
  set key left box
  set grid noxtics
  set grid ytics
  set title " Human Hg19 Histogram phastCons46way track"
  set xlabel " phastCons46way score"
  set ylabel " Relative Frequency"
  set y2label " Cumulative Relative Frequency (CRF)"
  set y2range [0:1]
  set y2tics
  set yrange [0:0.02]
  
  plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
          "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
  '_EOF_'
      #	<< happy emacs
  
      display histo.png &
  
      ########################################################################
      ### Create a phastCons data set for Primates
  
      # setup primates-only run
      ssh swarm
      mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/primates
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
    # primates-only: exclude all but the primates from the phastCons tree:
  
      cp -p ../../4dNoX/phyloFit.NoChrX.primates.mod primates.mod
      cp -p ../../4dX/phyloFit.chrX.primates.mod primates.chrX.mod
      #	and place the removed ones in the non-inf file so phastCons will
      #	truly ignore them:
      echo "tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun2,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr3,proCap1,echTel1,dasNov2,choHof1,macEug1,monDom5,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig2,fr2,gasAcu1,oryLat2,danRer6,petMar1" \
  	> primates.non-inf
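    #	sanity check: the non-inf list names 36 species, leaving
    #	46 - 36 = 10 informative primates (hg19 plus nine others):
    tr ',' '\n' < primates.non-inf | wc -l
    #	36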
  
      gensub2 ../run.cons/ss.list single ../run.cons/template jobList
      para -ram=8g create jobList
      para try ... check ... push ... etc.
  # Completed: 581 of 581 jobs
  # CPU time in finished jobs:      17077s     284.62m     4.74h    0.20d  0.001 y
  # IO & Wait Time:                 73693s    1228.21m    20.47h    0.85d  0.002 y
  # Average job time:                 156s       2.60m     0.04h    0.00d
  # Longest finished job:             402s       6.70m     0.11h    0.00d
  # Submission to last job:          2322s      38.70m     0.65h    0.03d
  
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
      # create Most Conserved track
      cut -f1 ../../../../chrom.sizes | while read C
  do
      ls -d bed/${C}.[0-9][0-9] 2> /dev/null | while read D
      do
          cat ${D}/${C}*.bed
      done | sort -k1,1 -k2,2n \
      | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
  done > tmpMostConserved.bed
  /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed
  
      featureBits hg19 mostConserved.bed
      #	146285948 bases of 2897316137 (5.049%) in intersection
  
      # load into database
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
      time nice -n +19 hgLoadBed hg19 phastConsElements46wayPrimates \
  	mostConserved.bed
      #	Loaded 725627 elements of size 6
      #	real    0m8.583s
      # verify coverage
      featureBits hg19 phastConsElements46wayPrimates
      #	116785954 bases of 2897316137 (4.031%) in intersection
  
      #	--rho 0.3 --expected-length 45 --target-coverage 0.3
      featureBits hg19 -enrichment refGene:cds phastConsElements46wayPrimates
      #	refGene:cds 1.186%, phastConsElements46wayPrimates 4.031%,
      #	both 0.730%, cover 61.54%, enrich 15.27x
  
      featureBits hg19 -enrichment ensGene:cds phastConsElements46wayPrimates
      #	ensGene:cds 1.252%, phastConsElements46wayPrimates 4.031%,
      #	both 0.743%, cover 59.31%, enrich 14.71x
  
      #	Create the downloads .pp files, from which the phastCons wiggle data
      #	is calculated
      # sort by chromName, chromStart so that items are in numerical order
      #  for wigEncode
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
      mkdir downloads
      cat << '_EOF_' > phastCat.sh
  #!/bin/sh
  
  mkdir -p downloads
  cut -f1 ../../../../chrom.sizes | while read C
  do
      echo -n "${C} ... working ... "
      ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
      do
  	cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
      done | gzip > downloads/${C}.phastCons46way.primates.wigFix.gz
      echo "done"
  done
  '_EOF_'
      # << happy emacs
      chmod +x ./phastCat.sh
      time nice -n +19 ./phastCat.sh
      #	real    39m47.189s
  
    # Create merged posterior probability file and wiggle track data files
      zcat downloads/chr*.wigFix.gz \
  	 | wigEncode stdin phastCons46wayPrimates.wig phastCons46wayPrimates.wib
      # Converted stdin, upper limit 1.00, lower limit 0.00
      #	real    17m20.601s
  
      #	encode to bigWig
      #	(warning wigToBigWig process grows to about 36 Gb)
      #	in bash, to avoid the 32 Gb memory limit:
  sizeG=188743680
  export sizeG
  ulimit -d $sizeG
  ulimit -v $sizeG
  
      zcat downloads/*.wigFix.gz \
          | wigToBigWig stdin ../../../../chrom.sizes phastCons46wayPrimates.bw
  
      ln -s `pwd`/phastCons46wayPrimates.bw /gbdb/hg19/bbi
    #	if desired to use the bigWig file, load the bigWig table:
      hgsql hg19 -e 'drop table if exists phastCons46wayPrimates; \
              create table phastCons46wayPrimates \
  		(fileName varchar(255) not null); \
              insert into phastCons46wayPrimates values
  	("/gbdb/hg19/bbi/phastCons46wayPrimates.bw");'
  
      ## load table with wiggle data
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
      ln -s `pwd`/phastCons46wayPrimates.wib \
  	/gbdb/hg19/multiz46way/phastCons46wayPrimates.wib
      time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
  	phastCons46wayPrimates phastCons46wayPrimates.wig
  
      wigTableStats.sh hg19 phastCons46wayPrimates
  # db.table      min max mean count sumData
# hg19.phastCons46wayPrimates     0 1 0.128883 2845303719 3.66712e+08
  #	stdDev viewLimits
  #	0.214067 viewLimits=0:1
  
      #  Create histogram to get an overview of all the data
      time nice -n +19 hgWiggle -doHistogram \
  	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
  	    -db=hg19 phastCons46wayPrimates  > histogram.data 2>&1
      #	real    5m30.086s
  
      #	create plot of histogram:
  
      cat << '_EOF_' | gnuplot > histo.png
  set terminal png small color \
          x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
  set size 1.4, 0.8
  set key left box
  set grid noxtics
  set grid ytics
set title " Human Hg19 Histogram phastCons46wayPrimates track"
  set xlabel " phastCons46wayPrimates score"
  set ylabel " Relative Frequency"
  set y2label " Cumulative Relative Frequency (CRF)"
  set y2range [0:1]
  set y2tics
  set yrange [0:0.02]
  
  plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
          "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
  '_EOF_'
      #	<< happy emacs
  
      display histo.png &
  
      ########################################################################
      ### Create a phastCons data set for Placentals
      # setup placental-only run
      ssh swarm
      mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/placental
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
  
      cp -p ../../4dNoX/phyloFit.NoChrX.placentals.mod placental.mod
      cp -p ../../4dX/phyloFit.chrX.placentals.mod placental.chrX.mod
    # placental-only: exclude all but the placentals from the phastCons tree:
      #	and place the removed ones in the non-inf file so phastCons will
      #	truly ignore them:
      echo "macEug1,monDom5,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig2,fr2,gasAcu1,oryLat2,danRer6,petMar1" \
          > placental.non-inf
  
      gensub2 ../run.cons/ss.list single ../run.cons/template jobList
      para -ram=8g create jobList
      para try ... check ... push ... etc.
  # Completed: 581 of 581 jobs
  # CPU time in finished jobs:      33942s     565.69m     9.43h    0.39d  0.001 y
  # IO & Wait Time:                 75536s    1258.94m    20.98h    0.87d  0.002 y
  # Average job time:                 188s       3.14m     0.05h    0.00d
  # Longest finished job:             417s       6.95m     0.12h    0.00d
  # Submission to last job:          1878s      31.30m     0.52h    0.02d
  
      # create Most Conserved track
      cut -f1 ../../../../chrom.sizes | while read C
  do
      ls -d bed/${C}.[0-9][0-9] 2> /dev/null | while read D
      do
          cat ${D}/${C}*.bed
      done | sort -k1,1 -k2,2n \
      | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
  done > tmpMostConserved.bed
  /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed
  
      # load into database
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
      time nice -n +19 hgLoadBed hg19 phastConsElements46wayPlacental \
  	mostConserved.bed
      #	Loaded 3743478 elements of size 6
      #	real    1m15.952s
      # verify coverage
      featureBits hg19 phastConsElements46wayPlacental
      #	118211444 bases of 2897316137 (4.080%) in intersection
  
      #	--rho 0.3 --expected-length 45 --target-coverage 0.3
      featureBits hg19 -enrichment refGene:cds phastConsElements46wayPlacental
      #	refGene:cds 1.187%, phastConsElements46wayPlacental 4.080%,
      #	both 0.861%, cover 72.59%, enrich 17.79x
      featureBits hg19 -enrichment ensGene:cds phastConsElements46wayPlacental
      #	ensGene:cds 1.252%, phastConsElements46wayPlacental 4.080%,
      #	both 0.879%, cover 70.22%, enrich 17.21x
  
      #	Create the downloads .pp files, from which the phastCons wiggle data
      #	is calculated
      # sort by chromName, chromStart so that items are in numerical order
      #  for wigEncode
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
      mkdir downloads
      cat << '_EOF_' > phastCat.sh
  #!/bin/sh
  
  mkdir -p downloads
  cut -f1 ../../../../chrom.sizes | while read C
  do
      echo -n "${C} ... working ... "
      ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
      do
  	cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
      done | gzip > downloads/${C}.phastCons46way.placental.wigFix.gz
      echo "done"
  done
  '_EOF_'
      # << happy emacs
      chmod +x ./phastCat.sh
      time nice -n +19 ./phastCat.sh
  
    # Create merged posterior probability file and wiggle track data files
      zcat downloads/chr*.wigFix.gz \
  	| wigEncode stdin phastCons46wayPlacental.wig \
  		phastCons46wayPlacental.wib
      #	Converted stdin, upper limit 1.00, lower limit 0.00
      #	real    14m53.395s
  
      #	encode to bigWig
      #	(warning wigToBigWig process grows to about 36 Gb)
      #	in bash, to avoid the 32 Gb memory limit:
  sizeG=188743680
  export sizeG
  ulimit -d $sizeG
  ulimit -v $sizeG
  
      zcat downloads/*.wigFix.gz \
          | wigToBigWig stdin ../../../../chrom.sizes phastCons46wayPlacental.bw
      #	real    40m55.568s
  
      ln -s `pwd`/phastCons46wayPlacental.bw /gbdb/hg19/bbi
      #	loading bigWig table:
      hgsql hg19 -e 'drop table if exists phastCons46wayPlacental; \
              create table phastCons46wayPlacental \
  		(fileName varchar(255) not null); \
              insert into phastCons46wayPlacental values
  	("/gbdb/hg19/bbi/phastCons46wayPlacental.bw");'
  
      ## load table with wiggle data
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
      ln -s `pwd`/phastCons46wayPlacental.wib \
  	/gbdb/hg19/multiz46way/phastCons46wayPlacental.wib
      time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
  	phastCons46wayPlacental phastCons46wayPlacental.wig
  
      wigTableStats.sh hg19 phastCons46wayPlacental
  # db.table      min max mean count sumData
# hg19.phastCons46wayPlacental    0 1 0.0885757 2845303719 2.52025e+08
  #	stdDev viewLimits
  #	0.210242 viewLimits=0:1
  
      #  Create histogram to get an overview of all the data
      time nice -n +19 hgWiggle -doHistogram \
  	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
  	    -db=hg19 phastCons46wayPlacental > histogram.data 2>&1
      #	real    8m15.623s
  
      #	create plot of histogram:
  
      cat << '_EOF_' | gnuplot > histo.png
  set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
  set size 1.4, 0.8
  set key left box
  set grid noxtics
  set grid ytics
  set title " Human Hg19 Histogram phastCons46wayPlacental track"
  set xlabel " phastCons46wayPlacental score"
  set ylabel " Relative Frequency"
  set y2label " Cumulative Relative Frequency (CRF)"
  set y2range [0:1]
  set y2tics
  set yrange [0:0.02]
  
  plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
          "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
  '_EOF_'
      #	<< happy emacs
  
      display histo.png &
  
  #########################################################################
  # phyloP conservation for 46-way (DONE - 2009-10-21,2009-11-10 - Hiram)
  #
  # Vertebrate, Placental, Primates
  #
    # split SS files into 1M chunks; this business needs smaller files
      #   to complete
  
      ssh swarm
      mkdir /hive/data/genomes/hg19/bed/multiz46way/consPhyloP
      cd /hive/data/genomes/hg19/bed/multiz46way/consPhyloP
      mkdir ss run.split
      cd run.split
  
      cat << '_EOF_' > doSplit.csh
  #!/bin/csh -ef
  set c = $1
  set MAF = /hive/data/genomes/hg19/bed/multiz46way/splitRun/maf/hg19_$c.maf
  set WINDOWS = /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/run.split/ss/$c
  set WC = `cat $MAF | wc -l`
  set NL = `grep "^#" $MAF | wc -l`
  if ( -s $2 ) then
      exit 0
  endif
  if ( -s $2.running ) then
      exit 0
  endif
  
  date >> $2.running
  
  rm -fr $WINDOWS
  mkdir $WINDOWS
  pushd $WINDOWS > /dev/null
  if ( $WC != $NL ) then
  /cluster/bin/phast.build/cornellCVS/phast.2009-10-19/bin/msa_split \
      $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000
  endif
  popd > /dev/null
  date >> $2
  rm -f $2.running
  '_EOF_'
  # << happy emacs
  
      ls -1S -r ../../splitRun/maf | sed -e "s/.maf//; s/hg19_//" > maf.list
  
      cat << '_EOF_' > template
  #LOOP
  doSplit.csh $(path1) {check out exists+ done/$(path1).done}
  #ENDLOOP
  '_EOF_'
  # << happy emacs
  
      mkdir ss done
      gensub2 maf.list single template jobList
      para -ram=8g create jobList
  # Completed: 504 of 504 jobs
  # CPU time in finished jobs:      14486s     241.43m     4.02h    0.17d  0.000 y
  # IO & Wait Time:                306280s    5104.67m    85.08h    3.54d  0.010 y
  # Average job time:                 636s      10.61m     0.18h    0.01d
  # Longest finished job:            1635s      27.25m     0.45h    0.02d
  # Submission to last job:          2965s      49.42m     0.82h    0.03d
  
  
      # run phyloP with score=LRT
      ssh swarm
    cd /hive/data/genomes/hg19/bed/multiz46way/consPhyloP
      mkdir run.phyloP
      cd run.phyloP
  
      # Adjust model file base composition background and rate matrix to be
      # representative of the chromosomes in play
      grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
      #	0.542
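    #	(the BACKGROUND line in a .mod file holds the A C G T equilibrium
    #	frequencies, so $3 + $4 is the C+G fraction; e.g. an illustrative
    #	line "BACKGROUND: 0.229 0.271 0.271 0.229" gives 0.542)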
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
  	../../cons/all/all.mod 0.542 > all.mod
      grep BACKGROUND ../../cons/all/all.chrX.mod \
  	| awk '{printf "%0.3f\n", $3 + $4}'
      #	0.503
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
  	../../cons/all/all.chrX.mod 0.503 > all.chrX.mod
      grep BACKGROUND ../../cons/primates/primates.mod \
  	| awk '{printf "%0.3f\n", $3 + $4}'
      #	0.523
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
  	../../cons/primates/primates.mod 0.523 > primates.mod
      grep BACKGROUND ../../cons/primates/primates.chrX.mod \
  	| awk '{printf "%0.3f\n", $3 + $4}'
      #	0.491
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
  	../../cons/primates/primates.chrX.mod 0.491 > primates.chrX.mod
      grep BACKGROUND ../../cons/placental/placental.mod \
  	| awk '{printf "%0.3f\n", $3 + $4}'
      #	0.542
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
  	../../cons/placental/placental.mod 0.542 > placental.mod
      grep BACKGROUND ../../cons/placental/placental.chrX.mod \
  	| awk '{printf "%0.3f\n", $3 + $4}'
      #	0.489
      /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
  	../../cons/placental/placental.chrX.mod 0.489 > placental.chrX.mod
  
  
    # repeat for the chrX-only tree ($gc is the C+G fraction computed
    #	as above):
    cd /hive/data/genomes/hg19/bed/multiz46way/4d
    $PHASTBIN/modFreqs 4d.chrX.mod $gc > 46way.chrX.mod
    ln -s `pwd`/46way.chrX.mod /usr/local/apache/goldenPath/hg19/phastCons46way
  
      cat << '_EOF_' > doPhyloP.csh
  #!/bin/csh -fe
  set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
  set f = $1
  set out = $2
  set cName = $f:r:r
  set chrDir = $f:r
  set n = $f:r:e
  set grp = $cwd:t
  set cons = /hive/data/genomes/hg19/bed/multiz46way/consPhyloP
  set tmp = $cons/tmp/$grp/$f
  rm -fr $tmp
  mkdir -p $tmp
  set ssSrc = "$cons/run.split/ss/$chrDir/$f"
  set useGrp = "$grp.mod"
  if ( $cName == "chrX" ) then
      set useGrp = "$grp.chrX.mod"
  endif
  ln -s $cons/run.phyloP/$grp.mod $tmp
  ln -s $cons/run.phyloP/$grp.chrX.mod $tmp
  pushd $tmp > /dev/null
  $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \
      -i SS $useGrp $ssSrc.ss > $f.wigFix
  popd > /dev/null
  mkdir -p $out:h
  sleep 4
  mv $tmp/$f.wigFix $out
  rm -fr $tmp
  '_EOF_'
      # << happy emacs
  
      # Create list of chunks
      find ../run.split/ss -type f | sed -e "s/.ss$//; s#^../run.split/ss/##" \
  	> ss.list
  
      # Create template file
      #	file1 == $chr/$chunk/file name without .ss suffix
      cat << '_EOF_' > template
  #LOOP
  ../run.phyloP/doPhyloP.csh $(file1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      ######################   Running all species  #######################
      # setup run for all species
      mkdir /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/all
      cd /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/all
      rm -fr wigFix
      mkdir wigFix
  
      gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
      para create jobList
      para try ... check ... push ... etc ...
  
      para time
      # second time around 2014-02-10:
  # Completed: 3010 of 3010 jobs
  # CPU time in finished jobs:    5635021s   93917.02m  1565.28h   65.22d  0.179 y
  # IO & Wait Time:                162236s    2703.93m    45.07h    1.88d  0.005 y
  # Average job time:                1926s      32.10m     0.53h    0.02d
  # Longest finished job:            2890s      48.17m     0.80h    0.03d
  # Submission to last job:         40469s     674.48m    11.24h    0.47d
  
      # first time:
  # Completed: 3010 of 3010 jobs
  # CPU time in finished jobs:    5672403s   94540.06m  1575.67h   65.65d  0.180 y
  # IO & Wait Time:                 51879s     864.64m    14.41h    0.60d  0.002 y
  # Average job time:                1902s      31.70m     0.53h    0.02d
  # Longest finished job:            2889s      48.15m     0.80h    0.03d
  # Submission to last job:         58824s     980.40m    16.34h    0.68d
  
      ssh hgwdev
    cd /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/all
      find ./wigFix -type f \
  	| sed -e "s#^./##; s/\./ /g; s/-/ - /g" \
  	| sort -k1,1 -k3,3n -k4,4n | sed -e "s/ - /-/g; s/ /./g" > wigFile.list
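    #	(the sed/sort/sed round trip above splits the file names on "." and
    #	"-" so the chunks sort by chromosome, chunk number and start
    #	position, then reassembles the names in coordinate order)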
      cat wigFile.list | xargs cat \
  	| wigEncode stdin phyloP46way.wig phyloP46way.wib > wigEncode.log 2>&1 &
      #	Converted stdin, upper limit 6.39, lower limit -13.27
      cat wigFile.list | xargs cat \
  	| wigToBigWig stdin ../../../../chrom.sizes phyloP46way.bw
    #	if you wanted to use the bigWig file, load the bigWig table:
      ln -s `pwd`/phyloP46way.bw /gbdb/hg19/bbi
      hgsql hg19 -e 'drop table if exists phyloP46wayAll; \
              create table phyloP46wayAll \
  		(fileName varchar(255) not null); \
              insert into phyloP46wayAll values
  	("/gbdb/hg19/bbi/phyloP46way.bw");'
  
      #	loading the wiggle table:
      ln -s `pwd`/phyloP46way.wib /gbdb/hg19/multiz46way
      time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
  	phyloP46wayAll phyloP46way.wig
  
      #	create download files:
      cat << '_EOF_' > mkDown.csh
  #!/bin/csh -fe
  foreach F (`cat wigFile.list`)
      set C = $F:h:t:r
      cat $F >> downloads/${C}.wigFix
  end
  '_EOF_'
      # << happy emacs
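    #	($F:h:t:r takes the chunk's directory name, e.g. wigFix/chr1.00,
    #	keeps its last component and strips the .00 chunk suffix,
    #	recovering the chromosome name chr1)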
      chmod +x ./mkDown.csh
      mkdir downloads
      time ./mkDown.csh
      #	real    16m19.683s
  
      time gzip downloads/chr*.wigFix
      #	real    47m11.017s
  
      wigTableStats.sh hg19 phyloP46wayAll
  # db.table      min max mean count sumData
  # hg19.phyloP46wayAll     -14.08 6.424 0.0896064 2845303719 2.54957e+08
  #	stdDev viewLimits
  #	0.833186 viewLimits=-4.07632:4.25553
      #	that range is: 14.08+6.424 = 20.504
  
      #  Create histogram to get an overview of all the data
      time nice -n +19 hgWiggle -doHistogram \
  	-hBinSize=0.020504 -hBinCount=1000 -hMinVal=-14.08 -verbose=2 \
  	    -db=hg19 phyloP46wayAll > histogram.data 2>&1
      #	real    8m15.623s
  
      #	create plot of histogram:
  
      cat << '_EOF_' | gnuplot > histo.png
  set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
  set size 1.4, 0.8
  set key left box
  set grid noxtics
  set grid ytics
  set title " Human Hg19 Histogram phyloP46way track, all 46 vertebrates"
  set xlabel " phyloP46way score, all 46 vertebrates"
  set ylabel " Relative Frequency"
  set y2label " Cumulative Relative Frequency (CRF)"
  set y2range [0:1]
  set y2tics
  set yrange [0:0.04]
  set xrange [-2:2]
  
  plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
          "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
  '_EOF_'
      #	<< happy emacs
  
      display histo.png &
  
      ######################   Running the primates  #######################
      mkdir /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/primates
      cd /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/primates
      rm -fr wigFix
      mkdir wigFix
  
      gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
      para create jobList
      para try ... check ... push ... etc ...
  
      para time
  # Completed: 3186 of 3186 jobs
  # CPU time in finished jobs:     447177s    7452.95m   124.22h    5.18d  0.014 y
  # IO & Wait Time:                 36673s     611.22m    10.19h    0.42d  0.001 y
  # Average job time:                 152s       2.53m     0.04h    0.00d
  # Longest finished job:             279s       4.65m     0.08h    0.00d
  # Submission to last job:          4849s      80.82m     1.35h    0.06d
  
    cd /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/primates
      find ./wigFix -type f \
  	| sed -e "s#^./##; s/\./ /g; s/-/ - /g" \
  	| sort -k1,1 -k3,3n -k4,4n | sed -e "s/ - /-/g; s/ /./g" > wigFile.list
      cat wigFile.list | xargs cat \
  	| wigEncode stdin phyloP46wayPrimates.wig phyloP46wayPrimates.wib \
  	> wigEncode.log 2>&1 &
      #	Converted stdin, upper limit 0.65, lower limit -9.12
      cat wigFile.list | xargs cat \
  	| wigToBigWig stdin ../../../../chrom.sizes phyloP46wayPrimates.bw
    #	if you wanted to use the bigWig file, load the bigWig table:
      ln -s `pwd`/phyloP46wayPrimates.bw /gbdb/hg19/bbi
      hgsql hg19 -e 'drop table if exists phyloP46wayPrimates; \
              create table phyloP46wayPrimates \
  		(fileName varchar(255) not null); \
              insert into phyloP46wayPrimates values
  	("/gbdb/hg19/bbi/phyloP46wayPrimates.bw");'
  
      #	loading the wiggle table:
      ln -s `pwd`/phyloP46wayPrimates.wib /gbdb/hg19/multiz46way
      time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
  	phyloP46wayPrimates phyloP46wayPrimates.wig
  
      #	create download files:
      mkdir downloads
      time ../all/mkDown.csh
      #	real    18m44.186s
      time gzip downloads/chr*.wigFix
      #	real    32m11.461s
  
      wigTableStats.sh hg19 phyloP46wayPrimates
  # db.table      min max mean count
  # hg19.phyloP46wayPrimates        -9.065 0.655 0.0448196 2845303719
  #	sumData stdDev viewLimits
  #	1.27525e+08 0.600051 viewLimits=-2.95544:0.655
      #	that range is: 9.065+0.655 = 9.720
  
      #  Create histogram to get an overview of all the data
      time nice -n +19 hgWiggle -doHistogram \
  	-hBinSize=0.00972 -hBinCount=1000 -hMinVal=-9.065 -verbose=2 \
  	    -db=hg19 phyloP46wayPrimates > histogram.data 2>&1
      #	real    8m15.623s
  
      #	create plot of histogram:
  
      cat << '_EOF_' | gnuplot > histo.png
  set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
  set size 1.4, 0.8
  set key left box
  set grid noxtics
  set grid ytics
  set title " Human Hg19 Histogram phyloP46wayPrimates track"
  set xlabel " phyloP46wayPrimates score"
  set ylabel " Relative Frequency"
  set y2label " Cumulative Relative Frequency (CRF)"
  set y2range [0:1]
  set y2tics
  set yrange [0:0.03]
  set xrange [-2:0.655]
  
  plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
          "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
  '_EOF_'
      #	<< happy emacs
  
      display histo.png &
  
      ######################   Running the placentals  #######################
      mkdir /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/placentals
      cd /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/placentals
      rm -fr wigFix
      mkdir wigFix
  
      gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
      para create jobList
      para try ... check ... push ... etc ...
      para time
  # Completed: 3186 of 3186 jobs
  # CPU time in finished jobs:    1582989s   26383.14m   439.72h   18.32d  0.050 y
  # IO & Wait Time:                 25577s     426.29m     7.10h    0.30d  0.001 y
  # Average job time:                 505s       8.41m     0.14h    0.01d
  # Longest finished job:             768s      12.80m     0.21h    0.01d
  # Submission to last job:         12967s     216.12m     3.60h    0.15d
  
    cd /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/placentals
      find ./wigFix -type f \
  	| sed -e "s#^./##; s/\./ /g; s/-/ - /g" \
  	| sort -k1,1 -k3,3n -k4,4n | sed -e "s/ - /-/g; s/ /./g" > wigFile.list
      cat wigFile.list | xargs cat \
  	| wigEncode stdin phyloP46wayPlacental.wig phyloP46wayPlacental.wib \
  	> wigEncode.log 2>&1 &
      #	Converted stdin, upper limit 2.95, lower limit -13.28
      cat wigFile.list | xargs cat \
  	| wigToBigWig stdin ../../../../chrom.sizes phyloP46wayPlacental.bw
  
      #	loading bigWig table:
      ln -s `pwd`/phyloP46wayPlacental.bw /gbdb/hg19/bbi
      hgsql hg19 -e 'drop table if exists phyloP46wayPlacental; \
              create table phyloP46wayPlacental \
  		(fileName varchar(255) not null); \
              insert into phyloP46wayPlacental values
  	("/gbdb/hg19/bbi/phyloP46wayPlacental.bw");'
  
      #	loading the wiggle table:
      ln -s `pwd`/phyloP46wayPlacental.wib /gbdb/hg19/multiz46way
      time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
  	phyloP46wayPlacental phyloP46wayPlacental.wig
  
      #	create download files:
      mkdir downloads
      time ../all/mkDown.csh
      #	real    18m52.778s
      time gzip downloads/chr*.wigFix
      #	real    46m55.550s
  
    wigTableStats.sh hg19 phyloP46wayPlacental
# db.table      min max mean count sumData
# hg19.phyloP46wayPlacental -13.796 2.941 0.0359345 2845303719 1.02245e+08
  #	stdDev viewLimits
  #	0.779426 viewLimits=-3.86119:2.941
      #	that range is: 13.796+2.941 = 16.737
  
      #  Create histogram to get an overview of all the data
      time nice -n +19 hgWiggle -doHistogram \
  	-hBinSize=0.016737 -hBinCount=1000 -hMinVal=-13.796 -verbose=2 \
  	    -db=hg19 phyloP46wayPlacental > histogram.data 2>&1
      #	real    8m15.623s
  
      #	create plot of histogram:
  
      cat << '_EOF_' | gnuplot > histo.png
  set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
  set size 1.4, 0.8
  set key left box
  set grid noxtics
  set grid ytics
  set title " Human Hg19 Histogram phyloP46wayPlacental track"
  set xlabel " phyloP46wayPlacental score"
  set ylabel " Relative Frequency"
  set y2label " Cumulative Relative Frequency (CRF)"
  set y2range [0:1]
  set y2tics
  set yrange [0:0.03]
  set xrange [-2.5:2.5]
  
  plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
          "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
  '_EOF_'
      #	<< happy emacs
  
      display histo.png &
  
  #########################################################################
  # LASTZ Zebrafish DanRer6 (DONE - 2009-07-08,10 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
      cd /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
  
      cat << '_EOF_' > DEF
# human vs. zebrafish
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Zebrafish danRer6
  SEQ2_DIR=/scratch/data/danRer6/danRer6.2bit
  SEQ2_LEN=/scratch/data/danRer6/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=40
  
  BASE=/hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    1678m17.827s
      #	failed during the chain step due to encodek cluster problems
      #	finish that manually, then:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-continue=chainMerge > chainMerge.log 2>&1 &
      #	real    167m6.930s
      cat fb.hg19.chainDanRer6Link.txt
      #	88391631 bases of 2897316137 (3.051%) in intersection
  
      #	running the swap - DONE - 2009-06-02
      mkdir /hive/data/genomes/danRer6/bed/blastz.hg19.swap
      cd /hive/data/genomes/danRer6/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-swap > swap.log 2>&1 &
      #	real    183m21.102s
      cat fb.danRer6.chainHg19Link.txt
      #	96424507 bases of 1506896106 (6.399%) in intersection
  
  ##############################################################################
  # LASTZ Elephant LoxAfr3 (DONE - 2009-07-21,23 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
      cd /hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
  
      cat << '_EOF_' > DEF
  # Human vs. Elephant
  BLASTZ_M=50
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Elephant
  SEQ2_DIR=/scratch/data/loxAfr3/loxAfr3.2bit
  SEQ2_LEN=/scratch/data/loxAfr3/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=50
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    317m32.664s
      #	broken when it went to chaining on encodek, finish the chain then:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-continue=chainMerge > chainMerge.log 2>&1 &
      #	real    217m25.159s
  
      # time about 3h23m
      cat fb.hg19.chainLoxAfr3Link.txt
      #	1351200080 bases of 2897316137 (46.636%) in intersection
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet -continue=syntenicNet -stop=syntenicNet \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	> synNet.log 2>&1 &
      #	real    32m40.554s
  
      time doRecipBest.pl -buildDir=`pwd` hg19 loxAfr3 > rbest.log 2>&1
      #	real    184m3.435s
  
      mkdir /hive/data/genomes/loxAfr3/bed/blastz.hg19.swap
      cd /hive/data/genomes/loxAfr3/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-swap > swap.log 2>&1 &
      #	real    220m16.839s
      cat fb.loxAfr3.chainHg19Link.txt
      #	1323201500 bases of 3118565340 (42.430%) in intersection
  
  ##############################################################################
  # TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)
  
Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile.  This is available from:
     svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
  Or on the new hgwdev, /hive/users/markd/projs/transMap/
  
  see doc/builds.txt for specific details.
  
  ############################################################################
  # AGILENT PROBES LIFTED FROM HG18 (DONE, 2009-07-28 Andy)
  
  ssh hgwdev
  bash
  mkdir /hive/data/genomes/hg19/bed/agilentProbes
  cd /hive/data/genomes/hg19/bed/agilentProbes
  for table in `echo show tables like \'agilent%\' | hgsql hg18 | tail -n +2 | grep -v Probe`; do
      echo $table; echo "select * from $table" | hgsql hg18 | \
          tail -n +2 | cut -f2- > ${table}.hg18.bed; liftOver ${table}.hg18.bed \
            /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz ${table}.hg19.{bed,unmapped};
      hgLoadBed hg19 $table ${table}.hg19.bed;
      echo done with $table;
  done
  for unmap in *.unmapped; do
     table=${unmap%.hg19.unmapped}
     grep Deleted -A1 $unmap | grep -v Deleted | grep -v "^--" > agilentProbesHg18Unmapped/${table}.deleted.bed
     grep Split -A1 $unmap | grep -v Split | grep -v "^--" > agilentProbesHg18Unmapped/${table}.split.bed
     grep Partially -A1 $unmap | grep -v Partially | grep -v "^--" > agilentProbesHg18Unmapped/${table}.partiallyDeleted.bed
  done
  find agilentProbesHg18Unmapped/ -size 0b | xargs rm
  rm *hg18.bed *.unmapped bed.tab
  gzip *.bed
  tar cfz agilentProbesHg18Unmapped.tar.gz agilentProbesHg18Unmapped
  cd /usr/local/apache/htdocs/goldenPath/hg19
  mkdir agilentProbes
  cd agilentProbes/
  ln -s /hive/data/genomes/hg19/bed/agilentProbes/agilentProbesHg18Unmapped beds
  ln -s /hive/data/genomes/hg19/bed/agilentProbes/agilentProbesHg18Unmapped.tar.gz
  
  ##############################################################################
  # LASTZ Tetraodon TetNig2 (DONE - 2009-08-10,11 - Hiram)
    #	The date/time stamp on this directory is incorrect;
    #	it should be 2009-08-10
      mkdir /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
      cd /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
  
      cat << '_EOF_' > DEF
  # human vs tetraodon
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
# QUERY: Tetraodon TetNig2 - single chunk big enough to cover the single largest item
  SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit
  SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes
  SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit
  SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes
  SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=50
  
  BASE=/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-qRepeats=windowmaskerSdust \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    220m36.068s
      #	forgot the qRepeats for tetNig2
      rm axtChain/hg19.tetNig2.net
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-continue=load -qRepeats=windowmaskerSdust \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	> load.log 2>&1 &
      #	real    5m53.096s
      cat fb.hg19.chainTetNig2Link.txt
      #	49611132 bases of 2897316137 (1.712%) in intersection
  
      #	running the swap
      mkdir /hive/data/genomes/tetNig2/bed/blastz.hg19.swap
      cd /hive/data/genomes/tetNig2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10/DEF \
  	-qRepeats=windowmaskerSdust \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-swap > swap.log 2>&1 &
      #	real    13m21.591s
      #	forgot the qRepeats for tetNig2
      rm axtChain/tetNig2.hg19.net
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10/DEF \
  	-continue=load -qRepeats=windowmaskerSdust \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-swap > load.log 2>&1 &
      #	real    4m7.559s
      cat fb.tetNig2.chainHg19Link.txt
      #	42910930 bases of 302314788 (14.194%) in intersection
  
  
  ##############################################################################
  # dbSNP BUILD 130 - PROVISIONAL REMAPPING TO BUILD 37 (DONE 8/28/09 angie)
      # /hive/data/outside/dbSNP/130/ was already set up during the hg18 run --
      # just add hg19 coord files and go from there.
      cd /hive/data/outside/dbSNP/130/human/data
      alias wg wget --timestamping
      set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/misc/exchange
      # These are provisional files in an ad-hoc format.
      wg $ftpSnpDb/README.txt
      wg $ftpSnpDb/Remap_36_3_37_1.info
      wg $ftpSnpDb/Remap_36_3_37_1.txt.gz
      mv README.txt Remap_36_3_37_1_README
      zcat Remap_36_3_37_1.txt.gz | wc -l
  #18823990
  
      # Use the remapping to transform ../ucscNcbiSnp.bed into one for hg19.
      # Useful columns, 1-based: 1=ID, 3=oldChr, 4=oldStart, 5=oldEnd,
      # 10=newChr, 11=newStart, 12=newEnd, 13=newLocType, 14=newWeight, 16=newStrand
      # For mappings to chr*_random, oldStart and oldEnd are empty -- skip.
      # Sort both hg18 snp file and remap file by {rsID,chr,start} to keep them in sync.
      mkdir /hive/data/outside/dbSNP/130/human/hg19
      cd /hive/data/outside/dbSNP/130/human/hg19
      sort -k4n,4n -k1,1 -k2n,2n ../ucscNcbiSnp.bed > /data/tmp/hg18.ucscNcbiSnp.idSorted.bed
      zcat ../data/Remap_36_3_37_1.txt.gz \
      | sort -t "	" -k1n,1n -k3,3 -k4n,4n \
        > /data/tmp/Remap_36_3_37_1.txt
      perl -we \
        'use strict; \
         sub nextMap { \
           my ($rsId, undef, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, \
               $nLocType, $nWt, $nRef, $nStr);\
           do { \
             ($rsId, undef, $oChr, $oStart, $oEnd, undef,undef,undef,undef, \
                 $nChr, $nStart, $nEnd, $nLocType, $nWt, $nRef, $nStr) = split("\t", <>); \
             if (defined $nStr) { \
               chomp $nStr; $nStr =~ tr/+-/01/; $oChr = "chr$oChr";  $nChr = "chr$nChr"; \
             } \
             $oStart--;  $oEnd--;  $nStart--;  $nEnd--;  # Yep. 0-based closed vs 1-based closed \
           } while (defined $nStr && ($oEnd < 0 || $nChr eq "chrUn")); \
           return ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, \
                   $nLocType, $nWt, $nRef, $nStr); \
         } # nextMap \
         my ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, $nLocType, $nWt, $nRef, $nStr) = \
           &nextMap(); \
       my ($rCount, $oCount, $tCount) = (0, 0, 0); \
         open(my $oldF, "/data/tmp/hg18.ucscNcbiSnp.idSorted.bed") || die; \
         while (my ($chr, $s, $e, $id, $str, $rn,$obs,$mt,$cn,$vn,$ah,$ahse,$fc,$lt,$wt) = \
                split("\t", <$oldF>)) { \
           my $thisRCount = 0; \
           while (defined $oChr && $chr eq $oChr && $s == $oStart && $e == $oEnd && $id == $rsId) { \
             print join("\t", $nChr,$nStart,$nEnd,$id,$nStr,$nRef,$obs,$mt,$cn,$vn,$ah,$ahse,$fc, \
                              $nLocType,$nWt,$nStart) \
                        . "\n"; \
             ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, $nLocType, $nWt, $nRef, $nStr) = \
               &nextMap(); \
             $thisRCount++; \
           } \
           if (defined $rsId && $id > $rsId) {warn "Slipped a cog"; last;} \
           $tCount += $thisRCount; \
           $rCount++ if ($thisRCount > 0); \
           $oCount++; \
         } \
         close($oldF);  print STDERR "Replaced $rCount of $oCount inputs ($tCount outputs).\n";' \
        /data/tmp/Remap_36_3_37_1.txt \
      | sort -k1,1 -k2n,2n -k4,4 \
      > /data/tmp/hg19.ucscNcbiSnp.bed
  #Replaced 18693260 of 19189750 inputs (18697579 outputs).
  #504.562u 27.037s 8:59.57 98.5%  0+0k 0+0io 0pf+0w
      wc -l /data/tmp/hg19.ucscNcbiSnp.bed
  #  18697579 /data/tmp/hg19.ucscNcbiSnp.bed
  
      # Drum roll please... translate NCBI's encoding into UCSC's, and
      # perform a bunch of checks.  This is where developer involvement
      # is most likely as NCBI extends the encodings used in dbSNP.
      cd /hive/data/outside/dbSNP/130/human/hg19
      snpNcbiToUcsc /data/tmp/hg19.ucscNcbiSnp.bed /hive/data/genomes/hg19/hg19.2bit \
        -1000GenomesRsIds=../data/1000GenomesRsIds.txt snp130
  #spaces stripped from observed:
  #chr12   6093134 6093134 rs41402545
  #Line 8049395 of /data/tmp/hg19.ucscNcbiSnp.bed: Encountered something that doesn't fit observedMixedFormat: GCAACTTCA
  #count of snps with weight  0 = 0
  #count of snps with weight  1 = 17042465
  #count of snps with weight  2 = 345274
  #count of snps with weight  3 = 1017906
  #count of snps with weight 10 = 291934
  #Skipped 1496 snp mappings due to errors -- see snp130Errors.bed
  #146.837u 9.867s 4:21.63 59.8%   0+0k 0+0io 0pf+0w
      # Comparable to hg18.snp130, with some losses due to coord translation, loss of _randoms,
      # and 1496 errors (new locType or refNCBI inconsistent with new size).
      expr 18697579 - 291934 - 1496
  #18404149
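    #	i.e. total remapped rows minus the 291934 weight-10 mappings and
    #	the 1496 error mappings = 18404149, matching the snp130 row count
    #	loaded below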
  
      # Move hg19.ucscNcbiSnp.bed from fast tmp to slow (today) hive:
      gzip /data/tmp/hg19.ucscNcbiSnp.bed
      mv /data/tmp/hg19.ucscNcbiSnp.bed.gz hg19.ucscNcbiSnp.bed.gz
  
    # Will reuse hg18.snp130's giant 18G fasta file rather than duplicating it.
  
      # Load up main track tables.
      cd /hive/data/outside/dbSNP/130/human/hg19
      hgLoadBed -tab -tmpDir=/data/tmp -allowStartEqualEnd \
        hg19 snp130 -sqlTable=snp130.sql snp130.bed
  #Loaded 18404149 elements of size 17
  #115.086u 21.663s 2:32:09.98 1.4%        0+0k 0+0io 1pf+0w
  #that is freakishly long -- lots happening today w/db move, hive recovery,...
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
        hg19 snp130Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
        snp130Exceptions.bed
  #Loaded 1982828 elements of size 5
  #10.500u 0.851s 1:13.42 15.4%    0+0k 0+0io 0pf+0w
      hgLoadSqlTab hg19 snp130ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
        snp130ExceptionDesc.tab
      # Load up sequences *from hg18 file*:
      hgLoadSqlTab hg19 snp130Seq ~/kent/src/hg/lib/snpSeq.sql ../snp130Seq.tab
  
      # Put in a link where one would expect to find the track build dir...
      ln -s /hive/data/outside/dbSNP/130/human/hg19 /hive/data/genomes/hg19/bed/snp130
  
      # Look at the breakdown of exception categories:
      cd /hive/data/outside/dbSNP/130/human/hg19
      cut -f 5 snp130Exceptions.bed | sort | uniq -c | sort -nr
  #1350217 MultipleAlignments
  # 495981 ObservedMismatch
  #  37603 ObservedTooLong
  #  26855 SingleClassTriAllelic
  #  24443 FlankMismatchGenomeShorter
  #  17927 SingleClassLongerSpan
  #  13685 SingleClassZeroSpan
  #   6238 FlankMismatchGenomeLonger
  #   3016 DuplicateObserved
  #   2851 SingleClassQuadAllelic
  #   1777 MixedObserved
  #   1264 NamedDeletionZeroSpan
  #    508 FlankMismatchGenomeEqual
  #    329 NamedInsertionNonzeroSpan
  #    121 ObservedContainsIupac
  #     11 RefAlleleMismatch
  #      2 ObservedWrongFormat
  
  #TODO: go through those above (esp snp130Errors.bed) and send some bug reports to dbSNP.
  
  
  ##############################################################################
  # ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP130 (DONE 8/31/09 angie)
      mkdir /hive/data/genomes/hg19/bed/snp130Ortho
      cd /hive/data/genomes/hg19/bed/snp130Ortho
  
    # Following Heather's lead in snp126orthos, filter SNPs to keep
      # only those with class=single, length=1, chrom!~random;
      # Exclude those with exceptions MultipleAlignments,
      # SingleClassTriAllelic or SingleClassQuadAllelic.
      # Unlike snp masking, we do not filter for weight -- don't know why.
      awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
        /hive/data/outside/dbSNP/130/human/hg19/snp130Exceptions.bed \
      | sort -u \
        > snp130ExcludeIds.txt
      awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \
        /hive/data/outside/dbSNP/130/human/hg19/snp130.bed \
      | grep -vFwf snp130ExcludeIds.txt \
        > snp130Simple.bed
  #203.193u 9.197s 2:57.40 119.7%  0+0k 0+0io 0pf+0w
      wc -l snp130Simple.bed
  #12278514 snp130Simple.bed
  
      # Glom all human info that we need for the final table onto the
      # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
      awk 'BEGIN{OFS="\t";} \
          {print $1, $2, $3, \
                 $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
                 0, $6;}' \
        snp130Simple.bed > snp130ForLiftOver.bed
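    # e.g. (illustrative) a snp130Simple row for rs12345 at chr1:10000-10001,
    # observed A/G, reference allele A, + strand, becomes:
    # chr1  10000  10001  rs12345|chr1|10000|10001|A/G|A|+  0  +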
      # Map coords to chimp using liftOver.
      # I don't know why chimp took so much longer than macaque... the
      # chimp .over has fewer chains and fewer bytes than the macaque .over.
      mkdir run.liftOChimp
      cd run.liftOChimp
      mkdir split out
      splitFile ../snp130ForLiftOver.bed 25000 split/chunk
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro2.over.chain.gz \
          \{check out exists out/panTro2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      ssh swarm
      cd /hive/data/genomes/hg19/bed/snp130Ortho/run.liftOChimp
      para make jobList
  #Completed: 492 of 492 jobs
  #CPU time in finished jobs:      51793s     863.22m    14.39h    0.60d  0.002 y
  #IO & Wait Time:                  3825s      63.75m     1.06h    0.04d  0.000 y
  #Average job time:                 113s       1.88m     0.03h    0.00d
  #Longest finished job:             286s       4.77m     0.08h    0.00d
  #Submission to last job:           300s       5.00m     0.08h    0.00d
  
      # Map coords to orangutan using liftOver.
      mkdir ../run.liftOPon
      cd ../run.liftOPon
      mkdir out
      ln -s ../run.liftOChimp/split .
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
          \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      para make jobList
  #Completed: 492 of 492 jobs
  #CPU time in finished jobs:     125656s    2094.26m    34.90h    1.45d  0.004 y
  #IO & Wait Time:                  5413s      90.22m     1.50h    0.06d  0.000 y
  #Average job time:                 266s       4.44m     0.07h    0.00d
  #Longest finished job:             646s      10.77m     0.18h    0.01d
  #Submission to last job:           649s      10.82m     0.18h    0.01d
  
      # Map coords to macaque using liftOver.
      mkdir ../run.liftOMac
      cd ../run.liftOMac
      mkdir out
      ln -s ../run.liftOChimp/split .
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
          \{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      para make jobList
  #Completed: 492 of 492 jobs
  #CPU time in finished jobs:     161612s    2693.54m    44.89h    1.87d  0.005 y
  #IO & Wait Time:                  6218s     103.63m     1.73h    0.07d  0.000 y
  #Average job time:                 341s       5.69m     0.09h    0.00d
  #Longest finished job:             727s      12.12m     0.20h    0.01d
  #Submission to last job:           739s      12.32m     0.21h    0.01d
  
      cd /hive/data/genomes/hg19/bed/snp130Ortho
      # Concatenate the chimp results, sorting by chimp pos in order to
      # efficiently access 2bit sequence in getOrthoSeq.  The output of
      # that is then sorted by the glommed human info field, so that we
      # can use join to combine chimp and macaque results in the next step.
      # Ditto for macaque and orangutan.  Each command pipe takes ~5 minutes:
      sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
      | sort > panTro2.orthoGlom.txt
      sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
      | sort > ponAbe2.orthoGlom.txt
      sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
      | sort > rheMac2.orthoGlom.txt
      wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
  #  11428526 panTro2.orthoGlom.txt
  #  10861969 ponAbe2.orthoGlom.txt
  #   9694237 rheMac2.orthoGlom.txt
  
    # Use the glommed name field as a key to join up chimp and orangutan
    # allele data.  Include glommed name from both files because if only
    # file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
    # in the orthoGlom fields from each file, which are in the same order
    # as the chimp, orangutan and macaque columns of snp130OrthoPt2Pa2Rm2.
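    # Note on the joins below: -a 1 -a 2 keeps unpaired lines from both
    # inputs and -e '?' fills missing fields with "?", so a SNP that lifted
    # to only one species still comes through.  In the perl glue, the
    # trailing s/^.*$//; empties $_ so that perl -wpe does not re-echo the
    # input line after the explicit print.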
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \
      | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
              else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
        > tmp.txt
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        tmp.txt rheMac2.orthoGlom.txt \
      | perl -wpe 'chomp; \
          ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
          $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
          ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
            split(/\|/, $glomKey); \
          $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
          $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
          print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                           $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
          s/^.*$//;' \
      | sort -k1,1 -k2n,2n > snp130OrthoPt2Pa2Rm2.bed
  #304.434u 27.118s 4:31.30 122.2% 0+0k 0+0io 0pf+0w
      wc -l snp130OrthoPt2Pa2Rm2.bed
  #11876029 snp130OrthoPt2Pa2Rm2.bed
  
      cd /hive/data/genomes/hg19/bed/snp130Ortho
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
        -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
        hg19 snp130OrthoPt2Pa2Rm2 snp130OrthoPt2Pa2Rm2.bed
  #Loaded 11876029 elements of size 22
  #75.442u 8.828s 9:50.27 14.2%    0+0k 0+0io 0pf+0w
  
      # Cleanup fileserver:
      cd /hive/data/genomes/hg19/bed/snp130Ortho
      gzip snp130Simple.bed snp130ExcludeIds.txt snp130ForLiftOver.bed &
      rm -r run*/split tmp.txt *.orthoGlom.txt
  
  ##############################################################################
  # LASTZ Rabbit OryCun2 (DONE - 2009-08-12 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
      cd /hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
  
      cat << '_EOF_' > DEF
  # Human vs. Rabbit
  BLASTZ_M=50
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
# QUERY: Rabbit; at a chunk size of 20,000,000, all but 36 contigs fit in a single job
  SEQ2_DIR=/scratch/data/oryCun2/oryCun2.2bit
  SEQ2_LEN=/scratch/data/oryCun2/chrom.sizes
  SEQ2_CTGDIR=/scratch/data/oryCun2/oryCun2.contigs.2bit
  SEQ2_CTGLEN=/scratch/data/oryCun2/oryCun2.contigs.sizes
  SEQ2_LIFT=/hive/data/genomes/oryCun2/contigs/oryCun2.contigs.lift
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=400
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    516m41.981s
      cat fb.hg19.chainOryCun2Link.txt
      #	1283994337 bases of 2897316137 (44.317%) in intersection
      #	should have run syntenicNet in that first run
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
      #	about 1 hour
  
      mkdir /hive/data/genomes/oryCun2/bed/blastz.hg19.swap
      cd /hive/data/genomes/oryCun2/bed/blastz.hg19.swap
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-swap -syntenicNet > swap.log 2>&1 &
      #	real    176m35.932s
      cat fb.oryCun2.chainHg19Link.txt
      #	1260477501 bases of 2604023284 (48.405%) in intersection
  
  ##############################################################################
  # running syntenicNet on CavPor3 lastz (DONE - 2009-08-27 - Hiram)
      cd /hive/data/genomes/hg19/bed/lastzCavPor3.2009-06-04
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
      #	about 44 minutes
  
  ##############################################################################
  # loading the lastz tables on cavPor3 - (DONE - 2009-08-28 - Hiram)
      # the chain.tab and link.tab files are left over from the failed load
      cd  /hive/data/genomes/cavPor3/bed/blastz.hg19.swap/axtChain
  
      #	find out their sizes, average and total:
      awk '{print length($0)}' chain.tab | ave stdin
  Q1 92.000000 median 93.000000 Q3 96.000000
  average 93.651267
  min 64.000000 max 109.000000
  count 27186468
  total 2546047186.000000
      awk '{print length($0)}' link.tab | ave stdin
  Q1 45.000000 median 47.000000 Q3 48.000000
  average 46.731871
  min 22.000000 max 52.000000
  count 240602108
  total 11243786622.000000
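    #	size the table from that survey: max_rows rounds up the link.tab
    #	line count (240,602,108 -> 241,000,000) and avg_row_length rounds
    #	up the average line length (46.7 -> 50)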
  
      cat << '_EOF_' > chainHg19Link.sql
  CREATE TABLE chainHg19Link (
    bin smallint(5) unsigned NOT NULL default 0,
    tName varchar(255) NOT NULL default '',
    tStart int(10) unsigned NOT NULL default 0,
    tEnd int(10) unsigned NOT NULL default 0,
    qStart int(10) unsigned NOT NULL default 0,
    chainId int(10) unsigned NOT NULL default 0,
    KEY tName (tName(13),bin),
    KEY chainId (chainId)
  ) ENGINE=MyISAM max_rows=241000000 avg_row_length=50 pack_keys=1 CHARSET=latin1;
  '_EOF_'
      # << happy emacs
      hgsql cavPor3 < chainHg19Link.sql
  
      time hgsql -e \
        'load data local infile "link.tab" into table chainHg19Link;' cavPor3
      #	real    405m15.956s
  
      cd  /hive/data/genomes/cavPor3/bed/blastz.hg19.swap/axtChain
  
      #	and the net tracks were not loaded:
      time netClass -verbose=0 -noAr noClass.net cavPor3 hg19 cavPor3.hg19.net
      #	real    40m25.078s
  
      netFilter -minGap=10 cavPor3.hg19.net \
  	| hgLoadNet -verbose=0 cavPor3 netHg19 stdin
      # real    33m24.972s (plus the featureBits below)
  
      featureBits cavPor3 chainHg19Link > fb.cavPor3.chainHg19Link.txt 2>&1
      cat fb.cavPor3.chainHg19Link.txt
      #	1279572660 bases of 2663369733 (48.043%) in intersection
  
  ##############################################################################
  # DBSNP CODING ANNOTATIONS (DONE 10/12/10 angie)
  # Updated 10/12/10 using rebuilt hg18 snp130CodingDbSnp.bed w/corrected coords.
  # Originally done 9/1/09
  
      # Repeat the coord-remapping performed for snp130 on the hg18 coding anno table.
      cd /hive/data/outside/dbSNP/130/human/hg19
      sed -re 's/\trs([0-9]+)\t/\t\1\t/' ../snp130CodingDbSnp.bed \
      | sort -k4n,4n -k1,1 -k2n,2n > /data/tmp/hg18.snp130Coding.idSorted.bed
      # reuse /data/tmp/Remap_36_3_37_1.txt mapping file created for snp130 above,
      # but first translate its coords (1-based fully-closed with 2-base-long insertions)
      # into ours (0-based half-open with 0-base-long insertions) and discard incompletes.
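    # Worked example (hypothetical coords): a substitution reported 1-based
    # fully-closed as 100..100 becomes 0-based half-open [99,100) -- start
    # decremented; an insertion between bases 100 and 101, reported as the
    # 2-base-long interval 100..101 (loc_type==3), becomes the 0-base-long
    # point [100,100) -- end decremented.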
      perl -we \
        'while (my ($rsId, undef, $oChr, $oStart, $oEnd, $oLocType, undef,undef,undef, \
                                  $nChr, $nStart, $nEnd, $nLocType) = split("\t", <>)) { \
           next if ($oStart eq "" || $nStart eq ""); \
           $oChr = "chr$oChr";  $nChr = "chr$nChr"; \
           # 2-base-long insertion (loc_type==3) -> 0-base-long: \
           if ($oLocType == 3) { $oEnd--; } else { $oStart--; } \
           if ($nLocType == 3) { $nEnd--; } else { $nStart--; } \
           print join("\t", $rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd) . "\n"; \
         }' /data/tmp/Remap_36_3_37_1.txt \
        > /data/tmp/Remap_36_3_37_1_ucscCoords.txt
      # Apply the cleaned-up mapping to id-sorted hg18 snp130CodingDbSnp:
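    # (This is a merge join: both inputs are rsId-sorted, so the script
    # advances a cursor through each in step instead of hashing ~280K rows.)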
      perl -we \
        'use strict; \
         my ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd) = split("\t", <>); \
         my ($rCount, $oCount, $tCount) = 0; \
         open(my $oldF, "/data/tmp/hg18.snp130Coding.idSorted.bed") || die; \
         while (my ($chr, $s, $e, $id, $tx, $frm, $alCount, $funcs, $als, $codons, $peps) = \
                split("\t", <$oldF>)) { \
           my $thisRCount = 0; \
           while (defined $rsId && $rsId < $id) { \
             ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd) = split("\t", <>); \
           } \
           while (defined $oChr && $chr eq $oChr && $s == $oStart && $e == $oEnd && $id == $rsId) { \
             chomp $nEnd; \
             print join("\t", $nChr, $nStart, $nEnd, "rs$id", $tx, $frm, \
                              $alCount, $funcs, $als, $codons, $peps) unless $nEnd < $nStart; \
             ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd) = split("\t", <>); \
             $thisRCount++; \
           } \
           $tCount += $thisRCount; \
           $rCount++ if ($thisRCount > 0); \
           $oCount++; \
         } \
         close($oldF);  print STDERR "Replaced $rCount of $oCount inputs ($tCount outputs).\n";' \
        /data/tmp/Remap_36_3_37_1_ucscCoords.txt \
      | sort -k1,1 -k2n,2n -k4,4 \
      > /data/tmp/hg19.snp130Coding.bed
  #Replaced 197921 of 279815 inputs (198493 outputs).
  #35.486u 1.515s 0:36.70 100.7%   0+0k 0+0io 0pf+0w
     hgLoadBed hg19 snp130CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
        -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
        /data/tmp/hg19.snp130Coding.bed
  #Loaded 198459 elements of size 11
      # A bit fewer than reported 198493 above, but we ditched a few with $nEnd < $nStart
      # (corresponding SNPs ended up in snp130Errors.bed not snp130.bed anyway).
      mv /data/tmp/hg19.snp130Coding.bed hg19.snp130CodingDbSnp.bed
  
  ############################################################################
  # TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)
  
Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile, which is available from:
     svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
  Or on the new hgwdev, /hive/users/markd/projs/transMap/
  
  see doc/builds.txt for specific details.
  ############################################################################
# BURGE LAB DATA MAPPED WITH GEMMAPPER. PROVIDED BY THOMAS DERRIEN FROM RODERIC
  # GUIGO'S LAB AT CRG. (E-MAIL: thomas.derrien@crg.es). Data received on
  # 09/14/09.
  # (hartera, 2009-09-28, DONE)
# 2009-12-14, hartera. Set cdsStart = cdsEnd = 0. Moved track data directory to
# /hive/data/genomes/hg19/bed.
  # 2010-01-04, hartera. Change the data to BED format and re-loaded tables. BED
  # is more appropriate for this data type.
  # The data is too dense in places (feedback from QA) so it would be more
  # appropriate to have a Signal track as for the ENCODE RNA-seq data tracks.
  # 2010-02-09, hartera. Create bedGraph Signal subtracks for each tissue/cell
  # using reads/per million mapped reads as the data value.
  # 2010-02-17, hartera. Updated trackDb.ra entry to include views.
  # 2010-02-18, hartera. Loaded the bedGraph tables for the Raw Signal
  # subtracks.
  # 2010-05-15 and 2010-05-16, hartera. Re-created the Signal subtracks using
  # the -bed12 option of bedItemOverlapCount so that blocks are used.
  
     mkdir /hive/groups/gencode/browser/hg19/burgeRnaSeqGemMapperAlign
     cd /hive/groups/gencode/browser/hg19/burgeRnaSeqGemMapperAlign
  
  # Added the statements below to a script so that it can be run to fetch
  # all the sequences.
  
  wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325476_brain_HCT168_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
  wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325477_liver_HCT169_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
  wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325478_heart_HCT170_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
  wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325479_skelMuscle_HCT171_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
  wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325480_colon_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
  wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325481_adipose_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
  wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325482_testes_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
  wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325483_lymphNode_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
  wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325484_HCT204_bt474_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
  wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325485_HCT205_HME_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
  wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325486_HCT202_s2468_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
  wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325487_HCT203_s2468.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
  wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325488_HCT206_s2468_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
  wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325489_HCT207_s2468_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
  
     # Load this data into tables for hg19.
     # Unzip the files:
     gunzip *.gff.gz
     # Create a file with the list of file names and tissues.
     ls *.gff > burgeDataFiles.txt
     GSM325486_HCT202_s2468	breast
     GSM325487_HCT203_s2468	MCF-7
     GSM325488_HCT206_s2468	MB435
     GSM325489_HCT207_s2468	T47D
     # Did not map these two as they are not 32 bp.
     GSM325490_brain_s1368	MAQC	mixed human brain tissue/cell lines
     GSM325491_UHR_s247		MAQC_UHR mixed human cell lines
   # Edit the file above to add a tab separation between file name and tissue
   # name.  Then remove the "read_name: " prefix from the last field in each
   # file (otherwise it gets included in the name) and load the data into hg19.
     # Write a script to do this:
  cat << '_EOF_' > formatAndLoadData
  #!/bin/bash -e
  
  # Assign variables
  # Tab-separated file of file names and tissue/cell line names
  DATAFILES=$1
  # track name used as prefix for subtracks
  TRACK=$2
  # database
  DATABASE=$3
  
  cat $DATAFILES | while read file tissue; do
      subTrack=`echo $TRACK$tissue`
      echo $subTrack
      sed -e 's/read_name:\s//' $file > ${subTrack}.gff
      ldHgGene -exon=read $DATABASE ${subTrack} ${subTrack}.gff
  done
  '_EOF_'
     chmod +x formatAndLoadData
     ./formatAndLoadData burgeDataFiles.txt burgeRnaSeqGemMapperAlign hg19 \
       >& load.log &
     # Took about 2 hours to load the tables.
     # Copy trackDb entry in hg18 trackDb.ra to
     # ccds/trunk/gencode/browser/trackDb/human/hg19/trackDb.ra
  
     # 2009-12-14, Need to change cdsStart = cdsEnd = 0 in the tables as this
     # data should have no CDS defined. Currently cdsStart = cdsEnd = txEnd.
     cd /hive/groups/gencode/browser/hg19/burgeRnaSeqGemMapperAlign
     hgsql -Ne 'show tables like "burge%";' hg19 > burgeTables
     foreach t (`cat burgeTables`)
        echo $t
        hgsql -e "update $t set cdsStart = 0;" hg19
        hgsql -e "update $t set cdsEnd = 0;" hg19
     end
     # Then move data to directory in hg19 genome bed directory
     cd /hive/data/genomes/hg19/bed
     mv /hive/groups/gencode/browser/hg19/burgeRnaSeqGemMapperAlign ./
  
     # 2010-01-04 Change the data to BED format. For genePred format,
     # there is always a track configuration added for colouring tracks by
     # genomic codons which does not make sense for this data. Also, BED is
     # more appropriate for this data type.
     cd /hive/data/genomes/hg19/bed/burgeRnaSeqGemMapperAlign
   # Convert gff to genePred and then genePred to BED, drop the old table and
   # then load the database with BED format data.  Need to fix the cdsStart and
   # cdsEnd fields to be 0.
     foreach f (`ls burgeRnaSeqGemMapperAlign*.gff`)
       echo $f >> bed.log
       set g=$f:r
       echo $g
       ldHgGene -exon=read -nobin -out=${g}.gp hg19 $g $f >>& bed.log
       awk 'BEGIN {FS="\t"} {OFS="\t"} {print $1,$2,$3,$4,$5,0,0,$8,$9,$10}' \
           ${g}.gp > ${g}Fixed.gp
       genePredToBed ${g}Fixed.gp > ${g}.bed
       echo "Dropping table $g"
       hgsql -e "drop table ${g};" hg19
       hgLoadBed hg19 $g ${g}.bed >>& bed.log
     end
     # Changed track type in trackDb/human/trackDb.ra to bed 12 and
     # then did make alpha in trackDb directory.
  
     # trackDb/human/trackDb.ra entry was updated to include views for Raw Signal
     # and Alignment subtracks (2010-02-17)
     # 2010-05-15 and 2010-05-16. Add a Signal track so it is easier to view the data in
     # regions where there is a high density of reads.
     cd /hive/data/genomes/hg19/bed/burgeRnaSeqGemMapperAlign
     # Use bedItemOverlapCount to get counts of overlapping items for each base.
     # Need to sort the bed files and then get the number of reads mapped for
     # that tissue. Divide the counts by the number of million mapped reads to
     # get the number of reads per million mapped reads as the data value.
     # Re-make the subtracks using the -bed12 option so that blocks are used
     # instead of just the first three fields of the BED file as is the default.
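   # For example (hypothetical counts): in a tissue with 8,000,000 mapped
   # reads, a base covered by 16 reads gets 16 / (8000000/1000000) = 2.0
   # reads per million mapped reads as its bedGraph value.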
     rm *.count *.bedGraph
     foreach f (`ls *.bed`)
        echo $f
        set g=$f:r
        sort $f | bedItemOverlapCount -bed12 hg19 stdin > ${f}.count
        set size=`hgsql -Ne "select count(distinct name) from ${g};" hg19`
        awk -v size=${size} 'BEGIN {OFS="\t"} {print $1,$2,$3,($4 / (size/1000000));}' ${f}.count > ${g}.bedGraph
     end
     # Load the bedGraph tables into the database as Raw Signal tracks.
     foreach f (`ls *.bedGraph`)
        echo $f
        set g=$f:r
        hgsql -e "drop table ${g}AllRawSignal;" hg19
        hgLoadBed -bedGraph=4 hg19 ${g}AllRawSignal $f >>& loadSignal.log
     end
  
  
  ##########################################################################
  # BUILD ALLEN BRAIN TRACK (DONE 09/30/09 kent)
  
  # Make the working directory
      ssh hgwdev
      cd /cluster/data/hg19/bed
      mkdir allenBrain
      cd allenBrain
  
# Remap the probe alignments from mm9 to hg19
  
      zcat /gbdb/mm9/liftOver/mm9ToHg19.over.chain.gz \
          |  pslMap -chainMapFile -swapMap \
  	       /cluster/data/mm9/bed/allenBrain/allenBrainAli.psl stdin stdout \
  	  |  sort -k 14,14 -k 16,16n > unscored.psl
  
      pslRecalcMatch unscored.psl /cluster/data/hg19/hg19.2bit \
          /cluster/data/mm9/bed/allenBrain/allenBrainProbes.fa allenBrainAli.psl
  
  # Load the database
     hgsql hg19 < ~/kent/src/hg/lib/allenBrainUrl.sql
     hgsql hg19 -e 'load data local infile "/cluster/data/mm9/bed/allenBrain/allenBrainUrl.tab" into table allenBrainUrl;'
     hgLoadPsl hg19 allenBrainAli.psl
     mkdir /gbdb/hg19/allenBrain
     ln -s /cluster/data/mm9/bed/allenBrain/allenBrainProbes.fa /gbdb/hg19/allenBrain/allenBrainProbes.fa
     hgLoadSeq hg19 /gbdb/hg19/allenBrain/allenBrainProbes.fa
  
  # Make mapping between ens genes and allenBrain
     hgMapToGene hg19 allenBrainAli -type=psl ensGene ensToAllenBrain
  
  #############################################################################
# ADD ALLEN BRAIN CORTEX LINK (DONE, 11/18/09 kent)
  
  # Copy over version from hg18 since we don't have new data from Allen Brain
  # Inst.
      cd /cluster/data/hg19/bed/allenBrain
      cp /cluster/data/hg18/bed/allenBrain/allenBrainGene.tab .
  
  # Load it into database.
  
      hgsql hg19 < ~/src/hg/lib/allenBrainGene.sql
      hgsql hg19 -e \
      'load data local infile "allenBrainGene.tab" into table allenBrainGene'
  
  
  ############################################################################
  ## Annotate 46-way multiple alignment with gene annotations
  ##		(DONE - 2008-12-08,23 - Hiram)
      # Gene frames
      ## survey all genomes to see what type of gene track to use
      ssh hgwdev
      mkdir /hive/data/genomes/hg19/bed/multiz46way/frames
      cd /hive/data/genomes/hg19/bed/multiz46way/frames
      #
      #	survey all the genomes to find out what kinds of gene tracks they have
      cat << '_EOF_' > showGenes.csh
  #!/bin/csh -fe
  foreach db (`cat ../species.list`)
      echo -n "${db}: "
      set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
      foreach table ($tables)
	if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
	    $table == "xenoRefGene" ) then
  		set count = `hgsql $db -N -e "select count(*) from $table"`
  		echo -n "${table}: ${count}, "
  	endif
      end
      set orgName = `hgsql hgcentraltest -N -e \
  	    "select scientificName from dbDb where name='$db'"`
      set orgId = `hgsql hg19 -N -e \
  	    "select id from organism where name='$orgName'"`
      if ($orgId == "") then
  	echo "Mrnas: 0"
      else
  	set count = `hgsql hg19 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
  	echo "Mrnas: ${count}"
      endif
  end
  '_EOF_'
      # << happy emacs
      chmod +x ./showGenes.csh
    #	rearrange that output to create three sections:
    #	1. ensGene for hg19, mm9, rn4
    #	2. ensGene for almost everything else
    #	3. xenoRefGene for calJac1, petMar1, loxAfr3, papHam1, macEug1, oryCun2
  
      mkdir genes
      # ensGene
      for DB in hg19 mm9 rn4
  do
      hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
        | genePredSingleCover stdin stdout | gzip -2c \
          > /scratch/tmp/${DB}.tmp.gz
      mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
      echo "${DB} done"
  done
  
      echo "panTro2 gorGor1 ponAbe2 rheMac2 tarSyr1 micMur1 otoGar1 \
  	tupBel1 dipOrd1 cavPor3 speTri1 ochPri2 vicPac1 turTru1 \
  	bosTau4 equCab2 felCat3 canFam2 myoLuc1 pteVam1 eriEur1 sorAra1 \
  	proCap1 echTel1 dasNov2 choHof1 monDom5 ornAna1 galGal3 \
  	taeGut1 anoCar1 xenTro2 tetNig2 fr2 gasAcu1 oryLat2 danRer6" \
      | sed -e "s/  */ /g" > ensGene.list
  
  
      # ensGene
      for DB in panTro2 gorGor1 ponAbe2 rheMac2 tarSyr1 micMur1 otoGar1 \
  	tupBel1 dipOrd1 cavPor3 speTri1 ochPri2 vicPac1 turTru1 \
  	bosTau4 equCab2 felCat3 canFam2 myoLuc1 pteVam1 eriEur1 sorAra1 \
  	proCap1 echTel1 dasNov2 choHof1 monDom5 ornAna1 galGal3 \
  	taeGut1 anoCar1 xenTro2 tetNig2 fr2 gasAcu1 oryLat2 danRer6
  do
      hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
        | genePredSingleCover stdin stdout | gzip -2c \
          > /scratch/tmp/${DB}.tmp.gz
      mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
      echo "${DB} done"
  done
  
      echo "calJac1 petMar1 loxAfr3 papHam1 macEug1 oryCun2" > xenoRef.list
  
      # xenoRefGene
      for DB in calJac1 petMar1 loxAfr3 papHam1 macEug1 oryCun2
  do
      hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from xenoRefGene" ${DB} \
        | genePredSingleCover stdin stdout | gzip -2c \
          > /scratch/tmp/${DB}.tmp.gz
      mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
      echo "${DB} done"
  done
  
      #	the following single command doesn't work on any 32 Gb computer,
      #	requires much more memory, turn it into a kluster job, see below ...
  
      #	Create this command with this script:
      cat << '_EOF_' > mkCmd.sh
  #!/bin/sh
  
  echo "time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames hg19 stdin stdout \\"
  for G in mm9 rn4
  do
      if [ ! -s genes/${G}.gp.gz ]; then
  	echo "missing genes/${G}.gp.gz"
  	exit 255
      fi
      echo -n "${G} genes/${G}.gp.gz "
  done
  echo "\\"
  for D in `sort ensGene.list`
  do
      if [ ! -s genes/${D}.gp.gz ]; then
          echo "missing genes/${D}.gp.gz"
          exit 255
      fi
      echo -n "${D} genes/${D}.gp.gz "
  done
  echo "\\"
  for D in `sort xenoRef.list`
  do
      if [ ! -s genes/${D}.gp.gz ]; then
          echo "missing genes/${D}.gp.gz"
          exit 255
      fi
      echo -n "${D} genes/${D}.gp.gz "
  done
  echo "\\"
  echo "    | gzip > multiz46way.mafFrames.gz) > frames.log 2>&1"
  '_EOF_'
      # << happy emacs
      chmod +x ./mkCmd.sh
  
      #	this doesn't work on any 32 Gb computer, requires much more memory
      #	turn it into a kluster job, see below
      time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames hg19 stdin stdout \
  mm9 genes/mm9.gp.gz rn4 genes/rn4.gp.gz \
  panTro2 genes/panTro2.gp.gz gorGor1 genes/gorGor1.gp.gz ponAbe2 genes/ponAbe2.gp.gz rheMac2 genes/rheMac2.gp.gz tarSyr1 genes/tarSyr1.gp.gz micMur1 genes/micMur1.gp.gz otoGar1 genes/otoGar1.gp.gz tupBel1 genes/tupBel1.gp.gz dipOrd1 genes/dipOrd1.gp.gz cavPor3 genes/cavPor3.gp.gz speTri1 genes/speTri1.gp.gz ochPri2 genes/ochPri2.gp.gz vicPac1 genes/vicPac1.gp.gz turTru1 genes/turTru1.gp.gz bosTau4 genes/bosTau4.gp.gz equCab2 genes/equCab2.gp.gz felCat3 genes/felCat3.gp.gz canFam2 genes/canFam2.gp.gz myoLuc1 genes/myoLuc1.gp.gz pteVam1 genes/pteVam1.gp.gz eriEur1 genes/eriEur1.gp.gz sorAra1 genes/sorAra1.gp.gz proCap1 genes/proCap1.gp.gz echTel1 genes/echTel1.gp.gz dasNov2 genes/dasNov2.gp.gz choHof1 genes/choHof1.gp.gz monDom5 genes/monDom5.gp.gz ornAna1 genes/ornAna1.gp.gz galGal3 genes/galGal3.gp.gz taeGut1 genes/taeGut1.gp.gz anoCar1 genes/anoCar1.gp.gz xenTro2 genes/xenTro2.gp.gz tetNig2 genes/tetNig2.gp.gz fr2 genes/fr2.gp.gz gasAcu1 genes/gasAcu1.gp.gz oryLat2 genes/oryLat2.gp.gz danRer6 genes/danRer6.gp.gz \
  calJac1 genes/calJac1.gp.gz petMar1 genes/petMar1.gp.gz loxAfr3 genes/loxAfr3.gp.gz papHam1 genes/papHam1.gp.gz macEug1 genes/macEug1.gp.gz oryCun2 genes/oryCun2.gp.gz \
      | gzip > multiz46way.mafFrames.gz) > frames.log 2>&1
  
      #	that doesn't work on any 32 Gb computer, requires much more memory
      #	turn it into a kluster job
      ssh swarm
      cd /hive/data/genomes/hg19/bed/multiz46way/frames
      cat << '_EOF_' > runOne
  #!/bin/csh -fe
  
  set C = $1
  set G = $2
  
  cat ../maf/${C}.maf | genePredToMafFrames hg19 stdin stdout \
          ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
  '_EOF_'
      # << happy emacs
      chmod +x runOne
  
      ls ../maf | sed -e "s/.maf//" > chr.list
      ls genes | sed -e "s/.gp.gz//" | grep -v hg19 > gene.list
  
      cat << '_EOF_' > template
  #LOOP
  runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
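    # gensub2 crosses chr.list with gene.list, substituting each pair into
    # the template as $(root1)/$(root2); the {check out exists+ ...} clause
    # makes para verify that the output file exists and is non-empty before
    # counting a job as done.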
  
      mkdir parts
      gensub2 chr.list gene.list template jobList
      para -ram=8g create jobList
      para try ... check ... push
  # Completed: 4185 of 4185 jobs
  # CPU time in finished jobs:      72491s    1208.19m    20.14h    0.84d  0.002 y
  # IO & Wait Time:               1462162s   24369.36m   406.16h   16.92d  0.046 y
  # Average job time:                 367s       6.11m     0.10h    0.00d
  # Longest finished job:            3165s      52.75m     0.88h    0.04d
  # Submission to last job:          6364s     106.07m     1.77h    0.07d
  
      # see what it looks like in terms of number of annotations per DB:
      find ./parts -type f | while read F
  do
      zcat ${F}
  done | cut -f4 | sort | uniq -c | sort -n > annotation.survey.txt
    79191 rn4
   108287 petMar1
   139581 gorGor1
   140487 taeGut1
   143058 choHof1
   143233 vicPac1
   150073 anoCar1
   154462 tarSyr1
   163930 sorAra1
   164575 galGal3
   171191 macEug1
   174221 felCat3
   175831 dasNov2
   177622 ornAna1
   190729 eriEur1
   192285 tupBel1
   198052 speTri1
   199639 micMur1
   201731 papHam1
   201961 panTro2
   206170 oryCun2
   209327 ponAbe2
   209504 otoGar1
   210860 rheMac2
   212533 proCap1
   212848 myoLuc1
   213146 dipOrd1
   213479 calJac1
   215995 echTel1
   220341 ochPri2
   225132 loxAfr3
   226689 turTru1
   230903 monDom5
   232025 pteVam1
   232831 equCab2
   236945 cavPor3
   238167 bosTau4
   239857 mm9
   255727 canFam2
   316850 xenTro2
   359507 danRer6
   375156 oryLat2
   390076 fr2
   426532 gasAcu1
   434619 tetNig2
  
      #	load the resulting file
      ssh hgwdev
      cd /cluster/data/hg19/bed/multiz46way/frames
      find ./parts -type f | while read F
  do
      zcat ${F}
  done | sort -k1,1 -k2,2n > multiz46wayFrames.bed
  
    hgLoadMafFrames hg19 multiz46wayFrames multiz46wayFrames.bed
  
      featureBits -countGaps hg19 multiz46wayFrames.bed
      #	57146632 bases of 3137161264 (1.822%) in intersection
  
      #	enable the trackDb entries:
  # frames multiz46wayFrames
  # irows on
      #	appears to work OK
  
  #############################################################################
  ## create upstream refGene maf files
      cd /hive/data/genomes/hg19/bed/multiz46way/downloads/maf
      # bash script
  #!/bin/sh
  for S in 1000 2000 5000
  do
      echo "making upstream${S}.maf"
      featureBits hg19 refGene:upstream:${S} -fa=/dev/null -bed=stdout \
          | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
          | /cluster/bin/$MACHTYPE/mafFrags hg19 multiz46way \
                  stdin stdout \
                  -orgs=/hive/data/genomes/hg19/bed/multiz46way/species.list \
          | gzip -c > upstream${S}.maf.gz
      echo "done upstream${S}.maf.gz"
  done
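    # refGene:upstream:${S} makes featureBits emit the ${S} bases upstream
    # of each refGene transcription start; the perl strips everything from
    # "_up" onward in the item names (appending a 0 score column) so that
    # mafFrags sees clean BED input.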
  
      cd /usr/local/apache/htdocs/goldenPath/hg19/multiz46way/maf
      ln -s /hive/data/genomes/hg19/bed/multiz46way/downloads/maf/up*.gz .
      md5sum up*.gz >> md5sum.txt
  
  
  #############################################################################
  # AFFY U133AB (Done - 2009-09-30 - Jim)
      # Align probes
      ssh swarm
      cd /cluster/data/hg19/bed
      mkdir -p affyProbes/affyU133/run
      cd affyProbes/affyU133/run
      mkdir psl
      ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
      ls -1 /hive/data/outside/affyProbes/HG-U133AB_all.fa > mrna.lst
  
      cat << '_EOF_' > gsub
  #LOOP
  /cluster/bin/x86_64/blat -fine -ooc=/scratch/data/hg19/11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
  #ENDLOOP
  '_EOF_'
      # << this line makes emacs coloring happy
  
      gensub2 genome.lst mrna.lst gsub jobList
      para create jobList
      para try
      para check
      para push
      para time
  #Completed: 93 of 93 jobs
  #CPU time in finished jobs:      21246s     354.09m     5.90h    0.25d  0.001 y
  #IO & Wait Time:                   349s       5.82m     0.10h    0.00d  0.000 y
  #Average job time:                 232s       3.87m     0.06h    0.00d
  #Longest finished job:            1650s      27.50m     0.46h    0.02d
  #Submission to last job:          1685s      28.08m     0.47h    0.02d
  
  
    # Do sort and best-in-genome filter
    # to create affyU133.psl.
      pslSort dirs raw.psl tmp psl
      pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyU133.psl /dev/null
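    # pslReps keeps near-best-in-genome hits: at least 30% of the probe
    # aligned (-minCover), an alignment ratio of at least 0.95 (-minAli),
    # and a score within 0.5% of the best hit for that probe (-nearTop).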
      rm -r raw.psl psl
  
      # Load probes and alignments into database.
      ssh hgwdev
      cd /cluster/data/hg19/bed/affyProbes/affyU133
      hgLoadPsl hg19 affyU133.psl
      hgLoadSeq hg19 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa
  
      # Added ensToU133 table
      hgMapToGene hg19 affyU133 ensGene ensToU133
      # trim unwanted chip-prefix to be backwards compatible with hg17 and hg18
      hgsql hg19 -e 'update ensToU133 set value=substring(value,7)'
  
      # remove the trailing ";" from the value field (redmine #1685)
      hgsql hg19 -e 'update ensToU133 set value=trim(trailing ";" from value);'
  
  ##########################################################################
  # GNF ATLAS 2 (Done - 2009-09-30 - Jim)
      # Align probes from GNF1H chip.
      ssh swarm
      cd /cluster/data/hg19/bed
      mkdir -p geneAtlas2/run/psl
      cd geneAtlas2/run
      mkdir psl
      ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
      ls -1 /hive/data/outside/gnf/human/atlas2/gnf1h.fa > mrna.lst
      cat << '_EOF_' > gsub
  #LOOP
  /cluster/bin/x86_64/blat -fine -ooc=/scratch/data/hg19/11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
  #ENDLOOP
  '_EOF_'
      # << this line makes emacs coloring happy
  
      gensub2 genome.lst mrna.lst gsub jobList
      para create jobList
      para try
      para check
      para push
      para time
  #Completed: 93 of 93 jobs
  #CPU time in finished jobs:       3299s      54.98m     0.92h    0.04d  0.000 y
  #IO & Wait Time:                   330s       5.50m     0.09h    0.00d  0.000 y
  #Average job time:                  39s       0.65m     0.01h    0.00d
  #Longest finished job:             370s       6.17m     0.10h    0.00d
  #Submission to last job:           477s       7.95m     0.13h    0.01d
  
  
    # Do sort and best-in-genome filter
    # to create affyGnf1h.psl.
      pslSort dirs raw.psl tmp psl
      pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyGnf1h.psl /dev/null
      rm -r raw.psl psl
  
      # Load probes and alignments from GNF1H into database.
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/geneAtlas2
      hgLoadPsl hg19 affyGnf1h.psl
      hgLoadSeq hg19 /gbdb/hgFixed/affyProbes/gnf1h.fa
  
      grep -v U133B ../affyProbes/affyU133/affyU133.psl \
  	| sed -e "s/exemplar://; s/consensus://; s/U133A://" \
  	| sed -e "s/;//" > affyU133A.psl
  
      hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \
      	affyU133A.psl  affyGnf1h.psl
  
      # Loaded 44696 rows of expression data from hgFixed.gnfHumanAtlas2MedianRatio
      # Mapped 33186,  multiply-mapped 3171, missed 48, unmapped 11510
  
      hgLoadBed hg19 gnfAtlas2 gnfAtlas2.bed
      # Loaded 36357 elements of size 15
  
      # Added ensToGnf1h table
      hgMapToGene hg19 affyGnf1h ensGene ensToGnf1h
  
  ##########################################################################
  # BUILD NIBB IMAGE PROBES (DONE 2009-10-12 JK)
  
  # Make directory on san for cluster job and copy in sequence
      ssh swarm
      mkdir /hive/data/genomes/hg19/bed/nibbPics
      cd /hive/data/genomes/hg19/bed/nibbPics
      cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
  
  # Make parasol job dir and sequence list files
      mkdir run
      cd run
      mkdir psl
      ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
      echo ../nibbImageProbes.fa > mrna.lst
  
  # Create parasol gensub file file
  cat << '_EOF_' > gsub
  #LOOP
  blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
  #ENDLOOP
  '_EOF_'
  # << emacs
  
  # Create parasol batch
      gensub2 genome.lst mrna.lst gsub spec
      para create spec
  
  # Do para try/push/time etc.
  #Completed: 93 of 93 jobs
  #CPU time in finished jobs:       8008s     133.47m     2.22h    0.09d  0.000 y
  #IO & Wait Time:                   364s       6.07m     0.10h    0.00d  0.000 y
  #Average job time:                  90s       1.50m     0.03h    0.00d
  #Longest finished job:             765s      12.75m     0.21h    0.01d
  #Submission to last job:           824s      13.73m     0.23h    0.01d
  
  # Make sort and filter
      catDir psl | sort -k 10 \
          | pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
  	| sort -k 14,14 -k 16,16n \
  	| sed 's#/scratch/data/hg19/nib/chr#chr#' \
  	| sed 's/.nib//' > ../nibbImageProbes.psl
  
  # Make bed file and copy in stuff
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/nibbPics
      cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
  
  # Load into database
      ln -s /cluster/data/hg19/bed/nibbPics/nibbImageProbes.fa /gbdb/hg19/nibbImageProbes.fa
      hgLoadSeq hg19 /gbdb/hg19/nibbImageProbes.fa
      hgLoadPsl hg19 nibbImageProbes.psl
  
  ##########################################################################
  # Initial vgProbeTrack run for hg19 (galt 2009-10-15)
  # see visiGene.txt make doc
  # uses nibbImageProbes and vgProbeTrack utility
  # creates vgAllProbes and ensToVisiGene
  #    25931
  # updates visiGene.vgPrbAliAll.
  # creates and runs hgLoadSeq on /gbdb/hg19/visiGene/*.fa
  
  ##########################################################################
  # make new grp table to match hg18 (DONE  2009-10-01 kuhn)
  # to split regulation from expression
  # phenDis group is also missing in hg19
  # and add one more column: defaultIsClosed
  
  # get the hg18.grp table into hg19
  
  # copy the hg18.grp table into hg19.grpNew and edit
     hgsql hg19
     CREATE TABLE grpNew SELECT * FROM hg18.grp;
     # 24 rows in set (0.00 sec)
  
     DELETE FROM grpNew WHERE name LIKE "encode%";
     DELETE FROM grpNew WHERE name LIKE "remc%";
     DELETE FROM grpNew WHERE name LIKE "tcga%";
     DELETE FROM grpNew WHERE name LIKE "cancer%";
     DELETE FROM grpNew WHERE name LIKE "jk%";
     # 10 rows in set (0.00 sec)
  
  # move the new table into place quickly
     DROP TABLE grp;
     RENAME TABLE grpNew TO grp;
  
  #########################################################################
  # BUILD OMIM RELATED GENES TRACK (done 2009-10-13 jk)
  
  ssh hgwdev
  cd /hive/data/genomes/hg19/bed
  mkdir omimGene
  cd omimGene
  
  # download the file morbidmap and genemap from OMIM
  
  mkdir omim
  cd omim
  wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/morbidmap
  wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/genemap
  cat genemap|sed -e 's/|/\t/g' > genemap.tab
  autoSql ~/src/hg/lib/omimGeneMap.as x
  cat x.sql |sed -e 's/PRIMARY KEY(numbering)/KEY(omimId)/' >omimGeneMap.sql
  hgLoadSqlTab -warn hg19 omimGeneMap omimGeneMap.sql genemap.tab
  
  # got warning on 3 records, just ignore them
  # Warning: load of omimGeneMap did not go as planned: 12216 record(s), 0 row(s)
  
  rm x.c x.h
  cd ..
cat omim/morbidmap|sed -e 's/|/\t/g' > morbidmap.tab
  autoSql ~/src/hg/lib/omimMorbidMap.as x
  cat x.sql |sed -e 's/PRIMARY KEY(description)/KEY(omimId)/' >omimMorbidMap.sql
hgLoadSqlTab -warn hg19 omimMorbidMap omimMorbidMap.sql morbidmap.tab
  
  # get all UCSC genes (from the ensGene table) that cross-reference to a RefSeq gene
  # that has a non-empty OMIM ID according to the refLink table.  And use OMIM ID as
  # the gene name for this new table.  Please note the alignId field still holds the KG ID.
  
  hgsql hg19 -N -e \
  'select omimId, kg.* from ensGene kg, ensToRefSeq kr, refLink l where omimId != 0 and mrnaAcc=kr.value and kg.name=kr.name ' \
  |cut -f 1,3-13 >o1.tab
  
  # collect more OMIM related genes via the MIM external DB links from UniProt
  
  hgsql hg19 -N -e \
  'select extAC, kg.* from ensGene kg, kgXref k, proteome.spXref2 p where spId=p.accession and extDB="MIM" and kg.name=kgId ' \
  |cut -f 1,3-13 >o2.tab
  
  # concatenate the above two gene sets and remove duplications.
  
  cat o1.tab o2.tab |sort -u >o3.tab
  
  # load the result into a temp table, fanO3
  hgLoadSqlTab hg19 fanO3 ~/src/hg/lib/ensGene.sql o3.tab
  
# while holding onto the OMIM ID, get the canonical gene (via the ensGene, ensIsoforms,
# and ensCanonical tables) that represents a cluster which contains the
# initial OMIM gene in the fanO3 table
  
  hgsql hg19 -N -e \
  'select f3.name, kg.* from fanO3 f3, ensGene kg, ensCanonical c, ensIsoforms i where f3.alignId=i.transcript and kg.name=c.transcript and c.clusterId=i.clusterId'\
  > o4.tab
  
  # first column is the OMIM ID
  cut -f 1 o4.tab >j1.tmp
  
  # col 3-13 is the gene structure of the canonical KG
  cut -f 3-13 o4.tab >j2.tmp
  
  # stitch them together and remove duplicates, load the result into fanO4 table
  paste j1.tmp j2.tmp |sort -u >fanO4.tab
  hgLoadSqlTab hg19 fanO4  ~/src/hg/lib/ensGene.sql fanO4.tab
  
  # finally sort the table and create bed 4 file and load it as the omimGene table
  
  hgsql hg19 -N -e 'select chrom, txStart, txEnd, name from fanO4 order by chrom, txStart, txEnd' |sort -u >omimGene.bed
  hgLoadBed hg19 omimGene omimGene.bed
  
  # create and load the omimToKnownCanonical table.
  
  hgsql hg19 -N -e 'select name, alignId from fanO4 order by name'\
  > omimToKnownCanonical.tab
  
  hgLoadSqlTab hg19 omimToKnownCanonical  \
  ~/src/hg/lib/omimToKnownCanonical.sql omimToKnownCanonical.tab
  
  # The following clean up could be done.
  # hgsql hg19 -e 'drop table fanO3'
  # hgsql hg19 -e 'drop table fanO4'
  # rm j*.tmp
  # rm o1.tab o2.tab o3.tab o4.tab
  
  #########################################################################
  # BUILD HPRD DATA FOR KNOWN GENE DETAILS PAGE LINKS (in progress 2009-10-14 jk)
  
  # Make the directory to work in
     cd /hive/data/genomes/hg19/bed
     mkdir hprd
     cd hprd
  
  # Download HPRD_XML_070609.tar.gz from www.hprd.org. Unfortunately this
  # requires registration, so can't just wget it.
  
      zcat HPRD_XML_070609.tar.gz | tar -xv
  
  # This will create 20000 or more  xxxx.xml files under HPRD_XML_070609
  
  # Create hprdToCdna table
      echo HPRD_XML_070609/*.xml | xargs grep entry_cdna  > j.cdna
      cat j.cdna| sed -e 's/\//\t/' | sed -e 's/.xml/\t/' |\
  	sed -e 's/<entry_cdna>/\t/' | sed -e 's/<\//\t/'| sed -e 's/\./\t/'| cut -f 2,4|\
  	grep -v None >hprdToCdna.tab
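# The sed chain splits the grep output on "/", ".xml" and the XML tags,
# leaving the HPRD ID (from the file name) in field 2 and the cDNA
# accession (version suffix stripped) in field 4; grep -v None drops
# entries with no cDNA.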
  
      hgsql hg19 <~/src/hg/lib/hprdToCdna.sql
      hgsql hg19 -e 'load data local infile "hprdToCdna.tab" into table hprdToCdna'
  
  # Create hprdToUniProt table
  
      echo 'fgrep -H Swiss  HPRD_XML_070609/$1.xml' >do1
  
      ls HPRD_XML_070609 >j
      cat j |sed -e 's/.xml/\tdo1/g' >jj
      cut -f 1 jj >j.2
      cut -f 2 jj >j.1
      paste j.1 j.2 >doall
      chmod +x do*
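# doall now holds one "do1 <id>" line per XML file, so running it greps
# every HPRD entry for its SwissProt cross-reference.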
  
      ./doall >j.out
      cat j.out|grep SwissProt | sed -e 's/\//\t/' | sed -e 's/.xml/\t/' | \
      sed -e 's/Prot>/\t/' | sed -e 's/<\//\t/'| cut -f 2,4|grep -v None >hprdToUniProt.tab
  
      hgsql hg19 <~/src/hg/lib/hprdToUniProt.sql
      hgsql hg19 -e 'load data local infile "hprdToUniProt.tab" into table hprdToUniProt'
  
  # build ensToHprd table
  
      hgsql hg19 -N -e 'select kgId,hprdId from hprdToCdna, kgXref where cdnaId=refseq' >j.kg1
      hgsql hg19 -N -e 'select kgId,hprdId from hprdToUniProt, kgXref where uniProtId=spId' >j.kg2
  
      cat j.kg1 j.kg2 | sed 's/_.//' | sort -u >ensToHprd.tab
      wc ensToHprd.tab
  
      hgsql hg19 <~/src/hg/lib/ensToHprd.sql
  
      hgsql hg19 -e 'load data local infile "ensToHprd.tab" into table ensToHprd'
      hgsql hg19 -e 'select count(*) from ensToHprd'
  
  # 21,516 records created
  
  # remove temporary files.
  
      rm j*
  
  #########################################################################
  # hgPal downloads (DONE braney 2009-11-03)
  #   FASTA from 46way for refGene, ensGene, ensCanonical
  
      ssh hgwdev
      screen
      bash
      rm -rf /cluster/data/hg19/bed/multiz46way/pal
      mkdir /cluster/data/hg19/bed/multiz46way/pal
      cd /cluster/data/hg19/bed/multiz46way/pal
      for i in `cat ../species.list`; do echo $i; done > order.lst
  
      # redid the refgene ones  2013-08-15
      mz=multiz46way
      gp=refGene
      db=hg19
      mkdir exonAA exonNuc ppredAA ppredNuc
      for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
      do
  	echo "date"
  	echo "mafGene -chrom=$j  $db $mz $gp order.lst stdout | \
  	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
  	echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
  	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
  	echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
  	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
  	echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
  	    gzip -c > exonAA/$j.exonAA.fa.gz"
      done > $gp.jobs
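    # Each gene gets four FASTA flavors: translated (AA) and untranslated
    # (Nuc) versions of both whole gene predictions (ppred*) and individual
    # exons (exon*).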
  
      nice time sh -x $gp.jobs > $gp.jobs.log 2>&1 &
      sleep 1
      tail -f $gp.jobs.log
  
  # 1817.21user 233.92system 4:54:04elapsed 11%CPU (0avgtext+0avgdata
  # 920192maxresident)k
  # 6024inputs+0outputs (7major+1648126minor)pagefaults 0swaps
  
      mz=multiz46way
      gp=refGene
      db=hg19
      zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
      zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
      zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
      zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
  
      rm -rf exonAA exonNuc ppredAA ppredNuc
  
      # we're only distributing exons at the moment
      mz=multiz46way
      gp=refGene
      db=hg19
      pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
      mkdir -p $pd
      ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
      ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
  
      mz=multiz46way
      gp=ensGene
      db=hg19
      mkdir exonAA exonNuc ppredAA ppredNuc
      for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
      do
  	echo "date"
  	echo "mafGene -chrom=$j  $db $mz $gp order.lst stdout | \
  	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
  	echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
  	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
  	echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
  	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
  	echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
  	    gzip -c > exonAA/$j.exonAA.fa.gz"
      done > $gp.$mz.jobs
  
      time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
      sleep 1
      tail -f $gp.$mz.job.log
  
  # oops... missed the timing
  
  
      mz=multiz46way
      gp=ensGene
      db=hg19
  
      zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
      zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
      zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
      zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
  
      rm -rf exonAA exonNuc ppredAA ppredNuc
  
      mz=multiz46way
      gp=ensGene
      db=hg19
      pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments
      mkdir -p $pd
      ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
      ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
  
      # now do the canonical set
      cd /cluster/data/hg19/bed/multiz46way/pal
      mz=multiz46way
      gp=ensCanonical
      db=hg19
      for j in `awk '{print $1}' /cluster/data/hg19/chrom.sizes`
      do
  	echo "select chrom, chromStart, chromEnd, transcript from ensCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.ens.bed
      done
  
      mkdir exonAA exonNuc ppredAA ppredNuc
      for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
      do
  	echo "date"
  	echo "mafGene -geneBeds=$j.ens.bed  $db $mz ensGene order.lst stdout | \
  	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
  	echo "mafGene -geneBeds=$j.ens.bed -noTrans $db $mz ensGene order.lst stdout | \
  	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
  	echo "mafGene -geneBeds=$j.ens.bed -exons -noTrans $db $mz ensGene order.lst stdout | \
  	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
  	echo "mafGene -geneBeds=$j.ens.bed -exons $db $mz ensGene order.lst stdout | \
  	    gzip -c > exonAA/$j.exonAA.fa.gz"
      done > $gp.$mz.jobs
  
      time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
      sleep 1
      tail -f $gp.$mz.job.log
  
  # real    302m20.489s
  # user    27m31.179s
  # sys     5m30.071s
  
  
      rm *.ens.bed
      mz=multiz46way
      gp=ensCanonical
      db=hg19
      zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
      zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
      zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
      zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
  
      rm -rf exonAA exonNuc ppredAA ppredNuc
  
      mz=multiz46way
      gp=ensCanonical
      db=hg19
      pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments
      mkdir -p $pd
      ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
      ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
  
  ############################################################################
  # SEGMENTAL DUPLICATIONS (2010-02-02 - 2010-02-04, hartera, DONE)
  # (2011-09-26, Fan, REBUILT using corrected data from provider)
  # File emailed from Tin Louie <tinlouie at u.washington.edu>
# in Evan Eichler's lab on 01/28/10. This is a data update: the previous
# data set was thought to be incorrect, so the pipeline had to be re-run.
  # NOTE: Received e-mail from Tin Louie suggesting that the otherSize
  # column could be dropped. It is just the size of the otherChrom and it
  # does not seem to be used for the track display or details page. It has the
  # correct description in the table schema so it is ok to keep it for now.
# In the future, this column could be dropped if it is not useful.
# There are a number of columns that could be dropped as they are
# meaningless, but decided to keep them as the code for the details page
# expects them to be there.
  # 01/28/10 Received new data as previous run of the pipeline may have
  # produced incorrect results.
# 2010-02-02 Loader aborted because some lines had an empty field, so the
# loader read only 28 words instead of 29. E-mailed Tin to
# ask for the data to be fixed.
  # 2010-02-03 Received new data as the previous data had empty fields.
  # 2010-02-04 Loaded new data into hg19 database.
  # 2010-02-09 Received new data on 02/08/10 as there were more errors in the
  # code that caused the data to have empty fields.
  # 2010-02-19 Changed the posBasesHit column values to match those for hg18.
# 2011-09-26 Rebuilt, using corrected data (10th column) from the data provider.
  # In hg18, they are all 1000, but this is meaningless.
      mkdir /hive/data/genomes/hg19/bed/genomicSuperDups/0926_2011
      cd /hive/data/genomes/hg19/bed/genomicSuperDups/0926_2011
      wget --timestamping \
           ftp://mesh.gs.washington.edu/pub/UCSC/hg19genomicSuperDups.fixed.tab.gz
      gunzip hg19genomicSuperDups.fixed.tab.gz
      # Fix incorrect chromosome names in data. Check both chrom and otherChrom.
      # Previously, found several cases where the last letter of random was
      # missing for the names of the random contigs. They all look good this
      # time.
      awk '{print $1}' hg19genomicSuperDups.fixed.tab | sort | uniq > chroms
      awk '{print $7}' hg19genomicSuperDups.fixed.tab | sort | uniq > otherChroms
      hgsql -Ne 'select chrom from chromInfo;' hg19 | sort | uniq > chromInfo.txt
      comm -23 chroms chromInfo.txt
      comm -23 otherChroms chromInfo.txt
      # chroms and otherChroms match chromosome names in chromInfo.
  
      # The sed command is necessary to fix "_" used as strand to "-".
      # The awk command was necessary for some recent other species
      # genomicSuperDups that had some too-short regions.  It does not seem
      # to be necessary here, but doesn't hurt and may be useful in
      # future builds.
      hgsql -e 'drop table genomicSuperDups;' hg19
      sed -e 's/\t_\t/\t-\t/' hg19genomicSuperDups.fixed.tab \
      | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
      | hgLoadBed hg19 genomicSuperDups stdin \
        -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
  # Loaded 51599 elements of size 29
  # Sorted
  # Creating table definition for genomicSuperDups
  # Saving bed.tab
  # Loading hg19
  
      # 2009-11-05:
      # Updated details page with suggested text and an additional reference.
      # src/hg/makeDb/trackDb/genomicSuperDups.html
      # 2010-02-04: Updated the schema description as below in
      # src/hg/lib/genomicSuperDups.sql. Kept score as it is used in older
      # datasets e.g. on hg18 -
      # Suggestions by Tin Louie for the schema description:
  # I suggest that the description of those meaningless columns (on the webpage
  # 'Schema for Segmental Dups') be changed to "for future use". The meaningless
  # columns are:  score, posBasesHit, testResult, verdict, chits, ccov
  # The descriptions of other columns should be changed for clarification:
  # otherSize -- equal to otherEnd minus otherStart
  # uid -- id shared by the query & subject of a hit
       # 2010-02-19 Changed the posBasesHit column to be 1000. Checked with
       # data provider about doing this so that the values are the same as for
       # those in the hg18 table.
       # hgsql -e 'update genomicSuperDups set posBasesHit = 1000;' hg19
  # New corrected data fixed the above problem.
  ############################################################################
  # ADD LINK TO GENENETWORK (DONE. 12/02/09 Fan).
  
  # Received geneNetwork ID list file, GN_human_RefSeq.txt, for hg19 from GeneNetwork, Zhou Xiaodong [xiaodong.zhou@gmail.com].
  
      ssh hgwdev
      mkdir -p /cluster/data/hg19/bed/geneNetwork
      cd /cluster/data/hg19/bed/geneNetwork
  
      hgsql hg19 < ~/src/hg/lib/geneNetworkId.sql
      hgsql hg19 -e \
      'load data local infile "GN_human_RefSeq.txt" into table geneNetworkId'
  
  ########################################################################
  # EXONIPHY (2009-12-28, hartera, DONE)
  # 2010-01-05, hartera. Moved trackDb entry for exoniphy up to human directory
  # level as it is the same for all assemblies.
    # New predictions run by Melissa Hubisz (mjhubisz at gmail.com)
    # for hg19, sent by Adam Siepel (acs4 at cornell.edu) on 2009-12-18
      mkdir -p /hive/data/genomes/hg19/bed/exoniphy.2009-12-18
      cd /hive/data/genomes/hg19/bed/exoniphy.2009-12-18
      # Download predictions file, exoniphy.gff
      wget --timestamping \
           "http://compgen.bscb.cornell.edu/~acs/exoniphy.gff.gz"
      gunzip exoniphy.gff.gz
      # Remove table of lifted predictions from hg18
      hgsql -e 'drop table exoniphy;' hg19
      ldHgGene -genePredExt -gtf hg19 exoniphy exoniphy.gff
      # Read 620689 transcripts in 647299 lines in 1 files
      # 620689 groups 24 seqs 1 sources 4 feature types
      # 186601 gene predictions
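    # (Optional, not in the original log) validate the loaded table with
    # genePredCheck, as done for other gene tracks in this doc:
    genePredCheck -db=hg19 exoniphy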
      # Added a copy of the hg18 track description to trackDb/human/hg19 and
      # updated it and added a trackDb entry to the trackDb.ra there.
      # 2010-01-04. Moved exoniphy trackDb entry up to the human level
      # trackDb.ra since the entry is the same for hg16-19. Removed the entry in
      # trackDb.ra in each of those assembly directories.
  
  ########################################################################
  # Vega gene update (DONE - 2010-01-15 - Hiram)
    #	look up the version number at the Vega web site:
      #	http://vega.sanger.ac.uk/index.html
      #	and FTP site:
      #	ftp://ftp.sanger.ac.uk/pub/vega/
      cd /hive/data/genomes/hg19
      #	step wise to verify operation
      doEnsGeneUpdate.pl -vegaGene -ensVersion=36 -stop=download hg19.ensGene.ra
      doEnsGeneUpdate.pl -vegaGene -ensVersion=36 \
  	-continue=process -stop=process hg19.ensGene.ra
      doEnsGeneUpdate.pl -vegaGene -ensVersion=36 \
  	-continue=load -stop=load hg19.ensGene.ra
      doEnsGeneUpdate.pl -vegaGene -ensVersion=36 \
  	-continue=cleanup hg19.ensGene.ra
      featureBits hg19 vegaGene
      # 64888909 bases of 2897316137 (2.240%) in intersection
      featureBits hg19 vegaPseudoGene
      # 6885145 bases of 2897316137 (0.238%) in intersection
  
  ########################################################################
  # NHGRI GWAS CATALOG (DONE 2/4/13 angie)
  # NOTE: This assumes that the corresponding section in hg18.txt has just been run.
  #       It depends on the noCoords.tab file in the corresponding hg18 build directory.
  # 2013 updates: 2/4
  # 2012 updates: 12/10, 10/4, 8/1, 6/4, 4/4, 2/21 (remove extra whitespace, translate non-ASCII to html), 2/6
  # Updated 12/7/11, 11/2/11, 10/3/11, 9/2/11, 8/1/11, 6/9/11, 4/1/11 (last one to use snp131), 3/1/11, 2/1/11
  # Updated 12/7/10, 11/1/10, 10/6/10, 9/1/10, 8/2/10, 6/2/10, 5/12/10 (last one to use snp130)
  # Updated 4/1/10, 3/1/10
  # Originally done 1/19/10
      mkdir /hive/data/genomes/hg19/bed/gwasCatalog
      cd /hive/data/genomes/hg19/bed/gwasCatalog
      # Done once per dbSNP build, don't need to redo until next dbSNP build is released:
      zcat ../snp137/snp137.bed.gz | cut -f 1-4,6,8,18,21-24  \
      | sort -k4,4 \
        > snp137Coords.bed
      set today = `date +%y%m%d`
      mkdir /hive/data/genomes/hg19/bed/gwasCatalog/$today
      cd /hive/data/genomes/hg19/bed/gwasCatalog/$today
  
      # Mapping to hg19 by joining hg19 SNP coords with catalog flatfile (see hg18.txt)
      join -t "	" -1 4 ../snp137Coords.bed /hive/data/genomes/hg18/bed/gwasCatalog/$today/noCoords.tab \
          -o 1.1,1.2,1.3,1.4,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12,2.13,2.14,2.15,2.16,2.17,2.18,2.19,1.5,1.6,1.7,1.8,1.9,1.10,1.11 \
      | sort -k1,1 -k2n,2n \
          > gwasCatalogPlus.bed
      cut -f 1-22 gwasCatalogPlus.bed \
      | hgLoadBed hg19 gwasCatalog stdin \
          -tab -sqlTable=$HOME/kent/src/hg/lib/gwasCatalog.sql -notItemRgb -allowStartEqualEnd
  #Read 12194 elements of size 22 from stdin
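    # (Optional check, not in the original log): the count loaded above should
    # match the joined file:
    wc -l gwasCatalogPlus.bed
    hgsql hg19 -NBe 'select count(*) from gwasCatalog;'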
      # For David: find examples of risk alleles for which dbSNP observed
      # alleles are complementary (A/T or C/G) -- how do we know what strand the
      # risk allele is on??  -- asked corresp. author Teri Manolio.  Info is not
      # always available in the original publication, so sadly there is not always
      # a way to resolve these. GWAS catalog folks aren't going to modify their
      # database to add a column for these cases.
      hgsql hg19 -NBe 'select snp.name,gc.riskAllele,snp.strand,snp.refNcbi,snp.observed \
                       from gwasCatalog as gc, snp137 as snp \
                       where gc.riskAllele rlike "^rs[0-9]+-[ACGT]" and \
                             gc.name = snp.name and snp.observed in ("C/G", "A/T") \
                       order by gc.name;' > ambigStrand.txt
      wc -l ambigStrand.txt
  #1249 ambigStrand.txt
  
  
  ########################################################################
  # ailMel1 Panda alignment (DONE - 2010-02-04 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzAilMel1.2010-02-04
      cd /hive/data/genomes/hg19/bed/lastzAilMel1.2010-02-04
  
      cat << '_EOF_' > DEF
  # Human vs. Panda
  #	parameters from the Panda paper supplemental where they describe
  #	their lastz parameters
  BLASTZ_K=2200
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_H=2000
  BLASTZ_C=2
  BLASTZ_T=2
  
  # our usual M
  BLASTZ_M=50
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Panda
  SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit
  SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=50
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzAilMel1.2010-02-04
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
      #	real    434m21.792s
      cat fb.hg19.chainAilMel1Link.txt
      #	1453400264 bases of 2897316137 (50.164%) in intersection
  
      mkdir /hive/data/genomes/ailMel1/bed/blastz.hg19.swap
      cd /hive/data/genomes/ailMel1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzAilMel1.2010-02-04/DEF \
  	-swap -noLoadChainSplit -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    124m14.393s
      cat fb.ailMel1.chainHg19Link.txt
      #	1411953704 bases of 2245312831 (62.884%) in intersection
  
      cd /hive/data/genomes/ailMel1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # susScr1 Pig BLASTZ/CHAIN/NET (DONE - 2010-01-21 - Hiram)
      screen # use a screen to manage this multi-day job
      mkdir /hive/data/genomes/hg19/bed/lastzSusScr1.2010-01-21
      cd /hive/data/genomes/hg19/bed/lastzSusScr1.2010-01-21
  
      cat << '_EOF_' > DEF
  # Pig vs. Human
  BLASTZ_M=50
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Pig SusScr1
  SEQ2_DIR=/scratch/data/susScr1/susScr1.2bit
  SEQ2_LEN=/scratch/data/susScr1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzSusScr1.2010-01-21
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << this line keeps emacs coloring happy
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
      #	real    1072m48.949s
      cat fb.hg19.chainSusScr1Link.txt
      #	1198793067 bases of 2897316137 (41.376%) in intersection
  
      mkdir /hive/data/genomes/susScr1/bed/blastz.hg19.swap
      cd /hive/data/genomes/susScr1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzSusScr1.2010-01-21/DEF \
  	-swap -noLoadChainSplit -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
      #	real    119m14.040s
      cat fb.susScr1.chainHg19Link.txt
      #	1272787231 bases of 2231332019 (57.042%) in intersection
  
  #########################################################################
  # PERSONAL GENOME VARIANTS (DONE 12/29/09 giardine)
  # OBSOLETE -- see "PERSONAL GENOME VARIANTS - RELOAD" below.
      # This is Angie's attempt to reconstruct Belinda's steps:
      mkdir /hive/data/genomes/hg19/bed/pgSnpLiftOver
      cd /hive/data/genomes/hg19/bed/pgSnpLiftOver
      # liftOver track files in /hive/data/genomes/hg18/bed/pgSnp/
      set hg18Dir = /hive/data/genomes/hg18/bed/pgSnp
      set chainFile = /hive/data/genomes/hg18/bed/liftOver/hg18ToHg19.over.chain.gz
  
      #*** Are we losing insertions here?:
      foreach f (NA12878.pgSnp NA12891.pgSnp NA12892.pgSnp NA19240.pgSnp \
                 pgWatson.bed pgYri3.txt pgSnpYh.txt)
        liftOver $hg18Dir/$f $chainFile $f:r.hg19.pgSnp{,.unmapped}
      end
      liftOver $hg18Dir/koref.sub.pgSnp $chainFile koref.hg19.pgSnp{,.unmapped}
      # Why pgVenter2?
      liftOver $hg18Dir/pgVenter.bed $chainFile pgVenter2.hg19.pgSnp{,.unmapped}
  
      # remove variants that are homozygous matches to hg19
  cat > addRefNt.pl <<'_EOF_'
  #!/usr/bin/perl -w
  use strict;
  
  my $build = 'hg19';
  my $nib = "/hive/data/genomes/hg19/nib/";
  my $nibFrag = "nibFrag";
  
  while (<>) {
     chomp;
     my @f = split(/\t/);
     my $ref = '';
     if ($f[1] eq $f[2]) {
        $ref = '.'; #insertion nothing in ref
     }else {
        open(NIB, "$nibFrag $nib$f[0].nib $f[1] $f[2] + stdout |")
           or die "Couldn't run $nibFrag, $!\n";
        while(<NIB>) {
           chomp;
           if (/^>/) { next; }
           $ref .= $_;
        }
        close NIB or die "Couldn't close $nibFrag, $!\n";
     }
     #splice(@f, 3, 0, uc($ref));
     #print join("\t", @f), "\n";
     print join("\t", @f), "\t", uc($ref), "\n";
  }
  '_EOF_'
      # << emacs
      chmod a+x addRefNt.pl
      foreach f (*.pgSnp)
        addRefNt.pl $f \
        | perl -wpe '@w=split; s/^.*\n$// if ($w[3] eq $w[6]); s/\t\w+$//;' \
          > $f:r.filtered.pgSnp
      end
  #TODO: complete attempt to reverse-engineer Belinda's work
      BelindasFix pgYri3.hg19.filtered.pgSnp > pgYri3.hg19.filtered.fixed.pgSnp
  
      # Load into db:
    foreach i (NA12878 NA12891 NA12892 NA19240)
      hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
        hg19 pg$i $i.hg19.filtered.pgSnp
    end
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
      hg19 pgWatson pgWatson.hg19.filtered.pgSnp
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
      hg19 pgVenter pgVenter2.hg19.filtered.pgSnp
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
      hg19 pgYh1 pgSnpYh.hg19.filtered.pgSnp
      hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
        hg19 pgSjk koref.hg19.filtered.pgSnp
      hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
        hg19 pgYoruban3 pgYri3.hg19.filtered.fixed.pgSnp
  
  
  ########################################################################
  # CRG MAPABILITY (2010-01-19 - 2010-01-28, hartera, DONE)
# Data was provided by Thomas Derrien (thomas.derrien at crg.es) and Paolo
# Ribeca from the Guigo lab at the Center for Genomic Regulation (CRG) in
# Barcelona. Data was produced with their GEM mapper aligner by taking
# sliding k-mer windows of the human genome and mapping them back onto the
# genome with up to 2 mismatches. For each window, a mappability score
# S = 1/(number of matches found) is computed, and the bigWig file was
# created from this score.
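# For example, a k-mer that aligns to four genomic locations (within the
# 2-mismatch limit) scores S = 1/4 = 0.25; a uniquely mapping k-mer scores
# S = 1.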
  # 2010-01-26 Loaded tables and added data to /gbdb/
  # 2010-01-28 Changed the table names to have the "enc" prefix for consistency
  # going forward with hg19 ENCODE tracks. Added trackDb entry for this
  # ENCODE Mapability track.
  # 2010-02-05 Added a 40mer sequence subtrack received on 2010-02-04.
  # 2010-03-16 - 2010-03-18. Added metadata to trackDb for the subtracks and
  # added downloads for the bigWig data files.
  # 2010-04-28. Received new data from Thomas Derrien. Downloaded data and
  # added it to /gbdb/. A bug was found in a library used by bedGraphToBigWig so
  # sent a new binary to data providers and they re-created the bigWig files.
  # 2010-05-11. All ENCODE tracks need to be preceded by the wgEncode prefix now
  # on all assemblies. Update the file names in /gbdb/hg19/bbi and the
  # table names. (hartera)
  # 2010-05-12. Added 24mer track to trackDb entry. Updated downloads with the
  # new data.
      mkdir -p /hive/data/genomes/hg19/bed/crgMapability
      cd /hive/data/genomes/hg19/bed/crgMapability
  cat << 'EOF' > temp
  #!/bin/tcsh -ef
  http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg19_H.sapiens.genome.hg19.main.mappability-36.bw.bz2
  http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg19_H.sapiens.genome.hg19.main.mappability-50.bw.bz2
  http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg19_H.sapiens.genome.hg19.main.mappability-75.bw.bz2
  http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg19_H.sapiens.genome.hg19.main.mappability-100.bw.bz2
  'EOF'
  
      awk '{if ($0 ~ /#/) print; else print "wget --timestamping \"" $0 "\"";}' \
          temp > download.csh
      rm temp
      chmod +x download.csh
      ./download.csh >& download.log &
  
       # Add the data to /gbdb/ and load the file names into tables (2010-01-26)
       cd /hive/data/genomes/hg19/bed/crgMapability
       bunzip2 *.bz2
  
       # Add data to gbdb
       mkdir -p /gbdb/hg19/bbi/
     # Symlink files named crgMapabilityAlignXmer.bw into /gbdb/hg19/bbi
       # and load file name into a table - one per dataset. Each table
       # represents a subtrack.
       foreach f (`ls *.bw`)
          echo $f
          set g=`echo $f | cut -d "-" -f2`
          set num=`echo $g | cut -d "." -f1`
          set mer=`echo "${num}mer"`
          set nf=`echo "crgMapabilityAlign${mer}.bw"`
          echo $nf
          ln -s `pwd`/${f} /gbdb/hg19/bbi/${nf}
          hgsql hg19 -e "drop table if exists crgMapabilityAlign${mer}; \
       create table crgMapabilityAlign${mer} (fileName varchar(255) not null); \
       insert into crgMapabilityAlign${mer} values ('/gbdb/hg19/bbi/${nf}');"
       end
  
       # 2010-01-28
       # Renamed the tables to have a enc prefix for consistency going
       # forward with hg19.
       cd /hive/data/genomes/hg19/bed/crgMapability
       hgsql -Ne 'show tables like "crg%";' hg19 > tables.txt
       foreach t (`cat tables.txt`)
          set g=`echo $t | sed -e 's/c/C/'`
          hgsql -e "alter table ${t} rename enc${g};" hg19
       end
       # Added a trackDb entry for this ENCODE Mapability
       # track in kent/src/hg/makeDb/trackDb/human/hg19/trackDb.enc.ra
       # Copied track from the hg18/trackDb.wgEncode.ra entry.
       # use bigWigInfo to check min and max values.
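     # (Hypothetical spot check at this stage, not in the original log; the
     # files still carry the pre-rename crg prefix here):
     bigWigInfo /gbdb/hg19/bbi/crgMapabilityAlign36mer.bw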
  
       # Added data for a 40mer subtrack - 2010-02-05
       cd /hive/data/genomes/hg19/bed/crgMapability
       wget --timestamping \
  http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg19.main.mappability-40.bw.bz2
       bunzip2 H.sapiens.genome.hg19.main.mappability-40.bw.bz2
     ln -s `pwd`/H.sapiens.genome.hg19.main.mappability-40.bw \
/gbdb/hg19/bbi/crgMapabilityAlign40mer.bw
     hgsql hg19 -e "drop table if exists encCrgMapabilityAlign40mer; \
     create table encCrgMapabilityAlign40mer (fileName varchar(255) not null); \
     insert into encCrgMapabilityAlign40mer values \
('/gbdb/hg19/bbi/crgMapabilityAlign40mer.bw');"
      # Added a subtrack to trackDb/human/hg19/trackDb.enc.ra to the
      # Mapability track.
  
      # 2010-03-16 - 2010-03-18
      # Added metadata to the trackDb entries for the subtracks and
      # added downloads for these data files.
      mkdir -p /usr/local/apache/htdocs/goldenPath/hg19/encodeDCC/encMapability
      cd /usr/local/apache/htdocs/goldenPath/hg19/encodeDCC/encMapability
      cp -p /gbdb/hg19/bbi/crg*.bw .
      gzip crg*.bw
      # Copied over hg18/encodeDCC/wgEncodeMapability/preamble.html
      # and edited it to only mention the CRG dataset.
      # Run encodeDownloadsPage.pl to generate the index page for downloads.
    # It does not capture all the information, probably because the subtrack
    # name differs from the downloads file name, so change the file names,
    # re-load the tables and re-make the downloads.
      cd /hive/data/genomes/hg19/bed/crgMapability
      foreach f (`ls *.bw`)
         echo $f
         set g=`echo $f | cut -d "-" -f2`
         set num=`echo $g | cut -d "." -f1`
         set mer=`echo "${num}mer"`
         set of=`echo "crgMapabilityAlign${mer}.bw"`
         set nf=`echo "encCrgMapabilityAlign${mer}.bw"`
         echo $nf
         rm /gbdb/hg19/bbi/${of}
         ln -s `pwd`/${f} /gbdb/hg19/bbi/${nf}
         hgsql hg19 -e "drop table if exists encCrgMapabilityAlign${mer}; \
       create table encCrgMapabilityAlign${mer} (fileName varchar(255) not null); \
       insert into encCrgMapabilityAlign${mer} values ('/gbdb/hg19/bbi/${nf}');"
       end
  
       cd /usr/local/apache/htdocs/goldenPath/hg19/encodeDCC/encMapability
       rm crg*
       cp -p /gbdb/hg19/bbi/encCrg*.bw .
       gzip encCrg*.bw
       # Then run encodeDownloadsPages.pl
       /cluster/home/hartera/bin/encodeDownloadsPage.pl -db=hg19 -checksum \
            -preamble=preamble.html index.html .
  
       # Downloaded and added new bigWig files to /gbdb/hg19/bbi
       # (2010-04-28 and 2010-04-30, hartera). New files were created as
       # there was a bug in the older version of bedGraphToBigWig.
       cd /hive/data/genomes/hg19/bed/crgMapability
       rm temp download.csh download.log
  cat << 'EOF' > temp
  #!/bin/tcsh -ef
  http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg19.main.mappability-100.bw.bz2
  http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg19.main.mappability-24.bw.bz2
  http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg19.main.mappability-36.bw.bz2
  http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg19.main.mappability-40.bw.bz2
  http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg19.main.mappability-50.bw.bz2
  http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg19.main.mappability-75.bw.bz2
  'EOF'
       awk '{if ($0 ~ /#/) print; else print "wget --timestamping \"" $0 "\"";}' \
           temp > download.csh
       rm temp
       chmod +x download.csh
       ./download.csh >& download.log &
  
       # Add data to /gbdb/. The file names in /gbdb/ are the same as before
       # so the tables do not need to be reloaded.
       cd /hive/data/genomes/hg19/bed/crgMapability
       bunzip2 *.bz2
       foreach f (`ls *.bw`)
         echo $f
         set g=`echo $f | cut -d "-" -f2`
         set num=`echo $g | cut -d "." -f1`
         set mer=`echo "${num}mer"`
         set nf=`echo "encCrgMapabilityAlign${mer}.bw"`
         echo $nf
         rm /gbdb/hg19/bbi/${nf}
         ln -s `pwd`/${f} /gbdb/hg19/bbi/${nf}
       end
  
       # 2010-05-11, hartera. Re-name bigWig files and update tables
       # as all ENCODE tracks should now have the wgEncode prefix on all
       # assemblies.
      cd /hive/data/genomes/hg19/bed/crgMapability
      foreach f (`ls *.bw`)
         echo $f
         set g=`echo $f | cut -d "-" -f2`
         set num=`echo $g | cut -d "." -f1`
         set mer=`echo "${num}mer"`
         set of=`echo "encCrgMapabilityAlign${mer}.bw"`
         set nf=`echo "wgEncodeCrgMapabilityAlign${mer}.bw"`
         echo $nf
         rm /gbdb/hg19/bbi/${of}
         ln -s `pwd`/${f} /gbdb/hg19/bbi/${nf}
         hgsql hg19 -e "drop table if exists encCrgMapabilityAlign${mer}; \
       create table wgEncodeCrgMapabilityAlign${mer} (fileName varchar(255) not null); \
       insert into wgEncodeCrgMapabilityAlign${mer} values ('/gbdb/hg19/bbi/${nf}');"
       end
       # Then change the subtrack names to match the new table names in
       # kent/src/hg/makeDb/trackDb/human/hg19/trackDb.wgEncode.ra as
       # the contents of trackDb.enc.ra has been moved there.
  
       # 2010-05-12
       # Added subtrack for the new 24mer table.
       # Updated the downloads for the new data.
       cd /usr/local/apache/htdocs/goldenPath/hg19/encodeDCC
       # Change name of downloads directory to be consistent with the
       # new track name.
       mv encMapability wgEncodeMapability
       cd wgEncodeMapability
       rm encCrg* md5sum.txt
       cp -p /gbdb/hg19/bbi/wgEncodeCrg*.bw .
       gzip wgEncodeCrg*.bw
       # Then run encodeDownloadsPage.pl script to update the index.html and
       # regenerate the md5sum.txt file.
       encodeDownloadsPage.pl -db=hg19 -checksum index.html
  
  #####################################################################
  # tRNAs track (2010-01-13, Fan DONE)
  # tRNAs track (2010-03-10, Fan RE-BUILT WITH UPDATED DATA FROM TODD LOWE)
  #
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed
      mkdir tRNAs
      cd tRNAs
  
  # Get data files from /projects/lowelab/users/lowe/Browser/vertebrates/
  
      cp -p /projects/lowelab/users/lowe/Browser/vertebrates/hg19-tRNAs.bed .
      cp -p \
      /projects/lowelab/users/lowe/Browser/vertebrates/hg19_tRNAs_images.tar\
      .
  
      hgsql hg19 -e 'drop table if exists tRNAs'
      hgLoadBed -tab hg19 tRNAs hg19-tRNAs.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql
  
      mkdir gif
      cd gif
      tar -xvf ../hg19_tRNAs_images.tar
      mv image/* .
      rmdir image
      mkdir /hive/data/gbdb/hg19/RNA-img
      cp -p * /hive/data/gbdb/hg19/RNA-img
  
# tRNAs track (2015-10-04, Chris RE-BUILT WITH UPDATED DATA FROM TODD LOWE)
      cd /hive/data/genomes/hg19/bed/tRNAs
      cp /hive/users/pchan/gtrnadb2/Eukaryota/hg19/hg19-tRNAs.bed hg19-tRNAs2.bed
      hgsql hg19 -e 'drop table if exists tRNAs'
      hgLoadBed -tab hg19 tRNAs hg19-tRNAs2.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql
      mv gif gif.old
      mkdir gif
      cp -p /hive/users/pchan/gtrnadb2/Eukaryota/hg19/images/* gif
  
  #####################################################################
# calJac3 Marmoset BLASTZ/CHAIN/NET (DONE - 2010-02-11 - Hiram)
      screen # use a screen to manage this multi-day job
      mkdir /hive/data/genomes/hg19/bed/lastzCalJac3.2010-02-11
      cd /hive/data/genomes/hg19/bed/lastzCalJac3.2010-02-11
  
      cat << '_EOF_' > DEF
  # human vs. marmoset
  BLASTZ=lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# lastz does not like the O= and E= lines in the matrix file,
# so specify those items here:
  BLASTZ_O=600
  BLASTZ_E=150
  # other parameters from panTro2 vs hg18 lastz on advice from Webb
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Marmoset (calJac3)
  SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
  SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
  SEQ2_LIMIT=50
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzCalJac3.2010-02-11
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << this line keeps emacs coloring happy
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet \
  	-chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    287m24.258s
      cat fb.hg19.chainCalJac3Link.txt
      #	2047068864 bases of 2897316137 (70.654%) in intersection
  
      mkdir /hive/data/genomes/calJac3/bed/blastz.hg19.swap
      cd /hive/data/genomes/calJac3/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzCalJac3.2010-02-11/DEF \
  	-swap -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 &
      #	real    120m42.991s
      cat fb.calJac3.chainHg19Link.txt
      #	2030475813 bases of 2752505800 (73.768%) in intersection
  
  #############################################################################
  # MAKE PCR TARGET FOR UCSC GENES (Done Feb 26, 2010 -Jim)
      ssh hgwdev
      mkdir /cluster/data/hg19/bed/mrnaPcr
      cd /cluster/data/hg19/bed/mrnaPcr
      # First, get consistent FA and PSL for UCSC Genes.
      # Initially I tried to use files from /cluster/data/hg19/bed/ucsc.10/:
      # subColumn 10 /cluster/data/hg19/bed/ucsc.10/rnaToGenome.psl
      #   /cluster/data/hg19/bed/ucsc.10/txToAcc.tab ucscGenes.hg19.psl
      # /cluster/data/hg19/bed/ucsc.10/ucscGenes.fa
      # But the psl was not from exactly the same seq's as in the fa.
      # Jim's suggestion: use sequenceForBed to get genomic-translated
    # sequences, and then genePredToFakePsl.  sequenceForBed must be
    # run on hgwdev.
      genePredToBed /cluster/data/hg19/bed/ucsc.12/ucscGenes.gp > ucscGenes.bed
      hgsql hg19 -NBe 'select kgId,geneSymbol from kgXref' \
      | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
        > idSub.txt
      subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
      sequenceForBed -keepName -db=hg19 -bedIn=ucscGenesIdSubbed.bed \
        -fastaOut=stdout \
      | faToTwoBit stdin kgTargetSeq.2bit
      cut -f 1-10 /cluster/data/hg19/bed/ucsc.12/ucscGenes.gp \
      | genePredToFakePsl hg19 stdin kgTargetAli.psl /dev/null
  
    # Load up the UCSC Genes target PSL table and put the 2bit in /gbdb:
      cd /cluster/data/hg19/bed/mrnaPcr
      hgLoadPsl hg19 kgTargetAli.psl
      mkdir /gbdb/hg19/targetDb
      ln -s /cluster/data/hg19/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/hg19/targetDb/
  
      # Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
      # /gbdb/hg19/targetDb/kgTargetSeq.2bit .
  
      ssh hgwdev
      # Add records to hgcentraltest blatServers and targetDb:
      hgsql hgcentraltest -e \
        'INSERT into blatServers values ("hg19Kg", "blat12", 17807, 0, 1);'
      hgsql hgcentraltest -e \
        'INSERT into targetDb values("hg19Kg", "UCSC Genes", \
           "hg19", "kgTargetAli", "", "", \
           "/gbdb/hg19/targetDb/kgTargetSeq.2bit", 1, now(), "");'
  
  
  
  ########################################################################
  # DGV V10 (DATABASE OF GENOMIC VARIANTS) (DONE 11/10/10 angie - color change 2/22/11 #2917)
  # 2/22/11 color change (Bug #2917): swap blue and red; green -> brown
  # Old DGV format is obsolete; see the following section.
  #######################################################################
  # DGV (DATABASE OF GENOMIC VARIANTS) (DONE 8/10/16 angie)
  # Redmine #17351
  # previously done 9/13/13 #11200, 11/7/14 #14188, 8/18/15 #15767
      # Beta last updated 2/11/13 #9401
      # 9/13/13: DGV is back out of beta, and has added some columns again for the 7/2013
      # public release.  They have also requested that we make a composite track
      # with subtracks for supporting variants and merged variants.
      set today = `date +%y%m%d`
      mkdir -p /hive/data/genomes/hg19/bed/dgv/$today
      cd /hive/data/genomes/hg19/bed/dgv/$today
      set release = 2016-05-15
      wget http://dgv.tcag.ca/dgv/docs/GRCh37_hg19_variants_$release.txt
      wget http://dgv.tcag.ca/dgv/docs/GRCh37_hg19_supportingvariants_$release.txt
      # These are the latest columns; if any changes, update translateDgvPlus.pl:
      head -1 GRCh37_hg19*.txt
  #variantaccession        chr     start   end     varianttype     variantsubtype  reference       pubmedid        method  platform        mergedvariants  supportingvariants      mergedorsample  frequency       samplesize      observedgains   observedlosses  cohortdescription      genes   samples
      # Eyeball the categories of variants:
      cut -f 5,6 GRCh37_hg19_supportingvariants_$release.txt  | sort | uniq -c | head -100
  #1941208 CNV     deletion
  # 165948 CNV     duplication
  # 884153 CNV     gain
  #   8663 CNV     gain+loss
  #  75564 CNV     insertion
  #3488453 CNV     loss
  #  48428 CNV     mobile element insertion
  #   9059 CNV     novel sequence insertion
  #  12249 CNV     tandem duplication
  #    516 OTHER   complex
  #  30767 OTHER   inversion
  #   3707 OTHER   sequence alteration
  #      1 varianttype     variantsubtype
  
      cut -f 5,6 GRCh37_hg19_variants_$release.txt | sort | uniq -c | head -100
  # 134568 CNV     deletion
  #  29937 CNV     duplication
  #  47482 CNV     gain
  #   7062 CNV     gain+loss
  #  27603 CNV     insertion
  # 123868 CNV     loss
  #   4156 CNV     mobile element insertion
  #   8974 CNV     novel sequence insertion
  #   3703 CNV     tandem duplication
  #    578 OTHER   complex
  #   2652 OTHER   inversion
  #   2000 OTHER   sequence alteration
  #      1 varianttype     variantsubtype
  
      ~/kent/src/hg/utils/automation/translateDgvPlus.pl \
        GRCh37_hg19_supportingvariants_$release.txt > dgvSupporting.bed
      ~/kent/src/hg/utils/automation/translateDgvPlus.pl \
        GRCh37_hg19_variants_$release.txt > dgvMerged.bed
      hgLoadBed hg19 dgvSupporting dgvSupporting.bed \
        -sqlTable=$HOME/kent/src/hg/lib/dgvPlus.sql -renameSqlTable -tab
  #Read 6668715 elements of size 22 from dgvSupporting.bed
      hgLoadBed hg19 dgvMerged dgvMerged.bed \
        -sqlTable=$HOME/kent/src/hg/lib/dgvPlus.sql -renameSqlTable -tab
  #Read 392583 elements of size 22 from dgvMerged.bed
      rm bed.tab && gzip *.{txt,bed} &
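
    # (Optional post-load check, not part of the original log):
    checkTableCoords hg19 dgvMerged
    checkTableCoords hg19 dgvSupporting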
  
  
  #######################################################################
  # felCat4 Cat BLASTZ/CHAIN/NET (DONE  - 2010-06-07 - Chin)
      screen # use a screen to manage this multi-day job
      mkdir /hive/data/genomes/hg19/bed/lastzFelCat4.2010-06-07
      cd /hive/data/genomes/hg19/bed/lastzFelCat4.2010-06-07
  
      cat << '_EOF_' > DEF
  # human vs. cat
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Cat (felCat4)
  SEQ2_DIR=/scratch/data/felCat4/felCat4.2bit
  SEQ2_LEN=/scratch/data/felCat4/chrom.sizes
  SEQ2_LIMIT=50
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzFelCat4.2010-06-07
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << this line keeps emacs coloring happy
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet -noDbNameCheck \
  	-chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
    # The initial run stalled in the chainRun step.  After "para stop" and
    # "para freeBatch" in
    # /hive/data/genomes/hg19/bed/lastzFelCat4.2010-06-07/axtChain/run,
    # the run directory was removed and the job restarted from chainRun,
    # this time using the memk/swarm clusters:
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
           -continue chainRun \
           -syntenicNet -noDbNameCheck \
           -chainMinScore=3000 -chainLinearGap=medium \
           -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
           > do_chainRun.log 2>&1 &
      # real    245m43.063s
      # *** All done !  Elapsed time: 245m43s
      # *** Make sure that goldenPath/hg19/vsFelCat4/README.txt is accurate.
      # *** Add {chain,net}FelCat4 tracks to trackDb.ra if necessary.
  
      cat fb.hg19.chainFelCat4Link.txt
      #  1266003011 bases of 2897316137 (43.696%) in intersection
      # make it time independent and indicate that it is really done
      cd /hive/data/genomes/hg19/bed
      ln -s  lastzFelCat4.2010-06-07 lastz.felCat4
  
      # Swap
      mkdir /hive/data/genomes/felCat4/bed/blastz.hg19.swap
      cd /hive/data/genomes/felCat4/bed/blastz.hg19.swap
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzFelCat4.2010-06-07/DEF \
  	-swap -syntenicNet -noDbNameCheck \
  	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
      # real    432m36.917s
      cat fb.felCat4.chainHg19Link.txt
      #   1211702270 bases of 1990635005 (60.870%) in intersection
  
  #####################################################################
  # susScr2 Pig BLASTZ/CHAIN/NET (DONE - 2010-03-26,27 - Hiram)
      screen # use a screen to manage this multi-day job
      mkdir /hive/data/genomes/hg19/bed/lastzSusScr2.2010-03-26
      cd /hive/data/genomes/hg19/bed/lastzSusScr2.2010-03-26
  
      cat << '_EOF_' > DEF
  # Pig vs. Human
  BLASTZ_M=50
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Pig SusScr2
  SEQ2_DIR=/scratch/data/susScr2/susScr2.2bit
  SEQ2_LEN=/scratch/data/susScr2/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzSusScr2.2010-03-26
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << this line keeps emacs coloring happy
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
      #	Elapsed time: 842m23s
      cat fb.hg19.chainSusScr2Link.txt
      #	1198794058 bases of 2897316137 (41.376%) in intersection
  
      mkdir /hive/data/genomes/susScr2/bed/blastz.hg19.swap
      cd /hive/data/genomes/susScr2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzSusScr2.2010-03-26/DEF \
  	-swap -noLoadChainSplit -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
      #	Elapsed time: 112m40s
  
      cat fb.susScr2.chainHg19Link.txt
      #	1272785114 bases of 2231298548 (57.042%) in intersection
  
  #########################################################################
  # Vega gene update (DONE - 2010-04-07 - Hiram)
    #	look up the version number at the Vega web site:
      #	http://vega.sanger.ac.uk/index.html
      #	and FTP site:
      #	ftp://ftp.sanger.ac.uk/pub/vega/
      cd /hive/data/genomes/hg19
      #	step wise to verify operation
      doEnsGeneUpdate.pl -vegaGene -ensVersion=38 -stop=download hg19.ensGene.ra
      doEnsGeneUpdate.pl -vegaGene -ensVersion=38 \
  	-continue=process -stop=process hg19.ensGene.ra
  # genePredCheck -db=hg19 vegaPseudo.gp.gz
  # checked: 11590 failed: 0
  # genePredCheck -db=hg19 not.vegaPseudo.gp.gz
  # checked: 96345 failed: 0
  # genePredCheck -db=hg19 hg19.allGenes.gp.gz
  # checked: 107935 failed: 0
      doEnsGeneUpdate.pl -vegaGene -ensVersion=38 \
  	-continue=load -stop=load hg19.ensGene.ra
  # zcat: download/Homo_sapiens.VEGA.38.pep.all.fa.gz: unexpected end of file
      #	they changed their file name convention ...
      doEnsGeneUpdate.pl -vegaGene -ensVersion=38 \
  	-continue=cleanup hg19.ensGene.ra
      featureBits hg19 vegaGene
      # 74206453 bases of 2897316137 (2.561%) in intersection
      featureBits hg19 vegaPseudoGene
      # 8494715 bases of 2897316137 (0.293%) in intersection
  
  #####################################################################
  # oviAri1 Sheep BLASTZ/CHAIN/NET (DONE - 2010-04-16 - Chin)
      screen # use a screen to manage this multi-day job
      mkdir /hive/data/genomes/hg19/bed/lastzOviAri1.2010-04-16
      cd /hive/data/genomes/hg19/bed/lastzOviAri1.2010-04-16
  
      cat << '_EOF_' > DEF
  # Sheep vs. Human
  BLASTZ_M=50
  
# TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Sheep OviAri1
  SEQ2_DIR=/scratch/data/oviAri1/oviAri1.2bit
  SEQ2_LEN=/scratch/data/oviAri1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzOviAri1.2010-04-16
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << this line keeps emacs coloring happy
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -noLoadChainSplit -syntenicNet \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
          -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
      #   real    578m58.918s
      cat fb.hg19.chainOviAri1Link.txt
      #   878545517 bases of 2897316137 (30.323%) in intersection
  
      #   and the swap
      mkdir /hive/data/genomes/oviAri1/bed/blastz.hg19.swap
      cd /hive/data/genomes/oviAri1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzOviAri1.2010-04-16/DEF \
          -swap -noLoadChainSplit -syntenicNet \
          -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
          -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
      #   real    72m47.780s
  
      cat fb.oviAri1.chainHg19Link.txt
      #   824310420 bases of 1201271277 (68.620%) in intersection
  
  
  ########################################################################
  # H-Inv 7.0 Gene track (DONE - 2010-04-07 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/hinv
      cd /hive/data/genomes/hg19/bed/hinv
      ./hinvToBed12.pl go > broken.1.exons.txt
      hgLoadBed hg19 hinv70Coding fcdna.coding.bed
      #	Loaded 272257 elements of size 12
      featureBits hg19 hinv70Coding
      #	141717797 bases of 2897316137 (4.891%) in intersection
  
      hgLoadBed hg19 hinv70NonCoding fcdna.nonCoding.bed
      #	Loaded 22625 elements of size 12
      featureBits hg19 hinv70NonCoding
      #	1350960 bases of 2897316137 (0.047%) in intersection
  
      hgLoadBed hg19 hinv70PseudoGene fcdna.pseudoGene.bed
      #	Loaded 1166 elements of size 12
      featureBits hg19 hinv70PseudoGene
      #	1701647 bases of 2897316137 (0.059%) in intersection
  
      featureBits hg19 hinv70Coding hinv70PseudoGene
      #	619377 bases of 2897316137 (0.021%) in intersection
  
      featureBits hg19 hinv70Coding hinv70NonCoding
      #	912553 bases of 2897316137 (0.031%) in intersection
  
      featureBits hg19 hinv70PseudoGene hinv70NonCoding
      #	9642 bases of 2897316137 (0.000%) in intersection
  
  ########################################################################
  # Updating the ucscToEnsembl table (DONE - 2010-04-06 - Hiram)
      #	as of Ensembl V57, their naming scheme changed for the randoms
      cd /hive/data/genomes/hg19/bed/ucscToEnsembl
  cat ../../chrom.sizes | while read L
  do
      size=`echo $L | awk '{print $2}'`
      ucName=`echo $L | awk '{print $1}'`
      ensName=`echo $ucName | sed -e "s/^chrM/MT/; s/^chr//;"`
      case $ucName in
          chr17_ctg5_hap1) ensName="HSCHR17_1"
                  ;;
          chr4_ctg9_hap1) ensName="HSCHR4_1"
                  ;;
          chr6_apd_hap1) ensName="HSCHR6_MHC_APD"
                  ;;
          chr6_cox_hap2) ensName="HSCHR6_MHC_COX"
                  ;;
          chr6_dbb_hap3) ensName="HSCHR6_MHC_DBB"
                  ;;
          chr6_mann_hap4) ensName="HSCHR6_MHC_MANN"
                  ;;
          chr6_mcf_hap5) ensName="HSCHR6_MHC_MCF"
                  ;;
          chr6_qbl_hap6) ensName="HSCHR6_MHC_QBL"
                  ;;
          chr6_ssto_hap7) ensName="HSCHR6_MHC_SSTO"
                  ;;
          *_gl*)
  ensName=`echo $L | awk '{print $1}' | sed -e "s/^chr.*_gl/GL/; s/_random//"`
                  ;;
      esac
      echo -e "$ucName\t$ensName"
  done > ucscToEnsemblV57.tab
  
      hgsql hg19 -e 'delete from ucscToEnsembl where ucsc like "%";'
      hgsql hg19 -e \
  'LOAD DATA LOCAL INFILE "ucscToEnsemblV57.tab" INTO TABLE ucscToEnsembl'
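
    # (Hypothetical verification, not in the original log): no UCSC name should
    # be duplicated, and the table row count should match chrom.sizes:
    cut -f1 ucscToEnsemblV57.tab | sort | uniq -d
    wc -l ucscToEnsemblV57.tab ../../chrom.sizes
    hgsql hg19 -NBe 'select count(*) from ucscToEnsembl;'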
  
  
  ############################################################################
  # dbSNP BUILD 131 (SNP131) (DONE 5/25/10 angie - TWEAKED 8/4/10)
  # Originally done 4/15/10 -- updated 5/25 with corrected function codes from
  # dbSNP (b131_SNPContigLocusId_37_1.bcp.gz).
      # Set up build directory
      mkdir -p /hive/data/outside/dbSNP/131/{human,shared}
  
      # Get field encodings -- if there are changes or additions to the
      # encoding of the corresponding fields, you might need to update
      # snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also
      # hg/lib/snp125Ui.c).
      cd /hive/data/outside/dbSNP/131/shared
      alias wg wget --timestamping
      set ftpShared = ftp://ftp.ncbi.nih.gov/snp/database/shared_data
      wg $ftpShared/LocTypeCode.bcp.gz
      wg $ftpShared/SnpClassCode.bcp.gz
      wg $ftpShared/SnpFunctionCode.bcp.gz
      wg $ftpShared/SnpValidationCode.bcp.gz
      # Here is another source -- it is not as up-to-date as the above, but
      # our encodings (enums and sets in snp131.sql) are named more similar
      # to those in the 2005 ASN:
      # ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn
  
      ########################## DOWNLOAD #############################
      cd /hive/data/outside/dbSNP/131/human
      mkdir data schema rs_fasta
      # Get data from NCBI (anonymous FTP)
      set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database
      wg ftp://ftp.ncbi.nih.gov/snp/00readme.txt
      cd /hive/data/outside/dbSNP/131/human/data
      # ContigLoc table has coords, orientation, loc_type, and refNCBI allele
      wg $ftpSnpDb/organism_data/b131_SNPContigLoc_37_1.bcp.gz
      wg $ftpSnpDb/organism_data/b131_SNPContigLocusId_37_1.bcp.gz
      wg $ftpSnpDb/organism_data/b131_ContigInfo_37_1.bcp.gz
      # MapInfo has alignment weights
      wg $ftpSnpDb/organism_data/b131_SNPMapInfo_37_1.bcp.gz
      # SNP has univar_id, validation status and heterozygosity
      wg $ftpSnpDb/organism_data/SNP.bcp.gz
  
      # Get schema
      cd /hive/data/outside/dbSNP/131/human/schema
      wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz
      wg $ftpSnpDb/shared_schema/dbSNP_main_table.sql.gz
  
      # Get fasta files
      # using headers of fasta files for molType, class, observed
      cd /hive/data/outside/dbSNP/131/human/rs_fasta
      wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz
  
      ########################## LOAD NCBI TABLES #############################
      # Simplify names of data files -- strip version & extras to get
      # local canonical table names.
      cd /hive/data/outside/dbSNP/131/human/data
      foreach f (*.bcp.gz)
        set new = `echo $f \
                   | sed -e 's/^b131_SNP//; s/^b131_//; s/_37_1//; s/.bcp//;'`
        mv $f $new
        echo $new
      end
  
      cd /hive/data/outside/dbSNP/131/human/schema
      zcat human_9606_table.sql.gz \
      | perl -we '$/ = "\nGO\n\n\n"; \
          while (<>) { \
            next unless /^CREATE TABLE \[(b131_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_37_1)?\]/; \
            s/b131_(SNP)?//; s/_37_1//; \
            s/[\[\]]//g;  s/GO\n\n/;/;  s/smalldatetime/datetime/g; \
            s/ON PRIMARY//g;  s/COLLATE//g;  s/Latin1_General_BIN//g; \
            s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
            s/nvarchar/varchar/g;  s/set quoted/--set quoted/g; \
            s/(image|varchar\s+\(\d+\))/BLOB/g; \
            print; \
          }' \
        > table.sql
  
      # load on hgwdev (kolossus disk almost full, no more small cluster mysql5's):
      hgsql -e 'create database hg19snp131'
      cd /hive/data/outside/dbSNP/131/human/schema
      hgsql hg19snp131 < table.sql
      cd ../data
  
      # Avoid wasting space by excluding mappings to non-reference contigs (ContigInfo.group_label):
      zcat ContigInfo.gz | cut -f 12 | uniq | sort -u
  #CRA_TCAGchr7v2
  #Celera
  #GRCh37
  #Homo sapiens MT
  #HuRef
      foreach t (ContigInfo MapInfo ContigLocusId)
        zcat $t.gz \
        | egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \
        | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
        | hgLoadSqlTab -oldTable hg19snp131 $t placeholder stdin
      end
  
      # Compare contig list between our ctgPos and reference contigs in ContigInfo.
      # If they are identical, sweet, we probably have a $db/jkStuff/liftContigs.lft
      # or similar file to use below.  If they are not identical, need to make
      # lift file using available information.
      hgsql hg19 -N -B -e 'select contig from ctgPos;' \
      | sort > /tmp/1
      # (HuRef, Celera, CRA_TCAGchr7v2 grepped out above)
      hgsql hg19snp131 -N -B -e 'select contig_acc from ContigInfo;' | sort > /tmp/2
      diff /tmp/1 /tmp/2
      # Doh!  Completely different: ctgPos has GL*, ContigInfo has NC_* / NT_*
      # We will need to generate own liftUp file for N*_* contig IDs.
  
      # NC_001807 entrez sez "Record removed.This sequence was removed
      # since the accepted reference sequence for the Homo sapiens
      # mitochondrion is the rCRS/Mitomap sequence, which is now
      # available as the record NC_012920".
      # They align w/gaps on both q & t, so liftUp won't do, we need liftOver:
      blat -noHead NC_012920.fa /hive/data/genomes/hg19/M/chrM.fa stdout \
      | axtChain -psl -linearGap=medium stdin -faT NC_012920.fa /hive/data/genomes/hg19/hg19.2bit \
          NC_012920ToChrM.over.chain
  
      # NT_004350: entrez sez:
  #COMMENT     REFSEQ INFORMATION: Features on this sequence have been produced
  #            for build 37 version 1 of the NCBI's genome annotation [see
  #            documentation].   The reference sequence is identical to
  #            GL000003.1.
  
      # Using the contigs named in ContigInfo, screen-scrape genbank to get GL ID for contig ID.
      cp /dev/null contigToGl.txt
      foreach nt (`hgsql hg19snp131 -N -B -e 'select contig_acc from ContigInfo;'`)
      wget --quiet -O - 'http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?tool=portal&db=nuccore&val='$nt'&dopt=genbank&sendto=on' \
        | perl -we 'while (<>) { \
                      if (/^LOCUS/) { \
                        m/^LOCUS\s+'$nt'\s+(\d+) bp/ || die "parse ('$nt'): $_\t"; \
  		      $size = $1; \
                      } elsif (/^            (GL\d+)\.\d+\.$/) { \
                        print "'$nt'\t$1\t$size\n"; \
  		    } \
                    }' \
          >> contigToGl.txt
      end
      hgsql hg19 -NBe 'select chromStart, SUBSTRING_INDEX(contig, ".", 1), \
                         ctgPos.size, ctgPos.chrom, chromInfo.size \
                         from ctgPos,chromInfo \
                         where ctgPos.chrom=chromInfo.chrom order by contig' \
        > glToLift.txt
      sort -k2,2 contigToGl.txt \
      | join -1 2 -2 2 -t"	" -o 1.1,2.1,1.3,1.4,1.5 -a 2 -e MISSING \
          glToLift.txt - \
        > /hive/data/genomes/hg19/jkStuff/liftContigs.lft
  
      # Manually add NC_001807 -> chrM just in case:
      echo "0	NC_001807	16571	chrM	16571" \
        >> /hive/data/genomes/hg19/jkStuff/liftContigs.lft
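    # Reminder on the liftUp .lft columns used above:
    # <offset> <contigName> <contigSize> <chrom> <chromSize> -- so the line
    # just added places all 16571 bases of NC_001807 at offset 0 on chrM.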
      # Blat NC_012920 to chrM shows gaps, so we'll need to use liftOver chain created above.
  
      # Make sure there are no orient != 0 contigs among those selected.
      hgsql hg19snp131 -NBe \
        'select count(*) from ContigInfo where orient != 0;'
  #0
  
      # ContigLoc is huge, and we want just the reference contig mappings.
      # So, based on the reference & haplo ctg_id values in ContigInfo,
      # filter to get just the mappings for those contigs:
      zcat ContigLoc.gz \
      | awk '$3 <= 9 || $3 == 6647 || $3 >= 11178' \
      | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
      | hgLoadSqlTab -oldTable hg19snp131 ContigLoc placeholder stdin
  #Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 1
  #Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 2
  #Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 3
  #Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 4
  #load of ContigLoc did not go as planned: 27500025 record(s), 0 row(s) skipped, 3273 warning(s) loading /dev/stdin
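    # (Hypothetical helper, not in the original log): if dbSNP renumbers
    # contigs in a future build, the ctg_id cutoffs in the awk filter above
    # can be re-derived from the already-filtered ContigInfo:
    hgsql hg19snp131 -NBe \
      'select group_label, min(ctg_id), max(ctg_id) from ContigInfo \
       group by group_label;'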
      zcat SNP.gz \
      | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
      | hgLoadSqlTab -oldTable hg19snp131 SNP placeholder stdin
  #Warning 1366 Incorrect integer value: '' for column 'CpG_code' at row 1
  #Warning 1366 Incorrect integer value: '' for column 'map_property' at row 1
  #Warning 1264 Out of range value adjusted for column 'last_updated_time' at row 2
  #Warning 1366 Incorrect integer value: '' for column 'CpG_code' at row 2
  #Warning 1366 Incorrect integer value: '' for column 'map_property' at row 2
      # ... no big deal.
      foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
       echo -n "${t}:\t"
        hgsql -N -B hg19snp131 -e 'select count(*) from '$t
      end
  #ContigInfo:           260
  #ContigLoc:       27500025
  #ContigLocusId:   55347972
  #MapInfo:         23619373
  #SNP:    	  23653729
  
  
      #################### EXTRACT INFO FROM NCBI TABLES ####################
      # Glom each SNP's function codes together and load up a new hg19Snp131 table.
      # Also extract NCBI's annotations of coding SNPs' effects on translation.
      # We extract ContigLocusId info only for reference assembly mapping.
    # Some SNPs' functional annotations are for an alternate assembly, so we will
      # have no NCBI functional annotations to display for those (but our own are
      # available).
      cd /hive/data/outside/dbSNP/131/human
      # Add indices to tables for a big join (5 or 6 minutes):
      hgsql hg19snp131 -e \
        'alter table ContigInfo add index (ctg_id); \
         alter table ContigLocusId add index (ctg_id);'
      hgsql hg19snp131 -NBe 'select snp_id, ci.contig_acc, asn_from, asn_to, mrna_acc, \
                             fxn_class, reading_frame, allele, residue, codon, cli.ctg_id \
                             from ContigLocusId as cli, ContigInfo as ci \
                             where cli.ctg_id = ci.ctg_id;' \
        > ncbiFuncAnnotations.txt
      wc -l ncbiFuncAnnotations.txt
  #16835438 ncbiFuncAnnotations.txt
      # Ignore function code 8 (cds-reference, just means that some allele matches reference)
      # and glom functions for each SNP id:
      cut -f 1-4,6,11 ncbiFuncAnnotations.txt \
      | sort -u -k1n,1n -k6n,6n -k3n,3n -k5n,5n \
      | perl -we 'while (<>) { chomp; \
                    ($id, undef, $s, $e, $f, $c) = split; \
                    if (defined $prevId && $id == $prevId && $c == $prevC && $s == $prevS) { \
                      $prevFunc .= "$f," unless ($f == 8); \
                    } else { \
                      if (defined $prevId) { \
                        print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n" if ($prevFunc); \
                      } \
                      $prevFunc = ($f == 8) ? "" : "$f,"; \
                    } \
                    ($prevId, $prevC, $prevS, $prevE) = ($id, $c, $s, $e); \
                  } \
                  print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n" if ($prevFunc);' \
        > ucscFunc.txt
      wc -l ucscFunc.txt
  #10328697 ucscFunc.txt
      cat > ucscFunc.sql <<EOF
  CREATE TABLE ucscFunc (
          snp_id int NOT NULL ,
          ctg_id int(10) NOT NULL ,
          asn_from int(10) NOT NULL ,
          asn_to int(10) NOT NULL ,
          fxn_class varchar(255) NOT NULL ,
          INDEX snp_id (snp_id),
          INDEX ctg_id (ctg_id)
  );
  EOF
      hgLoadSqlTab hg19snp131 ucscFunc{,.sql,.txt}
      # 10/12/10: Those coords are NCBI's 0-based, fully-closed, 2-base-wide insertions.
      # We need to leave the coords alone here so ucscFunc can be joined below.
      # Make a list of SNPs with func anno's that are insertion SNPs, so we can use
      # the list to determine what type of coord fix to apply to each annotation
      # when making snp130CodingDbSnp below.
      hgsql hg19snp131 -NBe \
        'select ci.contig_acc, cl.asn_from, cl.asn_to, uf.snp_id \
         from ucscFunc as uf, ContigLoc as cl, ContigInfo as ci \
         where uf.snp_id = cl.snp_id and \
               uf.ctg_id = cl.ctg_id and uf.asn_from = cl.asn_from and uf.asn_to = cl.asn_to and \
               cl.loc_type = 3 and \
               cl.ctg_id = ci.ctg_id' \
        > ncbiFuncInsertions.ctg.bed
      wc -l ncbiFuncInsertions.ctg.bed
  #1165272 ncbiFuncInsertions.ctg.bed
  
    # Extract observed allele, molType and snp class from the ">gnl" FASTA headers.
      # 4/13: found some inconsistent headers in rs_chPAR.fas.gz vs. other rs_ch*,
      # reported to dbSNP, Lon said that rs_chPAR.fas.gz snuck in from build 130!
      rm /hive/data/outside/dbSNP/131/human/rs_fasta/rs_chPAR.fas.gz
      zcat /hive/data/outside/dbSNP/131/human/rs_fasta/rs_ch*.fas.gz \
      | grep '^>gnl' \
      | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
      | sort -nu \
        > ucscGnl.txt
  #520.305u 74.766s 7:02.48 140.8% 0+0k 0+0io 0pf+0w
      wc -l ucscGnl.txt
  #23653726 ucscGnl.txt
      cut -f 1 ucscGnl.txt | uniq | wc -l
  #23653726
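    # For reference, the headers parsed above look roughly like this (assumed
    # shape, reconstructed from the regex):
    #   >gnl|dbSNP|rs12345 ... mol="genomic"|class=1|alleles="A/G"|build...
    # which becomes:  12345 <tab> A/G <tab> genomic <tab> 1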
      cat > ucscGnl.sql <<EOF
  CREATE TABLE ucscGnl (
          snp_id int NOT NULL ,
          observed varchar(255) NOT NULL,
          molType varchar(255) NOT NULL,
          class varchar(255) NULL ,
          INDEX snp_id (snp_id)
  );
  EOF
      hgLoadSqlTab hg19snp131 ucscGnl{,.sql,.txt}
  
      # Add indices to tables for a big join (5 or 6 minutes):
      hgsql hg19snp131 -e \
        'alter table ContigLoc  add index (ctg_id); \
         alter table SNP        add index (snp_id); \
         alter table MapInfo    add index (snp_id);'
  
      # Big leftie join to bring together all of the columns that we want in snp131,
      # using all of the available joining info:
      hgsql hg19snp131 -NBe \
       'SELECT ci.contig_acc, cl.asn_from, cl.asn_to, cl.snp_id, cl.orientation, cl.allele, \
               ug.observed, ug.molType, ug.class, \
               s.validation_status, s.avg_heterozygosity, s.het_se, \
               uf.fxn_class, cl.loc_type, mi.weight, cl.phys_pos_from \
        FROM \
        ((((ContigLoc as cl JOIN ContigInfo as ci \
                 ON cl.ctg_id = ci.ctg_id) \
            LEFT JOIN MapInfo as mi ON mi.snp_id = cl.snp_id and mi.assembly = ci.group_label) \
           LEFT JOIN SNP as s ON s.snp_id = cl.snp_id) \
          LEFT JOIN ucscGnl as ug ON ug.snp_id = cl.snp_id) \
         LEFT JOIN ucscFunc as uf ON uf.snp_id = cl.snp_id and uf.ctg_id = cl.ctg_id \
                                  and uf.asn_from = cl.asn_from;' \
        > ucscNcbiSnp.ctg.bed
  #75.815u 13.622s 32:04.35 4.6%   0+0k 0+0io 0pf+0w
      wc -l ucscNcbiSnp.ctg.bed
  #27500025 ucscNcbiSnp.ctg.bed
      # Use liftUp for everything except mito, then liftOver for mito:
      # There are some weird cases of length=1 but locType=range... in all the cases
      # that I checked, the length really seems to be 1 so I'm not sure where they got
      # the locType=range.  Tweak locType in those cases so we can keep those SNPs:
      grep -vw ^NC_012920 ucscNcbiSnp.ctg.bed \
      | awk -F"\t" 'BEGIN{OFS="\t";}  $2 == $3 && $14 == 1 {$14=2; numTweaked++;}  {print;} \
             END{print numTweaked, "single-base, locType=range, tweaked locType" > "/dev/stderr";}' \
      | liftUp ucscNcbiSnp.bed \
        /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
  #2535    single-base, locType=range, tweaked locType
  #392.182u 27.358s 7:20.66 95.2%  0+0k 0+0io 0pf+0w
    # For liftOver, convert NCBI's 0-based fully-closed coords to 0-based
    # half-open, because liftOver doesn't deal with 0-length items.  Fake out
    # phys_pos_from to 0 because many coords will differ, oh well.
      grep -w NC_012920 ucscNcbiSnp.ctg.bed \
      | awk -F"\t" 'BEGIN{OFS="\t";} {$3 += 1; $16 = 0; print;}' \
      | liftOver -bedPlus=3 stdin NC_012920ToChrM.over.chain stdout chrM.unmapped \
      | awk -F"\t" 'BEGIN{OFS="\t";} {$3 -= 1; print;}' \
      | sort -k2n,2n \
        > chrMNcbiSnp.bed
  #3.479u 2.428s 0:53.57 10.9%     0+0k 0+0io 4pf+0w
    # Good, got all but 2 SNPs (rs28693675 and rs55749223, partially deleted / deleted in new)
      cat chrMNcbiSnp.bed >> ucscNcbiSnp.bed
      wc -l ucscNcbiSnp.bed
  #27500023 ucscNcbiSnp.bed
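    # Note the +1/-1 dance above: e.g. an NCBI fully-closed single-base SNP
    # with asn_from=100, asn_to=100 is widened to 100..101 so liftOver will
    # accept it, then shrunk back after mapping.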
  
      # Translate NCBI's encoding into UCSC's, and perform a bunch of checks.
      # This is where developer involvement is most likely as NCBI extends the
      # encodings used in dbSNP.
      cd /hive/data/outside/dbSNP/131/human/
      snpNcbiToUcsc ucscNcbiSnp.bed /hive/data/genomes/hg19/hg19.2bit snp131
  #spaces stripped from observed:
  #chr12   6093134 6093134 rs41402545
  #count of snps with weight  0 = 67535
  #count of snps with weight  1 = 23023681
  #count of snps with weight  2 = 472416
  #count of snps with weight  3 = 2536961
  #count of snps with weight 10 = 1399430
  #Skipped 7 snp mappings due to errors -- see snp131Errors.bed
  #173.162u 5.982s 7:57.91 37.4%   0+0k 0+0io 3pf+0w
      head snp131Errors.bed
  #chr13   32953907        32954033        rs80359736      rs80359736 is 126 bases long but refNCBI is different length: CATCATCAGATTTATATTCTCTGTTAACAGAAGGAAAGAGATACAGAATTTATCATCTTGCAACTTCAAAATCTAAAAGTAAATCTGAAAGAGCTAACAT
  #chr17   41223118        41223133        rs80359888      Missing observed value (deleted SNP?).
  #chr17   41245687        41245900        rs80359886      rs80359886 is 213 bases long but refNCBI is different length: AATATGCCTGGTAGAAGACTTCCTCCTCAGCCTATTCTTTTTAGGTGCTTTTGAATTGTGGATATTTAATTCGAGTTCCATATTGCTTATACTGCTGCTT
  #chr17   41245687        41245900        rs80359886      Missing observed value (deleted SNP?).
  #chr17   41276085        41276094        rs80359887      Missing observed value (deleted SNP?).
  #chrM    308     310     rs66492218      Unexpected coords for locType "between" (3) -- expected NCBI's chrEnd = chrStart+1.
  #chrM    308     310     rs66492218      rs66492218 is 2 bases long but refNCBI is different length: -
      wc -l snp*
  #  26033053 snp131.bed
  #        22 snp131.sql
  #         7 snp131Errors.bed
  #        18 snp131ExceptionDesc.tab
  #   4281351 snp131Exceptions.bed
      # 8M new snps, lots more exceptions than snp130 (had 2631563)
  
      # Make one big fasta file.
      # It's a monster: 18G!  Can we split by hashing rsId?
      zcat rs_fasta/rs_ch*.fas.gz \
      | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
        > snp131.fa
      # Check for duplicates.
      grep ^\>rs snp131.fa | sort > /data/tmp/seqHeaders
      wc -l /data/tmp/seqHeaders
  #23653726 /data/tmp/seqHeaders
      uniq /data/tmp/seqHeaders | wc -l
  #23653726
      # Use hgLoadSeq to generate .tab output for sequence file offsets,
      # and keep only the columns that we need: acc and file_offset.
      # Index it and translate to snpSeq table format.
      hgLoadSeq -test placeholder snp131.fa
  #23653726 sequences
  #128.364u 25.531s 10:52.02 23.6% 0+0k 0+0io 0pf+0w
      cut -f 2,6 seq.tab > snp131Seq.tab
      rm seq.tab
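    # Re: the "split by hashing rsId" question above -- not done this time;
    # a minimal sketch for next time (16 slices; file names hypothetical):
##    mkdir split
##    zcat rs_fasta/rs_ch*.fas.gz \
##    | perl -we 'for my $i (0..15) { \
##          open($fh[$i], "| gzip -c > split/snp131.$i.fa.gz") or die; \
##        } \
##        while (<>) { \
##          if (/^>gnl\|dbSNP\|rs(\d+) /) { $out = $fh[$1 % 16]; $_ = ">rs$1\n"; } \
##          die "data before first header" unless defined $out; \
##          print $out $_; \
##        }'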
  
      # Load up main track tables.
      cd /hive/data/outside/dbSNP/131/human
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
        hg19 snp131 -sqlTable=snp131.sql snp131.bed
  #Loaded 26033053 elements of size 17
  #162.666u 19.611s 8:53.56 34.1%  0+0k 0+0io 0pf+0w
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
        hg19 snp131Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
        snp131Exceptions.bed
  #Loaded 4281351 elements of size 5
  #32.020u 2.006s 1:22.87 41.0%    0+0k 0+0io 0pf+0w
      hgLoadSqlTab hg19 snp131ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
        snp131ExceptionDesc.tab
      # Load up sequences.
      mkdir -p /gbdb/hg19/snp
      ln -s /hive/data/outside/dbSNP/131/human/snp131.fa /gbdb/hg19/snp/snp131.fa
      hgLoadSqlTab hg19 snp131Seq ~/kent/src/hg/lib/snpSeq.sql snp131Seq.tab
  
      # Put in a link where one would expect to find the track build dir...
      ln -s /hive/data/outside/dbSNP/131/human /hive/data/genomes/hg19/bed/snp131
  
  #*** NOTE FOR NEXT TIME: ask cluster-admin to pack the snp131 table
  
      # Look at the breakdown of exception categories:
      cd /hive/data/outside/dbSNP/131/human
      cut -f 5 snp131Exceptions.bed | sort | uniq -c | sort -nr
  #3088435 MultipleAlignments
  # 886159 ObservedMismatch
  #  92341 SingleClassTriAllelic
  #  70184 SingleClassZeroSpan
  #  43319 ObservedTooLong
  #  25745 MixedObserved
  #  22606 SingleClassLongerSpan
  #  19681 SingleClassQuadAllelic
  #  15245 FlankMismatchGenomeShorter
  #   9808 DuplicateObserved
  #   4463 NamedDeletionZeroSpan
  #   2040 FlankMismatchGenomeLonger
  #    802 ObservedContainsIupac
  #    317 NamedInsertionNonzeroSpan
  #    142 FlankMismatchGenomeEqual
  #     62 RefAlleleMismatch
  #      1 RefAlleleRevComp
  #      1 ObservedWrongFormat
    # Compared to snp130, nice to see fewer dysfunctional locTypes (FlankMismatch*)
    # and SingleClassQuadAllelic -- major increases in most others though.
      # Sent a few bug reports to dbSNP
  
      # Tweaked 8/4/10 to correct missing-func (loophole fixed in perl that generates
      # ucscFunc.txt above).
    hgsql hg19 -e "update snp131 set func = 'unknown' where name = 'rs75946332' and func = '';"
  
  
  
  ############################################################################
  # SPLIT SNP131 INTO CLINICAL / NON-CLINICAL (DONE 8/19/10 angie)
  # http://redmine.soe.ucsc.edu/issues/559
      cd /hive/data/outside/dbSNP/131/human/data
      wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/snp/database/organism_data/human_9606/SNP_bitfield.bcp.gz
      # I did a little analysis of the bitfields -- see file
      # /hive/data/outside/dbSNP/131/human/data/bitfield_breakdown.txt .
      cd /hive/data/outside/dbSNP/131/human
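    # (bit 0x40 in the file's 4th column is the flag used below to pick out
    # clinical / LSDB-linked rs IDs -- see bitfield_breakdown.txt above)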
      zcat data/SNP_bitfield.bcp.gz \
      | perl -wpe '@w = split;  if ($w[3] & 0x40) { $_ = "rs$w[0]\n" } else { $_ = ""; }' \
        > clinicalRsIds.txt
      wc -l clinicalRsIds.txt
  #16605 clinicalRsIds.txt
      grep -Fwf clinicalRsIds.txt snp131.bed > snp131Clinical.bed
      wc -l snp131Clinical.bed
  #14907 snp131Clinical.bed
      # Wow, just a subset have been mapped to hg19, bummer.
      grep -vFwf clinicalRsIds.txt snp131.bed > snp131NonClinical.bed
      wc -l snp131*Clinical.bed
  #     14907 snp131Clinical.bed
  #  26018146 snp131NonClinical.bed
  #  26033053 total
      # Good, 26033053 is the right total.
      # Edit snp131.sql to use table name "snp131Tmp" so we don't nuke snp131.
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
        hg19 snp131Clinical -sqlTable=snp131.sql -renameSqlTable snp131Clinical.bed
  #Loaded 14907 elements of size 17
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
        hg19 snp131NonClinical -sqlTable=snp131.sql -renameSqlTable snp131NonClinical.bed
  #Loaded 26018146 elements of size 17
      # 9/1/10: Don't forget to fix the empty-func bug!
    hgsql hg19 -e 'update snp131Clinical set func = "unknown" where func = "";'
  
  
  ############################################################################
  # SPLIT SNP131 INTO MULTIMAPPED / HAPMAP+1000GENOMES / MISC (DONE 11/5/10 angie)
  # another swipe at http://redmine.soe.ucsc.edu/issues/559
      cd /hive/data/outside/dbSNP/131/human
  
      # First, separate out the "SNPs" that map to multiple genomic loci:
      grep -Fw MultipleAlignments snp131Exceptions.bed | wc -l
  #3088435
      grep -Fw MultipleAlignments snp131Exceptions.bed \
      | cut -f 4 \
      | sort -u > multipleMappingIds.txt
      wc -l multipleMappingIds.txt
  #978068 multipleMappingIds.txt
      grep -Fwf multipleMappingIds.txt snp131.bed > snp131NonUnique.bed
      wc -l snp131NonUnique.bed
  #3088435 snp131NonUnique.bed
      grep -vFwf multipleMappingIds.txt snp131.bed > snp131Unique.bed
      wc -l snp131Unique.bed
  #22944618 snp131Unique.bed
  
      # Next, separate the uniquely mapped SNPs into HapMap and/or 1000 Genomes
      # (the ones that we are certain have been observed in a large number of
      # apparently healthy samples) vs. other ones (rarer SNPs, maybe clinical SNPs).
  
  #*** NOTE *** To do this right, we need to get allele freq data from dbSNP and use it
  #             as a filter here:
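#  A sketch of that freq-based filter for next time, assuming a two-column
#  rsId<tab>MAF file extracted from dbSNP's allele-frequency tables (the
#  file name here is hypothetical):
##    awk '$2 >= 0.01 {print $1;}' rsIdMinorAlleleFreq.txt > commonFreqRsIds.txt
##    grep -Fwf commonFreqRsIds.txt snp131Unique.bed > snp131Common.bed
#  For now, approximate "common" using the validation status below.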
  
      egrep -w 'by-hapmap|by-1000genomes' snp131Unique.bed > snp131Common.bed
      wc -l snp131Common.bed
  #12750453 snp131Common.bed
      egrep -vw 'by-hapmap|by-1000genomes' snp131Unique.bed > snp131Misc.bed
      wc -l snp131Misc.bed
  #10194165 snp131Misc.bed
  
      # How many "clinical" SNPs (i.e. included in a Locus-Specific Database) are in the common set?
      grep -Fwf clinicalRsIds.txt snp131Common.bed > snp131CommonButClinical.bed
      wc -l snp131CommonButClinical.bed
  #2373 snp131CommonButClinical.bed
      calc 2373 / 16605
  #2373 / 16605 = 0.142909
      # A higher percentage than I expected... are 15% of OMIM / LSDB SNPs common?
      # Spot-checking, OMIM will mention SNPs from GWAS, or as endpoints of an interval...
      # the SNPs are clearly common but sometimes common variants are associated with
      # a disorder.
      # How many "clinical" SNPs in the NonUnique set?
      grep -Fwf clinicalRsIds.txt snp131NonUnique.bed | wc -l
  #901
  
      # Load tables:
      foreach t (snp131Common snp131Misc snp131NonUnique)
        hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
          hg19 $t -sqlTable=snp131.sql -renameSqlTable $t.bed
      end
  #Reading snp131Common.bed
  #Loaded 12750453 elements of size 17
  #Reading snp131Misc.bed
  #Loaded 10194165 elements of size 17
  #Reading snp131NonUnique.bed
  #Loaded 3088435 elements of size 17
  
  
  ############################################################################
# ORTHOLOGOUS ALLELES IN CHIMP, ORANGUTAN AND MACAQUE FOR SNP131 (DONE 6/3/10 angie)
  # First done 4/15/10.  Then found that SNPs that appeared on both a main chrom
  # (like chr6) and on a haplo chrom (like chr6_cox_hap2) were being flagged
# as multiple alignments when they shouldn't be, excluding them from this.
  # Regenerated exceptions, then regenerated this.
      mkdir /hive/data/genomes/hg19/bed/snp131Ortho
      cd /hive/data/genomes/hg19/bed/snp131Ortho
  
    # Following Heather's lead in snp126orthos, filter SNPs to keep
      # only those with class=single, length=1, chrom!~random;
      # Exclude those with exceptions MultipleAlignments,
      # SingleClassTriAllelic or SingleClassQuadAllelic.
      # Unlike snp masking, we do not filter for weight -- don't know why.
      awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
        /hive/data/outside/dbSNP/131/human/snp131Exceptions.bed \
      | sort -u \
        > snp131ExcludeIds.txt
      awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \
        /hive/data/outside/dbSNP/131/human/snp131.bed \
      | grep -vFwf snp131ExcludeIds.txt \
        > snp131Simple.bed
  #333.829u 11.879s 3:57.31 145.6% 0+0k 0+0io 0pf+0w
      wc -l snp131Simple.bed
  #17784981 snp131Simple.bed
  #with too many SNPs excluded, was: 17337248 snp131Simple.bed
  
      # Glom all human info that we need for the final table onto the
      # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
      awk 'BEGIN{OFS="\t";} \
          {print $1, $2, $3, \
                 $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
                 0, $6;}' \
        snp131Simple.bed > snp131ForLiftOver.bed
  
      # Map coords to chimp using liftOver.
      mkdir run.liftOChimp
      cd run.liftOChimp
      mkdir split out
      splitFile ../snp131ForLiftOver.bed 25000 split/chunk
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro2.over.chain.gz \
          \{check out exists out/panTro2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      ssh swarm
      cd /hive/data/genomes/hg19/bed/snp131Ortho/run.liftOChimp
      para make jobList
  #Completed: 712 of 712 jobs
  #CPU time in finished jobs:     127853s    2130.88m    35.51h    1.48d  0.004 y
  #IO & Wait Time:                 11528s     192.14m     3.20h    0.13d  0.000 y
  #Average job time:                 196s       3.26m     0.05h    0.00d
  #Longest finished job:             506s       8.43m     0.14h    0.01d
  #Submission to last job:           676s      11.27m     0.19h    0.01d
  
      # Map coords to orangutan using liftOver.
      mkdir ../run.liftOPon
      cd ../run.liftOPon
      mkdir out
      ln -s ../run.liftOChimp/split .
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
          \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      para make jobList
  # on pk:
  #Completed: 712 of 712 jobs
  #CPU time in finished jobs:     230882s    3848.03m    64.13h    2.67d  0.007 y
  #IO & Wait Time:                  3660s      61.00m     1.02h    0.04d  0.000 y
  #Average job time:                 329s       5.49m     0.09h    0.00d
  #Longest finished job:            1019s      16.98m     0.28h    0.01d
  #Submission to last job:          1667s      27.78m     0.46h    0.02d
  
      # Map coords to macaque using liftOver.
      mkdir ../run.liftOMac
      cd ../run.liftOMac
      mkdir out
      ln -s ../run.liftOChimp/split .
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
          \{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      para make jobList
  #Completed: 712 of 712 jobs
  #CPU time in finished jobs:     281168s    4686.14m    78.10h    3.25d  0.009 y
  #IO & Wait Time:                 22164s     369.39m     6.16h    0.26d  0.001 y
  #Average job time:                 426s       7.10m     0.12h    0.00d
  #Longest finished job:             868s      14.47m     0.24h    0.01d
  #Submission to last job:           872s      14.53m     0.24h    0.01d
  
      cd /hive/data/genomes/hg19/bed/snp131Ortho
      # Concatenate the chimp results, sorting by chimp pos in order to
      # efficiently access 2bit sequence in getOrthoSeq.  The output of
      # that is then sorted by the glommed human info field, so that we
      # can use join to combine chimp and macaque results in the next step.
      # Ditto for macaque and orangutan.  Each command pipe takes ~6 minutes:
      sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
      | sort > panTro2.orthoGlom.txt
      sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
      | sort > ponAbe2.orthoGlom.txt
      sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
      | sort > rheMac2.orthoGlom.txt
      wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
  #  16641106 panTro2.orthoGlom.txt
  #  15796202 ponAbe2.orthoGlom.txt
  #  14289736 rheMac2.orthoGlom.txt
  #was:  16230258 panTro2.orthoGlom.txt
  #was:  15535287 ponAbe2.orthoGlom.txt
  #was:  13996256 rheMac2.orthoGlom.txt
  
    # Use the glommed name field as a key to join up chimp and orangutan
    # allele data.  Include glommed name from both files because if only
    # file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
    # in the orthoGlom fields from each file, which are in the same order
    # as the chimp, orangutan and macaque columns of snp131OrthoPt2Pa2Rm2.
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \
      | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
              else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
        > tmp.txt
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        tmp.txt rheMac2.orthoGlom.txt \
      | perl -wpe 'chomp; \
          ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
          $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
          ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
            split(/\|/, $glomKey); \
          $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
          $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
          print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                           $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
          s/^.*$//;' \
      | sort -k1,1 -k2n,2n > snp131OrthoPt2Pa2Rm2.bed
  #437.114u 37.309s 6:33.92 120.4% 0+0k 0+0io 0pf+0w
      wc -l snp131OrthoPt2Pa2Rm2.bed
  #17276174 snp131OrthoPt2Pa2Rm2.bed
  #was: 16842459 snp131OrthoPt2Pa2Rm2.bed
  
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
        -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
        hg19 snp131OrthoPt2Pa2Rm2 snp131OrthoPt2Pa2Rm2.bed
  #Loaded 17276174 elements of size 22
  #123.287u 13.079s 8:17.88 27.3%  0+0k 0+0io 0pf+0w
  
      # Cleanup:
      nice gzip snp131Simple.bed snp131ExcludeIds.txt snp131ForLiftOver.bed
      rm -r run*/split tmp.txt *.orthoGlom.txt bed.tab
  
  
  ############################################################################
  # DBSNP CODING ANNOTATIONS (DONE 10/12/10 angie)
  # Updated 10/12/10 - redone w/corrected genome coords (Redmine Track #1249)
  # Updated 5/25/10 with corrected function codes (b131_SNPContigLocusId_37_1.bcp.gz).
  # Updated 4/16 - redone w/snp131, using mapping locations of dbSNP's func. annos.
  #                found some strange function codes and notified dbSNP.
  # originally done 6/2/09
      cd /hive/data/outside/dbSNP/131/human
      # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed.
      # For anything except an insertion (0 bases between flanks),
      # we need to add 1 to the end coord.  For an insertion, we need
      # to add 1 to the start coord.  Make a hash of the insertion IDs,
      # then look up each ID in ncbiFuncAnnotations.txt to tell which
      # transform to apply.
      # Note: sort -u with the keys below is too restrictive -- we need full line uniq.
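    # Worked example of the transform below: a substitution annotated at
    # NCBI's fully-closed [100,100] becomes half-open [100,101] (end+1),
    # while an insertion annotated as [100,101], whose bed4 line appears in
    # ncbiFuncInsertions.ctg.bed, becomes the zero-length point [101,101]
    # (start+1).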
      perl -we 'open($IDS, "ncbiFuncInsertions.ctg.bed") || die "ids: $!"; \
                while (<$IDS>) { chomp; $ids{$_} = 1; } \
                close($IDS); \
                %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 44=>1, 45=>1); \
                while (<>) { \
                  chomp;  @w = split("\t"); # id, ctg, start, end, ... \
                  next unless $coding{$w[5]}; \
                  $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \
                  if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \
                    $w[2]++; # 2-base insertions: increment start coord \
                  } else { \
                    $w[3]++; # increment end coord to get half-open \
                  } \
                  print join("\t", @w) . "\n"; \
                }' ncbiFuncAnnotations.txt \
      | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \
      | uniq \
        > ncbiCodingAnnotations.txt
      wc -l ncbiCodingAnnotations.txt
  #950616 ncbiCodingAnnotations.txt
  
      # How many & what kinds of function types?
      cut -f 6 ncbiCodingAnnotations.txt \
      | sort -n | uniq -c
  # 168639 3   (coding-synon)
  # 443419 8   (cds-reference -- ignored)
#      1 9   (coding-synonymy-unknown: rs80359842)
  #   9790 41  (nonsense)
  # 261982 42  (missense)
  #  65656 44  (frameshift)
  #   1129 45  (cds-indel)
      # Gather up multiple annotation lines into one line per {snp, gene, frame}:
      perl -e  'while (<>) { chomp; \
                  my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \
                  if (defined $lastRs && \
                      ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \
                       $lastTx ne $txId || $lastFrm ne $frm)) { \
                    if (defined $refRow) { \
                      $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                      $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
                    } \
                    print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                          "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
                    $refRow = undef;  @rows = ();  ($count, $fxns, $nts, $codons, $aas) = (); \
                  } \
                  ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \
                      ($rsId, $ctg, $s, $e, $txId, $frm); \
                  $count++; \
                  if ($fxn == 8) { \
                    $refRow = [$fxn, $nt, $aa, $codon]; \
                  } else { \
                   $fxns .= "$fxn,";  $nts .= "$nt,";  $aas .= "$aa,";  $codons .= "$codon,"; \
                  } \
                } \
                if (defined $refRow) { \
                  $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                  $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
                } \
                print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                      "$count\t$fxns\t$nts\t$codons\t$aas\n";' \
        ncbiCodingAnnotations.txt \
      | liftUp snp131CodingDbSnp.bed /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
      hgLoadBed hg19 snp131CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
        -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
        snp131CodingDbSnp.bed
  #Loaded 443454 elements of size 11
  
  
  ############################################################################
  # SNPMASKED SEQUENCE FOR SNP131 (DONE 5/27/10 angie)
      mkdir /hive/data/genomes/hg19/snp131Mask
      cd /hive/data/genomes/hg19/snp131Mask
  
      # Identify rsIds with various problems -- we will exclude those.
      # MultipleAlignments is kinda broad because anything that maps on
      # both chrN and chrN_foo_hap1 will be excluded... similarly, extra
      # matches on chrN_random might disqualify good matches on chrN.
      # Well, erring on the side of caution is good.
      awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \
        /hive/data/outside/dbSNP/131/human/snp131Exceptions.bed \
        | sort -u \
        > snp131ExcludeRsIds.txt
      time grep -vFwf snp131ExcludeRsIds.txt \
        /hive/data/outside/dbSNP/131/human/snp131.bed \
        > snp131Cleaned.bed
  #193.507u 5.203s 4:07.62 80.2%   0+0k 0+0io 0pf+0w
  
      # Substitutions:
      mkdir substitutions
      snpMaskSingle snp131Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout \
      | faSplit byname stdin substitutions/
      # 180 warnings about differing observed strings at same base position --
      # saved as diffObserved.txt.
  #Masked 17377658 snps in 17377491 out of 3099287100 genomic bases
  #/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3099287100 (difference is 37874164)
  #52.903u 10.964s 4:49.08 22.0%   0+0k 0+0io 3pf+0w
      # Check that 37874164 is the total #bases in sequences with nothing in snp131Cleaned:
      grep -Fw single snp131Cleaned.bed | cut -f 1 | uniq > /tmp/1
      grep -vwf /tmp/1 ../chrom.sizes
      grep -vwf /tmp/1 ../chrom.sizes \
      | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
  #37874164
  #TODO: send list to dbSNP.
      # Make sure that sizes are identical, first diffs are normal -> IUPAC,
      # and first diffs' case is preserved:
      foreach f (substitutions/chr*.fa)
        faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ"
      end
  #chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 10491 (y != c)
  #chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 61004 (r != a)
  #...
  #(output OK -- ambiguous bases replacing [agct] at SNP positions)
      foreach f (substitutions/chr*.fa)
        echo $f:t:r
        mv $f $f:r.subst.fa
        gzip $f:r.subst.fa
      end
  
      # Insertions:
      mkdir insertions
      snpMaskAddInsertions snp131Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout \
      | faSplit byname stdin insertions/
  #Added 2496221 snps totaling 5939697 bases to 3098816404 genomic bases
  #/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3098816404 (difference is 38344860)
  #52.764u 12.593s 3:55.47 27.7%   0+0k 0+0io 2pf+0w
      # Again, that just means that some chroms didn't have filtered SNPs.
      # Make sure that all sizes have increased relative to original:
      foreach f (insertions/chr*.fa)
        echo -n "${f:t:r}: "
        faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ" \
        |& perl -we '$_=<>; \
             if (/^\w+ in \S+ has (\d+) bases.  \w+ in \S+ has (\d+) bases/) { \
               if ($1 > $2) {print "OK: ins size $1 > $2\n";} \
               else {die "ERROR: ins size $1 <= $2\n";} \
             } else {die $_;}'
      end
  #chr1: OK: ins size 249717078 > 249250621
  #chr10: OK: ins size 135805198 > 135534747
  #...
  #(output OK -- new sizes > old)
      foreach f (insertions/chr*.fa)
        mv $f $f:r.ins.fa
        gzip $f:r.ins.fa
      end
  
      # Deletions:
      mkdir deletions
      snpMaskCutDeletions snp131Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout \
      | faSplit byname stdin deletions/
  #Cut 1522178 snps totaling 3455905 bases from 3098701788 genomic bases
  #/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3098701788 (difference is 38459476)
  #114.251u 20.911s 4:24.26 51.1%  0+0k 0+0io 3pf+0w
      # Again, that just means that some chroms didn't have filtered SNPs.
      # Make sure that all sizes have decreased relative to original:
      foreach f (deletions/chr*.fa)
        echo -n "${f:t:r}: "
        faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ" \
        |& perl -we '$_=<>; \
             if (/^\w+ in \S+ has (\d+) bases.  \w+ in \S+ has (\d+) bases/) { \
               if ($1 < $2) {print "OK: del size $1 < $2\n";} \
               else {die "ERROR: del size $1 >= $2\n";} \
             } else {die $_;}'
      end
  #chr1: OK: del size 248968549 < 249250621
  #chr10: OK: del size 135378065 < 135534747
  #...
  #(output OK -- del sizes < old)
      foreach f (deletions/chr*.fa)
        mv $f $f:r.del.fa
        gzip $f:r.del.fa
      end
  
      # Clean up and prepare for download:
      gzip snp131Cleaned.bed
      foreach d (substitutions insertions deletions)
        pushd $d
          md5sum *.gz > md5sum.txt
          cp /hive/data/genomes/hg18/snp130Mask/$d/README.txt .
        popd
      end
      # Edit the README.txt in each subdir.
  
      # Create download links on hgwdev.
      # NOTE: Currently we offer only the substitutions.
      # If we get any user requests, then maybe we can put the insertions
      # and deletions out there.
      mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp131Mask
      ln -s /hive/data/genomes/hg19/snp131Mask/substitutions/* \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp131Mask/
  ## If there is user demand for ins & del, then start over with an empty
  ## goldenPath/snp131Mask and do this:
  ##    foreach type (substitutions insertions deletions)
  ##      mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp131Mask/$type
  ##      ln -s /hive/data/genomes/hg19/snp131Mask/$type/* \
  ##        /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp131Mask/$type/
  ##    end
  
  
  ##############################################################################
  #  RE-BUILD sno/miRNA TRACK (DONE - 04-20-2010 - Chin)
  
    # The data in this track are out of date, so update the track.
      mkdir -p /hive/data/genomes/hg19/bed/wgRna-2010-04-20
      cd /hive/data/genomes/hg19/bed/wgRna-2010-04-20
  
    # Download the GFF file of the latest miRNA annotations from miRBase at
    # ftp://mirbase.org/pub/mirbase/CURRENT/. This was Release 14.0
    # (September 2009).
    # 04-27-2010: got the newer miRNA release, 15.
      wget --timestamping \
           ftp://mirbase.org/pub/mirbase/CURRENT/genomes/hsa.gff
      # Re-format, need to add "chr" to the beginning of each line.
      sed -e 's/^/chr/' hsa.gff > hsMirBaseFormat.gff
      # Remove extra "chr" in comment lines
      perl -pi.bak -e 's/chr#/#/' hsMirBaseFormat.gff
      # Change chrMT to chrM
      perl -pi.bak -e 's/chrMT/chrM/' hsMirBaseFormat.gff
      # Remove all but ID name in last field
      sed -e 's/\";//g' hsMirBaseFormat.gff | sed -e 's/ID=\"//g' \
         | sed -e 's/ACC=\"MI[0-9]*\s//' > hsMirBaseFormatIdOnly.gff
  
      # set score to zero, since the color is based on the type of the RNA
    # Starts appear to be 1-based when compared to miRNAs in the current
    # track and those in Ensembl.
      # Confirmed with Sam Griffith-Jones (one of the authors of miRBase,
      # sam.griffith-jones@manchester.ac.uk) that these GFF coordinates
      # are 1-based.
      # Also add thickStart and thickEnd columns and "miRNA" for type.
      awk 'BEGIN {FS="\t"} {OFS="\t"} \
          {if ($0 !~ /#/ && $7 == "+") \
           print $1, $4-1, $5, $9, 0, $7, 0, 0, "miRNA"; \
         else if ($0 !~ /#/ && $7 == "-") \
           print $1, $4-1, $5, $9, 0, $7, 0, 0, "miRNA";}' \
          hsMirBaseFormatIdOnly.gff > hsMirBaseFormatIdOnly.bed
  
      # 2010-04-21
    # Download the current snoRNABase coordinates (version 3, based on hg18)
      #  from
      # http://www-snorna.biotoul.fr/coordinates.php
      #   to
      # /hive/data/genomes/hg19/bed/wgRna-2010-04-20/snoRNABaseVer3Coords.xls
  
      cd /hive/data/genomes/hg19/bed/wgRna-2010-04-20/
      cp snoRNABaseVer3Coords.xls snoRNABaseVer3Coords.txt
      # remove the header line (column title).
      # remove all the quotes surrounding characters field
      perl -pi.bak -e 's/\"//g' snoRNABaseVer3Coords.txt
      # Reformat to BED format with thickStart and thickEnd set to 0.
      awk 'BEGIN {FS="\t"} {OFS="\t"} \
          {if ($4 == "+") \
           print $1, $2-1, $3, $5, 0, $4, 0, 0,$6; \
         else if ($4 == "-") \
           print $1, $2-1, $3, $5, 0, $4, 0, 0,$6;}' \
         snoRNABaseVer3Coords.txt > snoRNABaseVer3Coords.bed
    # 2010-08-02: the snoRNABase team has not responded to the hg19 update
    # request, so use liftOver to convert the 400 hg18 coordinates to hg19
    # directly.
      liftOver snoRNABaseVer3Coords.bed -bedPlus=3 \
        /hive/data/genomes/hg18/bed/liftOver10K/hg18ToHg19.over.chain.gz \
          snoRNABaseHg19Coords.bed unMapped
      # Reading liftover chains
      # Mapping coordinates
  
  
  
      # Merge the miRNA and snoRNA files together
      cat hsMirBaseFormatIdOnly.bed snoRNABaseHg19Coords.bed \
          > wgRna20100420.bed
      # Create and load wgRna
      cp -p /cluster/bin/build/build-kent/src/hg/lib/wgRna.sql wgRna.sql
      hgLoadBed -sqlTable=wgRna.sql hg19 wgRna wgRna20100420.bed
      #  Reading wgRna20100420.bed
      #  Loaded 1341 elements of size 9
      #  Sorted
      #  Creating table definition for wgRna
      #  Saving bed.tab
      #  Loading hg19
  
      # Clean up
      rm *.bak
  
      # some details about this track:
      hgsql -e "select count(*) from wgRna;" hg19
      #  1341
      # contain 4 types:
      cat wgRna20100420.bed | awk '{print $9}' | sort | uniq
      # CDBox
      # HAcaBox
      # miRNA
      # scaRna
      hgsql -e "select type, count(*) from wgRna  group by type;" hg19
      #   CDBox     269
      #   HAcaBox   112
      #   miRNA     939
      #   scaRna     21
      featureBits hg19 wgRna
      # 122226 bases of 2899183193 (0.004%) in intersection
  
  
  #############################################################################
  # AFFY U133Plus2 (DONE 2010-10-04 Chin) (Removed prefixes 2010-11-29 galt)
      # Align probes
      ssh swarm
      cd /hive/data/genomes/hg19/bed
      mkdir -p affyProbes/affyU133Plus2/run
      cd affyProbes/affyU133Plus2/run
      mkdir psl
      ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
      ls -1 /hive/data/outside/affyProbes/U133Plus2_all.fa > mrna.lst
  
      cat << '_EOF_' > gsub
  #LOOP
  /cluster/bin/x86_64/blat -fine -ooc=/scratch/data/hg19/11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
  #ENDLOOP
  '_EOF_'
      # << this line makes emacs coloring happy
  
      gensub2 genome.lst mrna.lst gsub jobList
      para create jobList
      para try
      para check
      para push
      para time
      # Completed: 96 of 96 jobs
      # CPU time in finished jobs: 31136s     518.93m     8.65h    0.36d  0.001 y
      # IO & Wait Time:            2218s      36.97m     0.62h    0.03d  0.000 y
      # Average job time:                 347s       5.79m     0.10h    0.00d
      # Longest finished job:            2548s      42.47m     0.71h    0.03d
      # Submission to last job:          4244s      70.73m     1.18h    0.05d
  
      # Do sort, best in genome filter.
      # to create affyU133Plus2.psl.
      pslSort dirs raw.psl tmp psl
      pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyU133Plus2.psl /dev/null
      #   Processing raw.psl to ../affyU133Plus2.psl and /dev/null
      #   .....Processed 693340 alignments
      rm -r raw.psl psl
  
      # Load probes and alignments into database.
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/affyProbes/affyU133Plus2
  
      # remove prefix
      perl -pi.bak -e "s/U133\+2://" affyU133Plus2.psl
      hgLoadPsl hg19 affyU133Plus2.psl
      hgLoadSeq -abbr=U133+2: hg19 /gbdb/hgFixed/affyProbes/U133Plus2_all.fa
         # note: on a re-run, -replace was used with hgLoadSeq
      #   Creating seq.tab file
      #   Adding /gbdb/hgFixed/affyProbes/U133Plus2_all.fa
      #   54613 sequences
      #   Updating seq table
      #   All done
      #  hgsql -e "select count(*) from affyU133Plus2;" hg19
      #     58592
  
  
      # Added ensToU133Plus2 table
      hgMapToGene hg19 affyU133Plus2 ensGene ensToU133Plus2
  
  
  #############################################################################
  # AFFY U95 (DONE 2010-10-07 Chin)  (Removed prefixes 2010-11-29 galt)
      # Align probes
      ssh swarm
      cd /hive/data/genomes/hg19/bed
      mkdir -p affyProbes/affyU95/run
      cd affyProbes/affyU95/run
      mkdir psl
      ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
      ls -1 /hive/data/outside/affyProbes/HG-U95Av2_all.fa > mrna.lst
  
      cat << '_EOF_' > gsub
  #LOOP
  /cluster/bin/x86_64/blat -fine -ooc=/scratch/data/hg19/11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
  #ENDLOOP
  '_EOF_'
      # << this line makes emacs coloring happy
  
      gensub2 genome.lst mrna.lst gsub jobList
      para create jobList
      para try
      para check
      para push
      para time
  # Completed: 93 of 93 jobs
  # CPU time in finished jobs:       2101s      35.01m     0.58h    0.02d  0.000 y
  # IO & Wait Time:                   657s      10.95m     0.18h    0.01d  0.000 y
  # Average job time:                  30s       0.49m     0.01h    0.00d
  # Longest finished job:             165s       2.75m     0.05h    0.00d
  # Submission to last job:           619s      10.32m     0.17h    0.01d
  # Estimated complete:                 0s       0.00m     0.00h    0.00d
  #Submission to last job:          1685s      28.08m     0.47h    0.02d
  
  
      # Do sort, best in genome filter.
      # to create affyU95.psl.
      pslSort dirs raw.psl tmp psl
      pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyU95.psl /dev/null
      rm -r raw.psl psl
  
      # Load probes and alignments into database.
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/affyProbes/affyU95
  
      # remove prefix
      perl -pi.bak -e "s/U95Av2://" affyU95.psl
      hgLoadPsl hg19 affyU95.psl
      hgLoadSeq -abbr=U95Av2: hg19 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
         # note: on a re-run, -replace was used with hgLoadSeq
      #   Creating seq.tab file
      #   Adding /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
      #   12386 sequences
      #   Updating seq table
      #   All done
  
      # Added ensToU95 table
      hgMapToGene hg19 affyU95 ensGene ensToU95
  
  #############################################################################
  # UPDATE KEGG TABLES (DONE, Fan, 6/18/10)
  
  mkdir -p /hive/data/genomes/hg19/bed/pathways/kegg
  cd /hive/data/genomes/hg19/bed/pathways/kegg
  
  wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab
  
  cat map_title.tab | sed -e 's/\t/\thsa\t/' > j.tmp
  cut -f 2 j.tmp >j.hsa
  cut -f 1,3 j.tmp >j.1
  paste j.hsa j.1 |sed -e 's/\t//' > keggMapDesc.tab
  rm j.hsa j.1
  rm j.tmp
  
  hgsql hg19 -e 'drop table keggMapDesc'
  hgsql hg19 < ~/kent/src/hg/lib/keggMapDesc.sql
  hgsql hg19 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
  
  wget --timestamping ftp://ftp.genome.jp/pub/kegg/genes/organisms/hsa/hsa_pathway.list
  
  cat hsa_pathway.list| sed -e 's/path://'|sed -e 's/:/\t/' > j.tmp
  hgsql hg19 -e 'drop table keggPathway'
  hgsql hg19 < ~/kent/src/hg/lib/keggPathway.sql
  hgsql hg19 -e 'load data local infile "j.tmp" into table keggPathway'
  
  hgsql hg19 -N -e \
  'select name, locusID, mapID from keggPathway p, ensToLocusLink l where p.locusID=l.value' \
  >keggPathway.tab
  
  hgsql hg19 -e 'delete from keggPathway'
  
  hgsql hg19 -e 'load data local infile "keggPathway.tab" into table keggPathway'
  
  rm j.tmp
  #############################################################################
  # Add KEGG column to hg19 Gene Sorter (Done, Fan, 6/18/2010)
  
  mkdir -p /hive/data/genomes/hg19/bed/geneSorter
  cd /hive/data/genomes/hg19/bed/geneSorter
  hgsql hg19 -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' |sort -u|sed -e 's/\t+\t/+/' > ensToKeggEntrez.tab
  
  hgsql hg19 -e 'drop table ensToKeggEntrez'
  
  hgsql hg19 < ~/kent/src/hg/lib/ensToKeggEntrez.sql
  
  hgsql hg19 -e 'load data local infile "ensToKeggEntrez.tab" into table ensToKeggEntrez'
  
  #############################################################################
  # Haplotype locations (DONE - 2010-06-29 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/haplotypeLocations
      cd /hive/data/genomes/hg19/bed/haplotypeLocations
      for H in 1 2 3 4 5 6 7 8 9
  do
      grep -v "^#" ../../download/alternate_loci/ALT_REF_LOCI_${H}/placed_scaffolds/alt_locus_scaf2primary.pos | awk -F'\t' '
  {
  printf "chr%d\t%d\t%d\t%s\n", $3, $4, $6, $1
  }
  '
  done | sed -e "s/HSCHR6_MHC_APD_CTG1/chr6_apd_hap1/;
  s/HSCHR6_MHC_COX_CTG1/chr6_cox_hap2/;
  s/HSCHR6_MHC_DBB_CTG1/chr6_dbb_hap3/;
  s/HSCHR6_MHC_MANN_CTG1/chr6_mann_hap4/;
  s/HSCHR6_MHC_MCF_CTG1/chr6_mcf_hap5/;
  s/HSCHR6_MHC_QBL_CTG1/chr6_qbl_hap6/;
  s/HSCHR6_MHC_SSTO_CTG1/chr6_ssto_hap7/;
  s/HSCHR4_1_CTG9/chr4_ctg9_hap1/;
  s/HSCHR17_1_CTG5/chr17_ctg5_hap1/;" > haplotypeLocations.bed
  
      hgLoadBed hg19 haplotypeLocations haplotypeLocations.bed
      featureBits hg19 haplotypeLocations
  # 7207422 bases of 2897316137 (0.249%) in intersection
  
  #############################################################################
  # BUILD THE TRACK OF IKMC MAPPED TO HUMAN GENOME. (DONE 5/23/12 angie)
  # done 8/2/11 Fan
      ssh hgwdev
      mkdir -p /hive/data/genomes/hg19/bed/ikmc/2012_05
      cd /hive/data/genomes/hg19/bed/ikmc/2012_05
      # Save files emailed from Carol Bult as
      # 20120518_human.gff.gz
      # Make bed12 with itemRgb:
      # watch out for a few items on chrUn|NT_167216.1 which we call chrUn_gl000222.
      zcat 20120518_human.gff.gz \
      | sed -e 's/^chrUn|NT_167216.1/chrUn_gl000222/' \
      | perl -we \
        'while (<>) { \
           s/\r?\n$//; \
           ($chr, undef, $ctr, $s, $e, undef, undef, undef, $id, $col, $n) = split("\t"); \
           if ($s eq "") { warn "$_\n";  s/^.*//; next; } # Some lines have no coords. \
           $col = ($col eq "Yellow") ? "255,215,0" : \
                  ($col eq "Green")  ? "0,240,0" : \
                  ($col eq "Blue")   ? "0,0,200" : "0,0,0"; \
           $s--; \
           $id =~ s/^MGI:\d+; ([\w ]+); .*/$1/ || die "Cant parse id \"$id\""; \
           $id =~ s/ //g; \
           my $geneId = join("|", $chr, $ctr, "${n}_$id"); \
           push @{$geneBlks{$geneId}}, [$s, $e, $col] unless $e <= 0; \
        } \
        warn "Got " . scalar(keys %geneBlks) . " genes.\n"; \
        foreach my $geneId (keys %geneBlks) { \
           my @blks = @{$geneBlks{$geneId}}; \
           my ($chrom, $center, $name) = split(/\|/, $geneId); \
           my $blkCount = @blks; \
           @blks = sort {$a->[0] <=> $b->[0]} @blks; \
           my $chromStart = $blks[0]->[0]; \
           my $chromEnd = $blks[$blkCount-1]->[1]; \
           my $color = $blks[0]->[2]; \
           my $blkStarts = ""; \
           my $blkSizes = ""; \
           foreach my $blk (@blks) { \
             my ($start, $end, $col) = @{$blk}; \
             $blkStarts .= ($start - $chromStart) . ","; \
             $blkSizes  .= ($end - $start) . ","; \
             if ($col ne $color) { die "Blocks of $geneId of colors $color and $col"; } \
           } \
          print join("\t", $chrom, $chromStart, $chromEnd, $name, 0, ".", $chromStart, \
                     $chromStart, $color, $blkCount, $blkSizes, $blkStarts) . "\n"; \
        }' \
      | sort -k 1,1 -k 2n,2n > hgIkmc.bed
  #Got 46392 genes.
  
      # Make an alias-style table with associated info (MGI ID and status):
      zcat 20120518_human.gff.gz \
      | sed -e 's/^chrUn|NT_167216.1/chrUn_gl000222/' \
      | perl -wpe 's/\r?\n$//; @w = split("\t"); \
        if ($w[3] eq "") { s/^.*//; next; } # Some lines have no coords. \
        if ($w[4] <= 0) { s/^.*//; next; } # A few lines have end=0. \
        $w[8] =~ m/^(MGI:\d+); ([\w ]+); (\w.*)/ || die; \
        ($mgi, $designId, $status) = ($1, $2, $3); \
        $designId =~ s/ //g; \
        # NOTE: This line differs from the mouse version: has $designId preceding $w[2]: \
        $_ = "$w[10]_$designId\t$mgi,$designId,$w[2],$status\n";' \
      | sort -u > hgIkmcExtra.tab
      wc -l hgIkmcExtra.tab
  #46392 hgIkmcExtra.tab
  
      # load and check tables
      hgLoadBed hg19 hgIkmc hgIkmc.bed
      checkTableCoords  -verbose=2  hg19 hgIkmc
      hgLoadSqlTab hg19 hgIkmcExtra $HOME/kent/src/hg/lib/genericAlias.sql hgIkmcExtra.tab
      runJoiner.csh mm9 ikmc
  # mm9.ikmcExtra.name - hits 51052 of 51052 ok
  
  
  #############################################################################
  # adding patches to the sequence (DONE - 2010-07-23)
      #	fetch the "official" chrM sequence
      mkdir -p /hive/data/genomes/hg19/bed/additionalSequence/chrM
      cd /hive/data/genomes/hg19/bed/additionalSequence/chrM
      wget --timestamping -O NC_012920.1.fa \
  "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=NC_012920.1.fa"
      echo ">chrM_NC_012920" > chrM_NC_012920.fa
      grep -v "^>" NC_012920.1.fa | sed -e "/^$/d" >> chrM_NC_012920.fa
  
      #	fetch the first two patches:
      mkdir -p /hive/data/genomes/hg19/bed/additionalSequence/patches
      cd /hive/data/genomes/hg19/bed/additionalSequence/patches
      wget --cut-dirs=7 --no-parent --timestamping --no-remove-listing -m \
          -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
  "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/PATCHES/"
  
      #	take a look through the downloaded files to find the name
      #	correspondence.  The fasta names are in:
      zcat patch_release_1/FASTA/alt.scaf.fa.gz | grep "^>"
      # the other names are in:
      cat patch_release_1/alt_scaffold_placement.txt
      #	Decide on UCSC chrom names
      #	Create a file with these different names to use later:
  # fasta string             alt_scaf_name  parent_name  UCSC chrom name
      cat << '_EOF_' > ucscNames.txt
  gi|289436847|gb|GL339449.1 HSCHR5_1_CTG1 CM000667.1 chr5_ctg1_hap1
  gi|289436846|gb|GL339450.1 HG79_PATCH CM000671.1 chr9_gl339450
chrM_NC_012920 unknown unknown chrM_NC_012920
  '_EOF_'
      # << happy emacs
  
  
      #	construct the files for UCSC:
      mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch1
      cd /hive/data/genomes/hg19/bed/additionalSequence/patch1
  
      #	add these sequences to existing hg19.2bit to make a new one:
      cat ../chrM/chrM_NC_012920.fa > patch1.ucsc.fa
      zcat ../patches/patch_release_1/FASTA/alt.scaf.fa.gz \
          | sed -e "s/^>.*GL339449.1.*/>chr5_ctg1_hap1/;" \
  -e "s/^>.*GL339450.1.*/>chr9_gl339450/" >> patch1.ucsc.fa
  
      twoBitToFa /gbdb/hg19/hg19.2bit hg19.existing.fa
    faToTwoBit hg19.existing.fa patch1.ucsc.fa hg19.unmasked.patch1.2bit
    rm -f /gbdb/hg19/hg19.patch1.2bit
    #	temporarily use this unmasked sequence
    ln -s `pwd`/hg19.unmasked.patch1.2bit /gbdb/hg19/hg19.patch1.2bit
    twoBitInfo hg19.unmasked.patch1.2bit stdout | sort -k2nr > patch1.chrom.sizes
  
      cat << '_EOF_' > mkTables.pl
  #!/usr/bin/env perl
  
  use strict;
  use warnings;
  
  sub usage() {
      printf STDERR "usage: mkTables.pl patches.chrom.sizes \\\n";
      printf STDERR " ../patches/ucscNames.txt ../patches/patch_release_1/AGP/alt.scaf.agp.gz\n";
  }
  
  my $argc = scalar(@ARGV);
  
  if ($argc < 3) {
      usage;
      exit 255;
  }
  
  my $sizes = shift;      # patches.chrom.sizes
  my $names = shift;      # patches/ucscNames.txt
  my $agpFile = shift;    # alt.scaf.agp.gz
  
  my %glToChr;
  my %chrToCtg;
  my %fastaToChr;
  my %chrToSize;
  
  open(FH, "<$sizes") or die "can not read $sizes";
  while (my $line = <FH>) {
      chomp $line;
      my ($chr, $size) = split('\s+', $line);
      $chrToSize{$chr} = $size;
  }
  close (FH);
  
  open(CI, ">chromInfo.txt") or die "can not write to chromInfo.txt";
  open(CT, ">ctgPos.txt") or die "can not write to ctgPos.txt";
  open(FH, "<$names");
  while (my $line = <FH>) {
      chomp $line;
      my ($faName, $ctg, $cmName, $chr) = split('\s+', $line);
      $faName =~ s/.*gb.GL/GL/;
      my $size = $chrToSize{$chr};
      if (exists($glToChr{$faName})) {
          if ($glToChr{$faName} ne $chr) {
              printf STDERR "ERROR: contig name: $faName was chr name: $glToChr{$faName}\n";
              printf STDERR " now claiming to be chr name: $chr\n";
              exit 255;
          }
      } else {
          $glToChr{$faName} = $chr;
      }
      printf CT "%s\t%d\t%s\t0\t%d\n", $faName, $size, $chr, $size;
      printf CI "%s\t%d\t/gbdb/hg19/hg19.patches.2bit\n", $chr, $size;
  }
  close (FH);
  close (CT);
  close (CI);
  
  my $prevObj = "";
  my $newIx = 1;
  open (GP,">gap.txt") or die "can not write to gap.txt";
  open (GL,">gold.txt") or die "can not write to gold.txt";
  open (FH,"zcat $agpFile|") or die "can not read $agpFile";
  while (my $line = <FH>) {
      next if ($line =~ m/^\s*#/);
      chomp $line;
      my ($object, $objStart, $objEnd, $ix, $type, $frag, $fragStart, $fragEnd, $strand) = split('\s+', $line);
      die "ERROR: can not find contig $object to chr name"
          if (!exists($glToChr{$object}));
      $newIx = 1 if ($prevObj ne $object);
      my $chr = $glToChr{$object};
      if ($type eq "N") {
          # frag is size, fragStart is type of gap, and fragEnd is bridged y/n
          printf GP "%s\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n",
              $chr, $objStart-1, $objEnd, $newIx, $type, $frag, $fragStart,
              $fragEnd;
      } else {
          printf GL "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n",
              $chr, $objStart-1, $objEnd, $newIx, $type, $frag, $fragStart-1,
              $fragEnd, $strand;
      }
      ++$newIx;
      $prevObj = $object;
      printf "%s\n", $line;
  }
  close (FH);
  close (GL);
  close (GP);
  '_EOF_'
      # << happy emacs
      chmod +x mkTables.pl
      ./mkTables.pl  patch1.chrom.sizes ../patches/ucscNames.txt \
  	../patches/patch_release_1/AGP/alt.scaf.agp.gz
      echo 'chrM_NC_012920  16569   /gbdb/hg19/hg19.patches.2bit' \
  	>> chromInfo.txt
  
      #	create tab files
      hgLoadBed -noLoad -maxChromNameLength=14 \
          -sqlTable=/cluster/home/hiram/kent/src/hg/lib/agpFrag.sql \
          hg19 tGold gold.txt
      rm -f gold.tab
      mv bed.tab gold.tab
      hgLoadBed -noLoad -maxChromNameLength=14 \
          -sqlTable=/cluster/home/hiram/kent/src/hg/lib/gap.sql \
          hg19 tGap gap.txt
      rm -f gap.tab
      mv bed.tab gap.tab
  
      # these table inserts are performed carefully to make sure they are
      # sane, for example, count the rows before and after load:
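    # e.g. for one table (a sketch; repeat for gap, ctgPos and chromInfo):
    #   hgsql -N -e 'select count(*) from gold;' hg19   # count before load
    #   wc -l gold.tab                                  # rows to be added
    #   hgsql -N -e 'select count(*) from gold;' hg19   # after: sum of the two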
      hgsql -e 'load data local infile "gold.tab" into table gold;' hg19
      hgsql -e 'load data local infile "gap.tab" into table gap;' hg19
      hgsql -e 'load data local infile "ctgPos.txt" into table ctgPos;' hg19
      hgsql -e 'load data local infile "chromInfo.txt" into table chromInfo;' hg19
  
      hgsql -e 'update chromInfo set fileName="/gbdb/hg19/hg19.patch1.2bit";' hg19
  
      cat << '_EOF_' > ctgPos2.txt
  HSCHR5_1_CTG1   1620324 chr5_ctg1_hap1  0       1620324 F
HG79_PATCH      330164  chr9_gl339450   0       330164  F
  NC_012920.1     16569   chrM_NC_012920  0       16569   F
  NC_001807.4     16571   chrM    0       16571   O
  '_EOF_'
      # << happy emacs
  
      hgsql -e 'load data local infile "ctgPos2.txt" into table ctgPos2;' hg19
  
      #	RepeatMasking and SimpleRepeats
      mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch1/RMRun
      cd /hive/data/genomes/hg19/bed/additionalSequence/patch1/RMRun
      ln -s ../patch1.ucsc.fa .
      time /scratch/data/RepeatMasker/RepeatMasker -align -s \
  	-species 'Homo sapiens' patch1.ucsc.fa
      # took about 6 hours.  Probably should have broken up the large chr5 bit
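    #	e.g. for next time (split, mask the pieces in parallel, lift back;
    #	a sketch -- the part_ and .lft names are illustrative):
##	mkdir RMParts
##	faSplit size patch1.ucsc.fa 2000000 RMParts/part_ -lift=patch1.split.lft
##	# run RepeatMasker on RMParts/part_*.fa, then liftUp the .out results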
      #	sort it:
  
      head -3 patch1.ucsc.fa.out | sed -e "s/  *$//" > patch1.ucsc.sort.out
      headRest 3 patch1.ucsc.fa.out | sort -k5,5 -k6,6n \
  	| sed -e "s/  *$//" >> patch1.ucsc.sort.out
    #	create a .tab file to load (RepeatMasker .out files go through
    #	hgLoadOut rather than hgLoadBed, which can not parse them):
    hgLoadOut -tabFile=patch1.rmsk.tab -table=rmsk -nosplit hg19 \
	patch1.ucsc.sort.out
      hgsql -e 'load data local infile "patch1.rmsk.tab" into table rmsk;' hg19
      #	create nestedRepeats
    /cluster/bin/scripts/extractNestedRepeats.pl patch1.ucsc.sort.out \
	> patch1.nestedRepeats.bed
      #	create a .tab file to load
      hgLoadBed -noLoad -maxChromNameLength=14 \
          -sqlTable=$HOME/kent/src/hg/lib/nestedRepeats.sql \
          hg19 tNest patch1.nestedRepeats.bed
      rm -f patch1.nestedRepeats.tab
      mv bed.tab patch1.nestedRepeats.tab
      hgsql -e 'load data local infile "patch1.nestedRepeats.tab" into table nestedRepeats;' hg19
  
      mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch1/simpleRepeat
      cd /hive/data/genomes/hg19/bed/additionalSequence/patch1/simpleRepeat
      ln -s ../patch1.ucsc.fa .
      /cluster/bin/$MACHTYPE/trfBig -trf=/cluster/bin/$MACHTYPE/trf \
  	patch1.ucsc.fa /dev/null -bedAt=patch1.ucsc.bed -tempDir=.
      awk '$5 <= 12' patch1.ucsc.bed > trfMask.bed
      mkdir trfMaskChrom
      splitFileByColumn trfMask.bed trfMaskChrom/
      hgLoadBed -oldTable hg19 simpleRepeat patch1.ucsc.bed \
          -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
  
      #	add these masks
      cd /hive/data/genomes/hg19/bed/additionalSequence/patch1
      twoBitMask -add hg19.unmasked.patch1.2bit RMRun/patch1.ucsc.sort.out \
  	hg19.rmsk.2bit
      twoBitMask -add hg19.rmsk.2bit simpleRepeat/trfMask.bed hg19.patch1.t.2bit
      #	safe to ignore errors about >= 13 fields
      twoBitToFa hg19.patch1.t.2bit stdout | faSize stdin \
  	> hg19.patch1.2bit.faSize 2>&1
      # 3139128321 bases (239950803 N's 2899177518 real 1431272440 upper
      # 1467905078 lower) in 96 sequences in 1 files
      # %46.76 masked total, %50.63 masked real
    #	give the final masked result the canonical hg19.patch1.2bit name,
    #	replacing the temporary unmasked sequence from earlier:
      rm -f hg19.patch1.2bit; mv hg19.patch1.t.2bit hg19.patch1.2bit
  
      time blat hg19.patch1.2bit \
  	/dev/null /dev/null -tileSize=11 -makeOoc=hg19.patch1.11.ooc \
  	-repMatch=1024
      #	Wrote 30723 overused 11-mers to hg19.patch1.11.ooc
      cp -p hg19.patch1.2bit hg19.patch1.11.ooc /hive/data/staging/data/hg19
      mkdir nib
      twoBitToFa -seq=chrM_NC_012920 hg19.patch1.2bit stdout \
  	| faToNib -softMask stdin nib/chrM_NC_012920.nib
      twoBitToFa -seq=chr5_ctg1_hap1 hg19.patch1.2bit stdout \
  	| faToNib -softMask stdin nib/chr5_ctg1_hap1.nib
      twoBitToFa -seq=chr9_gl339450 hg19.patch1.2bit stdout \
  	| faToNib -softMask stdin nib/chr9_gl339450.nib
  
  
      mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch1/linSpecRep
      cd /hive/data/genomes/hg19/bed/additionalSequence/patch1/linSpecRep
  for C in chr9_gl339450 chrM_NC_012920 chr5_ctg1_hap1
  do
      head -3 ../RMRun/patch1.ucsc.sort.out > ${C}.out
      grep "${C} " ../RMRun/patch1.ucsc.sort.out >> ${C}.out
      rm -f ${C}.out_mus*
      /scratch/data/RepeatMasker/DateRepeats ${C}.out -query human -comp mouse
      /cluster/bin/scripts/extractRepeats 1 ${C}.out_mus* > ${C}.out.spec
      rm -f ${C}.out_mus*
  done
  
      # copy new files to /hive/data/staging/data/hg19/ and request rsync
      #	to kluster nodes
  -rw-rw-r-- 1      2036 Jul 21 15:39 patch1.chrom.sizes
  -rw-rw-r-- 1 816756572 Jul 23 15:32 hg19.patch1.2bit
  -rw-rw-r-- 1    122900 Jul 23 15:56 hg19.patch1.11.ooc
  -rw-rw-r-- 1      8293 Jul 23 15:59 nib/chrM_NC_012920.nib
  -rw-rw-r-- 1    810170 Jul 23 16:01 nib/chr5_ctg1_hap1.nib
  -rw-rw-r-- 1    165090 Jul 23 16:01 nib/chr9_gl339450.nib
  -rw-rw-r-- 1   34676 Jul 23 16:27 lineageSpecificRepeats/chr9_gl339450.out.spec
  -rw-rw-r-- 1     386 Jul 23 16:27 lineageSpecificRepeats/chrM_NC_012920.out.spec
  -rw-rw-r-- 1  221381 Jul 23 16:27 lineageSpecificRepeats/chr5_ctg1_hap1.out.spec
  #############################################################################
  # Update BlastTab tables for hg19 (Done, Fan, 8/6/2010)
      ssh hgwdev
      mkdir -p /hive/data/genomes/hg19/bed/ucsc.12/hgNearBlastp
      cd /hive/data/genomes/hg19/bed/ucsc.12/hgNearBlastp
      mkdir 100806
      cd 100806
      # Get the proteins used by the other hgNear organisms:
      pepPredToFa hg19 ensGenePep hg19.ens.faa
      pepPredToFa mm9 ensGenePep mm9.ens.faa
      pepPredToFa rn4 ensGenePep rn4.ens.faa
      pepPredToFa danRer6 ensPep danRer6.ensPep.faa
      pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa
      pepPredToFa ce6 sangerPep ce6.sangerPep.faa
      pepPredToFa sacCer2 sgdPep sacCer2.sgdPep.faa
  
      cat << _EOF_ > config.ra
  # Latest human vs. other Gene Sorter orgs:
  # mouse, rat, zebrafish, worm, yeast, fly
  
  targetGenesetPrefix ens
  targetDb hg19
  queryDbs mm9 rn4 danRer6 dm3 ce6 sacCer2
  
  hg19Fa /hive/data/genomes/hg19/bed/ucsc.12/ucscGenes.faa
  mm9Fa /hive/data/genomes/mm9/bed/ucsc.12/ucscGenes.faa
  rn4Fa /hive/data/genomes/rn4/bed/blastp/ens.faa
  danRer6Fa /hive/data/genomes/danRer6/bed/blastp/danRer6.ensPep.faa
  dm3Fa /hive/data/genomes/dm3/bed/flybase5.3/flyBasePep.fa
  ce6Fa /hive/data/genomes/ce6/bed/blastp/wormPep190.faa
  sacCer2Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/sgdPep.faa
  
  buildDir /hive/data/genomes/hg19/bed/ucsc.12/hgNearBlastp/100806
  scratchDir /hive/data/genomes/hg19/bed/ucsc.12/hgNearBlastp/100806/tmp
  _EOF_
  doHgNearBlastp.pl -targetOnly config.ra >& do.log & tail -f do.log
  #########################################################################
  # LIFTOVER TO Hg18 (RE-DONE - 2010-07-26 - Hiram )
      #	preserving the previous 10K liftOver files
      mkdir /hive/data/genomes/hg19/bed/liftOver10K
      cd /hive/data/genomes/hg19/bed/liftOver10K
      ln -s ../blat.hg18.2009-06-04/hg19ToHg18.over.chain.gz .
  
      #	this liftOver is a 5000 size chunk
      mkdir /hive/data/genomes/hg19/bed/blat.hg18.2010-07-26
      cd /hive/data/genomes/hg19/bed/blat.hg18.2010-07-26
      # -debug run to create run dir, preview scripts...
      #	verifies files can be found
      doSameSpeciesLiftOver.pl -debug hg19 hg18
      # Real run:
      time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
  	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
  	 hg19 hg18 > do.log 2>&1
      #	real    115m26.071s
  
      #	checking liftOver accuracy
      mkdir /hive/data/genomes/hg19/bed/blat.hg18.2010-07-26/refGene
      cd /hive/data/genomes/hg19/bed/blat.hg18.2010-07-26/refGene
      hgsql -N -e "select * from refGene;" hg19 | cut -f2- > refGene.hg19.gp
      wc -l refGene.hg19.gp
      #	36766
      #	the 5K block size lift over chain
      liftOver -genePred refGene.hg19.gp ../hg19ToHg18.over.chain.gz \
  	refGene.hg19ToHg18.5K.lift.gp refGene.hg19ToHg18.5K.unMapped.gp
      wc -l refGene.hg19ToHg18.5K.unMapped.gp
      #   830
      #	the 10K block size lift over chain
      liftOver -genePred refGene.hg19.gp \
  	../../liftOver10K/hg19ToHg18.over.chain.gz \
  	refGene.hg19ToHg18.10K.lift.gp refGene.hg19ToHg18.10K.unMapped.gp
      wc -l refGene.hg19ToHg18.10K.unMapped.gp
      #	820
      #	construct custom track of chain files.
      #	the 5K block size lift over chain
      chainToPsl ../hg19ToHg18.over.chain.gz \
  	/hive/data/genomes/hg19/chrom.sizes \
  	/hive/data/genomes/hg18/chrom.sizes \
  	/hive/data/genomes/hg19/hg19.2bit \
  	/hive/data/genomes/hg18/hg18.2bit stdout \
  	| pslToBed stdin hg19ToHg18.5K.bed
      #	the 10K block size lift over chain
      chainToPsl ../../liftOver10K/hg19ToHg18.over.chain.gz \
  	/hive/data/genomes/hg19/chrom.sizes \
  	/hive/data/genomes/hg18/chrom.sizes \
  	/hive/data/genomes/hg19/hg19.2bit \
  	/hive/data/genomes/hg18/hg18.2bit stdout \
  	| pslToBed stdin hg19ToHg18.10K.bed
  
  #############################################################################
  # GENSCAN PREDICTIONS (DONE - 2010-07-30 Fan)
# 		      (PARTIALLY RE-DONE AFTER FIXING .LFT FILES - 2010-08-03 Fan)
    #   After several attempts, neither the old genscan process nor the
    #   newer process used for mm9 could be completed successfully, even
    #   with manual gsBig runs using a smaller windowSize.
    #
    #	A new process was developed to overcome these challenges.
      #   This new build process for hg19 genscan is substantially different from
      #	hg18.  Due to gsBig errors, the chroms are split into 2,000,000 bp
      #   segments and the -windowSize is set to the default value of 1,200,000.
      #   The results are then collected and lifted to original chrom
      #   coordinates and then loaded into 3 genscan tables.
  
      ssh hgwdev
  
      mkdir -p /hive/data/genomes/hg19/bed/genscan
      cd /hive/data/genomes/hg19/bed/genscan
  
      # Check out hg3rdParty/genscanlinux to get latest genscan.
      cvs co hg3rdParty/genscanlinux
  
  # the latest and correct genscan build subdir is newTry
  
      mkdir -p /hive/data/genomes/hg19/bed/genscan/newTry
      cd /hive/data/genomes/hg19/bed/genscan/newTry
  
  # collect .fa files for all chroms in one subdir
  
      mkdir -p faOrig
      for f in `cat /hive/data/genomes/hg19/chr.fasta.list`
  	do
  	cp -p /hive/data/genomes/hg19/$f faOrig
  	done
  
  # construct the chrom.list
  
      ls -1 faOrig |sed -e 's/\.fa//'>chrom.list
  
# create the file template to be used for cluster runs
      cat << '_EOF_' > template
  #LOOP
  /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=../../../hg3rdParty/genscanlinux/genscan -par=../../../hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
  #ENDLOOP
  '_EOF_'
  
  # create the chunk2, lift, and runBlat subdir structures
  
      mkdir -p chunk2
      mkdir -p lift
      mkdir -p runBlat
  
      for f in `cat chrom.list`
  	do
  	mkdir -p lift/$f
  	mkdir -p chunk2/$f
  	mkdir -p runBlat/$f/gtf
  	mkdir -p runBlat/$f/pep
  	mkdir -p runBlat/$f/subopt
          cp ./template runBlat/$f
          done
  
  # split the chrom fa files into chunks of 2,000,000 bases or less
  # and create corresponding lift files
  
      for f in `cat chrom.list`
  	do
  	cd chunk2/$f
  	cat ../../faOrig/$f.fa\
  	|faSplit -lift=../../lift/$f/$f.lft gap stdin 2000000 ./ck_
  	cd ../..
  	done
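# for reference, each line of the .lft files written by faSplit -lift maps
# a chunk back to its source sequence in the form:
#	chunkStart  chunkName  chunkSize  sourceName  sourceSize
# e.g. (hypothetical) a second 2 Mb chunk of chr1 would appear as:
#	2000000	ck_01	2000000	chr1	249250621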
  
# LATER FOUND THAT THE SEQUENCE IDs IN THE .fa FILES UNDER
# /hive/data/genomes/hg19/* ARE NOT ALWAYS THE SAME AS THE CHROM IDs.
# THIS CAUSED INCORRECT CHROM IDs TO BE GENERATED IN THE .lft FILES.
# PERFORM THE FOLLOWING TO FIX THE .lft FILES.
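# e.g. (illustrative) a chr6_apd_hap1 chunk line such as:
#	0	ck_00	2000000	apd.chr6.4622290.0.4622290.1	4622290
# needs its 4th (chrom) column rewritten by fix1 to read:
#	0	ck_00	2000000	chr6_apd_hap1	4622290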
  
      mkdir fixLift
      cd fixLift
  
    cat << '_EOF_' > fixAll
./fix1 chr6_apd_hap1 apd.chr6.4622290.0.4622290.1
./fix1 chr6_cox_hap2 cox.chr6.4795371.0.4795371.1
./fix1 chr6_dbb_hap3 dbb.chr6.4610396.0.4610396.1
./fix1 chr6_mann_hap4 mann
./fix1 chr6_mcf_hap5 mcf.chr6.4833398.0.4833398.1
./fix1 chr6_qbl_hap6 qbl.chr6.4611984.0.4611984.1
./fix1 chr6_ssto_hap7 ssto
'_EOF_'
  
    cat << '_EOF_' > fix1
echo
echo
echo processing $1 $2
mkdir -p new
cat /hive/data/genomes/hg19/bed/genscan/newTry/lift/$1/$1.lft |grep $2
cat /hive/data/genomes/hg19/bed/genscan/newTry/lift/$1/$1.lft |sed -e \
	"s/${2}/${1}/g" >new/$1.lft
cat new/$1.lft |grep $1
cp new/$1.lft /hive/data/genomes/hg19/bed/genscan/newTry/lift/$1/$1.lft
'_EOF_'
  
      chmod +x fix*
    ./fixAll
      cd ..
  
  # go to memk to run cluster jobs
  
      ssh memk
      cd /hive/data/genomes/hg19/bed/genscan/newTry
  
  # create genome.list and jobList files for each chrom
  
      for f in `cat chrom.list`
  	do
  	cd runBlat/$f
  	ls -1 /hive/data/genomes/hg19/bed/genscan/newTry/chunk2/$f/* > genome.list
  	gensub2 genome.list single template jobList
  	cd ../..
  	done
  
  # create batch files
  
      for f in `cat chrom.list`
  	do
  	cd runBlat/$f
  	para create jobList
  	cd ../..
  	done
  
  # Send off cluster runs
  
      for f in `cat chrom.list`
  	do
          cd runBlat/$f
  	para try
          cd ../..
         done
  
      for f in `cat chrom.list`
  	do
          cd runBlat/$f
  	para push
          cd ../..
         done
  
  # When all the cluster runs are finished,
  # go back to hgwdev for final data collection and table loading
  
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/genscan/newTry
  
# collect gtf results, lift them, and load them into the genscan table
  
      mkdir -p gtf
      for f in `cat chrom.list`
  	do
  	echo
  	echo processing $f
  	cat runBlat/$f/gtf/*.gtf| liftUp -type=.gtf stdout \
          lift/$f/$f.lft error stdin \
  	| sed -e "s/ck_/${f}_/g" > gtf/$f.gtf
          ldHgGene -oldTable hg19 -gtf genscan gtf/$f.gtf
  	done
  
# collect subopt results, lift them, and load them into the
# genscanSubopt table
  
      mkdir -p subopt
      for f in `cat chrom.list`
  	do
          echo
  	echo processing $f
  	cat runBlat/$f/subopt/*.bed| liftUp -type=.bed stdout \
          lift/$f/$f.lft error stdin \
  	| sed -e "s/ck_/${f}_/g" > subopt/$f.bed
  	hgLoadBed -oldTable hg19 genscanSubopt subopt/$f.bed
  	done
  
  # collect pep results and load them into the genscanPep table
  
      mkdir -p pep
    rm -f pep/genscanPep.pep
      for f in `cat chrom.list`
  	do
          echo
  	echo processing $f
  	cat runBlat/$f/pep/*.pep\
  	| sed -e "s/ck_/${f}_/g" >> pep/genscanPep.pep
  	done
      hgPepPred hg19 generic genscanPep pep/genscanPep.pep
  
  ################################################################
  # HUMAN FETAL BRAIN EXON ARRAYS (YALE) (DONE 2010-08-03 - Chin)
# Note from Andy: All primary data files were lost.  The table
# has all of the original data.
  # The "kent/src/hg/makeDb/hgCgiData/Human/microarrayGroups.ra" file should
  # still be valid.  The hgFixed table sestanBrainAtlasExps should be ok.
  # So it's just a matter of getting the hg18.sestanBrainAtlas table in
  #  hg19 coordinates.
  
      mkdir  /hive/data/genomes/hg19/bed/yaleMicroarrays
      cd  /hive/data/genomes/hg19/bed/yaleMicroarrays
      cp  /hive/data/genomes/hg18/bed/yaleMicroarrays/sestanBrainAtlas.bed \
             sestanBrainAtlasHg18.bed
      # In the hg18, thickStart is off by 1, fix it
      # hgsql -e " select thickStart-chromStart from sestanBrainAtlas;" hg18 |
      #   sort | uniq
      # 1
      cat sestanBrainAtlasHg18.bed | awk '{print  $1,$2,$3,$4,$5,$6,$7-1,$8,$9,$10,$11,$12,$13,$14,$15 }' >  sestanBrainAtlasHg18Fixed.bed
  
  
      # use liftOver to convert the hg18 coordinates to hg19 directly.
      liftOver sestanBrainAtlasHg18Fixed.bed -bedPlus=8 \
        /hive/data/genomes/hg18/bed/liftOver10K/hg18ToHg19.over.chain.gz \
          sestanBrainAtlasHg19.bed unMapped
  
      # Check the result of liftOver:
      wc -l sestanBrainAtlasHg1*.bed
      # 877877 sestanBrainAtlasHg18.bed
      # 877469 sestanBrainAtlasHg19.bed
      cat unMapped | awk ' /^chr/ {print $1}' | wc -l
    # 408
    cat unMapped | awk ' /^#/ || /^chr/ {print $1, $2, $3}' \
         > summaryUnMapped.txt
  
      # Fix up the result, so that $11(blockSizes) = $8(thickEnd)  - $7
      # (thickStart) without the fix, the checkTableCoors will complain.
       cat sestanBrainAtlasHg19.bed | awk '{print  $1,$2,$3,$4,$5,$6,$7,$8,$9, \
          $10, $8-$7, $12,$13,$14,$15 }' >  sestanBrainAtlasHg19Fixed.bed
  
  
      # load the table
      hgLoadBed hg19 sestanBrainAtlas sestanBrainAtlasHg19Fixed.bed
      # Reading sestanBrainAtlasHg19.bed
      # Loaded 877469 elements of size 15
      # Sorted
      # Creating table definition for sestanBrainAtlas
      # Saving bed.tab
      # Loading hg19
  
      # track stuff done from hg18 days:
      # kent/src/hg/makeDb/trackDb/human/sestanBrainAtlas.html
      # kent/src/hg/makeDb/trackDb/human/trackDb.ra
  
  #############################################################################
  # Updating to patch2 sequence (DONE - 2010-08-18 - Hiram)
  #	Most of this business is encapsulated into .sh or .pl scripts in
  #	these directories.  They can be used next time with slight alterations.
      mkdir /hive/data/genomes/hg19/bed/additionalSequence/patches/patch_release_2
      cd /hive/data/genomes/hg19/bed/additionalSequence/patches/patch_release_2
      wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
          -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
  "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p2/"
  
      cd /hive/data/genomes/hg19/bed/additionalSequence/patches
      #	construct a script that can gather the names from the delivered
      #	files and generate UCSC names.  May be useful next time.
      cat << '_EOF_' > gatherNames.pl
  #!/usr/bin/env perl
  
  use strict;
  use warnings;
  
  sub usage() {
      print STDERR "usage: ./gatherNames.pl patch_release_2\n";
  }
  
  my $argc = scalar(@ARGV);
  
  if ($argc != 1) {
      usage;
      exit 255;
  }
  
  my $patchDir = shift;
  
  if ( ! -d $patchDir ) {
      print STDERR "ERROR: given directory $patchDir is not a directory or does not exist";
      usage;
      exit 255;
  }
  
  my %ctgToChr;
  my %ctgToFastaName;
  my $fasta = "$patchDir/PATCHES/alt_scaffolds/FASTA/alt.scaf.fa.gz";
  open (FH, "zcat $fasta | grep '^>' |")
  	or die "ERROR: can not read $fasta";
  while ( my $line = <FH> ) {
      chomp $line;
      my ($gi, $rest) = split('\s+',$line,2);
      my ($x, $acc, $y, $gl, $z) = split('\|', $gi);
      my $chr = $rest;
      $chr =~ s/Homo sapiens chromosome //;
      $chr =~ s/ genomic contig.*//;
      $ctgToChr{$gl} = $chr;
      $ctgToFastaName{$gl} = $gi;
      $ctgToFastaName{$gl} =~ s/\|$//;
      $ctgToFastaName{$gl} =~ s/^>//;
  #    printf "%s\t%s\n", $gl, $chr;
  }
  close (FH);
  
  my $placement = "$patchDir/PATCHES/alt_scaffolds/alt_scaffold_placement.txt";
  open (FH, "sort -t'\t' -k6,6n $placement|") or die "ERROR: can not read $placement";
  while (my $line = <FH>) {
      chomp $line;
      next if ($line =~ m/^\s*#/);
      my ($altAsmName, $primAsmName, $altScafName, $altScafAcc, $parentType,
  	$parentName, $parentAcc, $regionName, $ori, $altScafStart,
  	$altScafStop, $parentStart, $parentStop, $altStartTail, $altStopTail) =
  	split('\t',$line);
      my $chr = $ctgToChr{$altScafAcc};
      die "ERROR: chrom name here does not match: $chr != $parentName"
  	if ($chr ne $parentName);
  #    printf "chr%s %s\t%s\t%s\t%s\t%s\n", $chr, $parentName, $altScafAcc,
  #	$altScafName, $parentAcc, $regionName;
      my $ucscChrName = lc($altScafName);
      if ($ucscChrName =~ m/_patch$/) {
  	$ucscChrName = sprintf("chr%d_%s", $parentName, lc($altScafAcc));
  	$ucscChrName =~ s/\.1$//;
      } else {
  	$ucscChrName =~ s/^hs//;
  	$ucscChrName =~ s/_1$//;
  	my ($chrNum, $hapNum, $ctgName) = split('_', $ucscChrName, 3);
  	$ucscChrName = sprintf("%s_%s_%s", $chrNum, $ctgName, lc($altScafAcc));
  	$ucscChrName =~ s/\.1$//;
      }
      printf "%s %s %s %s\n", $ctgToFastaName{$altScafAcc}, $altScafName,
  	$parentAcc, $ucscChrName;
  }
  close (FH);
  '_EOF_'
      # << happy emacs
      chmod +x ./gatherNames.pl
      ./gatherNames.pl patch_release_2 > ucscNames.patch2.txt
  
      # with that name translation file in place, begin to put the sequences
      #	together:
      mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch2
      cd /hive/data/genomes/hg19/bed/additionalSequence/patch2
      cat << '_EOF_' > addSequence.pl
  #!/usr/bin/env perl
  
  use strict;
  use warnings;
  
  sub usage() {
      print STDERR "usage: ./addSequence.pl ../patches/ucscNames.patch2.txt \\\n";
      print STDERR "\t../patches/patch_release_2/PATCHES/alt_scaffolds/FASTA/alt.scaf.fa.gz\n";
  }
  
  my $argc = scalar(@ARGV);
  
  if ($argc != 2) {
      usage;
      exit 255;
  }
  
  my %skipSequence;
  $skipSequence{"chr9_gl339450"} = 1;
  $skipSequence{"chr5_ctg1_gl339449"} = 1;
  
  my $names = shift;
  my $fasta = shift;
  
  my %fastaToChrNames;
  open(FH, "<$names") or die "ERROR: can not read $names";
  while (my $line = <FH>) {
      chomp $line;
      my ($fa, $ctg, $cm, $chr) = split('\s+', $line);
      $fastaToChrNames{$fa} = $chr;
  }
  close (FH);
  
  open(PA, ">patch2.ucsc.fa") or die "ERROR: can not write to patch2.ucsc.fa";
  open(FH, "zcat $fasta|") or die "ERROR: can not zcat $fasta";
  my $skipToNext = 0;
  while (my $line = <FH>) {
      if ($line =~ m/^>/) {
  	my ($fa, $rest) = split('\s+', $line, 2);
  	$fa =~ s/\|$//;
  	$fa =~ s/^>//;
  	die "can not find $fa" if (!exists($fastaToChrNames{$fa}));
  	my $chr = $fastaToChrNames{$fa};
  	if (exists($skipSequence{$chr})) {
  	    $skipToNext = 1;
  	} else {
  	    printf PA ">%s\n", $chr;
  	    $skipToNext = 0;
  	}
      } else {
  	next if($skipToNext);
  	print PA $line;
      }
  }
  close (FH);
  close (PA);
  
  my $here=`pwd`;
  chomp $here;
  print `twoBitToFa ../patch1/hg19.patch1.2bit hg19.existing.fa`;
  print `faToTwoBit hg19.existing.fa patch2.ucsc.fa hg19.patch2.2bit`;
  print `rm -f /gbdb/hg19/hg19.patch2.2bit`;
  print `ln -s $here/hg19.patch2.2bit /gbdb/hg19`;
  print `twoBitInfo hg19.patch2.2bit stdout | sort -k2nr > patch2.chrom.sizes`;
  '_EOF_'
      # << happy emacs
      chmod +x addSequence.pl
      ./addSequence.pl ../patches/ucscNames.patch2.txt \
        ../patches/patch_release_2/PATCHES/alt_scaffolds/FASTA/alt.scaf.fa.gz
  
      sort patch2.chrom.sizes ../patch1/patch1.chrom.sizes | uniq -c \
          | sort -rn | awk '$1 == 1' | awk '{printf "%s\t%s\n", $2, $3}' \
          | sort -k2,2nr > patches.chrom.sizes
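    #	(lines appearing only once are unique to one of the two files;
    #	 since every patch1 sequence carries over into patch2, these
    #	 are exactly the sequences new in patch2)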
  
      cat << '_EOF_' > mkTables.pl
  #!/usr/bin/env perl
  
  use strict;
  use warnings;
  
  sub usage() {
      printf STDERR "usage: ./mkTables.pl patches.chrom.sizes \\\n";
      printf STDERR " ../patches/ucscNames.patch2.txt ../patches/patch_release_2/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz\n";
  }
  
  my $argc = scalar(@ARGV);
  
  if ($argc < 3) {
      usage;
      exit 255;
  }
  
  my %skipSequence;
  $skipSequence{"GL339449.1"} = 1;
  $skipSequence{"GL339450.1"} = 1;
  
  my $sizes = shift;	# patches.chrom.sizes
  my $names = shift;	# patches/ucscNames.txt
  my $agpFile = shift;	# alt.scaf.agp.gz
  
  my %glToChr;
  my %chrToCtg;
  my %fastaToChr;
  my %chrToSize;
  
  open(FH, "<$sizes") or die "can not read $sizes";
  while (my $line = <FH>) {
      chomp $line;
      my ($chr, $size) = split('\s+', $line);
      $chrToSize{$chr} = $size;
  }
  close (FH);
  
  open(CI, ">chromInfo.txt") or die "can not write to chromInfo.txt";
  open(CT, ">ctgPos.txt") or die "can not write to ctgPos.txt";
  open(FH, "<$names");
  while (my $line = <FH>) {
      chomp $line;
      my ($faName, $ctg, $cmName, $chr) = split('\s+', $line);
      $faName =~ s/.*gb.GL/GL/;
      next if (exists($skipSequence{$faName}));
      my $size = $chrToSize{$chr};
      if (exists($glToChr{$faName})) {
  	if ($glToChr{$faName} ne $chr) {
  	    printf STDERR "ERROR: contig name: $faName was chr name: $glToChr{$faName}\n";
  	    printf STDERR " now claiming to be chr name: $chr\n";
  	    exit 255;
  	}
      } else {
  	$glToChr{$faName} = $chr;
      }
      die "not defined faName" if (!defined($faName));
      die "not defined $faName $chr size" if (!defined($size));
      printf CT "%s\t%d\t%s\t0\t%d\n", $faName, $size, $chr, $size;
      printf CI "%s\t%d\t/gbdb/hg19/hg19.patch2.2bit\n", $chr, $size;
  }
  close (FH);
  close (CT);
  close (CI);
  
  my $prevObj = "";
  my $newIx = 1;
  open (GP,">gap.txt") or die "can not write to gap.txt";
  open (GL,">gold.txt") or die "can not write to gold.txt";
  open (FH,"zcat $agpFile|") or die "can not read $agpFile";
  while (my $line = <FH>) {
      next if ($line =~ m/^\s*#/);
      chomp $line;
      my ($object, $objStart, $objEnd, $ix, $type, $frag, $fragStart, $fragEnd, $strand) = split('\s+', $line);
      next if (exists($skipSequence{$object}));
      die "ERROR: can not find contig $object to chr name"
  	if (!exists($glToChr{$object}));
      $newIx = 1 if ($prevObj ne $object);
      my $chr = $glToChr{$object};
      if ($type eq "N") {
  	# frag is size, fragStart is type of gap, and fragEnd is bridged y/n
  	printf GP "%s\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n",
  	    $chr, $objStart-1, $objEnd, $newIx, $type, $frag, $fragStart,
  	    $fragEnd;
      } else {
  	printf GL "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n",
  	    $chr, $objStart-1, $objEnd, $newIx, $type, $frag, $fragStart-1,
  	    $fragEnd, $strand;
      }
      ++$newIx;
      $prevObj = $object;
      printf "%s\n", $line;
  }
  close (FH);
  close (GL);
  close (GP);
  '_EOF_'
      # << happy emacs
      chmod +x mkTables.pl
      ./mkTables.pl patches.chrom.sizes \
      	../patches/ucscNames.patch2.txt \
  	../patches/patch_release_2/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz
  
      hgLoadBed -noLoad -maxChromNameLength=14 \
          -sqlTable=/cluster/home/hiram/kent/src/hg/lib/agpFrag.sql \
          hg19 tGold gold.txt
      rm -f gold.tab
      mv bed.tab gold.tab
      hgLoadBed -noLoad -maxChromNameLength=14 \
          -sqlTable=/cluster/home/hiram/kent/src/hg/lib/gap.sql \
          hg19 tGap gap.txt
      rm -f gap.tab
      mv bed.tab gap.tab
  
      hgsql -e 'load data local infile "gap.tab" into table gap;' hg19
      hgsql -e 'load data local infile "gold.tab" into table gold;' hg19
      hgsql -e 'load data local infile "ctgPos.txt" into table ctgPos;' hg19
  
      cat << '_EOF_' > mkCtgPos2.pl
  #!/usr/bin/env perl
  
  use strict;
  use warnings;
  
  sub usage() {
      print STDERR "usage: ./mkCtgPos2.pl ../patches/ucscNames.patch2.txt \\\n";
      print STDERR "\tpatch2.chrom.sizes\n";
  }
  
  my $argc = scalar(@ARGV);
  
  if ($argc != 2) {
      usage;
      exit 255;
  }
  
  my %skipSequence;
  $skipSequence{"chr9_gl339450"} = 1;
  $skipSequence{"chr5_ctg1_gl339449"} = 1;
  
  my $names = shift;
  my $sizes = shift;
  
  my %chrSize;
  open(FH, "<$sizes") or die "ERROR: can not read $sizes";
  while (my $line = <FH>) {
      chomp $line;
      my ($chr, $size) = split('\s+', $line);
      $chrSize{$chr} = $size;
  }
  close (FH);
  
  open(FH, "<$names") or die "ERROR: can not read $names";
  while (my $line = <FH>) {
      chomp $line;
      my ($fa, $ctg, $cm, $chr) = split('\s+', $line);
      next if (exists($skipSequence{$chr}));
      if (exists($chrSize{$chr})) {
  	my $size = $chrSize{$chr};
  	printf "%s\t%d\t%s\t0\t%d\tF\n", $ctg, $size, $chr, $size;
      }
  }
  close (FH);
  '_EOF_'
      # << happy emacs
      chmod +x mkCtgPos2.pl
      ./mkCtgPos2.pl ../patches/ucscNames.patch2.txt patch2.chrom.sizes \
  	> ctgPos2.txt
      hgsql -e 'load data local infile "ctgPos2.txt" into table ctgPos2;' hg19
  
      cat << '_EOF_' > mkHapLocate.pl
  #!/usr/bin/env perl
  
  use strict;
  use warnings;
  
  sub usage() {
      print STDERR "usage: ./mkHapLocate.pl ctgPos.txt \\\n";
      print STDERR "\t../patches/patch_release_2/PATCHES/alt_scaffolds/alt_scaffold_placement.txt\n";
  }
  
  my $argc = scalar(@ARGV);
  
  if ($argc != 2) {
      usage;
      exit 255;
  }
  
  my %skipSequence;
  $skipSequence{"chr9_gl339450"} = 1;
  $skipSequence{"chr5_ctg1_gl339449"} = 1;
  
  my $ctgPos = shift;
  my $placement = shift;
  
  my %ctgToHap;
  open(FH, "<$ctgPos") or die "ERROR: can not read $ctgPos";
  while (my $line = <FH>) {
      my ($ctg, $size, $hapName, $rest) = split('\s+', $line, 4);
      $ctgToHap{$ctg} = $hapName;
  }
  close (FH);
  
  open(FH,"<$placement") or die "ERROR: can not read $placement";
  while (my $line = <FH>) {
      next if ($line =~ m/^#/);
      chomp $line;
      my ($altAsmName, $primAsmName, $altScafName, $altScafAcc, $parentType,
  	$parentName, $parentAcc, $regionName, $ori, $altScafStart,
  	$altScafStop, $parentStart, $parentStop, $altStartTail,
  	$altStopTail) = split('\t', $line);
      if (exists($ctgToHap{$altScafAcc})) {
  	my $hapName = $ctgToHap{$altScafAcc};
  	printf "chr%s\t%d\t%d\t%s\n", $parentName, $parentStart-1,
  	    $parentStop, $hapName;
      } else {
  	print STDERR "not found: $altScafAcc $altScafName\n";
      }
  }
  close (FH);
  '_EOF_'
      # << happy emacs
      chmod +x mkHapLocate.pl
      ./mkHapLocate.pl ctgPos.txt \
  ../patches/patch_release_2/PATCHES/alt_scaffolds/alt_scaffold_placement.txt \
      > haplotypeLocations.bed
      hgLoadBed -oldTable hg19 haplotypeLocations haplotypeLocations.bed
  
      mkdir simpleRepeat
      cd simpleRepeat
      ln -s ../patch2.ucsc.fa
      /cluster/bin/$MACHTYPE/trfBig -trf=/cluster/bin/$MACHTYPE/trf \
        patch2.ucsc.fa /dev/null -bedAt=patch2.ucsc.bed -tempDir=.
      awk '$5 <= 12' patch2.ucsc.bed > trfMask.bed
      splitFileByColumn trfMask.bed trfMaskChrom/
      hgLoadBed -oldTable hg19 simpleRepeat patch2.ucsc.bed \
          -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
  
      mkdir ../RMRun
      cd ../RMRun
      ln -s ../patch2.ucsc.fa
      faSplit byname patch2.ucsc.fa ctgs/
  
      cat << '_EOF_' > runOne
  #!/bin/csh -fe
  
  set ctg = $1
  
  set runDir = /hive/data/genomes/hg19/bed/additionalSequence/patch2/RMRun
  set src = ${runDir}/ctgs/${ctg}.fa
  mkdir -p ${runDir}/out/${ctg}
  cd ${runDir}/out/${ctg}
  
  cp -p ${src} .
  /scratch/data/RepeatMasker/RepeatMasker -align -s -species 'Homo sapiens' ${ctg}.fa
  rm -f ${ctg}.fa
  '_EOF_'
      # << happy emacs
      chmod +x runOne
  
      ls ctgs > ctg.list
      cat << '_EOF_' > template
  #LOOP
  runOne $(root1) {check out line+ out/$(root1)/$(root1).fa.out}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
      gensub2 ctg.list single template jobList
      ssh swarm
      cd /hive/data/genomes/hg19/bed/additionalSequence/patch2/RMRun
      para create jobList
      mkdir out
      para try
      para push
  # Completed: 68 of 68 jobs
  # CPU time in finished jobs:      88730s    1478.83m    24.65h    1.03d  0.003 y
  # IO & Wait Time:                  6277s     104.62m     1.74h    0.07d  0.000 y
  # Average job time:                1397s      23.29m     0.39h    0.02d
  # Longest finished job:            4154s      69.23m     1.15h    0.05d
  # Submission to last job:          4291s      71.52m     1.19h    0.05d
  
      find ./out -type f -name "*.fa.out" | head -1 | xargs head -3 \
  	> hg19.patch2.out
      find ./out -type f -name "*.fa.out" | xargs -L 1 headRest 3 \
  	| sort -k5,5 -k6,6n | sed -e "s/  *$//" >> hg19.patch2.out
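    #	(keep a single copy of the 3-line RepeatMasker header from the
    #	 first .out file, then append the header-stripped (headRest 3)
    #	 annotation lines from every file, sorted by position)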
  
      extractNestedRepeats.pl hg19.patch2.out > hg19.nestedRepeats.patch2.bed
      hgLoadBed -noLoad -maxChromNameLength=14 \
          -sqlTable=$HOME/kent/src/hg/lib/nestedRepeats.sql \
          hg19 tNest hg19.nestedRepeats.patch2.bed
      rm -f hg19.nestedRepeats.patch2.tab
      mv bed.tab hg19.nestedRepeats.patch2.tab
  
      hgLoadOut -tabFile=hg19.patch2.rmsk.tab -nosplit hg19 hg19.patch2.out
  
      hgsql -e 'load data local infile "hg19.nestedRepeats.patch2.tab" into table nestedRepeats;' hg19
      hgsql -e 'load data local infile "hg19.patch2.rmsk.tab" into table rmsk;' \
  	hg19
  
      mv hg19.patch2.2bit hg19.patch2.0.2bit
      twoBitMask -add hg19.patch2.0.2bit RMRun/hg19.patch2.out hg19.patch2.1.2bit
      twoBitMask -add hg19.patch2.1.2bit \
          simpleRepeat/trfMask.bed hg19.patch2.2bit
      twoBitToFa hg19.patch2.2bit stdout | faSize stdin \
  	> faSize.hg19.patch2.2bit.txt
  
  #############################################################################
  # new blat server for the hg19.patch2 sequence (DONE - 2010-08-18 - Hiram)
      hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
  	VALUES ("hg19", "blat4", "17792", "1", "0"); \
  	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
  	VALUES ("hg19", "blat4", "17793", "0", "1");' \
  	    hgcentraltest
  
  #############################################################################
  # establish some liftover chains from reference sequence to new added sequence
  #	(WORKING - 2010-08-19 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch2/blat.2010-08-11
      cd /hive/data/genomes/hg19/bed/additionalSequence/patch2/blat.2010-08-11
      twoBitInfo ../hg19.patch2.2bit stdout | sort > patch2.chrom.sizes
      twoBitInfo ../../../../hg19.2bit stdout | sort > hg19.chrom.sizes
      comm -13 hg19.chrom.sizes patch2.chrom.sizes > new.sequence.chrom.sizes
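    #	(comm -13 prints lines found only in the second file, i.e. the
    #	 sequence names new to the patched 2bit)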
  for S in `awk '{print $1}' new.sequence.chrom.sizes`
  do
      echo $S
      mkdir -p $S
      twoBitToFa ../hg19.patch2.2bit:${S} ${S}/${S}.fa
  done
  
      faToTwoBit chr*/chr*.fa hg19.patch2.only.2bit
      rm -fr chr*/chr*.fa
      rmdir chr*
  
      ssh swarm
      cd /hive/data/genomes/hg19/bed/additionalSequence/patch2/blat.2010-08-11
      cat << '_EOF_' > runOne
  #!/bin/csh -ef
  
  set hg19 = "/hive/data/genomes/hg19/bed/additionalSequence/patch2/hg19.patch2.2bit"
  set runJob = `pwd`/job.csh
  set target = $1
  set outPsl = $2
  set query = `echo $target | sed -e "s/_.*//"`
  set qSize = `twoBitInfo ${hg19}:${query} stdout | awk '{print $2}'`
  set tSequence = "${hg19}:${target}"
  set qSequence = "${hg19}:${query}:0-${qSize}"
  mkdir -p psl/${target}
  pushd psl/${target}
  # echo "${runJob} ${tSequence} ${qSequence} `pwd`/${target}.psl"
  ${runJob} ${tSequence} ${qSequence} `pwd`/${target}.psl
  popd
  '_EOF_'
      # << happy emacs
      chmod +x runOne
  
      cat << '_EOF_' > job.csh
  #!/bin/csh -ef
  
  set targetList = $1
  set queryListIn = $2
  set outPsl = $3
  
  if ($targetList:e == "lst") set targetList = /hive/data/genomes/hg19/bed/blat.hg18.2010-07-26/run.blat/$targetList
  if ($queryListIn:e == "lst") set queryListIn = /hive/data/genomes/hg19/bed/blat.hg18.2010-07-26/run.blat/$queryListIn
  
  # Use local disk for output, and move the final result to $outPsl
  # when done, to minimize I/O.
  set tmpDir = `mktemp -d -p /scratch/tmp doSame.blat.XXXXXX`
  pushd $tmpDir
  
  # We might get a .lst or a 2bit spec here -- convert to (list of) 2bit spec:
  if ($queryListIn:e == "lst") then
    set specList = `cat $queryListIn`
  else
    set specList = $queryListIn
  endif
  
  # Further partition the query spec(s) into 5k coord ranges, building up
  # a .lst of 2bit specs for blat and a .lft liftUp spec for the results:
  cp /dev/null reSplitQuery.lst
  cp /dev/null query.lft
  foreach spec ($specList)
    set file  = `echo $spec | awk -F: '{print $1;}'`
    set seq   = `echo $spec | awk -F: '{print $2;}'`
    set range = `echo $spec | awk -F: '{print $3;}'`
    set start = `echo $range | awk -F- '{print $1;}'`
    set end   = `echo $range | awk -F- '{print $2;}'`
    if (! -e q.sizes) twoBitInfo $file q.sizes
    set seqSize = `awk '$1 == "'$seq'" {print $2;}' q.sizes`
    set chunkEnd = '0'
    while ($chunkEnd < $end)
      set chunkEnd = `expr $start + 5000`
      if ($chunkEnd > $end) set chunkEnd = $end
      set chunkSize = `expr $chunkEnd - $start`
      echo $file\:$seq\:$start-$chunkEnd >> reSplitQuery.lst
      if (($start == 0) && ($chunkEnd == $seqSize)) then
        echo "$start	$seq	$seqSize	$seq	$seqSize" >> query.lft
      else
        echo "$start	$seq"":$start-$chunkEnd	$chunkSize	$seq	$seqSize" >> query.lft
      endif
      set start = `expr $chunkEnd - 500`
    end
  end
  
  # Align unsplit target sequence(s) to .lst of 2bit specs for 5k chunks
  # of query:
  blat $targetList reSplitQuery.lst tmpUnlifted.psl \
    -tileSize=11 -minScore=100 -minIdentity=98 -fastMap -noHead
  
  # Lift query coords back up:
  liftUp -pslQ -nohead tmpOut.psl query.lft warn tmpUnlifted.psl
  
  # Move final result into place:
  mv tmpOut.psl $outPsl
  
  popd
  rm -rf $tmpDir
  '_EOF_'
      # << happy emacs
      chmod +x job.csh
  
      cat << '_EOF_' > template
  #LOOP
  runOne $(root1) {check out line+ psl/$(root1)/$(root1).psl}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      mkdir psl
      awk '{print $2}' new.sequence.chrom.sizes > target.list
      gensub2 target.list single template jobList
      para create jobList
      para try ... check ... push
      para time
  # Completed: 71 of 71 jobs
  # CPU time in finished jobs:      29233s     487.22m     8.12h    0.34d  0.001 y
  # IO & Wait Time:                  2567s      42.78m     0.71h    0.03d  0.000 y
  # Average job time:                 448s       7.46m     0.12h    0.01d
  # Longest finished job:            3757s      62.62m     1.04h    0.04d
  # Submission to last job:          4070s      67.83m     1.13h    0.05d
  
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/additionalSequence/patch2/blat.2010-08-11
  for C in `cat target.list`
  do
      echo ${C}
    mkdir -p chain/${C}
      cd chain/${C}
      ln -s ../../psl/${C}/${C}.psl
      axtChain -linearGap=medium -psl ${C}.psl ../../../hg19.patch2.2bit ../../../hg19.patch2.2bit ${C}.chain
      cd ../..
  done
  
      cd chain
      find . -type f | xargs cat | chainSort stdin patch2.sort.chain
      hgLoadChain hg19 patch2Chain patch2.sort.chain
      #	Loading 93897 chains into hg19.patch2Chain
  
  #########################################################################
  # Vega gene update (DONE - 2010-08-25 - Hiram)
      #	lookup version number at the Vega WEB site:
      #	http://vega.sanger.ac.uk/index.html
      #	and FTP site:
      #	ftp://ftp.sanger.ac.uk/pub/vega/
      cd /hive/data/genomes/hg19
      #	step wise to verify operation
      doEnsGeneUpdate.pl -vegaGene -ensVersion=39 -stop=download hg19.ensGene.ra
      # they changed their naming convention again.  Look at the FTP site,
      #	fix the download script:
      cd bed/vega.39/download
      ./doDownload.csh
  # -rw-rw-r-- 1 12399394 Aug 25 09:15 gtf_file.gz
  # -rw-rw-r-- 1  9329992 Aug 25 09:15 Homo_sapiens.VEGA.39.pep.all.fa.gz
  
      doEnsGeneUpdate.pl -vegaGene -ensVersion=39 \
  	-continue=process -stop=process hg19.ensGene.ra
  # genePredCheck -db=hg19 vegaPseudo.gp.gz
  # checked: 12012 failed: 0
  # genePredCheck -db=hg19 not.vegaPseudo.gp.gz
  # checked: 103437 failed: 0
  # genePredCheck -db=hg19 hg19.allGenes.gp.gz
  # checked: 115449 failed: 0
  
      doEnsGeneUpdate.pl -vegaGene -ensVersion=39 \
  	-continue=load -stop=load hg19.ensGene.ra
      featureBits hg19 vegaGene
      # 78097231 bases of 2911519270 (2.682%) in intersection
      featureBits hg19 vegaPseudoGene
      #	8782198 bases of 2911519270 (0.302%) in intersection
  
  #########################################################################
  # FOSMID END PAIRS (STARTED 9/1/10 angie, WORKING - 2011-07-07 - Hiram)
      # First I downloaded raw files from NCBI, to see if they are newer
      # than what we have in fosends.3:
      mkdir /hive/data/outside/fosends.4
      cd /hive/data/outside/fosends.4
      wget --timestamping ftp://ftp.ncbi.nih.gov/genomes/FOSMIDS/homo_sapiens/\*
      # The file dates are 2005, but all log files begin with 2002 dates.
      # Good, we can proceed with fosends.3 and won't have to reverse-engineer
      # Terry's lost pipeline for translating the files.
      # take a look at the names
      zcat Hs*.dr.mfa.gz | grep "^>" > sequence.names
      zcat *.trim.log.gz | cut -f6 | sort > clone.names
      # we are going to translate the .T0 .T1 .T2 suffix on the sequence names
      #	to _T0 _T1 _T2 to avoid problems during blat and other parts of
      #	the pipeline
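    #	e.g. (illustrative) a fasta header such as
    #	">gnl|FOSMIDS|999999 G248P84602FB12.T0" becomes ">G248P84602FB12_T0"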
      # using the *.trim.log.gz files, figure out the pairs, write to
      #	files endPairs.txt and singles.txt
      cat << '_EOF_' > pairEnds.pl
  #!/bin/env perl
  
  use strict;
  use warnings;
  
  my %endIds;	# key is cloneId.subId, value is endId_F,endId_R
  open(FH,"zcat Hs.*trim.log.gz|") or die "can not read Hs.*trim.log.gz";
  while (my $line = <FH>) {
      next if ($line =~ m/^#/);
      my ($id, $l0, $l1, $endId, $wibr, $cloneId, $fr, $cId, $l2) =
  	split('\s+', $line);
      my ($oneEnd, $subId) = split('\.', $endId);
      my $key = "$cloneId.$subId";
      if (exists($endIds{$key})) {
  	die "ERROR: third end: $endId $cloneId $endIds{$key}"
  		if ($endIds{$key} =~ m/,/);
  	if ($fr eq "F") {
  	    $endIds{$key} = "$endId,$endIds{$key}";
  	} else {
  	    $endIds{$key} = "$endIds{$key},$endId";
  	}
      } else {
  	$endIds{$key} = $endId;
      }
  }
  close (FH);
  
  open (EP, ">endPairs.txt") or die "can not write to endPairs.txt";
  open (SI, ">singles.txt") or die "can not write to singles.txt";
  foreach my $key (sort (keys %endIds)) {
      my ($cloneId, $subId) = split('\.', $key);
      if ($endIds{$key} =~ m/,/) {
  	my ($fwd, $rev) = split(',', $endIds{$key});
  	$fwd =~ s/\./_/;
  	$rev =~ s/\./_/;
  	printf EP "%s\t%s\t%s\n", $fwd, $rev, $cloneId;
      } else {
  	my $fwd = $endIds{$key};
  	$fwd =~ s/\./_/;
  	printf SI "%s\t%s\n", $fwd, $cloneId;
      }
  }
  close(EP);
  close(SI);
  '_EOF_'
      # << happy emacs
      chmod +x pairEnds.pl
      ./pairEnds.pl
  
      # working in the hg19 build directory
      mkdir /hive/data/genomes/hg19/bed/fosEndPairs
      cd /hive/data/genomes/hg19/bed/fosEndPairs
  
      mkdir /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds
      cd /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds
      # fixup the names, and upper case all sequence (the names are already uc)
      zcat /hive/data/outside/fosends.4/*.dr.mfa.gz \
  	| sed -e "s/^>gnl.* />/; s/\.T/_T/" | tr '[a-z]' '[A-Z]' > fosends.4.fa
      faCount fosends.4.fa > fosends.4.faCount
      tail -1 fosends.4.faCount
  # total   1258791003      329912804       274295722       339230145       279976011       35376321        40508700
      # verify nothing broken, nothing lost with the name transition
      faCount /hive/data/outside/fosends.4/*.mfa.gz > /tmp/fs4.faCount
      tail -1 /tmp/fs4.faCount
  # total   1258791003      329912804       274295722       339230145       279976011       35376321        40508700
  
      faSize fosends.4.fa
  # 1258791003 bases (35376321 N's 1223414682 real 1223414682 upper 0 lower) in 1615492 sequences in 1 files
  # Total size: mean 779.2 sd 190.8 min 12 (G248P84602FB12_T0) max 3784 (G248P89620RG7_T0) median 722
  
      mkdir splitEnds
    faSplit sequence fosends.4.fa 400 splitEnds/fosEnds
  
      # figure out break points in hg19 around large gaps
      hgsql -N -e "select chrom,chromStart,chromEnd,size from gap;" hg19 \
  	| sort -k4nr > hg19.gap.bed
      # script to combine adjacent gaps into single gaps
      cat << '_EOF_' > bedCollapse.pl
  #!/bin/env perl
  
  use strict;
  use warnings;
  
  my $argc = scalar(@ARGV);
  
  if ($argc < 1) {
      printf STDERR "usage: ./bedCollapse.pl <file.bed>\n";
      printf STDERR "will combine adjacent bed elements into one element\n";
      exit 255
  }
  
  my $file = shift;
  my $chr = "";
  my $prevEnd = 0;
  my $start = 0;
  my $end = 0;
  my $size = 0;
  open (FH, "sort -k1,1 -k2,2n $file|") or die "can not read $file";
  while (my $line = <FH>) {
      chomp $line;
      my ($c, $s, $e, $rest) = split('\s+', $line, 4);
      $size = $end - $start;
      if (length($chr) > 1) {
          if ($chr ne $c) {
              printf "%s\t%d\t%d\t%d\n", $chr, $start, $end, $size;
              $chr = $c; $start = $s; $end = $e;
          } else {
              if ($s == $end) {
                  $end = $e;
              } else {
                  printf "%s\t%d\t%d\t%d\n", $chr, $start, $end, $size;
                  $chr = $c; $start = $s; $end = $e;
              }
          }
      } else {
          $chr = $c; $start = $s; $end = $e;
      }
  }
  printf "%s\t%d\t%d\t%d\n", $chr, $start, $end, $size;
  close (FH);
  '_EOF_'
      # << happy emacs
      chmod +x bedCollapse.pl
      # filter out for gaps of 50,000 and larger
      ./bedCollapse.pl hg19.gap.bed | awk '$4 > 49999' > hg19.gapBreaks.bed
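    # for illustration (hypothetical numbers), two touching input elements
    #	chr1	100	200	100
    #	chr1	200	350	150
    # come out of bedCollapse.pl as the single element:
    #	chr1	100	350	250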
      cat << '_EOF_' > hg19SplitSpec.pl
  #!/bin/env perl
  
  use strict;
  use warnings;
  
  my %chromSizes;
  open (FH, "<../../../chrom.sizes") or die "can not read ../../../chrom.sizes";
  while (my $line = <FH>) {
      chomp $line;
      my ($chr, $size) = split('\s+', $line);
      $chromSizes{$chr} = $size;
  }
  close (FH);
  
  my %chrDone;	# to measure which chroms have been done
  my $curChr = "";
  my $start = 0;
  my $end = 0;
  open (FH, "grep -v '_' hg19.gapBreaks.bed|") or die "can not grep hg19.gapBreaks.bed";
  while (my $line = <FH>) {
      chomp $line;
      my ($c, $s, $e, $rest) = split('\s+', $line, 4);
      if (length($curChr) > 0) {
  	if ($c eq $curChr) {
  	    $end = $s;
  	    my $size = $end - $start;
  	    die "ERROR: size is zero ? $c $s $e" if ($size < 1);
  	    printf "%s\t%d\t%d\t%d\n", $curChr, $start, $end, $size;
  	    $chrDone{$curChr} = 1;
	} else {	# finish off previous chrom
	    my $chrSize = $chromSizes{$curChr};
	    if ($start < $chrSize) {
		my $size = $chrSize - $start;
		printf "%s\t%d\t%d\t%d\n", $curChr, $start, $chrSize, $size;
		$chrDone{$curChr} = 1;
	    }
	    $curChr = $c;
	    # emit the new chrom's leading region before its first large gap
	    if ($s > 0) {
		printf "%s\t0\t%d\t%d\n", $curChr, $s, $s;
		$chrDone{$curChr} = 1;
	    }
	}
      } else {  # first line in the file
  	$curChr = $c;
  	if ($s > 0) {
  	    printf "%s\t0\t%d\t%d\n", $curChr, $s, $s;
  	    $chrDone{$curChr} = 1;
  	}
      }
      $start = $e;	# next start is this end
      $end = $start;	# next end will be next start or chrSize
  }
close(FH);
# finish off the last chrom in the file
if (length($curChr) > 0 && $start < $chromSizes{$curChr}) {
    printf "%s\t%d\t%d\t%d\n", $curChr, $start, $chromSizes{$curChr},
	$chromSizes{$curChr} - $start;
}
  '_EOF_'
      # << happy emacs
      chmod +x hg19SplitSpec.pl
      ./hg19SplitSpec.pl > hg19.splits.bed
  
      cd /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds
      # 4M chunks might work a bit better
      ./partitionBed.pl 4000000 10000 hg19.splits.bed > hg19.4M.10K.bed
      mkdir /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds/run4M
      cd /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds/run4M
      awk '{printf "%s:%d-%d\n", $1,$2,$3}' ../hg19.4M.10K.bed > hg19.list
      ls ../../splitEnds | sed -e "s/.fa//" > fosEnds.list
  
      # XXX this didn't work with -fastMap, too much of the query was knocked
      #	out and the results wouldn't pass the pslReps filter
      #  Need to run blat without any arguments, filter the results
      #	with pslReps
      cat << '_EOF_' > runOne
  #!/bin/csh -fe
  
  set t=$1
  set q=$2
  set c=`echo $t | sed -e 's/:.*//'`
  set ctgStart=`echo $t | sed -e 's/.*://; s/-.*//'`
  set ctgSize=`echo $t | sed -e 's/:/ /; s/-/ /;' | awk '{printf "%d", $3-$2}'`
  set chrSize=`egrep "^$c " /scratch/data/hg19/chrom.sizes | cut -f2`
  set result="psl/${t}/${q}.psl"
  /bin/mkdir -p "psl/${t}"
  set tmpLift="/scratch/tmp/${t}.${q}.lift"
  set tResult="/scratch/tmp/${t}.${q}.psl"
  echo $ctgStart $t $ctgSize $c $chrSize > ${tmpLift}
  
  blat /scratch/data/hg19/hg19.2bit:${t} ../../splitEnds/${q}.fa stdout \
      | liftUp -type=.psl ${tResult} ${tmpLift} error stdin
  pslReps  -nearTop=0.01 -minCover=0.70 -minAli=0.85 -noIntrons ${tResult} \
                          ${result} /dev/null
  /bin/rm -f ${tmpLift} ${tResult}
  '_EOF_'
      # << happy emacs
      chmod +x runOne
  
      ssh swarm
      cd /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds/run4M
      gensub2 hg19.list fosEnds.list template jobList
      #	340,470 jobs
      para create jobList
      para try
      para check ... push ... etc
  # Completed: 340470 of 340470 jobs
  # CPU time in finished jobs:  306516290s 5108604.83m 85143.41h 3547.64d  9.720 y
  # IO & Wait Time:              22550855s  375847.59m  6264.13h  261.01d  0.715 y
  # Average job time:                 967s      16.11m     0.27h    0.01d
  # Longest finished job:           18257s     304.28m     5.07h    0.21d
  # Submission to last job:        572370s    9539.50m   158.99h    6.62d
  
      cd /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds/run4M
      time pslSort dirs raw.psl /scratch/tmp psl/ch* > sort.log 2>&1 &
      # 340470 files in 873 dirs
      # Got 340470 files 583 files per mid file
      #	real    291m28.005s
  # -rw-rw-r--   1 11527396213 Jul 13 14:53 raw.psl
  
  
      # and now that all those individual results are together, filter all
      # of them
      cd /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds
      time pslReps  -nearTop=0.01 -minCover=0.70 -minAli=0.85 -noIntrons \
  	run4M/raw.psl hg19.fosEnds.psl /dev/null
      #	Processed 64042919 alignments
      #	real    11m9.402s
      #	-rw-rw-r-- 1   299504257 Jul 13 15:18 hg19.fosEnds.psl
  
      # FYI: to see all minimum covers:
  grep "^[0-9]" raw.psl \
  	| awk '{printf "%.2f\n", 100*($1+$3)/$11}' | ave stdin
  # Q1 7.320000
  # median 8.040000
  # Q3 11.690000
  # average 13.519178
  # min 0.980000
  # max 100.000000
  # count 627281022
  # total 8480323865.619440
  # standard deviation 13.540966
  
      cd /hive/data/genomes/hg19/bed/fosEndPairs
      time /cluster/home/hiram/bin/x86_64/pslPairs \
      -tInsert=5000 -minId=0.94 -noBin -min=30000 -max=50000 -slop -short \
  	-long -orphan -mismatch -verbose mapEnds/hg19.fosEnds.psl \
  	/hive/data/outside/fosends.4/endPairs.txt all_fosends hg19.fosEnds
      #	real    0m10.042s
      # filter for score over 300
      awk '$5 >= 300' hg19.fosEnds.pairs | sort -k1,1 -k2,2n \
  	> hg19.fosEndPairs.bed
      wc -l hg19.fosEnds.pairs hg19.fosEndPairs.bed
      #	230791 hg19.fosEnds.pairs
      #	230583 hg19.fosEndPairs.bed
  
      awk '$5 >= 300' hg19.fosEnds.slop hg19.fosEnds.short hg19.fosEnds.long \
  	hg19.fosEnds.mismatch hg19.fosEnds.orphan | sort -k1,1 -k2,2n \
  	> hg19.fosEndPairsBad.bed
  
      # for all names in the hg19.fosEndPairs.bed and hg19.fosEndPairsBad.bed
      # files, extract those names from the mapEnds/hg19.fosEnds.psl file
      # to construct a psl file to load up to represent all ends
      awk '{print $11}' hg19.fosEndPairs.bed hg19.fosEndPairsBad.bed \
  	| tr '[,]' '[\n]' | sort -u > hg19.allEnds.names
    # the '\t' here actually needs to be a literal tab character:
    #	type Ctrl-v then Tab to put a real tab in the command
      headRest 5 mapEnds/hg19.fosEnds.psl | sort -k10 \
  	| join -t '\t' -1 10 -2 1 \
  -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15,1.16,1.17,1.18,1.19,1.20,1.21 - hg19.allEnds.names | sort -k 14,14 -k 15,15n \
  	> hg19.fosEnds.load.psl
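    #	(in bash, join -t$'\t' is an equivalent way to supply a literal tab)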
  
      sed -e "s/fosEndPairs/hg19FosEndPairs/" \
  	$HOME/kent/src/hg/lib/fosEndPairs.sql > hg19FosEndPairs.sql
      sed -e "s/all_fosends/hg19AllFosEnds/" hg19.fosEndPairs.bed \
  	| sort -u | hgLoadBed -notItemRgb hg19 hg19FosEndPairs stdin \
  	    -sqlTable=hg19FosEndPairs.sql
      # Loaded 229708 elements
      # why so few compared to before:
      hgsql -e "select count(*) from fosEndPairs" hg19
      #	384442
      wc -l /hive/data/genomes/hg17/bed/fosends/fosEndPairs.bed
      #	384558 /hive/data/genomes/hg17/bed/fosends/fosEndPairs.bed
      # looking at one of those pairs that did not map:
      # G248P800001RA7 G248P800001FA7
      # compared to /hive/data/outside/fosends.2/fosEnds.fa
      # and /hive/data/outside/fosends.3/fosEnds.fa
      # the end sequences in those old files is shorter than the new ones
      # those shorter sequences more easily pass the minCover filter
  
      # note - this track isn't pushed to RR, just used for assembly QA
      sed -e "s/fosEndPairsBad/hg19FosEndPairsBad/" \
                   ~/kent/src/hg/lib/fosEndPairsBad.sql > hg19FosEndPairsBad.sql
  
      sed -e "s/all_fosends/hg19AllFosEnds/" hg19.fosEndPairsBad.bed \
  	| sort -u | hgLoadBed -notItemRgb hg19 hg19FosEndPairsBad \
  	    hg19.fosEndPairsBad.bed stdin -sqlTable=hg19FosEndPairsBad.sql
      #	Loaded 198665 elements of size 11
      # why do we have so many more ?
      wc -l /hive/data/genomes/hg17/bed/fosends/fosEndPairsBad.bed
      #	30830 /hive/data/genomes/hg17/bed/fosends/fosEndPairsBad.bed
  
      time hgLoadPsl hg19 -table=hg19AllFosEnds hg19.fosEnds.load.psl
      #	load of hg19AllFosEnds did not go as planned: 1160639 record(s),
      #	0 row(s) skipped, 229 warning(s) loading psl.tab
      #	real    0m35.096s
      #	with some warnings such as:
  # Warning 1264 Out of range value adjusted for column 'qBaseInsert' at row 3053
  # Warning 1264 Out of range value adjusted for column 'qBaseInsert' at row 10569
  # Warning 1264 Out of range value adjusted for column 'qBaseInsert' at row 13642
  # ...
  
  # the sequences are already loaded from the hg18 lift:
      grep "^>" /gbdb/hg19/fosends/fosEnds.fromHg18.fa | sed -e 's/>//' \
  	| sort > hg18.fosEnds.names
      # but they are not the same names:
      grep "^>" mapEnds/fosends.4.fa | sed -e 's/>//' \
  	| sort > hg19.fosEnds.names
      sed -e 's/_T.//' hg19.fosEnds.names | sort -u > hg19.unique.fosEnd.names
      wc -l hg18.fosEnds.names hg19.unique.fosEnd.names
      #	1087670 hg18.fosEnds.names
      #	1567737 hg19.unique.fosEnd.names
      comm -12 hg18.fosEnds.names hg19.unique.fosEnd.names | wc -l
      #	1087670
  
      grep "^>" /hive/data/outside/fosends.3/fosEnds.fa \
  	| sed -e 's/>//' | sort > fosends.3.names
      zcat /hive/data/outside/fosends.4/Hs*.dr.mfa.gz | grep "^>" \
  	| awk '{print $2}' | sort > fosends.4.names
  
      zcat /hive/data/outside/fosends.4/Hs.WGS*.dr.mfa.gz \
         | sed -e "s/^>gnl.* />/; s/\.T/_T/" > fosEnds.4.fa
  
      mkdir /gbdb/hg19/fosends
    ln -s /hive/data/genomes/hg19/bed/fosEndPairs/fosEnds.4.fa \
  	/gbdb/hg19/fosends/hg19FosEndPairs.fa
      time hgLoadSeq hg19 /gbdb/hg19/fosends/hg19FosEndPairs.fa
      #	1615492 sequences
      #	real    1m26.084s
  
  ##############################################################################
  # hg18 <-> hg19 difference tracks (DONE - 2010-09-03 - Hiram)
      # this is documented in hg18.txt, same heading as above
      mkdir /hive/data/genomes/hg18/bed/liftOverHg19
      cd /hive/data/genomes/hg18/bed/liftOverHg19
  
  #############################################################################
  # HGDP GEOGRAPHIC SNP MAPS (DONE 9/15/10 angie)
      # Project data downloaded and parsed in /hive/data/outside/hgdpGeo,
      # see makeDb/doc/hgdpGeo.txt.
      mkdir /hive/data/genomes/hg19/bed/hgdpGeo
      cd /hive/data/genomes/hg19/bed/hgdpGeo
      # Make an rsId-sorted snp coords file for joining with the hgdpGeo data.
      grep -Fwf /hive/data/outside/hgdpGeo/rsIDs.lst \
        ../snp131/snp131.bed \
      | awk 'BEGIN{OFS="\t";} {print $4, $1, $2, $3, $8;}' \
      | sort > snp131CoordsAndRef.txt
      # How many distinct SNPs in there?  (compare to 657000 from HGDP):
      cut -f 1 snp131CoordsAndRef.txt | uniq | wc -l
  #656332
      # Join files to make a track table -- well, first we'll need to
      # normalize alleles to the + strand:
      join -e ERROR -t'	' -o 1.2,1.3,1.4,1.1,2.2,2.3,2.4,1.5 \
        snp131CoordsAndRef.txt /hive/data/outside/hgdpGeo/hgdpGeoCoordless.txt \
      | sed -re 's/([AGTC])\*/\1/' \
      | sort -k1,1 -k2n,2n \
        > hgdpGeo.fixme
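    # (the join -o spec writes: chrom, start, end, rsId, ancestral allele,
    #	derived allele, freqs, snp131 reference allele -- the column order
    #	fixAlleles.pl below expects)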
      wc -l hgdpGeo.fixme
  #667392 hgdpGeo.fixme
      # Use the snp131 reference allele to detect when we need to rev-comp
      # the alleles to match the + strand.  Also, throw out SNPs for which
      # the ref allele is multi-base -- it's questionable whether we're giving
      # the right coords (some funny things happen with dbSNP's clustering...):
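    # e.g. (illustrative): if snp131's + strand reference allele is A but
    # HGDP reports T (ancestral) / C (derived), neither allele matches the
    # reference, so both are reverse-complemented to A / G.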
      cat > fixAlleles.pl <<'_EOF_'
  #!/usr/bin/env perl
  use warnings;
  use strict;
  my %rc = ('A' => 'T', 'C' => 'G', 'G' => 'C', 'T' => 'A');
  while (<>) {
    chomp;  my ($c, $s, $e, $rs, $ancAl, $derAl, $freqs, $ref) = split;
    next unless ($ref =~ /^[ACGT]$/);
    if ($ancAl ne $ref && $derAl ne $ref) {
      $ancAl = $rc{$ancAl};
      $derAl = $rc{$derAl};
    }
    print join("\t", $c, $s, $e, $rs, $ancAl, $derAl, $freqs) . "\n";
  }
  '_EOF_'
      # << emacs
      chmod a+x fixAlleles.pl
      ./fixAlleles.pl hgdpGeo.fixme > hgdpGeo.tab
      wc -l hgdpGeo.tab
  #667349 hgdpGeo.tab
      hgLoadBed hg19 hgdpGeo hgdpGeo.tab \
        -sqlTable=$HOME/kent/src/hg/lib/hgdpGeo.sql
  #Loaded 667349 elements of size 7
  
  
  ###########################################################################
  # RECOMBINATION RATES (DONE 2010-08-26 - Chin)
  
# The STS Markers track must be completed prior to creating this track
  
      ssh kkstore02
      cd /hive/data/genomes/hg19/bed
      mkdir -p recombRate
      cd recombRate
  
  # Copy other necessary files here (in future, can take from previous
  # version)
  # NOTE: these are stable, and could be saved in a permanent spot
  
      cp -p /projects/hg2/booch/psl/info/decode_all .
      cp -p /projects/hg2/booch/psl/info/marshfield_all .
      cp -p /projects/hg2/booch/psl/info/genethon_all .
  
# Compared these 3 files with the 3 files of hg17; they are identical.
  
  # Determine maximum concordant set of markers for each of the maps
      /cluster/bin/scripts/assignGPsts -full -maxcon \
          /hive/data/outside/ncbi/sts.11/stsAlias.bed \
          /hive/data/genomes/hg19/bed/sts/stsMarkers_pos.rdb \
          decode_all > decode.marker.rdb
      /cluster/bin/scripts/assignGPsts -full -maxcon \
          /hive/data/outside/ncbi/sts.11/stsAlias.bed \
          /hive/data/genomes/hg19/bed/sts/stsMarkers_pos.rdb \
          marshfield_all > marshfield.marker.rdb
      /cluster/bin/scripts/assignGPsts -full -maxcon \
          /hive/data/outside/ncbi/sts.11/stsAlias.bed \
          /hive/data/genomes/hg19/bed/sts/stsMarkers_pos.rdb \
          genethon_all > genethon.marker.rdb
  
  # Determine the rates for each of the maps
      /cluster/bin/scripts/markers_to_recomb_rate.terry.pl decode.marker.rdb \
            /hive/data/genomes/hg19/chrom.sizes 1000000 1000000 \
                  > decode_1mb_slide_1mb
      /cluster/bin/scripts/markers_to_recomb_rate.terry.pl genethon.marker.rdb \
              /hive/data/genomes/hg19/chrom.sizes 1000000 1000000 \
                  >  genethon_1mb_slide_1mb
  # got many "... out of genetic distance order. DISCARDING" messages.
  
      /cluster/bin/scripts/markers_to_recomb_rate.terry.pl marshfield.marker.rdb \
              /hive/data/genomes/hg19/chrom.sizes 1000000 1000000 \
                > marshfield_1mb_slide_1mb
  # Got many "... out of genetic distance order. DISCARDING" messages.
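# (the two 1000000 arguments are the window size and the slide/step:
#  non-overlapping 1 Mb windows, hence the _1mb_slide_1mb file names)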
  
  # Convert files to proper format
  # which requires the "inserts" file
    # get the sizes of the real chroms:
      cd  /hive/data/genomes/hg19/bed/recombRate
      cat /hive/data/genomes/hg19/chrom.sizes | awk '$1 !~/_/ && !/^chrM/ \
      {print $1, $2}'  > chr.sizes
  
      # order contigs on each chrom:
      mkdir /hive/data/genomes/hg19/bed/recombRate/orderedCtg
  
      cat << '_EOF_' > orderCtg.pl
  #!/usr/bin/perl
  # create ordered contig lists for each chrom
  my $db = hg19;
  my $ordDir = "/hive/data/genomes/hg19/bed/recombRate/orderedCtg";
  @chroms = (1..22, X, Y);
  
  foreach $chr (@chroms)
  {
  my $chrName = "chr$chr";
  my $chrFile = "chr$chr.ol";
  my $ordFile = "$ordDir/$chrFile";
  my $sqlstmt="hgsql $db -s -e \'SELECT c.chromStart, c.contig, c.size, c.chrom, i.size FROM ctgPos c, chromInfo i where c.chrom = \"$chrName\" AND c.chrom=i.chrom order by chromStart ASC\' ";
  system(" $sqlstmt  > $ordFile");
  }
  '_EOF_'
      # << happy emacs
  
  
      chmod +x orderCtg.pl
      ./orderCtg.pl
  
      # create the inserts file
      cat << '_EOF_' > createInserts.pl
  #!/usr/local/bin/perl
  # FILE: createInserts
  # Author: Terry Furey
  # Date: 7/13/2004
  # Modified by Chin 2010-10-14
  # Description: Create inserts file used in creation of some tracks.
  # 	This used to be created when lift files being created
  
  $dir = "/hive/data/genomes/hg19";
  
  @chroms = (1..22, X, Y);
  
  print "#Chrom\tBefore_Contig\tAfter_Contig\tNum_bases\tType\n";
  $i = 0;
  # Look at certain gaps in chroms
  foreach $chr (@chroms) {
      open(AGP, "$dir/$chr/chr$chr.agp");
      while ($line = <AGP>) {
  	chomp($line);
  	@fields = split("\t", $line);
  	$lastctg = "";
  	$laststart = 1;
  	# Want centromeres, large heterochromatin, short_arms, and large gaps
  	if (($fields[6] eq "centromere") ||
  	    (($fields[6] eq "heterochromatin") && ($fields[2] - $fields[1] > 1000000)) ||
  	    ($fields[6] eq "short_arm") ||
  	    (($fields[2] - $fields[1]) > 1000000)) {
  	    # Record info about gap
  	    $chr[$i] = $fields[0];
  	    $start[$i] = $fields[1];
  	    $end[$i] = $fields[2];
  	    $size[$i] = $end[$i] - $start[$i] + 1;
  	    $type[$i] = $fields[6];
  	    # Find contigs surrounding gap
              open(ORDCTG, "$dir/bed/recombRate/orderedCtg/chr$chr.ol") || die("Could not open $dir/bed/recombRate/orderedCtg/chr$chr.ol\n") ;
  	    # short_arm gaps have no previous contig
  	    if ($type[$i] eq "short_arm") {
  		$ctg1[$i] = "-";
  		$start[$i] = 1;
		# Next record has next contig
  		$line1 = <ORDCTG>;
  		@fields1 = split("\t",$line1);
  		$ctg2[$i] = $fields1[1];
  		# Reset end and recalculate size
  		$end[$i] = $fields1[0];
  		$size[$i] = $end[$i] - $start[$i] + 1;
  	    # non-short_arm gaps
  	    } else {
  		# Find gap immediately before gap
  		while ($line1 = <ORDCTG>) {
  		    chomp($line1);
  		    @fields1 = split("\t", $line1);
  		    # This contig ends where gap begins
  		    if (($fields1[0] + $fields1[2] + 1) == $start[$i]) {
  			$ctg1[$i] = $fields1[1];
  			# Succeeding contig is in next record
  			if ($line1 = <ORDCTG>) {
  			    @fields1 = split("\t", $line1);
  			    $ctg2[$i] = $fields1[1];
  			    # Reset end coordinate and re-calculate size
  			    $end[$i] = $fields1[0];
  			    $size[$i] = $end[$i] - $start[$i] + 1;
  			} else {
  			    $ctg2[$i] = "-";
  			}
  		    # Keep track of possible previous contigs and starts
  		    } elsif (($fields1[0] + $fields1[2] + 1) < $start[$i]) {
  			$lastctg = $fields1[1];
  			$laststart = $fields1[0] + $fields1[2] + 1;
  		    # Another gap separated this gap from previous contig,
  		    # so didn't find match and passed it up
  		    } elsif (($ctg1[$i] eq "") && ($laststart > 1)) {
  			# Set start coordinate to end of last contig
  			$ctg1[$i] = $lastctg;
  			$start[$i] = $laststart;
  			# Reset end coordinate and re-calculate size
  			@fields1 = split("\t", $line1);
  			$ctg2[$i] = $fields1[1];
  			$end[$i] = $fields1[0];
  			$size[$i] = $end[$i] - $start[$i] + 1;
  		    }
  		}
  	    }
  	    close(ORDCTG);
  	    $i++;
  	}
      }
      close(AGP);	
  }
  $num = $i;
  
  # Print them out
  for ($i = 0; $i < $num; $i++) {
    # Don't print out duplicate lines for same gap (i.e. centromere and
    # heterochromatin in same gap)
      if (($chr[$i] ne $chr[$i-1]) || (($ctg1[$i] ne $ctg1[$i-1]) && ($start[$i] > $end[$i-1]))) {
  	# Large gaps must be heterochromatin
  	if ($size[$i] > 3100000) {
  	    $type[$i] = "heterochromatin";
  	}
  	# gaps at beginning must be short_arm
  	if ($start[$i] <= 1) {
  	    $type[$i] = "short_arm";
  	}
  	# Only want large heterochromatic regions, not telomeres
  	if (($type[$i] ne "heterochromatin") || ($size[$i] > 1000000)) {
  	    print "$chr[$i]\t$ctg1[$i]\t$ctg2[$i]\t$size[$i]\t$type[$i]\n";
  	}
      }
  }
  '_EOF_'
      # << happy emacs
  
      chmod +x createInserts.pl
      ./createInserts.pl > inserts
  
  # Convert files to proper format
      cat << '_EOF_' > convRecombRate.pl
  #!/usr/local/bin/perl
  # File: convRecombRate
  # Author: Terry Furey
  # Date: 9/2002
  # Modified by Chin 2010-10-14
  # Project: Human
  # Description: Changes recomb rates in large gaps to nan
  # Usage message
  if ($#ARGV != 3) {
    print stderr "USAGE: createSetBands <recomb file> <insert file> <ctg dir> <window (kb)>\n";
    exit(1);
  }
  
  # Read parameters
  $rrfile = shift(@ARGV);
  $insfile = shift(@ARGV);
  $ctgdir = shift(@ARGV);
  $basewind = shift(@ARGV);
  $window = $basewind * 1000;
  
  # Determine the golden path positions for each of the inserts
  open(INSERT, "<$insfile") || die("Could not open $insfile\n");
  $line = <INSERT>; #header
  while ($line = <INSERT>) {
    next if (substr($line, 0, 1) eq "#");
    chomp($line);
    ($chr, $first, $second, $length, $type) = split(' ',$line);
  
    $thischr = substr($chr,3);
    open(ORDCTG, "<$ctgdir/orderedCtg/chr$thischr.ol") || die("Could not open $ctgdir/orderedCtg/chr$thischr.ol\n");
  
    # Determine first window for the insert
    if ($first eq "-") {
      $begin = 0;
      $end = int(($length)/$window)*$window;
    } else {
      $found = 0;
      print stderr "Finding $chr $first\n";
      while(!$found) {
        $line = <ORDCTG>;
        chomp($line);
        ($ctgstart, $ctg, $ctglen, $ctgchr, $chrlen) = split(' ',$line);
        if ($ctg eq $first) {
  	$begin = int(($ctgstart + $ctglen)/$window)*$window;
  	$end = int(($ctgstart + $ctglen + $length)/$window)*$window + $window;
  	$found = 1;
        }
      }
    }
    close(ORDCTG);
    print stderr "$chr $begin - $end $type\n";
    for ($i = $begin; $i < $end; $i=$i+$window) {
      $gap{$chr}{$i} = 1;
    }
  }
  close(INSERT);
  
  # Now, match up with PCT
  open(RR, "<$rrfile") || die("Could not open $rrfile\n");
  while ($line = <RR>) {
    chomp($line);
    ($chr, $start, $end, $ave, $female, $male) = split("\t", $line);
    if ($gap{$chr}{$start}) {
      print "$chr\t$start\t$end\tNaN\tNaN\tNaN\n";
    } else {
      print "$line\n";
    }
  }
  close(RR);
  '_EOF_'
      # << happy emacs
  
      chmod +x convRecombRate.pl
  
  
  ./convRecombRate.pl  decode_1mb_slide_1mb inserts \
          . 1000 > decode_1mb_slide_1mb_conv
  
  ./convRecombRate.pl  marshfield_1mb_slide_1mb inserts \
          . 1000 > marshfield_1mb_slide_1mb_conv
  
  ./convRecombRate.pl  genethon_1mb_slide_1mb inserts \
          . 1000 > genethon_1mb_slide_1mb_conv
  
  # Create bed file and load
      /cluster/bin/scripts/createRRbed decode_1mb_slide_1mb_conv \
          marshfield_1mb_slide_1mb_conv genethon_1mb_slide_1mb_conv \
                  > recombRate.bed
  
      ssh hgwdev
      # reuse the recombRate.sql and recombRate.as from before
      hgLoadBed -noBin -tab \
          -sqlTable=$HOME/kent/src/hg/lib/recombRate.sql \
              hg19 recombRate recombRate.bed
  
  
  ############################################################################
  # GENE BOUNDS (RNACLUSTER) (DONE 2010-10-20 - Chin)
  # Create rnaCluster table (depends on {est,mrna}OrientInfo)
  
  cd /hive/data/genomes/hg19/bed
  mkdir rnaCluster
  cd rnaCluster/
  mkdir chrom
  
  # Create a list of accessions that come from RAGE libraries and need to
  # be excluded.
  ~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg19 rage.libs
  
      cat << '_EOF_' > runClusterRna
  #!/bin/csh -fe
  foreach f (/hive/data/genomes/hg19/nib/chr*.nib)
      set c = $f:t:r
      set out = chrom/$c.bed
    # Exclude accessions in the RAGE file
      echo clusterRna -mrnaExclude=hg19.rage.libs hg19 /dev/null $out  -chrom=$c
      clusterRna -mrnaExclude=hg19.rage.libs -verbose=2 \
      -rna=all_mrna -est=intronEst \
      hg19 /dev/null $out -chrom=$c
   end
  '_EOF_'
      # << happy emacs
      chmod +x ./runClusterRna
  
      ./runClusterRna
  
  hgLoadBed hg19 rnaCluster chrom/*.bed
  
  #############################################################################
  # dbSNP BUILD 132 (SNP132) BASIC TABLES (DONE 11/17/10 angie)
  # Initially loaded 11/15/10; I found some missing or improperly located rs_fasta
  # sequences and dbSNP re-dumped rs_fasta, so I rebuilt everything dependent on
  # rs_fasta 11/17.
      # Set up build directory
      mkdir -p /hive/data/outside/dbSNP/132/{human,shared}
  
      # Get field encodings -- if there are changes or additions to the
      # encoding of the corresponding fields, you might need to update
      # snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also
      # hg/lib/snp125Ui.c).
      cd /hive/data/outside/dbSNP/132/shared
      alias wg wget --timestamping
      set ftpShared = ftp://ftp.ncbi.nih.gov/snp/database/shared_data
      wg $ftpShared/LocTypeCode.bcp.gz
      wg $ftpShared/SnpClassCode.bcp.gz
      wg $ftpShared/SnpFunctionCode.bcp.gz
      wg $ftpShared/SnpValidationCode.bcp.gz
      wg $ftpShared/Allele.bcp.gz
  
      ########################## DOWNLOAD #############################
      cd /hive/data/outside/dbSNP/132/human
      mkdir data schema rs_fasta
      # Get data from NCBI (anonymous FTP)
      set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database
      wg ftp://ftp.ncbi.nih.gov/snp/00readme.txt
      cd /hive/data/outside/dbSNP/132/human/data
      # ContigLoc table has coords, orientation, loc_type, and refNCBI allele
      wg $ftpSnpDb/organism_data/b132_SNPContigLoc_37_1.bcp.gz
      # ContigLocusId table has functional annotations
      wg $ftpSnpDb/organism_data/b132_SNPContigLocusId_37_1.bcp.gz
      wg $ftpSnpDb/organism_data/b132_ContigInfo_37_1.bcp.gz
      # MapInfo has alignment weights
      wg $ftpSnpDb/organism_data/b132_SNPMapInfo_37_1.bcp.gz
      # SNP has univar_id, validation status and heterozygosity
      wg $ftpSnpDb/organism_data/SNP.bcp.gz
      # New info as of 132: allele freq, 'clinical' bit, SNP submitter handles
      wg $ftpSnpDb/organism_data/SNPAlleleFreq.bcp.gz
      wg $ftpSnpDb/organism_data/SNP_bitfield.bcp.gz
      wg $ftpSnpDb/organism_data/Batch.bcp.gz
      wg $ftpSnpDb/organism_data/SubSNP.bcp.gz
      wg $ftpSnpDb/organism_data/SNPSubSNPLink.bcp.gz
  
      # Get schema
      cd /hive/data/outside/dbSNP/132/human/schema
      wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz
      wg $ftpSnpDb/shared_schema/dbSNP_main_table.sql.gz
  
      # Get fasta files
      # using headers of fasta files for molType, class, observed
      cd /hive/data/outside/dbSNP/132/human/rs_fasta
      # Re-downloaded 11/17/10 (dbSNP re-dump):
      wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz
  
      ########################## LOAD NCBI TABLES #############################
      # Simplify names of data files -- strip version & extras to get
      # local canonical table names.
      cd /hive/data/outside/dbSNP/132/human/data
      foreach f (*.bcp.gz)
        set new = `echo $f \
                   | sed -e 's/^b132_SNP//; s/^b132_//; s/_37_1//; s/.bcp//;'`
        mv $f $new
        echo $new
      end
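    # For example, the sed expression above renames:
    #   b132_SNPContigLoc_37_1.bcp.gz  -> ContigLoc.gz
    #   b132_ContigInfo_37_1.bcp.gz    -> ContigInfo.gz
    #   SNP.bcp.gz                     -> SNP.gz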
  
      cd /hive/data/outside/dbSNP/132/human/schema
      zcat human_9606_table.sql.gz \
      | perl -we '$/ = "\nGO\n\n\n"; \
          while (<>) { \
            next unless /^CREATE TABLE \[(b132_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP|SNPAlleleFreq)(_37_1)?\]/; \
            s/b132_(SNP)?//; s/_37_1//; \
            s/[\[\]]//g;  s/GO\n\n/;/;  s/smalldatetime/datetime/g; \
            s/ON PRIMARY//g;  s/COLLATE//g;  s/Latin1_General_BIN//g; \
          s/IDENTITY \(1, 1\) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
            s/nvarchar/varchar/g;  s/set quoted/--set quoted/g; \
            s/(image|varchar\s+\(\d+\))/BLOB/g; \
            print; \
          }' \
        > table.sql
      zcat dbSNP_main_table.sql.gz \
      | sed -re 's/\r//g;' \
      | perl -we '$/ = "\nGO\n\n\n";  \
          while (<>) { \
            next unless /^CREATE TABLE \[Allele\]/; \
            s/[\[\]]//g;  s/GO\n\n\n/;\n/;  s/smalldatetime/datetime/g; \
            print; \
          }' \
        >> table.sql
  
      # load on hgwdev
      hgsql -e 'create database hg19snp132'
      cd /hive/data/outside/dbSNP/132/human/schema
      hgsql hg19snp132 < table.sql
      cd ../data
  
      # Avoid wasting space by excluding mappings to non-reference contigs (ContigInfo.group_label):
      zcat ContigInfo.gz | cut -f 12 | uniq | sort -u
  #CRA_TCAGchr7v2
  #Celera
  #GRCh37
  #HuRef
      foreach t (ContigInfo MapInfo ContigLocusId)
        zcat $t.gz \
        | egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \
        | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
        | hgLoadSqlTab -oldTable hg19snp132 $t placeholder stdin
      end
  
      # Compare contig list between liftContigs.lft and reference contigs in ContigInfo.
      cut -f 2 /hive/data/genomes/hg19/jkStuff/liftContigs.lft | sort > /data/tmp/1
      # (HuRef, Celera, CRA_TCAGchr7v2 grepped out above)
      hgsql hg19snp132 -N -B -e 'select contig_acc from ContigInfo;' | sort > /data/tmp/2
      diff /data/tmp/[12]
  #1c1
  #< NC_001807
  #---
  #> NC_012920
      # darn mitochondria version oops.  Use NC_012920ToChrM.over.chain generated for snp131.
  
      # Make sure there are no orient != 0 contigs among those selected.
      hgsql hg19snp132 -NBe \
        'select count(*) from ContigInfo where orient != 0;'
  #0
  
      # ContigLoc is huge, and we want just the reference contig mappings.
      # Keep lines only if they have a word match to some reference contig ID.
      # That probably will allow some false positives from coord matches,
      # but we can clean those up afterward.
    zcat ContigInfo.gz | grep -w GRCh37 | cut -f 1 | sort -n > GRCh37ContigInfo.ctg_id.txt
      wc -l GRCh37ContigInfo.ctg_id.txt
  #259 GRCh37ContigInfo.ctg_id.txt
      zcat ContigLoc.gz \
      | grep -Fwf GRCh37ContigInfo.ctg_id.txt \
      | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
      | hgLoadSqlTab -oldTable hg19snp132 ContigLoc placeholder stdin
  #Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 1
  #Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 2
  #Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 3
  #Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 4
  #...
  #load of ContigLoc did not go as planned: 34610577 record(s), 0 row(s) skipped, 4143419 warning(s) loading /dev/stdin
      # Get rid of those false positives (crazy slow, create indices first next time):
      hgsql hg19snp132 -e 'create table ContigLocFix select cl.* from ContigLoc as cl, ContigInfo as ci where cl.ctg_id = ci.ctg_id;'
      hgsql hg19snp132 -e 'drop table ContigLoc; \
                           rename table ContigLocFix to ContigLoc;'
  
      zcat SNP.gz \
      | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
      | hgLoadSqlTab -oldTable hg19snp132 SNP placeholder stdin
  #Warning 1366 Incorrect integer value: '' for column 'CpG_code' at row 1
  #Warning 1366 Incorrect integer value: '' for column 'map_property' at row 1
  #Warning 1366 Incorrect integer value: '' for column 'CpG_code' at row 2
  #Warning 1366 Incorrect integer value: '' for column 'map_property' at row 2
  #Warning 1265 Data truncated for column 'avg_heterozygosity' at row 3
  #Warning 1265 Data truncated for column 'het_se' at row 3
  #...
  #load of SNP did not go as planned: 30443714 record(s), 0 row(s) skipped, 21366755 warning(s) loading /dev/stdin
      # ... no big deal.
      foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
       echo -n "${t}:\t"
        hgsql -N -B hg19snp132 -e 'select count(*) from '$t
      end
  #ContigInfo:     259
  #ContigLoc:      34610479
  #ContigLocusId:  19829016
  #MapInfo:        30399417
  #SNP:    	 30443714
  
      #################### EXTRACT INFO FROM NCBI TABLES ####################
      # Glom each SNP's function codes together and load up a new hg19Snp132 table.
      # Also extract NCBI's annotations of coding SNPs' effects on translation.
      # We extract ContigLocusId info only for reference assembly mapping.
      # Some SNP's functional annotations are for an alternate assembly, so we will
      # have no NCBI functional annotations to display for those (but our own are
      # available).
      cd /hive/data/outside/dbSNP/132/human
      # Add indices to tables for a big join (5 or 6 minutes):
      hgsql hg19snp132 -e \
        'alter table ContigInfo add index (ctg_id); \
         alter table ContigLocusId add index (ctg_id);'
      hgsql hg19snp132 -NBe 'select snp_id, ci.contig_acc, asn_from, asn_to, mrna_acc, \
                             fxn_class, reading_frame, allele, residue, codon, cli.ctg_id \
                             from ContigLocusId as cli, ContigInfo as ci \
                             where cli.ctg_id = ci.ctg_id;' \
        > ncbiFuncAnnotations.txt
      wc -l ncbiFuncAnnotations.txt
  #19828052 ncbiFuncAnnotations.txt
      # Ignore function code 8 (cds-reference, just means that some allele matches reference)
      # and glom functions for each SNP id:
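    # For example, if one mapped position of a SNP has fxn_class rows 3, 8 and 42,
    # they collapse to a single line with fxn_class "3,42," (code 8 dropped);
    # positions annotated only with code 8 produce no output line.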
      cut -f 1-4,6,11 ncbiFuncAnnotations.txt \
      | sort -u -k1n,1n -k6n,6n -k3n,3n -k5n,5n \
      | perl -we 'while (<>) { chomp; \
                    ($id, undef, $s, $e, $f, $c) = split; \
                    if (defined $prevId && $id == $prevId && $c == $prevC && $s == $prevS) { \
                      $prevFunc .= "$f," unless ($f == 8); \
                    } else { \
                      if (defined $prevId) { \
                        print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n" if ($prevFunc); \
                      } \
                      $prevFunc = ($f == 8) ? "" : "$f,"; \
                    } \
                    ($prevId, $prevC, $prevS, $prevE) = ($id, $c, $s, $e); \
                  } \
                  print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n" if ($prevFunc);' \
        > ucscFunc.txt
      wc -l ucscFunc.txt
  #11673179 ucscFunc.txt
      cat > ucscFunc.sql <<EOF
  CREATE TABLE ucscFunc (
          snp_id int NOT NULL ,
          ctg_id int(10) NOT NULL ,
          asn_from int(10) NOT NULL ,
          asn_to int(10) NOT NULL ,
          fxn_class varchar(255) NOT NULL ,
          INDEX snp_id (snp_id),
          INDEX ctg_id (ctg_id)
  );
  EOF
      hgLoadSqlTab hg19snp132 ucscFunc{,.sql,.txt}
      # ucscFunc coords are NCBI's 0-based, fully-closed, 2-base-wide insertions.
      # We need to leave the coords alone here so ucscFunc can be joined below.
      # Make a list of SNPs with func anno's that are insertion SNPs, so we can use
      # the list to determine what type of coord fix to apply to each annotation
      # when making snp130CodingDbSnp below.
      hgsql hg19snp132 -NBe \
        'select ci.contig_acc, cl.asn_from, cl.asn_to, uf.snp_id \
         from ucscFunc as uf, ContigLoc as cl, ContigInfo as ci \
         where uf.snp_id = cl.snp_id and \
               uf.ctg_id = cl.ctg_id and uf.asn_from = cl.asn_from and uf.asn_to = cl.asn_to and \
               cl.loc_type = 3 and \
               cl.ctg_id = ci.ctg_id' \
        > ncbiFuncInsertions.ctg.bed
      wc -l ncbiFuncInsertions.ctg.bed
  #1099530 ncbiFuncInsertions.ctg.bed
  
    # Extract observed allele, molType and snp class from the FASTA headers (the ">gnl" lines)
      mkdir rs_fasta/rejects
      mv rs_fasta/rs_ch{AltOnly,NotOn}.fas.gz rs_fasta/rejects/
      zcat rs_fasta/rs_ch*.fas.gz \
      | grep '^>gnl' \
      | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
      | sort -nu \
        > ucscGnl.txt
  #547.545u 92.641s 6:08.04 173.9% 0+0k 0+0io 0pf+0w
      wc -l ucscGnl.txt
  #30144822 ucscGnl.txt
# weird -- it shrank from the original 30152349
      cut -f 1 ucscGnl.txt | uniq | wc -l
  #30144822
      cat > ucscGnl.sql <<EOF
  CREATE TABLE ucscGnl (
          snp_id int NOT NULL ,
          observed varchar(255) NOT NULL,
          molType varchar(255) NOT NULL,
          class varchar(255) NULL ,
          INDEX snp_id (snp_id)
  );
  EOF
      hgLoadSqlTab hg19snp132 ucscGnl{,.sql,.txt}
  
      # Add indices to tables for a big join:
      hgsql hg19snp132 -e \
        'alter table ContigLoc  add index (ctg_id); \
         alter table SNP        add index (snp_id); \
         alter table MapInfo    add index (snp_id);'
  
  #TODO: add SNPAlleleFreq to this query and snpNcbiToUcsc (bitfield/"clinical" too)
  
      # Big leftie join to bring together all of the columns that we want in snp132,
      # using all of the available joining info:
      hgsql hg19snp132 -NBe \
       'SELECT ci.contig_acc, cl.asn_from, cl.asn_to, cl.snp_id, cl.orientation, cl.allele, \
               ug.observed, ug.molType, ug.class, \
               s.validation_status, s.avg_heterozygosity, s.het_se, \
               uf.fxn_class, cl.loc_type, mi.weight, cl.phys_pos_from \
        FROM \
        ((((ContigLoc as cl JOIN ContigInfo as ci \
                 ON cl.ctg_id = ci.ctg_id) \
            LEFT JOIN MapInfo as mi ON mi.snp_id = cl.snp_id and mi.assembly = ci.group_label) \
           LEFT JOIN SNP as s ON s.snp_id = cl.snp_id) \
          LEFT JOIN ucscGnl as ug ON ug.snp_id = cl.snp_id) \
         LEFT JOIN ucscFunc as uf ON uf.snp_id = cl.snp_id and uf.ctg_id = cl.ctg_id \
                                  and uf.asn_from = cl.asn_from;' \
        > ucscNcbiSnp.ctg.bed
  #73.036u 12.668s 25:32.42 5.5%   0+0k 0+0io 0pf+0w
      wc -l ucscNcbiSnp.ctg.bed
  #34610479 ucscNcbiSnp.ctg.bed
  
      # Use liftUp for everything except mito, then liftOver for mito.
      # There are some weird cases of length=1 but locType=range... in all the cases
      # that I checked, the length really seems to be 1 so I'm not sure where they got
      # the locType=range.  Tweak locType in those cases so we can keep those SNPs:
      grep -vw ^NC_012920 ucscNcbiSnp.ctg.bed \
      | awk -F"\t" 'BEGIN{OFS="\t";} \
             $2 == $3 && $14 == 1 {$14=2; \
                                   if (numTweaked < 10) {print $4 > "/dev/stderr";} \
                                   numTweaked++;}  {print;} \
             END{print numTweaked, "single-base, locType=range, tweaked locType" > "/dev/stderr";}' \
      | liftUp ucscNcbiSnp.bed \
        /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
  #TODO: examine these again, report to dbSNP:
  #118203330
  #118203339
  #118203340
  #118203367
  #118203389
  #118203401
  #118203405
  #118203425
  #118203428
  #118203433
  #588     single-base, locType=range, tweaked locType
  #217.470u 28.776s 2:54.46 141.1% 0+0k 0+0io 0pf+0w
    # For liftOver, convert 0-based fully-closed to 0-based half-open because liftOver
      # doesn't deal with 0-base items.  Fake out phys_pos_from to 0 because many coords
      # will differ, oh well.
      grep -w NC_012920 ucscNcbiSnp.ctg.bed \
      | awk -F"\t" 'BEGIN{OFS="\t";} {$3 += 1; $16 = 0; print;}' \
      | liftOver -bedPlus=3 stdin \
          /hive/data/outside/dbSNP/131/human/NC_012920ToChrM.over.chain stdout chrM.unmapped \
      | awk -F"\t" 'BEGIN{OFS="\t";} {$3 -= 1; print;}' \
      | sort -k2n,2n \
        > chrMNcbiSnp.bed
  #2.827u 1.576s 0:48.15 9.1%      0+0k 0+0io 0pf+0w
  #2.805u 1.392s 0:51.90 8.0%      0+0k 0+0io 7pf+0w
      cat chrM.unmapped
    # Good, got all but 3 SNPs (rs28693675, rs55749223 and rs112781979, partially deleted/deleted)
      cat chrMNcbiSnp.bed >> ucscNcbiSnp.bed
      wc -l ucscNcbiSnp.bed
  #34610476 ucscNcbiSnp.bed
  
      # Translate NCBI's encoding into UCSC's, and perform a bunch of checks.
      cd /hive/data/outside/dbSNP/132/human/
    # Updated snpNcbiToUcsc for new MAX_SNPID (80M -> 120M),
      # new named alleles oddball formats: CHLC.GGAA2D04, GDB:190880, SHGC-35515, =D22S272
      # new MAX_SNPSIZE (1k -> 16k)
      snpNcbiToUcsc ucscNcbiSnp.bed /hive/data/genomes/hg19/hg19.2bit snp132
  #spaces stripped from observed:
  #chr12   6093134 6093134 rs41402545
  #count of snps with weight  0 = 69071
  #count of snps with weight  1 = 29465124
  #count of snps with weight  2 = 523595
  #count of snps with weight  3 = 3037908
  #count of snps with weight 10 = 1514756
  #Skipped 976 snp mappings due to errors -- see snp132Errors.bed
  #214.507u 7.542s 4:38.56 79.7%   0+0k 0+0io 0pf+0w
      head snp132Errors.bed
  #chr1    11082586        11082587        rs80356737      Unexpected refNCBI "AT" for locType "between" (3) -- expected "-"
  #chr1    11082586        11082587        rs80356737      rs80356737 is 1 bases long but refNCBI is different length: AT
  #chr1    43392806        43392807        rs80359840      Unexpected refNCBI "CA" for locType "between" (3) -- expected "-"
  #chr1    43392806        43392807        rs80359840      rs80359840 is 1 bases long but refNCBI is different length: CA
  #chr1    43395420        43395421        rs80359834      Unexpected refNCBI "AC" for locType "between" (3) -- expected "-"
  #chr1    43395420        43395421        rs80359834      rs80359834 is 1 bases long but refNCBI is different length: AC
  #chr1    43395659        43395660        rs80359833      Unexpected refNCBI "AG" for locType "between" (3) -- expected "-"
  #chr1    43395659        43395660        rs80359833      rs80359833 is 1 bases long but refNCBI is different length: AG
  #chr1    43396801        43396802        rs80359831      Unexpected refNCBI "GC" for locType "between" (3) -- expected "-"
  #chr1    43396801        43396802        rs80359831      rs80359831 is 1 bases long but refNCBI is different length: GC
      wc -l snp*
  #  33026121 snp132.bed
  #        22 snp132.sql
  #       976 snp132Errors.bed
  #        18 snp132ExceptionDesc.tab
  #   4945948 snp132Exceptions.bed
      # 7M new snps, not a big increase in exceptions (snp131 had 4281351)
  
      # Make one big fasta file.
    # It's a monster: 18G!  Can we split by hashing rsId?  (see sketch below)
      zcat rs_fasta/rs_ch*.fas.gz \
      | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
        > snp132.fa
  #611.912u 114.850s 7:40.41 157.8%        0+0k 0+0io 0pf+0w
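    # (Sketch only, not run for this build: one way to split into N smaller
    # fasta files by hashing the rs number; assumes the ">gnl|dbSNP|rsNNN ..."
    # header format seen above, and the snp132.NN.fa names are made up for
    # illustration.)
    zcat rs_fasta/rs_ch*.fas.gz \
    | awk -v N=16 'BEGIN { out = sprintf("snp132.%02d.fa", 0); } \
          /^>/ { match($0, /rs[0-9]+/); \
                 out = sprintf("snp132.%02d.fa", \
                               (substr($0, RSTART+2, RLENGTH-2) + 0) % N); } \
          { print >> out; }'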
      # Check for duplicates.
      grep ^\>rs snp132.fa | sort > /data/tmp/seqHeaders
      wc -l /data/tmp/seqHeaders
  #30144822 /data/tmp/seqHeaders
      uniq /data/tmp/seqHeaders | wc -l
  #30144822
      # Use hgLoadSeq to generate .tab output for sequence file offsets,
      # and keep only the columns that we need: acc and file_offset.
      # Index it and translate to snpSeq table format.
      hgLoadSeq -test placeholder snp132.fa
  #30144822 sequences
  #52.698u 14.570s 9:18.88 12.0%   0+0k 0+0io 0pf+0w
      cut -f 2,6 seq.tab > snp132Seq.tab
      rm seq.tab
  
      # Load up main track tables.
      cd /hive/data/outside/dbSNP/132/human
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
        hg19 snp132 -sqlTable=snp132.sql snp132.bed
  #Loaded 33026121 elements of size 17
  #124.626u 10.983s 8:20.73 27.0%  0+0k 0+0io 0pf+0w
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
        hg19 snp132Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
        snp132Exceptions.bed
  #Loaded 4945948 elements of size 5
  #14.816u 0.930s 0:52.42 30.0%    0+0k 0+0io 0pf+0w
      hgLoadSqlTab hg19 snp132ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
        snp132ExceptionDesc.tab
      # Load up sequences.
      mkdir -p /gbdb/hg19/snp
      ln -s /hive/data/outside/dbSNP/132/human/snp132.fa /gbdb/hg19/snp/snp132.fa
      hgLoadSqlTab hg19 snp132Seq ~/kent/src/hg/lib/snpSeq.sql snp132Seq.tab
  
      # Put in a link where one would expect to find the track build dir...
      ln -s /hive/data/outside/dbSNP/132/human /hive/data/genomes/hg19/bed/snp132
  
  #*** TODO: ask cluster-admin to pack the snp132 table (or whatever tables we'll push)
  
      # Look at the breakdown of exception categories:
      cd /hive/data/outside/dbSNP/132/human
      cut -f 5 snp132Exceptions.bed | sort | uniq -c | sort -nr
  #3644435 MultipleAlignments
  # 964493 ObservedMismatch
  #  90035 SingleClassTriAllelic
  #  77552 SingleClassZeroSpan
  #  43631 ObservedTooLong
  #  33650 MixedObserved
  #  26701 FlankMismatchGenomeShorter
  #  25574 SingleClassLongerSpan
  #  12222 RefAlleleMismatch
  #  11525 DuplicateObserved
  #   8340 SingleClassQuadAllelic
  #   4463 NamedDeletionZeroSpan
  #   2052 FlankMismatchGenomeLonger
  #    806 ObservedContainsIupac
  #    317 NamedInsertionNonzeroSpan
  #    150 FlankMismatchGenomeEqual
  #      1 RefAlleleRevComp
  #      1 ObservedWrongFormat
  #TODO: Sent a few bug reports to dbSNP
  
  
  ############################################################################
  # SNP132 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 11/22/10 angie)
      mkdir /hive/data/genomes/hg19/bed/snp132Ortho
      cd /hive/data/genomes/hg19/bed/snp132Ortho
  
      # Following Heather's lead in snp126orthos, filter SNPs to to keep
      # only those with class=single, length=1, chrom!~random;
      # Exclude those with exceptions MultipleAlignments,
      # SingleClassTriAllelic or SingleClassQuadAllelic.
      awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
        /hive/data/outside/dbSNP/132/human/snp132Exceptions.bed \
      | sort -u \
        > snp132ExcludeIds.txt
      awk '$3-$2 == 1 && $1 !~ /_random/ && $1 !~ /^chrUn/ && $11 == "single" {print;}' \
        /hive/data/outside/dbSNP/132/human/snp132.bed \
      | grep -vFwf snp132ExcludeIds.txt \
        > snp132Simple.bed
  #264.984u 13.702s 3:57.29 117.4% 0+0k 0+0io 0pf+0w
      wc -l snp132Simple.bed
  #23908516 snp132Simple.bed
  
      # Glom all human info that we need for the final table onto the
      # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
      awk 'BEGIN{OFS="\t";} \
          {print $1, $2, $3, \
                 $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
                 0, $6;}' \
        snp132Simple.bed > snp132ForLiftOver.bed
  #62.518u 2.141s 1:09.79 92.6%    0+0k 0+0io 0pf+0w
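    # e.g. a glommed name field now looks like:  rs12345|chr1|10000|10001|A/G|A|+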
  
      # Map coords to chimp using liftOver.
      mkdir run.liftOChimp
      cd run.liftOChimp
      mkdir split out
  #*** NOTE FOR NEXT TIME: make this 10000 not 50000:
      splitFile ../snp132ForLiftOver.bed 50000 split/chunk
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro2.over.chain.gz \
          \{check out exists out/panTro2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      ssh swarm
      cd /hive/data/genomes/hg19/bed/snp132Ortho/run.liftOChimp
      para make jobList
  #Completed: 479 of 479 jobs
  #CPU time in finished jobs:     168182s    2803.03m    46.72h    1.95d  0.005 y
  #IO & Wait Time:                 12873s     214.55m     3.58h    0.15d  0.000 y
  #Average job time:                 378s       6.30m     0.10h    0.00d
  #Longest finished job:            1152s      19.20m     0.32h    0.01d
  #Submission to last job:          1165s      19.42m     0.32h    0.01d
  
      # Map coords to orangutan using liftOver.
      mkdir ../run.liftOPon
      cd ../run.liftOPon
      mkdir out
      ln -s ../run.liftOChimp/split .
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
          \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      para make jobList
  #Completed: 479 of 479 jobs
  #CPU time in finished jobs:     413536s    6892.27m   114.87h    4.79d  0.013 y
  #IO & Wait Time:                 29174s     486.23m     8.10h    0.34d  0.001 y
  #Average job time:                 924s      15.40m     0.26h    0.01d
  #Longest finished job:            2299s      38.32m     0.64h    0.03d
  #Submission to last job:          2309s      38.48m     0.64h    0.03d
  
      # Map coords to macaque using liftOver.
      mkdir ../run.liftOMac
      cd ../run.liftOMac
      mkdir out
      ln -s ../run.liftOChimp/split .
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
          \{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      para make jobList
  #Completed: 479 of 479 jobs
  #CPU time in finished jobs:     463852s    7730.87m   128.85h    5.37d  0.015 y
  #IO & Wait Time:                 32857s     547.62m     9.13h    0.38d  0.001 y
  #Average job time:                1037s      17.28m     0.29h    0.01d
  #Longest finished job:            2354s      39.23m     0.65h    0.03d
  #Submission to last job:          2444s      40.73m     0.68h    0.03d
  
      cd /hive/data/genomes/hg19/bed/snp132Ortho
      # Concatenate the chimp results, sorting by chimp pos in order to
      # efficiently access 2bit sequence in getOrthoSeq.  The output of
      # that is then sorted by the glommed human info field, so that we
      # can use join to combine chimp and macaque results in the next step.
      # Ditto for macaque and orangutan.  Each command pipe takes ~6 minutes:
      sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
      | sort > panTro2.orthoGlom.txt
      sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
      | sort > ponAbe2.orthoGlom.txt
      sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
      | sort > rheMac2.orthoGlom.txt
      wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
  #22382184 panTro2.orthoGlom.txt
  #21280289 ponAbe2.orthoGlom.txt
  #19249238 rheMac2.orthoGlom.txt
  
      # Use the glommed name field as a key to join up chimp and macaque
      # allele data.  Include glommed name from both files because if only
      # file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
      # in the orthoGlom fields from each file, which are in the same order
      # as the chimp and macaque columns of snp132OrthoPanTro2RheMac2.
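    # (Illustrative: with "-a 1 -a 2 -e '?'", every output field coming from
    # the file that lacks the key is filled in as "?", so the awk below simply
    # keeps whichever of the two glommed-name columns is not "?".)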
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \
      | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
              else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
        > tmp.txt
  #113.229u 24.575s 1:26.02 160.1% 0+0k 0+0io 0pf+0w
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        tmp.txt rheMac2.orthoGlom.txt \
      | perl -wpe 'chomp; \
          ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
          $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
          ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
            split(/\|/, $glomKey); \
          $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
          $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
          print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                           $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
          s/^.*$//;' \
      | sort -k1,1 -k2n,2n > snp132OrthoPt2Pa2Rm2.bed
  #516.332u 101.651s 7:59.63 128.8%        0+0k 0+0io 0pf+0w
      wc -l snp132OrthoPt2Pa2Rm2.bed
  #23235355 snp132OrthoPt2Pa2Rm2.bed
  
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
        -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
        hg19 snp132OrthoPt2Pa2Rm2 snp132OrthoPt2Pa2Rm2.bed
  #Loaded 23235355 elements of size 22
  #87.826u 8.471s 8:22.46 19.1%    0+0k 0+0io 0pf+0w
  
      # Cleanup:
      rm -r run*/split tmp.txt *.orthoGlom.txt bed.tab
      nice gzip snp132Simple.bed snp132ExcludeIds.txt snp132ForLiftOver.bed &
  
  
  ############################################################################
  # DBSNP CODING ANNOTATIONS (132) (DONE 11/17/10 angie)
  # These annotations are not restricted to the ones that we display,
  # so it wasn't necessary to rebuild this after rebuilding snp132 to
  # include SNPs missing from the first rs_fasta dump.
      cd /hive/data/outside/dbSNP/132/human
      # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed.
      # For anything except an insertion (0 bases between flanks),
      # we need to add 1 to the end coord.  For an insertion, we need
      # to add 1 to the start coord.  Make a hash of the insertion IDs,
      # then look up each ID in ncbiFuncAnnotations.txt to tell which
      # transform to apply.
      # Note: sort -u with the keys below is too restrictive -- we need full line uniq.
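    # Worked example (hypothetical coords): a range annotation with NCBI
    # start=100, end=102 (0-based, fully-closed) becomes start=100, end=103
    # (half-open); an insertion annotated on its 2 flanking bases, start=100,
    # end=101, becomes the zero-length insertion point start=101, end=101.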
      perl -we 'open($IDS, "ncbiFuncInsertions.ctg.bed") || die "ids: $!"; \
                while (<$IDS>) { chomp; $ids{$_} = 1; } \
                close($IDS); \
                %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 44=>1, 45=>1); \
                while (<>) { \
                  chomp;  @w = split("\t"); # id, ctg, start, end, ... \
                  next unless $coding{$w[5]}; \
                  $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \
                  if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \
                    $w[2]++; # 2-base insertions: increment start coord \
                  } else { \
                    $w[3]++; # increment end coord to get half-open \
                  } \
                  print join("\t", @w) . "\n"; \
                }' ncbiFuncAnnotations.txt \
      | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \
      | uniq \
        > ncbiCodingAnnotations.txt
      wc -l ncbiCodingAnnotations.txt
  #1015611 ncbiCodingAnnotations.txt
      # How many & what kinds of function types?
      cut -f 6 ncbiCodingAnnotations.txt \
      | sort -n | uniq -c
  # 179089 3   (coding-synon)
  # 493143 8   (cds-reference -- ignored)
  #  10575 41  (nonsense)
  # 272848 42  (missense)
  #  57934 44  (frameshift)
  #   2022 45  (cds-indel)
      # Gather up multiple annotation lines into one line per {snp, gene, frame}:
      perl -e  'while (<>) { chomp; \
                  my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \
                  if (defined $lastRs && \
                      ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \
                       $lastTx ne $txId || $lastFrm ne $frm)) { \
                    if (defined $refRow) { \
                      $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                      $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
                    } \
                    print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                          "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
                    $refRow = undef;  @rows = ();  ($count, $fxns, $nts, $codons, $aas) = (); \
                  } \
                  ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \
                      ($rsId, $ctg, $s, $e, $txId, $frm); \
                  $count++; \
                  if ($fxn == 8) { \
                    $refRow = [$fxn, $nt, $aa, $codon]; \
                  } else { \
                   $fxns .= "$fxn,";  $nts .= "$nt,";  $aas .= "$aa,";  $codons .= "$codon,"; \
                  } \
                } \
                if (defined $refRow) { \
                  $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                  $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
                } \
                print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                      "$count\t$fxns\t$nts\t$codons\t$aas\n";' \
        ncbiCodingAnnotations.txt \
      | liftUp snp132CodingDbSnp.bed /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
      hgLoadBed hg19 snp132CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
        -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
        snp132CodingDbSnp.bed
  #Loaded 492412 elements of size 11
  
  
  ############################################################################
  # SNPMASKED SEQUENCE FOR SNP132 (DONE 11/22/10 angie)
      mkdir /hive/data/genomes/hg19/snp132Mask
      cd /hive/data/genomes/hg19/snp132Mask
  
      # Identify rsIds with various problems -- we will exclude those.
      awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \
        /hive/data/outside/dbSNP/132/human/snp132Exceptions.bed \
        | sort -u \
        > snp132ExcludeRsIds.txt
      grep -vFwf snp132ExcludeRsIds.txt \
        /hive/data/outside/dbSNP/132/human/snp132.bed \
        > snp132Cleaned.bed
  #262.134u 4.612s 4:39.15 95.5%   0+0k 0+0io 0pf+0w
  
      # Substitutions:
      mkdir substitutions
      snpMaskSingle snp132Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout \
      | faSplit byname stdin substitutions/
  #Masked 23848747 snps in 23846674 out of 3134643623 genomic bases
      # 2,361 warnings about differing observed strings at same base position --
      # saved as diffObserved.txt.
  #/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3134643623 (difference is 2517641)
  #53.892u 15.183s 3:24.90 33.7%   0+0k 0+0io 0pf+0w
      # Check that 2517641 is the total #bases in sequences with nothing in snp132Cleaned:
      grep -Fw single snp132Cleaned.bed | cut -f 1 | uniq > /data/tmp/1
      grep -vwf /data/tmp/1 ../chrom.sizes
      grep -vwf /data/tmp/1 ../chrom.sizes \
      | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
  #2517641
      # Make sure that sizes are identical, first diffs are normal -> IUPAC,
      # and first diffs' case is preserved:
      foreach f (substitutions/chr*.fa)
        faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ"
      end
  #chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 10326 (y != t)
  #chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 61004 (r != a)
  #...
  #(output OK -- ambiguous bases replacing [agct] at SNP positions)
      foreach f (substitutions/chr*.fa)
        echo $f:t:r
        mv $f $f:r.subst.fa
        gzip $f:r.subst.fa
      end
  
      # Insertions & deletions not done.  To date we have only offered substs for download.
      # If there is user demand, use template from snp131 above.
  
      # Clean up and prepare for download:
      gzip snp132Cleaned.bed &
      foreach d (substitutions)
        pushd $d
          md5sum *.gz > md5sum.txt
          cp /hive/data/genomes/hg19/snp131Mask/$d/README.txt .
        popd
      end
      # Edit the README.txt.
  
      # Create download links on hgwdev.
      mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp132Mask
      ln -s /hive/data/genomes/hg19/snp132Mask/substitutions/* \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp132Mask/
  
  #############################################################################
#CREATE MICROSAT TRACK (DONE 2010-11-13 - Chin)
      ssh hgwdev
      mkdir /cluster/data/hg19/bed/microsat
      cd /cluster/data/hg19/bed/microsat
  
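    # (simpleRepeat.bed fields used here: $5=period, $6=copyNum, $8=perMatch,
    # $9=perIndel, $16=repeat sequence -- i.e. keep perfect, indel-free di- and
    # tri-nucleotide repeats with at least 15 copies, labeled e.g. "15xAC")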
      awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
         ../simpleRepeat/simpleRepeat.bed > microsat.bed
  
      hgLoadBed hg19 microsat microsat.bed
  
  #############################################################################
# Add Human RNA-editing track hg19 (Done, galt, 7/12/2010)
  
  # DARNED=DAtabase of RNa EDiting
  #http://darned.ucc.ie/
  #University College Cork
  
  mkdir -p /hive/data/genomes/hg19/bed/darned
  cd /hive/data/genomes/hg19/bed/darned
  # create go.csh to download and compose allChroms.bed
  ./go.csh
  hgLoadBed hg19 darned allChroms.bed
# at the human (trackDb) level,
  # added darned.html
  # added trackDb.ra entry
  
  # Bug #6417 duplicate records (6) in Human RNA editing (DARNED) track
  # (DONE 2012-12-22 Chin)
      cat allChroms.bed | sort  > allSort.bed
    cat allChroms.bed | sort -u > allUniq.bed
      wc -l *.bed
      #  42045 allChroms.bed
      #  42045 allSort.bed
      #  42039 allUniq.bed
      hgLoadBed hg19 darned allUniq.bed
      # Loaded 42039 elements of size 9
  
  #############################################################################
  # lsSnpPdb: import of LS-SNP/PDB data for SNP 131 (2010-12-03 markd)
    # download from JHU
      ssh genbank
      sudo su - genbank
      cd /cluster/data/genbank
      ./bin/lsSnpPdbDownloadStep hg19
      # load into hgwdev database
      ssh hgwdev
      cd /cluster/data/genbank
      ./bin/lsSnpPdbDbLoadStep hg19
      # once this has been QAed, will auto-update from genbank scripts
  
  #############################################################################
  # NEW SNP132 (DONE 3/8/11 angie)
  # 3/8/11: Re-ran snpNcbiToUcsc & reloaded to not count PAR SNPs as multiply mapped
  # Reloaded 1/24/11 to get rid of a couple exceptions that were derived from dbSNP bitfields
  # Previously loaded 1/5/11
      # New table type snp132Ext, with same columns as snp125 plus exceptions,
      # allele freqs, and submitter handles, using new script doDbSnp.pl.
      mkdir -p /hive/data/outside/dbSNP/132/human
      cd /hive/data/outside/dbSNP/132/human
      # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
      # to find the subdir name to use as orgDir below (human_9606 in this case).
      # Then click into that directory and look for file names like
      #    b(1[0-9][0-9])_*_([0-9]+_[0-9])
      # -- use the first num for build and the second num_num for buildAssembly.
      # jkStuff/liftContigs.lft maps NCBI contig names to chroms; use that for liftUp.
      cat > config.ra <<EOF
  db hg19
  orgDir human_9606
  build 132
  buildAssembly 37_1
  liftUp /hive/data/genomes/hg19/jkStuff/liftContigs.lft
  EOF
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
  # *** This release contains more than one assembly label.
  # *** Please examine this list in case we need to exclude any of these:
  #
  #CRA_TCAGchr7v2
  #Celera
  #GRCh37
  #HuRef
  # *** Add refAssemblyLabel to config.ra.  If keeping all labels, it will
  # *** look like this:
  #
  #refAssemblyLabel CRA_TCAGchr7v2,Celera,GRCh37,HuRef
  #
  # *** Edit out any of those that are not included in hg19 (e.g. Celera).
# *** Then restart this script with -continue=loadDbSnp .
      # GRCh37 is the only one that corresponds to hg19, so add it to config.ra:
      echo "refAssemblyLabel GRCh37" >> config.ra
  
      # Try again with updated config:
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra \
        -continue=loadDbSnp >>& do.log &
      tail -f do.log
  
      # 3/8/11:
      cd /hive/data/outside/dbSNP/132/human
      mkdir -p `cat workingDir`
      cp -p ucscNcbiSnp.bed.gz `cat workingDir`
      cd `cat workingDir`
      hgsql hg19 -NBe 'select chrom,chromStart,chromEnd,name from par' > par.bed
      snpNcbiToUcsc -par=par.bed \
        -snp132Ext ucscNcbiSnp.bed.gz /hive/data/genomes/hg19/hg19.2bit snp132
      # No change to snp132Errors.bed; only change to snp132ExceptionDesc.tab was
      # the MultipleAlignments count; snp132.bed items that lost their MultipleAlignments
      # exceptions were X & Y PAR matches.
      gzip snp132.bed snp132ExceptionDesc.tab snp132Errors.bed
      mv snp132* /hive/data/outside/dbSNP/132/human/
      cd /hive/data/outside/dbSNP/132/human/
      # Doh, dbSNP also assigned a weight of 3 to the PAR SNPs, and that is triggering
      # our snp132NonUnique filter (below).  I sent dbSNP an email about that, and will
      # tweak weight to 1 where I find a PAR SNP without MultipleAlignments, since I see
      # it as a bug fix.
      zcat snp132.bed.gz \
      | awk -F"\t" 'BEGIN{OFS="\t";} \
             (($1 == "chrX" && \
               (($3 > 60000 && $2 < 2699520) || ($3 > 154931043 && $2 < 155260560))) || \
              ($1 == "chrY" && \
               (($3 > 10000 && $2 < 2649520) || ($3 > 59034049 && $2 < 59363566)))) && \
             $18 !~/MultipleAlignments/ {$17 = 1;} \
             {print;}' > snp132.parWeightTweak.bed
        wc -l snp132.parWeightTweak.bed
  #33026121 snp132.parWeightTweak.bed
      # Make sure only the weight has changed:
      zcat snp132.bed.gz | cut -f 1-16,18-25 > /data/tmp/snp132.weightless.bed
      cut -f 1-16,18-25 snp132.parWeightTweak.bed > /data/tmp/snp132.parTweak.weightless.bed
      cmp /data/tmp/snp132*.weightless.bed
      # No output, good.
      # Reload snp132 with the tweaked weights:
      hgLoadBed -tab -onServer -tmpDir=$TMPDIR -allowStartEqualEnd \
        hg19 snp132 -sqlTable=snp132.sql snp132.parWeightTweak.bed
      zcat snp132ExceptionDesc.tab.gz \
      | hgLoadSqlTab hg19 snp132ExceptionDesc $HOME/kent/src/hg/lib/snp125ExceptionDesc.sql stdin
      gzip snp132.parWeightTweak.bed
  
  
  #############################################################################
  # Agilent arrays (2010-12-01 Andy)
  cd /hive/data/genomes/hg19/bed/agilentProbes/
  # first move all the lifted versions out of the way
  mkdir lifted.2009-07-28/
  mv * lifted.2009-07-28/
  # FTP download from ftp.agilent.com using given user/pass from Anniek De-witte
  # (anniek_de-witte@agilent.com)
  # downloaded files are gzipped beds. The files are typically located in a
  # directory called "FOR_UCSC" or something like that.  The user/pass and the
  # directory are deleted after it's confirmed they're received, so it's not
  # too helpful to mention specifics here.
  ftp -u user -p password ftp.agilent.com
  > cd directory
  > get 014693_D_BED_20100501.bed.gz
  > get 014698_D_BED_20100501.bed.gz
  > get 014950_D_BED_20100501.bed.gz
  > get 021529_D_BED_20100501.bed.gz
  > get 021850_D_BED_20100430.bed.gz
  > get 021924_D_BED_20100501.bed.gz
  > get 022060_D_BED_20100501.bed.gz
  > get 023642_D_BED_20100430.bed.gz
  > get 028081_D_BED_20101014.bed.gz
  > get 029830_D_BED_20100922.bed.gz
  # unzip everything
  gunzip *
  ln -s 022060_D_BED_20100501.bed agilent4x180k.bed
  ln -s 021529_D_BED_20100501.bed agilentCgh1x1m.bed
  ln -s 014693_D_BED_20100501.bed agilentCgh1x244k.bed
  ln -s 014698_D_BED_20100501.bed agilentCgh2x105k.bed
  ln -s 021850_D_BED_20100430.bed agilentCgh2x400k.bed
  ln -s 014950_D_BED_20100501.bed agilentCgh4x44k.bed
  ln -s 021924_D_BED_20100501.bed agilentCgh8x60k.bed
  ln -s 028081_D_BED_20101014.bed agilentCghSnp2x400k.bed
  ln -s 029830_D_BED_20100922.bed agilentCghSnp4x180k.bed
  ln -s 023642_D_BED_20100430.bed agilentHrd1x1m.bed
  for bed in agilent*.bed; do
      tail -n +2 $bed | hgLoadBed hg19 ${bed%.bed} stdin
  done
  rm bed.tab
### Update (2011-11-01 Andy Pohl)
  # (acquired bed file in e-mail attachment)
  cd /hive/data/genomes/hg19/bed/agilentProbes
  ln -s 030587_D_BED_20101006_Colored.bed agilentCghSnpCancer4x180k.bed
  tail +3 agilentCghSnpCancer4x180k.bed | hgLoadBed hg19 agilentCghSnpCancer4x180k stdin
  
  ### Update (2022-03-07 Max Haeussler and Daniel Schmelter)
  cd /hive/data/genomes/hg19/bed/agilentProbes/genetiSureCyto
  for i in hg19*.bed; do egrep -v 'browser|track' $i > /tmp/temp.bed; bedSort /tmp/temp.bed /tmp/temp.bed; bedToBigBed /tmp/temp.bed ../../../chrom.sizes `basename $i .bed`.bb; done
  mkdir /gbdb/hg19/genotypeArrays
  cd /gbdb/hg19/genotypeArrays
for i in /hive/data/genomes/hg19/bed/agilentProbes/genetiSureCyto/*.bb; do ln -s $i; done
  
  
  #################################################################################
  # Rfam (2011-11-30 Melissa Cline)
  #
  # This contains genomic alignments of Rfam sequences, from the Rfam group.
  #
  # This data is used in building UCSC Genes.
  #
  cd /hive/data/outside/Rfam
  mkdir 111130
  cd 111130
  wget ftp://ftp.sanger.ac.uk/pub/databases/Rfam/CURRENT/genome.gff3.tar.gz
  tar xzvf genome.gff3.tar.gz
  mkdir hg19
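# The pipeline below rewrites the accession names in the gff3 files to UCSC
# chrom names via chrom.aliases, then converts the 1-based GFF coordinates
# into a 0-based, single-block BED12.  (The sprintf("%c",39) calls emit
# literal single quotes inside the single-quoted awk program.)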
  cat /hive/data/genomes/hg19/chrom.aliases \
   |awk '{ print("cat /hive/data/outside/Rfam/111130/genome_gff/" $1 ".gff3",
                 "|sed", sprintf("%c", 39) "s/" $1 "/" $2 "/" sprintf("%c", 39))}' |bash \
   |grep -v -e "^#" \
 |awk '{ print($1 "\t" $4 - 1 "\t" $5 "\t" $9 "\t1\t" $7 "\t" \
               $4 - 1 "\t" $5 "\t0\t1\t" $5 - $4 + 1 "\t0")  }' \
  > hg19/Rfam.bed
  
  
  
  
  #####################################################
  # Vista Enhancers (galt 2010-12-09 done)
  #
  # Vista from Lawrence-Berkeley has assayed
  # 301 human conserved non-coding intra- and inter-
  # genic elements for their ability to promote
  # lacZ in mouse embryos.  A positive looks like
  # a mouse with a purple spine.
  #
  
  mkdir /hive/data/genomes/hg19/bed/vistaEnhancers
  cd /hive/data/genomes/hg19/bed/vistaEnhancers
  
  # download data file from the vista browser (coordinates are for hg19)
  wget -O enhancerbrowser.datadownload.txt 'http://enhancer.lbl.gov/cgi-bin/imagedb3.pl?page_size=100;show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1'
  
  # give elements with positive label a score of 900,
  # give elements with negative label a score of 200.
  # print to 5-field bed file
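# (For reference, the downloaded lines being parsed look roughly like
#    >Human|chr16:86430087-86430726 | element 1 | positive  | ...
#  which the pipeline below turns into "chr16  86430087  86430726  element_1  900".)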
  cat enhancerbrowser.datadownload.txt \
          | grep ">" \
          | sed -e 's#^<pre>##' \
          | sed -e 's#</pre>$##' \
          | grep "^>Human" \
          | sed -e 's#^>Human|##' \
          | tr :- ' ' \
          | sed -e 's/positive/900/'\
          | sed -e 's/negative/200/' \
          | awk '{print $1"\t"$2"\t"$3"\telement_"$6"\t"$8}' \
          | grep -P -v "^chr\t" \
          > vistaEnhancers.bed
  hgLoadBed hg19 vistaEnhancers vistaEnhancers.bed
  #Loaded 1339 elements of size 5
  
  
  # add to hg19/trackDb.ra
  track vistaEnhancers override
  url http://enhancer.lbl.gov/cgi-bin/imagedb3.pl?form=presentation&show=1&experiment_id=$$&organism_id=1
  
  
  #####################################################
  # UNIGENE/SAGE TRACK (RE-BUILT - 2010-12-10 Fan)
  
  # Create the uniGene alignments
  
      # Download of the latest UniGene version is now automated by a
      # cron job -- see /cluster/home/angie/crontab ,
      # /cluster/home/angie/unigeneVers/unigene.csh .
  
      ssh hgwdev
      mkdir -p /hive/data/genomes/hg19/bed/uniGene/101210
      cd /hive/data/genomes/hg19/bed/uniGene/101210
  
      set Version = 228
  
      zcat /hive/data/outside/uniGene/uniGene.$Version/Hs.seq.uniq.gz|\
      sed -e "s#>.*/ug=#>#; s# /len.*##;" > Hs.seq.uniq.simpleHeader.fa
  
      ssh swarm
      set Version = 228
      mkdir -p /hive/data/genomes/hg19/bed/uniGene/101210/run.blat
      cd /hive/data/genomes/hg19/bed/uniGene/101210/run.blat
  
      ls -1 /hive/data/genomes/hg19/nib/*.nib > genome.lst
      ls -1S \
      /hive/data/genomes/hg19/bed/uniGene/101210/Hs.seq.uniq.simpleHeader.fa \
        > uniGene.lst
  
      cat << '_EOF_' > template.sub
  #LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=95 -ooc=/hive/data/genomes/hg19/11.ooc $(path1) $(path2)  {check out line+ psl/$(root1)_$(root2).psl}
  #ENDLOOP
  '_EOF_'
  
      gensub2 genome.lst uniGene.lst template.sub para.spec
      para create para.spec
      mkdir psl
      para try
      para check
      para push
  
  #Completed: 93 of 93 jobs
  #CPU time in finished jobs:      68896s    1148.26m    19.14h    0.80d  0.002 y
  #IO & Wait Time:                  4789s      79.82m     1.33h    0.06d  0.000 y
  #Average job time:                 792s      13.21m     0.22h    0.01d
  #Longest finished job:            5274s      87.90m     1.47h    0.06d
  #Submission to last job:          5840s      97.33m     1.62h    0.07d
  #Estimated complete:                 0s       0.00m     0.00h    0.00d
  
      pslSort dirs raw.psl tmp psl >& pslSort.log
      cat raw.psl|\
      pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
        stdin hg19.uniGene.pslReps.psl /dev/null
  
      gzip raw.psl
  
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/uniGene/101210/run.blat
  
      hgLoadPsl -table=uniGene_3 hg19 hg19.uniGene.pslReps.psl
  
      mkdir -p /gbdb/hg19/uniGene
      cd /gbdb/hg19/uniGene
  
      rm Hs.seq.uniq.simpleHeader.fa
      ln -s \
      /hive/data/genomes/hg19/bed/uniGene/101210/Hs.seq.uniq.simpleHeader.fa \
      Hs.seq.uniq.simpleHeader.fa
  
  # load the sequence
  
      hgLoadSeq -replace hg19 /gbdb/hg19/uniGene/Hs.seq.uniq.simpleHeader.fa
  
  ##############################################################################
  
  ##############################################################################
  # GAD View Lift (DONE, Andy 2010-12-12)
  
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift
  mkdir gad
  cd gad/
  echo "select * from gad" | hgsql hg18 | tail -n +2 > hg18.bed
  liftOver hg18.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.{bed,unmapped}
  hgLoadBed -noBin hg19 gad hg19.bed
  rm bed.tab
  wc -l *.bed
  #  1883 hg18.bed
  #  1860 hg19.bed
  grep -i split hg19.unmapped | wc -l
  #18
  grep -i part hg19.unmapped | wc -l
  #5
  for table in gadAll gadList; do
      hgsqldump hg18 $table | hgsql hg19
  done
  
  #############################################################################
  # EVOFOLD (Done, 2010-12-13) - Galt using Jakob's procedure from hg18.txt.
  
  # RNA secondary structure predictions lifted from hg17 and filtered
    ssh -C hgwdev
    mkdir -p /cluster/data/hg19/bed/evofold
    cd /cluster/data/hg19/bed/evofold
    echo "select chrom, chromStart, chromEnd, name, score, strand, size, secStr, conf from evofold;" | hgsql hg17 | sed -e 1d > foldsHg17.bed
    liftOver -bedPlus=6 -minMatch=1.0 foldsHg17.bed /cluster/data/hg17/bed/liftOver/hg17ToHg19.over.chain.gz tmp.bed unmapped.bed
    # remove elements which are wrong size after lifting
    awk '$3-$2 == $7' tmp.bed | sort -k4,4 > rawFoldsHg19.bed
  
    # structure filters
    # first, remove pairs that can't form in human
    cut -f 1-6 rawFoldsHg19.bed > tmp.bed
    # sequenceForBed can be found and compiled from here: $HOME/kent/src/hg/altSplice/altSplice/
    nice sequenceForBed -db=hg19 -bedIn=tmp.bed -fastaOut=tmp.fa
    cat tmp.fa | sed -e 's/\.[+-]\.chr.*$//' \
               | sed -e '/^>/s/$/\t/' | tr -d '\n' | sed -e 's/>/\n/g' | sed -e '1d' -e '$s/$/\n/' | sort -k1,1 > foldsHg19Seq.tab
    # Several python scripts were originally in /cluster/home/jsp/scripts/
    # I copied them to this directory and
    # I removed the optional "psyco" speedup library which does not work with our 64-bit python
    join -1 4 -2 1 -o "1.4 1.8 2.2" rawFoldsHg19.bed foldsHg19Seq.tab | sed -e 's/  */\t/g' | sort -k1,1 \
  	     | ./tabFoldFilter.py > cleanFolds.tab
    join -1 4 -2 1 -o "1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.2 1.9" rawFoldsHg19.bed cleanFolds.tab | sed -e 's/  */\t/g' > tmp1.bed
    # second, remove poor predictions
    # scripts can be found in cvs tree at: cvsroot/jsp/scripts/. They use a few modules which can be found at: cvsroot/jsp/py_modules
    cat tmp1.bed | ./bedRnassFilter.py --dangling --minAvrStemSize=3 | ./bedRnassFilter.sh 1 3 \
  	       | ./roundListFloats.py -c9 > foldsHg19.bed
    # clean up
    rm tmp.bed tmp1.bed foldsHg17.bed foldsHg19Seq.tab rawFoldsHg19.bed tmp.fa cleanFolds.tab
  
    # upload
    hgLoadBed -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/evofold.sql hg19 evofold foldsHg19.bed
  #############################################################################
  # CREATE .PNG PICTURE FILES OF EVOFOLD RNA STRUCTURES. (DONE, 4/29/2011, Fan)
  
  ssh hgwdev
  mkdir /hive/data/genomes/hg19/bed/evofold/doEvoFold
  cd /hive/data/genomes/hg19/bed/evofold/doEvoFold
  
# Create sub-directories to store the .png files (about 47.5 K of them in total), separated by chromosome.
  
for C in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 \
         chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrM chrX chrY
do
    mkdir -p evoFold/${C}
done
  
# get the latest version of the VARNA .jar file
  
  wget --timestamping http://varna.lri.fr/bin/VARNAv3-7.jar
  
  # Create Java command line files
  
  echo 'doEvoFold hg19 do$1 $1' >do1Chrom
  chmod +x do1Chrom
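# note: the single quotes keep the $1's literal, so do1Chrom contains exactly
# "doEvoFold hg19 do$1 $1"; running "do1Chrom chr1" thus invokes
# "doEvoFold hg19 dochr1 chr1", which presumably writes the per-chromosome
# command file dochr1 that is run below.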
  
for C in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 \
         chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrM chrX chrY
do
    do1Chrom ${C}
done
  
# run the dochrXX command files in small batches with '&' to exploit multiple CPUs;
# wait an hour between batches so that we don't tie up too many computational resources.
  
  dochr1 &
  dochr2 &
  dochr3 &
  dochr4 &
  dochr5 &
  
  sleep 3600
  
  dochr6 &
  dochr7 &
  dochr8 &
  dochr9 &
  dochr10 &
  
  sleep 3600
  
  dochr11 &
  dochr12 &
  dochr13 &
  dochr14 &
  dochr15 &
  
  sleep 3600
  
  dochr16 &
  dochr17 &
  dochr18 &
  dochr19 &
  dochr20 &
  
  sleep 3600
  
  dochr21 &
  dochr22 &
  dochrX &
  dochrY &
  dochrM &
  
  # check the resulting .png files
  
  # create a simple script file, check1, with the following 3 lines:
  
  echo $1
  hgsql hg19 -N -e "select count(*) from evofold where chrom='${1}'"
  ls evoFold/$1/*.png|wc
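# for example, the file can be written verbatim with a quoted heredoc
# (quoting 'EOF' keeps the $1 references literal):

cat << 'EOF' > check1
echo $1
hgsql hg19 -N -e "select count(*) from evofold where chrom='${1}'"
ls evoFold/$1/*.png|wc
EOF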
  
  chmod +x check1
  
  # create another script file, checkAll, with the following lines:
  
for C in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 \
         chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrM chrX chrY
do
    check1 ${C}
done
  
  chmod +x checkAll
  checkAll >j.check
  
# examine the results in j.check to make sure things are OK.
  
  # create symbolic links
  
  ln -s /hive/data/genomes/hg19/bed/evofold/doEvoFold/evoFold  /gbdb/hg19/evoFold
  ln -s /gbdb/hg19/evoFold /usr/local/apache/htdocs/evoFold/hg19
  
  ##########################################################################
  # Build targetScanS track - (DONE - 2010-12-13 galt)
  #       requested by: George Bell gbell at wi.mit.edu
      ssh hgwdev
      mkdir -p /cluster/data/hg19/bed/targetScanS
      cd /cluster/data/hg19/bed/targetScanS
  
      wget --timestamping http://www.targetscan.org/vert_50/ucsc/hg19/hg19Cons_ALL_CHRS.BED
  
      hgLoadBed hg19 targetScanS hg19Cons_ALL_CHRS.BED
      #	Loaded 54199 elements of size 6
      featureBits hg19 targetScanS
      #   354163 bases of 2897316137 (0.012%) in intersection
  
      # Create/edit/check in targetScans.html and trackDb.ra under
      # kent/src/hg/makeDb/trackDb/human/hg19
  
  ##########################################################################
  # Update targetScanS track to Version 7.2 - (DONE 2021-02-03 Brittney & Kate)
  
  cd /hive/data/genomes/hg19/bed/targetScanS
mkdir vert_72
cd vert_72
  
  wget --header="accept-encoding: gzip" http://www.targetscan.org/vert_72/vert_72_data_download/Predicted_Target_Locations.default_predictions.hg19.bed.zip
  
  unzip Predicted_Target_Locations.default_predictions.hg19.bed.zip
  
  # The downloaded file is a BED12. The score column has values between -1 and 100.
  # Everything else looks good.
  # Create a new BED12+1 file with this score value as field 13 and change the score values
  # to 0 in column 5.
  
awk 'BEGIN{OFS="\t"} {c=$5; $5="0"; print $0, c}' \
        < Predicted_Target_Locations.default_predictions.hg19.bed \
        > bed12+1_targetScanSitesV72.bed
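# spot-check (illustrative, not part of the original build): field 5 should
# now be 0 and field 13 should hold the original score
head -3 bed12+1_targetScanSitesV72.bed | cut -f5,13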
  
  # copy "bedDetail.as" from ~/kent/src/hg/lib and rename to targetScanS.as and edit to
  # match our new BED12+1 file.
  cp ~/kent/src/hg/lib/bedDetail.as .
  mv bedDetail.as targetScanS.as
  
  bedToBigBed -type=bed12+1 -as=targetScanS.as bed12+1_targetScanSitesV72.bed \
          ../../../chrom.sizes targetScanSitesV72.bb
  
  # Link bigBed file into /gbdb directory for access by browser
  cd /gbdb/hg19
  mkdir targetScan
  cd targetScan
  ln -s /hive/data/genomes/hg19/bed/targetScanS/vert_72/targetScanSitesV72.bb .
  
  ##########################################################################
  # Neandertal tracks for hg19 (DONE - 2010-12-14 - Hiram)
      # data supplied by Ed Green into /hive/data/outside/homNea/hg19
      # add Neandertal group to hg19 grp
      hgsql hg19 -e \
        "insert into grp values ('neandertal', 'Neandertal Assembly and Analysis', 6.5, 1);"
  
      mkdir -p /hive/data/genomes/hg19/bed/homNea/seqAlis
      cd /hive/data/genomes/hg19/bed/homNea/seqAlis
  
  for T in Feld1 Mez1 Sid1253 Vi33.16 Vi33.25 Vi33.26
  do
      ln -s /hive/data/outside/homNea/hg19/${T}.hg18.bam \
  	./SL${T}.hg19.bam
  done
  
      ln -s /hive/data/outside/homNea/hg19/*.bam .
      for F in *.bam
  do
      samtools index $F
  done
  
      mkdir -p /gbdb/hg19/neandertal/seqAlis
      ln -s `pwd`/SL*.b* /gbdb/hg19/neandertal/seqAlis/
  
  for T in Feld1 Mez1 Sid1253
  do
      hgBbiDbLink hg19 bamSL${T} \
  	/gbdb/hg19/neandertal/seqAlis/SL${T}.hg19.bam
  done
  for T in 16 25 26
  do
      hgBbiDbLink hg19 bamSLVi33dot${T} \
  	/gbdb/hg19/neandertal/seqAlis/SLVi33.${T}.hg19.bam
  done
  
  ##########################################################################
  # DECIPHER, RGD QTL, RGD RAT QTL (MAYBE DONE, Andy 2010-12-13)
  
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift
  mkdir decipher rgdQtl rgdRatQtl
  for tab in decipher rgdQtl rgdRatQtl; do
    echo "select * from "$tab | hgsql hg18 | tail -n +2 | cut -f2- > ${tab}/hg18.bed
    liftOver ${tab}/hg18.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz ${tab}/hg19.{bed,unmapped}
    hgLoadBed hg19 $tab ${tab}/hg19.bed
  done
  rm bed.tab
  wc -l {decipher,rgdQtl,rgdRatQtl}/hg1{8,9}.bed
  #  4227 decipher/hg18.bed
  #  4048 decipher/hg19.bed
  #   254 rgdQtl/hg18.bed
  #   225 rgdQtl/hg19.bed
  #  6033 rgdRatQtl/hg18.bed
  #  5804 rgdRatQtl/hg19.bed
## This isn't very good: in each case, over 2% of items failed to lift.
## Mapped fractions -- DECIPHER: 95.8%, RGD QTL: 88.6%, RGD RAT QTL: 96.2%
  ## update for rgdQtl:
  hgsqldump hg18 rgdQtlLink | hgsql hg19
  hgsqldump hg18 rgdRatQtlLink | hgsql hg19
  
  #############################################################################
  # FISH CLONES LIFT (DONE, Andy 2010-12-14)
  
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift
  mkdir fishClones
  cd fishClones/
  echo "select * from fishClones" | hgsql hg18 | tail -n +2 > hg18.bed5p
  liftOver -bedPlus=5 hg18.bed5p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.bed5p hg19.unmapped
  wc -l *.bed5p
  #   9788 hg18.bed5p
  #   9758 hg19.bed5p
  grep -i split hg19.unmapped  | wc -l
  # 17
  grep -i partially  hg19.unmapped  | wc -l
  # 13
  cp ~/kent/src/hg/lib/fishClones.sql .
  hgLoadBed -tab -sqlTable=fishClones.sql -notItemRgb hg19 fishClones hg19.bed5p
  
  #############################################################################
  # CGAP SAGE LIFT (DONE, Galt 2010-12-16)
  
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift
  mkdir cgapSage
  cd cgapSage
  echo "select * from cgapSage" | hgsql hg18 -N > hg18.bed8p
  liftOver -tab -bedPlus=8 -hasBin hg18.bed8p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.bed8p hg19.unmapped
  wc -l *.bed8p
  #  276905 hg18.bed8p
  #  276865 hg19.bed8p
  grep -i split hg19.unmapped  | wc -l
  # 0
  grep -i partially  hg19.unmapped  | wc -l
  # 3
  cp ~/kent/src/hg/lib/cgapSage/cgapSage.sql .
  hgLoadBed -tab -hasBin -sqlTable=cgapSage.sql -notItemRgb hg19 cgapSage hg19.bed8p
  
  # no lift needed for the lib table
  echo "select * from cgapSageLib" | hgsql hg18 -N > cgapSageLib.tab
  cp ~/kent/src/hg/lib/cgapSage/cgapSageLib.sql .
  hgLoadSqlTab hg19 cgapSageLib cgapSageLib.sql cgapSageLib.tab
  
  #############################################################################
  # LASTZ Zebrafish DanRer7 (DONE - 2010-12-17 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzDanRer7.2010-12-17
      cd /hive/data/genomes/hg19/bed/lastzDanRer7.2010-12-17
  
      cat << '_EOF_' > DEF
# human vs zebrafish danRer7
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Zebrafish danRer7
  SEQ2_DIR=/scratch/data/danRer7/danRer7.2bit
  SEQ2_LEN=/scratch/data/danRer7/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=40
  
  BASE=/hive/data/genomes/hg19/bed/lastzDanRer7.2010-12-17
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	Elapsed time: 1698m29s
      cat fb.hg19.chainDanRer7Link.txt
      #	80849592 bases of 2897316137 (2.790%) in intersection
  
      #	running the swap
      mkdir /hive/data/genomes/danRer7/bed/blastz.hg19.swap
      cd /hive/data/genomes/danRer7/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzDanRer7.2010-12-17/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-swap > swap.log 2>&1 &
      #	real    42m52.920s
      cat fb.danRer7.chainHg19Link.txt
      #	86716552 bases of 1409770109 (6.151%) in intersection
  
  ##############################################################################
  # UMass Med School brain histone ChIP-seq
  # (DONE - 2010-12-17 - Kate)
  #
  # From Troy Whitfield, submitting for Zhiping Weng, collab with Akbarian
  # Published in PNAS, April 2010
  # Variables: cell, sample, sex, age
  # Tissue:  Prefrontal cortex (PFC)
  #
  # 11 individuals, age .5 to 69 years
  # 13 bigWigs of H3K4me3 enrichment: 11 neuronal, 2 non-neuronal
  # 3 peaks files (bed5FloatScore)
  # Neuronal cells selected by FACS sorting, based on NeuN marker (NeuN+)
  # Non-neuronal (NeuN-) cells are largely glia, microglia, and endothelium
  
  # Note: used publicly available blood cell (lymphocyte)
  # ChIP-seq as controls:
  # K562, GM12878 (from Bernstein ENCODE group), CD4+ (from Barski, HLB)
  
      cd /hive/data/genomes/hg19/bed
  
      mkdir uMassBrainHistone
      cd uMassBrainHistone
  
      wget http://zlab.umassmed.edu/~whitfiet/hg19/uMassBrainHist.tar.gz
      tar xvfz uMassBrainHist.tar.gz
      cd data
  
      set t = "uMassBrainHistone"
  
  # Load peak tracks
  # peaks in neuron not in  blood
      hgLoadBed hg19 -noNameIx -renameSqlTable \
        -sqlTable=/cluster/bin/sqlCreate/bed5FloatScore.sql \
        ${t}PeaksNeuron 11Neuronal_vs_3Blood_hg19.bed
  # Loaded 7947 elements of size 6
  
  # peaks in infants (<1 year), not in seniors (>60)
      hgLoadBed hg19 -noNameIx -renameSqlTable \
        -sqlTable=/cluster/bin/sqlCreate/bed5FloatScore.sql \
        ${t}PeaksInfant 3Young_vs3Old_hg19.bed
  # Loaded 1292 elements of size 6
  
  # peaks specific to individuals
      hgLoadBed hg19 -noNameIx -renameSqlTable \
        -sqlTable=/cluster/bin/sqlCreate/bed5FloatScore.sql \
        ${t}PeaksSample SampleSpecific_hg19.bed
  # Loaded 3214 elements of size 6
  
  # Load signal tracks
  # Get metadata from .ddf file
  
      cd ..
      cat uMassMedBrainHist.ddf
  
  #files   view    sample  cell    sex     age
  #data/s6.bw      Signal  6       neuron  male    4.7
  #data/s6n.bw     Signal  6       non-neuron      male    4.7
  #data/s2.bw      Signal  2       neuron  male    0.58
  #data/s3.bw      Signal  3       neuron  female  0.75
  #data/s1.bw      Signal  1       neuron  male    0.5
  #data/s9.bw      Signal  9       neuron  female  68
  #data/s5.bw      Signal  5       neuron  female  2.8
  #data/s11.bw     Signal  11      neuron  female  69
  #data/s11n.bw    Signal  11      non-neuron      female  69
  #data/s7.bw      Signal  7       neuron  male    8.8
  #data/s8.bw      Signal  8       neuron  male    14
  #data/s10.bw     Signal  10      neuron  female  69
  #data/s4.bw      Signal  4       neuron  male    1.3
  #data/11Neuronal_vs_3Blood_hg19.bed      Peaks   N/A     N/A     N/A     N/A
#data/3Young_vs3Old_hg19.bed     Peaks   N/A     N/A     N/A     N/A
  #data/SampleSpecific_hg19.bed    Peaks   N/A     N/A     N/A     N/A
  
  
  # Generate table names from DDF
  # Format:  uMassBrainHistoneSignalS<sample>Neu<P|M><age>yrs<M|F>
# e.g. uMassBrainHistoneSignalS6NeuP4pt7yrsM
  
  
      cat << 'EOF' > list.pl
  while (<>) {
      ($file, $view, $sample, $cell, $sex, $age) = split;
      next unless $view eq 'Signal';
      $cell = ($cell eq 'neuron' ? 'P' : 'M');
      $age =~ s/\./pt/;
      $sex = ($sex eq 'male' ? 'M' : 'F');
      $table = "uMassBrainHistoneSignalS" . $sample . "Neu" . $cell . $age . "yrs" . $sex;
      print $file . "." . $table . "\n";
  }
  'EOF'
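    # quick sanity check (illustrative; expected first line given the ddf above):
    #   perl list.pl < uMassMedBrainHist.ddf | head -1
    #   data/s6.bw.uMassBrainHistoneSignalS6NeuP4pt7yrsM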
  
      cat << 'EOF' > load.csh
  set gbdb = "/gbdb/hg19/bbi/uMassBrainHistone"
  foreach x (`perl list.pl < uMassMedBrainHist.ddf`)
      set f = $x:r
      set t = $x:e
      echo "Loading $f into $t"
      bigWigInfo $f
      ln -s `pwd`/$f $gbdb/$t.bw
      hgBbiDbLink hg19 $t $gbdb/$t.bw
  end
  'EOF'
  
      csh load.csh >&! load.out &
  
  #############################################################################
  # SwitchDB TSS Track (DONE 2010-12-17 galt)
  #
  # This liftover is tricky because of the gmStart and gmEnd, which
  # are not lifted automatically.
  # The gm coordinates have to be lifted separately.
  
  ssh hgwdev
  mkdir /cluster/data/hg19/bed/switchDbTss
  cd /cluster/data/hg19/bed/switchDbTss
  ln -s /cluster/data/hg17/bed/switchDbTss/hg17.bed hg17.bed
  liftOver -tab -bedPlus=6 hg17.bed /gbdb/hg17/liftOver/hg17ToHg19.over.chain.gz hg19.bed unMapped
  cat unMapped | grep '^#' | sort | uniq -c
  #     61 #Deleted in new
  
  ln -s ~/kent/src/hg/lib/switchDbTss.sql
  hgLoadBed -renameSqlTable -sqlTable=switchDbTss.sql hg19 switchDbTssTemp hg19.bed
  
  mysql> select count(*) from switchDbTssTemp;
  +----------+
  | count(*) |
  +----------+
  |   132332 |
  +----------+
  
  hgsql -N hg19 -e "select distinct chrom, gmChromStart, gmChromEnd, gmName from switchDbTssTemp" > gmLoc.hg17.bed4
  liftOver -tab -bedPlus=4 gmLoc.hg17.bed4 /gbdb/hg17/liftOver/hg17ToHg19.over.chain.gz gmLoc.hg19.bed4 gmUnMapped
  
  cat gmUnMapped | grep '^#' | sort | uniq -c
  #      1 #Deleted in new
  #     58 #Partially deleted in new
  #     57 #Split in new
  
  hgLoadBed hg19 switchDbTssGmLocTemp gmLoc.hg19.bed4
  
  
  hgsql hg19 < switchDbTss.sql
  
  hgsql hg19 -e "insert into switchDbTss select a.bin, a.chrom, a.chromStart, a.chromEnd, a.name, a.score, a.strand, a.confScore, a.gmName, b.chromStart as gmChromStart, b.chromEnd as gmChromEnd, a.isPseudo from switchDbTssTemp a, switchDbTssGmLocTemp b where a.gmName = b.name"
  
  mysql> select count(*) from switchDbTss;
  +----------+
  | count(*) |
  +----------+
  |   131780 |
  +----------+
  
  hgsql hg19 -e "drop table switchDbTssTemp"
  hgsql hg19 -e "drop table switchDbTssGmLocTemp"
  
  #############################################################################
  
  #############################################################################
  # FOSMID END PAIRS LIFT FROM HG18 (DONE 2010-12-28, Andy)
  
  mkdir /hive/data/genomes/hg19/bed/hg18MassiveLift/fosEndPairs
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift/fosEndPairs/
  echo "select * from fosEndPairs" | hgsql hg18 | tail -n +2 | cut -f2- > hg18.fosEndPairs.fep.bed
  echo "select * from all_fosends" | hgsql hg18 | tail -n +2 | cut -f2- > hg18.all_fosends.psl
# Converting to bed 12 because of the positional info in nonstandard fields.
# The awk script (toBed12.awk) is pretty simple and lives in this directory.
  awk -f toBed12.awk hg18.fosEndPairs.fep.bed > hg18.fosEndPairs.bed12
  liftOver -bedPlus=12 hg18.fosEndPairs.bed12 /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.fosEndPairs.{bed12,unmapped12}
  liftOver -pslT hg18.all_fosends.psl /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.{psl,unmapped}
  # remove pairs which have one (or both) ends not in the lifted set of all ends
  cut -f14 hg19.fosEndPairs.bed12 | tr ',' '\n' | sort > hg19.fosEndPairs.names
  cut -f10 hg19.all_fosends.psl | sort > hg19.all_fosends.names
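# bad.names = end names referenced by some pair but absent from the lifted
# ends (-Fxv does fixed-string, whole-line, inverted matching); the second
# grep then drops every pair whose line mentions one of those names.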
  grep -Fxv -f hg19.all_fosends.names hg19.fosEndPairs.names > bad.names
  grep -Fv -f bad.names hg19.fosEndPairs.bed12 > hg19.fosEndPairs.good.bed12
  wc -l hg19*.bed12
  #  384635 hg19.fosEndPairs.bed12
  #  384442 hg19.fosEndPairs.good.bed12
  ## so... 193 bad ones were removed.  These would have caused join errors if they were left in.
  
  ## convert back to fosEndPairs bed6+
  awk -f toFosEndPairs.awk hg19.fosEndPairs.good.bed12 > hg19.fosEndPairs.fep.bed
  
  cp ~/kent/src/hg/lib/fosEndPairs.sql .
  hgLoadBed -sqlTable=fosEndPairs.sql -notItemRgb hg19 fosEndPairs hg19.fosEndPairs.fep.bed
  hgLoadPsl -table=all_fosends hg19 hg19.all_fosends.psl
  
  wc -l *.fep.bed
  #  386129 hg18.fosEndPairs.fep.bed
  #  384442 hg19.fosEndPairs.fep.bed
  ## 99.6% lifted
  
  ## Now we need the sequences from all_fosends to be loaded into the seq table in hg19
  
  mkdir /gbdb/hg19/fosends
ln -s /gbdb/hg18/fosends/fosEnds.fa /gbdb/hg19/fosends/fosEnds.fromHg18.fa
  hgLoadSeq hg19 /gbdb/hg19/fosends/fosEnds.fromHg18.fa
  
  #############################################################################
  # DECIPHER LIFT FROM HG18 (DONE 2010-12-27, Andy)
  
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift/
  mkdir decipher
  cd decipher/
  hgsql -e "select * from decipherRaw" hg18 | tail -n +2 > hg18.decipherRaw.txt
cat hg18.decipherRaw.txt \
| awk 'BEGIN{FS="\t";OFS="\t"}{ chr="chr"$4; $4=$1; $1=chr; $2 = $2 - 1; print;}' \
| liftOver -bedPlus=4 -tab stdin /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz \
    stdout hg19.decipherRaw.unmapped \
| sed 's/^chr//' \
| awk 'BEGIN{FS="\t";OFS="\t"}{ t=$1; $1=$4; $4=t; print;}' > hg19.decipherRaw.txt
  cp ~/kent/src/hg/lib/decipherRaw.sql .
  hgLoadSqlTab hg19 decipherRaw decipherRaw.sql hg19.decipherRaw.txt
  hgsql -e "select * from decipher" hg18 | tail -n +2 | cut -f2- > hg18.decipher.bed
  liftOver hg18.decipher.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.decipher.{bed,unmapped}
  cut -f1 hg19.decipherRaw.txt > hg19.decipherRaw.names
  cut -f4 hg19.decipher.bed > hg19.decipher.names
# how many lifted decipherRaw IDs are not among the lifted decipher items?
  grep -Fvx -f hg19.decipher.names hg19.decipherRaw.names | wc -l
  #0
  # none. ok then, we are done.
  rm *.names
  wc -l *.bed
  #  4227 hg18.decipher.bed
  #  4048 hg19.decipher.bed
  hgLoadBed hg19 decipher hg19.decipher.bed
  
  #############################################################################
  # CLONE COVERAGE LIFT FROM HG18 (DONE 2010-12-29, Andy)
  
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift/
  mkdir clonePos
  cd clonePos/
  hgsql --skip-column-names -e "select * from clonePos" hg18 | awk 'BEGIN{FS="\t"; OFS="\t";}{print $4, $5, $6, $1, $2, $3, $7, $8;}' > hg18.clonePos.bed4p
  liftOver -tab -bedPlus=4 hg18.clonePos.bed4p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.clonePos.{bed4p,unmapped}
  awk 'BEGIN{FS="\t"; OFS="\t";}{print $4, $5, $6, $1, $2, $3, $7, $8;}' hg19.clonePos.bed4p > hg19.clonePos.txt
  hgLoadSqlTab hg19 clonePos clonePos.sql hg19.clonePos.txt
  ## it loaded but there seems to be a dependency on the "chr*_gl" tables.  sigh..
  mkdir gl
  cd gl/
  echo "show tables like 'chr%_gl'" | hgsql hg18 | tail -n +2 | while read table; do echo "select * from "$table | hgsql hg18 | tail -n +2 | cut -f2- | awk -v chr=${table%_gl} 'BEGIN{OFS="\t"}{print chr, $2, $3, $1, "1000", $4;}'; done > hg18.gl.bed
  liftOver hg18.gl.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.gl.{bed,unmapped}
  hgLoadBed -noLoad hg19 gl hg19.gl.bed
  mv bed.tab hg19.gl.withBin.bed
  awk 'BEGIN{OFS="\t";}{fname=$2"_gl.txt"; print $1, $5, $3, $4, $7 >> fname;}' hg19.gl.withBin.bed
  cd ../
  for tab in `echo show tables like "'chr%_gl'" | hgsql hg19 | tail -n +2`; do
    echo select frag from $tab | hgsql hg19 | tail -n +2  >> gl.names;
  done
  sed 's/_.*//' gl.names | sort | uniq > uniq.gl.names
  cut -f4 hg19.clonePos.bed4p > hg19.clonePos.names
  diff uniq.gl.names hg19.clonePos.names | grep '<' | sed 's/< //' > bad_gl.names
cd gl/
for f in chr*.txt; do
     tab=${f%.txt}
     grep -v -f ../bad_gl.names $f > ${tab}.update.txt
     hgLoadSqlTab hg19 ${tab} hg18.chr1_gl.sql ${tab}.update.txt;
  done
  
  #############################################################################
  # MGI MOUSE QTL LIFT (DONE 2010-12-30, Andy)
  
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift
  mkdir mgiMouseQtl
  cd mgiMouseQtl/
  ## There are two subtracks to deal with but it's not a big deal.
  ## Both are bed4.
  for tab in jaxQtlAsIs jaxQtlPadded; do
     hgsql hg18 --skip-column-names -e "select chrom,chromStart,chromEnd,name from "$tab > hg18.${tab}.bed
     liftOver hg18.${tab}.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${tab}.{bed,unmapped}
     hgLoadBed hg19 $tab hg19.${tab}.bed
  done
  wc -l *.bed
  #    398 hg18.jaxQtlAsIs.bed
  #   1463 hg18.jaxQtlPadded.bed
  #    383 hg19.jaxQtlAsIs.bed
  #   1462 hg19.jaxQtlPadded.bed
  ## 96.2% for jaxQtlAsIs, 99.9% for jaxQtlPadded.
  
  #############################################################################
  # H-INV LIFT (DONE 2010-12-30, Andy)
  
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift
  mkdir HInv
  cd HInv/
  hgsql hg18 --skip-column-names -e "select * from HInvGeneMrna" | cut -f2- > hg18.HInvGeneMrna.psl
  liftOver -pslT hg18.HInvGeneMrna.psl /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.HInvGeneMrna.{psl,unmapped}
  hgLoadPsl -table=HInvGeneMrna hg19 hg19.HInvGeneMrna.psl
  ## A couple non-positional tables too:
  hgsqldump hg18 ensToHInv | hgsql hg19
  hgsqldump hg18 HInv | hgsql hg19
  
  #############################################################################
  # SIB ALT-SPLICING LIFT (DONE 2010-12-30, Andy)
  # Note:
  # Obsolete. See the section "SIB Transcriptome (DONE 2011-12-02 Chin)"
  # down below and redmine track #5630 for more details
  
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift
  mkdir sibTxGraph
  cd sibTxGraph/
  ## there is positional data in the 9th column, so I'll convert it to a bed12+
  ## and back to the native format later.  Again, the awk scripts to do that
  ## are aptly named, and reside in the hg19 directory with the data.
  hgsql hg18 --skip-column-names -e "select * from sibTxGraph" | cut -f2- | awk -f toBed12Plus.awk > hg18.sibTxGraph.bed12p
  liftOver -tab -bedPlus=12 hg18.sibTxGraph.bed12p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.sibTxGraph.{bed12p,unmapped}
  awk -f toSib.awk hg19.sibTxGraph.bed12p > hg19.sibTxGraph.txt
  cut -f1-3 hg19.sibTxGraph.bed12p > hg19.sibTxGraph.bed3
  hgLoadBed -noLoad hg19 sibTxGraph hg19.sibTxGraph.bed3
  cut -f1 bed.tab > hg19.sibTxGraph.bins
  paste hg19.sibTxGraph.bins hg19.sibTxGraph.txt > hg19.sibTxGraph.withBin.txt
  ## Oddly, there's no .sql file for this in the kent source-tree, so I'll
  ## make one directly from hg18 that's suitable for hgLoadSqlTab
  hgsqldump --no-data --compact hg18 sibTxGraph | sed '/^SET/d;s/ENGINE.*//'  > sibTxGraph.sql
  hgLoadSqlTab hg19 sibTxGraph sibTxGraph.sql hg19.sibTxGraph.withBin.txt
  
  wc -l *.bed12p
  #    47094 hg18.sibTxGraph.bed12p
  #    47008 hg19.sibTxGraph.bed12p
  ## 99.8% lifted, not bad.
  
  #############################################################################
  # ILLUMINA WG-6 LIFT TO HG19 (DONE 2010-12-30, Andy)
  
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift
  mkdir illuminaProbes
  cd illuminaProbes/
  ## Just copy the seq table to hg19
  hgsqldump hg18 illuminaProbesSeq | hgsql hg19
  ## Two tables: a PSL and a BED:
  hgsql hg18 --skip-column-names -e "select * from illuminaProbes" | cut -f2- > hg18.illuminaProbes.bed
  hgsql hg18 --skip-column-names -e "select * from illuminaProbesAlign" | cut -f2- > hg18.illuminaProbesAlign.psl
  liftOver hg18.illuminaProbes.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.illuminaProbes.{bed,unmapped}
  liftOver -pslT hg18.illuminaProbesAlign.psl /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.illuminaProbesAlign.{psl,unmapped}
  hgLoadBed hg19 illuminaProbes hg19.illuminaProbes.bed
  hgLoadPsl -table=illuminaProbesAlign hg19 hg19.illuminaProbesAlign.psl
  ## Just to check the probes and align tables are essentially the same
  cut -f4 hg19.illuminaProbes.bed | sort > hg19.illuminaProbes.names
  cut -f10 hg19.illuminaProbesAlign.psl | sort > hg19.illuminaProbesAlign.names
  diff *.names
  #(no output)
  wc -l *.bed *.psl
  #   44163 hg18.illuminaProbes.bed
  #   44088 hg19.illuminaProbes.bed
  #   44163 hg18.illuminaProbesAlign.psl
  #   44088 hg19.illuminaProbesAlign.psl
  ## 99.8% lifted
  
  #############################################################################
  # EIO/JCVI NAS LIFT TO HG19 (DONE 2010-12-30, Andy)
  
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift
  mkdir eioJcviNAS
  cd eioJcviNAS/
  for tab in eioJcviNASNeg eioJcviNASPos; do
     hgsql hg18 --skip-column-names -e "select * from "$tab > hg18.${tab}.bed
     liftOver hg18.${tab}.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${tab}.{bed,unmapped}
     hgLoadBed hg19 ${tab} hg19.${tab}.bed
  done
  wc -l *.bed
  #  338278 hg18.eioJcviNASNeg.bed
  #  130535 hg18.eioJcviNASPos.bed
  #  338238 hg19.eioJcviNASNeg.bed
  #  130504 hg19.eioJcviNASPos.bed
  ## > 99.9% of items lifted in both tables: pretty good.
  ## One strange thing about this one is that the hg18 tables don't have a bin
  ## field.  I doubt it's important to keep it that way.
  
  #############################################################################
  # ORegAnno LIFT TO HG19 (DONE 2010-12-31, Andy)
  
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift
  mkdir oreganno
  cd oreganno
  hgsql hg18 --skip-column-names -e "select * from oreganno" | cut -f2- > hg18.oreganno.bed3p
  liftOver -bedPlus=3 -tab hg18.oreganno.bed3p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.oreganno.{bed3p,unmapped}
  cp ~/kent/src/hg/lib/oreganno.sql .
  hgLoadBed -tab -sqlTable=oreganno.sql hg19 oreganno hg19.oreganno.bed3p
  hgsqldump hg18 oregannoLink | hgsql hg19
  hgsqldump hg18 oregannoAttr | hgsql hg19
  wc -l *.bed3p
  #  23130 hg18.oreganno.bed3p
  #  23118 hg19.oreganno.bed3p
  ## 99.9% lifted.
  
  #	2013-11-07 eliminate some names from Attr that are not in primary table
  #	as identified by joinerCheck:
  
  joinerCheck  -database=hg19 -identifier=oregId -keys all.joiner
  Checking keys on database hg19
   hg19.oregannoAttr.id - hits 93587 of 93635 (99.949%)
  Error: 48 of 93635 elements (0.051%) of hg19.oregannoAttr.id are not in key oreganno.id line 3550 of all.joiner
  Example miss: OREG0009903
   hg19.oregannoLink.id - hits 70670 of 70706 (99.949%)
  Error: 36 of 70706 elements (0.051%) of hg19.oregannoLink.id are not in key oreganno.id line 3551 of all.joiner
  Example miss: OREG0009903
  
  
      cd /hive/data/genomes/hg19/bed/hg18MassiveLift/oreganno
  
      hgsql -N -e "select id from oregannoAttr;" hg19 \
  	| sort -u > oregannoAttr.id.txt
      hgsql -N -e "select id from oreganno;" hg19 \
  	| sort -u > oreganno.id.txt
      comm -13 oreganno.id.txt oregannoAttr.id.txt
  # OREG0009903
  # OREG0009904
  # OREG0009905
  # OREG0018375
  # OREG0018376
  # OREG0018390
  # OREG0018391
  # OREG0018392
  # OREG0018393
  # OREG0018394
  # OREG0024661
  # OREG0026761
  
      comm -13 oreganno.id.txt oregannoAttr.id.txt | while read id
  do
      echo $id
      hgsql -e "delete from oregannoAttr where id=\"${id}\";" hg19
  done
  
      hgsql -N -e "select id from oregannoLink;" hg19 \
  	| sort -u > oregannoLink.id.txt
      comm -13 oreganno.id.txt oregannoLink.id.txt
  # OREG0009903
  # OREG0009904
  # OREG0009905
  # OREG0018375
  # OREG0018376
  # OREG0018390
  # OREG0018391
  # OREG0018392
  # OREG0018393
  # OREG0018394
  # OREG0024661
  # OREG0026761
  
      comm -13 oreganno.id.txt oregannoLink.id.txt | while read id
  do
      echo $id
      hgsql -e "delete from oregannoLink where id=\"${id}\";" hg19
  done
  
     # joinerCheck is now clean:
  joinerCheck  -database=hg19 -identifier=oregId -keys all.joiner
  Checking keys on database hg19
   hg19.oregannoAttr.id - hits 93587 of 93587 (100.000%) ok
   hg19.oregannoLink.id - hits 70670 of 70670 (100.000%) ok
  
  #############################################################################
  # NK NUC LAMINA LIFT TO HG19 (DONE 2010-12-31, Andy)
  
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift
  mkdir laminB1
  cd laminB1/
  hgsql hg18 --skip-column-names -e "select * from laminB1Lads" | cut -f2- > hg18.laminB1Lads.bed
  liftOver hg18.laminB1Lads.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.laminB1Lads.{bed,unmapped}
  hgLoadBed hg19 laminB1Lads hg19.laminB1Lads.bed
  wc -l *.bed
  # 1344 hg18.laminB1Lads.bed
  # 1302 hg19.laminB1Lads.bed
  ## 96.9% lifted... ok, not bad I guess.
  
  ln -s /hive/data/genomes/hg18/bed/nuclearLamina/LaminB1_080513.wig hg18.laminB1.customWigVarStep
  awk -f toBedGraph.awk hg18.laminB1.customWigVarStep > hg18.laminB1.bg
  liftOver -bedPlus=3 -tab hg18.laminB1.bg /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.laminB1.{bg,unmapped}
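## keep only items that are still exactly 60 bp after lifting, then drop any
## item starting within 60 bp of the previous one on the same chrom, so the
## fixed-span wiggle conversion never sees overlapping values: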
awk '{if ($3 - $2 == 60) print;}' hg19.laminB1.bg \
| sort -k1,1 -k2,2n \
| awk 'BEGIN{prev=-100;chrom="";FS="\t";OFS="\t";}
       { if ((chrom != $1) || ($2 - prev > 60)) {print; chrom = $1; prev = $2;}}' \
  > hg19.laminB1.span60.bg
  wigBedToStep hg19.laminB1.span60.bg hg19.laminB1.span60.wigVarStep
  ln -s hg19.laminB1.span60.wigVarStep laminB1.wig
  wigEncode laminB1.{wig,wiggle,wib}
  #Converted laminB1.wig, upper limit 5.68, lower limit -6.60
  ln -s `pwd`/laminB1.wib /gbdb/hg19/wib/laminB1.wib
  hgLoadWiggle hg19 laminB1 laminB1.wiggle
  wc -l hg18.laminB1.bg hg19.laminB1.span60.bg
  #  2909178 hg18.laminB1.bg
  #  2908692 hg19.laminB1.span60.bg
  ## In total, 99.98% of the datapoints lifted cleanly.
  
  #############################################################################
  # UCSF BRAIN METHYLATION (DONE 2010-12-31, Andy)
  
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift
  mkdir ucsfBrainMethyl
  cd ucsfBrainMethyl/
  
  ## 10 tables:
  ##
  ## ucsfChipSeqH3K4me3BrainCoverage    (bedGraph/bed3+)
  ## ucsfMreSeqBrainReads               (bed9)
  ## ucsfMreSeqBrainCpG                 (bedGraph/bed3+)
  ## ucsfMedipSeqBrainReads             (bed9)
  ## ucsfMedipSeqBrainCpG               (bedGraph/bed3+)
  ## ucsfMedipSeqBrainCoverage          (bedGraph/bed3+)
  ## ucsfRnaSeqBrainAllReads            (bed9)
  ## ucsfRnaSeqBrainAllCoverage         (bedGraph/bed3+)
  ## ucsfRnaSeqBrainSmartReads          (bed9)
  ## ucsfRnaSeqBrainSmartCoverage       (bedGraph/bed3+)
  
  ## Do the bed9s first:
  
  for tab in ucsfMreSeqBrainReads ucsfMedipSeqBrainReads ucsfRnaSeqBrainAllReads ucsfRnaSeqBrainSmartReads; do
     hgsql hg18 --skip-column-names -e "select * from "$tab | cut -f2- > hg18.${tab}.bed
     liftOver hg18.${tab}.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${tab}.{bed,unmapped}
     hgLoadBed hg19 $tab hg19.${tab}.bed
     wc -l *.${tab}.bed
  done
  
  #   10110644 hg18.ucsfMreSeqBrainReads.bed
  #   10109979 hg19.ucsfMreSeqBrainReads.bed
  ## 99.99% lifted
  #   44130143 hg18.ucsfMedipSeqBrainReads.bed
  #   44120612 hg19.ucsfMedipSeqBrainReads.bed
  ## 99.98% lifted
  #   63033692 hg18.ucsfRnaSeqBrainAllReads.bed
  #   63031432 hg19.ucsfRnaSeqBrainAllReads.bed
  #   26767318 hg18.ucsfRnaSeqBrainSmartReads.bed
  #   26766288 hg19.ucsfRnaSeqBrainSmartReads.bed
  ## getting old now, we get it... it lifts.
  
  for tab in ucsfChipSeqH3K4me3BrainCoverage ucsfMreSeqBrainCpG ucsfMedipSeqBrainCpG ucsfMedipSeqBrainCoverage ucsfRnaSeqBrainAllCoverage ucsfRnaSeqBrainSmartCoverage; do
     hgsql hg18 --skip-column-names -e "select * from "$tab | cut -f2- > hg18.${tab}.bg
     liftOver -bedPlus=3 hg18.${tab}.bg /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${tab}.{bg,unmapped}
     hgLoadBed -bedGraph=4 hg19 $tab hg19.${tab}.bg
     wc -l *.${tab}.bg
  done
  
  #  2598517 hg18.ucsfChipSeqH3K4me3BrainCoverage.bg
  #  2598085 hg19.ucsfChipSeqH3K4me3BrainCoverage.bg
  #  1165599 hg18.ucsfMreSeqBrainCpG.bg
  #  1165521 hg19.ucsfMreSeqBrainCpG.bg
  # 20862283 hg18.ucsfMedipSeqBrainCpG.bg
  # 20859033 hg19.ucsfMedipSeqBrainCpG.bg
  # 80960101 hg18.ucsfMedipSeqBrainCoverage.bg
  # 80943454 hg19.ucsfMedipSeqBrainCoverage.bg
  # 17019268 hg18.ucsfRnaSeqBrainAllCoverage.bg
  # 17017461 hg19.ucsfRnaSeqBrainAllCoverage.bg
  #  6141663 hg18.ucsfRnaSeqBrainSmartCoverage.bg
  #  6140890 hg19.ucsfRnaSeqBrainSmartCoverage.bg
  ## again in each case, almost all the data in the table lifts.
  
  for f in *; do gzip $f; echo $f zipped; done
  
  ## One more thing: remove overlapping items in lifted bedGraphs:
  
  for f in hg19*.bg.gz; do
      pre=${f%.bg.gz};
      tab=${pre#hg19.};
      echo $tab;
      gunzip -c $f | sort -k1,1 -k2,2n | bedGraphLegalize -report=${pre}.bad.txt stdin stdout | gzip -c > ${pre}.legal.bg.gz
      hgLoadBed -bedGraph=4 hg19 $tab ${pre}.legal.bg.gz
  done
  
  #############################################################################
  # SNP ARRAYS LIFT TO HG19 (DONE 2010-12-31, Andy)
  
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift
  mkdir snpArray
  cd snpArray/
  
  ## 12 arrays:
  ##
  ## snpArrayAffy6                   (bed6+, 8 fields)
  ## snpArrayAffy6SV                 (bed6)
  ## snpArrayAffy5                   (bed6+, 8 fields)
  ## snpArrayAffy250Nsp              (bed6+, 8 fields)
  ## snpArrayAffy250Sty              (bed6+, 8 fields)
  ## snpArrayIllumina650             (bed6+, 7 fields)
  ## snpArrayIllumina550             (bed6+, 7 fields)
  ## snpArrayIllumina300             (bed6+, 7 fields)
  ## snpArrayIllumina1M              (bed6+, 7 fields)
  ## snpArrayIlluminaHumanCytoSNP_12 (bed6+, 7 fields)
  ## snpArrayIlluminaHuman660W_Quad  (bed6+, 7 fields)
  ## snpArrayIlluminaHumanOmni1_Quad (bed6+, 7 fields)
  
  ## Get the bed6 one out of the way first:
  
  hgsql hg18 --skip-column-names -e "select * from snpArrayAffy6SV" | cut -f2- > hg18.snpArrayAffy6SV.bed
  liftOver hg18.snpArrayAffy6SV.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.snpArrayAffy6SV.{bed,unmapped}
  hgLoadBed hg19 snpArrayAffy6SV hg19.snpArrayAffy6SV.bed
  wc -l *.bed
  #  945805 hg18.snpArrayAffy6SV.bed
  #  945615 hg19.snpArrayAffy6SV.bed
  
  ## The rest each may or may not have their own module in lib
  ## For simplicity sake, I'll just dump the CREATEs straight from
  ## hg18 into their own .sql file.
  
  for table in snpArrayAffy6 snpArrayAffy5 snpArrayAffy250Nsp snpArrayAffy250Sty snpArrayIllumina650 snpArrayIllumina550 snpArrayIllumina300 snpArrayIllumina1M snpArrayIlluminaHumanCytoSNP_12 snpArrayIlluminaHuman660W_Quad snpArrayIlluminaHumanOmni1_Quad; do
     hgsql hg18 --skip-column-names -e "select * from "$table | cut -f2- > hg18.${table}.bed6p
     hgsqldump --no-data --compact hg18 $table | sed '/^SET/d;s/ENGINE.*//' > hg18.${table}.sql
     liftOver -bedPlus=6 hg18.${table}.bed6p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${table}.{bed6p,unmapped}
     hgLoadBed -sqlTable=hg18.${table}.sql -renameSqlTable hg19 $table hg19.${table}.bed6p
  done
  for table in snpArrayAffy6 snpArrayAffy5 snpArrayAffy250Nsp snpArrayAffy250Sty snpArrayIllumina650 snpArrayIllumina550 snpArrayIllumina300 snpArrayIllumina1M snpArrayIlluminaHumanCytoSNP_12 snpArrayIlluminaHuman660W_Quad snpArrayIlluminaHumanOmni1_Quad; do
     hg18=`wc -l hg18.${table}.bed6p | awk '{print $1}'`
     hg19=`wc -l hg19.${table}.bed6p | awk '{print $1}'`
     perc=`echo ${hg19}"/"${hg18}" * 100" | R --vanilla | grep "\[1\]" | awk '{print $2}'`
     printf "%s: %d/%d items lifted (%.3f%%)\n" $table $hg19 $hg18 $perc
  done
  # snpArrayAffy6: 909297/909508 items lifted (99.977%)
  # snpArrayAffy5: 440638/440734 items lifted (99.978%)
  # snpArrayAffy250Nsp: 257159/257213 items lifted (99.979%)
  # snpArrayAffy250Sty: 233887/233941 items lifted (99.977%)
  # snpArrayIllumina650: 660388/660557 items lifted (99.974%)
  # snpArrayIllumina550: 560972/561122 items lifted (99.973%)
  # snpArrayIllumina300: 318046/318117 items lifted (99.978%)
  # snpArrayIllumina1M: 1217520/1219961 items lifted (99.800%)
  # snpArrayIlluminaHumanCytoSNP_12: 302127/302402 items lifted (99.909%)
  # snpArrayIlluminaHuman660W_Quad: 664655/665901 items lifted (99.813%)
  # snpArrayIlluminaHumanOmni1_Quad: 1169872/1175447 items lifted (99.526%)
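## FWIW the same percentage can be computed without spawning R, e.g. with this
## illustrative one-liner:
##   perc=`awk -v a=$hg19 -v b=$hg18 'BEGIN{printf "%.3f", 100*a/b}'`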
  
  ## Now there's a few "Raw" tables to lift. Convert them to bed3+ first:
  
for table in `echo show tables like "'snpArray%Raw'" | hgsql hg18 | tail -n +2`; do
    hgsql hg18 --skip-column-names -e "select * from "$table | awk 'BEGIN{FS="\t";OFS="\t"}{print "chr"$6, $7 - 1, $7, $1, $2, $3, $4, $5}' > hg18.${table}.bed3p
    liftOver -bedPlus=3 -tab hg18.${table}.bed3p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${table}.{bed3p,unmapped}
    awk 'BEGIN{FS="\t";OFS="\t"}{print $4, $5, $6, $7, $8, substr($1, 4), $3;}' hg19.${table}.bed3p > hg19.${table}.txt
    hgsqldump --no-data --compact hg18 $table | sed '/^SET/d;s/ENGINE.*//' > hg18.${table}.sql
    hgLoadSqlTab hg19 $table hg18.${table}.sql hg19.${table}.txt
  done
  
  ############################################################################
  # HAPMAP SNPS AND HAPMAP LD PHASED LIFTS FROM HG18 (Andy)
  
  mkdir /hive/data/genomes/hg19/bed/hg18MassiveLift/hapmapSnps
  cd /hive/data/genomes/hg19/bed/hg18MassiveLift/hapmapSnps
  
  ## All the tables in the trackDb entry seem to be bed 6 +
  for table in `grep -B1 "parent hapmapSnps" ~/kent/src/hg/makeDb/trackDb/human/trackDb.ra | grep track | sed 's/.*track\ //'`; do
     echo $table >> tables.txt;
  done
  for table in `cat tables.txt`; do
     hgsql hg18 --skip-column-names -e "select * from "$table | cut -f2- > hg18.${table}.bed6p;
     liftOver -bedPlus=6 -tab hg18.${table}.bed6p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${table}.{bed6p,unmapped};
     hgsqldump --no-data --compact hg18 $table | sed '/^SET/d;s/ENGINE.*//' > ${table}.sql
     hgLoadBed -sqlTable=${table}.sql -tab hg19 $table hg19.${table}.bed6p;
     wc -l hg1{8,9}.${table}.bed6p >> lifts.txt
  done
  ## Also need hapmapLd% and hapmapAllelesSummary
  for table in `hgsql hg18 --skip-column-names -e "show tables like 'hapmapLd%'"` hapmapAllelesSummary; do
     hgsql hg18 --skip-column-names -e "select * from "$table | cut -f2- > hg18.${table}.bed6p;
     liftOver -bedPlus=6 -tab hg18.${table}.bed6p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${table}.{bed6p,unmapped};
     hgsqldump --no-data --compact hg18 $table | sed '/^SET/d;s/ENGINE.*//' > ${table}.sql
     hgLoadBed -sqlTable=${table}.sql -tab hg19 $table hg19.${table}.bed6p;
     wc -l hg1{8,9}.${table}.bed6p >> lifts.txt
  done
  ## Also need hapmapPhaseIIISummary
  hgsql hg18 --skip-column-names -e "select * from hapmapPhaseIIISummary" | cut -f2- > hg18.hapmapPhaseIIISummary.bed5p
  liftOver -bedPlus=5 hg18.hapmapPhaseIIISummary.bed5p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.hapmapPhaseIIISummary.{bed5p,unmapped}
  hgsqldump --no-data --compact hg18 hapmapPhaseIIISummary | sed '/^SET/d;s/ENGINE.*//' > hapmapPhaseIIISummary.sql
  hgLoadBed -sqlTable=hapmapPhaseIIISummary.sql hg19 hapmapPhaseIIISummary hg19.hapmapPhaseIIISummary.bed5p
  
  
  ############################################################################
  # INDEL-BASED CONSERVATION TRACK liftOver to hg19 (DONE - 2010-12-21 - Chin )
  # Data from the Gerton Lunter (gerton.lunter at anat.ox.ac.uk), MRC
  # Functional Genetics Unit, University of Oxford, United Kingdom.
  # Data is from the paper:
  # Lunter G, Ponting CP and Hein J Genome-wide identification of human
  # functional DNA using a neutral indel model. PLoS Comput Biol. 2006
  # Jan;2(1):e5.
      mkdir -p /hive/data/genomes/hg19/bed/consIndels/data
      cd /hive/data/genomes/hg19/bed/consIndels
      cp /hive/data/genomes/hg18/bed/consIndels/README.indels .
      cp /hive/data/genomes/hg18/bed/consIndels/igs-hg18mm8cf2.zip .
      # 38 Mb zip file in GFF format. This contains data for hg18
      # comparing it to mm8 and cf2 (canFam2).
      unzip igs-hg18mm8cf2.zip
      mv *.gff ./data/
      cd /hive/data/genomes/hg19/bed/consIndels/data
  
      for f in *.gff
      do
         echo processing $f
         grep -v "track" $f >> ../allNoHeader.tmp
      done
  
      cd /hive/data/genomes/hg19/bed/consIndels/	
      cat allNoHeader.tmp | \
      awk '{print $1,$4,$5,$6,$9,$10,$11}' > consIndels.bed7p
  
      liftOver -bedPlus=3 consIndels.bed7p \
        /hive/data/genomes/hg18/bed/liftOver/hg18ToHg19.over.chain.gz \
        consIndelsHg19Mm8CanFam2.bed7p unMapped
  
      wc -l *.bed7p
      #  2603017 consIndels.bed7p
      #  2602701 consIndelsHg19Mm8CanFam2.bed7p
      grep -i split unMapped | wc -l
      # 41
      grep -i partially unMapped | wc -l
      # 66
      rm allNoHeader.tmp
      rm consIndels.bed7p
  
      # strip off the end of the name e.g. IGS0001.1:p=.74; FDR 0.27
      # so that the name displayed is short - IGS0001.1. The score field
      # is used to determine colouring and this is calculated from FDR
      cd /cluster/data/hg19/bed/consIndels
      cat  consIndelsHg19Mm8CanFam2.bed7p |
        awk '{print $1,$2, $3, $5, $4}' |
        sed -e 's/:p[=<]\.[0-9][0-9]*;//' \
         > consIndelsHg19Mm8CanFam2.bed
  
      # load data
      cd /hive/data/genomes/hg19/bed/consIndels
      hgLoadBed hg19 consIndelsHgMmCanFam consIndelsHg19Mm8CanFam2.bed
      # Reading consIndelsHg19Mm8CanFam2.bed
      # Loaded 2602701 elements of size 5
      # Sorted
      # Creating table definition for consIndelsHgMmCanFam
      # Saving bed.tab
      # Loading hg19
  
    # Get the IDs, posterior probabilities (p) for the segment being neutral,
    # and the FDR from the original GFFs for a separate table. Some items
    # have p<.001. We cannot do Table Browser queries restricting p to <, =,
    # or > a specified value unless all values are floats. Contacted the data
    # contributor, Gerton Lunter, and he said it would be ok to change all
    # p<.001 to p=0.0005.
      cd /hive/data/genomes/hg19/bed/consIndels/
      cat consIndelsHg19Mm8CanFam2.bed7p \
        | awk 'BEGIN {FS="\t"} {print $5, $6, $7}'  \
        | sed -e 's/:/\t/' \
        | sed -e 's/p=\./0\./' | sed -e 's/p<\.001/0\.0005/' \
        | sed -e 's/;\sFDR/\t/' > consIndelsHg19Mm8CanFam2Conf.txt
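    # e.g. the name token "IGS0001.1:p=.74; FDR 0.27" becomes the tab-separated
    # itemConf row "IGS0001.1  0.74  0.27" (id, probability, FDR).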
  
      # there are no GFF files for the haplotype chroms
      # Reuse $HOME/kent/src/hg/lib/itemConf.* for the table of identifier,
      # posterior probability and false discovery rate (FDR).
  
      cd /hive/data/genomes/hg19/bed/consIndels
      hgLoadSqlTab hg19 consIndelsHgMmCanFamConf \
           $HOME/kent/src/hg/lib/itemConf.sql \
           consIndelsHg19Mm8CanFam2Conf.txt
  
    # check that all items are in this table.
      hgsql -N -e 'select distinct(name) from consIndelsHgMmCanFam;' hg19 \
           | sort > consIndels.names.sort
      hgsql -N -e 'select distinct(id) from consIndelsHgMmCanFamConf;' hg19 \
           | sort > consIndels.idsfromConf.sort
      wc -l *.sort
      # 2602701 consIndels.idsfromConf.sort
      # 2602701 consIndels.names.sort
  
      comm -12 consIndels.names.sort consIndels.idsfromConf.sort | wc -l
      # 2602701
      # so all element IDs are in both tables.
      # cleanup
      rm ./data/*.bak *.sort
  
      # add trackDb/human/hg19/trackDb.ra entry and add description that
      # was written by the data contributor. Add code to hgc.c to display
      # the posterior probability and the FDR on the details page for
      # track elements. Gerton Lunter provided a description for the data
      # on 2007-09-12.
      # Add hg19 to the "identifier consIndelsId" in all.joiner.
  
  
  ############################################################################
  # POLYA_DB TRACK (DONE 2011-01-04 - Chin)
  #
  
      # Data files and program:
      # "Bin Tian" <btian@umdnj.edu> provided the following two
      # data files at /hive/data/outside/polyA:
      #   hg18.polyadb.bed  hg19.polyadb.bed
      # Andy found the SVM program he used before, and in here:
      #  /hive/data/genomes/hg18/bed/polyaDB/polya_svm_2.2.tar.gz
      # Copy it to /hive/data/outside/polyA. Unzip the program to
      # polya_svm_2.2
  
      mkdir /hive/data/genomes/hg19/bed/polyaDB
      cd /hive/data/genomes/hg19/bed/polyaDB
      cp /hive/data/outside/polyA/hg19.polyadb.bed .
      hgLoadBed hg19 polyaDb hg19.polyadb.bed
      # add trackDb entry in human/hg19
      # polyA.html is at top human level
  
    # since the provided hg19.polyadb.bed was lifted over from hg18.polyadb.bed,
    # it is safe to lift the polyaPredict table from hg18 to hg19 without
    # re-running the SVM.
      hgsql -N -e "select * from polyaPredict;" hg18 | \
        cut -f2-9 > hg18.polyaPredict.bed
      wc -l hg18.polyaPredict.bed
      # 52182 hg18.polyaPredict.bed
  
      liftOver -bedPlus=8 hg18.polyaPredict.bed \
        /hive/data/genomes/hg18/bed/liftOver/hg18ToHg19.over.chain.gz \
        hg19.polyaPredict.bed unMapped.polyaPrdict
      wc -l hg19.polyaPredict.bed
      # 52169 hg19.polyaPredict.bed
      hgLoadBed hg19 polyaPredict hg19.polyaPredict.bed
      # Reading hg19.polyaPredict.bed
      # Loaded 52169 elements of size 8
      # Sorted
      # Creating table definition for polyaPredict
      # Saving bed.tab
      # Loading hg19
  
  
  #############################################################################
  # FILTER SNP132 (DONE 3/8/11 angie)
  # 4/8/11: changing table names to be consistent with shortLabel:
  #         snp132Patient -> snp132Flagged
  #         snp132NonUnique -> snp132Mult
  # 3/8/11: redone after snp132 with tweaked weights in PARs, see above
  # Previously done 1/24/11 after snp132
      # Redmine: Track #1684 (SNPs 132 (dbSNP))
      # Make several tracks that are filtered subsets of snp132:
      # First, filter out the multiply-aligned and/or weight >1 SNPs [any other exceptions?]
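    # Classification applied below: Mult = weight > 1 or a MultipleAlignments
    # exception; else Common = at least two alleles observed, total 2N >= 4,
    # and max allele frequency <= 0.99; else Flagged = has a clinically-assoc
    # exception; Misc = everything else.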
      cd /hive/data/outside/dbSNP/132/human
      zcat snp132.parWeightTweak.bed.gz \
      | perl -we \
        'open($mult, "| gzip -c > snp132Mult.bed.gz") || die; \
         open($common,    "| gzip -c > snp132Common.bed.gz") || die; \
         open($flagged,   "| gzip -c > snp132Flagged.bed.gz") || die; \
         open($misc,      "| gzip -c > snp132Misc.bed.gz") || die; \
         while (<>) { \
           @w = split("\t"); \
           if ($w[16] > 1 || $w[17] =~ /MultipleAlignments/) { \
             print $mult $_; \
           } else { \
             my ($alleleFreqCount, $nStr, $freqStr) = ($w[20], $w[22], $w[23]); \
             my @alNs = split(",", $nStr);      die unless scalar(@alNs) == $alleleFreqCount; \
             my @freqs = split(",", $freqStr);  die unless scalar(@freqs) == $alleleFreqCount; \
             my ($total2N, $maxAlleleFreq) = (0, 0); \
             for (my $i = 0;  $i < $alleleFreqCount;  $i++) { \
               $total2N += $alNs[$i]; \
               $maxAlleleFreq = $freqs[$i] if ($freqs[$i] > $maxAlleleFreq); \
             } \
             if ($alleleFreqCount >= 2 && $total2N >= 4 && $maxAlleleFreq <= 0.99) { \
               print $common $_; \
             } elsif($w[24] =~ /clinically-assoc/)  { \
               print $flagged $_; \
             } else { \
               print $misc $_; \
             } \
           } \
         } \
         close($mult);  close($common); close($flagged);  close($misc);'
      zcat snp132Mult.bed.gz | wc -l
  #3568988
      zcat snp132Common.bed.gz | wc -l
  #14024295
      zcat snp132Flagged.bed.gz | wc -l
  #18084
      zcat snp132Misc.bed.gz | wc -l
  #15414754
  
      # Load tables
      foreach subset (Mult Common Flagged Misc)
        hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \
          hg19 snp132$subset -sqlTable=snp132.sql snp132$subset.bed.gz
      end
  
  
  #############################################################################
  # BUILD DECIPHER WITH NEW HG19 RELEASE (Done Fan, 2/8/11)
  
  # The decipher track is built by an automated process.  The following two scripts:
  
  #   kent/src/utils/decipher/checkDecipher.sh
  #   kent/src/utils/decipher/buildDecipher
  
# are used to automatically detect updates on the DECIPHER ftp site and then
# download and build the decipher track.

# checkDecipher.sh is invoked by a cron job; it calls buildDecipher to build
# the decipher track after the new data are downloaded.
  
  #############################################################################
  # GRC Incident database (DONE - 2011-02-10 - Hiram)
      # used to be NCBI Incident - changed to GRC Incident 2012-04-12
      # this procedure is run as a cron job in Hiram's account:
  
      #	43 09 * * * /hive/data/outside/grc/incidentDb/runUpdate.sh makeItSo
  
    # using the two scripts there: runUpdate.sh and update.sh
      # which are checked into the source tree as files:
      #	src/hg/utils/automation/grcIncidentUpdate.sh
      #	src/hg/utils/automation/grcRunIncidentUpdate.sh
  
    # they fetch the XML files from NCBI, convert them to SQL text
    # files, construct a bigBed file, and push it to genomewiki if
    # it is an update from the previous version
  
    # the table in the database is: grcIncidentDb
    # which contains the URL of the bb file, a single row:
      # http://genomewiki.ucsc.edu/images/6/67/Hg19.grcIncidentDb.bb
  
  #############################################################################
  # UNIGENE/SAGE TRACK (RE-BUILT - 2011-02-22 Fan)
  
  # Create the uniGene alignments
  
      # Download of the latest UniGene version is now automated by a
      # cron job -- see /cluster/home/angie/crontab ,
      # /cluster/home/angie/unigeneVers/unigene.csh .
  
      ssh hgwdev
      mkdir -p /hive/data/genomes/hg19/bed/uniGene/022211
      cd /hive/data/genomes/hg19/bed/uniGene/022211
  
      set Version = 229
  
      zcat /hive/data/outside/uniGene/uniGene.$Version/Hs.seq.uniq.gz|\
      sed -e "s#>.*/ug=#>#; s# /len.*##;" > Hs.seq.uniq.simpleHeader.fa
  
      ssh swarm
      set Version = 229
      mkdir -p /hive/data/genomes/hg19/bed/uniGene/022211/run.blat
      cd /hive/data/genomes/hg19/bed/uniGene/022211/run.blat
  
      ls -1 /hive/data/genomes/hg19/nib/*.nib > genome.lst
      ls -1S \
      /hive/data/genomes/hg19/bed/uniGene/022211/Hs.seq.uniq.simpleHeader.fa \
        > uniGene.lst
  
      cat << '_EOF_' > template.sub
  #LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=95 -ooc=/hive/data/genomes/hg19/11.ooc $(path1) $(path2)  {check out line+ psl/$(root1)_$(root2).psl}
  #ENDLOOP
  '_EOF_'
  
      gensub2 genome.lst uniGene.lst template.sub para.spec
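    # gensub2 expands template.sub once for each (genome.lst x uniGene.lst)
    # pair -- here 93 nib files x 1 query fasta = 93 parasol jobs in para.spec.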
      para create para.spec
      mkdir psl
      para try
      para check
      para push
  
  #Completed: 93 of 93 jobs
  #CPU time in finished jobs:      67404s    1123.41m    18.72h    0.78d  0.002 y
  #IO & Wait Time:                  5838s      97.29m     1.62h    0.07d  0.000 y
  #Average job time:                 788s      13.13m     0.22h    0.01d
  #Longest finished job:            5228s      87.13m     1.45h    0.06d
  #Submission to last job:          5320s      88.67m     1.48h    0.06d
  #Estimated complete:                 0s       0.00m     0.00h    0.00d
  
      pslSort dirs raw.psl tmp psl >& pslSort.log
      cat raw.psl|\
      pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
        stdin hg19.uniGene.pslReps.psl /dev/null
  
      gzip raw.psl
  
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/uniGene/022211/run.blat
  
      hgLoadPsl -table=uniGene_3 hg19 hg19.uniGene.pslReps.psl
  
      mkdir -p /gbdb/hg19/uniGene
      cd /gbdb/hg19/uniGene
  
      rm Hs.seq.uniq.simpleHeader.fa
      ln -s \
      /hive/data/genomes/hg19/bed/uniGene/022211/Hs.seq.uniq.simpleHeader.fa \
      Hs.seq.uniq.simpleHeader.fa
  
  # load the sequence
  
      hgLoadSeq -replace hg19 /gbdb/hg19/uniGene/Hs.seq.uniq.simpleHeader.fa
  
  ##############################################################################
  # Chimp Lastz run (DONE - 2011-02-23 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzPanTro3.2011-02-22
      cd /hive/data/genomes/hg19/bed/lastzPanTro3.2011-02-22
      cat << '_EOF_' > DEF
  # human vs chimp
  BLASTZ=lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  BLASTZ_O=600
  BLASTZ_E=150
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: Chimp PanTro3
  SEQ2_DIR=/scratch/data/panTro3/panTro3.2bit
  SEQ2_LEN=/scratch/data/panTro3/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzPanTro3.2011-02-22
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      screen # use screen to manage this long-running job
      time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-syntenicNet > do.log 2>&1 &
      # problems with memk, after recovery, continue chainMerge:
      time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-continue=chainMerge -syntenicNet > chainMerge.log 2>&1 &
      #	real    103m34.088s
  
      cat fb.hg19.chainPanTro3Link.txt
      #	2760939621 bases of 2897316137 (95.293%) in intersection
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 panTro3 > rbest.log 2>&1
      # real    50m49.740s
  
  
  #	running the swap
      mkdir /hive/data/genomes/panTro3/bed/blastz.hg19.swap
      cd /hive/data/genomes/panTro3/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	-swap /hive/data/genomes/hg19/bed/lastzPanTro3.2011-02-22/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-syntenicNet > swap.log 2>&1 &
  #	real    86m49.706s
      cat fb.panTro3.chainHg19Link.txt
      #	2772816267 bases of 2900529764 (95.597%) in intersection
  
  ############################################################################
  # MAKE tfbsConsSites and tfbsConsFactors for TFBS conserved track (DONE weirauch braney 03/07/11)
  # Questions?  braney at soe.ucsc.edu
  
  ssh hgwdev
  mkdir /cluster/data/hg19/bed/tfbsCons
  cd /cluster/data/hg19/bed/tfbsCons
  
  # Define all parameters in 'PARAMS.txt'
  # Define all chromosomes in 'CHROMS.txt'
# Get tfbsConsUtils.tar.gz (Perl scripts) from Matt Weirauch (weirauch@soe.ucsc.edu)
  set tarfile=/cluster/data/hg19/bed/tfbsCons/tfbsConsUtils.tar.gz
  tar zxf $tarfile
  
  nice ./getRefseqStats.pl &
  nice ./getBatchQueries.pl &
  
  ssh swarm
  mkdir /cluster/bluearc/braney/tfloc
  mkdir /hive/users/weirauch/tfloc_hg18
  # Copy ./tmp/ctfbs_batch_list.txt to this dir
  # Copy ./scripts/doit to this dir
  para create ctfbs_batch_list.txt
  para try
  para push
  
  # When the run is done (within a day or so), the results will be in individual dirs, one for each chromosome.
  
  nice ./getBedFile.pl &
  
  hgLoadBed -noSort hg19  tfbsConsSites -sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql tfbsConsSites.bed -tab
  hgLoadSqlTab hg19 tfbsConsFactors $HOME/kent/src/hg/lib/tfbsConsFactors.sql tfbsConsFactors.bed
  #########################################################################
  # BUILD THE TRACK OF IKMC MAPPED TO HUMAN GENOME. (DONE, Fan, 3/15/11)
  
      ssh hgwdev
      mkdir -p /hive/data/genomes/hg19/bed/ikmc/110314
      cd /hive/data/genomes/hg19/bed/ikmc/110314
  
# receive 20110301_human.gff.gz from Carol Bult [Carol.Bult@jax.org] and place it under this subdirectory.
      gzip -d 20110301_human.gff.gz
  
# build hgIkmc table from raw data file 20110301_human.gff (substitute some
# troublesome chroms in the raw data file and remove some records mapped to 'chrUn')
  
      cat 20110301_human.gff |sort -u \
      |sed -e 's/chr9|NT_113911.1/chr9/' \
      |grep -v 'chrUn' \
      | perl -we \
        'while (<>) { \
           s/\r?\n$//; \
           ($chr, undef, $ctr, $s, $e, undef, undef, undef, $id, $col, $n) = split("\t"); \
           if ($s eq "") { warn "$_\n";  s/^.*//; next; } # Some lines have no coords. \
           $col = ($col eq "Yellow") ? "255,215,0" : \
                  ($col eq "Green")  ? "0,240,0" : \
                  ($col eq "Blue")   ? "0,0,200" : "0,0,0"; \
           $s--; \
           $id =~ s/^MGI:\d+; (\w+); .*/$1/ || die "Cant parse id \"$id\""; \
           my $geneId = join("|", $chr, $ctr, "${n}_$id"); \
           push @{$geneBlks{$geneId}}, [$s, $e, $col] unless $e <= 0; \
        } \
        warn "Got " . scalar(keys %geneBlks) . " genes.\n"; \
        foreach my $geneId (keys %geneBlks) { \
           my @blks = @{$geneBlks{$geneId}}; \
           my ($chrom, $center, $name) = split(/\|/, $geneId); \
           my $blkCount = @blks; \
           @blks = sort {$a->[0] <=> $b->[0]} @blks; \
           my $chromStart = $blks[0]->[0]; \
           my $chromEnd = $blks[$blkCount-1]->[1]; \
           my $color = $blks[0]->[2]; \
           my $blkStarts = ""; \
           my $blkSizes = ""; \
           foreach my $blk (@blks) { \
             my ($start, $end, $col) = @{$blk}; \
             $blkStarts .= ($start - $chromStart) . ","; \
             $blkSizes  .= ($end - $start) . ","; \
             if ($col ne $color) { die "Blocks of $geneId of colors $color and $col"; } \
           } \
          print join("\t", $chrom, $chromStart, $chromEnd, $name, 0, ".", $chromStart, \
                     $chromStart, $color, $blkCount, $blkSizes, $blkStarts) . "\n"; \
        }' \
      | sort -k 1,1 -k 2n,2n > hgIkmc.bed
  # Got 41936 genes.
  
  # build hgIkmcExtra table
  
      cat 20110301_human.gff \
      | grep -v 'chrUn' \
      | perl -wpe 's/\r?\n$//; @w = split("\t"); \
        if ($w[3] eq "") { s/^.*//; next; } # Some lines have no coords. \
        if ($w[4] <= 0) { s/^.*//; next; } # A few lines have end=0. \
        $w[8] =~ m/^(MGI:\d+); (\w+); (\w.*)/ || die; \
        ($mgi, $designId, $status) = ($1, $2, $3); \
        $_ = "$w[10]_$designId\t$mgi,$designId,$w[2],$status\n";' \
      | sort -u > hgIkmcExtra.tab
      wc -l hgIkmcExtra.tab
  # 41936 hgIkmcExtra.tab
  
  # load tables
      hgLoadBed hg19 hgIkmc hgIkmc.bed
      checkTableCoords -verbose=2 hg19 hgIkmc
  
      hgLoadSqlTab hg19 hgIkmcExtra $HOME/kent/src/hg/lib/genericAlias.sql hgIkmcExtra.tab
  
  #########################################################################
  # LOAD ACEMBLY (DONE 2011-03-14 - Chin)
      mkdir /hive/data/outside/acembly
      cd /hive/data/outside/acembly
      wget --timestamping \
  ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_37_Aug10.human.genes/AceView.ncbi_37.genes_gff.gff.gz
      wget --timestamping \
  ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_37_Aug10.human.genes/AceView.ncbi_37.good_proteins_peptide.fasta.gz
  
      mkdir /cluster/data/hg19/bed/acembly
      cd /cluster/data/hg19/bed/acembly
      cp -p /hive/data/outside/acembly/AceView.ncbi_37.genes_gff.gff.gz .
      cp -p /hive/data/outside/acembly/AceView.ncbi_37.good_proteins_peptide.fasta.gz .
      gzip -d AceView.ncbi_37.genes_gff.gff.gz
      gzip -d AceView.ncbi_37.good_proteins_peptide.fasta.gz
  
    # If the result of this command is > 0, then some lines have
    # end < start and need to be fixed:
      awk '$5 < $4 {print;}' AceView.ncbi_37.genes_gff.gff | wc -l
      # 0
  
      # Filter out empty lines, lines where the product_id has a stray
      # newline before it, and $chr|Hs# IDs that don't appear liftable.
      # (Note: the new gff does not have these two cases anymore.)
      # Add 'chr' prefix to chrom number at field 1
      egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' AceView.ncbi_37.genes_gff.gff \
      | sed -e 's/^/chr/;' \
        > acembly.gff
      # fixed the chrmito prefix to chrM
      mv acembly.gff acembly.tmp
      cat acembly.tmp | sed -e 's/^chrmito/chrM/;' > acembly.gff
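
    # Equivalently, both substitutions could be done in a single sed pass
    # (a sketch, same effect as the two steps above):
    #   egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' AceView.ncbi_37.genes_gff.gff \
    #     | sed -e 's/^/chr/; s/^chrmito/chrM/' > acembly.gff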
  
  
      # Extract annotation classes from original gff:
      cat AceView.ncbi_37.genes_gff.gff | awk  '{print $12}' | sort | uniq
      # cDNA_supported;
    # Note: the version 37 gff has only one gene type - cDNA_supported;
    # per Danielle and Jean's request, use pink to display them.
    # The replacement below therefore becomes a no-op.
      egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' AceView.ncbi_37.genes_gff.gff \
      | perl -wpe 's/^.*Gene_type (\w+); transcript_id (\S+);.*/$2\t$1/; \
                   s/Cloud$/cloud/ || s/Spliced_gene$/spliced_gene/ || \
                     die "Unrecognized class/Gene_type:\n$_\n";' \
      | sort -u \
        > acemblyClass.tab
    # A "Useless use of single ref constructor in void context at -e line 2"
    # warning was issued; however, it is harmless.
  
  
    # Some gff transcript_id's end in -unspliced (no intron), but the
    # corresponding protein fasta IDs do not have that suffix.  We need
    # them to match, so add the suffix where necessary.
      # Use perl to make a perl script to add -unspliced to protein IDs
      # where necessary:
      grep unspliced acemblyClass.tab | wc -l
      # 54180
      egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$'  AceView.ncbi_37.genes_gff.gff \
      | perl -wpe 's@^.*transcript_id (\S+)-unspliced;.*$@\$unsp{"$1"} = 1;@ || s/^.*\n$//;' \
      | sort -u \
        > addUnspliced.pl
      wc -l addUnspliced.pl
      # 54180 addUnspliced.pl
      cat >> addUnspliced.pl <<'_EOF_'
  while (<>) {
    if (/^>(\S+)$/) {
      if ($unsp{$1}) {
        s/^>(\S+)/>$1-unspliced/;
      }
    }
    print;
  }
  '_EOF_'
      # << emacs
  
    # Add -unspliced suffix to protein IDs where necessary, and pare
    # down proteins to just the ones that we have transcripts for:
      awk '{print $1;}' acemblyClass.tab   > transcriptNames.txt
      perl addUnspliced.pl AceView.ncbi_37.good_proteins_peptide.fasta \
      | faSomeRecords stdin transcriptNames.txt acemblyPep.fa
      grep unspliced acemblyPep.fa | wc -l
      # 31956
    # Danielle Thierry-Mieg explained that noncoding genes are included,
    # so the number of proteins can be smaller than the number of
    # transcripts.
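
    # Sanity check (a sketch; pepIds.txt is a scratch file introduced here):
    # count transcripts that lack a protein record -- expect a nonzero
    # count, since noncoding transcripts have no protein.
    grep '^>' acemblyPep.fa | sed -e 's/^>//' | sort > pepIds.txt
    sort transcriptNames.txt | comm -23 - pepIds.txt | wc -l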
  
      # Load tables
      ssh hgwdev
      cd /cluster/data/hg19/bed/acembly
      ldHgGene -gtf hg19 acembly acembly.gff
      # Reading acembly.gff
      # Read 259440 transcripts in 3870073 lines in 1 files
      #   259440 groups 25 seqs 1 sources 5 feature types
      # 259440 gene predictions
  
      hgLoadSqlTab hg19 acemblyClass ~/kent/src/hg/lib/acemblyClass.sql \
        acemblyClass.tab
      # Scanning through 1 files
      hgPepPred hg19 generic acemblyPep acemblyPep.fa
      # rm acemblyPep.tab
      runJoiner.csh hg19 acembly
      # found identifiers:
      # acemblyName
      # Checking keys on database hg19
      #  hg19.acemblyPep.name - hits 187692 of 187692 ok
      #  hg19.acemblyClass.name - hits 259440 of 259440 ok
      # running -times flag
  
  
  #############################################################################
  # Affy Exon track (DONE 2011-03-08 - Melissa Cline)
  
  # scripts/splitAffyExonBeds.py (below)
  #!/usr/bin/env python
  
  import fileinput
  import re
  import string
  
  filehandle = None
  for line in fileinput.input():
      if re.search("track\tname", line):
          if filehandle != None:
              filehandle.close()
          filename = fileinput.filename()
          filename = string.replace(filename, "hg19-bed", "hg19-split-bed")
          if re.search("gene level exon", line):
              filename = string.replace(filename, ".bed", ".exon.bed")
          elif re.search("gene probeset", line):
              filename = string.replace(filename, ".bed", ".geneProbeset.bed")
          elif re.search("exon probeset", line):
              filename = string.replace(filename, ".bed", ".probeset.bed")
          elif re.search("probe", line):
              filename = string.replace(filename, ".bed", ".probe.bed")
          filehandle = open(filename, 'w')
      filehandle.write(line)
  
  
  # scripts/mergeAcrossChromosomes.bash (below)
  #!/usr/bin/env bash
  
  
  PATHNAME="/hive/users/cline/Affy/"
  tail -n +2 $PATHNAME/Affy-HuEx-hg19-split-bed/HuEx-1_0-st-v2.hg19.*.$1.bed \
    |grep -v "==>" > $PATHNAME/mergedBeds/AffyHuEx.$1.bed
  
  
  # scripts/splitByProbesetType.py (below)
  #!/usr/bin/env python
  from optparse import OptionParser
  import re
  
  parser = OptionParser()
  parser.add_option("-s", "--supplementalData", dest="supplementalData",
                    default="supportingAnnotations/HuEx-1_0-st-v2.na31.hg19.probeset.csv")
  parser.add_option("-b", "--bedData", dest="bedData", default="noOverlaps/AffyHuEx.probeset.overlapsMerged.bed")
  (parameters, args) = parser.parse_args()
  coreProbesets = dict()
  extendedProbesets = dict()
  fullProbesets = dict()
  ambiguousProbesets = dict()
  freeProbesets = dict()
  supplementalFile = open(parameters.supplementalData)
  for line in supplementalFile:
      tokens = line.split(",")
      probesetId = re.sub("\"", "", tokens[0])
      if re.search("core", line):
          coreProbesets[probesetId] = 1
      elif re.search("extended", line):
          extendedProbesets[probesetId] = 1
      elif re.search("full", line):
          fullProbesets[probesetId] = 1
      elif re.search("ambiguous", line):
          ambiguousProbesets[probesetId] = 1
      elif re.search("free", line):
          freeProbesets[probesetId] = 1
  supplementalFile.close()
  coreProbesetsOutfile = re.sub("noOverlaps", "partitioned2",
                                re.sub("overlapsMerged", "core.overlapsMerged",
                                       parameters.bedData))
  coreProbesetsOut = open(coreProbesetsOutfile, 'w')
  extendedProbesetsOutfile = re.sub("noOverlaps", "partitioned2",
                                    re.sub("overlapsMerged",
                                           "extended.overlapsMerged",
                                           parameters.bedData))
  extendedProbesetsOut = open(extendedProbesetsOutfile, 'w')
  fullProbesetsOutfile = re.sub("noOverlaps", "partitioned2",
                                re.sub("overlapsMerged", "full.overlapsMerged",
                                       parameters.bedData))
  fullProbesetsOut = open(fullProbesetsOutfile, 'w')
  ambiguousProbesetsOutfile = re.sub("noOverlaps", "partitioned2",
                                     re.sub("overlapsMerged", "ambiguous.overlapsMerged",
                                            parameters.bedData))
  ambiguousProbesetsOut = open(ambiguousProbesetsOutfile, 'w')
  freeProbesetsOutfile = re.sub("noOverlaps", "partitioned2",
                                     re.sub("overlapsMerged", "free.overlapsMerged",
                                            parameters.bedData))
  freeProbesetsOut = open(freeProbesetsOutfile, 'w')
  bedInput = open(parameters.bedData)
  for line in bedInput:
      tokens = line.split()
      if len(tokens) < 3:
          print "error: misformed line", line
      else:
          name = line.split()[3]
          (gene,probeset) = name.split("_")
          if coreProbesets.has_key(probeset):
              coreProbesetsOut.write(line)
          elif extendedProbesets.has_key(probeset):
              extendedProbesetsOut.write(line)
          elif fullProbesets.has_key(probeset):
              fullProbesetsOut.write(line)
          elif ambiguousProbesets.has_key(probeset):
              ambiguousProbesetsOut.write(line)
          elif freeProbesets.has_key(probeset):
              freeProbesetsOut.write(line)
          else:
              print "warning: orphan line", line
  
  
# 1. Given data from the vendor, one file per chromosome with four types of
# bed entries per file, split them into four bed files (yielding four bed
# files per chromosome).
  ls Affy-HuEx-hg19-bed/*bed \
      | awk '{ print "scripts/splitAffyExonBeds.py", $1 }' |bash
  
# 2. Given a subdirectory with four bed files per chromosome, merge the
# probeset and probe bed files into one bed file for probeset data
# and one bed file for probe data.  From here on, we ignore the exon
# and gene probeset data.
  scripts/mergeAcrossChromosomes.bash probeset
  scripts/mergeAcrossChromosomes.bash probe
  
  # 3. There is an issue that the probeset data contains overlapping blocks.
  # Fix this with bedMergeOverlappingBlocks, written by Andy Pohl.  Note that
  # the probe data does not contain overlapping blocks.
bedMergeOverlappingBlocks mergedBeds/AffyHuEx.probeset.bed \
      noOverlaps/AffyHuEx.probeset.overlapsMerged.bed
  cp mergedBeds/AffyHuEx.probe.bed noOverlaps/AffyHuEx.probe.overlapsMerged.bed
  
# 4. There are five different types of probesets, each of which has a different
# significance to the user.  These are best represented as five different
# subtracks of a parent track (one parent track for probesets, and one for probes).
# Split the data by probeset type.  Note that the same process can be applied to
# both the probeset and probe data, because the probe data is labeled by
# probeset ID rather than by a distinct probe ID.
  scripts/splitByProbesetType.py \
      -s supportingAnnotations/HuEx-1_0-st-v2.na31.hg19.probeset.csv \
    -b noOverlaps/AffyHuEx.probe.overlapsMerged.bed
  hgLoadBed hg19 affyExonProbeAmbiguous \
      partitioned2/AffyHuEx.probe.ambiguous.overlapsMerged.bed
  hgLoadBed hg19 affyExonProbeCore \
      partitioned2/AffyHuEx.probe.core.overlapsMerged.bed
  hgLoadBed hg19 affyExonProbeExtended \
      partitioned2/AffyHuEx.probe.extended.overlapsMerged.bed
  hgLoadBed hg19 affyExonProbeFree \
      partitioned2/AffyHuEx.probe.free.overlapsMerged.bed
  hgLoadBed hg19 affyExonProbeFull \
      partitioned2/AffyHuEx.probe.full.overlapsMerged.bed
  
  scripts/splitByProbesetType.py \
      -s supportingAnnotations/HuEx-1_0-st-v2.na31.hg19.probeset.csv \
    -b noOverlaps/AffyHuEx.probeset.overlapsMerged.bed
  hgLoadBed hg19 affyExonProbesetAmbiguous \
      partitioned2/AffyHuEx.probeset.ambiguous.overlapsMerged.bed
  hgLoadBed hg19 affyExonProbesetCore \
      partitioned2/AffyHuEx.probeset.core.overlapsMerged.bed
  hgLoadBed hg19 affyExonProbesetExtended \
      partitioned2/AffyHuEx.probeset.extended.overlapsMerged.bed
  hgLoadBed hg19 affyExonProbesetFree \
      partitioned2/AffyHuEx.probeset.free.overlapsMerged.bed
  hgLoadBed hg19 affyExonProbesetFull \
      partitioned2/AffyHuEx.probeset.full.overlapsMerged.bed
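
# Sanity check (a sketch): confirm each subtrack table loaded, using the
# table names from the hgLoadBed commands above
for T in Ambiguous Core Extended Free Full
do
    hgsql hg19 -N -e "select count(*) from affyExonProbeset${T}"
done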
  
  
  #########################################################################
  # LASTZ Turkey MelGal1 (DONE - 2011-03-28 - Chin)
      mkdir /hive/data/genomes/hg19/bed/lastzMelGal1.2011-03-28
      cd /hive/data/genomes/hg19/bed/lastzMelGal1.2011-03-28
  
      cat << '_EOF_' > DEF
  # Turkey vs Human
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Turkey melGal1 - single chunk big enough to run entire chrom
  SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit
  SEQ2_LEN=/scratch/data/melGal1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzMelGal1.2011-03-28
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -syntenicNet \
          -noLoadChainSplit \
          -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
          -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
      # real    100m33.646s
      cat fb.hg19.chainMelGal1Link.txt
      #   76647912 bases of 2897316137 (2.645%) in intersection
      # Create link
      cd /hive/data/genomes/hg19/bed
      ln -s  lastzMelGal1.2011-03-28 lastz.melGal1
  
  
      #   running the swap
      mkdir /hive/data/genomes/melGal1/bed/blastz.hg19.swap
      cd /hive/data/genomes/melGal1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzMelGal1.2011-03-28/DEF \
          -swap \
          -noLoadChainSplit \
          -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
          -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
      #   real    6m51.280s
      cat fb.melGal1.chainHg19Link.txt
      #   62120143 bases of 935922386 (6.637%) in intersection
      cd /hive/data/genomes/melGal1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #############################################################################
# BUILD pseudoYale60 TRACK
  # Rachel built this for Gencode first, Fan adopted for UCSC GB, DONE, 3/31/2011.
  #
  # First, how Rachel built it:
  # YALE PSEUDOPIPE PSEUDOGENE PREDICTIONS BASED ON ENSEMBL 60
  # (hartera, 2011-01-10, DONE)
  # FTP site e-mailed by Suganthi Balasubramanian (suganthi.bala@yale.edu) from
  # the Gerstein lab. Data is from their PseudoPipe pipeline and it is based on
  # proteins from Ensembl Build 60 (pseudogene data from December 2010?).
  
  mkdir -p /hive/groups/gencode/browser/hg19/gencodeYalePseudoBuild60
  cd /hive/groups/gencode/browser/hg19/gencodeYalePseudoBuild60
  
  wget --timestamping \
       "http://tables.pseudogene.org/dump.cgi?table=Human60"
  
  # Then re-name the file:
  mv dump.cgi\?table=Human60 Human60YalePseudo.txt
  # Header from data file.
# ID, Chromosome, Start Coordinate, Stop Coordinate, Strand, Parent Protein,
# Protein Start, Protein Stop, Parent Gene, Fraction, Num Insertions,
# Num Deletions, Num Shifts, Num Stops, E Value, Identity, PolyA,
# Disablements, Exons, Introns, Class, Sequence, Link
  
# URLs are of the form
# http://tables.pseudogene.org/human60/<ID>, so this can be added to the
# trackDb as for the previous track.
  
  # Get list of haplotype chroms:
  grep _ Human60YalePseudo.txt | awk '{print $2}' | sort | uniq
  HSCHR17_1
  HSCHR6_MHC_APD
  HSCHR6_MHC_COX
  HSCHR6_MHC_DBB
  HSCHR6_MHC_MANN
  HSCHR6_MHC_MCF
  HSCHR6_MHC_QBL
  HSCHR6_MHC_SSTO
  
  # These correspond to the haplotype chroms in GRCh37 (hg19).
  # Convert data to genePred:
# Chromosomes are 1-22, X, Y, chr17_ctg5_hap1 (HSCHR17_1) and the chr6
# haplotypes, e.g. chr6_cox_hap2 (HSCHR6_MHC_COX)
  
  cat << '_EOF_' > formatPseudogenesToGenePred
  #!/usr/bin/awk -f
  # Parse Yale pseudogene data file.
# Exon coordinates are in this format:
# [[28688544, 28688864], [28689678, 28691174], [28694308, 28694460], [28701327, 28701749]]
  # Ignore header line
  /^ID/ {
    next;
  }
  # Parse the data lines
BEGIN {FS="\t"; OFS="\t"} {
    gsub(/\[/, "", $19);
    gsub(/\]/, "", $19);
    split($19, exons, ",");
    # Count the number of start and end coordinates for exons and
    # calculate the number of exons.
    count=(length(exons))/2;
    # Write out genePred. Add chr in front of chrom only if not haplotype.
    if ($2 !~ /HSCHR/) {
       printf "%s\tchr%s\t%c\t%d\t%d\t0\t0\t%d\t", $1, $2, $5, $3-1, $4, count;
    }
    else {
       printf "%s\t%s\t%c\t%d\t%d\t0\t0\t%d\t", $1, $2, $5, $3-1, $4, count;
    }
    # get list of exon starts, convert from 1-based to 0-based
    for (i=1; i <= length(exons); i+=2) {
       printf "%d,", exons[i]-1",";
    }
    printf "\t";
    # get list of exon ends
    for (i=2; i <= length(exons); i+=2) {
       printf "%d,", exons[i]",";
    }
    printf "\n";
  }
  '_EOF_'
  
  chmod +x formatPseudogenesToGenePred
  # format the Yale pseudogenes data to genePred.
  ./formatPseudogenesToGenePred Human60YalePseudo.txt \
      > gencodeYalePseudoBuild60.gp
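
# Optional pre-check (a sketch): genePredCheck reports invalid models
# before attempting the database load below.
genePredCheck gencodeYalePseudoBuild60.gp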
  
  # The Genome Browser represents just the haplotype region as a separate
  # "chromosome" whereas the coordinates represent the haplotype region embedded
  # into chr6.
  cp -p /hive/data/genomes/hg19/jkStuff/ensGene.haplotype.lift .
  # The lift file assumes the following chrom names
  -69170076       HSCHR4_1        191154276       chr4_ctg9_hap1  590426
  -43384863       HSCHR17_1       81195210        chr17_ctg5_hap1 1680828
  -28696603       HSCHR6_MHC_APD  171115067       chr6_apd_hap1   4622290
  -28477796       HSCHR6_MHC_COX  171115067       chr6_cox_hap2   4795371
  -28696603       HSCHR6_MHC_DBB  171115067       chr6_dbb_hap3   4610396
  -28696603       HSCHR6_MHC_MANN 171115067       chr6_mann_hap4  4683263
  -28696603       HSCHR6_MHC_MCF  171115067       chr6_mcf_hap5   4833398
  -28696603       HSCHR6_MHC_QBL  171115067       chr6_qbl_hap6   4611984
  -28659142       HSCHR6_MHC_SSTO 171115067       chr6_ssto_hap7  4928567
  
  liftUp -type=.gp gencodeYalePseudoBuild60HapsLifted.gp \
         ensGene.haplotype.lift carry gencodeYalePseudoBuild60.gp
  # Got 68 lifts in ensGene.haplotype.lift
# Lifting gencodeYalePseudoBuild60.gp
  
  wc -l gencode*.gp
  17888 gencodeYalePseudoBuild60.gp
  17888 gencodeYalePseudoBuild60HapsLifted.gp
  
  # Load table and then check some haplotype regions. See if look plausible.
  # Load the genePred file into hg19
  hgLoadGenePred hg19 gencodeYalePseudoBuild60 \
  gencodeYalePseudoBuild60HapsLifted.gp
  # Didn't load. There are 12 invalid genePreds (10 in Ensembl 55, 12 for 59):
  Error: invalid genePred: PGOHUM00000244617 exon 2 overlaps previous exon
  Error: invalid genePred: PGOHUM00000244796 exon 1 overlaps previous exon
  Error: invalid genePred: PGOHUM00000248470 exon 1 overlaps previous exon
  Error: invalid genePred: PGOHUM00000251325 exon 1 overlaps previous exon
  Error: invalid genePred: PGOHUM00000250199 exon 1 overlaps previous exon
  Error: invalid genePred: PGOHUM00000243651 exon 1 overlaps previous exon
  Error: invalid genePred: PGOHUM00000232858 exon 1 overlaps previous exon
  Error: invalid genePred: PGOHUM00000232933 exon 1 overlaps previous exon
  Error: invalid genePred: PGOHUM00000233065 exon 1 overlaps previous exon
  Error: invalid genePred: PGOHUM00000236237 exon 1 overlaps previous exon
  Error: invalid genePred: PGOHUM00000241760 exon 1 overlaps previous exon
  Error: invalid genePred: PGOHUM00000233784 exon 8 overlaps previous exon
  Error: 12 invalid genePreds, database unchanged
  
  # These are on chroms 1, 6, 7, 9, X, Y.
  # File didn't load into database.
  # Make a file of these ids - invalidIds
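# e.g. (a sketch, assuming the hgLoadGenePred errors above were saved to a
# hypothetical load.err):
#   awk '/^Error: invalid genePred:/ {print $4}' load.err | sort -u > invalidIds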
  grep -f invalidIds -vw gencodeYalePseudoBuild60HapsLifted.gp \
      > gencodeYalePseudoBuild60HapsLiftedNoInvalidGps.gp
  wc -l gencode*gp
  # 17888 gencodeYalePseudoBuild60.gp
  # 17888 gencodeYalePseudoBuild60HapsLifted.gp
  # 17876 gencodeYalePseudoBuild60HapsLiftedNoInvalidGps.gp
  
  # Then re-load database
  hgLoadGenePred hg19 gencodeYalePseudoBuild60 \
        gencodeYalePseudoBuild60HapsLiftedNoInvalidGps.gp
  
  # Add trackDb.ra entry for track, add a search and make sure
  # there is a description page, copy over from the gencodeYalePseudoBuild59
  # html. Commit these to SVN.
  # Add to the html description, the list of 12 IDs of genes that were removed
  # due to overlapping exon coordinates. This was also a problem for the Yale
  # pseudogenes based on Ensembl Build 53 and 55 but there were 10 problem IDs
  # for those builds.
  
  # Build class table for colouring pseudogenes by type.
  # copy over class table definition from a previous set of Yale pseudogenes.
  cp -p ../gencodeYalePseudoBuild55/gencodeYalePseudoBuild55Class.sql \
   gencodeYalePseudoBuild60Class.sql
  
  # Make the class table file:
  tail -n +2 Human60YalePseudo.txt \
    | tawk '{print $1, $21, "Yale"}' | sort > yalePseudoBuild60Class.txt
  
  # load table
  hgLoadSqlTab hg19 gencodeYalePseudoBuild60Class \
      gencodeYalePseudoBuild60Class.sql yalePseudoBuild60Class.txt
  
  hgsql -e 'select distinct(class) from gencodeYalePseudoBuild60Class;' hg19
  +------------+
  | class      |
  +------------+
  | Ambiguous  |
  | Processed  |
  | Duplicated |
  +------------+
  
  # Add these classes to the trackDb.ra entry for the geneClasses field and
  # to the list of classes with colours.
  
  # Next, how Fan adopted it:
  
  ssh hgwdev
  mkdir -p /hive/data/genomes/hg19/bed/pseudoYale60
  cd /hive/data/genomes/hg19/bed/pseudoYale60
  
  hgsql hg19 < ~/src/hg/lib/pseudoYale60.sql
  hgsql hg19 < ~/src/hg/lib/pseudoYale60Class.sql
  
  hgsql hg19 -e "insert into pseudoYale60 select * from gencodeYalePseudoBuild60"
  hgsql hg19 -e "insert into pseudoYale60Class select * from gencodeYalePseudoBuild60Class"
  
  ##############################################################################
  # LASTZ Lizard AnoCar2 (DONE - 2011-04-19 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzAnoCar2.2011-04-19
      cd /hive/data/genomes/hg19/bed/lastzAnoCar2.2011-04-19
  
      cat << '_EOF_' > DEF
  # human vs lizard
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Lizard anoCar2
  SEQ2_DIR=/scratch/data/anoCar2/anoCar2.2bit
  SEQ2_LEN=/scratch/data/anoCar2/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=40
  
  BASE=/hive/data/genomes/hg19/bed/lastzAnoCar2.2011-04-19
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \
  	-bigClusterHub=swarm -qRepeats=windowmaskerSdust > do.log 2>&1 &
      #	real    195m52.809s
      cat fb.hg19.chainAnoCar2Link.txt
      #	102917023 bases of 2897316137 (3.552%) in intersection
  
      #	running the swap - DONE - 2011-04-19
      mkdir /hive/data/genomes/anoCar2/bed/blastz.hg19.swap
      cd /hive/data/genomes/anoCar2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzAnoCar2.2011-04-19/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-syntenicNet -swap -qRepeats=windowmaskerSdust > swap.log 2>&1 &
      #	real    20m45.683s
      cat fb.anoCar2.chainHg19Link.txt
      #	88296392 bases of 1701353770 (5.190%) in intersection
  
  ##############################################################################
  # NCBI patch 3 (NOT COMPLETE - 2011-04-21 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch3
      cd /hive/data/genomes/hg19/bed/additionalSequence/patch3
      # these scripts were altered slightly for improvements and corrections
      # to this patch3 business
      cp -p ../patches/gatherNames.pl .
      # business added to gatherNames.pl to construct patches.chrom.sizes file
      ./gatherNames.pl . > ucscNames.patch3.txt
      cp -p ../patch2/mkTables.pl .
      ./mkTables.pl  patches.chrom.sizes ucscNames.patch3.txt PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz
      # output to stdout is the contents of alt.scaf.agp.gz
      # constructs files: ctgPos.txt chromInfo.txt gold.txt and gap.txt
      cp -p ../patch2/mkCtgPos2.pl .
      ./mkCtgPos2.pl ucscNames.patch3.txt patches.chrom.sizes > ctgPos2.txt
      cp -p ../patch2/mkHapLocate.pl .
      ./mkHapLocate.pl ctgPos.txt \
  	PATCHES/alt_scaffolds/alt_scaffold_placement.txt \
  	> haplotypeLocations.bed
  # not found: GL339449.1 HSCHR5_1_CTG1
  # not found: GL339450.1 HG79_PATCH
  
  ##############################################################################
  # NCBI patch 5 (DONE - 2011-07-01,13 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch5
      cd /hive/data/genomes/hg19/bed/additionalSequence/patch5
      wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
          -nH --ftp-user=anonymous --ftp-password=yourName@domain.edu \
  ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p5/
      # the scripts from patch4 were modified slightly to update and fix some
      #	of the new names in this patch5
      cp ../patch4/gatherNames.pl .
      ./gatherNames.pl . > ucscNames.patch5.txt
      cp -p ../patch4/mkTables.pl .
      ./mkTables.pl  patches.chrom.sizes ucscNames.patch5.txt PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz
      # output to stdout is the contents of alt.scaf.agp.gz
      # constructs files: ctgPos.txt chromInfo.txt gold.txt and gap.txt
      cp -p ../patch4/mkCtgPos2.pl .
      ./mkCtgPos2.pl ucscNames.patch5.txt patches.chrom.sizes > ctgPos2.txt
      cp -p ../patch4/mkHapLocate.pl .
      ./mkHapLocate.pl ctgPos.txt \
  	PATCHES/alt_scaffolds/alt_scaffold_placement.txt \
  	> haplotypeLocations.bed
      cp haplotypeLocations.bed altSequence.bed
      ln -s ../patch2/before.patch2.hapLoc.bed hg19.hapLoc.bed
      awk '{printf "%s\t%d\t%d\t%s\t500\t+\t%d\t%d\t32,32,190\n", $2,$3,$4,$5,$3,$4}' \
  hg19.hapLoc.bed >> altSequence.bed
  
      # a new script for patch5
      ./mkFasta.pl ucscNames.patch5.txt > hg19.patch5.fa
      # the build of hg19Patch5 can be seen in hg19Patch5.txt
  
      egrep -v "32,32,190" altSequence.bed  \
  	| awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \
  	    > altSeqPatchesP5.tab
      egrep "32,32,190" altSequence.bed  \
  	| awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \
  	    > altSeqHaplotypesP5.tab
      # verify none lost
      wc -l altSeqPatchesP5.tab altSeqHaplotypesP5.tab
      #	41 altSeqPatchesP5.tab
      #	75 altSeqHaplotypesP5.tab
      #	116 total
      wc -l altSequence.bed
      #	116 altSequence.bed
      hgLoadBed hg19 altSeqHaplotypesP5 altSeqHaplotypesP5.tab
      #	Loaded 75 elements of size 6
      hgLoadBed hg19 altSeqPatchesP5 altSeqPatchesP5.tab
      #	Loaded 41 elements of size 6
  
      # to replace the existing track:
      hgLoadBed hg19 altSeqHaplotypes altSeqHaplotypesP5.tab
      #	Loaded 75 elements of size 6
      hgLoadBed hg19 altSeqPatches altSeqPatchesP5.tab
      #	Loaded 41 elements of size 6
  
  ##############################################################################
  # NCBI patch 9 (DONE - 2012-07-16 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch9
      cd /hive/data/genomes/hg19/bed/additionalSequence/patch9
      wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
          -nH --ftp-user=anonymous --ftp-password=yourName@domain.edu \
  ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p9/
      # the scripts from patch5 were modified slightly to update and fix some
      #	of the new names in this patch9
      cp ../patch5/gatherNames.pl .
      ./gatherNames.pl . > ucscNames.patch9.txt
      # examine the names for sanity:
      awk '{print $NF}' ucscNames.patch9.txt | sort
      # and they should not be longer than 31 characters:
      awk '{print $NF}' ucscNames.patch9.txt | sort | awk '{print length($0)}' \
          | sort -n | tail
      cp -p ../patch5/mkTables.pl .
      ./mkTables.pl  patches.chrom.sizes ucscNames.patch9.txt PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz
      # output to stdout is the contents of alt.scaf.agp.gz
      # constructs files: ctgPos.txt chromInfo.txt gold.txt and gap.txt
      cp -p ../patch5/mkCtgPos2.pl .
      ./mkCtgPos2.pl ucscNames.patch9.txt patches.chrom.sizes > ctgPos2.txt
      cp -p ../patch5/mkHapLocate.pl .
      ./mkHapLocate.pl ctgPos.txt \
  	PATCHES/alt_scaffolds/alt_scaffold_placement.txt \
  	> haplotypeLocations.bed
      cp -p haplotypeLocations.bed altSequence.bed
      ln -s ../patch2/before.patch2.hapLoc.bed hg19.hapLoc.bed
      awk '{printf "%s\t%d\t%d\t%s\t500\t+\t%d\t%d\t32,32,190\n", $2,$3,$4,$5,$3,$4}' \
  hg19.hapLoc.bed >> altSequence.bed
  
      # a new script for patch9
      cp -p ../patch5/mkFasta.pl .
      ./mkFasta.pl ucscNames.patch9.txt > hg19.patch9.fa
      # the build of hg19Patch9 can be seen in hg19Patch9.txt
  
      egrep -v "32,32,190" altSequence.bed  \
  	| awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \
  	    > altSeqPatchesP9.tab
      egrep "32,32,190" altSequence.bed  \
  	| awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \
  	    > altSeqHaplotypesP9.tab
      # verify none lost
      wc -l altSeqPatchesP9.tab altSeqHaplotypesP9.tab
      #	82 altSeqPatchesP9.tab
      #	81 altSeqHaplotypesP9.tab
      #	163 total
      wc -l altSequence.bed
      #	163 altSequence.bed
      hgLoadBed hg19 altSeqHaplotypesP9 altSeqHaplotypesP9.tab
    #	Loaded 81 elements of size 6
      # do not need the chrM_rCRS item:
      hgsql -e 'delete from altSeqHaplotypesP9 where chrom="chrM_rCRS";' hg19
      hgLoadBed hg19 altSeqPatchesP9 altSeqPatchesP9.tab
    #	Loaded 82 elements of size 6
  
      #    these tables are part of human/hg19/altSeqComposite9.ra
      # to replace the existing track:
      grep -v "^chrM_rCRS" altSeqHaplotypesP9.tab \
          | hgLoadBed hg19 altSeqHaplotypes stdin
      #   Read 80 elements of size 6 from stdin
      hgLoadBed hg19 altSeqPatches altSeqPatchesP9.tab
      #   Read 82 elements of size 6 from altSeqPatchesP9.tab
  
  ##############################################################################
  #  hg19 - Human - Ensembl Genes version 62  (DONE - 2011-04-22 - hiram)
    # This human gene set needed a lot of work to get the name translation
    #	to work again.  The contig names have changed in Ensembl for this
    #	version, and genes were defined on patch sequence that UCSC does not
    #	include.
      ssh hgwdev
      cd /hive/data/genomes/hg19
      cat << '_EOF_' > hg19.ensGene.ra
  # required db variable
  db hg19
  # optional nameTranslation, the sed command that will transform
  #       Ensemble names to UCSC names.  With quotes just to make sure.
#       Ensembl names to UCSC names.  With quotes just to make sure.
  nameTranslation 's/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; /^GL3.*/d; /^HSCHR[1-5]/d; /^HSCHR[7-9]/d; /^HG/d'
  # optionally update the ensToEnsembl table after ensGene updated
  ensToEnsembl yes
  # optional haplotype lift-down from Ensembl full chrom coordinates
  #       to UCSC simple haplotype coordinates
  haplotypeLift /hive/data/genomes/hg19/jkStuff/ensGene.haplotype.lift
  # Ensembl 62 has new sequence names for some of the random bits
  liftUp /hive/data/genomes/hg19/jkStuff/ens.62.lft
  '_EOF_'
  #  << happy emacs
  
      doEnsGeneUpdate.pl  -ensVersion=62 hg19.ensGene.ra
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/ensGene.62
      featureBits hg19 ensGene
      # 109947258 bases of 2897316137 (3.795%) in intersection
  
      hgsql -e \
  'update trackVersion set dateReference="current" where db="hg19" AND version=62;' hgFixed
  
  ############################################################################
  # BUILD hg19 GERP TRACK (DONE 4/25/11, Fan)
  
  ssh hgwdev
  mkdir /hive/data/genomes/hg19/bed/gerp
  cd /hive/data/genomes/hg19/bed/gerp
  
  # place the wig data file, All_hg19_RS.wig, here.
  
  ulimit -d 180000000
  ulimit -v 180000000
  
  wigToBigWig All_hg19_RS.wig /hive/data/genomes/hg19/chrom.sizes All_hg19_RS.bw
  
  ln -s `pwd`/All_hg19_RS.bw /gbdb/hg19/bbi/All_hg19_RS.bw
  
  hgsql hg19 -e 'drop table if exists allHg19RS_BW; \
                 create table allHg19RS_BW (fileName varchar(255) not null); \
  	       insert into allHg19RS_BW values ("/gbdb/hg19/bbi/All_hg19_RS.bw");'
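
# Equivalently (a sketch), hgBbiDbLink builds the same one-column fileName
# table in one step:
#   hgBbiDbLink hg19 allHg19RS_BW /gbdb/hg19/bbi/All_hg19_RS.bw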
  
  # create corresponding trackDb.ra section and html description page.
  
  
  #########################################################################
  # LASTZ Cow BosTau6 (DONE - 2011-05-16 - Chin)
      mkdir /hive/data/genomes/hg19/bed/lastzBosTau6.2011-05-16
      cd /hive/data/genomes/hg19/bed/lastzBosTau6.2011-05-16
  
      cat << '_EOF_' > DEF
  # human vs cow
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Cow bosTau6
  SEQ2_DIR=/scratch/data/bosTau6/bosTau6.2bit
  SEQ2_LEN=/scratch/data/bosTau6/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzBosTau6.2011-05-16
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -syntenicNet \
          -noLoadChainSplit \
          -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
          -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
      # real    481m23.263s
      cat fb.hg19.chainBosTau6Link.txt
      # 1370696434 bases of 2897316137 (47.309%) in intersection
      # Create link
      cd /hive/data/genomes/hg19/bed
      ln -s  lastzBosTau6.2011-05-16 lastz.bosTau6
  
  
      #   running the swap
      mkdir /hive/data/genomes/bosTau6/bed/blastz.hg19.swap
      cd /hive/data/genomes/bosTau6/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzBosTau6.2011-05-16/DEF \
          -swap  -syntenicNet \
          -noLoadChainSplit \
          -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
          -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
      #   real     98m22.477s
      cat fb.bosTau6.chainHg19Link.txt
      #   1336966333 bases of 2649682029 (50.458%) in intersection
      cd /hive/data/genomes/bosTau6/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  
  ###################################################################
  # BUILD OMIM RELATED TRACKS (REBUILT 5/17/11, Fan)
  
  ssh hgwdev
  cd /hive/data/genomes/hg19/bed
  mkdir -p omim/05172011
  cd omim/05172011
  
# obtain the following files from OMIM and place them in this subdirectory
  
         genemap.txt
         mim2gene.txt
         mimAV.txt
         script1.pl
         script2.pl
  
  cat genemap.txt|sed -e 's/|/\t/g' > genemap.tab
  
  hgLoadSqlTab -warn hg19 omimGeneMap ~/kent/src/hg/lib/omimGeneMap.sql genemap.tab
  
  # Load mim2gene table
  
  hgsql hg19 -e 'drop table mim2gene'
  hgsql hg19 < ~/kent/src/hg/lib/mim2gene.sql
  hgsql hg19 -e 'load data local infile "mim2gene.txt" into table mim2gene ignore 1 lines'
  
  doOmimDisorders hg19 omimDisorderMap.tab
  hgsql hg19 -e "drop table omimDisorderMap"
  hgsql hg19 < ~/kent/src/hg/lib/omimDisorderMap.sql
  hgLoadSqlTab -warn hg19 omimDisorderMap ~/kent/src/hg/lib/omimDisorderMap.sql omimDisorderMap.tab
  
  # build omimGeneSymbol table
  
  doOmimGeneSymbols hg19 j.out
  cat j.out |sort -u >omimGeneSymbol.tab
  hgLoadSqlTab -warn hg19 omimGeneSymbol ~/kent/src/hg/lib/omimGeneSymbol.sql omimGeneSymbol.tab
  
  perl ./script1.pl --gene-map-file=genemap.txt >omimPhenotype.tab
  hgLoadSqlTab -warn hg19 omimPhenotype ~/kent/src/hg/lib/omimPhenotype.sql omimPhenotype.tab
  
  hgsql hg19 -e 'update omimPhenotype set phenotypeClass = -1 where
  phenotypeClass=0'
  hgsql hg19 -e 'update omimPhenotype set phenotypeId = -1 where phenotypeId=0'
  
  doOmimGene2 hg19 j.tmp
  cat j.tmp |sort -u > omimGene2.tab
  
  hgLoadBed hg19 omimGene2 omimGene2.tab
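
# sanity check the coordinates (a sketch; checkTableCoords is used the same
# way elsewhere in this doc):
checkTableCoords hg19 omimGene2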
  
  rm j.tmp
  ##############################################################
  # build the omimAvSnp track
  
  cd /hive/data/genomes/hg19/bed/omim/05172011
  mkdir av
  cd av
  
  # get the mimAV.txt data file from OMIM
  
  cut -f 1 mimAV.txt >j1
  cut -f 2 mimAV.txt >j2
  cut -f 3  mimAV.txt >j3
  cut -f 4  mimAV.txt >j4
  cut -f 5  mimAV.txt >j5
  
  cat j1 |sed -e 's/\./\t/' >j1.2
  
  cat j4 |sed -e 's/,/\t/' >j4-2
  cut -f 1 j4-2 >j4.1
  cut -f 2 j4-2 >j4.2
  
  paste j1 j1.2 j3 j4 j4.1 j4.2 j5 j2 >omimAv.tab
  
  hgsql hg19 -e 'drop table omimAv'
  hgsql hg19 < ~/src/hg/lib/omimAv.sql
  hgsql hg19 -e \
  'load data local infile "omimAv.tab" into table omimAv ignore 1 lines'
  hgsql hg19 -e 'update omimAv set repl2 = rtrim(ltrim(repl2))'
  
  doOmimAv hg19 omimAvRepl.tab  2>j.err
  
  hgsql hg19 -e "drop table omimAvRepl"
  hgsql hg19 < ~/kent/src/hg/lib/omimAvRepl.sql
  hgsql hg19 -e 'load data local infile "omimAvRepl.tab" into table omimAvRepl'
  
  rm j1.2  j1 j2 j3  j4  j4-2  j4.1  j4.2  j5
  
  hgsql hg19 -N -e 'select chrom, chromStart, chromEnd, avId from omimAvRepl r,
  snp132 s where s.name = dbSnpId order by avId' >omimAvSnp.tab
  
  hgLoadBed -allowStartEqualEnd  hg19 omimAvSnp omimAvSnp.tab
  ##############################################################
  # build the omimLocation track
  
  cd /hive/data/genomes/hg19/bed/omim/05172011
  mkdir location
  cd location
  
  doOmimLocation hg19 omimLocation.bed 2>j.err
  hgLoadBed hg19 omimLocation omimLocation.bed
  
  # Remove all gene entries in omimGene2 from omimLocation table
  
  hgsql hg19 -N -e \
  'delete from omimLocation where name  in (select name from omimGene2) '
  
  # Per OMIM request, delete all the gray entries in omimLocation table.
  
  mkdir cleanUpOmimLocation
  cd cleanUpOmimLocation
  
  hgsql hg19 -N -e \
  'select distinct name from omimLocation' |sort -u >j.all
  
  hgsql hg19 -N -e \
  'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=1' >j.1
  hgsql hg19 -N -e \
  'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=2' >j.2
  hgsql hg19 -N -e \
  'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=3' >j.3
  hgsql hg19 -N -e \
  'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=4' >j.4
  
  cat j.1 j.2 j.3 j.4 |sort -u >j.1234
  
  diff j.all j.1234 |grep "<" |sed -e 's/</do1/' >doall
  
  cat << '_EOF_' > do1
  hgsql hg19 -e "delete from omimLocation where name='${1}'"
  '_EOF_'
  # << emacs
  
chmod +x do1 doall
PATH=.:$PATH ./doall    # doall invokes do1 by bare name, so . must be on PATH
  
  ############################################################################
  # NUMTS TRACK (DONE 2011-06-03 - Chin)
  
      mkdir -p /hive/data/outside/Numts/hg19
      cd /hive/data/outside/Numts/hg19
      wget http://193.204.182.50/files/hg19/all_hg19_NumtS_tracks.txt
      wget http://193.204.182.50/files/hg19/HSA_NumtS_hg19_details.html
      wget http://193.204.182.50/files/bam/allNumtS.sorted.bam
      wget http://193.204.182.50/files/bam/allNumtS.sorted.bam.bai
  
      mkdir /cluster/data/hg19/bed/NumtS
      cd  /cluster/data/hg19/bed/NumtS
      cp /hive/data/outside/Numts/hg19/*.* .
  
  
    # split all_hg19_NumtS_tracks.txt into 3 bed files:
    # numtSAssembled.bed, numtS.bed, and numtSMitochondrion.bed
    # (see the split sketch after tracks.list below)
  
    cat all_hg19_NumtS_tracks.txt | awk '/^track name/ {print $0}'  > tracks.list
    cat all_hg19_NumtS_tracks.txt | awk '/^track type/ {print $0}' >> tracks.list
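
    # a sketch of the split, keying on the 'track name' lines; the substring
    # tests ('assembled', 'itochondrion') are assumptions about the vendor's
    # track names, inferred from the output bed file names:
    awk '/^track/ { if (out != "") close(out); out = ""; \
           if ($0 !~ /^track name/) next; \
           if ($0 ~ /assembled/)         out = "numtSAssembled.bed"; \
           else if ($0 ~ /itochondrion/) out = "numtSMitochondrion.bed"; \
           else                          out = "numtS.bed"; \
           next } \
         out != "" { print > out }' all_hg19_NumtS_tracks.txt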
  
      # load the 3 bed files to hg19
      hgLoadBed hg19  numtSAssembled  numtSAssembled.bed
      hgLoadBed hg19 numtS numtS.bed
      hgLoadBed hg19 numtSMitochondrion numtSMitochondrion.bed
      # Make /gbdb/ links and load bam
      mkdir /gbdb/hg19/NumtS
      ln -s `pwd`/allNumtS.sorted.bam{,.bai} /gbdb/hg19/NumtS/
      hgBbiDbLink hg19 bamAllNumtSSorted /gbdb/hg19/NumtS/allNumtS.sorted.bam
  
      # setup trackDb for hg19
  
  ############################################################################
  # Add Gene name search to Ensembl gene track (DONE - 2011-07-22 - Hiram)
      cd /hive/data/genomes/hg19/bed/ensGene.62/process
      cut -f1,9 infoOut.txt | grep -v "^#" | sort > ensemblToGeneName.tab
      NL=`awk '{print length($1)}' ensemblToGeneName.tab | sort -rn | head -1`
      VL=`awk '{print length($2)}' ensemblToGeneName.tab | sort -rn | head -1`
      sed -e "s/ ensTo / ensemblToGeneName /; s/ens gene/ensGen/; s/INDEX(name(12)/PRIMARY KEY(name($NL)/; s/value(12)/value($VL)/" \
  	$HOME/kent/src/hg/lib/ensTo.sql > ensemblToGeneName.sql
  
      hgLoadSqlTab hg19 ensemblToGeneName ensemblToGeneName.sql ensemblToGeneName.tab
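
    # spot-check the xref table (a sketch):
    hgsql hg19 -N -e 'select * from ensemblToGeneName limit 3'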
  
      # add this search specification to trackDb.ra
  searchName ensGeneName
  searchTable ensGene
  searchType genePred
  searchMethod prefix
  xrefTable ensemblToGeneName
  xrefQuery select name,value from %s where value like '%%%s%%'
  searchPriority 50
  
  ############################################################################
  # COSMIC TRACK (DONE 2011-07-15 Fan)
  
  mkdir /hive/data/outside/cosmic/20110711
# put the raw data file, EnsMutExp_v54_080711.csv (received by email), there.
  
  mkdir /hive/data/genomes/hg19/bed/cosmic/20110711
  cd /hive/data/genomes/hg19/bed/cosmic/20110711
  
  cp -p /hive/data/outside/cosmic/20110711/EnsMutExp_v54_080711.csv .
  
cat EnsMutExp_v54_080711.csv | sed -e 's/\t//g' | sed -e 's/,/\t/g' \
    | grep -v COSMIC_MUTATION_ID | grep -v 'selected' | grep COSM \
    > EnsMutExp_v54_080711.tab
  
  hgsql hg19 -e 'drop table cosmicRaw'
  hgsql hg19 < ~/kent/src/hg/lib/cosmicRaw.sql
  
hgLoadSqlTab hg19 cosmicRaw ~/kent/src/hg/lib/cosmicRaw.sql \
    EnsMutExp_v54_080711.tab
  
# use grch37_start-1 for our zero-based chromStart and
# convert their chr23 and chr24 to chrX and chrY.
  
hgsql hg19 -N -e 'select "chr", chromosome, grch37_start-1, grch37_stop,
cosmic_mutation_id from cosmicRaw' \
    | grep -v NULL | sed -e 's/chr\t/chr/' | sort -u \
    | sed -e 's/chr23/chrX/' | sed -e 's/chr24/chrY/' > cosmic.bed
  
  hgLoadBed -allowStartEqualEnd  hg19 cosmic cosmic.bed
  
  #############################################################################
  # HI SEQ DEPTH (DONE 7/15/11 angie)
      mkdir /hive/data/genomes/hg19/bed/hiSeqDepth
      cd /hive/data/genomes/hg19/bed/hiSeqDepth
      foreach cov (001 005 01 05 1)
      wget --timestamping http://eqtl.uchicago.edu/Masking/seq.cov$cov.ONHG19.bed.gz
        gunzip -N seq.cov$cov.ONHG19.bed.gz
      end
      wc -l seq.cov*
  #    522 seq.cov001.ONHG19.bed
  #   1224 seq.cov005.ONHG19.bed
  #   2060 seq.cov01.ONHG19.bed
  #  16119 seq.cov05.ONHG19.bed
  #  30671 seq.cov1.ONHG19.bed
      foreach cov (001 005 01 05 1)
        echo seq.cov$cov.ONHG19.bed
        featureBits -countGaps hg19 seq.cov$cov.ONHG19.bed
      end
  #seq.cov001.ONHG19.bed
  #55092 bases of 3137161264 (0.002%) in intersection
  #seq.cov005.ONHG19.bed
  #175379 bases of 3137161264 (0.006%) in intersection
  #seq.cov01.ONHG19.bed
  #344425 bases of 3137161264 (0.011%) in intersection
  #seq.cov05.ONHG19.bed
  #3073270 bases of 3137161264 (0.098%) in intersection
  #seq.cov1.ONHG19.bed
  #5736695 bases of 3137161264 (0.183%) in intersection
      # Compare hg19 coverage to hg18:
      calc 55092 / 57409
  #55092 / 57409 = 0.959640
      calc 175379 / 183848
  #175379 / 183848 = 0.953935
      calc 344425 / 362423
  #344425 / 362423 = 0.950340
      calc 3073270 / 3462959
  #3073270 / 3462959 = 0.887469
      calc 5736695 / 6466376
  #5736695 / 6466376 = 0.887158
  
      # Not all small ones are strict subsets of larger ones.
      featureBits hg19 -countGaps seq.cov001.ONHG19.bed \!seq.cov005.ONHG19.bed
  #128 bases of 3137161264 (0.000%) in intersection
      featureBits hg19 -countGaps seq.cov005.ONHG19.bed \!seq.cov01.ONHG19.bed
  #222 bases of 3137161264 (0.000%) in intersection
      featureBits hg19 -countGaps seq.cov01.ONHG19.bed \!seq.cov05.ONHG19.bed
  #4185 bases of 3137161264 (0.000%) in intersection
      featureBits hg19 -countGaps seq.cov05.ONHG19.bed \!seq.cov1.ONHG19.bed
  #41831 bases of 3137161264 (0.001%) in intersection
      # No overlap w/gap track:
      featureBits hg19 -countGaps seq.cov1.ONHG19.bed gap -bed=gapOverlaps.ONHG19.bed
  #0 bases of 3137161264 (0.000%) in intersection
  
      # Load tables:
      hgLoadBed hg19 hiSeqDepthTopPt1Pct seq.cov001.ONHG19.bed
  #Loaded 522 elements of size 3
      hgLoadBed hg19 hiSeqDepthTopPt5Pct seq.cov005.ONHG19.bed
  #Loaded 1224 elements of size 3
      hgLoadBed hg19 hiSeqDepthTop1Pct seq.cov01.ONHG19.bed
  #Loaded 2060 elements of size 3
      hgLoadBed hg19 hiSeqDepthTop5Pct seq.cov05.ONHG19.bed
  #Loaded 16119 elements of size 3
      hgLoadBed hg19 hiSeqDepthTop10Pct seq.cov1.ONHG19.bed
  #Loaded 30671 elements of size 3
  
  ############################################################################
  # adding new decode data (DONE - 2011-08-18 - Hiram)
  # liftOver from hg18 tracks:
      mkdir /hive/data/outside/decode/hg19
      cd /hive/data/outside/decode/hg19
# some of the items end up overlapping as a result of liftOver;
# this filters them out:
  export OVERLAPS="9202536|9235404|9225403|9215395|9192536|9182536|9172536|110561813|110582154|110572154|110552191|110542194|110532192|110522233|110512223|110502226|110492220|110482221|110472216|36283195|36273203|36252392|81325902|36262322"
  
  for T in female female_carrier female_noncarrier male male_carrier \
          male_noncarrier sex-averaged sex-averaged_carrier \
          sex-averaged_noncarrier maleFemale
  do
  liftOver ../hg18/${T}.bedGraph \
          /hive/data/genomes/hg18/bed/liftOver/hg18ToHg19.over.chain.gz \
          ${T}.hg19.txt ${T}.unMapped.bedGraph
  wc -l ${T}.hg19.txt ${T}.unMapped.bedGraph
  awk '$3-$2 > 8000 && $3-$2 < 12000' ${T}.hg19.txt | sort -k1,1 -k2,2n \
  	| egrep -v "${OVERLAPS}" > ${T}.hg19.bedGraph
  awk '$3-$2 < 8001 || $3-$2 > 11999' ${T}.hg19.txt >> ${T}.unMapped.bedGraph
  awk '$3-$2 > 8000 && $3-$2 < 12000' ${T}.hg19.txt | sort -k1,1 -k2,2n \
      | egrep "${OVERLAPS}" >> ${T}.unMapped.bedGraph
  bedGraphToBigWig ${T}.hg19.bedGraph /hive/data/genomes/hg19/chrom.sizes ${T}.bw
  wc -l ${T}.hg19.txt ${T}.hg19.bedGraph ${T}.unMapped.bedGraph
  done
  
    # load the bigWig files into tables with SQL-friendly names:
      mkdir /gbdb/hg19/decode
  for C in female female_carrier female_noncarrier \
  	male male_carrier male_noncarrier \
  	sex-averaged sex-averaged_carrier sex-averaged_noncarrier
  do
      N=${C}
      case ${C} in
          female) N="Female" ;;
          female_carrier) N="FemaleCarrier" ;;
          female_noncarrier) N="FemaleNonCarrier" ;;
          male) N="Male" ;;
          male_carrier) N="MaleCarrier" ;;
          male_noncarrier) N="MaleNonCarrier" ;;
          sex-averaged) N="SexAveraged" ;;
          sex-averaged_carrier) N="SexAveragedCarrier" ;;
          sex-averaged_noncarrier) N="SexAveragedNonCarrier" ;;
      esac
      echo $C $N
      rm -f /gbdb/hg19/decode/${C}.bw /gbdb/hg19/decode/${N}.bw
      ln -s `pwd`/${C}.bw /gbdb/hg19/decode/${N}.bw
      hgsql -e "drop table decode${N};" hg19
      hgBbiDbLink hg19 decode${N} /gbdb/hg19/decode/${N}.bw
  done
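
    # spot-check one of the loaded bigWigs (a sketch; bigWigInfo prints
    # summary statistics for a bigWig file):
    bigWigInfo female.bw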
  
      # compute male - female difference
      awk '{printf "%s_%d_%d\t%s\n", $1, $2, $3, $4}' male.hg19.bedGraph \
  	| sort > ordered.male.txt
      awk '{printf "%s_%d_%d\t%s\n", $1, $2, $3, $4}' female.hg19.bedGraph \
  	| sort > ordered.female.txt
      join ordered.male.txt ordered.female.txt > maleFemale.txt
  
      awk '{printf "%s\t%.6f\n", $1, $2-$3}' maleFemale.txt \
  	| sed -e "s/_/\t/g" | sort -k1,1 -k2,2n > maleFemale.bedGraph
      # same result as what was lifted:
      sum maleFemale.hg19.bedGraph maleFemale.bedGraph
      #	14015  7950 maleFemale.hg19.bedGraph
      #	14015  7950 maleFemale.bedGraph
  
  
      # and hot spots
      awk '$4 > 9.99' female.hg19.bedGraph > hotSpotFemale.bed
      awk '$4 > 9.99' male.hg19.bedGraph > hotSpotMale.bed
  
      hgLoadBed hg19 decodeHotSpotFemale hotSpotFemale.bed
      #	Loaded 4135 elements of size 4
      hgLoadBed hg19 decodeHotSpotMale hotSpotMale.bed
      #	Loaded 4771 elements of size 4
  
      bedGraphToBigWig maleFemale.bedGraph /hive/data/genomes/hg19/chrom.sizes \
          MaleFemaleDifference.bw
      ln -s `pwd`/MaleFemaleDifference.bw /gbdb/hg19/decode/
      hgsql -e "drop table decodeMaleFemaleDifference;" hg19
      hgBbiDbLink hg19 decodeMaleFemaleDifference /gbdb/hg19/decode/MaleFemaleDifference.bw
  
  
  #############################################################################
  # DBSNP B134 / SNP134 (DONE 9/1/11)
  # Redmine #5133
  # Originally run 8/30/11; re-run 9/1/11 to incorporate 1000 Genomes frequency data
  # that dbSNP had moved out to a new database table, SNPAlleleFreq_TGP.
      mkdir -p /hive/data/outside/dbSNP/134/human
      cd /hive/data/outside/dbSNP/134/human
      # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
      # to find the subdir name to use as orgDir below (human_9606 in this case).
      # Then click into that directory and look for file names like
      #    b(1[0-9][0-9])_*_([0-9]+_[0-9])
      # -- use the first num for build and the second num_num for buildAssembly.
      # jkStuff/liftContigs.lft maps NCBI contig names to chroms; use that for liftUp.
      #
      # Some trial and error was required to get the config.ra just right -- assembly
      # label now has ".p2" at end, and GRCh37 patch contigs needed to be filtered out:
      cat > config.ra <<EOF
  db hg19
  orgDir human_9606
  build 134
  buildAssembly 37_2
  liftUp /hive/data/genomes/hg19/jkStuff/liftContigs.lft
  refAssemblyLabel GRCh37.p2
  ignoreDbSnpContigs NW_0033159[0-9][0-9]
  EOF
      # << emacs
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
  
  
  #############################################################################
  # FILTER SNP134 (DONE 9/2/11 angie)
      # Redmine #5133
      # Make several tracks that are filtered subsets of snp134:
      # First, filter out the multiply-aligned and/or weight >1 SNPs -> snp134Mult
      # Second, siphon off the common variants -> snp134Common
    # Third, take the (uniquely mapped, not shown to be common) variants
    # w/dbSNP's "clinically-assoc" flag -> snp134Flagged
      cd /hive/data/outside/dbSNP/134/human
      zcat snp134.bed.gz \
      | perl -we \
        '$minTotal2N = 10; \
         ($multCount, $comCount, $flagCount, $miscCount) = (0,0,0,0); \
         open($mult, "| gzip -c > snp134Mult.bed.gz") || die; \
         open($common,    "| gzip -c > snp134Common.bed.gz") || die; \
         open($flagged,   "| gzip -c > snp134Flagged.bed.gz") || die; \
         open($misc,      "| gzip -c > snp134Misc.bed.gz") || die; \
         while (<>) { \
           @w = split("\t"); \
           if ($w[16] > 1 || $w[17] =~ /MultipleAlignments/) { \
             print $mult $_; \
             $multCount++; \
           } else { \
             my ($alleleFreqCount, $nStr, $freqStr) = ($w[20], $w[22], $w[23]); \
             my @alNs = split(",", $nStr);      die unless scalar(@alNs) == $alleleFreqCount; \
             my @freqs = split(",", $freqStr);  die unless scalar(@freqs) == $alleleFreqCount; \
             my ($total2N, $maxAlleleFreq) = (0, 0); \
             for (my $i = 0;  $i < $alleleFreqCount;  $i++) { \
               $total2N += $alNs[$i]; \
               $maxAlleleFreq = $freqs[$i] if ($freqs[$i] > $maxAlleleFreq); \
             } \
             if ($alleleFreqCount >= 2 && $total2N >= $minTotal2N && $maxAlleleFreq <= 0.99) { \
               print $common $_; \
               $comCount++; \
             } elsif($w[24] =~ /clinically-assoc/)  { \
               print $flagged $_; \
               $flagCount++; \
             } else { \
               print $misc $_; \
               $miscCount++; \
             } \
           } \
         } \
         close($mult);  close($common); close($flagged);  close($misc); \
         print "snp134Mult:    $multCount\nsnp134Common:  $comCount\nsnp134Flagged: $flagCount\n" . \
               "leftover:      $miscCount\n";'
  #snp134Mult:    3603177
  #snp134Common:  13413905
  #snp134Flagged: 26496
  #leftover:      26910747
  
      # Load tables
      foreach subset (Mult Common Flagged)
        hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \
          hg19 snp134$subset -sqlTable=snp134.sql snp134$subset.bed.gz
      end
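
    # verify row counts against the filter's summary above (a sketch):
    foreach subset (Mult Common Flagged)
      hgsql hg19 -N -e "select count(*) from snp134$subset"
    end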
  
  
  #############################################################################
  # SNP134 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 9/2/11 angie)
      mkdir /hive/data/genomes/hg19/bed/snp134Ortho
      cd /hive/data/genomes/hg19/bed/snp134Ortho
    # Filter snp134 to keep only uniquely mapped biallelic SNVs (class=single, length=1):
      zcat /hive/data/outside/dbSNP/134/human/snp134.bed.gz \
      | awk '$18 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
      | sort -u \
        > snp134ExcludeIds.txt
      wc -l snp134ExcludeIds.txt
  #1178007 snp134ExcludeIds.txt
      zcat /hive/data/outside/dbSNP/134/human/snp134.bed.gz \
      | awk '$3-$2 == 1 && $11 == "single" {print;}' \
      | grep -vFwf snp134ExcludeIds.txt \
        > snp134Simple.bed
      wc -l snp134Simple.bed
  #32818637 snp134Simple.bed
  
      # Glom all human info that we need for the final table onto the
      # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
      awk 'BEGIN{OFS="\t";} \
          {print $1, $2, $3, \
                 $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
                 0, $6;}' \
        snp134Simple.bed > snp134ForLiftOver.bed
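
    # For example (hypothetical values), a snp134Simple.bed row such as
    #   chr1  10233  10234  rs12345  0  +  C  C  C/T  ...
    # becomes this liftOver input row:
    #   chr1  10233  10234  rs12345|chr1|10233|10234|C/T|C|+  0  +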
  
      # Map coords to chimp using liftOver.
      mkdir run.liftOChimp
      cd run.liftOChimp
      mkdir split out
      splitFile ../snp134ForLiftOver.bed 10000 split/chunk
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro3.over.chain.gz \
          \{check out exists out/panTro3.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      ssh swarm
      cd /hive/data/genomes/hg19/bed/snp134Ortho/run.liftOChimp
      para make jobList
  #Completed: 3282 of 3282 jobs
  #CPU time in finished jobs:     314951s    5249.18m    87.49h    3.65d  0.010 y
  #IO & Wait Time:                 32669s     544.49m     9.07h    0.38d  0.001 y
  #Average job time:                 106s       1.77m     0.03h    0.00d
  #Longest finished job:             268s       4.47m     0.07h    0.00d
  #Submission to last job:           444s       7.40m     0.12h    0.01d
  
      # Map coords to orangutan using liftOver.
      mkdir ../run.liftOPon
      cd ../run.liftOPon
      mkdir out
      ln -s ../run.liftOChimp/split .
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
          \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      para make jobList
  #Completed: 3282 of 3282 jobs
  #CPU time in finished jobs:     681601s   11360.02m   189.33h    7.89d  0.022 y
  #IO & Wait Time:                 57733s     962.21m    16.04h    0.67d  0.002 y
  #Average job time:                 225s       3.75m     0.06h    0.00d
  #Longest finished job:             586s       9.77m     0.16h    0.01d
  #Submission to last job:          1598s      26.63m     0.44h    0.02d
  
      # Map coords to macaque using liftOver.
      mkdir ../run.liftOMac
      cd ../run.liftOMac
      mkdir out
      ln -s ../run.liftOChimp/split .
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
          \{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      para make jobList
  #Completed: 3282 of 3282 jobs
  #CPU time in finished jobs:     826108s   13768.47m   229.47h    9.56d  0.026 y
  #IO & Wait Time:                 68165s    1136.08m    18.93h    0.79d  0.002 y
  #Average job time:                 272s       4.54m     0.08h    0.00d
  #Longest finished job:             679s      11.32m     0.19h    0.01d
  #Submission to last job:          1775s      29.58m     0.49h    0.02d
  
      cd /hive/data/genomes/hg19/bed/snp134Ortho
      # Concatenate the chimp results, sorting by chimp pos in order to
      # efficiently access 2bit sequence in getOrthoSeq.  The output of
      # that is then sorted by the glommed human info field, so that we
      # can use join to combine chimp and macaque results in the next step.
      # Ditto for macaque and orangutan.  Each command pipe takes ~6 minutes:
      sort -k1,1 -k2n,2n run.liftOChimp/out/panTro3.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro3/panTro3.2bit \
      | sort > panTro3.orthoGlom.txt
      sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
      | sort > ponAbe2.orthoGlom.txt
      sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
      | sort > rheMac2.orthoGlom.txt
      wc -l panTro3.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
  #30880910 panTro3.orthoGlom.txt
  #29376791 ponAbe2.orthoGlom.txt
  #26505681 rheMac2.orthoGlom.txt
  
    # Use the glommed name field as a key to join up chimp, orangutan and
    # macaque allele data.  Include glommed name from both files because if
    # only file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
    # in the orthoGlom fields from each file, which are in the same order
    # as the chimp, orangutan and macaque columns of snp134OrthoPt3Pa2Rm2.
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        panTro3.orthoGlom.txt ponAbe2.orthoGlom.txt \
      | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
              else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
        > tmp.txt
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        tmp.txt rheMac2.orthoGlom.txt \
      | perl -wpe 'chomp; \
          ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
          $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
          ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
            split(/\|/, $glomKey); \
          $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
          $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
          print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                           $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
          s/^.*$//;' \
      | sort -k1,1 -k2n,2n > snp134OrthoPt3Pa2Rm2.bed
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
        -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
        hg19 snp134OrthoPt3Pa2Rm2 snp134OrthoPt3Pa2Rm2.bed
  #Loaded 31924973 elements of size 22
      # Cleanup:
      rm -r run*/split tmp.txt *.orthoGlom.txt bed.tab
      gzip snp134Simple.bed snp134ExcludeIds.txt snp134ForLiftOver.bed &
  
  
  ############################################################################
  # DBSNP CODING ANNOTATIONS (134) (DONE 8/30/11 angie)
  # It wasn't necessary to redo this following the 9/1 re-run of doDbSnp.pl because
  # that simply picked up new allele frequency info, no change to exceptions etc.
      cd /hive/data/outside/dbSNP/134/human
      # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed.
      # For anything except an insertion (0 bases between flanks),
      # we need to add 1 to the end coord.  For an insertion, we need
      # to add 1 to the start coord.  Make a hash of the insertion IDs,
      # then look up each ID in ncbiFuncAnnotations.txt to tell which
      # transform to apply.
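    # Worked example: a single-base substitution at NCBI start=100, end=100
    # (fully closed) becomes half-open 100..101 (end+1); an insertion between
    # two flanking bases, NCBI start=100, end=101, becomes the zero-length
    # insertion point 101..101 (start+1).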
      # Note: sort -u with the keys below is too restrictive -- we need full line uniq.
      zcat ncbiFuncAnnotations.txt.gz \
      | perl -we 'open($IDS, "zcat ncbiFuncInsertions.ctg.bed.gz |") || die "ids: $!"; \
                while (<$IDS>) { chomp; $ids{$_} = 1; } \
                close($IDS); \
                %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 44=>1, 45=>1); \
                while (<>) { \
                  chomp;  @w = split("\t"); # id, ctg, start, end, ... \
                  next unless $coding{$w[5]}; \
                  $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \
                  if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \
                    $w[2]++; # 2-base insertions: increment start coord \
                  } else { \
                    $w[3]++; # increment end coord to get half-open \
                  } \
                  print join("\t", @w) . "\n"; \
                }' \
      | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \
      | uniq \
        > ncbiCodingAnnotations.txt
      wc -l ncbiCodingAnnotations.txt
  #2510704 ncbiCodingAnnotations.txt
      # How many & what kinds of function types?
      cut -f 6 ncbiCodingAnnotations.txt \
      | sort -n | uniq -c
  # 461567 3   (coding-synon)
  #1244159 8   (cds-reference -- ignored)
  #  21296 41  (nonsense)
  # 729942 42  (missense)
  #  52778 44  (frameshift)
  #    962 45  (cds-indel)
      # Gather up multiple annotation lines into one line per {snp, gene, frame}:
      perl -e  'while (<>) { chomp; \
                  my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \
                  if (defined $lastRs && \
                      ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \
                       $lastTx ne $txId || $lastFrm ne $frm)) { \
                    if (defined $refRow) { \
                      $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                      $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
                    } \
                    print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                          "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
                    $refRow = undef;  @rows = ();  ($count, $fxns, $nts, $codons, $aas) = (); \
                  } \
                  ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \
                      ($rsId, $ctg, $s, $e, $txId, $frm); \
                  $count++; \
                  if ($fxn == 8) { \
                    $refRow = [$fxn, $nt, $aa, $codon]; \
                  } else { \
                   $fxns .= "$fxn,";  $nts .= "$nt,";  $aas .= "$aa,";  $codons .= "$codon,"; \
                  } \
                } \
                if (defined $refRow) { \
                  $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                  $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
                } \
                print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                      "$count\t$fxns\t$nts\t$codons\t$aas\n";' \
        ncbiCodingAnnotations.txt \
      | liftUp snp134CodingDbSnp.bed /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
      hgLoadBed hg19 snp134CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
        -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
        snp134CodingDbSnp.bed
  #Loaded 1244179 elements of size 11
  
  
  ############################################################################
  # SNPMASKED SEQUENCE FOR SNP134 (DONE 8/30/11 angie)
  # It wasn't necessary to redo this following the 9/1 re-run of doDbSnp.pl because
  # that simply picked up new allele frequency info, no change to exceptions etc.
      mkdir /hive/data/genomes/hg19/snp134Mask
      cd /hive/data/genomes/hg19/snp134Mask
      # Identify rsIds with various problems -- we will exclude those.
      zcat /hive/data/outside/dbSNP/134/human/snp134.bed.gz \
      | awk '$18 ~ /MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved/ {print $4;}' \
        | sort -u \
        > snp134ExcludeRsIds.txt
      zcat /hive/data/outside/dbSNP/134/human/snp134.bed.gz \
      | grep -vFwf snp134ExcludeRsIds.txt \
        > snp134Cleaned.bed
      wc -l snp134Cleaned.bed
  #37853186 snp134Cleaned.bed
  
      # Substitutions:
      mkdir substitutions
      snpMaskSingle snp134Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout diffObserved.txt \
      | faSplit byname stdin substitutions/
  #Masked 32668329 snps in 32666100 out of 3131050506 genomic bases
  #/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3131050506 (difference is 6110758)
      # warnings about differing observed strings at same base position:
      wc -l diffObserved.txt
  #2545 diffObserved.txt
      # Check that 6110758 is the total #bases in sequences with nothing in snp134Cleaned:
      grep -Fw single snp134Cleaned.bed | cut -f 1 | uniq > /data/tmp/1
      grep -vwf /data/tmp/1 ../chrom.sizes \
      | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
  #6110758
      # Make sure that sizes are identical, first diffs are normal -> IUPAC,
      # and first diffs' case is preserved:
      foreach f (substitutions/chr*.fa)
        faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ"
      end
  #chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 10233 (y != c)
  #chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 60522 (K != T)
  #...
  #(output OK -- ambiguous bases replacing [agct] at SNP positions)
      foreach f (substitutions/chr*.fa)
        echo $f:t:r
        mv $f $f:r.subst.fa
        gzip $f:r.subst.fa &
      end
  
      # Insertions & deletions not done.  To date we have only offered substs for download.
      # If there is user demand, use template from snp131 above.
  
      # Clean up and prepare for download:
      gzip snp134Cleaned.bed &
      foreach d (substitutions)
        pushd $d
          md5sum *.gz > md5sum.txt
          cp /hive/data/genomes/hg19/snp132Mask/$d/README.txt .
        popd
      end
      # Edit the README.txt.
  
      # Create download links on hgwdev.
      mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp134Mask
      ln -s /hive/data/genomes/hg19/snp134Mask/substitutions/* \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp134Mask/
  
  
  #############################################################################
  # LASTZ X. tropicalis XenTro3 (DONE - 2011-09-20 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzXenTro3.2011-09-20
      cd /hive/data/genomes/hg19/bed/lastzXenTro3.2011-09-20
  
      cat << '_EOF_' > DEF
  # human vs X. tropicalis
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Frog xenTro3
  SEQ2_DIR=/scratch/data/xenTro3/xenTro3.2bit
  SEQ2_LEN=/scratch/data/xenTro3/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=100
  
  BASE=/hive/data/genomes/hg19/bed/lastzXenTro3.2011-09-20
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    395m51.626s
      cat fb.hg19.chainXenTro3Link.txt
      #	87928753 bases of 2897316137 (3.035%) in intersection
      cd /hive/data/genomes/hg19/bed
      ln -s lastzXenTro3.2011-09-20 lastz.xenTro3
  
      #	running the swap - DONE - 2011-09-21
      mkdir /hive/data/genomes/xenTro3/bed/blastz.hg19.swap
      cd /hive/data/genomes/xenTro3/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzXenTro3.2011-09-20/DEF \
  	-chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  	-swap > swap.log 2>&1 &
      #	real    37m7.001s
      cat fb.xenTro3.chainHg19Link.txt
      #	90929066 bases of 1358334882 (6.694%) in intersection
  
  ############################################################################
  # GENEREVIEWS TRACK (DONE 2014-04-09 - Chin)
# geneReviews data is located at ftp://ftp.ncbi.nlm.nih.gov/pub/GeneReviews/,
# which is updated weekly
      mkdir /hive/data/genomes/hg19/bed/geneReviews
      mkdir -p /hive/data/outside/ncbi/geneReviews/current
      cd /hive/data/outside/ncbi/geneReviews/current
      wget --timestamping ftp://ftp.ncbi.nih.gov/pub/GeneReviews/*.*
      chmod 660 *.txt
      wc -l *.txt
      #  606 GRtitle_shortname_NBKid.txt
      # 2792 NBKid_shortname_OMIM.txt
      # 1337 NBKid_shortname_genesymbol.txt
  
      cp -p NBKid_shortname_genesymbol.txt /hive/data/genomes/hg19/bed/geneReviews/.
      cp -p NBKid_shortname_OMIM.txt /hive/data/genomes/hg19/bed/geneReviews/.
      cp -p GRtitle_shortname_NBKid.txt /hive/data/genomes/hg19/bed/geneReviews/.
  
      cd /hive/data/genomes/hg19/bed/geneReviews
  
      cat NBKid_shortname_genesymbol.txt | grep -v "^#" \
       | awk  '{FS="\t"} {OFS="\t"} {if ($3!="Not applicable") \
        print $3,$2,$1}' | sort -k1 > geneReviewsGrshortNBKid.tab
      # Create geneReviewsGrshortNBKid table
      cat << '_EOF_' > $HOME/kent/src/hg/lib/geneReviewsGrshortNBKid.sql
  CREATE TABLE geneReviewsGrshortNBKid (
      geneSymbol  varchar(255) not null,   # refSeq gene symbol
      grShort     varchar(255) not null,   # short name for GeneReviews article
      NBKid       varchar(255) not null,   # NCBI book ID of the review article
      index (geneSymbol)
  );
  '_EOF_'
      # << happy emacs
    # load the RefSeq gene to GeneReviews article mapping list into hg19
      hgLoadSqlTab -warn hg19 geneReviewsGrshortNBKid \
        $HOME/kent/src/hg/lib/geneReviewsGrshortNBKid.sql geneReviewsGrshortNBKid.tab
  
    # Generate a list of refSeq genes that have a GeneReviews article associated with them.
      cat  geneReviewsGrshortNBKid.tab | awk -F'\t' '{printf  "%s\n", $1}' \
         | sort  | uniq  > grRefGene.lst
      wc -l *.*
      #  606 GRtitle_shortname_NBKid.txt
      # 1337 NBKid_shortname_genesymbol.txt
      # 1324 geneReviewsGrshortNBKid.tab
      # 1126 grRefGene.lst
  
    # Note: assigning FS inside the main awk action block only takes effect
    # after the first record has already been split with the default
    # (whitespace) FS, so the header line is parsed incorrectly.  Work around
    # it by printing every line and then filtering out the header with grep.
      cat GRtitle_shortname_NBKid.txt | awk  '{FS="\t"} {OFS="\t"}  {print $1,$2,$3}' |  grep -v "^#" | sort -k1 > geneReviewsGrshortTitleNBKid.tab
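    # A cleaner alternative (a sketch, not what was run) sets FS/OFS in a
    # BEGIN block, so even the first line is split on tabs, and skips the
    # header inside awk:
    #   awk 'BEGIN{FS=OFS="\t"} /^#/ {next} {print $1,$2,$3}' \
    #     GRtitle_shortname_NBKid.txt | sort -k1 > geneReviewsGrshortTitleNBKid.tab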
      # Create and load geneReviewsGrshortTitleNBKid
      cat << '_EOF_' > $HOME/kent/src/hg/lib/geneReviewsGrshortTitleNBKid.sql
  CREATE TABLE geneReviewsGrshortTitleNBKid (
      grShort     varchar(255) not null,   # short name for GeneReviews article
      grTitle     varchar(255) not null,   # full geneReviews article name
      NBKid       varchar(255) not null,   # NCBI book ID of the review article
      index (grShort)
  );
  '_EOF_'
  
      hgLoadSqlTab -warn hg19 geneReviewsGrshortTitleNBKid \
        $HOME/kent/src/hg/lib/geneReviewsGrshortTitleNBKid.sql geneReviewsGrshortTitleNBKid.tab
  
      cat << '_EOF_' > createGeneReviewsTab.sh
#!/bin/sh
    # for each gene symbol in grRefGene.lst, create non-overlapping bed rows.
      cat grRefGene.lst | while read G
        do
          echo ${G}
          hgsql hg19 -N -e \
            "SELECT e.chrom,e.txStart,e.txEnd,j.geneSymbol \
            FROM ensGene e, kgXref j WHERE e.alignID = j.kgID AND \
            j.geneSymbol ='${G}' ORDER BY e.chrom,e.txStart;" > temp.in
          bedRemoveOverlap temp.in temp.out
          cat temp.out >> geneReviews.tab
        done
      rm temp.*
  '_EOF_'
  # << happy emacs
      chmod +x createGeneReviewsTab.sh
      ./createGeneReviewsTab.sh
  
      wc -l *.tab
      #  1165 bed.tab
      #  1165 geneReviews.tab
      #  1324 geneReviewsGrshortNBKid.tab
      #   605 geneReviewsGrshortTitleNBKid.tab
  
      # load the collapsed bed4 file to hg19,
      hgLoadBed hg19 geneReviews geneReviews.tab
  
      # Create and load geneReviewsDetail table
    # This new table replaces the old geneReviewsRefGene table, which had a
    # diseaseID column.
      cat << '_EOF_' > $HOME/kent/src/hg/lib/geneReviewsDetail.sql
  CREATE TABLE geneReviewsDetail (
      geneSymbol  varchar(255) not null,   # refSeq gene symbol
      grShort     varchar(255) not null,   # short name for geneReviews article
    NBKid       varchar(255) not null,   # NCBI book ID of the GeneReviews article
      grTitle     varchar(255) not null,   # full geneReviews article name
      index (geneSymbol)
  );
  '_EOF_'
  # << happy emacs
  
      hgsql hg19 -N -e \
        "SELECT  s.geneSymbol, s.grShort, t.NBKid, t.grTitle \
            FROM geneReviewsGrshortNBKid s, geneReviewsGrshortTitleNBKid t \
            WHERE s.grShort = t.grShort ORDER BY s.geneSymbol;" > geneReviewsDetail.tab
  
      hgLoadSqlTab -warn hg19 geneReviewsDetail \
        $HOME/kent/src/hg/lib/geneReviewsDetail.sql geneReviewsDetail.tab
  
      # Check in the following to git:
      # $HOME/kent/src/hg/lib/geneReviewsGrshortNBKid.sql
      # $HOME/kent/src/hg/lib/geneReviewsGrshortTitleNBKid.sql
      # $HOME/kent/src/hg/lib/geneReviewsDetail.sql
      # Update the human/trackDb.ra and human/geneReviews.html
  
  
  
  ##############################################################################
  # hgPal downloads redone for new ensGene (re-DONE 2012-01-06 braney)
  #   FASTA from 46way for  ensGene, ensCanonical
  
      ssh hgwdev
      screen
      bash
  #    rm -rf /cluster/data/hg19/bed/multiz46way/pal
  #    mkdir /cluster/data/hg19/bed/multiz46way/pal
      cd /cluster/data/hg19/bed/multiz46way/pal
      for i in `cat ../species.list`; do echo $i; done > order.lst
  
      mz=multiz46way
      gp=ensGene
      db=hg19
      mkdir exonAA exonNuc ppredAA ppredNuc
      for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
      do
  	echo "date"
  	echo "mafGene -chrom=$j  $db $mz $gp order.lst stdout | \
  	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
  	echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
  	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
  	echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
  	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
  	echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
  	    gzip -c > exonAA/$j.exonAA.fa.gz"
      done > $gp.$mz.jobs
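
    # Each chrom contributes a "date" line plus four mafGene lines to the job
    # file, e.g. for chrM:
    #   mafGene -chrom=chrM hg19 multiz46way ensGene order.lst stdout | gzip -c > ppredAA/chrM.ppredAA.fa.gz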
  
      time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
      sleep 1
      tail -f $gp.$mz.job.log
  
  # real    205m16.105s
  # user    36m32.438s
  # sys     6m12.046s
  
      mz=multiz46way
      gp=ensGene
      db=hg19
  
      zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
      zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
      zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
      zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
  
      rm -rf exonAA exonNuc ppredAA ppredNuc
  
      mz=multiz46way
      gp=ensGene
      db=hg19
      pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
      mkdir -p $pd
      ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
      ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
  
      # now do the canonical set
      cd /cluster/data/hg19/bed/multiz46way/pal
      mz=multiz46way
      gp=ensCanonical
      db=hg19
      for j in `awk '{print $1}' /cluster/data/hg19/chrom.sizes`
      do
  	echo "select chrom, chromStart, chromEnd, transcript from ensCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.ens.bed
      done
  
      mkdir exonAA exonNuc ppredAA ppredNuc
      for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
      do
  	echo "date"
  	echo "mafGene -geneBeds=$j.ens.bed  $db $mz ensGene order.lst stdout | \
  	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
  	echo "mafGene -geneBeds=$j.ens.bed -noTrans $db $mz ensGene order.lst stdout | \
  	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
  	echo "mafGene -geneBeds=$j.ens.bed -exons -noTrans $db $mz ensGene order.lst stdout | \
  	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
  	echo "mafGene -geneBeds=$j.ens.bed -exons $db $mz ensGene order.lst stdout | \
  	    gzip -c > exonAA/$j.exonAA.fa.gz"
      done > $gp.$mz.jobs
  
      time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
      sleep 1
      tail -f $gp.$mz.job.log
  
  # real    166m1.220s
  # user    13m35.246s
  # sys     2m50.683s
  
      rm *.ens.bed
      mz=multiz46way
      gp=ensCanonical
      db=hg19
      zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
      zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
      zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
      zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
  
      rm -rf exonAA exonNuc ppredAA ppredNuc
  
      mz=multiz46way
      gp=ensCanonical
      db=hg19
      pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
      mkdir -p $pd
      ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
      ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
  ############################################################################
  # UPDATE COSMIC TRACK (DONE 2011-10-11 Fan)
  
  mkdir /hive/data/outside/cosmic/20111011
# put the raw data file, UCSCMutExp_v55_090911.csv (received by email), there.
  
  mkdir /hive/data/genomes/hg19/bed/cosmic/20111011
  cd /hive/data/genomes/hg19/bed/cosmic/20111011
  
  cp -p /hive/data/outside/cosmic/20111011/UCSCMutExp_v55_090911.csv .
  
  cat UCSCMutExp_v55_090911.csv|sed -e 's/\t//g' |sed -e 's/,/\t/g' |\
  grep -v COSMIC_MUTATION_ID |grep -v 'selected'|grep COSM >UCSCMutExp_v55_090911.tab
  
  hgsql hg19 -e 'drop table cosmicRaw'
  hgsql hg19 < ~/kent/src/hg/lib/cosmicRaw.sql
  
  hgLoadSqlTab hg19 cosmicRaw ~/kent/src/hg/lib/cosmicRaw.sql UCSCMutExp_v55_090911.tab
  
# use grch37_start-1 for our zero-based chromStart and
# convert their chr23 and chr24 to chrX and chrY.
  
  hgsql hg19 -N -e 'select "chr", chromosome, grch37_start-1, grch37_stop, cosmic_mutation_id from cosmicRaw' \
  |grep -v NULL |sed -e 's/chr\t/chr/'|sort -u|sed -e 's/chr23/chrX/' |sed -e 's/chr24/chrY/' >cosmic.bed
  
  hgLoadBed -allowStartEqualEnd  hg19 cosmic cosmic.bed
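
# Sanity check (a sketch): no chr23/chr24 names or negative starts should
# survive the transforms above, so this should print nothing:
awk -F'\t' '$1=="chr23" || $1=="chr24" || $2<0' cosmic.bed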
  
  #############################################################################
  # LASTZ Gorilla GorGor3 (DONE - 2011-10-17 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzGorGor3.2011-10-17
      cd /hive/data/genomes/hg19/bed/lastzGorGor3.2011-10-17
  
      cat << '_EOF_' > DEF
  # human vs gorilla
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  BLASTZ_O=600
  BLASTZ_E=150
  # other parameters on advice from Webb
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
# QUERY: Gorilla gorGor3
  SEQ2_DIR=/scratch/data/gorGor3/gorGor3.2bit
  SEQ2_LEN=/scratch/data/gorGor3/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=100
  
  BASE=/hive/data/genomes/hg19/bed/lastzGorGor3.2011-10-17
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    159m46.598s
      cat fb.hg19.chainGorGor3Link.txt
      #	2603997992 bases of 2897316137 (89.876%) in intersection
  
      cd /hive/data/genomes/hg19/bed
      ln -s lastzGorGor3.2011-10-17 lastz.gorGor3
  
      # better to have reciprocal best for this one since it is low coverage:
      cd /hive/data/genomes/hg19/bed/lastzGorGor3.2011-10-17
      time doRecipBest.pl hg19 gorGor3 -buildDir=`pwd` -workhorse=hgwdev \
  	> best.log 2>&1 &
      #   real     166m43.489s
  
      #	running the swap - DONE - 2011-09-21
      mkdir /hive/data/genomes/gorGor3/bed/blastz.hg19.swap
      cd /hive/data/genomes/gorGor3/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzGorGor3.2011-10-17/DEF \
  	-swap -syntenicNet \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> swap.log 2>&1 &
      #	real    69m39.685s
      cat fb.gorGor3.chainHg19Link.txt
      #	2571797450 bases of 2822760080 (91.109%) in intersection
  
  ############################################################################
  # ISCA FROM DBVAR (DONE 5/21/12 angie)
# Originally done 3/02/12; updated 5/21/12 angie
      # Redmine: Track #34 (dbVar for human)
      set today = `date +%Y_%m_%d`
      mkdir /hive/data/genomes/hg19/bed/isca/$today
      cd /hive/data/genomes/hg19/bed/isca/$today
      # Get variants submitted on this assembly, and variants remapped from other assemblies.
      wget ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd37_ISCA/gvf/nstd37_ISCA.GRCh37.submitted.all.germline.ucsc.gvf.gz
      wget ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd37_ISCA/gvf/nstd37_ISCA.GRCh37.remap.all.germline.ucsc.gvf.gz
      # New 5/21/12: ISCA Curated
      wget ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd45_ISCA_curated_dataset/gvf/nstd45_ISCA_curated_dataset.GRCh37.remap.all.germline.ucsc.gvf.gz
      zcat nstd37_ISCA*.gvf.gz \
      | ~/kent/src/hg/utils/automation/gvfToBed8Attrs.pl \
        > isca.bed
      zcat nstd45_ISCA*.gvf.gz \
      | ~/kent/src/hg/utils/automation/gvfToBed8Attrs.pl \
        > iscaCurated.bed
      wc -l isca*.bed
  #   12943 isca.bed
  #      84 iscaCurated.bed
      # Split into subtracks by clinical_int value.
      zcat nstd37_ISCA*.gvf.gz \
      | grep ssv | sed -e 's/.*clinical_int=//; s/;.*//;' | sort | uniq -c
  #   4307 Benign
  #   4600 Pathogenic
  #   3406 Uncertain significance
  #    466 Uncertain significance: likely benign
  #    164 Uncertain significance: likely pathogenic
      zcat nstd45_ISCA*.gvf.gz \
      | grep ssv | sed -e 's/.*clinical_int=//; s/;.*//;' | sort | uniq -c
  #     29 Benign
  #     55 Pathogenic
      foreach subtrack (Benign Pathogenic)
        grep -w  $subtrack isca.bed > isca$subtrack.bed
        hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \
          -allowStartEqualEnd hg19 isca$subtrack isca$subtrack.bed
        grep -w  $subtrack iscaCurated.bed > iscaCurated$subtrack.bed
        hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \
          -allowStartEqualEnd hg19 iscaCurated$subtrack iscaCurated$subtrack.bed
      end
  #Read 4307 elements of size 11 from iscaBenign.bed
  #Read 29 elements of size 11 from iscaCuratedBenign.bed
  #Read 4600 elements of size 11 from iscaPathogenic.bed
  #Read 55 elements of size 11 from iscaCuratedPathogenic.bed
  
      # The subcategories of Uncertain need a bit more sophisticated treatment:
      set subtrack = Uncertain
      grep -w $subtrack isca.bed \
      | grep -vi 'Uncertain Significance: likely' \
        > isca$subtrack.bed
      hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \
        -allowStartEqualEnd hg19 isca$subtrack isca$subtrack.bed
  #Read 3406 elements of size 11 from iscaUncertain.bed
  
      foreach unc (benign pathogenic)
        set subtrack = Likely`perl -we 'print ucfirst("'$unc'");'`
        grep -wi "Uncertain Significance: likely $unc" isca.bed \
          > isca$subtrack.bed
        hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \
          -allowStartEqualEnd hg19 isca$subtrack isca$subtrack.bed
      end
  #Read 466 elements of size 11 from iscaLikelyBenign.bed
  #Read 164 elements of size 11 from iscaLikelyPathogenic.bed
  
  ## more for this track below v ##
  
  ############################################################################
  # ISCA AGGREGATE PATHOGENIC TRACKS (DONE 2012-05-21 angie)
  # First done 2012-02-08 by b0b; updated 2012-03-03 by b0b.
  
      # files of ISCA Pathogenic Gain and Loss were fetched in the previous section --
      # use same dir.
      set today = `date +%Y_%m_%d`
      cd /hive/data/genomes/hg19/bed/isca/$today
  
      # make bedGraphs
      hgsql -N -e "SELECT chrom, chromStart, chromEnd FROM iscaPathogenic \
          WHERE attrVals LIKE '%number_gain%'" hg19 | sort \
      | bedItemOverlapCount hg19 stdin > iscaPathGain.bedGraph
  
      hgsql -N -e "SELECT chrom, chromStart, chromEnd FROM iscaPathogenic \
          WHERE attrVals LIKE '%number_loss%'" hg19 | sort \
      | bedItemOverlapCount hg19 stdin > iscaPathLoss.bedGraph
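
    # bedItemOverlapCount reports per-base item coverage as bedGraph runs, e.g.
    # (hypothetical values) gains at chr1:100-200 and chr1:150-250 would yield:
    #   chr1  100  150  1
    #   chr1  150  200  2
    #   chr1  200  250  1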
  
      # load tables
      hgLoadBed -bedGraph=4 hg19 iscaPathGainCum iscaPathGain.bedGraph
  #Read 2001 elements of size 4 from iscaPathGain.bedGraph
  
      hgLoadBed -bedGraph=4 hg19 iscaPathLossCum iscaPathLoss.bedGraph
  #Read 3567 elements of size 4 from iscaPathLoss.bedGraph
  
      # End of track build instructions; historical notes follow.
  
    # trackDb (these values from original load, not update)
    # get 2 stdDev value to set default maxHeightPixels
    # use median + 2 SD -- would the mean be better?
      # use same viewLimit for both:  average the two
      # note that chr21 (Down's) is overrepresented in the dataset
      ave  hg19.iscaPathGain.bedGraph -col=4
  #    median 9.000000
  #    average 20.970921
  #    max 105.000000
  #    standard deviation 25.652712
    # median + 2 SD = 9 + 2*25.65 ~= 60
  
      ave  hg19.iscaPathLoss.bedGraph -col=4
  #    median 6.000000
  #    average 15.998146
  #    max 171.000000
  #    standard deviation 23.526095
    # median + 2 SD = 6 + 2*23.53 ~= 53
  
    # move some settings down to existing subtracks:
         type gvf
         noScoreFilter .
    # add settings to parent track:
        type bed
        noInherit on
    # set these two new tracks:
        release alpha
        type bedGraph 4
        maxHeightPixels 100:57:16
      viewLimits 0:60 (halfway between 2 SD and max)
        alwaysZero on
        color 0,0,200 (Gain)
        color 200,0,0 (Loss)
    #  new html page using override in trackDb/human/hg19/trackDb.ra
  
  
  ##############################################################################
  # ISCA AGGREGATE BENIGN TRACKS (DONE 2015-11-18 jcasper)
  
      cd /hive/data/outside/otto/isca/2015-09-16/
      for db in hg19 hg38; do
  
      # make bedGraphs
    hgsql -N -e "SELECT chrom, chromStart, chromEnd FROM iscaBenign \
        WHERE attrVals LIKE '%number_gain%'" $db | sort \
    | bedItemOverlapCount $db stdin > $db/iscaBenignGain.bedGraph
  
    hgsql -N -e "SELECT chrom, chromStart, chromEnd FROM iscaBenign \
        WHERE attrVals LIKE '%number_loss%'" $db | sort \
    | bedItemOverlapCount $db stdin > $db/iscaBenignLoss.bedGraph
  
    # load tables
    hgLoadBed -bedGraph=4 $db iscaBenignGainCum $db/iscaBenignGain.bedGraph
    hgLoadBed -bedGraph=4 $db iscaBenignLossCum $db/iscaBenignLoss.bedGraph
      done
      # hg19
      # Read 1889 elements of size 4 from iscaBenignGain.bedGraph
      # Read 1252 elements of size 4 from iscaBenignLoss.bedGraph
      # hg38
      # Read 1861 elements of size 4 from iscaBenignGain.bedGraph
      # Read 1265 elements of size 4 from iscaBenignLoss.bedGraph
  
    # Plus changes made to the scripts in kent/src/hg/utils/otto/isca/
    # to ensure that these tables are re-built and tested with the rest
  
  ############################################################################
  # LINCRNAS FROM BROAD (DONE 2011-10-10 Chin)
  # Human lincRNA Catalog
  
    # unzip data from Broad to /hive/data/outside/lincRnaFromCabili
      mkdir /hive/data/genomes/hg19/bed/lincRnaFromCabili
      cd /hive/data/genomes/hg19/bed/lincRnaFromCabili
      cp /hive/data/outside/lincRnaFromCabili/Cabili_etal_BodyMaplincRNAs.key.txt .
      cp /hive/data/outside/lincRnaFromCabili/Cabili_etal_description.doc .
      cp /hive/data/outside/lincRnaFromCabili/Cabili_etal_BodyMapLincRNAs.bed .
      cp /hive/data/outside/lincRnaFromCabili/lincRNAs_transcripts.gtf .
  
      # Load data for lincRNAsTranscripts track	
      cd /hive/data/genomes/hg19/bed/lincRnaFromCabili
      ldHgGene -gtf hg19 lincRNAsTranscripts lincRNAs_transcripts.gtf
      # Read 21630 transcripts in 67096 lines in 1 files
      #   21630 groups 43 seqs 7 sources 1 feature types
      # 21630 gene predictions
  
      cat << '_EOF_' > createExpDataByCellType.pl
  #!/usr/bin/perl
# Create expData tables from a microarray bed15 file
  # for further analysis
  # usage: ./createExpDataByCellType.pl <bed15File>
  use strict;
  use warnings;
  my $line;
  my @bF;
  my $i;
  my $outFname;
  my $name;
  my $expCount;
  my $expIds;
  my $expScores;
  my @expId;
  my @expScore;
  my $score;
  my $tScore;
  my $fScore;
  my $log2 = log(2);
  my $sLog2;
  my $tLog2;
  # Define the 22 cell type array
  my @cellType = ("Adipose","Adrenal","Brain","Breast","Colon","Heart","Kidney",
                "Liver","Lung","LymphNode","Ovary","Prostate","SkeletalMuscle",
                "WhiteBloodCell","Testes","Thyroid","Testes_R","Brain_R",
                "Placenta_R","Foreskin_R","hLF_r2","hLF_r1");
# Read in the microarray (bed15) file
# Assume the number of experiments and the number of scores agree
  my $argc=scalar(@ARGV);
if ($argc < 1)
   {
      print "usage: ./createExpDataByCellType.pl <bed15File>\n";
      exit(1);
    }
  my $fName = $ARGV[0];
  my $outFName;
  # Loop thru each cell type
  for($i = 0; $i < scalar(@cellType); $i++) {
    open(FHIN, $fName) or die "Can not open $fName";
    $outFName= "lincRNAsCT" . $cellType[$i] . "\.tab";
    open(FHOUT, ">$outFName") or die "Can not open $outFName";
    while ($line = <FHIN>) {
       chomp($line);
       @bF = split('\t', $line);
       printf(FHOUT "%s\t%s\t%s\t%s\t",
                    $bF[0],$bF[1],$bF[2],$bF[3]);
       $bF[14] =~ s/,$//;
       $expScores = $bF[14];
       @expScore = split(",",$expScores);
       # Process the expRatio
       $tLog2 = log($expScore[$i] + 0.5)/$log2;
       $sLog2 = sprintf("%.3f",$tLog2);
     # scale sLog2 linearly from [-1 .. 4] onto the score range [0 .. 1000],
     # clamping at 1000
     $tScore = ($sLog2 + 1) * (1000/5);
     if ($tScore >= 1000) {
         $fScore = 1000;
       } else {
         $fScore = sprintf("%3d",$tScore);
       }
       printf(FHOUT "%s\t%s\t%s\n",$fScore, $expScore[$i],$sLog2);
    } # end while
    close FHIN;
    close FHOUT;
  } # for loop
  '_EOF_'
      # << happy emacs
      chmod +x createExpDataByCellType.pl
      ./createExpDataByCellType.pl Cabili_etal_BodyMapLincRNAs.bed
  
      cat << '_EOF_' > deleteCTTables.sh
  #!/bin/sh
  #
  cellType=(Adipose Adrenal Brain Breast Colon Heart Kidney
                Liver Lung LymphNode Ovary Prostate SkeletalMuscle
                WhiteBloodCell Testes Thyroid Testes_R Brain_R
                Placenta_R Foreskin_R hLF_r2 hLF_r1)
  for c in "${cellType[@]}"
  do
      echo Processing lincRNAs$c
      hgsql  hg19 -e "DROP TABLE IF EXISTS  lincRNAsCT$c;"
  done
  '_EOF_'
      # << happy emacs
      chmod +x deleteCTTables.sh
      ./deleteCTTables.sh
  
      cat << '_EOF_' > lincRNAsCTTemp.sql
  CREATE TABLE lincRNAsCTTemp (
      chrom varchar(255) not null,        # Human chromosome or FPC contig
      chromStart int unsigned not null,   # Start position in chromosome
      chromEnd int unsigned not null,     # End position in chromosome
      name varchar(255) not null,         # Name of item
      score int unsigned not null,        # Score from 0-1000
      rawScore float not null,            # Raw Signal Score
      log2RawScore float not null         # log2 of raw score
  );
  '_EOF_'
      # << happy emacs
  
      cat << '_EOF_' > loadLincRNAsAllCellType.sh
  #!/bin/sh
  #
  cellType=(Adipose Adrenal Brain Breast Colon Heart Kidney
                Liver Lung LymphNode Ovary Prostate SkeletalMuscle
                WhiteBloodCell Testes Thyroid Testes_R Brain_R
                Placenta_R Foreskin_R hLF_r2 hLF_r1)
  for c in "${cellType[@]}"
  do
      echo Processing lincRNAsCT$c.tab
      hgLoadBed -tab -sqlTable=lincRNAsCTTemp.sql hg19 lincRNAsCTTemp lincRNAsCT$c.tab
      hgsql hg19 -e "RENAME TABLE lincRNAsCTTemp to lincRNAsCT$c"
  done
  '_EOF_'
      # << happy emacs
      chmod +x loadLincRNAsAllCellType.sh
      ./loadLincRNAsAllCellType.sh
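
    # Spot-check a few of the loaded tables (a sketch): each lincRNAsCT* table
    # should have one row per line of Cabili_etal_BodyMapLincRNAs.bed:
    wc -l Cabili_etal_BodyMapLincRNAs.bed
    foreach c (Adipose Brain Testes)
      hgsql hg19 -N -e "SELECT COUNT(*) FROM lincRNAsCT$c"
    end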
  
  #############################################################################
  # LASTZ Gibbon NomLeu1 (DONE - 2011-11-04 - Chin)
  
      mkdir /hive/data/genomes/hg19/bed/lastzNomLeu1.2011-11-04
      cd /hive/data/genomes/hg19/bed/lastzNomLeu1.2011-11-04
  
      cat << '_EOF_' > DEF
  # human vs gibbon
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  BLASTZ_O=600
  BLASTZ_E=150
  # other parameters on advice from Webb
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Gibbon nomLeu1
  SEQ2_DIR=/scratch/data/nomLeu1/nomLeu1.2bit
  SEQ2_LEN=/scratch/data/nomLeu1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=100
  
  BASE=/hive/data/genomes/hg19/bed/lastzNomLeu1.2011-11-04
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #   establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -syntenicNet \
          -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          > do.log 2>&1 &
      #   real    724m15s
      cat fb.hg19.chainNomLeu1Link.txt
      #   2543943556 bases of 2897316137 (87.803%) in intersection
  
      cd /hive/data/genomes/hg19/bed
      ln -s lastzNomLeu1.2011-11-04 lastz.nomLeu1
  
      #   running the swap - DONE - 2011-11-08
      mkdir /hive/data/genomes/nomLeu1/bed/blastz.hg19.swap
      cd /hive/data/genomes/nomLeu1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzNomLeu1.2011-11-04/DEF \
          -swap -syntenicNet \
          -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          > swap.log 2>&1 &
      #   real     69m27s
      cat fb.nomLeu1.chainHg19Link.txt
      #   2480558770 bases of 2756591777 (89.986%) in intersection
  
  
  #############################################################################
  # DBSNP B135 / SNP135 (DONE 8/19/13)
  # originally done 11/9/11; updated 8/19/13 to fix allele frequencies (#11544)
  # Redmine #5170
      mkdir -p /hive/data/outside/dbSNP/135/human
      cd /hive/data/outside/dbSNP/135/human
      # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
      # to find the subdir name to use as orgDir below (human_9606 in this case).
      # Then click into that directory and look for file names like
      #    b(1[0-9][0-9])_*_([0-9]+_[0-9])
      # -- use the first num for build and the second num_num for buildAssembly.
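    # For example, b135_SNPContigLoc_37_3 yields build=135, buildAssembly=37_3.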
      # jkStuff/liftContigs.lft maps NCBI contig names to chroms; use that for liftUp.
      #
      # Some trial and error was required to get the config.ra just right -- assembly
      # label now has ".p5" at end despite buildAssembly being 37_3, and more GRCh37
      # patch contigs needed to be filtered out:
      cat > config.ra <<EOF
  db hg19
  orgDir human_9606
  build 135
  buildAssembly 37_3
  liftUp /hive/data/genomes/hg19/jkStuff/liftContigs.lft
  refAssemblyLabel GRCh37.p5
  ignoreDbSnpContigs NW_003(3159[0-9][0-9]|5710[3-6][0-9])
  EOF
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
      # Sent dbSNP some emails about conditions causing lots of SNPs in snp135Errors.bed.gz
      # due to inconsistent locType vs. coords.  2.3M SNPs have new exception SingleAlleleFreq,
      # and 474k SNPs have new exception InconsistentAlleles (allele freqs vs observed).
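    # To tally exception types (a sketch; exceptions are the comma-separated
    # 18th field of snp135.bed):
    #   zcat snp135.bed.gz | cut -f 18 | tr ',' '\n' | sort | uniq -c | sort -rn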
  
      # 8/19/13: rebuilding after fix #11544 -- many allele frequencies were combined
      # incorrectly due to stranded data (but not strand flag) in SNPAlleleFreq and
      # positive-strand-only data in SNPAlleleFreq_TGP.
      cd /hive/data/outside/dbSNP/135/human
      mkdir preAlleleFreqFix
      mv ucscAlleleFreq.txt.gz snp135*.bed.gz preAlleleFreqFix/
      # Run with -debug to regenerate addToDbSnp script:
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra \
        -continue addToDbSnp -stop addToDbSnp -debug
      # Now re-run the updated snpAddTGPAlleleFreq.pl command:
      grep snpAdd addToDbSnp.csh
      ~/kent/src/hg/utils/automation/snpAddTGPAlleleFreq.pl hg19snp135 \
        -contigLoc=b135_SNPContigLoc_37_3 > ucscAlleleFreq.txt
      # Reload the ucscAlleleFreq table:
      hgLoadSqlTab hg19snp135 ucscAlleleFreq{,.sql,.txt} >>& do.log & tail -f do.log
      # Redo the big join:
      mkdir -p `cat workingDir`
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue bigJoin -stop bigJoin \
        >>& do.log & tail -f do.log
      # Manually re-run snpNcbiToUcsc (the translate step includes fasta file concat
      # and indexing, which would waste a lot of time):
      set tmpDir = `cat /hive/data/outside/dbSNP/135/human/workingDir`
      cd $tmpDir
      echo 'select * from par' | hgsql hg19 -NB > par.bed
      snpNcbiToUcsc -snp132Ext -par=par.bed ucscNcbiSnp.bed /hive/data/genomes/hg19/hg19.2bit snp135 \
        >>& /hive/data/outside/dbSNP/135/human/do.log &
      tail -f /hive/data/outside/dbSNP/135/human/do.log
      head snp135Errors.bed >>& /hive/data/outside/dbSNP/135/human/do.log &
      tail -f /hive/data/outside/dbSNP/135/human/do.log
      wc -l snp135* >>& /hive/data/outside/dbSNP/135/human/do.log &
      tail -f /hive/data/outside/dbSNP/135/human/do.log
      gzip *.txt *.bed *.tab
      cp -p * /hive/data/outside/dbSNP/135/human/
      cd /hive/data/outside/dbSNP/135/human
      rm $tmpDir/*
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue load \
        >>& do.log & tail -f do.log
  
  
  #############################################################################
  # FILTER SNP135 (DONE 8/19/13 angie)
  # originally done 11/14/11; rebuilt 8/19/13 to fix allele frequencies (#11544)
      # Redmine #5170
      # Make several tracks that are filtered subsets of snp135:
      # First, filter out the multiply-aligned and/or weight >1 SNPs -> snp135Mult
      # Second, siphon off the common variants -> snp135Common
    # Third, take the (uniquely mapped, not known to be common) variants
      # w/dbSNP's "clinically-assoc" flag -> snp135Flagged
      cd /hive/data/outside/dbSNP/135/human
      zcat snp135.bed.gz \
      | perl -we \
        '$minTotal2N = 10; \
         ($multCount, $comCount, $flagCount, $miscCount) = (0,0,0,0); \
         open($mult, "| gzip -c > snp135Mult.bed.gz") || die; \
         open($common,    "| gzip -c > snp135Common.bed.gz") || die; \
         open($flagged,   "| gzip -c > snp135Flagged.bed.gz") || die; \
         open($misc,      "| gzip -c > snp135Misc.bed.gz") || die; \
         while (<>) { \
           @w = split("\t"); \
           if ($w[16] > 1 || $w[17] =~ /MultipleAlignments/) { \
             print $mult $_; \
             $multCount++; \
           } else { \
             my ($alleleFreqCount, $nStr, $freqStr) = ($w[20], $w[22], $w[23]); \
             my @alNs = split(",", $nStr);      die unless scalar(@alNs) == $alleleFreqCount; \
             my @freqs = split(",", $freqStr);  die unless scalar(@freqs) == $alleleFreqCount; \
             my ($total2N, $maxAlleleFreq) = (0, 0); \
             for (my $i = 0;  $i < $alleleFreqCount;  $i++) { \
               $total2N += $alNs[$i]; \
               $maxAlleleFreq = $freqs[$i] if ($freqs[$i] > $maxAlleleFreq); \
             } \
             if ($alleleFreqCount >= 2 && $total2N >= $minTotal2N && $maxAlleleFreq <= 0.99) { \
               print $common $_; \
               $comCount++; \
             } elsif($w[24] =~ /clinically-assoc/)  { \
               print $flagged $_; \
               $flagCount++; \
             } else { \
               print $misc $_; \
               $miscCount++; \
             } \
           } \
         } \
         close($mult);  close($common); close($flagged);  close($misc); \
         print "snp135Mult:    $multCount\nsnp135Common:  $comCount\nsnp135Flagged: $flagCount\n" . \
               "leftover:      $miscCount\n";'
  #snp135Mult:    3538479
  #snp135Common:  11504891
  #snp135Flagged: 32187
  #leftover:      39722367
      # Compare to counts from 11/14/11, before fixing allele freqs:
  #snp135Mult:    3538479
  #snp135Common:  11525489
  #snp135Flagged: 32077
  #leftover:      39116035
      # So 110 SNPs were initially left out of snp135Flagged
  
      # Load tables
      foreach subset (Mult Common Flagged)
        hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \
          hg19 snp135$subset -sqlTable=snp135.sql snp135$subset.bed.gz
      end
  
  
  #############################################################################
  # SNP135 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 11/14/11 angie)
      mkdir /hive/data/genomes/hg19/bed/snp135Ortho
      cd /hive/data/genomes/hg19/bed/snp135Ortho
    # Filter snp135 to keep only uniquely mapped biallelic SNVs (class=single, length=1):
      zcat /hive/data/outside/dbSNP/135/human/snp135.bed.gz \
      | awk '$18 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
      | sort -u \
        > snp135ExcludeIds.txt
      wc -l snp135ExcludeIds.txt
  #1297409 snp135ExcludeIds.txt
    zcat /hive/data/outside/dbSNP/135/human/snp135.bed.gz \
    | awk '$3-$2 == 1 && $11 == "single" {print;}' \
    | grep -vFwf snp135ExcludeIds.txt \
      > snp135Simple.bed
    # NOTE FOR NEXT TIME: pipe this output straight into the glomming awk
    # command below -- the 7G snp135Simple.bed intermediate isn't needed.
      wc -l snp135Simple.bed
  #44228667 snp135Simple.bed
  
      # Glom all human info that we need for the final table onto the
      # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
      awk 'BEGIN{OFS="\t";} \
          {print $1, $2, $3, \
                 $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
                 0, $6;}' \
        snp135Simple.bed > snp135ForLiftOver.bed
  
      # Map coords to chimp using liftOver.
      mkdir run.liftOChimp
      cd run.liftOChimp
      mkdir split out
      splitFile ../snp135ForLiftOver.bed 10000 split/chunk
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro3.over.chain.gz \
          \{check out exists out/panTro3.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      ssh swarm
      cd /hive/data/genomes/hg19/bed/snp135Ortho/run.liftOChimp
      para make jobList
  #Completed: 4423 of 4423 jobs
  #CPU time in finished jobs:     430555s    7175.92m   119.60h    4.98d  0.014 y
  #IO & Wait Time:                 45877s     764.61m    12.74h    0.53d  0.001 y
  #Average job time:                 108s       1.80m     0.03h    0.00d
  #Longest finished job:             262s       4.37m     0.07h    0.00d
  #Submission to last job:           542s       9.03m     0.15h    0.01d
  
      # Map coords to orangutan using liftOver.
      mkdir ../run.liftOPon
      cd ../run.liftOPon
      mkdir out
      ln -s ../run.liftOChimp/split .
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
          \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      para make jobList
  #Completed: 4423 of 4423 jobs
  #CPU time in finished jobs:     591884s    9864.74m   164.41h    6.85d  0.019 y
  #IO & Wait Time:                 55485s     924.74m    15.41h    0.64d  0.002 y
  #Average job time:                 146s       2.44m     0.04h    0.00d
  #Longest finished job:             380s       6.33m     0.11h    0.00d
  #Submission to last job:          1403s      23.38m     0.39h    0.02d
  
      # Map coords to macaque using liftOver.
      mkdir ../run.liftOMac
      cd ../run.liftOMac
      mkdir out
      ln -s ../run.liftOChimp/split .
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
          \{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      para make jobList
  #Completed: 4423 of 4423 jobs
  #CPU time in finished jobs:    1097552s   18292.53m   304.88h   12.70d  0.035 y
  #IO & Wait Time:                 91301s    1521.69m    25.36h    1.06d  0.003 y
  #Average job time:                 269s       4.48m     0.07h    0.00d
  #Longest finished job:             697s      11.62m     0.19h    0.01d
  #Submission to last job:          1555s      25.92m     0.43h    0.02d
  
      cd /hive/data/genomes/hg19/bed/snp135Ortho
      # Concatenate the chimp results, sorting by chimp pos in order to
      # efficiently access 2bit sequence in getOrthoSeq.  The output of
      # that is then sorted by the glommed human info field, so that we
      # can use join to combine chimp and macaque results in the next step.
      # Ditto for macaque and orangutan.  Each command pipe takes ~6 minutes:
      sort -k1,1 -k2n,2n run.liftOChimp/out/panTro3.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro3/panTro3.2bit \
      | sort > panTro3.orthoGlom.txt
      sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
      | sort > ponAbe2.orthoGlom.txt
      sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
      | sort > rheMac2.orthoGlom.txt
      wc -l panTro3.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
  #41804363 panTro3.orthoGlom.txt
  #39856046 ponAbe2.orthoGlom.txt
  #35918623 rheMac2.orthoGlom.txt
  
    # Use the glommed name field as a key to join up chimp, orangutan and
    # macaque allele data.  Include glommed name from both files because if
    # only file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
    # in the orthoGlom fields from each file, which are in the same order
    # as the chimp, orangutan and macaque columns of snp135OrthoPt3Pa2Rm2.
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        panTro3.orthoGlom.txt ponAbe2.orthoGlom.txt \
      | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
              else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
        > tmp.txt
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        tmp.txt rheMac2.orthoGlom.txt \
      | perl -wpe 'chomp; \
          ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
          $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
          ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
            split(/\|/, $glomKey); \
          $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
          $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
          print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                           $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
          s/^.*$//;' \
      | sort -k1,1 -k2n,2n > snp135OrthoPt3Pa2Rm2.bed
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
        -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
        hg19 snp135OrthoPt3Pa2Rm2 snp135OrthoPt3Pa2Rm2.bed
  #Loaded 43184090 elements of size 22
      # Cleanup:
      rm -r run*/split tmp.txt *.orthoGlom.txt snp135Simple.bed
      gzip snp135ExcludeIds.txt snp135ForLiftOver.bed &
  
  
  ############################################################################
  # DBSNP CODING ANNOTATIONS (135) (DONE 11/14/11 angie)
# It wasn't necessary to redo this following the 8/19/13 re-run of doDbSnp.pl because
# that simply fixed allele frequency info, no change to exceptions etc.
      cd /hive/data/outside/dbSNP/135/human
      # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed.
      # For anything except an insertion (0 bases between flanks),
      # we need to add 1 to the end coord.  For an insertion, we need
      # to add 1 to the start coord.  Make a hash of the insertion IDs,
      # then look up each ID in ncbiFuncAnnotations.txt to tell which
      # transform to apply.
      # Note: sort -u with the keys below is too restrictive -- we need full line uniq.
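    # Toy illustration of the coordinate transform (made-up coordinates, not
    # real annotations): an insertion arrives as its two flanking bases
    # (start, start+1) and becomes a zero-length point; anything else gets its
    # end bumped to make the interval half-open:
    printf 'ins\t100\t101\nsub\t100\t100\n' \
    | awk -F'\t' -v OFS='\t' '{ if ($1 == "ins") $2++; else $3++; print }'
#ins    101     101
#sub    100     101
    # (The real command below cannot key on a label like "ins", hence the
    # hash of insertion IDs built from ncbiFuncInsertions.ctg.bed.gz.)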
      zcat ncbiFuncAnnotations.txt.gz \
      | perl -we 'open($IDS, "zcat ncbiFuncInsertions.ctg.bed.gz |") || die "ids: $!"; \
                while (<$IDS>) { chomp; $ids{$_} = 1; } \
                close($IDS); \
                %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 44=>1, 45=>1); \
                while (<>) { \
                  chomp;  @w = split("\t"); # id, ctg, start, end, ... \
                  next unless $coding{$w[5]}; \
                  $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \
                  if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \
                    $w[2]++; # 2-base insertions: increment start coord \
                  } else { \
                    $w[3]++; # increment end coord to get half-open \
                  } \
                  print join("\t", @w) . "\n"; \
                }' \
      | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \
      | uniq \
        > ncbiCodingAnnotations.txt
      wc -l ncbiCodingAnnotations.txt
  #2803490 ncbiCodingAnnotations.txt
      # How many & what kinds of function types?
      cut -f 6 ncbiCodingAnnotations.txt \
      | sort -n | uniq -c
  # 512390 3   (coding-synon)
  #1385793 8   (cds-reference -- ignored)
  #  23909 41  (nonsense)
  # 827675 42  (missense)
  #  53703 44  (frameshift)
  #     20 45  (cds-indel)
      # Gather up multiple annotation lines into one line per {snp, gene, frame}:
      perl -e  'while (<>) { chomp; \
                  my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \
                  if (defined $lastRs && \
                      ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \
                       $lastTx ne $txId || $lastFrm ne $frm)) { \
                    if (defined $refRow) { \
                      $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                      $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
                    } \
                    print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                          "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
                    $refRow = undef;  @rows = ();  ($count, $fxns, $nts, $codons, $aas) = (); \
                  } \
                  ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \
                      ($rsId, $ctg, $s, $e, $txId, $frm); \
                  $count++; \
                  if ($fxn == 8) { \
                    $refRow = [$fxn, $nt, $aa, $codon]; \
                  } else { \
                   $fxns .= "$fxn,";  $nts .= "$nt,";  $aas .= "$aa,";  $codons .= "$codon,"; \
                  } \
                } \
                if (defined $refRow) { \
                  $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                  $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
                } \
                print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                      "$count\t$fxns\t$nts\t$codons\t$aas\n";' \
        ncbiCodingAnnotations.txt \
      | liftUp snp135CodingDbSnp.bed /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
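    # Shape of that reduction, on made-up rows: two ncbiCodingAnnotations
    # lines for the same {snp, transcript, frame},
    #   rs1  ctg1  100  100  NM_000001.1  8   1  G  V  GTG   (reference, fxn 8)
    #   rs1  ctg1  100  100  NM_000001.1  42  1  A  M  ATG   (missense)
    # collapse to one output row with count=2 and comma-separated lists,
    # reference values first: fxns="8,42," nts="G,A," codons="GTG,ATG," etc.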
      hgLoadBed hg19 snp135CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
        -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
        snp135CodingDbSnp.bed
  #Loaded 1385812 elements of size 11
  
  
  ############################################################################
  # SNPMASKED SEQUENCE FOR SNP135 (DONE 11/14/11 angie)
      mkdir /hive/data/genomes/hg19/snp135Mask
      cd /hive/data/genomes/hg19/snp135Mask
      # Identify rsIds with various problems -- we will exclude those.
      zcat /hive/data/outside/dbSNP/135/human/snp135.bed.gz \
      | awk '$18 ~ /MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved/ {print $4;}' \
        | sort -u \
        > snp135ExcludeRsIds.txt
      zcat /hive/data/outside/dbSNP/135/human/snp135.bed.gz \
      | grep -vFwf snp135ExcludeRsIds.txt \
        > snp135Cleaned.bed
      wc -l snp135Cleaned.bed
  #49922101 snp135Cleaned.bed
  
      # Substitutions:
      mkdir substitutions
      snpMaskSingle snp135Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout diffObserved.txt \
      | faSplit byname stdin substitutions/
  #Masked 44283699 snps in 44281659 out of 3131050506 genomic bases
  #/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3131050506 (difference is 6110758)
      # warnings about differing observed strings at same base position:
      wc -l diffObserved.txt
  #3661 diffObserved.txt
  #TODO: send list to dbSNP.
      # Check that 6110758 is the total #bases in sequences with nothing in snp135Cleaned:
      grep -Fw single snp135Cleaned.bed | cut -f 1 | uniq > /data/tmp/1
      grep -vwf /data/tmp/1 ../chrom.sizes \
      | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
  #6110758
      # Make sure that sizes are identical, first diffs are normal -> IUPAC,
      # and first diffs' case is preserved:
      foreach f (substitutions/chr*.fa)
        faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ"
      end
  #chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 10233 (y != c)
  #chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 60522 (K != T)
  #...
  #(output OK -- ambiguous bases replacing [agct] at SNP positions)
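    # (IUPAC decoding of the diffs above: y/Y = C/T, k/K = G/T; case follows
    # the masking of the original base.)  To double-check the original
    # reference base at a reported position, something like this should work
    # (illustrative, not from the original run):
    twoBitToFa -seq=chr1 -start=10232 -end=10233 /hive/data/genomes/hg19/hg19.2bit stdout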
      foreach f (substitutions/chr*.fa)
        echo $f:t:r
        mv $f $f:r.subst.fa
        gzip $f:r.subst.fa &
      end
  
    # Insertions & deletions not done.  To date we have only offered
    # substitutions for download.
      # If there is user demand, use template from snp131 above.
  
      # Clean up and prepare for download:
      gzip snp135Cleaned.bed &
      foreach d (substitutions)
        pushd $d
          md5sum *.gz > md5sum.txt
          cp /hive/data/genomes/hg19/snp132Mask/$d/README.txt .
        popd
      end
      # Edit the README.txt.
  
      # Create download links on hgwdev.
      mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp135Mask
      ln -s /hive/data/genomes/hg19/snp135Mask/substitutions/* \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp135Mask/
  
  
  #############################################################################
  # SIB Transcriptome (DONE 2011-12-02 Chin)
  
      # Create working directory and download data from where Christian
      # Iseli (Christian.Iseli at licr.org) put it, and unpack.
      mkdir -p /hive/data/outside/lirc
      cd /hive/data/outside/lirc
      wget --timestamping ftp://ftp.licr.org/pub/hg19/HTr.gtf.gz
      wget --timestamping ftp://ftp.licr.org/pub/hg19/txg.tar.gz
  
      cd /hive/data/genomes/hg19/bed/
      mkdir sibTranscriptome
      cd sibTranscriptome
      tar -zxvf /hive/data/outside/lirc/txg.tar.gz
      cp /hive/data/outside/lirc/HTr.gtf.gz .
  
      zcat HTr.gtf.gz | ldHgGene hg19 sibGene stdin
      # Reading stdin
      # Read 195300 transcripts in 2564421 lines in 1 files
      # 195300 groups 25 seqs 1 sources 2 feature types
      # 195300 gene predictions
  
      # Do a little data cleanup and transformation and load splice graphs
      # into database.
      sed 's/altGraphX/sibTxGraph/' ~/kent/src/hg/lib/altGraphX.sql > sibTxGraph.sql
      cat txg/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb \
        -sqlTable=sibTxGraph.sql hg19 sibTxGraph stdin
      # Reading stdin
      # Loaded 46973 elements of size 18
      # Sorted
      # Creating table definition for sibTxGraph
      # Saving bed.tab
      # Loading hg19
  
      # Create sibAltEvents track for analysed alt-splices.
      # Not on RR for hg18 and hg19, so do not push it out
    cat txg/*.txg | txgAnalyze stdin /cluster/data/hg19/hg19.2bit sibAltEvents.bed
    awk '$2 >= 0' sibAltEvents.bed | sort | uniq > foo.bed
    hgLoadBed hg19 sibAltEvents foo.bed
      # Reading foo.bed
      # Loaded 431590 elements of size 6
      # Sorted
      # Creating table definition for sibAltEvents
      # Saving bed.tab
      # Loading hg19
  
  
      # push sibGene and sibTxGraph for hg19
  
  
  ############################################################################
  # HGNC: Hugo Gene Nomenclature Committee (DONE 2012-05-20 cline)
  
  mkdir /hive/data/outside/hgnc
  cd /hive/data/outside/hgnc
  mkdir 052012
  cd 052012
  wget -O hgnc.txt "http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&preset=all&status=Approved&status=Entry+Withdrawn&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag"
  
  tail -n +2 hgnc.txt |grep -v withdrawn \
   | hgLoadSqlTab hg19 hgnc ~/kent/src/hg/lib/hgnc.sql stdin
  #########################################################################
  # LASTZ Cow BosTau7 (DONE - 2012-01-23 - Chin)
      mkdir /hive/data/genomes/hg19/bed/lastzBosTau7.2012-01-23
      cd /hive/data/genomes/hg19/bed/lastzBosTau7.2012-01-23
  
      cat << '_EOF_' > DEF
  # human vs cow
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Cow bosTau7
  SEQ2_DIR=/scratch/data/bosTau7/bosTau7.2bit
  SEQ2_LEN=/scratch/data/bosTau7/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  
  
  BASE=/hive/data/genomes/hg19/bed/lastzBosTau7.2012-01-23
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -syntenicNet \
          -noLoadChainSplit \
          -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
          -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
      # real    433m45.814s
      cat fb.hg19.chainBosTau7Link.txt
      # 1360887008 bases of 2897316137 (46.971%) in intersection
      # Create link
      cd /hive/data/genomes/hg19/bed
      ln -s  lastzBosTau7.2012-01-23 lastz.bosTau7
  
      #   running the swap
      mkdir /hive/data/genomes/bosTau7/bed/blastz.hg19.swap
      cd /hive/data/genomes/bosTau7/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzBosTau7.2012-01-23/DEF \
          -swap  -syntenicNet \
          -noLoadChainSplit \
          -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
          -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
      #   real     95m9.611s
      cat fb.bosTau7.chainHg19Link.txt
      #   1388551419 bases of 2804673174 (49.508%) in intersection
      cd /hive/data/genomes/bosTau7/bed
      ln -s blastz.hg19.swap lastz.hg19
  ############################################################################
  # UPDATE COSMIC TRACK - v57 (DONE 2012-01-26 larrym)
  
  # Table stats before
  
  hgsql hg19 -s -e 'select count(*) from cosmicRaw'
  55579
  hgsql hg19 -s -e 'select count(*) from cosmic'
  49087
  
  ~/kent/src/hg/utils/automation/loadCosmic.pl hg19 ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v57_180112.csv
  
  # Table stats after
  
  hgsql hg19 -s -e 'select count(*) from cosmicRaw'
  78152
  hgsql hg19 -s -e 'select count(*) from cosmic'
  71437
  
Here are some IDs that were added in this release (and not present on RR):
  
  COSM143850
  COSM143824
  COSM143823
  COSM143802
  COSM143801
  
  ############################################################################
  # POLYA-SEQ TRACK (from Adnan Derti, Merck) (LOADED, Andy 2012-01-30)
  
  # Fan unpacked the .zip a while back
  cd /hive/data/genomes/hg19/bed/polyA
  # make links with more UCSCish naming
  mkdir /hive/data/genomes/{hg18,hg19,canFam2,mm9,rn4,rheMac2}/bed/polyASeq
  mkdir -p /gbdb/{hg18,hg19,canFam2,mm9,rn4,rheMac2}/bbi
  for set in `ls -1 | grep -v orig | grep -v table`; do
    suff=`echo $set | sed 's/^[a-z]\+_//'`;
    db=`echo $suff | sed 's/\_.*//'`;
    tiss=`echo $suff | sed 's/.\+\_//; s/^[a-z]/\U&/; s/-[a-z]/\U&/; s/-//'`;
    Db=`echo $db | sed 's/^[a-z]/\U&/'`;
    printf "%s\t%s\t%s\t%s\t%s\n" $set $db $Db $tiss $suff
  done > table.info
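# Sanity check of the name-mangling seds above, on a hypothetical set name
# (not one of the real directories):
echo human_hg19_brain-cortex | sed 's/^[a-z]\+_//'
#hg19_brain-cortex
echo hg19_brain-cortex | sed 's/\_.*//'
#hg19
echo hg19_brain-cortex | sed 's/.\+\_//; s/^[a-z]/\U&/; s/-[a-z]/\U&/; s/-//'
#BrainCortex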
  for set in `ls -1 | grep -v orig | grep -v table`; do
    suff=`echo $set | sed 's/^[a-z]\+_//'`;
    db=`echo $suff | sed 's/\_.*//'`;
    tiss=`echo $suff | sed 's/.\+\_//; s/^[a-z]/\U&/; s/-[a-z]/\U&/; s/-//'`;
    Db=`echo $db | sed 's/^[a-z]/\U&/'`;
    fwdTable=polyASeqSites${tiss}Fwd
    revTable=polyASeqSites${tiss}Rev
    fwdBg=/hive/data/genomes/${db}/bed/polyASeq/${fwdTable}.bedGraph
    revBg=/hive/data/genomes/${db}/bed/polyASeq/${revTable}.bedGraph
    fwdBw=${fwdBg%.bedGraph}.bw
    revBw=${revBg%.bedGraph}.bw
    tail -n +2 ${set}/polyaseq_sites_fwd_strand.bedgraph | sort -k1,1 -k2,2n > $fwdBg;
    tail -n +2 ${set}/polyaseq_sites_rev_strand.bedgraph | sort -k1,1 -k2,2n > $revBg;
    bedGraphToBigWig $fwdBg /hive/data/genomes/${db}/chrom.sizes $fwdBw
    bedGraphToBigWig $revBg /hive/data/genomes/${db}/chrom.sizes $revBw
    ln -s $fwdBw /gbdb/${db}/bbi/
    ln -s $revBw /gbdb/${db}/bbi/
    hgBbiDbLink $db $fwdTable /gbdb/${db}/bbi/${fwdTable}.bw
    hgBbiDbLink $db $revTable /gbdb/${db}/bbi/${revTable}.bw
  done
# silly loop to take care of the majority of the trackDb entries;
# the rest were copied and pasted by hand
  cat table.info | while read -a line; do
    db=${line[1]}
    for Strand in Fwd Rev; do
       strand=`echo $Strand | tr [:upper:] [:lower:]`
       bg=${line[0]}/polyaseq_sites_${strand}_strand.bedgraph
       tiss=${line[3]}
       table=polyASeqSites${tiss}${Strand}
       bw=/gbdb/${db}/bbi/${table}.bw
       min=`bigWigInfo $bw | grep "^min" | sed 's/min: //'`
       max=`bigWigInfo $bw | grep "^max" | sed 's/max: //'`
       echo "        track "$table
       echo "        parent polyASeqSitesSignalView"
       echo "        subGroups view=Signal tissType="$tiss" strand="$strand
       echo "        shortLabel PolyA-Seq "$tiss
       echo "        longLabel Poly(A)-tail sequencing of "$tiss" from Merck ("$Strand" strand)"
       if [ $strand = "fwd" ]; then
       echo "        color 153,51,51"
       else
       echo "        color 0,0,0"
       fi
       echo "        type bigWig "$min" "$max
       echo
    done >> ${db}.ra
  done
  
  ##############################################################################
  # LASTZ MOUSE Mm10 (DONE - 2012-03-08 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzMm10.2012-03-07
      cd /hive/data/genomes/hg19/bed/lastzMm10.2012-03-07
  
      cat << '_EOF_' > DEF
  # human vs mouse
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
  BLASTZ_ABRIDGE_REPEATS=1
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
# QUERY: Mouse Mm10
  SEQ2_DIR=/scratch/data/mm10/nib
  SEQ2_SMSK=/scratch/data/mm10/notInOthers
  SEQ2_LEN=/scratch/data/mm10/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzMm10.2012-03-07
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -noLoadChainSplit -syntenicNet \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
      #	real    197m23.436s
      cat fb.hg19.chainMm10Link.txt
      #	1021265143 bases of 2897316137 (35.249%) in intersection
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/hg19/bed
      ln -s lastzMm10.2012-03-07 lastz.mm10
  
      #	and the swap
      mkdir /hive/data/genomes/mm10/bed/blastz.hg19.swap
      cd /hive/data/genomes/mm10/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzMm10.2012-03-07/DEF \
  	-swap -noLoadChainSplit -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
      #	real    72m32.794s
      cat fb.mm10.chainHg19Link.txt
      #	1014045890 bases of 2652783500 (38.226%) in intersection
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/mm10/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
# LIFT ENCODE REGIONS FROM HG18 (DONE, Andy)
  
  echo "select * from encodeRegions" | hgsql hg18 | tail -n +2 \
    | liftOver /dev/stdin /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz /dev/stdout encodeRegions.unmapped \
    | hgLoadBed -noBin hg19 encodeRegions /dev/stdin
  # (all mapped cleanly)
  
  #########################################################################
  ## WINDOWMASKER (DONE - 2012-04-19 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/windowMasker
      cd /hive/data/genomes/hg19/bed/windowMasker
      time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
  	-dbHost=hgwdev hg19 > do.log 2>&1 &
      #   real    225m45.489s
  
      # Masking statistics
      twoBitToFa hg19.wmsk.2bit stdout | faSize stdin
      #   3137161264 bases (239850802 N's 2897310462 real 1828487268 upper
      #   1068823194 lower) in 93 sequences in 1 files
      #   Total size: mean 33732916.8 sd 63483709.4
      #   min 4262 (chr18_gl000207_random) max 249250621 (chr1) median 172294
      #   %34.07 masked total, %36.89 masked real
  
      twoBitToFa hg19.wmsk.sdust.2bit stdout | faSize stdin
      #   3137161264 bases (239850802 N's 2897310462 real 1811306328 upper
      #   1086004134 lower) in 93 sequences in 1 files
      #   Total size: mean 33732916.8 sd 63483709.4
      #   min 4262 (chr18_gl000207_random) max 249250621 (chr1) median 172294
      #   %34.62 masked total, %37.48 masked real
  
      hgLoadBed hg19 windowmaskerSdust windowmasker.sdust.bed.gz
      #   Read 16318719 elements of size 3 from windowmasker.sdust.bed.gz
  
      featureBits -countGaps hg19 windowmaskerSdust
      #	1325854876 bases of 3137161264 (42.263%) in intersection
  
      #	eliminate the gaps from the masking
      featureBits hg19 -not gap -bed=notGap.bed
      #	2897316137 bases of 2897316137 (100.000%) in intersection
      time nice -n +19 featureBits hg19 windowmaskerSdust notGap.bed \
          -bed=stdout | gzip -c > cleanWMask.bed.gz
      #   1086009749 bases of 2897316137 (37.483%) in intersection
      #   real    2m11.261s
  
      #	reload track to get it clean
      hgLoadBed hg19 windowmaskerSdust cleanWMask.bed.gz
      #	Read 16318560 elements of size 4 from cleanWMask.bed.gz
      time featureBits -countGaps hg19 windowmaskerSdust
      #   1086009749 bases of 3137161264 (34.618%) in intersection
      #   real    1m34.044s
  
      #	do *not* need to mask with this clean result since RepeatMasker
      #	does a very good job here.  Using RM masking instead.
  #    zcat cleanWMask.bed.gz \
  #	| twoBitMask ../../hg19.unmasked.2bit stdin \
  #	    -type=.bed hg19.cleanWMSdust.2bit
  #    twoBitToFa hg19.cleanWMSdust.2bit stdout | faSize stdin \
  #        > hg19.cleanWMSdust.faSize.txt
  #    cat hg19.cleanWMSdust.faSize.txt
  
      # how much does this window masker and repeat masker overlap:
      time featureBits -countGaps hg19 rmsk windowmaskerSdust
      #   849334688 bases of 3137161264 (27.073%) in intersection
      #   real    2m4.634s
  
      # RM by itself:
      time featureBits -countGaps hg19 rmsk
      #   1465724774 bases of 3137161264 (46.721%) in intersection
      #   real    0m33.408s
  
  ##########################################################################pubStart
  # Publications track (DONE - 04-27-12 - Max)
  
  # article download and conversion is run every night on hgwdev:
  # 22 22 * * * /hive/data/inside/literature/pubtools/pubCronDailyUpdate.sh
  # the script downloads files into /hive/data/outside/literature/{PubMedCentral,ElsevierConsyn}/
  # then converts them to text into /hive/data/outside/literature/{pmc,elsevier}
  
  # all configuration of the pipeline is in /hive/data/inside/literature/pubtools/lib/pubConf.py
  
  # data processing was run manually like this
  export PATH=/cluster/home/max/bin/x86_64:/cluster/bin/x86_64:/cluster/home/max/software/bin/:/cluster/software/bin:/cluster/home/max/projects/pubtools:/cluster/home/max/bin/x86_64:/hive/groups/recon/local/bin:/usr/local/bin:/usr/bin:/bin:/usr/bin/X11:/cluster/home/max/usr/src/scripts:/cluster/home/max/usr/src/oneshot:/cluster/home/max/bin:/cluster/bin/scripts:.:/cluster/home/max/usr/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/lpp/mmfs/bin/:/opt/dell/srvadmin/bin:/cluster/bin/scripts:/hive/users/hiram/cloud/ec2-api-tools-1.3-51254/bin:/cluster/home/max/bin:/usr/bin/X11:/usr/java/jdk1.6.0_20/bin:/cluster/home/max/bin:/hive/data/inside/literature/pubtools/
  # pmc
  cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
  pubBlat init /hive/data/inside/literature/blat/pmc/ /hive/data/inside/literature/text/pmc
  ssh swarm
  cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
  pubBlat steps:annot-tables
  exit
  pubBlat load
  
  # elsevier
  cd /hive/data/inside/literature/pubtools/runs/elsBlat/
  pubBlat init /hive/data/inside/literature/blat/elsevier/ /hive/data/inside/literature/text/elsevier
  ssh swarm
  cd /hive/data/inside/literature/pubtools/runs/elsBlat/
  pubBlat steps:annot-tables
  exit
  pubBlat load
  #--pubEnd
  
  #############################################################################
  # lifting HapMap recombination maps from hg18 (DONE - 2012-05-09 - Hiram)
      mkdir -p /hive/data/genomes/hg19/bed/hapmap/release24FromHg18
      cd /hive/data/genomes/hg19/bed/hapmap/release24FromHg18
      ln -s /hive/data/genomes/hg18/bed/hapmap/release24/hapMapRelease24CEURecombMap.bedGraph hg18.hapMapRelease24CEURecombMap.bedGraph
      ln -s /hive/data/genomes/hg18/bed/hapmap/release24/hapMapRelease24YRIRecombMap.bedGraph hg18.hapMapRelease24YRIRecombMap.bedGraph
      ln -s /hive/data/genomes/hg18/bed/hapmap/release24/hapMapRelease24CombinedRecombMap.bedGraph hg18.hapMapRelease24CombinedRecombMap.bedGraph
  
      liftOver hg18.hapMapRelease24CEURecombMap.bedGraph \
          /hive/data/genomes/hg18/bed/liftOver/hg18ToHg19.over.chain.gz \
          stdout hapMapRelease24CEURecombMap.unmapped | sort -k1,1 -k2,2n \
          > hapMapRelease24CEURecombMap.bedGraph
  
      liftOver hg18.hapMapRelease24YRIRecombMap.bedGraph \
          /hive/data/genomes/hg18/bed/liftOver/hg18ToHg19.over.chain.gz \
          stdout hapMapRelease24YRIRecombMap.unmapped | sort -k1,1 -k2,2n \
          > hapMapRelease24YRIRecombMap.bedGraph
  
      liftOver hg18.hapMapRelease24CombinedRecombMap.bedGraph \
          /hive/data/genomes/hg18/bed/liftOver/hg18ToHg19.over.chain.gz \
          stdout hapMapRelease24CombinedRecombMap.unmapped | sort -k1,1 -k2,2n \
          > hapMapRelease24CombinedRecombMap.bedGraph
  
  
  for F in hapMapRelease24CEURecombMap hapMapRelease24CombinedRecombMap \
          hapMapRelease24YRIRecombMap
  do
  bedGraphToBigWig -verbose=2 ${F}.bedGraph \
          /hive/data/genomes/hg19/chrom.sizes ${F}.bw > ${F}.log 2>&1
  done
  
  for T in hapMapRelease24CEURecombMap hapMapRelease24CombinedRecombMap \
          hapMapRelease24YRIRecombMap
  do
      rm -f /gbdb/hg19/decode/${T}.bw
      ln -s `pwd`/${T}.bw /gbdb/hg19/decode/${T}.bw
      hgsql -e "drop table ${T};" hg19
      hgBbiDbLink hg19 ${T} /gbdb/hg19/decode/${T}.bw
  done
  
  #############################################################################
  # 1000 GENOMES PHASE 1 VARIANT CALLS (UPDATE DONE 10/9/12 angie)
  # Autosomes and chrX loaded 5/21/12; chrY (and chrM but it's rCRS of course)
  # became available in July '12.  Existing released files were quietly updated
  # 10/1/12 with some new variant IDs.
      # This is a lot of data.  Use aspera (ascp) instead of ftp, run in a screen.
      screen -S phase1
      mkdir -p /hive/data/genomes/hg19/bed/1000Genomes/phase1
      cd /hive/data/genomes/hg19/bed/1000Genomes/phase1
      set ascpCmd = /opt/aspera/connect/bin/ascp
      set ascpArgs = '-i /opt/aspera/connect/etc/asperaweb_id_dsa.putty -QTr -l150M'
      set phase1Path = anonftp@ftp-private.ncbi.nlm.nih.gov:/1000genomes/ftp/phase1/analysis_results/integrated_call_sets
      $ascpCmd $ascpArgs \
        $phase1Path/README.ALL.BI_genome_strip_hq_chrY.20101123 \
        $phase1Path/README_phase1_integrated_call_set_20120621 \
        $phase1Path/integrated_call_samples.20101123.ALL.panel \
        $phase1Path/integrated_call_samples.20101123.ped \
        $phase1Path/uniq.chrY.human.ncbi37.txt \
        .
      # BTW if you see "Error 51 [Destination: Permission denied]" when reloading,
      # it's because the files are read-only -- move aside or rm, then try again.
      foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
        set file = $phase1Path/ALL.chr$c.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.vcf.gz
        set cmd = "$ascpCmd $ascpArgs $file $file.tbi ."
        echo $cmd
        $cmd
        if ($status != 0) then
          echo ================ ERROR chrom $c ======================
        endif
      end
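    # Illustrative completeness check (not from the original log): every
    # downloaded .vcf.gz should have its .tbi index alongside:
    foreach f (ALL.chr*.vcf.gz)
      if (! -e $f.tbi) echo "missing index: $f"
    end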
      du -sh --apparent .
  #142G    .
      $ascpCmd $ascpArgs $phase1Path/ALL.chrY.genome_strip_hq.20101123.svs.low_coverage.genotypes.vcf.gz{,.tbi} .
      $ascpCmd $ascpArgs $phase1Path/ALL.chrY.phase1_samtools_si.20101123.snps.low_coverage.genotypes.vcf.gz{,.tbi} .
    # I wondered why they didn't merge the two chrY call sets (SNPs and SVs) --
    # the SVs file has 456 individuals and the SNPs file has 526.  At least the
    # 456 are a subset of the 526... but for now I will just use the SNPs file.
  
      # Grab chrMT even though we don't have liftOver for VCF at this point:
      $ascpCmd $ascpArgs $phase1Path/ALL.chrMT.phase1_samtools_si.20101123.snps.low_coverage.genotypes.vcf.gz{,.tbi} .
  
      # Hmmmm, how much space do we have on /gbdb?  Well, link it on hgwdev anyway:
      mkdir /gbdb/hg19/1000Genomes
      ln -s `pwd`/*.vcf.gz* /gbdb/hg19/1000Genomes/
      cp /dev/null tgpPhase1.txt
      foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
        set file = ALL.chr$c.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.vcf.gz
        echo "/gbdb/hg19/1000Genomes/$file\tchr$c" >> tgpPhase1.txt
      end
      echo "/gbdb/hg19/1000Genomes/ALL.chrY.phase1_samtools_si.20101123.snps.low_coverage.genotypes.vcf.gz\tchrY" \
        >> tgpPhase1.txt
      hgLoadSqlTab hg19 tgpPhase1 ~/kent/src/hg/lib/bbiChroms.sql tgpPhase1.txt
      # Make a chromosomes line for trackDb:
    hgsql hg19 -NBe 'select seqName from tgpPhase1' | xargs echo | sed -e 's/ /,/g'
  #chr1,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr2,chr20,chr21,chr22,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chrX,chrY
  
  
  ############################################################################
  # 1000 GENOMES PAIRED-END ACCESSIBLE REGIONS (#1079) (DONE 8/22/12 angie)
      # Data provided by Tom Blackwell at University of Michigan
      mkdir /hive/data/genomes/hg19/bed/1000Genomes/phase1Mapability
      cd /hive/data/genomes/hg19/bed/1000Genomes/phase1Mapability
      wget -r ftp://share.sph.umich.edu/public
      ln -s `pwd`/share.sph.umich.edu/public/paired.end.mapping.1000G..pilot.bb \
        /gbdb/hg19/1000Genomes/
      ln -s `pwd`/share.sph.umich.edu/public/paired.end.mapping.1000G.strict.bb \
        /gbdb/hg19/1000Genomes/
    hgBbiDbLink hg19 tgpPhase1AccessibilityPilotCriteria \
      /gbdb/hg19/1000Genomes/paired.end.mapping.1000G..pilot.bb
    hgBbiDbLink hg19 tgpPhase1AccessibilityStrictCriteria \
      /gbdb/hg19/1000Genomes/paired.end.mapping.1000G.strict.bb
  
  
  ############################################################################
  # UPDATE COSMIC TRACK - v59 (DONE 2012-05-23 larrym)
  
  ~/kent/src/hg/utils/automation/loadCosmic.pl -oldVer=55 hg19 ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v59_230512.csv
  Loading COSMIC v59
  New length: 136638
  Old length: 49087
  Percent bed overlap with previous version: 99.95%
  Number of deleted IDs: 20
  Number of added IDs: 87571
  
  #########################################################################
# LASTZ Rat Rn5 (DONE - 2012-06-27 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzRn5.2012-06-27
      cd /hive/data/genomes/hg19/bed/lastzRn5.2012-06-27
  
      cat << '_EOF_' > DEF
  # human vs rat
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Rat Rn5
  SEQ2_DIR=/hive/data/genomes/rn5/rn5.2bit
  SEQ2_LEN=/hive/data/genomes/rn5/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=100
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzRn5.2012-06-27
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet -noLoadChainSplit \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
      #   real    658m53.984s
      cat fb.hg19.chainRn5Link.txt
      #   917356917 bases of 2897316137 (31.662%) in intersection
  
      #	running the swap - DONE - 2012-06-27
      mkdir /hive/data/genomes/rn5/bed/blastz.hg19.swap
      cd /hive/data/genomes/rn5/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzRn5.2012-06-27/DEF \
  	-swap -noLoadChainSplit \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
      #   real    66m53.095s
      cat fb.rn5.chainHg19Link.txt
      #	933922552 bases of 2572853723 (36.299%) in intersection
  
  ##############################################################################
  # LASTZ tenrec echTel1 (DONE - 2012-06-29 - Hiram)
      # establish a screen to control this job with a name to indicate what it is
      screen -S hg19EchTel1
      mkdir /hive/data/genomes/hg19/bed/lastzEchTel1.2012-06-29
      cd /hive/data/genomes/hg19/bed/lastzEchTel1.2012-06-29
  
      # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
      #	number of jobs, 50,000 to something under 100,000
      cat << '_EOF_' > DEF
  # tenrec vs human
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: tenrec EchTel1
  SEQ2_DIR=/scratch/data/echTel1/echTel1.2bit
  SEQ2_LEN=/scratch/data/echTel1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=700
  
  BASE=/hive/data/genomes/hg19/bed/lastzEchTel1.2012-06-29
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
      #   real    411m54.452s
      cat fb.hg19.chainEchTel1Link.txt
      #   670299345 bases of 2897316137 (23.135%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/hg19/bed
      ln -s lastzEchTel1.2012-06-29 lastz.echTel1
  
      # better to have reciprocal best for this one since it is low coverage:
      cd /hive/data/genomes/hg19/bed/lastzEchTel1.2012-06-29
      time doRecipBest.pl hg19 echTel1 -buildDir=`pwd` -workhorse=hgwdev \
  	> best.log 2>&1 &
      #   real    48m11.157s
  
      mkdir /hive/data/genomes/echTel1/bed/blastz.hg19.swap
      cd /hive/data/genomes/echTel1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzEchTel1.2012-06-29/DEF \
  	-swap -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
      #   real    405m49.935s
    cat fb.echTel1.chainHg19Link.txt
      #   659524096 bases of 2111581369 (31.234%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/echTel1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  ##############################################################################
  # LASTZ dog canFam3 (DONE - 2012-07-03 - Hiram)
      # establish a screen to control this job with a name to indicate what it is
      screen -S hg19CanFam3
      mkdir /hive/data/genomes/hg19/bed/lastzCanFam3.2012-07-03
      cd /hive/data/genomes/hg19/bed/lastzCanFam3.2012-07-03
  
      # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
      #	number of jobs, 50,000 to something under 100,000
      cat << '_EOF_' > DEF
  # human vs dog
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: dog CanFam3
  SEQ2_DIR=/hive/data/genomes/canFam3/canFam3.2bit
  SEQ2_LEN=/hive/data/genomes/canFam3/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=20
  
  BASE=/hive/data/genomes/hg19/bed/lastzCanFam3.2012-07-03
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      # forgot to copy to the log
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
      #   real    1019m39.790s
  
      cat fb.hg19.chainCanFam3Link.txt
      #   1502192631 bases of 2897316137 (51.848%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/hg19/bed
      ln -s lastzCanFam3.2012-07-03 lastz.canFam3
  
      mkdir /hive/data/genomes/canFam3/bed/blastz.hg19.swap
      cd /hive/data/genomes/canFam3/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzCanFam3.2012-07-03/DEF \
  	-swap -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
      #   real    103m14.464s
      cat fb.canFam3.chainHg19Link.txt
      #   1455183825 bases of 2392715236 (60.817%) in intersection
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/canFam3/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  
  ##############################################################################
  # DBSNP B137 / SNP137 (DONE 8/19/13)
  # Originally done 7/11/12; updated w/corrections from dbSNP 9/10/12, 10/10, 11/9/12
  # -- see comments below and #8360 note 36, 42, 45
  # Updated 8/19/13 to fix allele frequencies (#11544)
  # Redmine #8360
      mkdir -p /hive/data/outside/dbSNP/137/human
      cd /hive/data/outside/dbSNP/137/human
      # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
      # to find the subdir name to use as orgDir below (human_9606 in this case).
      # Then click into that directory and look for file names like
      #    b(1[0-9][0-9])_*_([0-9]+_[0-9])
      # -- use the first num for build and the second num_num for buildAssembly.
      # jkStuff/liftContigs.lft maps NCBI contig names to chroms; use that for liftUp.
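    #    For example, a file named b137_SNPContigLoc_37_3.bcp.gz (hypothetical
    #    name, for illustration only) would give build=137, buildAssembly=37_3.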
      #
      # Some trial and error was required to get the config.ra just right --
      # the b* filenames don't include buildAssembly!
      # patch contigs needed to be filtered out:
      cat > config.ra <<EOF
  db hg19
  orgDir human_9606
  build 137
  buildAssembly
  liftUp /hive/data/genomes/hg19/jkStuff/liftContigs.lft
  refAssemblyLabel GRCh37.p5
  ignoreDbSnpContigs NW_003(3159[0-9][0-9]|5710[3-6][0-9])
  EOF
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
      # Script failed with mysql warnings because b137_ContigInfo.bcp.gz had an extra
      # column in the middle, relative to the CREATE TABLE def in human_9606_table.sql.gz.
      # I emailed dbsnp-collab-all@ncbi and manually spliced out the unexplained column:
      cd data
      mv b137_ContigInfo.bcp.gz b137_ContigInfo_extraColumn.bcp.gz
      zcat b137_ContigInfo_extraColumn.bcp.gz | cut -f 1-13,15-27 | gzip -c > b137_ContigInfo.bcp.gz
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue loadDbSnp \
        >>& do.log & tail -f do.log
      # mysql server crashed when loading the last table. Added "if (0)" around the
      # parts of loadDbSnp.csh that succeeded and ran it again to catch the last table:
      ./loadDbSnp.csh >>& do.log & tail -f do.log
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue addToDbSnp \
        >>& do.log & tail -f do.log
      # Next error:
  #/cluster/home/angie/kent/src/hg/utils/automation/snpAddTGPAlleleFreq.pl hg19snp137
  #SNPAlleleFreq_TGP data are not sorted on snp_id (183304030 follows 191299099) at /cluster/home/angie/kent/src/hg/utils/automation/snpAddTGPAlleleFreq.pl line 75, <$tgpAlF> line 4.
      # I modified the mysql queries in snpAddTGPAlleleFreq.pl to order by snp_id,
      # added "if (0)" around the successful portion addToDbSnp.csh, ran again:
      ./addToDbSnp.csh >>& do.log & tail -f do.log
      # Had to do the above a few more times to deal with other unexpected conditions
      # e.g. an allele called "+".
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue bigJoin \
        >>& do.log & tail -f do.log
      # Some tweaks to snpNcbiToUcsc.c required (larger MAX_SNPID, new locType 7, new func 30=ncRNA):
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue translate \
        >>& do.log & tail -f do.log
      # After final snpNcbiToUcsc tweaking:
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue load \
        >>& do.log & tail -f do.log
  
      # 9/10/12, 10/10/12, 11/9/12: updates w/corrections from dbSNP.
      # The 9/10 update included SNPContigLoc with some corrected mappings -- and some
      # dropped mappings!
      # All 3 updates included new SNPContigLocusId (func predictions).
      # So, last time I'm doing this for snp137 -- new SNPContigLocusId, and
      # new SNPContigLoc with dropped lines added back in from original SNPContigLoc.
      cd /hive/data/outside/dbSNP/137/human/data
      # 9/10:
      mv b137_SNPContigLoc.bcp.gz b137_SNPContigLoc.orig.bcp.gz
      mv b137_SNPContigLocusId.bcp.gz b137_SNPContigLocusId.orig.bcp.gz
      wget ftp://ftp.ncbi.nih.gov/snp/temp/post_b137_fix_preview/b137_SNPContigLoc.bcp.gz
      wget ftp://ftp.ncbi.nih.gov/snp/temp/post_b137_fix_preview/b137_SNPContigLocusId.bcp.gz
      # 10/10:
      wget ftp://ftp.ncbi.nih.gov/snp/temp/post_b137_fix_preview/b137_SNPContigLocusId_before_QA.bcp
      mv b137_SNPContigLocusId.bcp.gz b137_SNPContigLocusId.120910.bcp.gz
      gzip b137_SNPContigLocusId_before_QA.bcp
    ln -s b137_SNPContigLocusId_before_QA.bcp.gz b137_SNPContigLocusId.bcp.gz
      # 11/9:
      wget ftp://ftp.ncbi.nih.gov/snp/temp/post_b137_fix_preview/2012_nov_08_preview/b137_SNPContigLocusId.bcp.gz
      mv b137_SNPContigLocusId.bcp.gz b137_SNPContigLocusId.121108.bcp.gz
      ln -s b137_SNPContigLocusId.121108.bcp.gz b137_SNPContigLocusId.bcp.gz
      # No update to SNPContigLoc, which in the Sep. update dropped some mappings
      # from the original.  Add the dropped mappings back:
      cd /hive/data/outside/dbSNP/137/human/data
    cat > combineContigLoc.pl <<'EOF'
  #!/usr/bin/env perl
  # The latest b137_SNPContigLoc.bcp.gz contains some corrected mappings -
  # but it is also missing some mappings that it should have!
  # So step through the numerically sorted (by {snp_id, ctg_id, asn_from})
  # new and old files; if the old file has something that the new file does not,
  # add it back in.
  
  use warnings;
  use strict;
  
  open(my $fNew, "zcat b137_SNPContigLoc.bcp.gz |") || die "$!";
  open(my $fOld, "zcat b137_SNPContigLoc.orig.bcp.gz |") || die "$!";
  open(my $fOut, "| gzip -c > b137_SNPContigLoc.merged.bcp.gz") || die "$!";
  
  sub cmpLocs {
    my ($newRef, $oldRef) = @_;
    if (!defined $newRef->[1] || !defined $oldRef->[1]) {
      die;
    }
    my $diff = $newRef->[1] <=> $oldRef->[1];
    return $diff unless ($diff == 0);
    $diff = $newRef->[2] <=> $oldRef->[2];
    return $diff unless ($diff == 0);
    $diff = $newRef->[3] <=> $oldRef->[3];
    return $diff;
  }
  
  my @old = split("\t", <$fOld>);
  
  while (<$fNew>) {
    my @new = split("\t");
    if (defined $old[1]) {
      my $diff = &cmpLocs(\@new, \@old);
      while ($diff > 0) {
        # old file's line is missing from the new file -- print old line & get next old line
        print $fOut join("\t", @old);
        my $nextOld = <$fOld>;
        @old = split("\t", $nextOld);
        last if (! defined $old[1]);
        $diff = &cmpLocs(\@new, \@old);
      }
      if ($diff == 0) {
        # same line in new and old -- advance to next line from old file
        @old = split("\t", <$fOld>);
      }
    }
    # always print line from new file.
    print $fOut join("\t", @new);
  }
  if (defined $old[1]) {
    print $fOut join("\t", @old);
    while (<$fOld>) {
      print $fOut $_;
    }
  }
  EOF
      chmod a+x combineContigLoc.pl
      ./combineContigLoc.pl
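    # Quick sanity check (illustrative, not from the original log): the merged
    # file should have at least as many lines as the new file alone:
    zcat b137_SNPContigLoc.bcp.gz | wc -l
    zcat b137_SNPContigLoc.merged.bcp.gz | wc -l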
  
      # Redo the parts of the doDbSnp.pl process that depend on SNPContigLoc and SNPContigLocusId:
      hgsql hg19snp137 -e 'drop table b137_SNPContigLoc; drop table b137_SNPContigLocusId;'
      hgsql hg19snp137 < schema/SNPContigLocs.sql
      # Relevant subset of loadDbSnp.csh:
      cd /hive/data/outside/dbSNP/137/human
      setenv TMPDIR /data/tmp
      set tmpDir = `mktemp -d $TMPDIR/doDbSnp.pl.translate.XXXXXX`
      chmod 775 $tmpDir
      pushd $tmpDir
      echo $tmpDir > /hive/data/outside/dbSNP/137/human/workingDir
      set t = b137_SNPContigLocusId
      zcat /hive/data/outside/dbSNP/137/human/data/$t.bcp.gz | egrep -vw '(HuRef|CRA_TCAGchr7v2)'  | egrep -vw 'NW_003(3159[0-9][0-9]|5710[3-6][0-9])'\
      | perl -wpe 's/(\d\d:\d\d:\d\d)\.\d+/$1/g; s/\t(\t|\n)/\t\\N$1/g; s/\t(\t|\n)/\t\\N$1/g;' \
      | hgLoadSqlTab -oldTable hg19snp137 $t placeholder stdin
      hgsql hg19snp137 -e 'alter table b137_SNPContigLocusId add index (ctg_id);'
      zcat /hive/data/outside/dbSNP/137/human/data/b137_ContigInfo.bcp.gz | egrep -vw '(HuRef|CRA_TCAGchr7v2)' \
      | cut -f 1 | sort -n > b137_ContigInfo.ctg_id.txt
      zcat /hive/data/outside/dbSNP/137/human/data/b137_SNPContigLoc.merged.bcp.gz \
      | grep -Fwf b137_ContigInfo.ctg_id.txt \
      | perl -wpe 's/(\d\d:\d\d:\d\d)\.\d+/$1/g; s/\t(\t|\n)/\t\\N$1/g; s/\t(\t|\n)/\t\\N$1/g;' \
        > tmp.tab
      hgLoadSqlTab -oldTable hg19snp137 b137_SNPContigLoc placeholder tmp.tab
      hgsql hg19snp137 -e 'alter table b137_SNPContigLoc add index (ctg_id);'
      hgsql hg19snp137 -e 'create table ContigLocFix select cl.* from b137_SNPContigLoc as cl, b137_ContigInfo as ci where cl.ctg_id = ci.ctg_id;'
      hgsql hg19snp137 -e 'alter table ContigLocFix add index (ctg_id);'
      hgsql hg19snp137 -e 'drop table b137_SNPContigLoc; \
                           rename table ContigLocFix to b137_SNPContigLoc;'
      hgsql hg19snp137 -e 'alter table b137_SNPContigLoc add index (snp_id);'
      popd
      # Run the first parts of addToDbSnp (if(0)'d out the rest):
      ./addToDbSnp.csh >>& do.log & tail -f do.log
      # Redo from bigJoin onward...
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue bigJoin \
        >>& do.log & tail -f do.log
  
      # NOTE FOR NEXT TIME - 6/26/13
    # In MLQ #, a user reported missing SNP IDs, and I suspect that some of
    # the losses might have something to do with IDs mapped to patch contigs
    # losing their mappings to the primary reference contigs.  There are ~2500
    # rs's in the first SNPContigLoc that have patch, but not primary,
    # mappings.  Next time do this early in the process in case they haven't
    # fixed their pipeline:
      # Look for reference assembly mappings that may have been lost due to patch mappings (?)
      cd /hive/data/outside/dbSNP/137/human/data
      # Make lists of primary and patch contig IDs, prefixed by "^" because we'll grep
      # for those words at the beginning of lines in the next step.
    zcat b137_ContigInfo.bcp.gz | grep GRCh37 | grep -v PATCHES \
      | cut -f 1 | sed -re 's/^/^/' \
        > primaryContigIds.txt
    zcat b137_ContigInfo.bcp.gz | grep -w PATCHES \
      | cut -f 1 | sed -re 's/^/^/' \
        > patchContigIds.txt
      # Make lists of SNPs mapped to primary contigs and patch contigs.
      # Put contig IDs first because we're grepping to find them at the beginning of the line.
      zcat b137_SNPContigLoc.orig.bcp.gz | awk '{print $3, $2;}' > contigIdsAndRsIds.txt
      grep -wf primaryContigIds.txt contigIdsAndRsIds.txt > primaryContigsAndRsIds.txt
      grep -wf patchContigIds.txt contigIdsAndRsIds.txt > patchContigsAndRsIds.txt
      # Now trim to keep just rs IDs:
      awk '{print $2;}' primaryContigsAndRsIds.txt | uniq > rsIdsOnPrimary.txt
      awk '{print $2;}' patchContigsAndRsIds.txt | uniq > rsIdsOnPatches.txt
      # Compare to find rs IDs mapped to patch contigs but not to primary contigs:
      sort rsIdsOnPrimary.txt > rsIdsOnPrimary.alphaSort.txt
      sort rsIdsOnPatches.txt > rsIdsOnPatches.alphaSort.txt
      comm -13 rsIdsOnPrimary.alphaSort.txt rsIdsOnPatches.alphaSort.txt \
      | sort -nu \
        > rsIdsOnPatchesNotPrimary.txt
      wc -l rsIdsOnPatchesNotPrimary.txt
  #2566 rsIdsOnPatchesNotPrimary.txt
  
    # 8/19/13: rebuilding after fix #11544 -- many allele frequencies were combined
    # incorrectly due to stranded data (but no strand flag) in SNPAlleleFreq and
    # positive-strand-only data in SNPAlleleFreq_TGP.
      cd /hive/data/outside/dbSNP/137/human
      mkdir preAlleleFreqFix
      mv ucscAlleleFreq.txt.gz snp137*.bed.gz preAlleleFreqFix/
      # Run with -debug to regenerate addToDbSnp script:
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra \
        -continue addToDbSnp -stop addToDbSnp -debug
      # Now re-run the updated snpAddTGPAlleleFreq.pl command:
      grep snpAdd addToDbSnp.csh
      ~/kent/src/hg/utils/automation/snpAddTGPAlleleFreq.pl hg19snp137 \
        -contigLoc=b137_SNPContigLoc > ucscAlleleFreq.txt
      # Reload the ucscAlleleFreq table:
      hgLoadSqlTab hg19snp137 ucscAlleleFreq{,.sql,.txt} >>& do.log & tail -f do.log
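    # (Shell brace expansion: ucscAlleleFreq{,.sql,.txt} expands to the table
    # name, its .sql table definition, and the .txt data file, in that order.)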
      # Redo the big join:
      mkdir -p `cat workingDir`
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue bigJoin -stop bigJoin \
        >>& do.log & tail -f do.log
      # Manually re-run snpNcbiToUcsc (the translate step includes fasta file concat
      # and indexing, which would waste a lot of time):
      set tmpDir = `cat /hive/data/outside/dbSNP/137/human/workingDir`
      cd $tmpDir
      echo 'select * from par' | hgsql hg19 -NB > par.bed
      snpNcbiToUcsc -snp132Ext -par=par.bed ucscNcbiSnp.bed /hive/data/genomes/hg19/hg19.2bit snp137 \
        >>& /hive/data/outside/dbSNP/137/human/do.log &
      tail -f /hive/data/outside/dbSNP/137/human/do.log
      head snp137Errors.bed >>& /hive/data/outside/dbSNP/137/human/do.log &
      tail -f /hive/data/outside/dbSNP/137/human/do.log
      wc -l snp137* >>& /hive/data/outside/dbSNP/137/human/do.log &
      tail -f /hive/data/outside/dbSNP/137/human/do.log
      gzip *.txt *.bed *.tab
      cp -p * /hive/data/outside/dbSNP/137/human/
      cd /hive/data/outside/dbSNP/137/human
      rm $tmpDir/*
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue load \
        >>& do.log & tail -f do.log
  
  
  #############################################################################
  # FILTER SNP137 (DONE 8/19/13 angie)
  # Originally done 7/11/12; updated 9/11/12, 10/15/12, 11/9/12, 8/19/13 -- see SNP137 above
      # Redmine #8360
      # Make several tracks that are filtered subsets of snp137:
      # First, filter out the multiply-aligned and/or weight >1 SNPs -> snp137Mult
      # Second, siphon off the common variants -> snp137Common
    # Third, take the (uniquely mapped, not known to be common) variants
      # w/dbSNP's "clinically-assoc" flag -> snp137Flagged
      cd /hive/data/outside/dbSNP/137/human
      zcat snp137.bed.gz \
      | perl -we \
        '$minTotal2N = 10; \
         ($multCount, $comCount, $flagCount, $miscCount) = (0,0,0,0); \
         open($mult, "| gzip -c > snp137Mult.bed.gz") || die; \
         open($common,    "| gzip -c > snp137Common.bed.gz") || die; \
         open($flagged,   "| gzip -c > snp137Flagged.bed.gz") || die; \
         open($misc,      "| gzip -c > snp137Misc.bed.gz") || die; \
         while (<>) { \
           @w = split("\t"); \
           if ($w[16] > 1 || $w[17] =~ /MultipleAlignments/) { \
             print $mult $_; \
             $multCount++; \
           } else { \
             my ($alleleFreqCount, $nStr, $freqStr) = ($w[20], $w[22], $w[23]); \
             my @alNs = split(",", $nStr);      die unless scalar(@alNs) == $alleleFreqCount; \
             my @freqs = split(",", $freqStr);  die unless scalar(@freqs) == $alleleFreqCount; \
             my ($total2N, $maxAlleleFreq) = (0, 0); \
             for (my $i = 0;  $i < $alleleFreqCount;  $i++) { \
               $total2N += $alNs[$i]; \
               $maxAlleleFreq = $freqs[$i] if ($freqs[$i] > $maxAlleleFreq); \
             } \
             if ($alleleFreqCount >= 2 && $total2N >= $minTotal2N && $maxAlleleFreq <= 0.99) { \
               print $common $_; \
               $comCount++; \
             } elsif($w[24] =~ /clinically-assoc/)  { \
               print $flagged $_; \
               $flagCount++; \
             } else { \
               print $misc $_; \
               $miscCount++; \
             } \
           } \
         } \
         close($mult);  close($common); close($flagged);  close($misc); \
         print "snp137Mult:    $multCount\nsnp137Common:  $comCount\nsnp137Flagged: $flagCount\n" . \
               "leftover:      $miscCount\n";'
  #snp137Mult:    3633662
  #snp137Common:  13873636
  #snp137Flagged: 42824
  #leftover:      38698577
     # Compare to counts from 11/9/12, before fixing allele freqs:
  #snp137Mult:    3633662
  #snp137Common:  13894623
  #snp137Flagged: 42733
  #leftover:      38677681
   # So 91 SNPs had previously been left out of snp137Flagged; the
   # allele-frequency fix recovers them.
  
      # Load tables
      foreach subset (Mult Common Flagged)
        hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \
          hg19 snp137$subset -sqlTable=snp137.sql snp137$subset.bed.gz
      end
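    # Illustrative cross-check (not part of the original run): row counts in
    # the loaded tables should match the filter script's tallies above:
    foreach subset (Mult Common Flagged)
      hgsql hg19 -NBe "select count(*) from snp137$subset"
    end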
  
  
  #############################################################################
  # SNP137 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 7/12/12 angie)
  # Chose not to redo this 9/11/12 (see SNP137 above) because this is only for SNVs
  # and only indel locations were changed.
      mkdir /hive/data/genomes/hg19/bed/snp137Ortho
      cd /hive/data/genomes/hg19/bed/snp137Ortho
    # Filter snp137 to keep only uniquely mapped biallelic SNVs (class=single, length=1):
      zcat /hive/data/outside/dbSNP/137/human/snp137.bed.gz \
      | awk '$18 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
      | sort -u \
        > snp137ExcludeIds.txt
      wc -l snp137ExcludeIds.txt
  #1267059 snp137ExcludeIds.txt
      # Glom all human info that we need for the final table onto the
      # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
      zcat /hive/data/outside/dbSNP/137/human/snp137.bed.gz \
      | awk '$3-$2 == 1 && $11 == "single" {print;}' \
      | grep -vFwf snp137ExcludeIds.txt \
      | awk 'BEGIN{OFS="\t";} \
          {print $1, $2, $3, \
                 $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
                 0, $6;}' \
        > snp137ForLiftOver.bed
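    # e.g. (made-up position) a qualifying snp137 row for rs12345 becomes:
    # chr1  10000  10001  rs12345|chr1|10000|10001|A/G|A|+  0  +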
  
      # Map coords to chimp using liftOver.
      mkdir run.liftOChimp
      cd run.liftOChimp
      mkdir split out
      splitFile ../snp137ForLiftOver.bed 10000 split/chunk
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro3.over.chain.gz \
          \{check out exists out/panTro3.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      ssh swarm
      cd /hive/data/genomes/hg19/bed/snp137Ortho/run.liftOChimp
      para make jobList
  #Completed: 4597 of 4597 jobs
  #CPU time in finished jobs:     443120s    7385.34m   123.09h    5.13d  0.014 y
  #IO & Wait Time:                 46429s     773.81m    12.90h    0.54d  0.001 y
  #Average job time:                 106s       1.77m     0.03h    0.00d
  #Longest finished job:             261s       4.35m     0.07h    0.00d
  #Submission to last job:           558s       9.30m     0.15h    0.01d
  
      # Map coords to orangutan using liftOver.
      mkdir ../run.liftOPon
      cd ../run.liftOPon
      mkdir out
      ln -s ../run.liftOChimp/split .
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
          \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      para make jobList
  #Completed: 4597 of 4597 jobs
  #CPU time in finished jobs:     924695s   15411.59m   256.86h   10.70d  0.029 y
  #IO & Wait Time:                 90764s    1512.73m    25.21h    1.05d  0.003 y
  #Average job time:                 221s       3.68m     0.06h    0.00d
  #Longest finished job:             580s       9.67m     0.16h    0.01d
  #Submission to last job:          1201s      20.02m     0.33h    0.01d
  
      # Map coords to macaque using liftOver.
      mkdir ../run.liftOMac
      cd ../run.liftOMac
      mkdir out
      ln -s ../run.liftOChimp/split .
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
          \{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      para make jobList
  #Completed: 4597 of 4597 jobs
  #CPU time in finished jobs:    1189764s   19829.40m   330.49h   13.77d  0.038 y
  #IO & Wait Time:                107365s    1789.42m    29.82h    1.24d  0.003 y
  #Average job time:                 282s       4.70m     0.08h    0.00d
  #Longest finished job:             694s      11.57m     0.19h    0.01d
  #Submission to last job:          1565s      26.08m     0.43h    0.02d
  
      cd /hive/data/genomes/hg19/bed/snp137Ortho
      # Concatenate the chimp results, sorting by chimp pos in order to
      # efficiently access 2bit sequence in getOrthoSeq.  The output of
      # that is then sorted by the glommed human info field, so that we
      # can use join to combine chimp and macaque results in the next step.
      # Ditto for macaque and orangutan.  Each command pipe takes ~6 minutes:
      sort -k1,1 -k2n,2n run.liftOChimp/out/panTro3.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro3/panTro3.2bit \
      | sort > panTro3.orthoGlom.txt
      sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
      | sort > ponAbe2.orthoGlom.txt
      sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
      | sort > rheMac2.orthoGlom.txt
      wc -l panTro3.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
  #  43311070 panTro3.orthoGlom.txt
  #  41254270 ponAbe2.orthoGlom.txt
  #  37148915 rheMac2.orthoGlom.txt
  
    # Use the glommed name field as a key to join up chimp and orangutan
    # allele data (then macaque in the second join).  Include the glommed
    # name from both files because if only file 2 has a line for the key,
    # then 1.1 comes through as the -e placeholder '?' and we take 2.1
    # instead.  Then plop in the orthoGlom fields from each file, which
    # are in the same order as the chimp, orangutan, and macaque columns
    # of snp137OrthoPt3Pa2Rm2.
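    # (Toy illustration of the join flags used below, added for clarity:
    # with -a 1 -a 2 -e '?', unmatched keys from either file still yield
    # a row, with '?' standing in for the missing side's fields.
    #   join -o '1.1 2.1 1.2 2.2' -a 1 -a 2 -e '?' fileA fileB
    # for fileA = {k1 a, k2 b} and fileB = {k2 c, k3 d} gives:
    #   k1 ? a ?
    #   k2 k2 b c
    #   ? k3 ? d
    # which is why the awk below picks $1 when it isn't '?', else $2.)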
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        panTro3.orthoGlom.txt ponAbe2.orthoGlom.txt \
      | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
              else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
        > tmp.txt
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        tmp.txt rheMac2.orthoGlom.txt \
      | perl -wpe 'chomp; \
          ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
          $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
          ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
            split(/\|/, $glomKey); \
          $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
          $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
          print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                           $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
          s/^.*$//;' \
      | sort -k1,1 -k2n,2n > snp137OrthoPt3Pa2Rm2.bed
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
        -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
        hg19 snp137OrthoPt3Pa2Rm2 snp137OrthoPt3Pa2Rm2.bed
  #Read 44774198 elements of size 22 from snp137OrthoPt3Pa2Rm2.bed
      # Cleanup:
      rm -r run*/split tmp.txt *.orthoGlom.txt snp137Simple.bed
      gzip snp137ExcludeIds.txt snp137ForLiftOver.bed &
  
  
  ############################################################################
  # DBSNP CODING ANNOTATIONS (137) (DONE 11/10/12 angie)
  # Originally done 7/11/12 but code 43 (stop-loss) was omitted, and filtering out
  # NULL frame caused us to lose 45 (cds-indel) too.
  # Updated 7/30/12 with corrections for that issue.
  # Updated 9/11/12 with extensive corrections from dbSNP (see SNP137 above, #8360 note 36)
  # Updated 10/15/12 w/more corrections (see SNP137 above, #8360 note 42)
  # Updated 11/9/12 w/more corrections (see SNP137 above, #8360 note 45)
      cd /hive/data/outside/dbSNP/137/human
      # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed.
      # For anything except an insertion (0 bases between flanks),
      # we need to add 1 to the end coord.  For an insertion, we need
      # to add 1 to the start coord.  Make a hash of the insertion IDs,
      # then look up each ID in ncbiFuncAnnotations.txt to tell which
      # transform to apply.
      # Note: sort -u with the keys below is too restrictive -- we need full line uniq.
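    # (Worked example of the transform, added for clarity: an SNV at
    # NCBI fully-closed 100..100 (one base) becomes half-open 100..101;
    # an insertion recorded as 100..101 (zero bases between flanks)
    # becomes 101..101, i.e. a zero-length insertion point.)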
      zcat ncbiFuncAnnotations.txt.gz \
      | perl -we 'open($IDS, "zcat ncbiFuncInsertions.ctg.bed.gz |") || die "ids: $!"; \
                while (<$IDS>) { chomp; $ids{$_} = 1; } \
                close($IDS); \
                %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 43=>1, 44=>1, 45=>1); \
                while (<>) { \
                  chomp;  @w = split("\t"); # id, ctg, start, end, ... \
                  next unless $coding{$w[5]}; \
                  $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \
                  if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \
                    $w[2]++; # 2-base insertions: increment start coord \
                  } else { \
                    $w[3]++; # increment end coord to get half-open \
                  } \
                  print join("\t", @w) . "\n"; \
                }' \
      | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \
      | uniq \
        > ncbiCodingAnnotations.txt
      wc -l ncbiCodingAnnotations.txt
  #3873975 ncbiCodingAnnotations.txt
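    # (Optional check, not part of the original run: after the transform,
    # no row should have start > end, and rows with start == end are the
    # zero-length insertion points.)
    awk -F'\t' '$3 > $4 {print "bad coords: " $0; exit 1;}' ncbiCodingAnnotations.txt
    awk -F'\t' '$3 == $4' ncbiCodingAnnotations.txt | wc -l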
      # How many & what kinds of function types?
      cut -f 6 ncbiCodingAnnotations.txt \
      | sort -n | uniq -c
  # 681327 3   (coding-synon)
  #1917237 8   (cds-reference -- ignored)
  #  35591 41  (nonsense)
  #1190533 42  (missense)
  #   1153 43  (stop-loss)
  #  41695 44  (frameshift)
  #   6439 45  (cds-indel)
      # In b137, the functional annotations include non-coding (frame = NULL),
      # which we'll exclude here because this is supposed to be just coding stuff...
      # probably need to update how we show dbSNP's func annos anyway, e.g.
      # it is a shame that we toss out codon number and transcript offset.
      # Gather up multiple annotation lines into one line per {snp, gene, frame}:
      perl -e  'while (<>) { chomp; \
                  my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \
                  next if ($fxn == 8 && ($frm eq "NULL" && $aa eq "NULL" && $codon eq "NULL")); \
                  if (defined $lastRs && \
                      ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \
                       $lastTx ne $txId || $lastFrm ne $frm)) { \
                    if (defined $refRow) { \
                      $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                      $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
                    } \
                    $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                          "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
                    $lineOut =~ s@NULL@n/a@g; \
                    print $lineOut; \
                    $refRow = undef;  @rows = ();  ($count, $fxns, $nts, $codons, $aas) = (); \
                  } \
                  ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \
                      ($rsId, $ctg, $s, $e, $txId, $frm); \
                  $count++; \
                  if ($fxn == 8) { \
                    $refRow = [$fxn, $nt, $aa, $codon]; \
                  } else { \
                   $fxns .= "$fxn,";  $nts .= "$nt,";  $aas .= "$aa,";  $codons .= "$codon,"; \
                  } \
                } \
                if (defined $refRow) { \
                  $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                  $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
                } \
                $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                      "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
                $lineOut =~ s@NULL@n/a@g; \
                print $lineOut;' \
        ncbiCodingAnnotations.txt \
      | liftUp snp137CodingDbSnp.bed /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
      hgLoadBed hg19 snp137CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
        -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
        snp137CodingDbSnp.bed
  #Read 1922594 elements of size 11 from snp137CodingDbSnp.bed
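    # (Illustrative spot check, a sketch: peek at a few of the gathered
    # per-{snp, transcript, frame} rows in the freshly loaded table.)
    hgsql hg19 -e 'SELECT * FROM snp137CodingDbSnp LIMIT 3;'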
  
  
  ############################################################################
  # SNPMASKED SEQUENCE FOR SNP137 (DONE 7/12/12 angie)
  # Chose not to redo this 9/11/12 (see SNP137 above) because this is only for SNVs
  # and only indel locations were changed.
      mkdir /hive/data/genomes/hg19/snp137Mask
      cd /hive/data/genomes/hg19/snp137Mask
      # Identify rsIds with various problems -- we will exclude those.
      zcat /hive/data/outside/dbSNP/137/human/snp137.bed.gz \
      | awk '$18 ~ /MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved/ {print $4;}' \
        | sort -u \
        > snp137ExcludeRsIds.txt
      zcat /hive/data/outside/dbSNP/137/human/snp137.bed.gz \
      | grep -vFwf snp137ExcludeRsIds.txt \
        > snp137Cleaned.bed
      wc -l snp137Cleaned.bed
  #52047160 snp137Cleaned.bed
  
      # Substitutions:
      mkdir substitutions
      snpMaskSingle snp137Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout diffObserved.txt \
      | faSplit byname stdin substitutions/
  #Masked 46091199 snps in 46090845 out of 3131225094 genomic bases
  #/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3131225094 (difference is 5936170)
      # Check that 5936170 is the total #bases in sequences with nothing in snp137Cleaned:
      grep -Fw single snp137Cleaned.bed | cut -f 1 | uniq > /data/tmp/1
      grep -vwf /data/tmp/1 ../chrom.sizes \
      | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
  #5936170
      # warnings about differing observed strings at same base position:
      wc -l diffObserved.txt
  #448 diffObserved.txt
  #TODO: send list to dbSNP.
      # Make sure that sizes are identical, first diffs are normal -> IUPAC,
      # and first diffs' case is preserved:
      foreach f (substitutions/chr*.fa)
        faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ"
      end
  #chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 10176 (m != a)
  #chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 60522 (K != T)
  #...
  #(output OK -- ambiguous bases replacing [agct] at SNP positions)
      foreach f (substitutions/chr*.fa)
        echo $f:t:r
        mv $f $f:r.subst.fa
        gzip $f:r.subst.fa &
      end
  
      # Insertions & deletions not done.  To date we have only offered substs for download.
      # If there is user demand, use template from snp131 above.
  
      # Clean up and prepare for download:
      gzip snp137Cleaned.bed &
      foreach d (substitutions)
        pushd $d
          md5sum *.gz > md5sum.txt
          cp /hive/data/genomes/hg19/snp135Mask/$d/README.txt .
        popd
      end
      # Edit the README.txt.
  
      # Create download links on hgwdev.
      mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp137Mask
      ln -s /hive/data/genomes/hg19/snp137Mask/substitutions/* \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp137Mask/
  
  
  #########################################################################
  # LASTZ Macaca Mulatta RheMac3 (DONE - 2012-03-15 - Chin)
      mkdir /hive/data/genomes/hg19/bed/lastzRheMac3.2012-03-15
      cd /hive/data/genomes/hg19/bed/lastzRheMac3.2012-03-15
  
      cat << '_EOF_' > DEF
  # human vs macaca mulatta
  BLASTZ=lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  # and place those items here
  BLASTZ_O=600
  BLASTZ_E=150
  # other parameters from panTro2 vs hg18 lastz on advice from Webb
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: Macaca Mulatta RheMac3
  SEQ2_DIR=/scratch/data/rheMac3/rheMac3.2bit
  SEQ2_LEN=/scratch/data/rheMac3/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzRheMac3.2012-03-15
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -syntenicNet \
          -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
          > do.log 2>&1 &
      #   real     322m50.822s
      cat fb.hg19.chainRheMac3Link.txt
      #   2400694407 bases of 2897316137 (82.859%) in intersection
      cd /hive/data/genomes/hg19/bed
      ln -s lastzRheMac3.2012-03-15 lastz.rheMac3
  
  
    #   running the swap - DONE - 2012-03-16
      mkdir /hive/data/genomes/rheMac3/bed/blastz.hg19.swap
      cd /hive/data/genomes/rheMac3/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzRheMac3.2012-03-15/DEF \
          -swap \
          -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
          > swap.log 2>&1 &
      #    58m38.594s
      cat fb.rheMac3.chainHg19Link.txt
      #   2313806886 bases of 2646704109 (87.422%) in intersection
      cd /hive/data/genomes/rheMac3/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  
  ############################################################################
  # UPDATE COSMIC TRACK - v60 (DONE 2012-07-23 larrym)
  
  ~/kent/src/hg/utils/automation/loadCosmic.pl -dryRun hg19 ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v60_190712.csv.gz
  New length: 166164
  Old length: 136638
  Percent bed overlap with previous version: 100.00%
  Number of deleted IDs: 4
  Number of added IDs: 29530
  
  ############################################################################
  # UPDATE COSMIC TRACK - v61 (DONE - 2012-11-09 - Hiram)
  
  time ~/kent/src/hg/utils/automation/loadCosmic.pl -dryRun hg19 ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v61_260912.csv.gz
  #       real    1m10.070s
  New length: 220318
  Old length: 166164
  Percent bed overlap with previous version: 100.00%
  Number of deleted IDs: 28
  Number of added IDs: 54182
  time ~/kent/src/hg/utils/automation/loadCosmic.pl hg19 ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v61_260912.csv.gz
#       real    0m8.251s
  # Loading COSMIC v61
  # New length: 220318
  # Old length: 166164
  # Percent bed overlap with previous version: 100.00%
  # Number of deleted IDs: 28
  # Number of added IDs: 54182
  # Scanning through 1 files
  # Reading cosmic.bed
  # Read 220318 elements of size 4 from cosmic.bed
  # Sorted
  # Creating table definition for cosmic
  # Saving bed.tab
  # Loading hg19
  
  ############################################################################
  # UPDATE COSMIC TRACK - v62 (DONE - 2012-12-18 - Hiram)
      # take a look at:
      # ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/
      # to see what the new version file name is, then:
  
  
  time ~/kent/src/hg/utils/automation/loadCosmic.pl -dryRun hg19 ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v62_291112.csv.gz
  #       New length: 536287
  #       Old length: 220318
  #       Percent bed overlap with previous version: 100.00%
  #       Number of deleted IDs: 77
  #       Number of added IDs: 316046
  
  #       real    0m23.191s
  
      # that created files in: /hive/data/genomes/hg19/bed/cosmic/v62/
  
      # then:
      cd /hive/data/genomes/hg19/bed/cosmic/v62/
      time ~/kent/src/hg/utils/automation/loadCosmic.pl hg19 \
  ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v62_291112.csv.gz \
          > do.log 2>&1
      #   real    0m19.404s
      #   Read 536287 elements of size 4 from cosmic.bed
  
  ############################################################################
  # UPDATE COSMIC TRACK - v63 (DONE - 2013-02-19 - Hiram)
      # take a look at:
      # ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/
      # to see what the new version file name is, then:
  
      cd /hive/data/genomes/hg19/bed/cosmic
  
  time ~/kent/src/hg/utils/automation/loadCosmic.pl -dryRun hg19 ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v63_300113.csv.gz
  #       New length: 616299
  #       Old length: 536287
  #       Percent bed overlap with previous version: 100.00%
  #       Number of deleted IDs: 643
  #       Number of added IDs: 80655
  
  #       real    0m32.084s
  
      # that created files in: /hive/data/genomes/hg19/bed/cosmic/v63/
  
      # then:
      cd /hive/data/genomes/hg19/bed/cosmic/v63/
      time ~/kent/src/hg/utils/automation/loadCosmic.pl hg19 \
  ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v63_300113.csv.gz \
          > do.log 2>&1
      #   real    0m24.619s
  
      #   Read 616299 elements of size 4 from cosmic.bed
  
  ############################################################################
  # lastz Medium Ground Finch geoFor1 (DONE - 2012-07-29 - Hiram)
      # establish a screen to control this job with a name to indicate what it is
      screen -S hg19
      mkdir /hive/data/genomes/hg19/bed/lastzGeoFor1.2012-07-29
      cd /hive/data/genomes/hg19/bed/lastzGeoFor1.2012-07-29
  
      cat << '_EOF_' > DEF
  # Human vs. medium ground finch
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
  BLASTZ_Y=3400
  BLASTZ_L=10000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: Medium Ground Finch GeoFor1
  SEQ2_DIR=/hive/data/genomes/geoFor1/geoFor1.2bit
  SEQ2_LEN=/hive/data/genomes/geoFor1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=60
  
  BASE=/hive/data/genomes/hg19/bed/lastzGeoFor1.2012-07-29
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
      #	number of jobs, 50,000 to something under 100,000
      # when not present, SEQ2_LIMIT is a default 100
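    # (Sketch of that adjustment, assuming the standard doBlastzChainNet.pl
    # step names: run only through partitioning, inspect the resulting
    # job count, tweak SEQ2_LIMIT in DEF, and repeat before the full run.)
    #   doBlastzChainNet.pl `pwd`/DEF -stop=partition \
    #       -workhorse=hgwdev -bigClusterHub=swarm > partition.log 2>&1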
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
      #   real    238m6.827s
      cat fb.hg19.chainGeoFor1Link.txt
      #   101503916 bases of 2897316137 (3.503%) in intersection
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/hg19/bed
      ln -s lastzGeoFor1.2012-07-29 lastz.geoFor1
  
      #	and for the swap
      mkdir /hive/data/genomes/geoFor1/bed/blastz.hg19.swap
      cd /hive/data/genomes/geoFor1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzGeoFor1.2012-07-29/DEF \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #   real    9m10.240s
      cat  fb.geoFor1.chainHg19Link.txt
      #	88547518 bases of 1041286029 (8.504%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/geoFor1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  
  #######################################################################
  # DENISOVA HIGH-COVERAGE VARIANTS #8886 (DONE 9/6/12 angie)
      mkdir /hive/data/genomes/hg19/bed/denisova
      cd /hive/data/genomes/hg19/bed/denisova
      # Get tabix-compressed+indexed VCF files for Denisova and 11 modern humans:
      wget ftp://ucsc_paper_data:PHZuezz7@cdna.eva.mpg.de/hg19_1000g/\*
      # Make /gbdb links and bbi tables, prefix dhcVcf for Denisova High-Coverage VCF:
      foreach f (`pwd`/*.vcf.gz{,.tbi})
        ln -s $f /gbdb/hg19/bbi/
      end
      foreach f (*.vcf.gz)
        set track = dhcVcf$f:r:r
        echo $track
        hgBbiDbLink hg19 $track /gbdb/hg19/bbi/$f
      end
  #dhcVcfDNK02
  #dhcVcfDenisovaPinky
  #dhcVcfHGDP00456
  #dhcVcfHGDP00521
  #dhcVcfHGDP00542
  #dhcVcfHGDP00665
  #dhcVcfHGDP00778
  #dhcVcfHGDP00927
  #dhcVcfHGDP00998
  #dhcVcfHGDP01029
  #dhcVcfHGDP01284
  #dhcVcfHGDP01307
      # Add Denisova track group section:
      hgsql hg19 -e "insert into grp values('denisova', 'Denisova Assembly and Analysis', 6.6, 1)"
  
  
  #########################################################################
  # DENISOVA HIGH-COVERAGE SEQUENCE READS #8886 (DONE 9/10/12)
      cd /hive/data/genomes/hg19/bed/denisova
      wget http://cdna.eva.mpg.de/denisova/alignments/T_hg19_1000g.bam
      wget http://cdna.eva.mpg.de/denisova/alignments/T_hg19_1000g.bam.bai
      # Tweak sequence names?  e.g. SN:GL000193.1 -> chr4_gl000193_random ?
      ln -s `pwd`/T_hg19_1000g.bam{,.bai} /gbdb/hg19/bbi/
      hgBbiDbLink hg19 dhcBamDenisova /gbdb/hg19/bbi/T_hg19_1000g.bam
  
  
  #########################################################################
  # DENISOVA HIGH-COVERAGE ANALYSIS #8886 (DONE 10/2/12 angie)
      mkdir /hive/data/genomes/hg19/bed/denisova
      cd /hive/data/genomes/hg19/bed/denisova
      # Fetched original .zip file on 9/6/12:
      wget --no-check-certificate https://bioinf.eva.mpg.de/download/HighCoverageDenisovaGenome/Denisova_catalog.zip
      unzip Denisova_catalog.zip
  #    cd /hive/data/genomes/hg19/bed/denisova/Denisova_zip/Denisova-derived_Human-ancestral/
  #    cd /hive/data/genomes/hg19/bed/denisova/Denisova_zip/GAD_Denisova-state/
  #    cd /hive/data/genomes/hg19/bed/denisova/Denisova_zip/GWAS_Denisova-state/
  #    cd /hive/data/genomes/hg19/bed/denisova/Denisova_zip/Human-derived_Denisova-ancestral/
  #    cd /hive/data/genomes/hg19/bed/denisova/Denisova_zip/Human-derived_Denisova-state/
  
      # 9/17/12 update: no zip, dir name change, and wget -r apparently doesn't work with
      # https --no-check-cert, so just fetch updates alongside original files:
      mv Denisova_zip Denisova_zip_orig
    cp -rp Denisova_zip_orig DenHC_catalog
      cd DenHC_catalog/Human-derived_Denisova-state
      wget --no-check-certificate https://bioinf.eva.mpg.de/download/HighCoverageDenisovaGenome/DenHC_catalog/Human-derived_Denisova-state/Genome_CAT.tsv.gz
      # Good, no incomplete lines now.  But what we really want to make a track for is
      # Human-derived_Denisova-ancestral...
      cd /hive/data/genomes/hg19/bed/denisova/DenHC_catalog/Human-derived_Denisova-ancestral
      # There are two subdirs, InDels and SNCs, and each subdir has a whole bunch of files
      # which we can make into BED4+ subtracks of a composite with views.
      cat > trackify.pl <<'_EOF_'
  #!/usr/bin/env perl
  use warnings;
  use strict;
  my $isSNC = ($ARGV[0] =~ /SNC/);
  while (<>) {
    next if (/^#/);
    chomp;
    my @w = split("\t");
    my $chromEnd = $w[2];
    $w[2]--; # chromStart -> 0-based
    # Delete extra column (Grantham score) that appears only in nonsynon files:
    splice(@w, 31, 1) if (scalar(@w) > 32);
    # Measure allele lengths to determine whether this is an indel, and whether it's
    # necessary to trim an identical first base from each allele:
    my $humanFirstBase = substr($w[15], 0, 1);
    my $humanLen = length($w[15]);
    my $diffLengths = 0;
    my $sameFirstBase = 1;
    foreach my $i (16..19) {
      next if ($w[$i] eq "N/A");
      my $firstBase = substr($w[$i], 0, 1);
      my $len = length($w[$i]);
      $diffLengths = 1 if ($len != $humanLen);
      $sameFirstBase = 0 if ($firstBase ne $humanFirstBase);
    }
    if ($sameFirstBase) {
      foreach my $i (15..19) {
        next if ($w[$i] eq "N/A");
        $w[$i] =~ s/^$humanFirstBase// || die;
        $w[$i] =~ s/^$/-/; # Some alleles have trailing "-", some don't; use "-" for deletion
      }
      $w[2]++; # adjust chromStart
      $humanLen--;
    }
    if ($diffLengths) {
      $chromEnd = $w[2] + $humanLen;
    }
    if ($isSNC) {
      # "-" is used as N/A in SNCs/* -- tweak to N/A for consistency w/InDels files
      foreach my $i (16..19) {
        $w[$i] = "N/A" if ($w[$i] eq "-");
      }
    }
    my $name = "$w[15]/$w[16]";
    $name .= ":$w[21]" if (length($w[21]) > 1);
    # Add spaces to Extra for readability:
    $w[14] =~ s/;/; /g;
    print join("\t", "chr$w[1]", $w[2], $chromEnd, $name,      # BED 4
               $w[5], $w[4], $w[14],                           # Feature, Gene, Extra
               $w[7], $w[8], $w[9], $w[10], $w[11], $w[12],    # Consequence, coding effect
               $w[15], $w[16], $w[17], $w[18], $w[19],         # Human, Den, etc. alleles
               $w[20], $w[21], $w[22], $w[23], $w[31]) . "\n"; # D zyg, dbSNP, 1000g freq, flag,strnd
  }
  '_EOF_'
      # << emacs
      chmod a+x trackify.pl
  
      cat > tsvToBedAndTrackDb.pl << '_EOF_'
  #!/usr/bin/env perl
  # Parse a .tsv file path into descriptive components to use for track name and trackDb subGroups;
  # Call trackify.pl and if the resulting bed file is non-empty, print out subtrack .ra entry.
  use warnings;
  use strict;
  my $tsvPath = $ARGV[0];
  chomp $tsvPath;
  $tsvPath =~ m/^(InDels|SNCs)\/Genome_VEP(|(_genic(_ccds)?_(3utr|5utr|nonsyn_grantham|frameshift_coding|inframe_nonsyn|splice|syn))|(_regul(_motif)?(_highinfo)?))_formatted_(fixed|highfreq)/ || die;
  my ($vType, $ccds, $gType, $reg, $regMo, $regHi, $fType) = ($1, $4 || "", $5 || "", $6 || "",
                                                              $7 || "", $8 || "", $9 || "");
  # Reformat file name components: _ to inital uppercase, etc.
  $vType =~ s/s$//;
  $ccds =~ s/_(\w)/\u$1/g;
  $gType =~ s/_(\w)/\u$1/g;  $gType =~ s/^(\w)/\u$1/;
  $gType =~ s/Grantham//;  $gType =~ s/^(\d)utr/Utr$1/; $gType =~ s/nframe/nFrame/;
  $fType =~ s/^(\w)/\u$1/;
  $fType =~ s/ighfreq/ighFreq/;
  my $fShort = ($fType eq "Fixed") ? "Fxd" : "HiF";
  my $shortLabel = "";
  my $longLabel = "Modern Human Derived ($fType), Denisova Ancestral: ";
  my $color = "0,0,0";
  my ($subset, $view);
  if ($gType) {
    $subset = "$ccds$gType";
    if ($gType eq "Nonsyn" || $gType eq "Splice" || $gType eq "FrameshiftCoding" ||
        $gType eq "InFrameNonsyn") {
      $color = "200,0,0";
    } elsif ($gType =~ /^Utr/) {
      $color = "0,0,200";
    } elsif ($gType eq "Syn") {
      $color = "0,200,0";
    }
    my ($gShort, $gLong) = ($gType, $gType);
    $gShort =~ s/FrameshiftCoding/FrShft/; $gShort =~ s/InFrameNonsyn/InFrNS/;
    $gLong =~ s/([a-z])([A-Z])/$1 $2/g;  $gLong =~ s/Utr(\d)/$1\' UTR/;
    $gLong =~ s/In Frame/In-frame/;  $gLong =~ s/Nonsyn/Non-synonymous/;
    if ($ccds) {
      $gShort = "CC $gShort";
      $gLong = "CCDS $gLong";
    }
    $shortLabel .= "$gShort $fShort";
    $longLabel .= "$gLong";
    $view = $ccds ? $ccds : "Ens";
  } elsif ($reg) {
    $color = "230,130,0";
    my ($rShort, $rLong);
    if ($regHi) {
      $subset = "RegMotifHighInfo";
      ($rShort, $rLong) = ("RgMoHiInf", "Reg. Motif at High Inf Pos in TFBP");
    } elsif ($regMo) {
      $subset = "RegMotif";
      ($rShort, $rLong) = ("RegMotif", "Regulatory Motif");
    } else {
      $subset = "Reg";
      ($rShort, $rLong) = ("RegRegion", "Regulatory Region");
    }
    $shortLabel .= "$rShort $fShort";
    $longLabel .= "$rLong";
    $view = "Reg";
  } else {
    $subset = "All";
    $shortLabel .= "$fShort";
    $longLabel .= "All ";
    $view = "All";
  }
  my $track = "dhcHumDerDenAnc$vType$subset$fType";
  my $cmd = "./trackify.pl $tsvPath > $track.bed";
  system($cmd) == 0 || die "ERROR from \"$cmd\"\n\n";
  if (-s "$track.bed") {
    my $subsetTweaked = $subset;  $subsetTweaked =~ s/Syn/ZLast_Syn/;
    $subsetTweaked =~ s/^RegMotifH/DA_RegMotifH/;  $subsetTweaked =~ s/^RegM/DB_RegM/;
    $subsetTweaked =~ s/^Reg/DC_Reg/;
    my $isOff = "";
    $isOff = " off" if ($subset =~ /Syn/ || $subset eq "Reg");
    # We will combine SNCs and InDels later; print only one set of trackDb descriptions:
    if ($vType eq "InDel" || $subset =~ /Nonsyn/ || $subset =~ /Syn/) {
      print "        track dhcHumDerDenAnc$subset$fType\n";
      print "        parent dhcHumDerDenAnc$view$isOff\n";
      print "        subGroups view=$view subset=$subsetTweaked freq=$fType\n";
      print "        shortLabel $shortLabel\n";
      print "        longLabel $longLabel\n";
      print "        color $color\n\n";
    }
    if ($fType eq "Fixed") {
      # Fernando Racimo's request: separate out Fixed (in 1000Genomes) locations that are in dbSNP.
      $cmd = "egrep -vw 'rs[0-9]+' $track.bed > tmp$track.bed";
      system($cmd) == 0 || die "ERROR from \"$cmd\"\n\n";
      $cmd = "egrep -w 'rs[0-9]+' $track.bed > ${track}DbSnp.bed";
      system($cmd); # grep returns nonzero if it can't find anything, but that's OK here.
      $cmd = "mv tmp$track.bed $track.bed";
      system($cmd) == 0 || die "ERROR from \"$cmd\"\n\n";
      if (-s "${track}DbSnp.bed" &&
          ($vType eq "InDel" || $subset =~ /Nonsyn/ || $subset =~ /Syn/ ||
           $subset eq "RegMotifHighInfo")) {
        $shortLabel =~ s/Fxd/FxS/;
        $longLabel =~ s/Fixed/Fixed+dbSNP/;
        print "        track dhcHumDerDenAnc$subset${fType}DbSnp\n";
        print "        parent dhcHumDerDenAnc$view$isOff\n";
        print "        subGroups view=$view subset=$subsetTweaked freq=${fType}DbSnp\n";
        print "        shortLabel $shortLabel\n";
        print "        longLabel $longLabel\n";
        print "        color $color\n\n";
      }
    }
  }
  '_EOF_'
       # << emacs
      chmod a+x tsvToBedAndTrackDb.pl
      foreach f (*/Genome_VEP_*.tsv)
          ./tsvToBedAndTrackDb.pl $f
          if ($status != 0) break
      end
      # Check input and output file counts:
      ls -1 */G*_highfreq.tsv | wc -l
  #27
      ls -1 *HighFreq.bed | wc -l
  #27
      ls -1 */G*_fixed.tsv | wc -l
  #27
      ls -1 *Fixed.bed | wc -l
  #27
      ls -1 *FixedDbSnp.bed | wc -l
  #27
      # 54 inputs, 81 outputs because Fixed was split into Fixed and FixedDbSnp
  
      # Combine SNCs and InDels:
      foreach indel (dhcHumDerDenAncInDel*.bed)
        set snc = `echo $indel | sed -e 's/InDel/SNC/;'`
        set both = `echo $indel | sed -e 's/InDel//;'`
        if (! -e $snc) then
          mv $indel $both
        endif
      end
      foreach snc (dhcHumDerDenAncSNC*.bed)
        set indel = `echo $snc | sed -e 's/SNC/InDel/;'`
        set both = `echo $snc | sed -e 's/SNC//;'`
        if (-e $indel) then
          sort -k1,1 -k2n,2n $snc $indel > $both
          rm $snc $indel
        else
          mv $snc $both
        endif
      end
  
      # bedToBigBed:
      foreach f (dhcHumDerDenAnc*.bed)
        if (-s $f) then
          echo $f
          bedToBigBed -verbose=0 -tab -type=bed4+19 -as=$HOME/kent/src/hg/lib/dhcHumDerDenAnc.as \
            $f /hive/data/genomes/hg19/chrom.sizes $f:r.bb
          if ($status != 0) break
        else
          echo "Skipping $f (zero size)"
        endif
      end
  #Skipping dhcHumDerDenAncCcdsInFrameNonsynFixedDbSnp.bed (zero size)
  #Skipping dhcHumDerDenAncCcdsInFrameNonsynHighFreq.bed (zero size)
  #Skipping dhcHumDerDenAncInFrameNonsynFixedDbSnp.bed (zero size)
  #Skipping dhcHumDerDenAncInFrameNonsynHighFreq.bed (zero size)
      # Check bigBed file count:
      ls -1 *.bed | wc -l
  #54
      ls -1 *.bb | wc -l
  #50
      # We skipped 4 empty bed files, so that's OK.
  
      # Install in /gbdb and load up.
      mkdir /gbdb/hg19/dhcHumDerDenAnc
      ln -s `pwd`/dhcHumDerDenAnc*.bb /gbdb/hg19/dhcHumDerDenAnc/
      foreach f (dhcHumDerDenAnc*.bb)
        hgBbiDbLink hg19 $f:r /gbdb/hg19/dhcHumDerDenAnc/$f
      end
  
  
  #########################################################################
  # recip best for mm10 (DONE - 2012-09-14 - Hiram)
      # see also: redmine issue 9089
      cd /hive/data/genomes/hg19/bed/lastzMm10.2012-03-07
      time doRecipBest.pl -buildDir=`pwd` hg19 mm10 > rbest.log 2>&1
      #   real    157m16.369s
  
  #########################################################################
  # NCBI patch 10 (DONE - 2012-09-26 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch10
      cd /hive/data/genomes/hg19/bed/additionalSequence/patch10
      rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p10/ ./
      # the scripts from patch9 were modified slightly to update and fix some
      #	of the new names in this patch10
      cp ../patch9/gatherNames.pl .
      ./gatherNames.pl . > ucscNames.patch10.txt
      # examine the names for sanity:
      awk '{print $NF}' ucscNames.patch10.txt | sort
      # and they should not be longer than 31 characters:
      awk '{print $NF}' ucscNames.patch10.txt | sort | awk '{print length($0)}' \
          | sort -n | tail
      cp -p ../patch9/mkTables.pl .
      ./mkTables.pl  patches.chrom.sizes ucscNames.patch10.txt PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz
      # output to stdout is the contents of alt.scaf.agp.gz
      # constructs files: ctgPos.txt chromInfo.txt gold.txt and gap.txt
      cp -p ../patch9/mkCtgPos2.pl .
      ./mkCtgPos2.pl ucscNames.patch10.txt patches.chrom.sizes > ctgPos2.txt
      cp -p ../patch9/mkHapLocate.pl .
      ./mkHapLocate.pl ctgPos.txt \
  	PATCHES/alt_scaffolds/alt_scaffold_placement.txt \
  	> haplotypeLocations.bed
      cp -p haplotypeLocations.bed altSequence.bed
      ln -s ../patch2/before.patch2.hapLoc.bed hg19.hapLoc.bed
      awk '{printf "%s\t%d\t%d\t%s\t500\t+\t%d\t%d\t32,32,190\n", $2,$3,$4,$5,$3,$4}' \
  hg19.hapLoc.bed >> altSequence.bed
  
      # a new script for patch10
      cp -p ../patch9/mkFasta.pl .
      ./mkFasta.pl ucscNames.patch10.txt > hg19.patch10.fa
      # the build of hg19Patch10 can be seen in hg19Patch10.txt
  
      egrep -v "32,32,190" altSequence.bed  \
  	| awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \
  	    > altSeqPatchesP10.tab
      egrep "32,32,190" altSequence.bed  \
  	| awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \
  	    | grep -v "^chrM_rCRS" > altSeqHaplotypesP10.tab
    # verify only one line was lost (the chrM_rCRS entry filtered out
    # above): 112 + 80 = 192 vs. 193 in altSequence.bed
      wc -l altSeqPatchesP10.tab altSeqHaplotypesP10.tab
      #   112 altSeqPatchesP10.tab
      #   80 altSeqHaplotypesP10.tab
      #   192 total
  
      wc -l altSequence.bed
      #	193 altSequence.bed
      hgLoadBed hg19 altSeqHaplotypesP10 altSeqHaplotypesP10.tab
      #	Read 80 elements of size 6 from altSeqHaplotypesP10.tab
      hgLoadBed hg19 altSeqPatchesP10 altSeqPatchesP10.tab
      #	Read 112 elements of size 6 from altSeqPatchesP10.tab
  
      #    these tables are part of human/hg19/altSeqComposite10.ra
      #  Check the chrom coverage for the altSeqComposite10.ra listing:
      cut -f1 altSequence.bed | sort -u | xargs echo
  # chr1 chr10 chr11 chr12 chr13 chr15 chr16 chr17 chr18 chr19 chr2 chr20 chr21 chr22 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrM chrM_rCRS chrX
  
  ##############################################################################
  # lastz Lamprey petMar2 (DONE - 2012-10-17 - Hiram)
      # establish a screen to control this job with a name to indicate what it is
      screen -S petMar2
      mkdir /hive/data/genomes/hg19/bed/lastzPetMar2.2012-10-17
      cd /hive/data/genomes/hg19/bed/lastzPetMar2.2012-10-17
  
      cat << '_EOF_' > DEF
  # Human vs. Lamprey
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: Lamprey PetMar2
  SEQ2_DIR=/hive/data/genomes/petMar2/petMar2.2bit
  SEQ2_LEN=/hive/data/genomes/petMar2/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=60
  
  BASE=/hive/data/genomes/hg19/bed/lastzPetMar2.2012-10-17
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
      #	number of jobs, 50,000 to something under 100,000
      # when not present, SEQ2_LIMIT is a default 100
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
      #   real    76m34.446s
  
      cat fb.hg19.chainPetMar2Link.txt
      #   30305028 bases of 2897316137 (1.046%) in intersection
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/hg19/bed
      ln -s lastzPetMar2.2012-10-17 lastz.petMar2
  
      #	and for the swap
      mkdir /hive/data/genomes/petMar2/bed/blastz.hg19.swap
      cd /hive/data/genomes/petMar2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzPetMar2.2012-10-17/DEF \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #   real    15m22.099s
      cat  fb.petMar2.chainHg19Link.txt
      #	21515660 bases of 647368134 (3.324%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/petMar2/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # lastz White Rhino cerSim1 (DONE - 2012-10-17 - Hiram)
      # establish a screen to control this job with a name to indicate what it is
      screen -S cerSim1
      mkdir /hive/data/genomes/hg19/bed/lastzCerSim1.2012-10-17
      cd /hive/data/genomes/hg19/bed/lastzCerSim1.2012-10-17
  
      cat << '_EOF_' > DEF
  # Human vs. White Rhino
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: White Rhino CerSim1
  SEQ2_DIR=/hive/data/genomes/cerSim1/cerSim1.2bit
  SEQ2_LEN=/hive/data/genomes/cerSim1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=60
  
  BASE=/hive/data/genomes/hg19/bed/lastzCerSim1.2012-10-17
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
      #	number of jobs, 50,000 to something under 100,000
      # when not present, SEQ2_LIMIT is a default 100
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
      #   real    1272m58.952s
      #   problem in chaining chr19, running it manually on hgwdev:
      cd /hive/data/genomes/hg19/bed/lastzCerSim1.2012-10-17/axtChain/run
  export maxMem=83886080
  ulimit -S -m $maxMem -v $maxMem
  ulimit -a
  time ./chain.csh hg19.2bit:chr19: chain/hg19.2bit:chr19:.chain
      #   real    147m46.959s
      # very impressive:
      #   -rw-rw-r-- 1 707886253 Oct 18 13:03 hg19.2bit:chr19:.chain
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -continue=chainMerge `pwd`/DEF \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 &
      #   real    99m4.624s
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -syntenicNet -continue=syntenicNet -stop=syntenicNet `pwd`/DEF \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1
      #   real     60m37.958s
  
      cat fb.hg19.chainCerSim1Link.txt
      #   1683424317 bases of 2897316137 (58.103%) in intersection
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/hg19/bed
      ln -s lastzCerSim1.2012-10-17 lastz.cerSim1
  
      #	and for the swap
      mkdir /hive/data/genomes/cerSim1/bed/blastz.hg19.swap
      cd /hive/data/genomes/cerSim1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzCerSim1.2012-10-17/DEF \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -swap -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
      #   real     100m36s
      cat  fb.cerSim1.chainHg19Link.txt
      #	1637961407 bases of 2366858012 (69.204%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/cerSim1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # construct liftOver to hg17 (DONE - 2012-11-08 - Hiram)
      screen -S hg17	# manage this longish running job in a screen
      mkdir /hive/data/genomes/hg19/bed/blat.hg17.2012-11-08
      cd /hive/data/genomes/hg19/bed/blat.hg17.2012-11-08
      # check it with -debug first to see if it is going to work:
      time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
  	-ooc=/hive/data/genomes/hg19/11.ooc \
  	-debug -dbHost=hgwdev -workhorse=hgwdev hg19 hg17 > do.log 2>&1
      # if that is OK, then run it:
      time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
  	-ooc=/hive/data/genomes/hg19/11.ooc \
  	-dbHost=hgwdev -workhorse=hgwdev hg19 hg17 > do.log 2>&1
      #	real    333m16.756s
  
      # verify this file exists:
      #	/gbdb/hg19/liftOver/hg19ToHg17.over.chain.gz
      # and try out the conversion on genome-test from hg19 to hg17
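    # (Command-line spot check, a sketch with an arbitrary test region:)
    #   printf 'chr1\t1000000\t1000100\n' > test.hg19.bed
    #   liftOver test.hg19.bed /gbdb/hg19/liftOver/hg19ToHg17.over.chain.gz \
    #       test.hg17.bed test.unmapped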
  
  #########################################################################
  # QPCR PRIMERS (DONE - 2012-12-10 - Chin)
  # The track name is changed to "qPCR Primers"
  # Reload table with new track_mouse.BED (2013-01-28)
      # Download
      mkdir /hive/data/outside/Weizmann/qPcrPrimers
      cd /hive/data/outside/Weizmann/qPcrPrimers
      wget http://www.weizmann.ac.il/complex/compphys/software/Amit/primers/human/track_human.BED
      mkdir -p /hive/data/genomes/hg19/bed/qPcrPrimers
      cat track_human.BED | grep -v track \
         > /hive/data/genomes/hg19/bed/qPcrPrimers/qPcrPrimers_hg19.bed
      cd /hive/data/genomes/hg19/bed/qPcrPrimers
      hgLoadBed -bedDetail -tab -renameSqlTable \
        -sqlTable=$HOME/kent/src/hg/lib/bedDetail.sql \
        hg19 qPcrPrimers qPcrPrimers_hg19.bed
      # Read 534301 elements of size 14 from qPcrPrimers_hg19.bed
      # Sorted
      # Creating table definition for qPcrPrimers
      # Saving bed.tab
      # Loading hg19
  
    # NULL the description column
      hgsql hg19 -ne "UPDATE qPcrPrimers SET description = NULL;"
  
  ############################################################################
  # coriellDelDup track (DONE - 2013-01-03 - Hiram)
      # data came in via email, files deposited to RedMine issue 6530
      mkdir /hive/data/genomes/hg19/bed/coriell
      cd /hive/data/genomes/hg19/bed/coriell
      # output the XLS files as tab delimited files:
  # -rw-rw-r-- 1   4544 Dec  4 10:04 coriellReanalyzed.tab
  # -rw-rw-r-- 1 119331 Dec  4 10:05 coriellDetailsHg19.tab
      # convert that .tab file to a bed 9 + file:
  grep -v "^name" coriellDetailsHg19.tab \
  | sed -e 's/B-Lymphocyte/B_Lymphocyte/; s/-derived//;' \
      | awk -F'\t' '{
  rgb="200,0,0";
  if ($5 == 0) { rgb="255,0,0"; }
  if ($5 == 1) { rgb="170,68,0"; }
  if ($5 == 2) { rgb="0,0,0"; }
  if ($5 == 3) { rgb="0,68,170"; }
  if ($5 == 4) { rgb="0,0,255"; }
  gsub(" ","_",$6);
  gsub("\"","",$7);
  gsub("\"","",$8);
  printf "%s\t%d\t%d\t%s\t%d\t+\t%d\t%d\t%s\t%d\t%s\t%s\t%s\n", $2,$3,$4,$1,$5*100,$3,$4,rgb,$5,$6,$7,$8}' \
          | sort -k1,1 -k2,2n > coriellDetailsHg19.bed9
      # added the coriellDelDup.as  coriellDelDup.sql files to the source tree
      # in src/hg/lib/
      # loading the table:
      hgLoadBed -tab -type=bed9+ \
          -sqlTable=$HOME/kent/src/hg/lib/coriellDelDup.sql -bedDetail hg19 \
          coriellDelDup coriellDetailsHg19.bed9
      # add the description of the table to tableDescriptions in order to
      # see the functioning of the hgc clicks, the nightly tableDescriptions
      # build will pick this up from the source tree:
  /bin/echo -n -e 'DELETE FROM tableDescriptions where tableName="coriellDelDup"; ' > tableDescriptions.entry.sql
  /bin/echo -n -e "INSERT INTO tableDescriptions (tableName, autoSqlDef, gbdAnchor) values ('coriellDelDup', '" >> tableDescriptions.entry.sql
  cat $HOME/kent/src/hg/lib/coriellDelDup.as >> tableDescriptions.entry.sql
  /bin/echo -e "', '');" >> tableDescriptions.entry.sql
      hgsql hg19 < tableDescriptions.entry.sql
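    # (Optional check, not in the original log: confirm the entry landed.)
    hgsql hg19 -e 'SELECT tableName FROM tableDescriptions WHERE tableName="coriellDelDup";'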
  
  ############################################################################
  # ENCODE Regulation track -- make doc has been moved to encodeRegHg19.txt
  
  ############################################################################
  # affyCytoScan track (DONE - 2013-01-15 - kuhn )
  
  # for affyCytoScanHD chipset
# aamp left no record of what he did.
# This reconstructs it as best I can figure,
# followed by what I did to finish it up.
  
  cd /hive/data/genomes/hg19/bed
  mkdir affyCytoScan
  cd affyCytoScan
  
  # files from Carl Dowds at affy:
  # CytoScanHD_ProbeList_CN.zip
  # CytoScanHD_ProbeList_SNP.zip
  
  # It looks like the files were catenated into a file called both.bed
  # Columns were added to get to coloring field
  #   score=1000, strand=+ thickStart=chromStart thickEnd=chromEnd
  #   reserved=<colors>
  # Colors are 204 for blue (C- probes) and 3368499 for green (S- probes)
  
  # Then he loaded into db, picking up a bin column.
  # File bed.tab has one color for each source file, but some
  #   probes are in both sets
  
  commTrio.csh CytoScanHD_ProbeList_CN.bed CytoScanHD_ProbeList_SNP.bed rm
  # 1953247 CytoScanHD_ProbeList_CN.bed.Only
  #   53144 CytoScanHD_ProbeList_SNP.bed.Only
#  743304 CytoScanHD_ProbeList_CN.bed.CytoScanHD_ProbeList_SNP.bed.Only  # in both sets
  
  #   (SNP-file probes are all named "S-")
  #   (CN -file probes are named "C-" and "S-", but the S- are all in the SNP
  #   file, too)
  
  ###### kuhn hereafter
  # The SNP probes in the CN file are redundant, so I dropped them
  #   by making everything one color, then uniq the whole file.
  
  hgsql -N -e "SELECT * FROM affyCytoScan" hg19 > hg19.affyCytoScan
  cat hg19.affyCytoScan | sed 's/204$/3368499/' > hg19.cytoScan.oneColor
  cat hg19.cytoScan.oneColor | sort -u > hg19.cytoScan.oneColor.uniq
  
  wc -l *oneColor*
  # 3492997 hg19.cytoScan.oneColor
  # 2749693 hg19.cytoScan.oneColor.uniq
  
# Lost exactly the number of probes that were "S-" type
#   but were colored the same as the others in the "C-" file:
#       3492997 - 2749693 = 743304 duplicates removed
  
  # remove bin for loading
  # (don't know if this is necessary, maybe it'd load with bin in place)
cat hg19.cytoScan.oneColor.uniq \
   | awk '{print $2, $3, $4, $5, $6, $7, $8, $9, $10}' \
   > hg19.cytoScan.oneColor.noBin
  # load
  hgLoadBed -type=bed9 hg19 affyCytoScanNew hg19.cytoScan.oneColor.noBin
  
  # Set colors per Carl Dowds:
  mysql> UPDATE affyCytoScanNew SET reserved = 3308830 WHERE name LIKE "C-%";
  mysql> UPDATE affyCytoScanNew SET reserved = 8913032 WHERE name LIKE "S-%";
  
  # Checked new track in Browser by making temporary block for it in trackDb.ra
  # Moved into place:
  
  mysql> RENAME TABLE affyCytoScan    TO affyCytoScanAndy;
  mysql> RENAME TABLE affyCytoScanNew TO affyCytoScan;
  
  ############################################################################
  # Chimp Lastz run (DONE 1/29/13 angie)
      mkdir /hive/data/genomes/hg19/bed/lastzPanTro4.2013-01-25
      cd /hive/data/genomes/hg19/bed/lastzPanTro4.2013-01-25
      cat << '_EOF_' > DEF
  # human vs chimp
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  BLASTZ_O=600
  BLASTZ_E=150
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: Chimp PanTro4
  SEQ2_DIR=/hive/data/genomes/panTro4/panTro4.2bit
  SEQ2_LEN=/hive/data/genomes/panTro4/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=200
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzPanTro4.2013-01-25
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      screen # use screen to manage this long-running job
      ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
          -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
          -syntenicNet >& do.log & tail -f do.log
  
      cat fb.hg19.chainPanTro4Link.txt
  #2760526412 bases of 2897316137 (95.279%) in intersection
  
      # filter with doRecipBest.pl
      doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 panTro4 >& rbest.log & tail -f rbest.log
  
      # running the swap
      mkdir /hive/data/genomes/panTro4/bed/blastz.hg19.swap
      cd /hive/data/genomes/panTro4/bed/blastz.hg19.swap
      ~/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzPanTro4.2013-01-25/DEF \
          -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet >& swap.log & tail -f swap.log
      cat fb.panTro4.chainHg19Link.txt
  #2773561724 bases of 2902338967 (95.563%) in intersection
  
  
  #############################################################################
  # LASTZ Gibbon NomLeu3 (DONE - Tue Mar 26 18:52:03 PDT 2013 - Pauline)
  
      mkdir /hive/data/genomes/hg19/bed/lastzNomLeu3.2013-03-06
      cd /hive/data/genomes/hg19/bed/lastzNomLeu3.2013-03-06
  
      cat << '_EOF_' > DEF
  # human vs gibbon
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  BLASTZ_O=600
  BLASTZ_E=150
  # other parameters on advice from Webb
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Gibbon nomLeu3
  SEQ2_DIR=/hive/data/genomes/nomLeu3/nomLeu3.2bit
  SEQ2_LEN=/hive/data/genomes/nomLeu3/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=100
  
  BASE=/hive/data/genomes/hg19/bed/lastzNomLeu3.2013-03-06
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #   establish a screen to control this job
      screen -S lastz
  
      /usr/bin/time -p nice doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -syntenicNet \
          -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          >& do.log &
      #   real    724m15s
      cat fb.hg19.chainNomLeu3Link.txt
      #   2542790081 bases of 2897316137 (87.764%) in intersection
  
      cd /hive/data/genomes/hg19/bed
      ln -s lastzNomLeu3.2013-03-06 lastz.nomLeu3
  
      #   running the swap - DONE - 2013-03-26
      mkdir /hive/data/genomes/nomLeu3/bed/blastz.hg19.swap
      cd /hive/data/genomes/nomLeu3/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzNomLeu3.2013-03-06/DEF \
          -swap -syntenicNet \
          -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          > swap.log 2>&1 &
      #   real   138m48s
      cat fb.nomLeu3.chainHg19Link.txt
      #   2479386532 bases of 2756609047 (89.943%) in intersection
  
  ##############################################################################
  # DBNSFP (DONE 3/22/13 angie)
      # The database of non-synonymous functional predictions (dbNSFP) contains
      # precomputed scores from a wide variety of tools on all non-synon variants
      # of all genomic positions in the CDS of Gencode transcripts.  Pick out
      # some interesting subsets of its 52 columns and translate into bigBed and
      # bigWig files that can be joined with users' variants by the Variant
      # Annotation Integrator (#6152).
      screen -S dbNSFP -t dbNSFP
      mkdir /hive/data/genomes/hg19/bed/dbNSFP2.0
      cd /hive/data/genomes/hg19/bed/dbNSFP2.0
      wget http://dbnsfp.houstonbioinformatics.org/dbNSFPzip/dbNSFP2.0.zip
      unzip dbNSFP2.0.zip
  
      # Run a perl script that digests the 52-column input files into several independent
      # bed3+ files:
      ~/kent/src/hg/utils/dbNsfpToBed.pl dbNSFP*_variant.chr*
      # There are a bunch of mild warnings like this:
  #FYI: >3 rows (5) for {chr1, 1221327, ENST00000379110}; removing less-informative duplicates.
      # The script has a workaround and follow-up error check, but it would be
      # good to report the cases to dbNSFP (see script for details).
  
      wc -l *.bed
  #    2466469 dbNsfpGerpNr.bed
  #   22275488 dbNsfpGerpRs.bed
  #     231348 dbNsfpInterPro.bed
  #   24810552 dbNsfpLrt.bed
  #   28654727 dbNsfpMutationAssessor.bed
  #   26188935 dbNsfpMutationTaster.bed
  #   27629124 dbNsfpPolyPhen2.bed
  #   31826285 dbNsfpSeqChange.bed
  #   28302771 dbNsfpSift.bed
  #     474262 dbNsfpUniProt.bed
  
      # Are all subsets present on all chroms?
      foreach f (*.bed)
        echo $f
        cut -f 1 $f | uniq -c > $f:r.chromHist
      end
      wc -l *.chromHist
  #  24 dbNsfpGerpNr.chromHist
  #  24 dbNsfpGerpRs.chromHist
  #  24 dbNsfpInterPro.chromHist
  #  24 dbNsfpLrt.chromHist
  #  24 dbNsfpMutationAssessor.chromHist
  #  24 dbNsfpMutationTaster.chromHist
  #  24 dbNsfpPolyPhen2.chromHist
  #  24 dbNsfpSeqChange.chromHist
  #  23 dbNsfpSift.chromHist
  #  24 dbNsfpUniProt.chromHist
      # -- nope, dbNsfpSift has no chrY.  SIFT limitation??
  
      # Convert Gerp scores to bigWig
      bedGraphToBigWig dbNsfpGerpNr.bed /hive/data/genomes/hg19/chrom.sizes dbNsfpGerpNr.bw
      bedGraphToBigWig dbNsfpGerpRs.bed /hive/data/genomes/hg19/chrom.sizes dbNsfpGerpRs.bw
  
      # Convert remaining files to bigBed
      foreach f (dbNsfp[^G]*.bed)
        set track = $f:r
        echo $track
        set autoSql = ~/kent/src/hg/lib/$track.as
        bedToBigBed -type=bed3+ -as=$autoSql -tab \
          $f /hive/data/genomes/hg19/chrom.sizes $track.bb
      end
  
      # Load database tables
      mkdir /gbdb/hg19/dbNsfp
      foreach f (dbNsfp*.{bb,bw})
        ln -s `pwd`/$f /gbdb/hg19/dbNsfp/
        hgBbiDbLink hg19 $f:r /gbdb/hg19/dbNsfp/$f
      end
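    # (Spot check, a sketch: each hgBbiDbLink table holds just the /gbdb
    # path to its bigBed/bigWig file.)
    hgsql hg19 -e 'SELECT * FROM dbNsfpSift;'
    #  expect a single fileName row: /gbdb/hg19/dbNsfp/dbNsfpSift.bb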
  
      # Clean up: remove large files that can be re-unzipped or regenerated:
      rm -f search_db* dbNS*_variant.chr* dbNs*.bed
  
  ##############################################################################
  # GAD UPDATE (DONE 3/25/13 angie)
      mkdir /hive/data/genomes/hg19/bed/gad
      set today = `date +%Y_%m_%d`
      mkdir /hive/data/genomes/hg19/bed/gad/$today
      cd /hive/data/genomes/hg19/bed/gad/$today
  
      # Download requires a simple registration process: enter name and email on
      #   http://geneticassociationdb.nih.gov/cgi-bin/download.cgi
      # and then you will receive an automated email with a download link.
      wget <downloadUrlFromEmail>
      unzip -j all.zip
  
      # Examine the 43 column labels of all.txt:
      tail -n +3 all.txt | head -1 \
      | perl -wne 'chomp; @w = split("\t"); \
                   $i = 1; foreach $w (@w) { print "$i\t$w[$i-1]\n"; $i++; }'
  
      # hg/lib/gadAll.sql has 45 columns -- apparently two have been dropped
      # from the latest all.txt.  Insert empty strings for those two.
      # NOTE: the first run of this generated a bunch of 'uninitialized value'
      # warnings; there were two stray linebreaks that I manually edited out.
      tail -n +4 all.txt \
      | perl -wne 's/\r//g; @w = split(/\t/, $_, -1); \
                   next if ($w[6] eq "" || $w[9] eq "" || $w[10] eq ""); \
                   $w[9] =~ s/,//g;  $w[10] =~ s/,//g; \
                   print join("\t", @w[0..29], "", "", @w[30..$#w]);' \
      | hgLoadSqlTab hg19 gadAll ~/kent/src/hg/lib/gadAll.sql stdin
  
      # Coords given in gadAll are a little wacky, so we ignore them.
      # Use Fan's program to associate geneSymbols with genomic coords from
      # preferred sources (first ensCanonical, then refGene, then kgAlias;
      # *** NOTE FOR NEXT TIME: gadPos now also looks in a couple Gencode V14
      #     tables if they exist.  If they don't anymore, you should look for
      #     suitable replacements if there are a lot of unfound IDs. ***)
      gadPos hg19 stdout | sort -k1,1 -k2n,2n -k4,4 -u > gad.tab
  #Found in ensCanonical: 7758
  #Found in refGene: 10
  #Found in kgAlias: 193
  #Found in Gencode: 178
  #Not found: 94
  
      # use -nobin option to ensure display order is according to genomic position
      # -- table is very small so performance is fine
      hgLoadBed -nobin hg19 gad gad.tab
  #Read 9156 elements of size 4 from gad.tab
  
  #########################################################################
  # UPDATE COSMIC TRACK - v64 (DONE - 2013-04-17 - Hiram)
      # take a look at:
      # ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/
      # to see what the new version file name is, then:
  
      cd /hive/data/genomes/hg19/bed/cosmic
  
      time ~/kent/src/hg/utils/automation/loadCosmic.pl -dryRun hg19 \
  ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v64_260313.csv.gz
  #       New length: 677957
  #       Old length: 616299
  #       Percent bed overlap with previous version: 100.00%
  #       Number of deleted IDs: 1
  #       Number of added IDs: 61659
  
  #       real    1m4.240s
  
      # that created files in: /hive/data/genomes/hg19/bed/cosmic/v64/
  
      # then:
      cd /hive/data/genomes/hg19/bed/cosmic/v64/
      time ~/kent/src/hg/utils/automation/loadCosmic.pl hg19 \
  ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v64_260313.csv.gz \
          > do.log 2>&1
      #   real    0m27.062s
      #   Read 677957 elements of size 4 from cosmic.bed
  
  #########################################################################
  # UPDATE COSMIC TRACK - v65 (DONE - 2013-06-10 - Hiram)
      # take a look at:
      # ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/
      # to see what the new version file name is, then:
  
      cd /hive/data/genomes/hg19/bed/cosmic
  
      time ~/kent/src/hg/utils/automation/loadCosmic.pl -dryRun hg19 \
  ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v65_100613.csv.gz
  #	New length: 868893
  #	Old length: 677957
  #	Percent bed overlap with previous version: 77.53%
  #	Number of deleted IDs: 163277
  #	Number of added IDs: 354213
  
  #       real    0m42.584s
  
      # that created files in: /hive/data/genomes/hg19/bed/cosmic/v65/
  
      # then:
      cd /hive/data/genomes/hg19/bed/cosmic/v65/
      time ~/kent/src/hg/utils/automation/loadCosmic.pl hg19 \
  ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v65_100613.csv.gz \
          > do.log 2>&1
      #   real    0m39.941s
      #   Read 868893 elements of size 4 from cosmic.bed
  
  #########################################################################
  # UPDATE COSMIC TRACK - v66 (DONE - 2013-08-15 - Hiram)
      # take a look at:
      # ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/
      # to see what the new version file name is, then:
  
      cd /hive/data/genomes/hg19/bed/cosmic
  
      time ~/kent/src/hg/utils/automation/loadCosmic.pl -dryRun hg19 \
  ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v66_250713.csv.gz
  #	New length: 1211379
  #	Old length: 868893
  #	Percent bed overlap with previous version: 100.00%
  #	Number of deleted IDs: 0
  #	Number of added IDs: 342486
  
  #       real    1m52.911s
  
      # that created files in: /hive/data/genomes/hg19/bed/cosmic/v66/
  
      # then:
      cd /hive/data/genomes/hg19/bed/cosmic/v66/
      time ~/kent/src/hg/utils/automation/loadCosmic.pl hg19 \
  ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v66_250713.csv.gz \
          > do.log 2>&1
      #   real    0m56.363s
      #   Read 1211379 elements of size 4 from cosmic.bed
  
  ############################################################################
  # UPDATE COSMIC TRACK - v67 (DONE - 2014-01-21 - Hiram)
      # take a look at:
      # ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/
      # to see what the new version file name is, then:
  
      cd /hive/data/genomes/hg19/bed/cosmic
  
      time ~/kent/src/hg/utils/automation/loadCosmic.pl -dryRun hg19 \
  ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v67_031213.csv.gz
  #	New length: 1268325
  #	Old length: 1211379
  #	Percent bed overlap with previous version: 99.99%
  #	Number of deleted IDs: 409
  #	Number of added IDs: 57355
  
  #	real    1m4.883s
  
      # that created files in: /hive/data/genomes/hg19/bed/cosmic/v67/
  
      # then:
      cd /hive/data/genomes/hg19/bed/cosmic/v67/
      time ~/kent/src/hg/utils/automation/loadCosmic.pl hg19 \
  ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v67_031213.csv.gz \
          > do.log 2>&1
      #   real    0m57.187s
      #   Read 1268325 elements of size 4 from cosmic.bed
  
  ############################################################################
  # UPDATE COSMIC TRACK - v68 (DONE - 2014-02-25 - Hiram)
      # take a look at:
      # ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/
      # to see what the new version file name is, then:
  
      cd /hive/data/genomes/hg19/bed/cosmic
  
      time ~/kent/src/hg/utils/automation/loadCosmic.pl -dryRun hg19 \
  ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v68.csv.gz
  #    New length: 1287203
  #    Old length: 1268325
  #    Percent bed overlap with previous version: 100.00%
  #    Number of deleted IDs: 197
  #    Number of added IDs: 19075
  
  #    real    1m17.976s
  
      # that created files in: /hive/data/genomes/hg19/bed/cosmic/v68/
  
      # then:
      cd /hive/data/genomes/hg19/bed/cosmic/v68/
      time ~/kent/src/hg/utils/automation/loadCosmic.pl hg19 \
  ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v68.csv.gz \
          > do.log 2>&1
      #   real    0m59.645s
      #   Read 1287203 elements of size 4 from cosmic.bed
  
  ############################################################################
  # SwissProt Mutations track (spMut) (2013-05-09 - Max)
  cd ~/kent/src/hg/utils/uniprotMutations/
  # make a psl file to lift from uniprot to hg19
  # we align from uniprot to refseq, then from refseq to hg19, to handle
  # introns, based on markd's ls-snp pipeline
  makeUniProtToHg.sh
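# (for reference, a minimal sketch of the composition idea: pslMap can chain
# a uniProt->refSeq alignment through a refSeq->hg19 alignment; the .psl
# file names below are hypothetical)
#   pslMap uniProtToRefSeq.psl refSeqToHg19.psl uniProtToHg19.psl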
  # reformat the uniProt parser output (publications track) to bed, lift it and add the extra fields
  liftUniprotMut.py
  # load the resulting bigbed track into the browser
  load.sh
  
  ############################################################################
  # LOVD Variants track (2013-05-09 - Max)
# the LOVD track is fully automated ("otto"); it is updated daily with this
# command:
  cd /hive/data/outside/otto/lovd
  download.sh && check.sh && load.sh
  
  ############################################################################
  # PERSONAL GENOME VARIANTS - RELOAD (DONE 5/31/13 angie)
      # Belinda Giardine provided database dump files for all PSU Personal
      # Genome Variant subtracks, as well as an updated trackDb.ra.
      # Download all dump files and (re)load all tables to make sure we
      # are current w/PSU.
      mkdir /hive/data/genomes/hg19/bed/pgSnp/2013-05-28
      cd /hive/data/genomes/hg19/bed/pgSnp/2013-05-28
      # Complete Genomics samples w/HG* IDs:
      wget ftp://ftp.bx.psu.edu/data/bx_browser/hg19/HG\*
      # Complete Genomics samples w/NA* IDs:
      wget ftp://ftp.bx.psu.edu/data/bx_browser/hg19/NA\*
      # Older Personal Genome Variants tables, many of which Belinda already
      # loaded on hgwdev:
      wget ftp://ftp.bx.psu.edu/data/bx_browser/hg19/pg\*
      # 1000 Genomes distilled to population allele frequencies:
      wget ftp://ftp.bx.psu.edu/data/bx_browser/hg19/1000Genomes2012/\*
      # Genome of the Netherlands population-wide allele frequencies:
      wget ftp://ftp.bx.psu.edu/data/bx_browser/hg19/gonl.pgPop.gz
  
      # Now transform filenames into table/track names matching Belinda's
      # trackDb.pgSnp.ra and (re)load tables:
      set hglbOpts = (-sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
        -type=bed4+ -tab -noNameIx -allowStartEqualEnd)
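    # e.g. the name transform: HG00731.indel.pgSnp.gz -> table pgHG00731indel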
      foreach f (HG*.gz)
        set table = `echo $f:r:r | sed -re 's/\.indel/indel/; s/^/pg/;'`
        echo $table
        hgLoadBed $hglbOpts hg19 $table $f
      end
  # Read 471455 elements of size 7 from HG00731.indel.pgSnp.gz
  # Read 3419274 elements of size 7 from HG00731.pgSnp.gz
  # Read 466400 elements of size 7 from HG00732.indel.pgSnp.gz
  # Read 3457087 elements of size 7 from HG00732.pgSnp.gz
  # Read 429472 elements of size 7 from HG00733.indel.pgSnp.gz
  # Read 3407941 elements of size 7 from HG00733.pgSnp.gz
  
      # Here's where it gets tricky: several of these NA IDs need to have "CG"
      # appended to their table names because there were already pgNA* tables
      # from 1000 Genomes Pilot high-coverage trios.
      foreach f (NA*.gz)
        set table = `echo $f:r:r | sed -re 's/\.indel/indel/; s/^/pg/;'`
        if ($table == pgNA12878 || $table == pgNA12891 || $table == pgNA12892 || \
            $table == pgNA19238 || $table == pgNA19239 || $table == pgNA19240) then
          set table = $table"CG"
        endif
        echo $table
        hgLoadBed $hglbOpts hg19 $table $f
      end
  #Read 497395 elements of size 7 from NA06985.indel.pgSnp.gz
  #... (no errors)
  #Read 4058556 elements of size 7 from NA21767.pgSnp.gz
  
      # Some of the pg* files/tables are not for pgSnp tracks; skip those.
      # Also watch out for some files that do already have the bin column.
      foreach f (pg*.gz)
        set table = $f:r:r
        set skipIt = `echo $table \
          | perl -wne 'if (/(Gwas|Hgmd|PhenCode|Snpedia|Mappable)/) { \
                         print "1\n"; } else {print "0\n"; }'`
        if ($skipIt == 1) then
          echo Skipping $table
        else
          echo $table
          set numCols = `zcat $f | head -1 | perl -wne '$n = s/\t//g + 1; print "$n\n";'`
          if ($numCols == 8) then
            hgLoadBed $hglbOpts -hasBin hg19 $table $f
          else
            hgLoadBed $hglbOpts hg19 $table $f
          endif
        endif
      end
  #Read 121305 elements of size 7 from pgAbt454.txt.gz
  #...
  #Skipping pgAbtGwas
  #Skipping pgAbtHgmd
  #...
  #Read 3835591 elements of size 7 from pgYoruban3.txt.gz
  
      # 5/31/13: One more file, rename a table:
      wget ftp://ftp.bx.psu.edu/data/bx_browser/hg19/pgLucier.pgSnp.gz
      hgLoadBed $hglbOpts hg19 pgLucier pgLucier.pgSnp.gz
      hgsql hg19 -e 'drop table pgKb1Comb; rename table pgKb1 to pgKb1Comb;'
  
  
  ############################################################################
  # RETROPOSED GENES ucscRetro track VERSION 5
  # (2013-02-28 - 2013-04-18, baertsch,hartera DONE)
mkdir -p /hive/groups/gencode/pseudogenes/retroFinder/hg19.20130228
  cd /hive/groups/gencode/pseudogenes/retroFinder/hg19.20130228
  
  mkdir -p /hive/data/genomes/hg19/bed/retro/
  cd /hive/groups/gencode/pseudogenes/retroFinder/hg19.20130228
  cat << '_EOF_' > DEF
  
  RETRO_OPTIONS="-verbose=4 -minAli=0.98 -nearTop=0.005 -skipBlatMerge "
  VERSION=5
  DB=hg19
  SCORETHRESH=550
  GENOMENAME='Homo sapiens'
  GBDB=hg
  DATE=20130228
  BINDIR=/hive/users/hartera/GencodeWG/retroFinder/trunk/bin
  KENTBINDIR=/cluster/home/hartera/bin/x86_64
  MRNABASE=/hive/data/genomes/$DB/bed/mrnaBlastz.$VERSION/
  TMPMRNA=/hive/groups/gencode/pseudogenes/retroFinder/$DB.$DATE/mrnaBlastz/$DB
  TMPEST=/hive/groups/gencode/pseudogenes/retroFinder/$DB.$DATE/est/$DB
  EST=all_est
  SPLICED_EST=intronEst
  SPLIT_EST=0
  SPLIT_SPLICED_EST=0
  SCRIPT=/hive/users/hartera/GencodeWG/retroFinder/trunk/src/pipeline
  GENOME=/hive/data/genomes/
  RETRODIR=$GENOME/$DB/bed/retro
  BASE=/hive/groups/gencode/pseudogenes/retroFinder/$DB.$DATE/retro
  OUTDIR=${BASE}/${DB}.${VERSION}
  RESULT=$OUTDIR/result
  LOG=$OUTDIR/log
  OUT=$OUTDIR/out
  OVERLAPDIR=$OUTDIR/run.o
  TABLE=ucscRetroInfo$VERSION
  ORTHOTABLE=ucscRetroOrtho$VERSION
  ALIGN=ucscRetroAli$VERSION
  LOCAL=/scratch/data/$DB
  TWOBIT=$GENOME/$DB/$DB.2bit
  NIB=$LOCAL/nib
  RMSK=rmsk
  NET1=netMm10
  NET2=netCanFam3
  NET3=netRheMac3
# these two nets determine which retros are classified as ancient; use the two farthest nets
  ANCIENT1=netMm10
  ANCIENT2=netCanFam3
  GENE1=ensGene
  GENE2=refGene
  GENE3=ensGene
  CLUSTER=swarm
  SPECIES="hg19 mm10"
  ROOTDIR="~/public_html/retro/hg19Feb13"
  WEBROOT=$ROOTDIR/retro.$VERSION/
  WEBSERVER=http://hgwdev-hartera.soe.ucsc.edu
  EXPDIR=exp
  GENEPFAM=ensGene
  PFAM=ensToPfam
  PFAMIDFIELD=name
  PFAMDOMAIN=value
  ALTSPLICE=sibTxGraph
  SPLITBYAGE=$SCRIPT/splitRetrosByAge
  PDB=proteins121210
  ARRAY=gnfAtlas2
  AFFYPROBE="affyU133A,affyGnf1h"
  ARRAYMEDIAN=hgFixed.gnfHumanAtlas2Median
  ARRAYRATIO=hgFixed.gnfHumanAtlas2AllRatio
  ARRAYABS=hgFixed.gnfHumanAtlas2All
  ARRAYEXP=hgFixed.gnfHumanAtlas2MedianExps
  ARRAYEXPALL=hgFixed.gnfHumanAtlas2AllExps
  ARRAYLOOKUP=ensToGnfAtlas2
  ARRAYPSLS="/hive/data/genomes/hg19/bed/geneAtlas2/affyU133A.psl /hive/data/genomes/hg19/bed/geneAtlas2/affyGnf1h.psl"
  '_EOF_'
      # << happy emacs
  chmod +x DEF
  mkdir mrnaBlastz
  cd mrnaBlastz
  cp ../DEF .
  # Create S1.len:
  cp /hive/data/genomes/hg19/chrom.sizes S1.len
# Edit S1.len to remove chrM and the random chroms, then copy it over to the
# hg19 genomes directory
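# (the filtering can be scripted; a sketch, with the exclusion pattern
# assumed:)
#   egrep -v 'chrM|random|hap' /hive/data/genomes/hg19/chrom.sizes > S1.len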
  mkdir -p /hive/data/genomes/hg19/bed/mrnaBlastz.5
  cp S1.len /hive/data/genomes/hg19/bed/mrnaBlastz.5
  
  screen
  # Run steps 1 to 6 of RetroFinder pipeline from scripts in CCDS SVN source tree:
  retroFinder/trunk/src/pipeline/ucscStep1.sh DEF
  # check cluster job on swarm
  retroFinder/trunk/src/pipeline/ucscStep2.sh DEF
  retroFinder/trunk/src/pipeline/ucscStep3.sh DEF
  #check cluster job
  retroFinder/trunk/src/pipeline/ucscStep4.sh DEF
  #check cluster job
      # Load the track
  retroFinder/trunk/src/pipeline/ucscStep5.sh DEF
  cd /hive/groups/gencode/pseudogenes/retroFinder/hg19.20130228/retro/hg19.5
  retroFinder/trunk/src/pipeline/filterMrna.sh DEF
  retroFinder/trunk/src/pipeline/filterEst.sh DEF
  retroFinder/trunk/src/pipeline/analyseExpress.sh DEF
  cd /hive/groups/gencode/pseudogenes/retroFinder/hg19.20130228/mrnaBlastz
  retroFinder/trunk/src/pipeline/ucscStep6.sh DEF
# added ucscRetroAli to trackDb.ra
  # copied
  # /hive/groups/gencode/pseudogenes/retroFinder/hg19.20130228/retro/hg19.5/trackDb.retro
  # entry to kent/src/hg/makeDb/trackDb/human/hg19/trackDb.ra and edited it
  # to add the version number and date.
  # Scripts copied ucscRetroAli5.psl, ucscRetroInfo5.bed and ucscRetroCds5.tab
  # to /hive/data/genomes/hg19/bed/retro/
  
  ############################################################################
  # LASTZ Tenrec EchTel2 (DONE - 2013-06-12 - Hiram)
      screen -S hg19EchTel2  # use screen to manage the long running job
      mkdir /hive/data/genomes/hg19/bed/lastzEchTel2.2013-06-12
      cd /hive/data/genomes/hg19/bed/lastzEchTel2.2013-06-12
  
      cat << '_EOF_' > DEF
  # Human vs. Tenrec
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: Tenrec
  SEQ2_DIR=/hive/data/genomes/echTel2/echTel2.2bit
  SEQ2_LEN=/hive/data/genomes/echTel2/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=100
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzEchTel2.2013-06-12
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
      # real    917m59.931s
    # failed during chaining on chr19; finished the one job on hgwdev
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-continue=chainMerge -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 &
      #  real    70m53.043s
  
      cat fb.hg19.chainEchTel2Link.txt
      #	873117393 bases of 2897316137 (30.135%) in intersection
  
    # broke down in the download step due to errors in hgAddLiftOverChain;
    # finished manually, then continuing:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-continue=cleanup -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > cleanupSynNet.log 2>&1 &
      # real    25m46.973s
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/hg19/bed
      ln -s lastzEchTel2.2013-06-12 lastz.echTel2
  
      # better to have reciprocal best for this one since it is low coverage:
      cd /hive/data/genomes/hg19/bed/lastzEchTel2.2013-06-12
      time doRecipBest.pl hg19 echTel2 -buildDir=`pwd` -workhorse=hgwdev \
  	> best.log 2>&1 &
      #  real    44m19.505s
  
      # and, for the swap
      mkdir /hive/data/genomes/echTel2/bed/blastz.hg19.swap
      cd /hive/data/genomes/echTel2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzEchTel2.2013-06-12/DEF \
  	-swap -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
      #	real    80m29s
      cat fb.echTel2.chainHg19Link.txt
      #	852830619 bases of 2605196361 (32.736%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/echTel2/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  ##############################################################################
  # LASTZ Pika OchPri3 (DONE - 2013-06-17 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzOchPri3.2013-06-17
      cd /hive/data/genomes/hg19/bed/lastzOchPri3.2013-06-17
  
      cat << '_EOF_' > DEF
  # Human vs. Pika
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: Pika
  SEQ2_DIR=/hive/data/genomes/ochPri3/ochPri3.2bit
  SEQ2_LEN=/hive/data/genomes/ochPri3/chrom.sizes
  SEQ2_CHUNK=40000000
  SEQ2_LIMIT=30
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzOchPri3.2013-06-17
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    1057m2.114s
    # broke during loading because the ochPri3 db did not yet exist;
    # created the db, then finished the loading manually
      #  real    11m33.971s
      #  continuing:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	-continue=download `pwd`/DEF \
  	-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> download.log 2>&1 &
      #  real    27m40.806s
  
      cat fb.hg19.chainOchPri3Link.txt
      #	1004662072 bases of 2897316137 (34.676%) in intersection
  
      time doRecipBest.pl -buildDir=`pwd` hg19 ochPri3 > rbest.log 2>&1
      #	real    266m38.281s
  
      mkdir /hive/data/genomes/ochPri3/bed/blastz.hg19.swap
      cd /hive/data/genomes/ochPri3/bed/blastz.hg19.swap
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	-swap /hive/data/genomes/hg19/bed/lastzOchPri3.2013-06-17/DEF \
  	-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> swap.log 2>&1 &
      #  real    71m5.679s
  
      cat fb.ochPri3.chainHg19Link.txt
      #  969182988 bases of 1943987870 (49.855%) in intersection
  
      cd /hive/data/genomes/ochPri3/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  ##############################################################################
  # LASTZ Sheep OviAri3 (DONE - 2013-06-17 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzOviAri3.2013-06-17
      cd /hive/data/genomes/hg19/bed/lastzOviAri3.2013-06-17
  
      cat << '_EOF_' > DEF
  # Human vs. Sheep
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: Sheep
  SEQ2_DIR=/hive/data/genomes/oviAri3/oviAri3.2bit
  SEQ2_LEN=/hive/data/genomes/oviAri3/chrom.sizes
  SEQ2_CHUNK=40000000
  SEQ2_LIMIT=20
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzOviAri3.2013-06-17
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    1303m1.839s
      # finished last chaining job:
      # real    126m52.973s
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	-continue=chainMerge `pwd`/DEF \
  	-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> chainMerge.log 2>&1 &
      #  real    123m11.588s
  
      cat fb.hg19.chainOviAri3Link.txt
      #	1356890439 bases of 2897316137 (46.833%) in intersection
  
      time doRecipBest.pl -buildDir=`pwd` hg19 oviAri3 > rbest.log 2>&1 &
      # real    325m50.050s
  
      # and for the swap:
      mkdir /hive/data/genomes/oviAri3/bed/blastz.hg19.swap
      cd /hive/data/genomes/oviAri3/bed/blastz.hg19.swap
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         /hive/data/genomes/hg19/bed/lastzOviAri3.2013-06-17/DEF \
         -swap -syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \
         -bigClusterHub=swarm -chainMinScore=3000 -chainLinearGap=medium \
         > swap.log 2>&1
      #  real    107m13.938s
  
      cat fb.oviAri3.chainHg19Link.txt
      #  1316305922 bases of 2534335866 (51.939%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/oviAri3/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  ##############################################################################
  # LASTZ Shrew SorAra2 (DONE - 2013-06-17 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzSorAra2.2013-06-17
      cd /hive/data/genomes/hg19/bed/lastzSorAra2.2013-06-17
  
      cat << '_EOF_' > DEF
  # Human vs. Shrew
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: Shrew
  SEQ2_DIR=/hive/data/genomes/sorAra2/sorAra2.2bit
  SEQ2_LEN=/hive/data/genomes/sorAra2/chrom.sizes
  SEQ2_CHUNK=40000000
  SEQ2_LIMIT=40
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzSorAra2.2013-06-17
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      # real   1671m27.543s
      # stuck in chaining chr19 - running lastOne manually on hgwdev
      # real    49m31.906s
      # then continuing:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	-continue=chainMerge `pwd`/DEF \
  	-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> chainMerge.log 2>&1 &
      #   Elapsed time: 87m15s
      cat fb.hg19.chainSorAra2Link.txt
      #	802006172 bases of 2897316137 (27.681%) in intersection
  
      time doRecipBest.pl -buildDir=`pwd` hg19 sorAra2 > rbest.log 2>&1
      #  real    267m10.778s
  
      # and for the swap:
      mkdir /hive/data/genomes/sorAra2/bed/blastz.hg19.swap
      cd /hive/data/genomes/sorAra2/bed/blastz.hg19.swap
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         /hive/data/genomes/hg19/bed/lastzSorAra2.2013-06-17/DEF \
         -swap -syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \
         -bigClusterHub=swarm -chainMinScore=3000 -chainLinearGap=medium \
         > swap.log 2>&1
      #  real    77m5.017s
  
      cat fb.sorAra2.chainHg19Link.txt
      #  778513475 bases of 2192103426 (35.514%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/sorAra2/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  ##############################################################################
  # LASTZ Zebra finch TaeGut2 (DONE - 2013-06-17 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzTaeGut2.2013-06-17
      cd /hive/data/genomes/hg19/bed/lastzTaeGut2.2013-06-17
  
      cat << '_EOF_' > DEF
  # Human vs. Zebra finch
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
  # distant settings
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: Zebra finch
  SEQ2_DIR=/hive/data/genomes/taeGut2/taeGut2.2bit
  SEQ2_LEN=/hive/data/genomes/taeGut2/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=100
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzTaeGut2.2013-06-17
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #   real    811m51.660s
  
      cat fb.hg19.chainTaeGut2Link.txt
      #	154165978 bases of 2897316137 (5.321%) in intersection
      # finished download step manually to get the liftOverChain in hgcentraltest
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	-continue=cleanup `pwd`/DEF \
  	-chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> cleanup.log 2>&1 &
      #  real    5m29.892s
  
      time doRecipBest.pl -buildDir=`pwd` hg19 taeGut2 > rbest.log 2>&1
      #    real    286m17.228s
  
      # and, for the swap
      mkdir /hive/data/genomes/taeGut2/bed/blastz.hg19.swap
      cd /hive/data/genomes/taeGut2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzTaeGut2.2013-06-17/DEF \
  	-swap -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #   real    42m46.344s
  
      cat fb.taeGut2.chainHg19Link.txt
      #	142016629 bases of 1222864691 (11.613%) in intersection
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/taeGut2/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  ##############################################################################
# LASTZ Alpaca VicPac2 (DONE - 2013-06-17 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzVicPac2.2013-06-17
      cd /hive/data/genomes/hg19/bed/lastzVicPac2.2013-06-17
  
      cat << '_EOF_' > DEF
  # Human vs. Alpaca
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: Alpaca
  SEQ2_DIR=/hive/data/genomes/vicPac2/vicPac2.2bit
  SEQ2_LEN=/hive/data/genomes/vicPac2/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=800
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzVicPac2.2013-06-17
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      #	establish a screen to control this job
      screen
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> do.log 2>&1 &
      #	real    1130m42.615s
      cat fb.hg19.chainVicPac2Link.txt
      #	1454261693 bases of 2897316137 (50.193%) in intersection
    # manually finished the 'download' step after fixing bugs in
    # hgAddLiftOverChain, then continued:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	-continue=cleanup `pwd`/DEF \
  	-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	> cleanup.log 2>&1 &
      #   Elapsed time: 32m56s
  
      time doRecipBest.pl -buildDir=`pwd` hg19 vicPac2 > rbest.log 2>&1
      #	real    447m22.629s
  
      # and, for the swap
      mkdir /hive/data/genomes/vicPac2/bed/blastz.hg19.swap
      cd /hive/data/genomes/vicPac2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzVicPac2.2013-06-17/DEF \
  	-swap -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
      #	real    206m52.132s
      cat fb.vicPac2.chainHg19Link.txt
      #	1428125689 bases of 2078582856 (68.707%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/vicPac2/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  ##############################################################################
  # LASTZ White-throated sparrow ZonAlb1 (DONE - 2013-06-26 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzZonAlb1.2013-06-26
      cd /hive/data/genomes/hg19/bed/lastzZonAlb1.2013-06-26
  
      cat << '_EOF_' > DEF
  # human vs white-throated sparrow
  # distant settings for human-aves alignment
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=10000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: White-throated sparrow
  SEQ2_DIR=/hive/data/genomes/zonAlb1/zonAlb1.2bit
  SEQ2_LEN=/hive/data/genomes/zonAlb1/chrom.sizes
  SEQ2_CHUNK=100000000
  SEQ2_LIMIT=30
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzZonAlb1.2013-06-26
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
	-stop=load -qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
      #  real    100m7.452s
    #  as expected, failed in the netClass operation because no zonAlb1 db
    #  existed yet.  That's OK, the net table isn't needed loaded here;
    #  run this to get a measurement of the chainLinks:
      cd /hive/data/genomes/hg19/bed/lastzZonAlb1.2013-06-26
      featureBits hg19 chainZonAlb1Link > fb.hg19.chainZonAlb1Link.txt 2>&1
      #   100700835 bases of 2897316137 (3.476%) in intersection
  
    # after setting up the zonAlb1 database, netClass and the net track
    # load ran properly.  Can finish this off:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-continue=download -qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > download.log 2>&1
  
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
            hg19 zonAlb1 > rbest.log 2>&1 &
      #  real    100m9.728s
  
      # and, for the swap
      mkdir /hive/data/genomes/zonAlb1/bed/blastz.hg19.swap
      cd /hive/data/genomes/zonAlb1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzZonAlb1.2013-06-26/DEF \
  	-swap -qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #   real    9m27.735s
  
      cat fb.zonAlb1.chainHg19Link.txt
      #	86760815 bases of 1006303327 (8.622%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/zonAlb1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Saker falcon falChe1 (DONE - 2013-06-27 - Hiram)
      screen -S falChe1    # screen to manage this job
      mkdir /hive/data/genomes/hg19/bed/lastzFalChe1.2013-06-27
      cd /hive/data/genomes/hg19/bed/lastzFalChe1.2013-06-27
  
      cat << '_EOF_' > DEF
  # human vs Saker falcon
  # distant settings for human-aves alignment
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=10000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Saker falcon
  SEQ2_DIR=/hive/data/genomes/falChe1/falChe1.2bit
  SEQ2_LEN=/hive/data/genomes/falChe1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=50
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzFalChe1.2013-06-27
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
      #  real    523m19.990s
  
      cat fb.hg19.chainFalChe1Link.txt
      #   110443921 bases of 2897316137 (3.812%) in intersection
  
      # good to have recip best here
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
            hg19 falChe1 > rbest.log 2>&1 &
      #  real    12m28.141s
  
      # and, for the swap
      mkdir /hive/data/genomes/falChe1/bed/blastz.hg19.swap
      cd /hive/data/genomes/falChe1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzFalChe1.2013-06-27/DEF \
  	-swap -qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #   real    10m46.936s
      cat fb.falChe1.chainHg19Link.txt
      #	94752559 bases of 1150993769 (8.232%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/falChe1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Peregrine falcon falPer1 (DONE - 2013-06-27 - Hiram)
      screen -S falPer1    # screen to manage this job
      mkdir /hive/data/genomes/hg19/bed/lastzFalPer1.2013-06-27
      cd /hive/data/genomes/hg19/bed/lastzFalPer1.2013-06-27
  
      cat << '_EOF_' > DEF
  # human vs Peregrine falcon
  # distant settings for human-aves alignment
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=10000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Peregrine falcon
  SEQ2_DIR=/hive/data/genomes/falPer1/falPer1.2bit
  SEQ2_LEN=/hive/data/genomes/falPer1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=50
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzFalPer1.2013-06-27
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
      #  real    511m18.114s
  
      cat fb.hg19.chainFalPer1Link.txt
      #   112378981 bases of 2897316137 (3.879%) in intersection
  
    # forgot to have the falPer1 database ready; finished the load, then:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-continue=download -qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > download.log 2>&1 &
  
      # good to have recip best here
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
            hg19 falPer1 > rbest.log 2>&1 &
      #  about 17 minutes
  
      # and, for the swap
      mkdir /hive/data/genomes/falPer1/bed/blastz.hg19.swap
      cd /hive/data/genomes/falPer1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzFalPer1.2013-06-27/DEF \
  	-swap -qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #   real    17m25.640s
  
      cat fb.falPer1.chainHg19Link.txt
      #	96477634 bases of 1153404357 (8.365%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/falPer1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Mallard duck anaPla1 (DONE - 2013-06-27 - Hiram)
      screen -S anaPla1    # screen to manage this job
      mkdir /hive/data/genomes/hg19/bed/lastzAnaPla1.2013-06-27
      cd /hive/data/genomes/hg19/bed/lastzAnaPla1.2013-06-27
  
      cat << '_EOF_' > DEF
  # human vs Mallard duck
  # distant settings for human-aves alignment
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=10000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Mallard duck
  SEQ2_DIR=/hive/data/genomes/anaPla1/anaPla1.2bit
  SEQ2_LEN=/hive/data/genomes/anaPla1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=500
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzAnaPla1.2013-06-27
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
      #  real    584m8.015s
  
      cat fb.hg19.chainAnaPla1Link.txt
      #   101670707 bases of 2897316137 (3.509%) in intersection
  
      # good to have recip best here
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
            hg19 anaPla1 > rbest.log 2>&1 &
      #  real    12m35.684s
  
      # and, for the swap
      mkdir /hive/data/genomes/anaPla1/bed/blastz.hg19.swap
      cd /hive/data/genomes/anaPla1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzAnaPla1.2013-06-27/DEF \
  	-swap -qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #  real    14m11.895s
  
      cat fb.anaPla1.chainHg19Link.txt
      #	87894467 bases of 1069972754 (8.215%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/anaPla1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Parrot amaVit1 (DONE - 2013-06-27 - Hiram)
      screen -S amaVit1    # screen to manage this job
      mkdir /hive/data/genomes/hg19/bed/lastzAmaVit1.2013-06-27
      cd /hive/data/genomes/hg19/bed/lastzAmaVit1.2013-06-27
  
      cat << '_EOF_' > DEF
  # human vs Parrot
  # distant settings for human-aves alignment
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=10000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Parrot
  SEQ2_DIR=/hive/data/genomes/amaVit1/amaVit1.2bit
  SEQ2_LEN=/hive/data/genomes/amaVit1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=600
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzAmaVit1.2013-06-27
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
      #  real    710m51.907s
  
      cat fb.hg19.chainAmaVit1Link.txt
      #   95543127 bases of 2897316137 (3.298%) in intersection
  
      # good to have recip best here
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
            hg19 amaVit1 > rbest.log 2>&1 &
      #  about 21 minutes
  
      # and, for the swap
      mkdir /hive/data/genomes/amaVit1/bed/blastz.hg19.swap
      cd /hive/data/genomes/amaVit1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzAmaVit1.2013-06-27/DEF \
  	-swap -qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #  real    74m7.469s
  
      cat fb.amaVit1.chainHg19Link.txt
      #	88086731 bases of 1128255752 (7.807%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/amaVit1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Budgerigar melUnd1 (DONE - 2013-06-27 - Hiram)
      screen -S melUnd1    # screen to manage this job
      mkdir /hive/data/genomes/hg19/bed/lastzMelUnd1.2013-06-27
      cd /hive/data/genomes/hg19/bed/lastzMelUnd1.2013-06-27
  
      cat << '_EOF_' > DEF
  # human vs Budgerigar
  # distant settings for human-aves alignment
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=10000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Budgerigar
  SEQ2_DIR=/hive/data/genomes/melUnd1/melUnd1.2bit
  SEQ2_LEN=/hive/data/genomes/melUnd1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=200
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzMelUnd1.2013-06-27
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
      # forgot to capture the output in do.log
      #  real    539m3.747s
  
      cat fb.hg19.chainMelUnd1Link.txt
      #   103624916 bases of 2897316137 (3.577%) in intersection
  
      # good to have recip best here
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
            hg19 melUnd1 > rbest.log 2>&1 &
      # real    11m14.969s
  
      # and, for the swap
      mkdir /hive/data/genomes/melUnd1/bed/blastz.hg19.swap
      cd /hive/data/genomes/melUnd1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzMelUnd1.2013-06-27/DEF \
  	-swap -qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #   real    12m1.732s
  
      cat fb.melUnd1.chainHg19Link.txt
      #	89509004 bases of 1086614815 (8.237%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/melUnd1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
# LASTZ alligator allMis1 (DONE - 2013-06-28 - Hiram)
      # establish a screen to control this job with a name to indicate what it is
      screen -S hg19AllMis1
      mkdir /hive/data/genomes/hg19/bed/lastzAllMis1.2013-06-28
      cd /hive/data/genomes/hg19/bed/lastzAllMis1.2013-06-28
  
      cat << '_EOF_' > DEF
  # Human vs. alligator
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: alligator allMis1
  SEQ2_DIR=/hive/data/genomes/allMis1/allMis1.2bit
  SEQ2_LEN=/hive/data/genomes/allMis1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=30
  
  BASE=/hive/data/genomes/hg19/bed/lastzAllMis1.2013-06-28
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
    # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
    #	number of jobs: 50,000 to something under 100,000
    # when not specified, SEQ2_LIMIT defaults to 100
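    # e.g., a sketch of that check (the jobList location is assumed from
    # the usual doBlastzChainNet.pl working layout):
    #   doBlastzChainNet.pl -stop=partition `pwd`/DEF > partition.log 2>&1
    #   wc -l run.blastz/jobList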
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
      #	Elapsed time: 234m48s
  
      cat fb.hg19.chainAllMis1Link.txt
      #	205244552 bases of 2897316137 (7.084%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/hg19/bed
      ln -s lastzAllMis1.2013-06-28 lastz.allMis1
  
      # good to have recip best here
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
            hg19 allMis1 > rbest.log 2>&1 &
      #  real    12m4.813s
  
      #	and for the swap
      mkdir /hive/data/genomes/allMis1/bed/blastz.hg19.swap
      cd /hive/data/genomes/allMis1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzAllMis1.2013-06-28/DEF \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #   forgot to get stderr in the swap.log
      #	real    27m54.364s
  
      cat  fb.allMis1.chainHg19Link.txt
      #	172919308 bases of 2129659933 (8.120%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/allMis1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ rock pigeon colLiv1 (DONE - 2013-06-29 - Hiram)
      screen -S colLiv1    # screen to manage this job
      mkdir /hive/data/genomes/hg19/bed/lastzColLiv1.2013-06-29
      cd /hive/data/genomes/hg19/bed/lastzColLiv1.2013-06-29
  
      cat << '_EOF_' > DEF
  # human vs rock pigeon
  # distant settings for human-aves alignment
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=10000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: rock pigeon
  SEQ2_DIR=/hive/data/genomes/colLiv1/colLiv1.2bit
  SEQ2_LEN=/hive/data/genomes/colLiv1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=100
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzColLiv1.2013-06-29
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
      #  real    256m49.711s
  
      cat fb.hg19.chainColLiv1Link.txt
      #   106693699 bases of 2897316137 (3.683%) in intersection
  
      # good to have recip best here
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
            hg19 colLiv1 > rbest.log 2>&1 &
      #   real    6m40.777s
  
      # and, for the swap
      mkdir /hive/data/genomes/colLiv1/bed/blastz.hg19.swap
      cd /hive/data/genomes/colLiv1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzColLiv1.2013-06-29/DEF \
  	-swap -qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #   real    10m6.608s
  
      cat fb.colLiv1.chainHg19Link.txt
      #	92036020 bases of 1086925875 (8.468%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/colLiv1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
# LASTZ collared flycatcher ficAlb2 (DONE - 2013-06-29 - Hiram)
      screen -S ficAlb2    # screen to manage this job
      mkdir /hive/data/genomes/hg19/bed/lastzFicAlb2.2013-06-29
      cd /hive/data/genomes/hg19/bed/lastzFicAlb2.2013-06-29
  
      cat << '_EOF_' > DEF
# human vs collared flycatcher
  # distant settings for human-aves alignment
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=10000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
# QUERY: collared flycatcher
  SEQ2_DIR=/hive/data/genomes/ficAlb2/ficAlb2.2bit
  SEQ2_LEN=/hive/data/genomes/ficAlb2/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=150
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzFicAlb2.2013-06-29
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
      #  real    324m27.608s
  
      cat fb.hg19.chainFicAlb2Link.txt
      #   107639589 bases of 2897316137 (3.715%) in intersection
  
      # good to have recip best here
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
            hg19 ficAlb2 > rbest.log 2>&1 &
      #   real    12m23.760s
  
      # and, for the swap
      mkdir /hive/data/genomes/ficAlb2/bed/blastz.hg19.swap
      cd /hive/data/genomes/ficAlb2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzFicAlb2.2013-06-29/DEF \
  	-swap -qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #   real    11m28.655s
  
      cat fb.ficAlb2.chainHg19Link.txt
      #	95070548 bases of 1102325870 (8.625%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/ficAlb2/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ scarlet macaw araMac1 (DONE - 2013-06-29 - Hiram)
      screen -S araMac1    # screen to manage this job
      mkdir /hive/data/genomes/hg19/bed/lastzAraMac1.2013-06-29
      cd /hive/data/genomes/hg19/bed/lastzAraMac1.2013-06-29
  
      cat << '_EOF_' > DEF
  # human vs scarlet macaw
  # distant settings for human-aves alignment
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=10000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: scarlet macaw
  SEQ2_DIR=/hive/data/genomes/araMac1/araMac1.2bit
  SEQ2_LEN=/hive/data/genomes/araMac1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=800
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzAraMac1.2013-06-29
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
      #  real    518m3.915s
  
      cat fb.hg19.chainAraMac1Link.txt
      #   84373956 bases of 2897316137 (2.912%) in intersection
  
      # good to have recip best here
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
            hg19 araMac1 > rbest.log 2>&1 &
      # real    8m57.897s
  
      # and, for the swap
      mkdir /hive/data/genomes/araMac1/bed/blastz.hg19.swap
      cd /hive/data/genomes/araMac1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzAraMac1.2013-06-29/DEF \
  	-swap -qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #   Elapsed time: 45m27s
  
      cat fb.araMac1.chainHg19Link.txt
      #	73025327 bases of 997636166 (7.320%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/araMac1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ chicken galGal4 (DONE - 2013-06-29 - Hiram)
      screen -S galGal4    # screen to manage this job
      mkdir /hive/data/genomes/hg19/bed/lastzGalGal4.2013-06-29
      cd /hive/data/genomes/hg19/bed/lastzGalGal4.2013-06-29
  
      cat << '_EOF_' > DEF
  # human vs chicken
  # distant settings for human-aves alignment
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=10000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: chicken
  SEQ2_DIR=/hive/data/genomes/galGal4/galGal4.2bit
  SEQ2_LEN=/hive/data/genomes/galGal4/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=100
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzGalGal4.2013-06-29
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
      #  real    170m30.547s
  
      cat fb.hg19.chainGalGal4Link.txt
      #   106728507 bases of 2897316137 (3.684%) in intersection
  
      # good to have recip best here
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
            hg19 galGal4 > rbest.log 2>&1 &
      #   real    13m39.680s
  
      # and, for the swap
      mkdir /hive/data/genomes/galGal4/bed/blastz.hg19.swap
      cd /hive/data/genomes/galGal4/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzGalGal4.2013-06-29/DEF \
  	-swap -qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #  real    12m22.704s
  
      cat fb.galGal4.chainHg19Link.txt
      #	92856222 bases of 1032854810 (8.990%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/galGal4/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
# LASTZ Tibetan ground jay pseHum1 (DONE - 2013-06-29 - Hiram)
      screen -S pseHum1    # screen to manage this job
      mkdir /hive/data/genomes/hg19/bed/lastzPseHum1.2013-06-29
      cd /hive/data/genomes/hg19/bed/lastzPseHum1.2013-06-29
  
      cat << '_EOF_' > DEF
# human vs Tibetan ground jay
  # distant settings for human-aves alignment
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=10000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
# QUERY: Tibetan ground jay
  SEQ2_DIR=/hive/data/genomes/pseHum1/pseHum1.2bit
  SEQ2_LEN=/hive/data/genomes/pseHum1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=40
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzPseHum1.2013-06-29
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	`pwd`/DEF \
  	-qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
      #  real    275m15.275s
  
      cat fb.hg19.chainPseHum1Link.txt
      #   115368965 bases of 2897316137 (3.982%) in intersection
  
      # good to have recip best here
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
            hg19 pseHum1 > rbest.log 2>&1 &
      #  real    8m31.432s
  
      # and, for the swap
      mkdir /hive/data/genomes/pseHum1/bed/blastz.hg19.swap
      cd /hive/data/genomes/pseHum1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzPseHum1.2013-06-29/DEF \
  	-swap -qRepeats=windowmaskerSdust -syntenicNet \
  	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
  	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #   real    10m5.748s
  
      cat fb.pseHum1.chainHg19Link.txt
      #	99760063 bases of 1030030436 (9.685%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/pseHum1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # Green monkey/chlSab1 Lastz run (DONE - 2013-06-30 - Hiram)
      screen -S chlSab1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzChlSab1.2013-06-30
      cd /hive/data/genomes/hg19/bed/lastzChlSab1.2013-06-30
      cat << '_EOF_' > DEF
  # human vs green monkey
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  BLASTZ_O=600
  BLASTZ_E=150
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: green monkey ChlSab1
  SEQ2_DIR=/hive/data/genomes/chlSab1/chlSab1.2bit
  SEQ2_LEN=/hive/data/genomes/chlSab1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=200
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzChlSab1.2013-06-30
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #  real    211m59.612s
  
      cat fb.hg19.chainChlSab1Link.txt
      #   2504623210 bases of 2897316137 (86.446%) in intersection
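
      # (sanity check) the percentage is simply the two numbers divided:
      awk 'BEGIN {printf "%.3f%%\n", 100*2504623210/2897316137}'
      #   86.446%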
  
      # filter with doRecipBest.pl
      doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 chlSab1 > rbest.log 2>&1 &
      # about 50 minutes
  
      # running the swap
      mkdir /hive/data/genomes/chlSab1/bed/blastz.hg19.swap
      cd /hive/data/genomes/chlSab1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzChlSab1.2013-06-30/DEF \
          -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #  real    84m22.132s
  
      cat fb.chlSab1.chainHg19Link.txt
      #  2425423103 bases of 2752019208 (88.132%) in intersection
  
  #########################################################################
  # crab-eating macaque/macFas5 Lastz run (DONE - 2013-06-30 - Hiram)
      screen -S macFas5    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzMacFas5.2013-06-30
      cd /hive/data/genomes/hg19/bed/lastzMacFas5.2013-06-30
      cat << '_EOF_' > DEF
  # human vs crab-eating macaque
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  BLASTZ_O=600
  BLASTZ_E=150
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: crab-eating macaque MacFas5
  SEQ2_DIR=/hive/data/genomes/macFas5/macFas5.2bit
  SEQ2_LEN=/hive/data/genomes/macFas5/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=200
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzMacFas5.2013-06-30
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #  real    390m17.113s
  
      cat fb.hg19.chainMacFas5Link.txt
      #   2510997453 bases of 2897316137 (86.666%) in intersection
  
      cd /hive/data/genomes/hg19/bed
      ln -s lastzMacFas5.2013-06-30 lastz.macFas5
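      # quick check that the link resolves to this run:
      ls -l /hive/data/genomes/hg19/bed/lastz.macFas5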
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 macFas5 > rbest.log 2>&1 &
      # real    57m59.876s
  
      # running the swap
      mkdir /hive/data/genomes/macFas5/bed/blastz.hg19.swap
      cd /hive/data/genomes/macFas5/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzMacFas5.2013-06-30/DEF \
          -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      # real    83m44.043s
  
      cat fb.macFas5.chainHg19Link.txt
      #  2432538960 bases of 2803866698 (86.757%) in intersection
  
      cd /hive/data/genomes/macFas5/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # squirrel monkey/saiBol1 Lastz run (DONE - 2013-06-30 - Hiram)
      screen -S saiBol1    # use screen to manage this long running job
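      # (note) detach with ctrl-a d; reattach later with: screen -r saiBol1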
      mkdir /hive/data/genomes/hg19/bed/lastzSaiBol1.2013-06-30
      cd /hive/data/genomes/hg19/bed/lastzSaiBol1.2013-06-30
      cat << '_EOF_' > DEF
  # human vs squirrel monkey
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  BLASTZ_O=600
  BLASTZ_E=150
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: squirrel monkey SaiBol1
  SEQ2_DIR=/hive/data/genomes/saiBol1/saiBol1.2bit
  SEQ2_LEN=/hive/data/genomes/saiBol1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=200
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzSaiBol1.2013-06-30
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #  real    303m20.294s
  
      cat fb.hg19.chainSaiBol1Link.txt
      #   2016474757 bases of 2897316137 (69.598%) in intersection
  
      cd /hive/data/genomes/hg19/bed
      ln -s lastzSaiBol1.2013-06-30 lastz.saiBol1
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 saiBol1 > rbest.log 2>&1 &
      # about 50 minutes
  
      # running the swap
      mkdir /hive/data/genomes/saiBol1/bed/blastz.hg19.swap
      cd /hive/data/genomes/saiBol1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzSaiBol1.2013-06-30/DEF \
          -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #  real    73m49.157s
  
      cat fb.saiBol1.chainHg19Link.txt
      #  1952822096 bases of 2477131095 (78.834%) in intersection
  
      cd /hive/data/genomes/saiBol1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # bushbaby/otoGar3 Lastz run (DONE - 2013-07-01 - Hiram)
      screen -S otoGar3    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzOtoGar3.2013-07-01
      cd /hive/data/genomes/hg19/bed/lastzOtoGar3.2013-07-01
      cat << '_EOF_' > DEF
  # human vs bushbaby
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: bushbaby OtoGar3
  SEQ2_DIR=/hive/data/genomes/otoGar3/otoGar3.2bit
  SEQ2_LEN=/hive/data/genomes/otoGar3/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=200
  
  BASE=/hive/data/genomes/hg19/bed/lastzOtoGar3.2013-07-01
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #  real    395m56.229s
      #  chr19 chaining failed; finished it manually on hgwdev, then continued:
      #  real    158m41.105s
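      #  (sketch - the exact command wasn't recorded; by analogy with the
      #   oryAfe1 run below it was the failed per-chromosome chain job,
      #   run by hand from the axtChain run directory:)
      #       time ./chain.csh hg19.2bit:chr19: chain/hg19.2bit:chr19:.chain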
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -continue=chainMerge -syntenicNet > chainMerge.log 2>&1
      #   real    128m1.356s
  
      cat fb.hg19.chainOtoGar3Link.txt
      #   1588586980 bases of 2897316137 (54.830%) in intersection
  
      # filter with doRecipBest.pl
      doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 otoGar3 > rbest.log 2>&1 &
      # about 1 hour
  
      # running the swap
      mkdir /hive/data/genomes/otoGar3/bed/blastz.hg19.swap
      cd /hive/data/genomes/otoGar3/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzOtoGar3.2013-07-01/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #  real    123m31.066s
  
      cat fb.otoGar3.chainHg19Link.txt
      #  1534315855 bases of 2359530453 (65.026%) in intersection
  
      cd /hive/data/genomes/otoGar3/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # Chinese tree shrew/tupChi1 Lastz run (DONE - 2013-07-05 - Hiram)
      screen -S tupChi1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzTupChi1.2013-07-05
      cd /hive/data/genomes/hg19/bed/lastzTupChi1.2013-07-05
      cat << '_EOF_' > DEF
  # human vs Chinese tree shrew
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Chinese tree shrew TupChi1
  SEQ2_DIR=/hive/data/genomes/tupChi1/tupChi1.2bit
  SEQ2_LEN=/hive/data/genomes/tupChi1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=300
  
  BASE=/hive/data/genomes/hg19/bed/lastzTupChi1.2013-07-05
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #  real    1334m42.001s
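
      # confirm the script ran to completion before reading the fb numbers:
      tail -2 do.log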
  
      cat fb.hg19.chainTupChi1Link.txt
      #   1348933101 bases of 2897316137 (46.558%) in intersection
  
      # filter with doRecipBest.pl
      doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 tupChi1 > rbest.log 2>&1 &
      # about 53 minutes
  
      # running the swap
      mkdir /hive/data/genomes/tupChi1/bed/blastz.hg19.swap
      cd /hive/data/genomes/tupChi1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzTupChi1.2013-07-05/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #  real    118m8.921s
  
      cat fb.tupChi1.chainHg19Link.txt
      #  1360938926 bases of 2706389135 (50.286%) in intersection
  
      cd /hive/data/genomes/tupChi1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # brush-tailed rat/octDeg1 Lastz run (DONE - 2013-07-05 - Hiram)
      screen -S octDeg1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzOctDeg1.2013-07-05
      cd /hive/data/genomes/hg19/bed/lastzOctDeg1.2013-07-05
      cat << '_EOF_' > DEF
  # human vs brush-tailed rat
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: brush-tailed rat OctDeg1
  SEQ2_DIR=/hive/data/genomes/octDeg1/octDeg1.2bit
  SEQ2_LEN=/hive/data/genomes/octDeg1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=60
  
  BASE=/hive/data/genomes/hg19/bed/lastzOctDeg1.2013-07-05
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
  	-qRepeats=windowmaskerSdust \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    781m31.547s
      # chaining failed on chr19; finished manually on hgwdev
      #   real    156m20.850s
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
  	-qRepeats=windowmaskerSdust \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -continue=chainMerge -syntenicNet > chainMerge.log 2>&1
      #   real    101m24.482s
  
      cat fb.hg19.chainOctDeg1Link.txt
      #   1199059384 bases of 2897316137 (41.385%) in intersection
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 octDeg1 > rbest.log 2>&1 &
      # about 47 minutes
  
      # running the swap
      mkdir /hive/data/genomes/octDeg1/bed/blastz.hg19.swap
      cd /hive/data/genomes/octDeg1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzOctDeg1.2013-07-05/DEF \
  	-qRepeats=windowmaskerSdust \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #  real    105m14.455s
  
      cat fb.octDeg1.chainHg19Link.txt
      #  1202975420 bases of 2526254702 (47.619%) in intersection
  
      cd /hive/data/genomes/octDeg1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # aardvark/oryAfe1 Lastz run (DONE - 2013-07-07 - Hiram)
      screen -S oryAfe1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzOryAfe1.2013-07-07
      cd /hive/data/genomes/hg19/bed/lastzOryAfe1.2013-07-07
      cat << '_EOF_' > DEF
  # human vs aardvark
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: aardvark OryAfe1
  SEQ2_DIR=/hive/data/genomes/oryAfe1/oryAfe1.2bit
  SEQ2_LEN=/hive/data/genomes/oryAfe1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=400
  
  BASE=/hive/data/genomes/hg19/bed/lastzOryAfe1.2013-07-07
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    1825m38.733s
      # finish chr19 chain manually on hgwdev:
      time ./chain.csh hg19.2bit:chr19: chain/hg19.2bit:chr19:.chain
      #   real    165m17.851s
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -continue=chainMerge -syntenicNet > chainMerge.log 2>&1
      #   real    121m35.148s
  
      cat fb.hg19.chainOryAfe1Link.txt
      #   1207247367 bases of 2897316137 (41.668%) in intersection
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 oryAfe1 > rbest.log 2>&1 &
      #   real    60m56.697s
  
      # running the swap
      mkdir /hive/data/genomes/oryAfe1/bed/blastz.hg19.swap
      cd /hive/data/genomes/oryAfe1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzOryAfe1.2013-07-07/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #  real    108m54.136s
  
      cat fb.oryAfe1.chainHg19Link.txt
      #  1162956488 bases of 3415340621 (34.051%) in intersection
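
      # (sketch) the swap loads chainHg19/chainHg19Link tables into the
      # oryAfe1 database; table names assumed from the usual convention:
      hgsql -e 'select count(*) from chainHg19;' oryAfe1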
  
      cd /hive/data/genomes/oryAfe1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # cape golden mole/chrAsi1 Lastz run (DONE - 2013-07-07 - Hiram)
      screen -S chrAsi1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzChrAsi1.2013-07-07
      cd /hive/data/genomes/hg19/bed/lastzChrAsi1.2013-07-07
      cat << '_EOF_' > DEF
  # human vs cape golden mole
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: cape golden mole ChrAsi1
  SEQ2_DIR=/hive/data/genomes/chrAsi1/chrAsi1.2bit
  SEQ2_LEN=/hive/data/genomes/chrAsi1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=400
  
  BASE=/hive/data/genomes/hg19/bed/lastzChrAsi1.2013-07-07
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    1869m30.259s
      # finish chr19 chain manually on hgwdev:
      time ./chain.csh hg19.2bit:chr19: chain/hg19.2bit:chr19:.chain
      #   real    185m8.939s
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -continue=chainMerge -syntenicNet > chainMerge.log 2>&1
      #   real    113m20.012s
  
      cat fb.hg19.chainChrAsi1Link.txt
      #   991149670 bases of 2897316137 (34.209%) in intersection
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 chrAsi1 > rbest.log 2>&1 &
      #    real    54m43.373s
  
      # running the swap
      mkdir /hive/data/genomes/chrAsi1/bed/blastz.hg19.swap
      cd /hive/data/genomes/chrAsi1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzChrAsi1.2013-07-07/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #  real    104m51.152s
  
      cat fb.chrAsi1.chainHg19Link.txt
      #  976558929 bases of 3363564316 (29.033%) in intersection
  
      cd /hive/data/genomes/chrAsi1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # cape elephant shrew/eleEdw1 Lastz run (DONE - 2013-07-07 - Hiram)
      screen -S eleEdw1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzEleEdw1.2013-07-07
      cd /hive/data/genomes/hg19/bed/lastzEleEdw1.2013-07-07
      cat << '_EOF_' > DEF
  # human vs cape elephant shrew
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: cape elephant shrew EleEdw1
  SEQ2_DIR=/hive/data/genomes/eleEdw1/eleEdw1.2bit
  SEQ2_LEN=/hive/data/genomes/eleEdw1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=200
  
  BASE=/hive/data/genomes/hg19/bed/lastzEleEdw1.2013-07-07
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -qRepeats=windowmaskerSdust -syntenicNet > do.log 2>&1
      #   real    2909m40.585s
      # spent a very long time chaining on encodek
  
      cat fb.hg19.chainEleEdw1Link.txt
      #   801844008 bases of 2897316137 (27.675%) in intersection
  
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 eleEdw1 > rbest.log 2>&1 &
      #   real    48m43.836s
  
      # running the swap
      mkdir /hive/data/genomes/eleEdw1/bed/blastz.hg19.swap
      cd /hive/data/genomes/eleEdw1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzEleEdw1.2013-07-07/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -qRepeats=windowmaskerSdust -syntenicNet > swap.log 2>&1
      #  real    82m15.401s
  
      cat fb.eleEdw1.chainHg19Link.txt
      #  803183596 bases of 3315871847 (24.222%) in intersection
  
      cd /hive/data/genomes/eleEdw1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # armadillo/dasNov3 Lastz run (DONE - 2013-07-07 - Hiram)
      screen -S dasNov3    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzDasNov3.2013-07-07
      cd /hive/data/genomes/hg19/bed/lastzDasNov3.2013-07-07
      cat << '_EOF_' > DEF
  # human vs armadillo
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: armadillo DasNov3
  SEQ2_DIR=/hive/data/genomes/dasNov3/dasNov3.2bit
  SEQ2_LEN=/hive/data/genomes/dasNov3/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=300
  
  BASE=/hive/data/genomes/hg19/bed/lastzDasNov3.2013-07-07
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    1779m53.947s
      # finish chr19 chain manually on hgwdev:
      time ./chain.csh hg19.2bit:chr19: chain/hg19.2bit:chr19:.chain
      #   real    266m24.746s
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -continue=chainMerge -syntenicNet > chainMerge.log 2>&1
      #   real    133m23.375s
  
      cat fb.hg19.chainDasNov3Link.txt
      #   1349317168 bases of 2897316137 (46.571%) in intersection
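
      # (note) featureBits' denominator is the non-gap genome size, not
      # the chrom.sizes total, which still includes the gap N's:
      awk '{sum += $2} END {print sum}' /scratch/data/hg19/chrom.sizes
      #   3137161264 (vs the 2897316137 non-N bases above)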
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 dasNov3 > rbest.log 2>&1 &
      # real    62m32.248s
  
      # running the swap
      mkdir /hive/data/genomes/dasNov3/bed/blastz.hg19.swap
      cd /hive/data/genomes/dasNov3/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzDasNov3.2013-07-07/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #  real    143m3.205s
  
      cat fb.dasNov3.chainHg19Link.txt
      #  1382137077 bases of 3299882059 (41.884%) in intersection
  
      cd /hive/data/genomes/dasNov3/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # hedgehog/eriEur2 Lastz run (DONE - 2013-07-08 - Hiram)
      screen -S eriEur2    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzEriEur2.2013-07-08
      cd /hive/data/genomes/hg19/bed/lastzEriEur2.2013-07-08
      cat << '_EOF_' > DEF
  # human vs hedgehog
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: hedgehog EriEur2
  SEQ2_DIR=/hive/data/genomes/eriEur2/eriEur2.2bit
  SEQ2_LEN=/hive/data/genomes/eriEur2/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=30
  
  BASE=/hive/data/genomes/hg19/bed/lastzEriEur2.2013-07-08
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    825m44.398s
  
      cat fb.hg19.chainEriEur2Link.txt
      #   757625719 bases of 2897316137 (26.149%) in intersection
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 eriEur2 > rbest.log 2>&1 &
      # real     58m0.319s
  
      # running the swap
      mkdir /hive/data/genomes/eriEur2/bed/blastz.hg19.swap
      cd /hive/data/genomes/eriEur2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzEriEur2.2013-07-08/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #  real    98m18.554s
  
      cat fb.eriEur2.chainHg19Link.txt
      #  729081383 bases of 2333073535 (31.250%) in intersection
  
      cd /hive/data/genomes/eriEur2/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # Tasmanian devil/sarHar1 Lastz run (DONE - 2013-07-09 - Hiram)
      screen -S sarHar1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzSarHar1.2013-07-09
      cd /hive/data/genomes/hg19/bed/lastzSarHar1.2013-07-09
      cat << '_EOF_' > DEF
  # human vs Tasmanian devil
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Tasmanian devil SarHar1
  SEQ2_DIR=/hive/data/genomes/sarHar1/sarHar1.2bit
  SEQ2_LEN=/hive/data/genomes/sarHar1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=250
  
  BASE=/hive/data/genomes/hg19/bed/lastzSarHar1.2013-07-09
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      # distant (marsupial) comparison: stiffer minScore and the 'loose'
      # (chicken/human-distance) linear gap costs:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    1605m41.445s
  
      cat fb.hg19.chainSarHar1Link.txt
      #   214795886 bases of 2897316137 (7.414%) in intersection
  
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 sarHar1 > rbest.log 2>&1 &
      #   real    13m23.565s
  
      # running the swap
      mkdir /hive/data/genomes/sarHar1/bed/blastz.hg19.swap
      cd /hive/data/genomes/sarHar1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzSarHar1.2013-07-09/DEF \
          -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #   real    35m56.282s
  
      cat fb.sarHar1.chainHg19Link.txt
      #    203364336 bases of 2931539702 (6.937%) in intersection
  
      cd /hive/data/genomes/sarHar1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # wallaby/macEug2 Lastz run (DONE - 2013-07-09 - Hiram)
      screen -S macEug2    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzMacEug2.2013-07-09
      cd /hive/data/genomes/hg19/bed/lastzMacEug2.2013-07-09
      cat << '_EOF_' > DEF
  # human vs wallaby
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: wallaby MacEug2
  SEQ2_DIR=/hive/data/genomes/macEug2/macEug2.2bit
  SEQ2_LEN=/hive/data/genomes/macEug2/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=1000
  
  BASE=/hive/data/genomes/hg19/bed/lastzMacEug2.2013-07-09
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    2464m16.805s
      #   chaining chr19 on hgwdev
      #   real    158m29.798s
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -continue=chainMerge -syntenicNet > chainMerge.log 2>&1
      #   real    56m29.109s
  
      cat fb.hg19.chainMacEug2Link.txt
      #   189299569 bases of 2897316137 (6.534%) in intersection
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 macEug2 > rbest.log 2>&1 &
      #   real    18m25.454s
  
      # running the swap
      mkdir /hive/data/genomes/macEug2/bed/blastz.hg19.swap
      cd /hive/data/genomes/macEug2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzMacEug2.2013-07-09/DEF \
          -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #  real    176m21.822s
  
      cat fb.macEug2.chainHg19Link.txt
      #  183339665 bases of 2536076957 (7.229%) in intersection
  
      cd /hive/data/genomes/macEug2/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # manatee/triMan1 Lastz run (DONE - 2013-07-09 - Hiram)
      screen -S triMan1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzTriMan1.2013-07-09
      cd /hive/data/genomes/hg19/bed/lastzTriMan1.2013-07-09
      cat << '_EOF_' > DEF
  # human vs manatee
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: manatee TriMan1
  SEQ2_DIR=/hive/data/genomes/triMan1/triMan1.2bit
  SEQ2_LEN=/hive/data/genomes/triMan1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=50
  
  BASE=/hive/data/genomes/hg19/bed/lastzTriMan1.2013-07-09
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    1479m33.445s
      #   chaining chr19 finished manually - running Wed Jul 10 14:41:50 PDT 2013
      #   real    278m44.449s
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -continue=chainMerge -syntenicNet > chainMerge.log 2>&1
      #   real    135m36.963s
  
      cat fb.hg19.chainTriMan1Link.txt
      #   1349583595 bases of 2897316137 (46.580%) in intersection
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 triMan1 > rbest.log 2>&1 &
      #   real    54m56.531s
  
      # running the swap
      mkdir /hive/data/genomes/triMan1/bed/blastz.hg19.swap
      cd /hive/data/genomes/triMan1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzTriMan1.2013-07-09/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #  real    130m17.024s
  
      cat fb.triMan1.chainHg19Link.txt
      #  1299051630 bases of 2769099677 (46.912%) in intersection
  
      cd /hive/data/genomes/triMan1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # star-nosed mole/conCri1 Lastz run (DONE - 2013-07-09 - Hiram)
      screen -S conCri1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzConCri1.2013-07-09
      cd /hive/data/genomes/hg19/bed/lastzConCri1.2013-07-09
      cat << '_EOF_' > DEF
  # human vs star-nosed mole
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: star-nosed mole ConCri1
  SEQ2_DIR=/hive/data/genomes/conCri1/conCri1.2bit
  SEQ2_LEN=/hive/data/genomes/conCri1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=10
  
  BASE=/hive/data/genomes/hg19/bed/lastzConCri1.2013-07-09
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    1470m34.378s
  
      cat fb.hg19.chainConCri1Link.txt
      #   1072983049 bases of 2897316137 (37.034%) in intersection
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 conCri1 > rbest.log 2>&1 &
      #   real    56m41.559s
  
      # running the swap
      mkdir /hive/data/genomes/conCri1/bed/blastz.hg19.swap
      cd /hive/data/genomes/conCri1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzConCri1.2013-07-09/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #  real    93m49.833s
  
      cat fb.conCri1.chainHg19Link.txt
      #  1026037326 bases of 1682542007 (60.981%) in intersection
  
      cd /hive/data/genomes/conCri1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # weddell seal/lepWed1 Lastz run (DONE - 2013-07-11 - Hiram)
      screen -S lepWed1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzLepWed1.2013-07-11
      cd /hive/data/genomes/hg19/bed/lastzLepWed1.2013-07-11
      cat << '_EOF_' > DEF
  # human vs weddell seal
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: weddell seal LepWed1
  SEQ2_DIR=/hive/data/genomes/lepWed1/lepWed1.2bit
  SEQ2_LEN=/hive/data/genomes/lepWed1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=100
  
  BASE=/hive/data/genomes/hg19/bed/lastzLepWed1.2013-07-11
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    1195m29.817s
  
      cat fb.hg19.chainLepWed1Link.txt
      #   1518895407 bases of 2897316137 (52.424%) in intersection
  
      # set symlink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzLepWed1.2013-07-11 lastz.lepWed1
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 lepWed1 > rbest.log 2>&1 &
      #   real    59m0.523s
  
      # running the swap
      mkdir /hive/data/genomes/lepWed1/bed/blastz.hg19.swap
      cd /hive/data/genomes/lepWed1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzLepWed1.2013-07-11/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #  real    100m46.082s
  
      cat fb.lepWed1.chainHg19Link.txt
      #  1457672837 bases of 2223164129 (65.567%) in intersection
  
      cd /hive/data/genomes/lepWed1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Cat felCat5 (DONE - 2014-03-12 - PAULINE)
      mkdir /hive/data/genomes/hg19/bed/lastzfelCat5.2014-02-26
      cd /hive/data/genomes/hg19/bed/lastzfelCat5.2014-02-26
  
      screen -S hg19felCat5
      cat << '_EOF_' > DEF
  # human vs Cat
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Cat felCat5
  SEQ2_DIR=/hive/data/genomes/felCat5/felCat5.2bit
  SEQ2_LEN=/hive/data/genomes/felCat5/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzfelCat5.2014-02-26
  TMPDIR=/dev/shm
  '_EOF_'
      # << happy emacs
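
      # this run pins a specific lastz build; record its version in the log:
      /cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz --version
      #   (expect it to report 1.03.52)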
  
      bash
      doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -syntenicNet \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -chainMinScore=3000 -chainLinearGap=medium >& do.log
      # Elapsed time: 1396m22s - busy cluster
      cat fb.hg19.chainfelCat5Link.txt
      # 1519102826 bases of 2897316137 (52.431%) in intersection
  
      #   running the swap
      mkdir /hive/data/genomes/felCat5/bed/blastz.hg19.swap
      cd /hive/data/genomes/felCat5/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzfelCat5.2014-02-26/DEF \
          -syntenicNet -swap \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -chainMinScore=3000 -chainLinearGap=medium >& swap.log
  
      cat fb.felCat5.chainHg19Link.txt
      # 1453639480 bases of 2364296207 (61.483%) in intersection
  
      cd /hive/data/genomes/felCat5/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # ferret/musFur1 Lastz run (DONE - 2013-07-11 - Hiram)
      screen -S musFur1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzMusFur1.2013-07-11
      cd /hive/data/genomes/hg19/bed/lastzMusFur1.2013-07-11
      cat << '_EOF_' > DEF
  # human vs ferret
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: ferret MusFur1
  SEQ2_DIR=/hive/data/genomes/musFur1/musFur1.2bit
  SEQ2_LEN=/hive/data/genomes/musFur1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=50
  
  BASE=/hive/data/genomes/hg19/bed/lastzMusFur1.2013-07-11
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    1157m55.155s
  
      cat fb.hg19.chainMusFur1Link.txt
      #   1477435563 bases of 2897316137 (50.993%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzMusFur1.2013-07-11 lastz.musFur1
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 musFur1 > rbest.log 2>&1 &
      #   real    71m34.431s
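
      # doRecipBest.pl leaves reciprocal-best chains/nets under axtChain/
      # in the build directory (file names assumed from convention):
      ls -og /hive/data/genomes/hg19/bed/lastzMusFur1.2013-07-11/axtChain/hg19.musFur1.rbest.*.gz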
  
      # running the swap
      mkdir /hive/data/genomes/musFur1/bed/blastz.hg19.swap
      cd /hive/data/genomes/musFur1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzMusFur1.2013-07-11/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #  real    113m35.370s
  
      cat fb.musFur1.chainHg19Link.txt
      #  1419772812 bases of 2277906570 (62.328%) in intersection
  
      cd /hive/data/genomes/musFur1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # Bactrian camel/camFer1 Lastz run (DONE - 2013-07-12 - Hiram)
      screen -S camFer1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzCamFer1.2013-07-12
      cd /hive/data/genomes/hg19/bed/lastzCamFer1.2013-07-12
      cat << '_EOF_' > DEF
  # human vs Bactrian camel
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Bactrian camel CamFer1
  SEQ2_DIR=/hive/data/genomes/camFer1/camFer1.2bit
  SEQ2_LEN=/hive/data/genomes/camFer1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=60
  
  BASE=/hive/data/genomes/hg19/bed/lastzCamFer1.2013-07-12
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    839m17.894s
  
      cat fb.hg19.chainCamFer1Link.txt
      #   1460794182 bases of 2897316137 (50.419%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzCamFer1.2013-07-12 lastz.camFer1
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 camFer1 > rbest.log 2>&1 &
      #   real    51m50.739s
  
      # running the swap
      mkdir /hive/data/genomes/camFer1/bed/blastz.hg19.swap
      cd /hive/data/genomes/camFer1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzCamFer1.2013-07-12/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #  real    97m20.985s
  
      cat fb.camFer1.chainHg19Link.txt
      #    1413721651 bases of 1985442806 (71.204%) in intersection
  
      cd /hive/data/genomes/camFer1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ dolphin turTru2 (DONE - 2013-08-19 braney)
      screen -S turTru2    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzTurTru2.2013-08-19
      cd  /hive/data/genomes/hg19/bed/lastzTurTru2.2013-08-19
      cat << '_EOF_' > DEF
  # human vs dolphin
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: dolphin turTru2
  SEQ2_DIR=/hive/data/genomes/turTru2/turTru2.2bit
  SEQ2_LEN=/hive/data/genomes/turTru2/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=20
  
  BASE=/hive/data/genomes/hg19/bed/lastzTurTru2.2013-08-19
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -noDbNameCheck -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > do.log 2>&1
      # real    718m18.715s
  
      cat fb.hg19.chainTurTru2Link.txt
      # 1482195003 bases of 2897316137 (51.158%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzTurTru2.2013-08-19 lastz.turTru2
  
      cd /hive/data/genomes/hg19/bed/lastzTurTru2.2013-08-19
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 turTru2 > rbest.log 2>&1 &
      #   real    55m27.877s
  
      # running the swap
      mkdir /hive/data/genomes/turTru2/bed/blastz.hg19.swap
      cd /hive/data/genomes/turTru2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzTurTru2.2013-08-19/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real    262m24.138s
  
      cat fb.turTru2.chainHg19Link.txt
      #    1431257975 bases of 2332402443 (61.364%) in intersection
  
      cd /hive/data/genomes/turTru2/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # killer whale/orcOrc1 Lastz run  (DONE - 2013-07-31 braney)
      screen -S orcOrc1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzOrcOrc1.2013-07-31
      cd  /hive/data/genomes/hg19/bed/lastzOrcOrc1.2013-07-31
      cat << '_EOF_' > DEF
  # human vs killer whale
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: killer whale OrcOrc1
  SEQ2_DIR=/hive/data/genomes/orcOrc1/orcOrc1.2bit
  SEQ2_LEN=/hive/data/genomes/orcOrc1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=20
  
  BASE=/hive/data/genomes/hg19/bed/lastzOrcOrc1.2013-07-31
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -noDbNameCheck -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      # real    471m39.469s
      # real    136m25.890s
  
      cat fb.hg19.chainOrcOrc1Link.txt
      #  1508701710 bases of 2897316137 (52.072%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzOrcOrc1.2013-07-31 lastz.orcOrc1
  
      # redo only the syntenic net step (-continue/-stop bracket one step):
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -continue=syntenicNet -stop=syntenicNet \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > synNet.log 2>&1
      # real    47m15.614s
  
      cd /hive/data/genomes/hg19/bed/lastzOrcOrc1.2013-07-31
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 orcOrc1 > rbest.log 2>&1 &
      #   real    55m27.877s
  
      # running the swap
      mkdir /hive/data/genomes/orcOrc1/bed/blastz.hg19.swap
      cd /hive/data/genomes/orcOrc1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzOrcOrc1.2013-07-31/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real     103m7.570s
  
      cat fb.orcOrc1.chainHg19Link.txt
      #    1446385023 bases of 2249582127 (64.296%) in intersection
  
      cd /hive/data/genomes/orcOrc1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # Pacific walrus/odoRosDiv1 Lastz run (DONE - 2013-07-12 - Hiram)
      screen -S odoRosDiv1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzOdoRosDiv1.2013-07-12
      cd /hive/data/genomes/hg19/bed/lastzOdoRosDiv1.2013-07-12
      cat << '_EOF_' > DEF
  # human vs Pacific walrus
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Pacific walrus OdoRosDiv1
  SEQ2_DIR=/hive/data/genomes/odoRosDiv1/odoRosDiv1.2bit
  SEQ2_LEN=/hive/data/genomes/odoRosDiv1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=20
  
  BASE=/hive/data/genomes/hg19/bed/lastzOdoRosDiv1.2013-07-12
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -noDbNameCheck -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    916m24.264s
  
      cat fb.hg19.chainOdoRosDiv1Link.txt
      #   1551759078 bases of 2897316137 (53.559%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzOdoRosDiv1.2013-07-12 lastz.odoRosDiv1
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 odoRosDiv1 > rbest.log 2>&1 &
      #   real    55m27.877s
  
      # running the swap
      mkdir /hive/data/genomes/odoRosDiv1/bed/blastz.hg19.swap
      cd /hive/data/genomes/odoRosDiv1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzOdoRosDiv1.2013-07-12/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -noDbNameCheck -syntenicNet > swap.log 2>&1
      #   real    101m51.331s
  
      cat fb.odoRosDiv1.chainHg19Link.txt
      #    1490487100 bases of 2300235512 (64.797%) in intersection
  
      cd /hive/data/genomes/odoRosDiv1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ bonobo PanPan1 (DONE - 2013-07-15 - Hiram)
      screen -S panPan1     # use screen to manage this longish running job
      mkdir /hive/data/genomes/hg19/bed/lastzPanPan1.2013-07-15
      cd /hive/data/genomes/hg19/bed/lastzPanPan1.2013-07-15
  
      cat << '_EOF_' > DEF
  # human vs bonobo
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  BLASTZ_O=600
  BLASTZ_E=150
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: bonobo panPan1
  SEQ2_DIR=/hive/data/genomes/panPan1/panPan1.2bit
  SEQ2_LEN=/hive/data/genomes/panPan1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=100
  
  BASE=/hive/data/genomes/hg19/bed/lastzPanPan1.2013-07-15
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
  	-syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek \
  	-bigClusterHub=swarm > do.log 2>&1
      #   real    678m8.188s
      cat fb.hg19.chainPanPan1Link.txt
      #   2748426979 bases of 2897316137 (94.861%) in intersection
  
      cd /hive/data/genomes/hg19/bed
      ln -s lastzPanPan1.2013-07-15 lastz.panPan1
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 panPan1 > rbest.log 2>&1 &
      #   real    59m24.151s
  
      #   running the swap
      mkdir /hive/data/genomes/panPan1/bed/blastz.hg19.swap
      cd /hive/data/genomes/panPan1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzPanPan1.2013-07-15/DEF \
          -swap -syntenicNet \
          -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          > swap.log 2>&1
      #   real    86m18.735s
      cat fb.panPan1.chainHg19Link.txt
      #   2657982214 bases of 2725905606 (97.508%) in intersection
  
  ##############################################################################
  # LASTZ coelacanth LatCha1 (DONE - 2013-07-15 - Hiram)
      screen -S latCha1     # use screen to manage this longish running job
      mkdir /hive/data/genomes/hg19/bed/lastzLatCha1.2013-07-15
      cd /hive/data/genomes/hg19/bed/lastzLatCha1.2013-07-15
  
      cat << '_EOF_' > DEF
  # human vs coelacanth
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: coelacanth latCha1
  SEQ2_DIR=/hive/data/genomes/latCha1/latCha1.2bit
  SEQ2_LEN=/hive/data/genomes/latCha1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=100
  
  BASE=/hive/data/genomes/hg19/bed/lastzLatCha1.2013-07-15
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
      time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
  	-syntenicNet -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=encodek -qRepeats=windowmaskerSdust \
  	-bigClusterHub=swarm > do.log 2>&1
      #   real    1029m56.443s
      cat fb.hg19.chainLatCha1Link.txt
      #   79653843 bases of 2897316137 (2.749%) in intersection
  
      cd /hive/data/genomes/hg19/bed
      ln -s lastzLatCha1.2013-07-15 lastz.latCha1
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 latCha1 > rbest.log 2>&1 &
      #   real    17m52.001s
  
      #   running the swap
      mkdir /hive/data/genomes/latCha1/bed/blastz.hg19.swap
      cd /hive/data/genomes/latCha1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzLatCha1.2013-07-15/DEF \
          -swap -syntenicNet \
          -chainMinScore=5000 -chainLinearGap=loose -qRepeats=windowmaskerSdust \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          > swap.log 2>&1
      #   real    26m10.847s
      cat fb.latCha1.chainHg19Link.txt
      #   74647837 bases of 2183592768 (3.419%) in intersection
  
  ##############################################################################
  # lesser Egyptian jerboa/jacJac1 Lastz run (DONE - 2013-07-15 - Hiram)
      screen -S jacJac1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzJacJac1.2013-07-15
      cd /hive/data/genomes/hg19/bed/lastzJacJac1.2013-07-15
      cat << '_EOF_' > DEF
  # human vs lesser Egyptian jerboa
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: lesser Egyptian jerboa JacJac1
  SEQ2_DIR=/hive/data/genomes/jacJac1/jacJac1.2bit
  SEQ2_LEN=/hive/data/genomes/jacJac1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=90
  
  BASE=/hive/data/genomes/hg19/bed/lastzJacJac1.2013-07-15
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    2272m59.043s
  
      cat fb.hg19.chainJacJac1Link.txt
      #   1017128997 bases of 2897316137 (35.106%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzJacJac1.2013-07-15 lastz.jacJac1
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 jacJac1 > rbest.log 2>&1 &
      #   real    52m50.154s
  
      # running the swap
      mkdir /hive/data/genomes/jacJac1/bed/blastz.hg19.swap
      cd /hive/data/genomes/jacJac1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzJacJac1.2013-07-15/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #   real    99m45.457s
  
      cat fb.jacJac1.chainHg19Link.txt
      #    993457205 bases of 2470259869 (40.217%) in intersection
  
      cd /hive/data/genomes/jacJac1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # Tibetan antelope/panHod1 Lastz run (DONE - 2013-07-15 - Hiram)
      screen -S panHod1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzPanHod1.2013-07-15
      cd /hive/data/genomes/hg19/bed/lastzPanHod1.2013-07-15
      cat << '_EOF_' > DEF
  # human vs Tibetan antelope
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Tibetan antelope PanHod1
  SEQ2_DIR=/hive/data/genomes/panHod1/panHod1.2bit
  SEQ2_LEN=/hive/data/genomes/panHod1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=90
  
  BASE=/hive/data/genomes/hg19/bed/lastzPanHod1.2013-07-15
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    1458m9.769s
      #   chaining chr19 on hgwdev
      #   real    137m25.764s
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -continue=chainMerge -syntenicNet > chainMerge.log 2>&1
      #   real    120m53.333s
  
      cat fb.hg19.chainPanHod1Link.txt
      #   1357769091 bases of 2897316137 (46.863%) in intersection
  
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzPanHod1.2013-07-15 lastz.panHod1
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 panHod1 > rbest.log 2>&1 &
      #   real    78m31.403s
  
      # running the swap
      mkdir /hive/data/genomes/panHod1/bed/blastz.hg19.swap
      cd /hive/data/genomes/panHod1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzPanHod1.2013-07-15/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #   real    128m19.578s
  
      cat fb.panHod1.chainHg19Link.txt
      #    1315762168 bases of 2507986438 (52.463%) in intersection
  
      cd /hive/data/genomes/panHod1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # domestic goat/capHir1 Lastz run (DONE - 2013-07-15 - Hiram)
      screen -S capHir1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzCapHir1.2013-07-15
      cd /hive/data/genomes/hg19/bed/lastzCapHir1.2013-07-15
      cat << '_EOF_' > DEF
  # human vs domestic goat
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: domestic goat CapHir1
  SEQ2_DIR=/hive/data/genomes/capHir1/capHir1.2bit
  SEQ2_LEN=/hive/data/genomes/capHir1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=400
  
  BASE=/hive/data/genomes/hg19/bed/lastzCapHir1.2013-07-15
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    1689m8.583s
      # chaining chr19 on hgwdev
      #   real    181m55.026s
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -continue=chainMerge -syntenicNet > chainMerge.log 2>&1
      #   real    131m41.556s
  
      cat fb.hg19.chainCapHir1Link.txt
      #   1347204011 bases of 2897316137 (46.498%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzCapHir1.2013-07-15 lastz.capHir1
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 capHir1 > rbest.log 2>&1 &
      #   real    107m2.379s
  
      # running the swap
      mkdir /hive/data/genomes/capHir1/bed/blastz.hg19.swap
      cd /hive/data/genomes/capHir1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzCapHir1.2013-07-15/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #   real    153m53.873s
  
      cat fb.capHir1.chainHg19Link.txt
      #    1313252392 bases of 2495939845 (52.616%) in intersection
  
      cd /hive/data/genomes/capHir1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # naked mole rat/hetGla2 Lastz run (DONE - 2013-07-15 - Hiram)
      screen -S hetGla2    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzHetGla2.2013-07-15
      cd /hive/data/genomes/hg19/bed/lastzHetGla2.2013-07-15
      cat << '_EOF_' > DEF
  # human vs naked mole rat
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: naked mole rat HetGla2
  SEQ2_DIR=/hive/data/genomes/hetGla2/hetGla2.2bit
  SEQ2_LEN=/hive/data/genomes/hetGla2/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=30
  
  BASE=/hive/data/genomes/hg19/bed/lastzHetGla2.2013-07-15
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > do.log 2>&1
      #   real    1560m27.406s
      # chaining chr19 on hgwdev
      #   real    178m51.443s
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -continue=chainMerge -syntenicNet > chainMerge.log 2>&1
      #   real    138m13.867s
  
      cat fb.hg19.chainHetGla2Link.txt
      #   1379118490 bases of 2897316137 (47.600%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzHetGla2.2013-07-15 lastz.hetGla2
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 hetGla2 > rbest.log 2>&1 &
      #   real    86m23.630s
  
      # running the swap
      mkdir /hive/data/genomes/hetGla2/bed/blastz.hg19.swap
      cd /hive/data/genomes/hetGla2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzHetGla2.2013-07-15/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
          -syntenicNet > swap.log 2>&1
      #   real    151m11.917s
  
      cat fb.hetGla2.chainHg19Link.txt
      #    1361506063 bases of 2314771103 (58.818%) in intersection
  
      cd /hive/data/genomes/hetGla2/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  
  ##############################################################################
  # DBSNP B138 / SNP138 (DONE 3/28/14)
  # Redone from scratch 3/28/14 after finding that the FTP files available when b138
  # was announced were outdated and were replaced a few weeks after the announcement.
# 1.2 million SNPs were dropped because they were not in the fasta files.  Next time,
  # check dates and pay more attention to line count of snp138Errors.bed.gz.  (#12975)
  # The script ran uneventfully.
  # Partially redone 8/19/13, 10/21/13 to fix allele freq script bugs.
  # Redmine #11438
      mkdir -p /hive/data/outside/dbSNP/138/human
      cd /hive/data/outside/dbSNP/138/human
      # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
      # to find the subdir name to use as orgDir below (human_9606 in this case).
      # Then click into that directory and look for file names like
      #    b(1[0-9][0-9])_*_([0-9]+_[0-9])
      # -- use the first num for build and the second num_num for buildAssembly.
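    # (A sketch, not from the original run: curl can print that FTP listing
    # if you want to eyeball the b* filenames from the command line; the
    # grep pattern is an approximation of the filename convention above:)
    #   curl -s 'ftp://ftp.ncbi.nih.gov/snp/database/organism_data/human_9606/' \
    #     | grep -oE 'b1[0-9][0-9][A-Za-z0-9_.]*' | sort -u | head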
      # jkStuff/liftContigs.lft maps NCBI contig names to chroms; use that for liftUp.
      #
      # Some trial and error was required to get the config.ra just right --
      # the b* filenames don't include buildAssembly!
      # patch contigs needed to be filtered out:
      cat > config.ra <<EOF
  db hg19
  orgDir human_9606
  build 138
  buildAssembly
  liftUp /hive/data/genomes/hg19/jkStuff/liftContigs.lft
  refAssemblyLabel GRCh37.p10
  EOF
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
      # It fails with a list of unliftable contigs -- look at dbSnpContigsNotInUcsc.txt
      # to make sure they're all patch contigs as expected, then start over, ignoring those:
      cp dbSnpContigsNotInUcsc.txt patchContigs.txt
      cat >> config.ra <<EOF
  ignoreDbSnpContigsFile patchContigs.txt
  EOF
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
      # After final snpNcbiToUcsc tweaking:
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue translate \
        >>& do.log & tail -f do.log
      # In the load step, hgLoadSqlTab of snp138Seq failed due to duplicate sequences.
      # snp138Seq.tab is constructed in the translate step.  It would be good to
      # enhance hgLoadSeq to drop duplicates.  In the meantime, I if(0)'d the parts
      # of load.csh that completed and ran it again:
    ./load.csh >>& do.log & tail -f do.log
      # all done!
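    # (For reference, the if(0) guard mentioned above is plain csh:
    #     if (0) then
    #         ... commands for a section that already completed ...
    #     endif
    #  flip the 0 to 1 to re-enable a section.)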
  
    # In MLQ #, a user reported missing SNP IDs; I suspect that some of the
    # losses might have something to do with IDs that were mapped to patch
    # contigs and thereby lost their primary reference contig mappings.
    # There are ~2500 rs's in the first SNPContigLoc that have patch, but
    # not primary, mappings.  Next time do this early in the process in case
    # they haven't fixed their pipeline:
      # Look for reference assembly mappings that may have been lost due to patch mappings (?)
      cd /hive/data/outside/dbSNP/138/human/data
      # Make lists of primary and patch contig IDs, prefixed by "^" because we'll grep
      # for those words at the beginning of lines in the next step.
    zcat b138_ContigInfo.bcp.gz | grep GRCh37 | grep -v PATCHES \
      | cut -f 1 | sed -re 's/^/^/' \
        > primaryContigIds.txt
    zcat b138_ContigInfo.bcp.gz | grep -w PATCHES \
      | cut -f 1 | sed -re 's/^/^/' \
        > patchContigIds.txt
      # Make lists of SNPs mapped to primary contigs and patch contigs.
      # Put contig IDs first because we're grepping to find them at the beginning of the line.
      zcat b138_SNPContigLoc.orig.bcp.gz | awk '{print $3, $2;}' > contigIdsAndRsIds.txt
      grep -wf primaryContigIds.txt contigIdsAndRsIds.txt > primaryContigsAndRsIds.txt
      grep -wf patchContigIds.txt contigIdsAndRsIds.txt > patchContigsAndRsIds.txt
      # Now trim to keep just rs IDs:
      awk '{print $2;}' primaryContigsAndRsIds.txt | uniq > rsIdsOnPrimary.txt
      awk '{print $2;}' patchContigsAndRsIds.txt | uniq > rsIdsOnPatches.txt
      # Compare to find rs IDs mapped to patch contigs but not to primary contigs:
      sort rsIdsOnPrimary.txt > rsIdsOnPrimary.alphaSort.txt
      sort rsIdsOnPatches.txt > rsIdsOnPatches.alphaSort.txt
      comm -13 rsIdsOnPrimary.alphaSort.txt rsIdsOnPatches.alphaSort.txt \
      | sort -nu \
        > rsIdsOnPatchesNotPrimary.txt
      wc -l rsIdsOnPatchesNotPrimary.txt
  #0 rsIdsOnPatchesNotPrimary.txt
      # yay!
  
      # 8/19/13: rebuilding after fix #11544 -- many allele frequencies were combined
      # incorrectly due to stranded data (but not strand flag) in SNPAlleleFreq and
      # positive-strand-only data in SNPAlleleFreq_TGP.
      # 10/11/13: I forgot to add the -deDupTGP argument to snpAddTGPAlleleFreq.pl !
      # Re-running steps from snpAddTGPAlleleFreq.pl on, this time with -deDupTGP.
      # 10/21/13: Jonathan noticed a SNP for which "-" appeared twice in alleles.
      # Fixed, re-running again.
      cd /hive/data/outside/dbSNP/138/human
      mkdir preAlleleFreqFix
      mv ucscAlleleFreq.txt.gz snp138*.bed.gz preAlleleFreqFix/
      # Run with -debug to regenerate addToDbSnp script:
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra \
        -continue addToDbSnp -stop addToDbSnp -debug
      # Now re-run the updated snpAddTGPAlleleFreq.pl command:
      grep snpAdd addToDbSnp.csh
      ~/kent/src/hg/utils/automation/snpAddTGPAlleleFreq.pl hg19snp138 \
        -contigLoc=b138_SNPContigLoc -deDupTGP > ucscAlleleFreq.txt
      # Reload the ucscAlleleFreq table:
      hgLoadSqlTab hg19snp138 ucscAlleleFreq{,.sql,.txt} >>& do.log & tail -f do.log
      # Redo the big join:
      set tmpDir = `cat /hive/data/outside/dbSNP/138/human/workingDir`
      mkdir -p $tmpDir
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue bigJoin -stop bigJoin \
        >>& do.log & tail -f do.log
     # Manually re-run snpNcbiToUcsc (the translate step includes fasta file concat
     # and indexing, which would waste a lot of time):
     cd $tmpDir
     echo 'select * from par' | hgsql hg19 -NB > par.bed
     snpNcbiToUcsc -snp132Ext -par=par.bed ucscNcbiSnp.bed /hive/data/genomes/hg19/hg19.2bit snp138 \
       >>& /hive/data/outside/dbSNP/138/human/do.log &
     tail -f /hive/data/outside/dbSNP/138/human/do.log
     head snp138Errors.bed >>& /hive/data/outside/dbSNP/138/human/do.log &
     tail -f /hive/data/outside/dbSNP/138/human/do.log
     wc -l snp138* >>& /hive/data/outside/dbSNP/138/human/do.log &
     tail -f /hive/data/outside/dbSNP/138/human/do.log
     gzip *.txt *.bed *.tab
     cp -p * /hive/data/outside/dbSNP/138/human/
     cd /hive/data/outside/dbSNP/138/human
     rm $tmpDir/*
     # Edit load.csh to skip snp138Seq -- it's huge and nothing changed there.
     csh -efx ./load.csh >>& do.log & tail -f do.log
  
  
  #############################################################################
  # FILTER SNP138 (DONE 3/28/14 angie)
  # Redone 3/28/14 after redo w/new b138 FTP files (#12975).
  # Redone 10/22/13.
     # Redmine #11438
     # Make several tracks that are filtered subsets of snp138:
     # First, filter out the multiply-aligned and/or weight >1 SNPs -> snp138Mult
     # Second, siphon off the common variants -> snp138Common
   # Third, take the (uniquely mapped, not known to be common) variants
     # w/dbSNP's "clinically-assoc" flag -> snp138Flagged
     cd /hive/data/outside/dbSNP/138/human
     zcat snp138.bed.gz \
     | ~/kent/src/hg/utils/automation/categorizeSnps.pl
  #Mult:     3617212
  #Common:   14056661
  #Flagged:  91499
  #leftover: 47699648
     foreach f ({Mult,Common,Flagged}.bed.gz)
       mv $f snp138$f
     end
     # Compare to counts from 10/22, before complete re-run w/updated FTP files:
  #Mult:     3616194
  #Common:   14056510
  #Flagged:  90349
  #leftover: 46502161
     # Compare to counts from before fixing #11954 ('-' appearing twice in alleles):
  #Mult:     3616194
  #Common:   14056627
  #Flagged:  90349
  #leftover: 46502044
     # Compare to counts from before counting 100% non-reference as common:
  #snp138Mult:    3616194
  #snp138Common:  14038502
  #snp138Flagged: 90351
  #leftover:      46520167
     # Compare to counts from before fixing allele freq strand bug:
  #snp138Mult:    3616194
  #snp138Common:  14057590
  #snp138Flagged: 90189
  #leftover:      46501241
     # So 162 SNPs would have been left out of snp138Flagged.
  
     # Load tables
     foreach subset (Mult Common Flagged)
       hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \
         hg19 snp138$subset -sqlTable=snp138.sql snp138$subset.bed.gz
     end
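   # (Optional spot check, not in the original log: confirm the loaded row
   # counts match the categorizeSnps.pl tallies above.)
   foreach subset (Mult Common Flagged)
     hgsql hg19 -N -B -e "select count(*) from snp138$subset"
   end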
  
  
  #############################################################################
# SNP138 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 3/31/14 angie)
  # Redone 3/31/14 after redo w/new b138 FTP files (#12975).
  # Previously done 8/1/13
     # Redmine #11438
     mkdir /hive/data/genomes/hg19/bed/snp138Ortho
     cd /hive/data/genomes/hg19/bed/snp138Ortho
   # Filter snp138 to keep only uniquely mapped biallelic SNVs (class=single, length=1):
     zcat /hive/data/outside/dbSNP/138/human/snp138.bed.gz \
     | awk '$18 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
     | sort -u \
       > snp138ExcludeIds.txt
     wc -l snp138ExcludeIds.txt
  #1334416 snp138ExcludeIds.txt
     # Glom all human info that we need for the final table onto the
     # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
     zcat /hive/data/outside/dbSNP/138/human/snp138.bed.gz \
     | awk '$3-$2 == 1 && $11 == "single" {print;}' \
     | grep -vFwf snp138ExcludeIds.txt \
     | awk 'BEGIN{OFS="\t";} \
         {print $1, $2, $3, \
                $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
                0, $6;}' \
       > snp138ForLiftOver.bed
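   # (Hypothetical example of one glommed record, for illustration only:
   #  chr1  12345  12346  rs99999|chr1|12345|12346|A/G|A|+  0  +  )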
  
     # Map coords to chimp using liftOver.
     mkdir run.liftOChimp
     cd run.liftOChimp
     mkdir split out
     splitFile ../snp138ForLiftOver.bed 10000 split/chunk
     cp /dev/null jobList
     foreach f (split/chunk*)
       echo liftOver $f \
         /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro4.over.chain.gz \
         \{check out exists out/panTro4.$f:t.bed\} out/hg19.$f:t.unmapped \
         >> jobList
     end
     ssh swarm
     cd /hive/data/genomes/hg19/bed/snp138Ortho/run.liftOChimp
     para make jobList
  #Completed: 5404 of 5404 jobs
  #CPU time in finished jobs:     366531s    6108.84m   101.81h    4.24d  0.012 y
  #IO & Wait Time:                 18722s     312.04m     5.20h    0.22d  0.001 y
  #Average job time:                  71s       1.19m     0.02h    0.00d
  #Longest finished job:             270s       4.50m     0.07h    0.00d
  #Submission to last job:          1562s      26.03m     0.43h    0.02d
  
     # Map coords to orangutan using liftOver.
     mkdir ../run.liftOPon
     cd ../run.liftOPon
     mkdir out
     ln -s ../run.liftOChimp/split .
     cp /dev/null jobList
     foreach f (split/chunk*)
       echo liftOver $f \
         /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
         \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
         >> jobList
     end
     para make jobList
  #Completed: 5404 of 5404 jobs
  #CPU time in finished jobs:     737428s   12290.47m   204.84h    8.54d  0.023 y
  #IO & Wait Time:                 24367s     406.12m     6.77h    0.28d  0.001 y
  #Average job time:                 141s       2.35m     0.04h    0.00d
  #Longest finished job:             653s      10.88m     0.18h    0.01d
  #Submission to last job:          3135s      52.25m     0.87h    0.04d
  
     # Map coords to macaque using liftOver.
     mkdir ../run.liftOMac
     cd ../run.liftOMac
     mkdir out
     ln -s ../run.liftOChimp/split .
     cp /dev/null jobList
     foreach f (split/chunk*)
       echo liftOver $f \
         /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac3.over.chain.gz \
         \{check out exists out/rheMac3.$f:t.bed\} out/hg19.$f:t.unmapped \
         >> jobList
     end
     para make jobList
  #Completed: 5404 of 5404 jobs
  #CPU time in finished jobs:     783517s   13058.61m   217.64h    9.07d  0.025 y
  #IO & Wait Time:                 24306s     405.10m     6.75h    0.28d  0.001 y
  #Average job time:                 149s       2.49m     0.04h    0.00d
  #Longest finished job:             653s      10.88m     0.18h    0.01d
  #Submission to last job:          5939s      98.98m     1.65h    0.07d
  
     cd /hive/data/genomes/hg19/bed/snp138Ortho
     # Concatenate the chimp results, sorting by chimp pos in order to
     # efficiently access 2bit sequence in getOrthoSeq.  The output of
     # that is then sorted by the glommed human info field, so that we
     # can use join to combine chimp and macaque results in the next step.
     # Ditto for macaque and orangutan.  Each command pipe takes ~15 minutes:
     sort -k1,1 -k2n,2n run.liftOChimp/out/panTro4.chunk*.bed \
     | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro4/panTro4.2bit \
     | sort > panTro4.orthoGlom.txt
     sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
     | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
     | sort > ponAbe2.orthoGlom.txt
     sort -k1,1 -k2n,2n run.liftOMac/out/rheMac3.chunk*.bed \
     | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac3/rheMac3.2bit \
     | sort > rheMac3.orthoGlom.txt
     wc -l panTro4.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac3.orthoGlom.txt
  #   50847196 panTro4.orthoGlom.txt
  #   48383647 ponAbe2.orthoGlom.txt
  #   43702072 rheMac3.orthoGlom.txt
  
   # Use the glommed name field as a key to join up the chimp and orangutan
   # allele data.  Include the glommed name from both files because if only
   # file 2 has a line for the key, then field 1.1 is empty.  Then plop in
   # the orthoGlom fields from each file, in the same species order as the
   # ortho columns of the final snp138OrthoPt4Pa2Rm3 table (macaque is
   # joined in below).
     join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
       -a 1 -a 2 -e '?' \
       panTro4.orthoGlom.txt ponAbe2.orthoGlom.txt \
     | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
             else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
       > tmp.txt
     join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
       -a 1 -a 2 -e '?' \
       tmp.txt rheMac3.orthoGlom.txt \
     | perl -wpe 'chomp; \
         ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
          $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
          $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
         $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
         ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
           split(/\|/, $glomKey); \
         $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
         $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
         print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                          $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                          $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                          $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
         s/^.*$//;' \
     | sort -k1,1 -k2n,2n > snp138OrthoPt4Pa2Rm3.bed
     hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
       -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
       hg19 snp138OrthoPt4Pa2Rm3 snp138OrthoPt4Pa2Rm3.bed
  #Read 52573568 elements of size 22 from snp138OrthoPt4Pa2Rm3.bed
     # Cleanup:
     rm -r run*/split tmp.txt
     gzip snp138ExcludeIds.txt snp138ForLiftOver.bed &
  
  
  ############################################################################
  # DBSNP CODING ANNOTATIONS (138) (DONE 3/31/14 angie)
     # Previously done 8/1/13
     cd /hive/data/outside/dbSNP/138/human
     # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed.
     # For anything except an insertion (0 bases between flanks),
     # we need to add 1 to the end coord.  For an insertion, we need
     # to add 1 to the start coord.  Make a hash of the insertion IDs,
     # then look up each ID in ncbiFuncAnnotations.txt to tell which
     # transform to apply.
     # Note: sort -u with the keys below is too restrictive -- we need full line uniq.
      zcat ncbiFuncAnnotations.txt.gz \
      | ~/kent/src/hg/utils/automation/fixNcbiFuncCoords.pl ncbiFuncInsertions.ctg.bed.gz \
      | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \
      | uniq \
        > ncbiFuncAnnotationsFixed.txt
    wc -l ncbiFuncAnnotationsFixed.txt
#6187796 ncbiFuncAnnotationsFixed.txt
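   # (Toy illustration of the coordinate rule above -- fixNcbiFuncCoords.pl
   # does the real work; this sketch assumes a made-up 4-column input of
   # id, ncbiStart, ncbiEnd, isInsertion:)
   #   awk 'BEGIN{OFS="\t";} {if ($4) $2 += 1; else $3 += 1; print;}' toy.txt
   # e.g. a single-base SNV "rs1 100 100 0" (0-based, fully closed) becomes
   # "rs1 100 101 0" (0-based, half-open).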
     # How many & what kinds of function types?
     cut -f 6 ncbiFuncAnnotationsFixed.txt \
     | sort -n | uniq -c
  #1013446 3  (coding-synon)
  #2830106 8  (cds-reference -- ignored)
  #  50753 41 (nonsense)
  #1762922 42 (missense)
  #   1623 43 (stop-loss)
  #  51453 44 (frameshift)
  # 477493 45 (cds-indel)
  
     # In b138, the functional annotations include non-coding (frame = NULL),
     # which we'll exclude here because this is supposed to be just coding stuff...
     # probably need to update how we show dbSNP's func annos anyway, e.g.
     # it is a shame that we toss out codon number and transcript offset.
     # Gather up multiple annotation lines into one line per {snp, gene, frame}:
     ~/kent/src/hg/utils/automation/collectNcbiFuncAnnotations.pl ncbiFuncAnnotationsFixed.txt \
     | liftUp snp138CodingDbSnp.bed /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
     # This had some hg19-specific warnings, e.g.
  #NC_012920 isn't in liftSpec file line 2973 of stdin
  #NC_012920 isn't in liftSpec file line 2981 of stdin
   # That's the chrM sequence that hg19 didn't use.  We need to pull out the
   # NC_012920 lines and use liftOver instead of liftUp.
   grep -w NC_012920 ncbiFuncAnnotationsFixed.txt \
     | ~/kent/src/hg/utils/automation/collectNcbiFuncAnnotations.pl \
     | liftOver -tab -bedPlus=3 stdin /hive/data/outside/dbSNP/131/human/NC_012920ToChrM.over.chain \
         snp138CodingDbSnp.chrM.bed snp138CodingDbSnp.chrM.unmapped
     # Now sort them back together:
     mv snp138CodingDbSnp.bed snp138CodingDbSnp.allButChrM.bed
     sort -k1,1 -k2n,2n snp138CodingDbSnp.chrM.bed snp138CodingDbSnp.allButChrM.bed \
       > snp138CodingDbSnp.bed
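   # (Quick sanity check, not in the original log: sort -c exits nonzero
   # if the merged file is out of order.)
   sort -c -k1,1 -k2n,2n snp138CodingDbSnp.bed && echo sorted OK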
  
     # Load table
     hgLoadBed hg19 snp138CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
       -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
       snp138CodingDbSnp.bed
  #Read 3278887 elements of size 11 from snp138CodingDbSnp.bed
  
  
  ############################################################################
  # SNPMASKED SEQUENCE FOR SNP138 (DONE 3/31/14 angie)
  # Redone 3/31/14 after redo w/new b138 FTP files (#12975).
  # Updated 3/17/14 when user reported in MLQ #12902 that not all files were gzipped.
  # Originally done 8/19/13.
     mkdir /hive/data/genomes/hg19/snp138Mask
     cd /hive/data/genomes/hg19/snp138Mask
     # Identify rsIds with various problems -- we will exclude those.
     zcat /hive/data/outside/dbSNP/138/human/snp138.bed.gz \
     | awk '$18 ~ /MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved/ {print $4;}' \
       | sort -u \
       > snp138ExcludeRsIds.txt
     zcat /hive/data/outside/dbSNP/138/human/snp138.bed.gz \
     | grep -vFwf snp138ExcludeRsIds.txt \
       > snp138Cleaned.bed
     wc -l snp138Cleaned.bed
  #60882371 snp138Cleaned.bed
  
     # Substitutions:
     mkdir substitutions
     snpMaskSingle snp138Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout diffObserved.txt \
     | faSplit byname stdin substitutions/
  #Masked 54208201 snps in 54208183 out of 3137040325 genomic bases
  #/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3137040325 (difference is 120939)
     # Check that 120939 is the total #bases in sequences with nothing in snp138Cleaned:
     grep -Fw single snp138Cleaned.bed | cut -f 1 | uniq > /data/tmp/1
     grep -vwf /data/tmp/1 ../chrom.sizes \
     | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
  #120939
     # warnings about differing observed strings at same base position:
     wc -l diffObserved.txt
  #24 diffObserved.txt
  #TODO: send list to dbSNP.  (much smaller list now!)
     # Make sure that sizes are identical, first diffs are normal -> IUPAC,
     # and first diffs' case is preserved:
     foreach f (substitutions/chr*.fa)
       faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ"
     end
  #chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 10176 (m != a)
  #chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 60522 (K != T)
  #...
  #(output OK -- ambiguous bases replacing [agct] at SNP positions)
     foreach f (substitutions/chr*.fa)
       echo $f:t:r
       mv $f $f:r.subst.fa
     end
     # Fire off a bunch of gzip jobs in parallel:
     ls -1 substitutions/*.fa | split -l 5
     foreach f (x??)
       gzip `cat $f` &
     end
   # Wait for the backgrounded gzip jobs to complete, then remove the split lists:
     rm x??
  
     # Insertions & deletions not done.  To date we have only offered substs for download.
     # If there is user demand, use template from snp131 above.
  
     # Clean up and prepare for download:
     gzip snp138Cleaned.bed &
     foreach d (substitutions)
       pushd $d
         md5sum *.gz > md5sum.txt
         cp /hive/data/genomes/hg19/snp137Mask/$d/README.txt .
       popd
     end
     # Edit the README.txt.
  
     # Create download links on hgwdev.
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp138Mask
     ln -s /hive/data/genomes/hg19/snp138Mask/substitutions/* \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp138Mask/
  
  
  #########################################################################
  # create ucscToINSDC name mapping (DONE - 2013-08-15 - Hiram)
      # this allows the "ensembl" blue bar button to appear
      mkdir /hive/data/genomes/hg19/bed/ucscToINSDC
      cd /hive/data/genomes/hg19/bed/ucscToINSDC
  
      cat << '_EOF_' > translateNames.sh
  #!/bin/sh
  
  cat ../../genbank/Primary_Assembly/assembled_chromosomes/chr2acc \
     | sed -e 's/^/chr/'
  
  cat ../../genbank/ALT_REF*/localID2acc \
     | sed -e 's/HSCHR6_MHC_APD_CTG1/chr6_apd_hap1/;
  s/HSCHR6_MHC_COX_CTG1/chr6_cox_hap2/;
  s/HSCHR6_MHC_DBB_CTG1/chr6_dbb_hap3/;
  s/HSCHR6_MHC_MANN_CTG1/chr6_mann_hap4/;
  s/HSCHR6_MHC_MCF_CTG1/chr6_mcf_hap5/;
  s/HSCHR6_MHC_QBL_CTG1/chr6_qbl_hap6/;
  s/HSCHR6_MHC_SSTO_CTG1/chr6_ssto_hap7/;
  s/HSCHR4_1_CTG9/chr4_ctg9_hap1/;
  s/HSCHR17_1_CTG5/chr17_ctg5_hap1/;'
  
  zcat ../../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz | grep -v "^#" | cut -f1 \
     | sed -e 's/^GL\([0-9]*\).1/chrUn_gl\1\tGL\1.1/;'
  
  grep -v "^#" \
    ../../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf \
      | sed -e 's/^\([0-9]*\)\tGL\([0-9]*\).1/chr\1_gl\2_random\tGL\2.1/;'
  
  echo -e "chrM\tNC_001807.4"
  '_EOF_'
      # << happy emacs
  
      chmod +x translateNames.sh
      ./translateNames.sh | sort > ucscToINSDC.txt
      join <(sort ../../chrom.sizes) ucscToINSDC.txt \
          | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' > ucscToINSDC.tab
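    # expected form of a record (chr1 maps to its GRCh37 GenBank accession):
    #   chr1    0       249250621       CM000663.1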
  
      # maximum size of UCSC chrom name for SQL index
      cut -f1 ucscToINSDC.tab | awk '{print length($0)}' | sort -n | tail -1
      #   21
  
      hgLoadSqlTab hg19 ucscToINSDC $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         ucscToINSDC.tab
  
      # verify the track link to INSDC functions
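    # e.g. a quick look (not in the original log):
    #   hgsql hg19 -e 'select * from ucscToINSDC limit 5'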
  
  #########################################################################
  # lastz Fugu fr3 (DONE - 2013-08-20 - Hiram)
      # establish a screen to control this job with a name to indicate what it is
      screen -S fr3
      mkdir /hive/data/genomes/hg19/bed/lastzFr3.2013-08-20
      cd /hive/data/genomes/hg19/bed/lastzFr3.2013-08-20
  
      cat << '_EOF_' > DEF
  # Human vs. Fugu
  # Try "human-fugu" (more distant, less repeat-killed than mammal) params
  # +M=50:
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Fugu fr3
  SEQ2_DIR=/hive/data/genomes/fr3/fr3.2bit
  SEQ2_LEN=/hive/data/genomes/fr3/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=30
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzFr3.2013-08-20
  TMPDIR=/dev/shm
  '_EOF_'
      # << happy emacs
  
      # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
      #	number of jobs, 50,000 to something under 100,000
      # when not present, SEQ2_LIMIT is a default 100
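    # for example (a sketch, not part of this run's log): stop after the
    # partition step and count cluster jobs before committing to a full run;
    # the jobList path is an assumption based on other runs in this doc:
    #   doBlastzChainNet.pl -stop=partition `pwd`/DEF > partition.log 2>&1
    #   wc -l run.blastz/jobList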
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
      #   Elapsed time: 77m52s
  
      cat fb.hg19.chainFr3Link.txt
      #   49339815 bases of 2897316137 (1.703%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/hg19/bed
      ln -s lastzFr3.2013-08-20 lastz.fr3
  
      #	and for the swap
      mkdir /hive/data/genomes/fr3/bed/blastz.hg19.swap
      cd /hive/data/genomes/fr3/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzFr3.2013-08-20/DEF \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
      #   real     8m36.196s
      cat  fb.fr3.chainHg19Link.txt
      #	43105455 bases of 350961831 (12.282%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/fr3/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ  Yangtze River dolphin lipVex1 Lastz run (DONE - 2013-08-22 - Hiram)
      screen -S lipVex1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzLipVex1.2013-08-22
      cd  /hive/data/genomes/hg19/bed/lastzLipVex1.2013-08-22
      cat << '_EOF_' > DEF
  # human vs Yangtze River dolphin
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Yangtze River dolphin lipVex1
  SEQ2_DIR=/hive/data/genomes/lipVex1/lipVex1.2bit
  SEQ2_LEN=/hive/data/genomes/lipVex1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=200
  
  BASE=/hive/data/genomes/hg19/bed/lastzLipVex1.2013-08-22
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > do.log 2>&1
      # interrupted due to reconfigurations at SDSC
      # continuing after finishing the run.blastz:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -continue=cat -syntenicNet > cat.log 2>&1
      # real    336m27.196s
  
      cat fb.hg19.chainLipVex1Link.txt
      # 1508704119 bases of 2897316137 (52.072%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzLipVex1.2013-08-22 lastz.lipVex1
  
      cd /hive/data/genomes/hg19/bed/lastzLipVex1.2013-08-22
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 lipVex1 > rbest.log 2>&1 &
      #   real    64m49.366s
  
      # running the swap
      mkdir /hive/data/genomes/lipVex1/bed/blastz.hg19.swap
      cd /hive/data/genomes/lipVex1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzLipVex1.2013-08-22/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real     112m58.566s
  
      cat fb.lipVex1.chainHg19Link.txt
      #    1465154822 bases of 2397016190 (61.124%) in intersection
  
      cd /hive/data/genomes/lipVex1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ  spiny softshell turtle apaSpi1 Lastz run (DONE - 2013-08-26 - Hiram)
      screen -S apaSpi1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzApaSpi1.2013-08-26
      cd  /hive/data/genomes/hg19/bed/lastzApaSpi1.2013-08-26
      cat << '_EOF_' > DEF
  # human vs spiny softshell turtle
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: spiny softshell turtle apaSpi1
  SEQ2_DIR=/hive/data/genomes/apaSpi1/apaSpi1.2bit
  SEQ2_LEN=/hive/data/genomes/apaSpi1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=800
  
  BASE=/hive/data/genomes/hg19/bed/lastzApaSpi1.2013-08-26
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -qRepeats=windowmaskerSdust -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1
      # real    1667m22.615s
      # rework to get -qRepeats=windowmaskerSdust done properly
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -continue=chainRun -qRepeats=windowmaskerSdust -chainMinScore=5000 \
          -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          > chainRun.log 2>&1
      # real    14m9.806s
  
      cat fb.hg19.chainApaSpi1Link.txt
      # 85536067 bases of 2897316137 (2.952%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzApaSpi1.2013-08-26 lastz.apaSpi1
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 apaSpi1 > rbest.log 2>&1 &
      #   real    18m15.781s
  
      # running the swap
      mkdir /hive/data/genomes/apaSpi1/bed/blastz.hg19.swap
      cd /hive/data/genomes/apaSpi1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzApaSpi1.2013-08-26/DEF \
          -qRepeats=windowmaskerSdust -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          > swap.log 2>&1
      #  real    35m2.669s
  
      cat fb.apaSpi1.chainHg19Link.txt
      #    73764290 bases of 1877982184 (3.928%) in intersection
  
      cd /hive/data/genomes/apaSpi1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ  green seaturtle cheMyd1 Lastz run (DONE - 2013-08-26 - Hiram)
      screen -S cheMyd1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzCheMyd1.2013-08-26
      cd  /hive/data/genomes/hg19/bed/lastzCheMyd1.2013-08-26
      cat << '_EOF_' > DEF
  # human vs green seaturtle
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: green seaturtle cheMyd1
  SEQ2_DIR=/hive/data/genomes/cheMyd1/cheMyd1.2bit
  SEQ2_LEN=/hive/data/genomes/cheMyd1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=800
  
  BASE=/hive/data/genomes/hg19/bed/lastzCheMyd1.2013-08-26
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1
      # real    1293m10.598s
      # rework to get -qRepeats=windowmaskerSdust done properly
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -continue=chainRun -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -qRepeats=windowmaskerSdust > chainRun.log 2>&1
      #  real    17m37.540s
  
      cat fb.hg19.chainCheMyd1Link.txt
      # 107081916 bases of 2897316137 (3.696%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzCheMyd1.2013-08-26 lastz.cheMyd1
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 cheMyd1 > rbest.log 2>&1 &
      #   real    9m42.319s
  
      # running the swap
      mkdir /hive/data/genomes/cheMyd1/bed/blastz.hg19.swap
      cd /hive/data/genomes/cheMyd1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzCheMyd1.2013-08-26/DEF \
          -qRepeats=windowmaskerSdust -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          > swap.log 2>&1
      #   real    17m13.262s
  
      cat fb.cheMyd1.chainHg19Link.txt
      #    96045284 bases of 2110381997 (4.551%) in intersection
  
      cd /hive/data/genomes/cheMyd1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
# LASTZ  prairie vole micOch1 Lastz run (DONE - 2013-08-26 - Hiram)
      screen -S micOch1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzMicOch1.2013-08-26
      cd  /hive/data/genomes/hg19/bed/lastzMicOch1.2013-08-26
      cat << '_EOF_' > DEF
# human vs prairie vole
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
# QUERY: prairie vole micOch1
  SEQ2_DIR=/hive/data/genomes/micOch1/micOch1.2bit
  SEQ2_LEN=/hive/data/genomes/micOch1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=40
  
  BASE=/hive/data/genomes/hg19/bed/lastzMicOch1.2013-08-26
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1
      # real    1238m6.278s
  
      cat fb.hg19.chainMicOch1Link.txt
      # 935400030 bases of 2897316137 (32.285%) in intersection
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -syntenicNet -continue=syntenicNet -stop=syntenicNet \
          -chainMinScore=3000 -chainLinearGap=medium -workhorse=hgwdev \
          -smallClusterHub=ku -bigClusterHub=ku > synNet.log 2>&1
      # real    39m31.241s
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzMicOch1.2013-08-26 lastz.micOch1
      cd lastz.micOch1
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 micOch1 > rbest.log 2>&1 &
      #   real    62m39.003s
  
      # running the swap
      mkdir /hive/data/genomes/micOch1/bed/blastz.hg19.swap
      cd /hive/data/genomes/micOch1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzMicOch1.2013-08-26/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real    104m38.965s
  
      cat fb.micOch1.chainHg19Link.txt
      #    908760510 bases of 2104321675 (43.185%) in intersection
  
      cd /hive/data/genomes/micOch1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
      # rbest catch up 2017-03-16
      time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          micOch1 hg19) > rbest.log 2>&1
      # real    375m5.044s
  
  #########################################################################
  # lastz southern platyfish xipMac1 (DONE - 2013-08-27 - Hiram)
      # establish a screen to control this job with a name to indicate what it is
      screen -S xipMac1
      mkdir /hive/data/genomes/hg19/bed/lastzXipMac1.2013-08-27
      cd /hive/data/genomes/hg19/bed/lastzXipMac1.2013-08-27
  
      cat << '_EOF_' > DEF
  # Human vs. southern platyfish
  # (more distant, less repeat-killed than mammal) params
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: southern platyfish xipMac1
  SEQ2_DIR=/hive/data/genomes/xipMac1/xipMac1.2bit
  SEQ2_LEN=/hive/data/genomes/xipMac1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=100
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzXipMac1.2013-08-27
  TMPDIR=/dev/shm
  '_EOF_'
      # << happy emacs
  
      # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
      #	number of jobs, 50,000 to something under 100,000
      # when not present, SEQ2_LIMIT is a default 100
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -qRepeats=windowmaskerSdust -chainMinScore=5000 \
          -chainLinearGap=loose > do.log 2>&1
      #   real    743m56.684s
      # stuck on one job, continuing:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -qRepeats=windowmaskerSdust -chainMinScore=5000 \
          -continue=cat -chainLinearGap=loose > cat.log 2>&1
      #   real    17m14.187s
  
      cat fb.hg19.chainXipMac1Link.txt
      #   53354483 bases of 2897316137 (1.842%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/hg19/bed
      ln -s lastzXipMac1.2013-08-27 lastz.xipMac1
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 xipMac1 > rbest.log 2>&1 &
      #   real    5m49.906s
  
      #	and for the swap
      mkdir /hive/data/genomes/xipMac1/bed/blastz.hg19.swap
      cd /hive/data/genomes/xipMac1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzXipMac1.2013-08-27/DEF \
          -qRepeats=windowmaskerSdust -workhorse=hgwdev -smallClusterHub=ku \
          -bigClusterHub=ku \
          -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
      #   real     8m28.415s
      cat  fb.xipMac1.chainHg19Link.txt
      #	47814217 bases of 652815383 (7.324%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/xipMac1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # lastz Xenopus tropicalis xenTro7 (DONE - 2013-08-28 - Hiram)
      # establish a screen to control this job with a name to indicate what it is
      screen -S xenTro7
      mkdir /hive/data/genomes/hg19/bed/lastzXenTro7.2013-08-28
      cd /hive/data/genomes/hg19/bed/lastzXenTro7.2013-08-28
  
      cat << '_EOF_' > DEF
  # Human vs. Xenopus tropicalis
  # (more distant, less repeat-killed than mammal) params
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Xenopus tropicalis xenTro7
  SEQ2_DIR=/hive/data/genomes/xenTro7/xenTro7.2bit
  SEQ2_LEN=/hive/data/genomes/xenTro7/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=50
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzXenTro7.2013-08-28
  TMPDIR=/dev/shm
  '_EOF_'
      # << happy emacs
  
      # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
      #	number of jobs, 50,000 to something under 100,000
      # when not present, SEQ2_LIMIT is a default 100
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -chainMinScore=5000 \
          -chainLinearGap=loose > do.log 2>&1
      #   real    555m40.795s
  
      cat fb.hg19.chainXenTro7Link.txt
      #   91350514 bases of 2897316137 (3.153%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/hg19/bed
      ln -s lastzXenTro7.2013-08-28 lastz.xenTro7
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 xenTro7 > rbest.log 2>&1 &
      #   real     58m54.190s
  
      #	and for the swap
      mkdir /hive/data/genomes/xenTro7/bed/blastz.hg19.swap
      cd /hive/data/genomes/xenTro7/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzXenTro7.2013-08-28/DEF \
          -workhorse=hgwdev -smallClusterHub=ku \
          -bigClusterHub=ku \
          -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
      #   real     62m38.163s
      cat  fb.xenTro7.chainHg19Link.txt
      #	92294714 bases of 1365936747 (6.757%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/xenTro7/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ pig susScr3 Lastz run (DONE - 2013-08-28 - Hiram)
      screen -S susScr3    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzSusScr3.2013-08-28
      cd  /hive/data/genomes/hg19/bed/lastzSusScr3.2013-08-28
      cat << '_EOF_' > DEF
  # human vs pig
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: pig susScr3
  SEQ2_DIR=/hive/data/genomes/susScr3/susScr3.2bit
  SEQ2_LEN=/hive/data/genomes/susScr3/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=100
  
  BASE=/hive/data/genomes/hg19/bed/lastzSusScr3.2013-08-28
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1
      # real    985m36.384s
  
      cat fb.hg19.chainSusScr3Link.txt
      # 1332108993 bases of 2897316137 (45.977%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzSusScr3.2013-08-28 lastz.susScr3
  
      cd /hive/data/genomes/hg19/bed/lastzSusScr3.2013-08-28
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 susScr3 > rbest.log 2>&1 &
      #   real    168m41.815s
  
      # running the swap
      mkdir /hive/data/genomes/susScr3/bed/blastz.hg19.swap
      cd /hive/data/genomes/susScr3/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzSusScr3.2013-08-28/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real    190m59.779s
  
      cat fb.susScr3.chainHg19Link.txt
      #    1445651389 bases of 2525294057 (57.247%) in intersection
  
      cd /hive/data/genomes/susScr3/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ squirrel speTri2 Lastz run (DONE - 2013-08-28 - Hiram)
      screen -S speTri2    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzSpeTri2.2013-08-28
      cd  /hive/data/genomes/hg19/bed/lastzSpeTri2.2013-08-28
      cat << '_EOF_' > DEF
  # human vs squirrel
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: squirrel speTri2
  SEQ2_DIR=/hive/data/genomes/speTri2/speTri2.2bit
  SEQ2_LEN=/hive/data/genomes/speTri2/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=80
  
  BASE=/hive/data/genomes/hg19/bed/lastzSpeTri2.2013-08-28
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1
      # real    830m26.230s
  
      cat fb.hg19.chainSpeTri2Link.txt
      # 1440632189 bases of 2897316137 (49.723%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzSpeTri2.2013-08-28 lastz.speTri2
  
      cd /hive/data/genomes/hg19/bed/lastzSpeTri2.2013-08-28
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 speTri2 > rbest.log 2>&1 &
      #   real    117m12.195s
  
      # running the swap
      mkdir /hive/data/genomes/speTri2/bed/blastz.hg19.swap
      cd /hive/data/genomes/speTri2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzSpeTri2.2013-08-28/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real    174m25.240s
  
      cat fb.speTri2.chainHg19Link.txt
      #    1425773347 bases of 2311060300 (61.693%) in intersection
  
      cd /hive/data/genomes/speTri2/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # lastz Nile Tilapia oreNil2 (DONE - 2013-08-28 - Hiram)
      # establish a screen to control this job with a name to indicate what it is
      screen -S oreNil2
      mkdir /hive/data/genomes/hg19/bed/lastzOreNil2.2013-08-28
      cd /hive/data/genomes/hg19/bed/lastzOreNil2.2013-08-28
  
      cat << '_EOF_' > DEF
  # Human vs. Nile Tilapia
  # (more distant, less repeat-killed than mammal) params
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Nile Tilapia oreNil2
  SEQ2_DIR=/hive/data/genomes/oreNil2/oreNil2.2bit
  SEQ2_LEN=/hive/data/genomes/oreNil2/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LIMIT=30
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzOreNil2.2013-08-28
  TMPDIR=/dev/shm
  '_EOF_'
      # << happy emacs
  
      # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
      #	number of jobs, 50,000 to something under 100,000
      # when not present, SEQ2_LIMIT is a default 100
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
  	-qRepeats=windowmaskerSdust -chainMinScore=5000 \
          -chainLinearGap=loose > do.log 2>&1
      #   real    409m26.473s
  
      cat fb.hg19.chainOreNil2Link.txt
      #   53080894 bases of 2897316137 (1.832%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/hg19/bed
      ln -s lastzOreNil2.2013-08-28 lastz.oreNil2
  
      cd /hive/data/genomes/hg19/bed/lastzOreNil2.2013-08-28
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 oreNil2 > rbest.log 2>&1 &
      #   real     30m17.282s
  
      #	and for the swap
      mkdir /hive/data/genomes/oreNil2/bed/blastz.hg19.swap
      cd /hive/data/genomes/oreNil2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzOreNil2.2013-08-28/DEF \
          -workhorse=hgwdev -smallClusterHub=ku \
          -bigClusterHub=ku \
          -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
      #   real     51m15.426s
      cat  fb.oreNil2.chainHg19Link.txt
      #	50170932 bases of 816084674 (6.148%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/oreNil2/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # lastz Atlantic cod gadMor1 (DONE - 2013-08-29 - Hiram)
      # establish a screen to control this job with a name to indicate what it is
      screen -S gadMor1
      mkdir /hive/data/genomes/hg19/bed/lastzGadMor1.2013-08-29
      cd /hive/data/genomes/hg19/bed/lastzGadMor1.2013-08-29
  
      cat << '_EOF_' > DEF
  # Human vs. Atlantic cod
  # (more distant, less repeat-killed than mammal) params
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: Atlantic cod gadMor1
  SEQ2_DIR=/hive/data/genomes/gadMor1/gadMor1.2bit
  SEQ2_LEN=/hive/data/genomes/gadMor1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=1000
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzGadMor1.2013-08-29
  TMPDIR=/dev/shm
  '_EOF_'
      # << happy emacs
  
    # adjust SEQ2_LIMIT and rerun -stop=partition until the cluster job
    #	count is reasonable: 50,000 to something under 100,000
    # this is a tough one: even at a LIMIT of 1,000 it is still 154,128 jobs
    # when not set, SEQ2_LIMIT defaults to 100
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
  	-qRepeats=windowmaskerSdust -chainMinScore=5000 \
          -chainLinearGap=loose > do.log 2>&1
      #   real    3867m27.630s
  
      cat fb.hg19.chainGadMor1Link.txt
      #   44944606 bases of 2897316137 (1.551%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/hg19/bed
      ln -s lastzGadMor1.2013-08-29 lastz.gadMor1
  
      cd /hive/data/genomes/hg19/bed/lastzGadMor1.2013-08-29
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 gadMor1 > rbest.log 2>&1 &
      #   real     10m52.744s
  
      #	and for the swap
      mkdir /hive/data/genomes/gadMor1/bed/blastz.hg19.swap
      cd /hive/data/genomes/gadMor1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzGadMor1.2013-08-29/DEF \
          -workhorse=hgwdev -smallClusterHub=ku \
          -qRepeats=windowmaskerSdust -bigClusterHub=ku \
          -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
      #   real     80m20.797s
      cat  fb.gadMor1.chainHg19Link.txt
      #	40164018 bases of 608038597 (6.606%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/gadMor1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ  Painted turtle chrPic1 Lastz run (DONE - 2013-08-29 - Hiram)
      screen -S chrPic1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzChrPic1.2013-08-29
      cd  /hive/data/genomes/hg19/bed/lastzChrPic1.2013-08-29
      cat << '_EOF_' > DEF
  # human vs Painted turtle
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Painted turtle chrPic1
  SEQ2_DIR=/hive/data/genomes/chrPic1/chrPic1.2bit
  SEQ2_LEN=/hive/data/genomes/chrPic1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=40
  
  BASE=/hive/data/genomes/hg19/bed/lastzChrPic1.2013-08-29
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -qRepeats=windowmaskerSdust -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1
      # real    892m13.008s
  
      cat fb.hg19.chainChrPic1Link.txt
      # 107659077 bases of 2897316137 (3.716%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzChrPic1.2013-08-29 lastz.chrPic1
      cd lastz.chrPic1
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 chrPic1 > rbest.log 2>&1 &
      #   real    30m27.085s
  
      # running the swap
      mkdir /hive/data/genomes/chrPic1/bed/blastz.hg19.swap
      cd /hive/data/genomes/chrPic1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzChrPic1.2013-08-29/DEF \
          -qRepeats=windowmaskerSdust -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real      101m31.723s
  
      cat fb.chrPic1.chainHg19Link.txt
      #    98942356 bases of 2158289746 (4.584%) in intersection
  
      cd /hive/data/genomes/chrPic1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Microbat myoLuc2 Lastz run (DONE - 2013-08-29 - Hiram)
      screen -S myoLuc2    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzMyoLuc2.2013-08-29
      cd  /hive/data/genomes/hg19/bed/lastzMyoLuc2.2013-08-29
      cat << '_EOF_' > DEF
  # human vs Microbat
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Microbat myoLuc2
  SEQ2_DIR=/hive/data/genomes/myoLuc2/myoLuc2.2bit
  SEQ2_LEN=/hive/data/genomes/myoLuc2/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=60
  
  BASE=/hive/data/genomes/hg19/bed/lastzMyoLuc2.2013-08-29
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1
      # real    1016m59.056s
  
      cat fb.hg19.chainMyoLuc2Link.txt
      # 1155637547 bases of 2897316137 (39.886%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzMyoLuc2.2013-08-29 lastz.myoLuc2
  
      cd /hive/data/genomes/hg19/bed/lastzMyoLuc2.2013-08-29
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 myoLuc2 > rbest.log 2>&1 &
      #   real    98m21.512s
  
      # running the swap
      mkdir /hive/data/genomes/myoLuc2/bed/blastz.hg19.swap
      cd /hive/data/genomes/myoLuc2/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzMyoLuc2.2013-08-29/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real    124m57.506s
  
      cat fb.myoLuc2.chainHg19Link.txt
      #    1176187354 bases of 1966419868 (59.814%) in intersection
  
      cd /hive/data/genomes/myoLuc2/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Chinchilla chiLan1 Lastz run (DONE - 2013-08-29 - Hiram)
      screen -S chiLan1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzChiLan1.2013-08-29
      cd  /hive/data/genomes/hg19/bed/lastzChiLan1.2013-08-29
      cat << '_EOF_' > DEF
  # human vs Chinchilla
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Chinchilla chiLan1
  SEQ2_DIR=/hive/data/genomes/chiLan1/chiLan1.2bit
  SEQ2_LEN=/hive/data/genomes/chiLan1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=30
  
  BASE=/hive/data/genomes/hg19/bed/lastzChiLan1.2013-08-29
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1
      # real    1092m26.135s
  
      cat fb.hg19.chainChiLan1Link.txt
      # 1362093436 bases of 2897316137 (47.012%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzChiLan1.2013-08-29 lastz.chiLan1
  
      cd /hive/data/genomes/hg19/bed/lastzChiLan1.2013-08-29
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 chiLan1 > rbest.log 2>&1 &
      #   real    114m45.554s
  
      # running the swap
      mkdir /hive/data/genomes/chiLan1/bed/blastz.hg19.swap
      cd /hive/data/genomes/chiLan1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzChiLan1.2013-08-29/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real    132m31.964s
  
      cat fb.chiLan1.chainHg19Link.txt
      #    1324270224 bases of 2284276400 (57.973%) in intersection
  
      cd /hive/data/genomes/chiLan1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ soft-shell turtle pelSin1 Lastz run (DONE - 2013-08-29 - Hiram)
      screen -S pelSin1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzPelSin1.2013-08-29
      cd  /hive/data/genomes/hg19/bed/lastzPelSin1.2013-08-29
      cat << '_EOF_' > DEF
  # human vs soft-shell turtle
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: soft-shell turtle pelSin1
  SEQ2_DIR=/hive/data/genomes/pelSin1/pelSin1.2bit
  SEQ2_LEN=/hive/data/genomes/pelSin1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=100
  
  BASE=/hive/data/genomes/hg19/bed/lastzPelSin1.2013-08-29
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -qRepeats=windowmaskerSdust -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1
      # real    760m28.165s
  
      cat fb.hg19.chainPelSin1Link.txt
      # 94684471 bases of 2897316137 (3.268%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzPelSin1.2013-08-29 lastz.pelSin1
  
      cd /hive/data/genomes/hg19/bed/lastzPelSin1.2013-08-29
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 pelSin1 > rbest.log 2>&1 &
      #   real    24m55.846s
  
      # running the swap
      mkdir /hive/data/genomes/pelSin1/bed/blastz.hg19.swap
      cd /hive/data/genomes/pelSin1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzPelSin1.2013-08-29/DEF \
          -qRepeats=windowmaskerSdust -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real    53m24.544s
  
      cat fb.pelSin1.chainHg19Link.txt
      #    84523108 bases of 2106639384 (4.012%) in intersection
  
      cd /hive/data/genomes/pelSin1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Chinese hamster criGri1 Lastz run (DONE - 2013-08-30 - Hiram)
      screen -S criGri1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzCriGri1.2013-08-30
      cd  /hive/data/genomes/hg19/bed/lastzCriGri1.2013-08-30
      cat << '_EOF_' > DEF
  # human vs Chinese hamster
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Chinese hamster criGri1
  SEQ2_DIR=/hive/data/genomes/criGri1/criGri1.2bit
  SEQ2_LEN=/hive/data/genomes/criGri1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=300
  
  BASE=/hive/data/genomes/hg19/bed/lastzCriGri1.2013-08-30
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -syntenicNet -workhorse=hgwdev -smallClusterHub=ku \
          -bigClusterHub=ku > do.log 2>&1
      # real    2599m44.559s
  
      cat fb.hg19.chainCriGri1Link.txt
      #   984641190 bases of 2897316137 (33.985%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzCriGri1.2013-08-30 lastz.criGri1
  
      cd /hive/data/genomes/hg19/bed/lastzCriGri1.2013-08-30
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 criGri1 > rbest.log 2>&1 &
      #   real    57m9.843s
  
      # running the swap
      mkdir /hive/data/genomes/criGri1/bed/blastz.hg19.swap
      cd /hive/data/genomes/criGri1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzCriGri1.2013-08-30/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real    110m44.600s
  
      cat fb.criGri1.chainHg19Link.txt
      #  968620399 bases of 2301325917 (42.090%) in intersection
  
      cd /hive/data/genomes/criGri1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ golden hamster mesAur1 Lastz run (DONE - 2013-08-30 - Hiram)
      screen -S mesAur1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzMesAur1.2013-08-30
      cd  /hive/data/genomes/hg19/bed/lastzMesAur1.2013-08-30
      cat << '_EOF_' > DEF
  # human vs golden hamster
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: golden hamster mesAur1
  SEQ2_DIR=/hive/data/genomes/mesAur1/mesAur1.2bit
  SEQ2_LEN=/hive/data/genomes/mesAur1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=150
  
  BASE=/hive/data/genomes/hg19/bed/lastzMesAur1.2013-08-30
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -syntenicNet -workhorse=hgwdev -smallClusterHub=ku \
          -bigClusterHub=ku > do.log 2>&1
      # real    2400m6.299s
  
      cat fb.hg19.chainMesAur1Link.txt
      # 915714541 bases of 2897316137 (31.606%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzMesAur1.2013-08-30 lastz.mesAur1
  
      cd /hive/data/genomes/hg19/bed/lastzMesAur1.2013-08-30
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 mesAur1 > rbest.log 2>&1 &
      #   real    60m12.921s
  
      # running the swap
      mkdir /hive/data/genomes/mesAur1/bed/blastz.hg19.swap
      cd /hive/data/genomes/mesAur1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzMesAur1.2013-08-30/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real    87m57.730s
  
      cat fb.mesAur1.chainHg19Link.txt
      #    888275411 bases of 2076176254 (42.784%) in intersection
  
      cd /hive/data/genomes/mesAur1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ black flying fox pteAle1 Lastz run (DONE - 2013-08-30 - Hiram)
      screen -S pteAle1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzPteAle1.2013-08-30
      cd  /hive/data/genomes/hg19/bed/lastzPteAle1.2013-08-30
      cat << '_EOF_' > DEF
  # human vs black flying fox
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: black flying fox pteAle1
  SEQ2_DIR=/hive/data/genomes/pteAle1/pteAle1.2bit
  SEQ2_LEN=/hive/data/genomes/pteAle1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=400
  
  BASE=/hive/data/genomes/hg19/bed/lastzPteAle1.2013-08-30
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -syntenicNet -workhorse=hgwdev -smallClusterHub=ku \
          -bigClusterHub=ku > do.log 2>&1
      # real    2451m8.857s
  
      cat fb.hg19.chainPteAle1Link.txt
      # 1439107015 bases of 2897316137 (49.670%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzPteAle1.2013-08-30 lastz.pteAle1
  
      cd /hive/data/genomes/hg19/bed/lastzPteAle1.2013-08-30
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 pteAle1 > rbest.log 2>&1 &
      #   real    66m15.979s
  
      # running the swap
      mkdir /hive/data/genomes/pteAle1/bed/blastz.hg19.swap
      cd /hive/data/genomes/pteAle1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzPteAle1.2013-08-30/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real    113m30.834s
  
      cat fb.pteAle1.chainHg19Link.txt
      #  1382430767 bases of 1944625202 (71.090%) in intersection
  
      cd /hive/data/genomes/pteAle1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ David's Myotis myoDav1 Lastz run (DONE - 2013-08-30 - Hiram)
      screen -S myoDav1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzMyoDav1.2013-08-30
      cd  /hive/data/genomes/hg19/bed/lastzMyoDav1.2013-08-30
      cat << '_EOF_' > DEF
  # human vs David's Myotis
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: David's Myotis myoDav1
  SEQ2_DIR=/hive/data/genomes/myoDav1/myoDav1.2bit
  SEQ2_LEN=/hive/data/genomes/myoDav1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=600
  
  BASE=/hive/data/genomes/hg19/bed/lastzMyoDav1.2013-08-30
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -syntenicNet -workhorse=hgwdev -smallClusterHub=ku \
          -bigClusterHub=ku > do.log 2>&1
      # real    2580m42.983s
  
      cat fb.hg19.chainMyoDav1Link.txt
      #  1152836794 bases of 2897316137 (39.790%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzMyoDav1.2013-08-30 lastz.myoDav1
  
      cd /hive/data/genomes/hg19/bed/lastzMyoDav1.2013-08-30
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 myoDav1 > rbest.log 2>&1 &
      #   real    70m11.697s
  
      # running the swap
      mkdir /hive/data/genomes/myoDav1/bed/blastz.hg19.swap
      cd /hive/data/genomes/myoDav1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzMyoDav1.2013-08-30/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real    127m39.070s
  
      cat fb.myoDav1.chainHg19Link.txt
      #    1131034077 bases of 1878461987 (60.211%) in intersection
  
      cd /hive/data/genomes/myoDav1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ big brown bat eptFus1 Lastz run (DONE - 2013-08-30 - Hiram)
      screen -S eptFus1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzEptFus1.2013-08-30
      cd  /hive/data/genomes/hg19/bed/lastzEptFus1.2013-08-30
      cat << '_EOF_' > DEF
  # human vs big brown bat
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: big brown bat eptFus1
  SEQ2_DIR=/hive/data/genomes/eptFus1/eptFus1.2bit
  SEQ2_LEN=/hive/data/genomes/eptFus1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=40
  
  BASE=/hive/data/genomes/hg19/bed/lastzEptFus1.2013-08-30
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -syntenicNet -workhorse=hgwdev -smallClusterHub=ku \
          -bigClusterHub=ku > do.log 2>&1
      # real    2141m28.715s
  
      cat fb.hg19.chainEptFus1Link.txt
      # 1165334482 bases of 2897316137 (40.221%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzEptFus1.2013-08-30 lastz.eptFus1
  
      cd /hive/data/genomes/hg19/bed/lastzEptFus1.2013-08-30
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 eptFus1 > rbest.log 2>&1 &
      #   real    67m59.692s
  
      # running the swap
      mkdir /hive/data/genomes/eptFus1/bed/blastz.hg19.swap
      cd /hive/data/genomes/eptFus1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzEptFus1.2013-08-30/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real    99m18.361s
  
      cat fb.eptFus1.chainHg19Link.txt
      #    1125367165 bases of 1811378799 (62.128%) in intersection
  
      cd /hive/data/genomes/eptFus1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Tawny puffer fish takFla1 Lastz run (DONE - 2013-08-30 - Hiram)
      screen -S takFla1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzTakFla1.2013-08-30
      cd  /hive/data/genomes/hg19/bed/lastzTakFla1.2013-08-30
      cat << '_EOF_' > DEF
  # human vs Tawny puffer fish
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Tawny puffer fish takFla1
  SEQ2_DIR=/hive/data/genomes/takFla1/takFla1.2bit
  SEQ2_LEN=/hive/data/genomes/takFla1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=20
  
  BASE=/hive/data/genomes/hg19/bed/lastzTakFla1.2013-08-30
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku \
          -bigClusterHub=ku > do.log 2>&1
      # real    1139m40.279s
  
      cat fb.hg19.chainTakFla1Link.txt
      #  30765768 bases of 2897316137 (1.062%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzTakFla1.2013-08-30 lastz.takFla1
  
      cd /hive/data/genomes/hg19/bed/lastzTakFla1.2013-08-30
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 takFla1 > rbest.log 2>&1 &
      #   real    5m2.879s
  
      # running the swap
      mkdir /hive/data/genomes/takFla1/bed/blastz.hg19.swap
      cd /hive/data/genomes/takFla1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzTakFla1.2013-08-30/DEF \
          -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          > swap.log 2>&1
      #   real     6m56.043s
  
      cat fb.takFla1.chainHg19Link.txt
      #    26755636 bases of 314848639 (8.498%) in intersection
  
      cd /hive/data/genomes/takFla1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Princess of Burundi neoBri1 Lastz run (DONE - 2013-09-02 - Hiram)
      screen -S neoBri1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzNeoBri1.2013-09-02
      cd  /hive/data/genomes/hg19/bed/lastzNeoBri1.2013-09-02
      cat << '_EOF_' > DEF
  # human vs Princess of Burundi
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Princess of Burundi neoBri1
  SEQ2_DIR=/hive/data/genomes/neoBri1/neoBri1.2bit
  SEQ2_LEN=/hive/data/genomes/neoBri1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=40
  
  BASE=/hive/data/genomes/hg19/bed/lastzNeoBri1.2013-09-02
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=5000 -chainLinearGap=loose \
          -qRepeats=windowmaskerSdust -workhorse=hgwdev -smallClusterHub=ku \
          -bigClusterHub=ku > do.log 2>&1
      # real    774m23.227s
  
      cat fb.hg19.chainNeoBri1Link.txt
      # 35299643 bases of 2897316137 (1.218%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzNeoBri1.2013-09-02 lastz.neoBri1
  
      cd /hive/data/genomes/hg19/bed/lastzNeoBri1.2013-09-02
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 neoBri1 > rbest.log 2>&1 &
      #   real    19m5.052s
  
      # running the swap
      mkdir /hive/data/genomes/neoBri1/bed/blastz.hg19.swap
      cd /hive/data/genomes/neoBri1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzNeoBri1.2013-09-02/DEF \
          -qRepeats=windowmaskerSdust -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          > swap.log 2>&1
      #   real    11m18.544s
  
      cat fb.neoBri1.chainHg19Link.txt
      #  32747070 bases of 685897281 (4.774%) in intersection
  
      cd /hive/data/genomes/neoBri1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Burton's mouthbreeder hapBur1 Lastz run (DONE - 2013-09-02 - Hiram)
      screen -S hapBur1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzHapBur1.2013-09-02
      cd  /hive/data/genomes/hg19/bed/lastzHapBur1.2013-09-02
      cat << '_EOF_' > DEF
  # human vs Burton's mouthbreeder
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Burton's mouthbreeder hapBur1
  SEQ2_DIR=/hive/data/genomes/hapBur1/hapBur1.2bit
  SEQ2_LEN=/hive/data/genomes/hapBur1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=40
  
  BASE=/hive/data/genomes/hg19/bed/lastzHapBur1.2013-09-02
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=5000 -chainLinearGap=loose \
          -qRepeats=windowmaskerSdust -workhorse=hgwdev -smallClusterHub=ku \
          -bigClusterHub=ku > do.log 2>&1
      # real    720m19.953s
  
      cat fb.hg19.chainHapBur1Link.txt
      # 35367525 bases of 2897316137 (1.221%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzHapBur1.2013-09-02 lastz.hapBur1
  
      cd /hive/data/genomes/hg19/bed/lastzHapBur1.2013-09-02
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 hapBur1 > rbest.log 2>&1 &
      #   real    8m49.709s
  
      # running the swap
      mkdir /hive/data/genomes/hapBur1/bed/blastz.hg19.swap
      cd /hive/data/genomes/hapBur1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzHapBur1.2013-09-02/DEF \
          -qRepeats=windowmaskerSdust -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          > swap.log 2>&1
      #   real    7m34.243s
  
      cat fb.hapBur1.chainHg19Link.txt
      #    33244147 bases of 698936397 (4.756%) in intersection
  
      cd /hive/data/genomes/hapBur1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Pundamilia nyererei cichlid fish punNye1 Lastz run (DONE - 2013-09-02 - Hiram)
      screen -S punNye1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzPunNye1.2013-09-02
      cd  /hive/data/genomes/hg19/bed/lastzPunNye1.2013-09-02
      cat << '_EOF_' > DEF
  # human vs Pundamilia nyererei cichlid fish
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Pundamilia nyererei cichlid fish punNye1
  SEQ2_DIR=/hive/data/genomes/punNye1/punNye1.2bit
  SEQ2_LEN=/hive/data/genomes/punNye1/chrom.sizes
  SEQ2_CHUNK=15000000
  SEQ2_LAP=0
  SEQ2_LIMIT=30
  
  BASE=/hive/data/genomes/hg19/bed/lastzPunNye1.2013-09-02
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=5000 -chainLinearGap=loose \
          -qRepeats=windowmaskerSdust -workhorse=hgwdev -smallClusterHub=ku \
          -bigClusterHub=ku > do.log 2>&1
      # real    778m57.473s
  
      cat fb.hg19.chainPunNye1Link.txt
      # 35332553 bases of 2897316137 (1.219%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzPunNye1.2013-09-02 lastz.punNye1
  
      cd /hive/data/genomes/hg19/bed/lastzPunNye1.2013-09-02
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 punNye1 > rbest.log 2>&1 &
      #   real    15m0.100s
  
      # running the swap
      mkdir /hive/data/genomes/punNye1/bed/blastz.hg19.swap
      cd /hive/data/genomes/punNye1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzPunNye1.2013-09-02/DEF \
          -qRepeats=windowmaskerSdust -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          > swap.log 2>&1
      #   real    11m41.848s
  
      cat fb.punNye1.chainHg19Link.txt
      #  33189954 bases of 698757151 (4.750%) in intersection
  
      cd /hive/data/genomes/punNye1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Mexican tetra (cavefish) astMex1 Lastz run (DONE - 2013-09-02 - Hiram)
      screen -S astMex1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzAstMex1.2013-09-02
      cd  /hive/data/genomes/hg19/bed/lastzAstMex1.2013-09-02
      cat << '_EOF_' > DEF
  # human vs Mexican tetra (cavefish)
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Mexican tetra (cavefish) astMex1
  SEQ2_DIR=/hive/data/genomes/astMex1/astMex1.2bit
  SEQ2_LEN=/hive/data/genomes/astMex1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=60
  
  BASE=/hive/data/genomes/hg19/bed/lastzAstMex1.2013-09-02
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=5000 -chainLinearGap=loose \
          -qRepeats=windowmaskerSdust -workhorse=hgwdev -smallClusterHub=ku \
          -bigClusterHub=ku > do.log 2>&1
      # real    822m26.496s
  
      cat fb.hg19.chainAstMex1Link.txt
      # 36769531 bases of 2897316137 (1.269%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzAstMex1.2013-09-02 lastz.astMex1
  
      cd /hive/data/genomes/hg19/bed/lastzAstMex1.2013-09-02
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 astMex1 > rbest.log 2>&1 &
      #   real    21m11.885s
  
      # running the swap
      mkdir /hive/data/genomes/astMex1/bed/blastz.hg19.swap
      cd /hive/data/genomes/astMex1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzAstMex1.2013-09-02/DEF \
          -qRepeats=windowmaskerSdust -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          > swap.log 2>&1
      #   real    6m37.356s
  
      cat fb.astMex1.chainHg19Link.txt
      #    33708255 bases of 964264884 (3.496%) in intersection
  
      cd /hive/data/genomes/astMex1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # LASTZ Spotted Gar lepOcu1 Lastz run (DONE - 2013-09-02 - Hiram)
      screen -S lepOcu1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzLepOcu1.2013-09-02
      cd  /hive/data/genomes/hg19/bed/lastzLepOcu1.2013-09-02
      cat << '_EOF_' > DEF
  # human vs Spotted Gar
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Spotted Gar lepOcu1
  SEQ2_DIR=/hive/data/genomes/lepOcu1/lepOcu1.2bit
  SEQ2_LEN=/hive/data/genomes/lepOcu1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=20
  
  BASE=/hive/data/genomes/hg19/bed/lastzLepOcu1.2013-09-02
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=5000 -chainLinearGap=loose \
          -qRepeats=windowmaskerSdust -workhorse=hgwdev -smallClusterHub=ku \
          -bigClusterHub=ku > do.log 2>&1
      # real    692m41.530s
  
      cat fb.hg19.chainLepOcu1Link.txt
      # 44098709 bases of 2897316137 (1.522%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzLepOcu1.2013-09-02 lastz.lepOcu1
  
      cd /hive/data/genomes/hg19/bed/lastzLepOcu1.2013-09-02
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 lepOcu1 > rbest.log 2>&1 &
      #   real    15m34.553s
  
      # running the swap
      mkdir /hive/data/genomes/lepOcu1/bed/blastz.hg19.swap
      cd /hive/data/genomes/lepOcu1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzLepOcu1.2013-09-02/DEF \
          -qRepeats=windowmaskerSdust -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          > swap.log 2>&1
      #   real    8m47.294s
  
      cat fb.lepOcu1.chainHg19Link.txt
      #    35349904 bases of 869414361 (4.066%) in intersection
  
      cd /hive/data/genomes/lepOcu1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # lastz chinese alligator allSin1 (DONE - 2013-09-02 - Hiram)
      # establish a screen to control this job with a name to indicate what it is
      screen -S hg19AllSin1
      mkdir /hive/data/genomes/hg19/bed/lastzAllSin1.2013-09-02
      cd /hive/data/genomes/hg19/bed/lastzAllSin1.2013-09-02
  
      cat << '_EOF_' > DEF
  # Human vs. chinese alligator
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: alligator allSin1
  SEQ2_DIR=/hive/data/genomes/allSin1/allSin1.2bit
  SEQ2_LEN=/hive/data/genomes/allSin1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=30
  
  BASE=/hive/data/genomes/hg19/bed/lastzAllSin1.2013-09-02
  TMPDIR=/dev/shm
  '_EOF_'
      # << happy emacs
  
    # adjust SEQ2_LIMIT and rerun -stop=partition until the cluster job
    #	count is reasonable: 50,000 to something under 100,000
    # when not set, SEQ2_LIMIT defaults to 100
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
      #	real  517m26.179s
    # forgot to reload the database tables after adding chrM; finished
    # the load step, then continued from the download step:
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -continue=download `pwd`/DEF \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -chainMinScore=5000 -chainLinearGap=loose > download.log 2>&1
      #  Elapsed time: 1m36s
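    # (doBlastzChainNet.pl runs as discrete steps -- partition, blastz,
    # cat, chainRun, chainMerge, net, load, download, cleanup -- so a
    # botched or skipped step can be redone with -continue=<step> as above)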
  
      cat fb.hg19.chainAllSin1Link.txt
      #	198130655 bases of 2897316137 (6.838%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/hg19/bed
      ln -s lastzAllSin1.2013-09-02 lastz.allSin1
  
      # good to have recip best here
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
            hg19 allSin1 > rbest.log 2>&1 &
      #  real    18m19.294s
  
      #	and for the swap
      mkdir /hive/data/genomes/allSin1/bed/blastz.hg19.swap
      cd /hive/data/genomes/allSin1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzAllSin1.2013-09-02/DEF \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
      #	real    23m42.533s
  
      cat  fb.allSin1.chainHg19Link.txt
      #	170963971 bases of 2198602449 (7.776%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/allSin1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
# LASTZ Maylandia zebra cichlid fish mayZeb1 Lastz run (DONE - 2013-09-02 - Hiram)
      screen -S mayZeb1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzMayZeb1.2013-09-02
      cd  /hive/data/genomes/hg19/bed/lastzMayZeb1.2013-09-02
      cat << '_EOF_' > DEF
  # human vs Maylandia zebra cichlid fish
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: Maylandia zebra cichlid fish mayZeb1
  SEQ2_DIR=/hive/data/genomes/mayZeb1/mayZeb1.2bit
  SEQ2_LEN=/hive/data/genomes/mayZeb1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=20
  
  BASE=/hive/data/genomes/hg19/bed/lastzMayZeb1.2013-09-02
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=5000 -chainLinearGap=loose \
          -qRepeats=windowmaskerSdust -workhorse=hgwdev -smallClusterHub=ku \
          -bigClusterHub=ku > do.log 2>&1
      # real    736m7.416s
  
      cat fb.hg19.chainMayZeb1Link.txt
      #  36002695 bases of 2897316137 (1.243%) in intersection
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzMayZeb1.2013-09-02 lastz.mayZeb1
  
      cd /hive/data/genomes/hg19/bed/lastzMayZeb1.2013-09-02
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 mayZeb1 > rbest.log 2>&1 &
      #   real    10m10.495s
  
      # running the swap
      mkdir /hive/data/genomes/mayZeb1/bed/blastz.hg19.swap
      cd /hive/data/genomes/mayZeb1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzMayZeb1.2013-09-02/DEF \
          -qRepeats=windowmaskerSdust -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          > swap.log 2>&1
      #   real    6m51.755s
  
      cat fb.mayZeb1.chainHg19Link.txt
      #  33421946 bases of 713527863 (4.684%) in intersection
  
      cd /hive/data/genomes/mayZeb1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  # Amur tiger panTig1 Lastz run (DONE - 2013-09-10 - Hiram)
      screen -S panTig1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzPanTig1.2013-09-10
      cd /hive/data/genomes/hg19/bed/lastzPanTig1.2013-09-10
      cat << '_EOF_' > DEF
  # human vs Amur tiger
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
# QUERY: Amur tiger panTig1
  SEQ2_DIR=/hive/data/genomes/panTig1/panTig1.2bit
  SEQ2_LEN=/hive/data/genomes/panTig1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=30
  
  BASE=/hive/data/genomes/hg19/bed/lastzPanTig1.2013-09-10
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > do.log 2>&1
      #   real    952m31.453s
  
      cat fb.hg19.chainPanTig1Link.txt
      #   1523174002 bases of 2897316137 (52.572%) in intersection
  
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 panTig1 > rbest.log 2>&1 &
      #  real    541m21.230s
  
      # running the swap
      mkdir /hive/data/genomes/panTig1/bed/blastz.hg19.swap
      cd /hive/data/genomes/panTig1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzPanTig1.2013-09-10/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real    329m54.539s
  
      cat fb.panTig1.chainHg19Link.txt
      #  1458977137 bases of 2332849683 (62.541%) in intersection
  
      cd /hive/data/genomes/panTig1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
# NCBI ClinVar (new version - DONE - 2014-11-08 - Max)
# Note that since Jan 2015, ClinVar has been on automatic (otto) updates;
# for the procedure, see ~/kent/src/hg/utils/otto/clinvar/
  
  mkdir /hive/data/genomes/hg19/bed/clinvar
  cd /hive/data/genomes/hg19/bed/clinvar
  cat << '_EOF_' > clinvar.as
  table clinVarBed
  "Browser extensible data (4 fields) plus information about a ClinVar entry"
      (
      string chrom;        "Chromosome (or contig, scaffold, etc.)"
      uint   chromStart;   "Start position in chromosome"
      uint   chromEnd;     "End position in chromosome"
      string name;         "Name of item"
    lstring origName;         "Original name of item"
      string type;         "Type of Variant"
      string geneId;         "NCBI Entrez Gene ID"
      string geneSym;         "NCBI Entrez Gene Symbol"
    string clinSign;         "Clinical significance"
      string snpId;         "dbSNP ID"
      string nsvId;         "dbVar ID"
      string rcvAcc;         "ClinVar ID"
      string testedInGtr;         "Genetic Testing Registry"
      lstring phenotype;         "Phenotype identifiers"
      string origin;         "Data origin"
      string assembly;         "Genome assembly"
      string cytogenetic;         "Cytogenetic status"
      string reviewStatus;         "Review status"
      lstring hgvsCod;         "coding HGVS"
      lstring hgvsProt;         "protein HGVS"
      string numSubmit;         "number of submitters"
      string lastEval;         "last evaluation"
      string guidelines;         "guidelines"
      lstring otherIds;         "other identifiers e.g. OMIM IDs, etc."
      )
  _EOF_
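# (clinVarToBed --auto below handles the conversion end to end; for
# reference, a bigBed built by hand from this .as would look roughly like
# the following, with hypothetical input file names:
#   bedToBigBed -type=bed4+20 -tab -as=clinvar.as clinvarMain.hg19.bed \
#       /hive/data/genomes/hg19/chrom.sizes clinvarMain.hg19.bb
# where bed4+20 matches the four standard BED fields plus the twenty extra
# columns declared above)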
  ~/kent/src/hg/utils/otto/clinvar/clinVarToBed --auto
  mv clinvarMain.hg19.bb clinvarMain.bb
  mv clinvarCnv.hg19.bb clinvarCnv.bb
  mkdir /hive/data/genomes/hg38/bed/clinvar
  mv clinvarMain.hg38.bb /hive/data/genomes/hg38/bed/clinvar/clinvarMain.bb
  mv clinvarCnv.hg38.bb /hive/data/genomes/hg38/bed/clinvar/clinvarCnv.bb
  
  ln -s /hive/data/genomes/hg38/bed/clinvar/clinvarMain.bb /gbdb/hg38/bbi/clinvarMain.bb
  ln -s /hive/data/genomes/hg38/bed/clinvar/clinvarCnv.bb /gbdb/hg38/bbi/clinvarCnv.bb
  ln -s /hive/data/genomes/hg19/bed/clinvar/clinvarMain.bb /gbdb/hg19/bbi/clinvarMain.bb
  ln -s /hive/data/genomes/hg19/bed/clinvar/clinvarCnv.bb /gbdb/hg19/bbi/clinvarCnv.bb
  
  #########################################################################
  # lastz Arctic lamprey letCam1 (DONE - 2013-09-19 - Hiram)
      # establish a screen to control this job with a name to indicate what it is
      screen -S letCam1
      mkdir /hive/data/genomes/hg19/bed/lastzLetCam1.2013-09-19
      cd /hive/data/genomes/hg19/bed/lastzLetCam1.2013-09-19
  
      cat << '_EOF_' > DEF
  # Human vs. Arctic lamprey
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_M=50
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
# QUERY: Arctic lamprey letCam1
  SEQ2_DIR=/hive/data/genomes/letCam1/letCam1.2bit
  SEQ2_LEN=/hive/data/genomes/letCam1/chrom.sizes
  SEQ2_CHUNK=10000000
  SEQ2_LAP=0
  SEQ2_LIMIT=200
  
  BASE=/hive/data/genomes/hg19/bed/lastzLetCam1.2013-09-19
  TMPDIR=/scratch/tmp
  '_EOF_'
      # << happy emacs
  
    # adjust SEQ2_LIMIT and rerun -stop=partition until the cluster job
    #	count is reasonable: 50,000 to something under 100,000
    # when not set, SEQ2_LIMIT defaults to 100
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          `pwd`/DEF \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
      #   real    221m54.751s
  
      cat fb.hg19.chainLetCam1Link.txt
      #   37233603 bases of 2897316137 (1.285%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/hg19/bed
      ln -s lastzLetCam1.2013-09-19 lastz.letCam1
  
      #	and for the swap
      mkdir /hive/data/genomes/letCam1/bed/blastz.hg19.swap
      cd /hive/data/genomes/letCam1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzLetCam1.2013-09-19/DEF \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
      #    Elapsed time: 123m9s
      cat  fb.letCam1.chainHg19Link.txt
      #	30587683 bases of 853455384 (3.584%) in intersection
  
      # set sym link to indicate this is the lastz for this genome:
      cd /hive/data/genomes/letCam1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  #########################################################################
  ## 100-Way Multiz (WORKING - 2013-08-29 - Hiram)
      ssh hgwdev
      mkdir /hive/data/genomes/hg19/bed/multiz100way
      cd /hive/data/genomes/hg19/bed/multiz100way
  
    # using the benchmark 100-way tree file benchMark100.nh:
      less ~/kent/src/hg/utils/phyloTrees/benchMark100.nh
      cp -p ~/kent/src/hg/utils/phyloTrees/benchMark100.nh ./hg19.100way.nh
  
      # extract species list from that .nh file
      sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
          hg19.100way.nh | xargs echo | sed 's/ //g; s/,/ /g' \
          | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt
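    # sanity check -- the tree should yield 100 assembly names (hg19 plus
    # the 99 query species):
    #   wc -l species.list.txt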
  
    # using the common-name tree file in the phyloGif tool:
    # ~/kent/src/hg/utils/phyloTrees/benchMark100.commonNames.nh
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to obtain a png image for src/hg/htdocs/images/phylo/hg19_100way.png
  
      # construct a common name list:
      sed -e 's#  *##; s#[()]##g; s#:.*##;' \
          ~/kent/src/hg/utils/phyloTrees/benchMark100.commonNames.nh \
            > hg19.100way.commonName.list
  
    # script to report the done status of each pairwise lastz run
      cat << '_EOF_' > ckDone.sh
  #!/bin/sh
  
  export N=0
  
paste hg19.100way.commonName.list species.list.txt | grep -v hg19 \
| while read nameDb
do
   N=`echo $N | awk '{printf "%02d", $1+1}'`
   db=`echo $nameDb | awk '{print $2}'`
   name=`echo $nameDb | awk '{print $1}'`
   Db=`echo $db | sed 's/.*/\u&/'`
   lzDir=`ls -ogrtd /hive/data/genomes/hg19/bed/lastz${Db}* 2> /dev/null \
      | tail -1 | awk '{print $NF}'`
   echo -e -n "# ${N} $db\t"
   if [ "x${lzDir}y" != "xy" ]; then
      if [ -s ${lzDir}/fb.hg19.chain${Db}Link.txt ]; then
        hg19Percent=`sed -e 's#.* (##; s#) in .*##;' \
            ${lzDir}/fb.hg19.chain${Db}Link.txt`
        otherPercent=""
        echo -e -n "${hg19Percent}"
        swapFb=/hive/data/genomes/${db}/bed/blastz.hg19.swap/fb.${db}.chainHg19Link.txt
        if [ -s "${swapFb}" ]; then
          otherPercent=`sed -e 's#.* (##; s#) in .*##;' "${swapFb}"`
          dateStamp=`ls --full-time -og "${swapFb}" \
            | awk '{print $4,$5}' | sed -e 's/\.[0-9][0-9]*$//'`
          echo -e "\t${otherPercent}\t${dateStamp}"
          else
            echo -e "\totherPercent"
          fi
        else
           echo working
        fi
     else
        echo to be done
     fi
  done
  '_EOF_'
      # << happy emacs
      chmod +x ./ckDone.sh
  
      ./ckDone.sh
  
  # 01 panTro4    95.279% 95.563% 2013-01-28 17:16:47 Chimp
  # 02 gorGor3    89.876% 91.109% 2011-10-21 09:46:07 Gorilla
  # 03 ponAbe2    91.350% 89.617% 2009-06-02 13:10:58 Orangutan
  # 04 nomLeu3    87.764% 89.943% 2013-03-26 17:45:18 Gibbon
  # 05 rheMac3    82.859% 88.131% 2012-03-16 11:20:22 Rhesus
  # 06 macFas5    86.666% 86.757% 2013-06-30 23:15:33 Crab_eating_macaque
  # 07 papHam1    82.810% 85.825% 2013-07-01 16:31:15 Baboon
  # 08 chlSab1    86.446% 88.132% 2013-06-30 14:31:23 Green_monkey
  # 09 calJac3    70.654% 73.768% 2010-02-11 17:13:05 Marmoset
  # 10 saiBol1    69.598% 78.834% 2013-06-30 20:17:32 Squirrel_monkey
  # 11 otoGar3    54.830% 65.026% 2013-07-03 12:30:16 Bushbaby
  # 12 tupChi1    46.558% 50.286% 2013-07-07 14:59:14 Chinese_tree_shrew
  # 13 speTri2    49.723% 61.693% 2013-08-29 11:46:55 Squirrel
  # 14 jacJac1    35.106% 40.217% 2013-07-17 08:08:53 Lesser_Egyptian_jerboa
  # 15 micOch1    32.285% 43.185% 2013-08-27 11:29:15 Prairie_vole
  # 16 criGri1    33.985% 42.090% 2013-09-03 10:33:23 Chinese_hamster
  # 17 mesAur1    31.606% 42.784% 2013-09-02 22:14:46 Golden_hamster
  # 18 mm10       35.249% 38.226% 2012-03-08 12:46:33 Mouse
  # 19 rn5        31.662% 36.299% 2012-06-28 11:17:33 Rat
  # 20 hetGla2    47.600% 58.818% 2013-07-17 14:32:23 Mole_rat
  # 21 cavPor3    43.680% 48.043% 2009-08-31 09:42:19 Guinea_pig
  # 22 chiLan1    47.012% 57.973% 2013-08-30 10:31:54 Chinchilla
  # 23 octDeg1    41.385% 47.619% 2013-07-07 20:41:34 Brush_tailed_rat
  # 24 oryCun2    44.317% 48.405% 2009-08-26 18:32:38 Rabbit
  # 25 ochPri3    34.676% 49.855% 2013-06-30 23:40:02 Pika
  # 26 susScr3    45.977% 57.247% 2013-08-29 11:39:26 Pig
  # 27 vicPac2    50.193% 68.707% 2013-06-18 12:59:49 Alpaca
  # 28 camFer1    50.419% 71.204% 2013-07-13 09:55:38 Bactrian_camel
  # 29 turTru2    51.158% 61.364% 2013-08-22 19:12:10 Dolphin
  # 30 orcOrc1    52.072% 64.296% 2013-08-26 10:37:15 Killer_whale
  # 31 panHod1    46.863% 52.463% 2013-07-17 14:04:08 Tibetan_antelope
  # 32 bosTau7    46.971% 49.508% 2012-01-23 17:51:35 Cow
  # 33 oviAri3    46.833% 51.939% 2013-06-28 10:51:00 Sheep
  # 34 capHir1    46.498% 52.616% 2013-07-17 14:35:06 Goat
  # 35 equCab2    57.050% 66.774% 2009-06-07 00:01:11 Horse
  # 36 cerSim1    58.103% 69.204% 2012-10-23 11:02:24 White_rhinoceros
  # 37 felCat5    52.426% 61.479% 2013-07-13 21:56:34 Cat
  # 38 canFam3    51.848% 60.817% 2012-07-04 18:30:38 Dog
  # 39 musFur1    50.993% 62.328% 2013-07-12 11:27:15 Ferret
  # 40 ailMel1    50.164% 62.884% 2010-02-05 16:18:11 Panda
  # 41 odoRosDiv1 53.559% 64.797% 2013-07-13 10:00:44 Pacific_walrus
  # 42 lepWed1    52.424% 65.567% 2013-07-12 12:55:51 Weddell_seal
  # 43 pteAle1    49.670% 71.090% 2013-09-03 09:48:39 Black_flying_fox
  # 44 pteVam1    45.414% 69.685% 2013-07-05 16:34:42 Megabat
  # 45 myoDav1    39.790% 60.211% 2013-09-02 22:51:42 David's_myotis
  # 46 myoLuc2    39.886% 59.814% 2013-08-30 10:32:33 Microbat
  # 47 eptFus1    40.221% 62.128% 2013-09-02 22:18:16 Big_brown_bat
  # 48 eriEur2    26.149% 31.250% 2013-07-09 12:56:38 Hedgehog
  # 49 sorAra2    27.681% 35.514% 2013-06-19 10:26:27 Shrew
  # 50 conCri1    37.034% 60.981% 2013-07-10 14:15:58 Star-nosed_mole
  # 51 loxAfr3    46.636% 42.430% 2009-07-22 19:56:13 Elephant
  # 52 eleEdw1    27.675% 24.222% 2013-07-10 10:15:11 Cape_elephant_shrew
  # 53 triMan1    46.580% 46.912% 2013-07-11 11:50:53 Manatee
  # 54 chrAsi1    34.209% 29.033% 2013-07-09 17:37:53 Cape_golden_mole
  # 55 echTel2    30.135% 32.736% 2013-06-26 15:23:14 Tenrec
  # 56 oryAfe1    41.668% 34.051% 2013-07-09 17:37:45 Aardvark
  # 57 dasNov3    46.571% 41.884% 2013-07-09 18:07:57 Armadillo
  # 58 monDom5    14.358% 11.615% 2009-06-15 14:40:30 Opossum
  # 59 sarHar1    7.414%  6.937%  2013-07-10 15:13:20 Tasmanian_devil
  # 60 macEug2    6.534%  7.229%  2013-07-11 18:52:40 Wallaby
  # 61 ornAna1    7.627%  11.259% 2009-06-02 13:49:28 Platypus
  # 62 falChe1    3.812%  8.232%  2013-06-28 09:25:57 Saker_falcon
  # 63 falPer1    3.879%  8.365%  2013-06-28 09:29:07 Peregrine_falcon
  # 64 ficAlb2    3.715%  8.625%  2013-06-29 20:59:02 Collared_flycatcher
  # 65 zonAlb1    3.476%  8.622%  2013-06-27 09:07:21 White_throated_sparrow
  # 66 geoFor1    3.503%  8.504%  2012-07-29 16:50:12 Medium_ground_finch
  # 67 taeGut2    5.321%  11.613% 2013-06-18 11:17:31 Zebra_finch
  # 68 pseHum1    3.982%  9.685%  2013-06-29 21:00:36 Tibetan_ground_jay
  # 69 melUnd1    3.577%  8.237%  2013-06-28 11:39:53 Budgerigar
  # 70 amaVit1    3.298%  7.807%  2013-06-28 10:27:36 Puerto_Rican_parrot
  # 71 araMac1    2.912%  7.320%  2013-06-30 00:32:23 Scarlet_macaw
  # 72 colLiv1    3.683%  8.468%  2013-06-29 20:58:08 Rock_pigeon
  # 73 anaPla1    3.509%  8.215%  2013-06-28 09:26:58 Mallard_duck
  # 74 galGal4    3.684%  8.990%  2013-06-29 23:50:36 Chicken
  # 75 melGal1    2.645%  6.637%  2011-03-28 14:43:01 Turkey
  # 76 allMis1    7.084%  8.120%  2013-06-30 22:51:47 American_Alligator
  # 77 cheMyd1    3.696%  4.551%  2013-08-28 10:53:39 Green_sea_turtle
  # 78 chrPic1    3.716%  4.584%  2013-08-30 09:25:04 Painted_turtle
  # 79 pelSin1    3.268%  4.012%  2013-08-30 09:23:33 Soft_shell_turtle
  # 80 apaSpi1    2.952%  3.928%  2013-08-28 10:34:30 Spiny_softshell_turtle
  # 81 anoCar2    3.552%  5.190%  2011-04-19 14:45:48 Lizard
  # 82 xenTro7    3.153%  6.757%  2013-08-29 10:30:11 Frog_X._tropicalis
  # 83 latCha1    2.749%  3.419%  2013-07-16 08:39:38 Coelacanth
  # 84 tetNig2    1.712%  14.194% 2009-08-11 10:13:42 Tetraodon
  # 85 fr3        1.703%  12.282% 2013-08-20 15:07:40 Fugu
  # 86 takFla1    1.062%  8.498%  2013-09-02 21:06:45 Yellowbelly_pufferfish
  # 87 oreNil2    1.832%  6.148%  2013-08-29 10:07:09 Tilapia
  # 88 neoBri1    1.218%  4.774%  2013-09-03 08:30:34 Princess_of_Burundi
  # 89 hapBur1    1.221%  4.756%  2013-09-03 08:20:17 Burton's_mouthbreeder
  # 90 mayZeb1    1.243%  4.684%  2013-09-03 09:22:54 Maylandia_zebra
  # 91 punNye1    1.219%  4.750%  2013-09-03 08:34:58 Pundamilia_nyererei
  # 92 oryLat2    1.849%  6.705%  2009-06-02 12:02:22 Medaka
  # 93 xipMac1    1.842%  7.324%  2013-08-28 10:50:22 Southern_platyfish
  # 94 gasAcu1    1.916%  11.175% 2009-06-02 11:58:46 Stickleback
  # 95 gadMor1    1.551%  6.606%  2013-09-02 22:29:08 Atlantic_cod
  # 96 danRer7    2.790%  6.151%  2010-12-20 11:31:42 Zebrafish
  # 97 astMex1    1.269%  3.496%  2013-09-03 08:20:50 Mexican_tetra
  # 98 lepOcu1    1.522%  4.066%  2013-09-03 08:34:05 Spotted_gar
  # 99 petMar2    1.046%  3.324%  2012-10-17 13:34:36 Lamprey
  
  # None of this concern about distances matters for building the first step,
  # the maf files.
  
      # create species list and stripped down tree for autoMZ
      sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
  	hg19.100way.nh > tmp.nh
      echo `cat tmp.nh` > tree-commas.nh
      echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
      sed 's/[()]//g; s/,/ /g' tree.nh > species.list
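    # sanity check (a quick sketch): species.list should hold all 100
    # assembly names, hg19 plus 99 others:
    wc -w species.list
    # expect: 100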
  
      #	bash shell syntax here ...
      cd /hive/data/genomes/hg19/bed/multiz100way
      export H=/hive/data/genomes/hg19/bed
      mkdir mafLinks
      cat recipBest.list | while read G
      do
  	mkdir mafLinks/$G
  	if [ -s ${H}/lastz.${G}/mafRBestNet/chr1.maf.gz ]; then
  	    ln -s ${H}/lastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G
          else
              echo "missing lastz.${G}/mafRBestNet/"
          fi
      done
  
      cat synNet.list | while read G
      do
  	mkdir mafLinks/$G
          if [ -s ${H}/lastz.${G}/mafSynNet/chr1.maf.gz ]; then
  	    ln -s ${H}/lastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G
          else
              echo "missing lastz.${G}/mafSynNet/"
          fi
      done
  
      cat netOnly.list | while read G
      do
  	mkdir mafLinks/$G
          if [ -s ${H}/lastz.${G}/mafNet/chr1.maf.gz ]; then
              ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G
          else
              echo "missing lastz.${G}/mafNet/"
          fi
      done
  
      #	verify the alignment type is correct:
    grep -v hg19 species.list | while read D
  do
      ls -l mafLinks/$D/chr1.maf.gz 2> /dev/null | awk '{print $NF}'
  done
  
      #	need to split these things up into smaller pieces for
      #	efficient kluster run.
      mkdir /hive/data/genomes/hg19/bed/multiz100way/mafSplit
      cd /hive/data/genomes/hg19/bed/multiz100way/mafSplit
  
      #	mafSplitPos splits on gaps or repeat areas that will not have
      #	any chains, approx 5 Mbp intervals, gaps at least 10,000
      mafSplitPos -minGap=10000 hg19 5 stdout | sort -u \
  	| sort -k1,1 -k2,2n > mafSplit.bed
      #	There is a splitRegions.pl script here (copied from previous hg19 46way)
      #	that can create a custom track from this mafSplit.bed file.
      #	Take a look at that in the browser and see if it looks OK,
      #	check the number of sections on each chrom to verify none are
      #	too large.  Despite the claim above, it does appear that some
      #	areas are split where actual chains exist.
      ./splitRegions.pl mafSplit.bed > splitRegions.ct
  
      # to see the sizes of the regions:
      grep "^chr" splitRegions.ct | awk '{print $3-$2,$0}' | sort -rn | less
  
      #	run a kluster job to split them all
      ssh ku
      cd /hive/data/genomes/hg19/bed/multiz100way/mafSplit
      cat << '_EOF_' > runOne
  #!/bin/csh -ef
  set G = $1
  set C = $2
  mkdir -p $G
  pushd $G > /dev/null
  if ( -s ../../mafLinks/${G}/${C}.maf.gz ) then
      if ( -s hg19_${C}.00.maf ) then
          /bin/rm -f hg19_${C}.*.maf
      endif
      /cluster/bin/x86_64/mafSplit ../mafSplit.bed hg19_ ../../mafLinks/${G}/${C}.maf.gz
      /bin/gzip hg19_${C}.*.maf
  else
      /bin/touch hg19_${C}.00.maf
      /bin/gzip hg19_${C}.00.maf
  endif
  popd > /dev/null
  '_EOF_'
      # << happy emacs
      chmod +x runOne
  
      cat << '_EOF_' > template
  #LOOP
  runOne $(root1) $(root2) {check out exists+ $(root1)/hg19_$(root2).00.maf.gz}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
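    # (gensub2 pairs each line of the first list with each line of the
    #	second, emitting one job line per pair from the template;
    #	$(root1) and $(root2) are the file name roots of the two
    #	list entries)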
  
      for G in `sed -e "s/hg19 //" ../species.list`
  do
      echo $G
  done > species.list
      cut -f 1 ../../../chrom.sizes > chr.list
  
      gensub2 species.list chr.list template jobList
      para -ram=8g create jobList
      para try ... check ... push ... etc...
  # Completed: 3894 of 3894 jobs
  # CPU time in finished jobs:      18929s     315.49m     5.26h    0.22d  0.001 y
  # IO & Wait Time:                 62908s    1048.46m    17.47h    0.73d  0.002 y
  # Average job time:                  21s       0.35m     0.01h    0.00d
  # Longest finished job:             346s       5.77m     0.10h    0.00d
  # Submission to last job:           471s       7.85m     0.13h    0.01d
  
    # construct a list of all possible maf file names;
    # they do not all exist in each of the species directories
    # (99 species x 403 names = 39897, slightly more than the 39882 found)
      find . -type f | grep "maf.gz" | wc -l
      # 39882
      find . -type f | grep ".maf.gz$" | xargs -L 1 basename | sort -u > maf.list
      wc -l maf.list
      #   403 maf.list
  
      mkdir /hive/data/genomes/hg19/bed/multiz100way/splitRun
      cd /hive/data/genomes/hg19/bed/multiz100way/splitRun
      mkdir maf run
      cd run
      mkdir penn
      cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn
      cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn
      cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn
  
      #	set the db and pairs directories here
      cat > autoMultiz.csh << '_EOF_'
  #!/bin/csh -ef
  set db = hg19
  set c = $1
  set result = $2
  set run = `/bin/pwd`
  set tmp = /dev/shm/$db/multiz.$c
  set pairs = /hive/data/genomes/hg19/bed/multiz100way/mafSplit
  /bin/rm -fr $tmp
  /bin/mkdir -p $tmp
  /bin/cp -p ../../tree.nh ../../species.list $tmp
  pushd $tmp > /dev/null
  foreach s (`/bin/sed -e "s/$db //" species.list`)
      set in = $pairs/$s/$c
      set out = $db.$s.sing.maf
      if (-e $in.gz) then
          /bin/zcat $in.gz > $out
          if (! -s $out) then
              echo "##maf version=1 scoring=autoMZ" > $out
          endif
      else if (-e $in) then
          /bin/ln -s $in $out
      else
          echo "##maf version=1 scoring=autoMZ" > $out
      endif
  end
  set path = ($run/penn $path); rehash
  $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \
          > /dev/null
  popd > /dev/null
  /bin/rm -f $result
  /bin/cp -p $tmp/$c $result
  /bin/rm -fr $tmp
  /bin/rmdir --ignore-fail-on-non-empty /dev/shm/$db
  '_EOF_'
  # << happy emacs
      chmod +x autoMultiz.csh
  
      cat  << '_EOF_' > template
  #LOOP
  ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/hg19/bed/multiz100way/splitRun/maf/$(root1)}
  #ENDLOOP
  '_EOF_'
  # << happy emacs
  
      ln -s ../../mafSplit/maf.list maf.list
      ssh ku
      cd /hive/data/genomes/hg19/bed/multiz100way/splitRun/run
  
      gensub2 maf.list single template stdout > jobList
      # allocate 2 CPUs to allow only 16 of these jobs at a time on a 32 CPU node
      para -cpu=2 create jobList
      # limit number of total jobs at a time to leave at least 3/4 the cluster
      # free for other work 64 jobs at 2 CPUs/job == 128 CPUs consumed
      # of the total pool of 512 CPUs in the cluster:
      para -maxJob=64 push
  # Completed: 398 of 403 jobs
  # Crashed: 5 jobs
  # CPU time in finished jobs:   11445606s  190760.10m  3179.34h  132.47d  0.363 y
  # IO & Wait Time:                 17567s     292.78m     4.88h    0.20d  0.001 y
  # Average job time:               28802s     480.03m     8.00h    0.33d
  # Longest finished job:           87903s    1465.05m    24.42h    1.02d
  # Submission to last job:        245779s    4096.32m    68.27h    2.84d
  
      # last five jobs run on hgwdev:
      cd /hive/data/genomes/hg19/bed/multiz100way/splitRun/run
      time ./last5.sh > last5.log 2>&1
      #  real    2373m23.562s
  
      # put the split maf results back together into a single maf file
      #	eliminate duplicate comments
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/multiz100way/splitRun
      mkdir ../maf
      #	no need to save the comments since they are lost with mafAddIRows
  
    cat << '_EOF_' > runOne
  #!/bin/csh -fe
  set C = $1
  if ( -s ../maf/${C}.maf.gz ) then
      rm -f ../maf/${C}.maf.gz
  endif
  head -q -n 1 maf/hg19_${C}.00.maf | sort -u > ../maf/${C}.maf
  grep -h -v "^#" `ls maf/hg19_${C}.*.maf | sort -t. -k2,2n` >> ../maf/${C}.maf
  tail -q -n 1 maf/hg19_${C}.00.maf | sort -u >> ../maf/${C}.maf
  '_EOF_'
      # << happy emacs
      chmod +x runOne
  
    cat << '_EOF_' > template
  #LOOP
  runOne $(root1) {check out exists+ ../maf/$(root1).maf}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      cut -f1 ../../../chrom.sizes > chr.list
      ssh ku
      cd /hive/data/genomes/hg19/bed/multiz100way/splitRun
      gensub2 chr.list single template jobList
      para -ram=8g create jobList
      para try ... check ... push ... etc ...
  # Completed: 62 of 66 jobs
  # Crashed: 4 jobs
  # CPU time in finished jobs:        461s       7.68m     0.13h    0.01d  0.000 y
  # IO & Wait Time:                 17863s     297.72m     4.96h    0.21d  0.001 y
  # Average job time:                 296s       4.93m     0.08h    0.00d
  # Longest finished job:            1144s      19.07m     0.32h    0.01d
  # Submission to last job:          1156s      19.27m     0.32h    0.01d
  
      # these four have empty results:
  #       chrUn_GL456383
  #       chrUn_GL456389
  #       chrUn_GL456390
  #       chrUn_GL456396
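    # a quick way to spot such results (a sketch): count the alignment
    # blocks in each assembled maf; the empty ones have none:
    for F in ../maf/chr*.maf
    do
        C=`grep -c '^a ' $F`
        echo "$C $F"
    done | sort -n | head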
  
      # Load into database
      ssh hgwdev
      mkdir -p /gbdb/hg19/multiz100way/maf
      cd /hive/data/genomes/hg19/bed/multiz100way/maf
      ln -s `pwd`/*.maf /gbdb/hg19/multiz100way/maf/
  
      # this generates an immense multiz100way.tab file in the directory
      #	where it is running.  Best to run this over in scratch.
      #   This is going to take all day.
      cd /dev/shm
      time hgLoadMaf -pathPrefix=/gbdb/hg19/multiz100way/maf hg19 multiz100way
      # Loaded 82655835 mafs in 89 files from /gbdb/hg19/multiz100way/maf
      # real    79m31.028s
      # -rw-rw-r-- 1 4703102815 Sep  9 11:21 multiz100way.tab
  
      # reload 'fixed' maf file 2014-02-06
      # Loaded 110907342 mafs in 93 files from /gbdb/hg19/multiz100way/maf
      # real    368m1.988s
      # -rw-rw-r-- 1 6307041335 Feb  6 16:17 multiz100way.tab
      wc -l multiz100way.tab
      #  110907342 multiz100way.tab
  
      # reloading new tables 2014-02-07
      time (cat /gbdb/hg19/multiz100way/maf/*.maf \
          | nice -n +19 $HOME/kent/src/hg/makeDb/hgLoadMaf/hgLoadMafSummary \
            -verbose=2 -minSize=30000 \
  	-mergeGap=1500 -maxSize=200000 hg19 multiz100waySummary stdin)
  # Created 17161359 summary blocks from 3983260014 components and 110907342 mafs from stdin
  # real    242m0.403s
      #  -rw-rw-r-- 1  811621935 Feb  7 12:16 multiz100waySummary.tab
  
      # Created 13514621 summary blocks from 2957351868 components and 82655835 mafs from stdin
      # real    143m55.432s
  
      wc -l multiz100way*.tab
      #  110907342 multiz100way.tab
      #  17161359 multiz100waySummary.tab
      #  128068701 total
  
      #    82655835 multiz100way.tab
      #    13514621 multiz100waySummary.tab
      #    96170456 total
      #    -rw-rw-r-- 1 4703102815 Sep  9 11:21 multiz100way.tab
      #    -rw-rw-r-- 1  640169193 Sep 10 09:08 multiz100waySummary.tab
  
      rm multiz100way*.tab
  
  #######################################################################
# GAP ANNOTATE MULTIZ100WAY MAF AND LOAD TABLES (DONE - 2012-05-31 - Hiram)
    # mafAddIRows has to be run on single chromosome maf files; it does not
    #	function correctly when more than one reference sequence
    #	is in a single file.
      mkdir -p /hive/data/genomes/hg19/bed/multiz100way/anno
      cd /hive/data/genomes/hg19/bed/multiz100way/anno
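    # a quick sanity check (sketch): each per-chrom maf should reference
    # exactly one hg19 sequence, e.g. for chr21:
    awk '$1 == "s" && $2 ~ /^hg19\./ {print $2}' ../maf/chr21.maf | sort -u
    # expect a single name: hg19.chr21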
  
      # check for N.bed files everywhere:
      for DB in `cat ../species.list`
  do
      if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then
          echo "MISS: ${DB}"
          cd /hive/data/genomes/${DB}
          twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed
      else
          echo "  OK: ${DB}"
      fi
  done
  
      cd /hive/data/genomes/hg19/bed/multiz100way/anno
      for DB in `cat ../species.list`
  do
      echo "${DB} "
      ln -s  /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
      echo ${DB}.bed  >> nBeds
      ln -s  /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
      echo ${DB}.len  >> sizes
  done
      # make sure they all are successful symLinks:
      ls -ogrtL *.bed | wc -l
      # 100
  
      screen -S hg19      # use a screen to control this longish job
      ssh ku
      cd /hive/data/genomes/hg19/bed/multiz100way/anno
      mkdir result
  
      cat << '_EOF_' > template
  #LOOP
  mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/hg19/hg19.2bit {check out line+ result/$(file1)}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      ls ../maf/*.maf > maf.list
      # the tac puts the short jobs first
      gensub2 maf.list single template stdout | tac > jobList
      # no need to limit these jobs, there are only 93 of them
      para create jobList
      para try ... check ... push ...
  # Completed: 70 of 89 jobs
  # Crashed: 19 jobs
  # CPU time in finished jobs:       1027s      17.12m     0.29h    0.01d  0.000 y
  # IO & Wait Time:                  4532s      75.53m     1.26h    0.05d  0.000 y
  # Average job time:                  79s       1.32m     0.02h    0.00d
  # Longest finished job:             370s       6.17m     0.10h    0.00d
  # Submission to last job:           552s       9.20m     0.15h    0.01d
  
      # a number of these jobs did not finish due to memory limitations.
      # Run the rest manually on hgwdev:
  
  #!/bin/sh
  
  # 200 Gb memory limits
  
  export sizeG=200000000
  ulimit -d $sizeG
  ulimit -v $sizeG
  
  mafAddIRows -nBeds=nBeds ../maf/chr7.maf /hive/data/genomes/hg19/hg19.2bit result/chr7.maf &
  mafAddIRows -nBeds=nBeds ../maf/chr5.maf /hive/data/genomes/hg19/hg19.2bit result/chr5.maf &
  mafAddIRows -nBeds=nBeds ../maf/chr4.maf /hive/data/genomes/hg19/hg19.2bit result/chr4.maf &
  mafAddIRows -nBeds=nBeds ../maf/chr3.maf /hive/data/genomes/hg19/hg19.2bit result/chr3.maf &
  mafAddIRows -nBeds=nBeds ../maf/chr22.maf /hive/data/genomes/hg19/hg19.2bit result/chr22.maf &
  wait
  mafAddIRows -nBeds=nBeds ../maf/chr21.maf /hive/data/genomes/hg19/hg19.2bit result/chr21.maf &
  mafAddIRows -nBeds=nBeds ../maf/chr20.maf /hive/data/genomes/hg19/hg19.2bit result/chr20.maf &
  mafAddIRows -nBeds=nBeds ../maf/chr2.maf /hive/data/genomes/hg19/hg19.2bit result/chr2.maf &
  mafAddIRows -nBeds=nBeds ../maf/chr19.maf /hive/data/genomes/hg19/hg19.2bit result/chr19.maf &
  mafAddIRows -nBeds=nBeds ../maf/chr18.maf /hive/data/genomes/hg19/hg19.2bit result/chr18.maf &
  wait
  mafAddIRows -nBeds=nBeds ../maf/chr17.maf /hive/data/genomes/hg19/hg19.2bit result/chr17.maf &
  mafAddIRows -nBeds=nBeds ../maf/chr16.maf /hive/data/genomes/hg19/hg19.2bit result/chr16.maf &
  mafAddIRows -nBeds=nBeds ../maf/chr15.maf /hive/data/genomes/hg19/hg19.2bit result/chr15.maf &
  mafAddIRows -nBeds=nBeds ../maf/chr14.maf /hive/data/genomes/hg19/hg19.2bit result/chr14.maf &
  mafAddIRows -nBeds=nBeds ../maf/chr13.maf /hive/data/genomes/hg19/hg19.2bit result/chr13.maf &
  wait
  mafAddIRows -nBeds=nBeds ../maf/chr12.maf /hive/data/genomes/hg19/hg19.2bit result/chr12.maf &
  mafAddIRows -nBeds=nBeds ../maf/chr11.maf /hive/data/genomes/hg19/hg19.2bit result/chr11.maf &
  mafAddIRows -nBeds=nBeds ../maf/chr10.maf /hive/data/genomes/hg19/hg19.2bit result/chr10.maf &
  mafAddIRows -nBeds=nBeds ../maf/chr1.maf /hive/data/genomes/hg19/hg19.2bit result/chr1.maf &
  wait
  
      # the run time for those 31 jobs:
      #    real    337m11.544s
      #    user    788m52.885s
      #    sys     51m9.670s
    # and a few more: chrX chr9 chr6 chr8 chr5 chr3
      #  real    90m49.567s
      #  user    313m55.725s
      #  sys     17m12.812s
      # chr2 last one took about 70 minutes
  
      # second reload 2014-02-07, on hgwdev 19 jobs:
      cd /hive/data/genomes/hg19/bed/multiz100way/anno
      time ./highMemJobs.sh > highMemJobs.log 2>&1
  
      # real    473m47.404s
      # user    1072m8.810s
      # sys     30m45.768s
  
      # verify all result files have some content, look for 0 size files:
      find . -type f -size 0
      # should see none
  
      # construct symlinks to get the individual maf files into gbdb:
      mkdir /gbdb/hg19/multiz100way/maf
      ln -s `pwd`/result/*.maf `pwd`/hgwdev/*.maf /gbdb/hg19/multiz100way/maf/
  
      # Load into database
      rm /gbdb/hg19/multiz100way/*.maf   # remove previous results
      cd /scratch/tmp
      time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/hg19/multiz100way/maf \
          hg19 multiz100way
  # Loaded 110940480 mafs in 93 files from /gbdb/hg19/multiz100way/maf
  
  # real    190m10.878s
# -rw-rw-r-- 1 6346988863 Feb  7 19:20 multiz100way.tab
  
  
      # Loaded 107446955 mafs in 93 files from /gbdb/hg19/multiz100way/maf
  
      # real    393m34.571s
      # user    155m49.137s
      # sys     7m9.102s
  
      time (cat /gbdb/hg19/multiz100way/maf/*.maf \
          | hgLoadMafSummary -verbose=2 -minSize=30000 \
  	-mergeGap=1500 -maxSize=200000 hg19 multiz100waySummary stdin)
      # second time around, 2014-02-05:
      # Created 17161359 summary blocks from 3983260014 components and 110940480 mafs from stdin
      # real    259m56.225s
      # user    236m54.551s
      # sys     19m27.053s
  # -rw-rw-r-- 1 6346988863 Feb  7 19:20 multiz100way.tab
  # -rw-rw-r-- 1  845944653 Feb  8 12:22 multiz100waySummary.tab
  
      # Created 17164313 summary blocks from 3766509527 components and 107446955 mafs from stdin
      # real    452m41.332s
      # user    229m13.460s
      # sys     19m0.104s
  
      # -rw-rw-r-- 1 6145982941 Sep 11 17:36 multiz100way.tab
      # -rw-rw-r-- 1  846085633 Sep 12 05:50 multiz100waySummary.tab
  
      wc -l *.tab
      # second time around, 2014-02-05:
      # 110940480 multiz100way.tab
      # 17161359 multiz100waySummary.tab
      # 128101839 total
  
      #  107446955 multiz100way.tab
      #   17164313 multiz100waySummary.tab
      #  124611268 total
  
      rm multiz100way*.tab
  
  #######################################################################
  # MULTIZ 100way MAF FRAMES (DONE - 2013-09-14 - Hiram)
      ssh hgwdev
      mkdir /hive/data/genomes/hg19/bed/multiz100way/frames
      cd /hive/data/genomes/hg19/bed/multiz100way/frames
  #   survey all the genomes to find out what kinds of gene tracks they have
      cat << '_EOF_' > showGenes.csh
  #!/bin/csh -fe
  foreach db (`cat ../species.list`)
      echo -n "${db}: "
      set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
      foreach table ($tables)
          if ($table == "ensGene" || $table == "refGene" || \
             $table == "mgcGenes" || $table == "ensGene" || \
             $table == "xenoRefGene" ) then
             set count = `hgsql $db -N -e "select count(*) from $table"`
              echo -n "${table}: ${count}, "
          endif
      end
      set orgName = `hgsql hgcentraltest -N -e \
              "select scientificName from dbDb where name='$db'"`
      set orgId = `hgsql hg19 -N -e \
              "select id from organism where name='$orgName'"`
      if ($orgId == "") then
          echo "Mrnas: 0"
      else
          set count = `hgsql hg19 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
          echo "Mrnas: ${count}"
      endif
  end
  '_EOF_'
      # << happy emacs
      chmod +x ./showGenes.csh
      time ./showGenes.csh > showGenes.txt
      #   real    9m11.678s
  
    #   rearrange that output to create five sections, and place these names
      #           in .list files here:
      #   1. ensGene: hg19
      #   2. refGene: bosTau7 danRer7 galGal4 hg19 rheMac3 rn5 susScr3 xenTro3
      #   3. ensGene: ailMel1 anoCar2 calJac3 cavPor3 choHof1 dipOrd1 echTel1
      #               equCab2 eriEur1 fr3 gasAcu1 gorGor3 loxAfr3 melGal1
      #               micMur1 monDom5 myoLuc2 ochPri2 ornAna1 oryCun2 oryLat2
      #               panTro4 ponAbe2 proCap1 pteVam1 sorAra1 taeGut1 tarSyr1
      #               tetNig2 tupBel1 vicPac1
      #   4. xenoRefGene: canFam3 chrPic1 dasNov3 felCat5 hetGla2 latCha1 macEug2
      #               nomLeu2 otoGar3 oviAri1 papHam1 petMar1 saiBol1 sarHar1
      #               triMan1
      #   5. genscan: gadMor1 melUnd1 oreNil2 speTri2 turTru2
  
      mkdir genes
      #   1. ensGene: hg19 and mm10
      for DB in hg19 mm10
      do
      hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
        | genePredSingleCover stdin stdout | gzip -2c \
          > genes/${DB}.gp.gz
      done
      #   2. refGene, xenTro7 and bosTau7 want the full extended genePred:
      for DB in `cat refGene.list`
  do
  hgsql -N -e "select * from refGene" ${DB} | cut -f2- \
        | genePredSingleCover stdin stdout | gzip -2c \
          > /scratch/tmp/${DB}.tmp.gz
      mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
      echo "${DB} done"
  done
      #   3. ensGene, want the full extended genePred:
      # ailMel1 anaPla1 anoCar2 calJac3 canFam3 danRer7 equCab2 felCat5 fr3
      # gadMor1 galGal4 gorGor3 latCha1 loxAfr3 musFur1 myoLuc2 ornAna1 oryLat2
      # otoGar3 panTro4 pelSin1 petMar2 ponAbe2 pteVam1 rn5 sarHar1 speTri2
      # susScr3 tetNig2 xipMac1
      for DB in `cat ensGene.list`
  do
  hgsql -N -e "select * from ensGene" ${DB} | cut -f2- \
        | genePredSingleCover stdin stdout | gzip -2c \
          > /scratch/tmp/${DB}.tmp.gz
      mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
      echo "${DB} done"
  done
      #   4. nscanGene, want the full extended genePred:
      #  cavPor3 gasAcu1 melGal1 monDom5 oryCun2
      for DB in `cat nscan.list`
  do
  hgsql -N -e "select * from nscanGene" ${DB} | cut -f2- \
        | genePredSingleCover stdin stdout | gzip -2c \
          > /scratch/tmp/${DB}.tmp.gz
      mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
      echo "${DB} done"
  done
  
      # verify counts for genes are reasonable:
      for T in genes/*.gz
  do
      echo -n "# $T: "
      zcat $T | cut -f1 | sort | uniq -c | wc -l
  done
  
  # genes/ailMel1.gp.gz: 19204
  # genes/anaPla1.gp.gz: 15482
  # genes/anoCar2.gp.gz: 18532
  # genes/bosTau7.gp.gz: 12995
  # genes/calJac3.gp.gz: 20843
  # genes/canFam3.gp.gz: 19507
  ... etc ...
  # genes/pteVam1.gp.gz: 16966
  # genes/rn5.gp.gz: 22864
  # genes/sarHar1.gp.gz: 18663
  # genes/speTri2.gp.gz: 18796
  # genes/susScr3.gp.gz: 21596
  # genes/tetNig2.gp.gz: 19539
  # genes/xenTro7.gp.gz: 8393
  # genes/xipMac1.gp.gz: 20320
  
      # kluster job to annotate each maf file
      screen -S hg19      # manage long running procedure with screen
      ssh ku
      cd /hive/data/genomes/hg19/bed/multiz100way/frames
      cat << '_EOF_' > runOne
  #!/bin/csh -fe
  
  set C = $1
  set G = $2
  
  cat ../maf/${C}.maf | genePredToMafFrames hg19 stdin stdout \
          ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
  '_EOF_'
      # << happy emacs
      chmod +x runOne
  
      ls ../maf | sed -e "s/.maf//" > chr.list
      ls genes | sed -e "s/.gp.gz//" > gene.list
  
      cat << '_EOF_' > template
  #LOOP
  runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      mkdir parts
      gensub2 chr.list gene.list template jobList
      para create jobList
      para try ... check ... push
  # second time around 2014-02-08
  # Completed: 3627 of 3627 jobs
  # CPU time in finished jobs:     215439s    3590.66m    59.84h    2.49d  0.007 y
  # IO & Wait Time:               1153600s   19226.66m   320.44h   13.35d  0.037 y
  # Average job time:                 377s       6.29m     0.10h    0.00d
  # Longest finished job:            6108s     101.80m     1.70h    0.07d
  # Submission to last job:         17703s     295.05m     4.92h    0.20d
  
  # Completed: 3627 of 3627 jobs
  # CPU time in finished jobs:     213502s    3558.37m    59.31h    2.47d  0.007 y
  # IO & Wait Time:              10242520s  170708.66m  2845.14h  118.55d  0.325 y
  # Average job time:                2883s      48.05m     0.80h    0.03d
  # Longest finished job:           28625s     477.08m     7.95h    0.33d
  # Submission to last job:         60992s    1016.53m    16.94h    0.71d
  
      # collect all results into one file:
      cd /hive/data/genomes/hg19/bed/multiz100way/frames
      find ./parts -type f | while read F
  do
      zcat ${F}
  done | sort -k1,1 -k2,2n > multiz100wayFrames.bed
      # -rw-rw-r-- 1 857488422 Feb  8 08:08 multiz100wayFrames.bed
  
  
      # verify there are frames on everything that we asked for:
      cut -f4 multiz100wayFrames.bed | sort | uniq -c | sort -n \
          > annotation.survey.txt
      # should be 39 species:
      wc -l annotation.survey.txt
      #   39 annotation.survey.txt
      # and the minimum numbers:
      head annotation.survey.txt
      # 155538 bosTau7
      # 181247 oryCun2
      # 192009 panTro4
      # 196260 gorGor3
      # 200210 ponAbe2
      # 200763 hg19
      # 212618 xenTro7
      # ... etc ...
      # previously, the ailMel1 low count should have been seen as an error:
      #       317 ailMel1  <- this is a bit odd
      #    202537 bosTau7
      #    230400 xenTro7
      #    237422 oryCun2
  
      #   load the resulting file
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/multiz100way/frames
      time gzip multiz100wayFrames.bed
      #   real    0m32.261s
      time hgLoadMafFrames hg19 multiz100wayFrames multiz100wayFrames.bed.gz
      # real    2m29.934s
  
      # second time around:
      time featureBits -countGaps hg19 multiz100wayFrames
      # 67663535 bases of 3137161264 (2.157%) in intersection
      # real    1m24.320s
  
      # first time:
      # 66293515 bases of 3137161264 (2.113%) in intersection
      # real    1m31.403s
  
      #   enable the trackDb entries:
  # frames multiz100wayFrames
  # irows on
      #   appears to work OK
  
  #########################################################################
  # Phylogenetic tree from 100-way (DONE - 2013-09-13 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/multiz100way/4d
      cd /hive/data/genomes/hg19/bed/multiz100way/4d
  
      # the annotated maf's are in:
      ../anno/result/*.maf
  
    # using ensGene for hg19: only coding transcripts (cdsEnd > cdsStart)
    #	and nothing from the randoms and other misc.
      hgsql -Ne "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene where cdsEnd > cdsStart;" hg19 \
        | egrep -E -v "chrM|chrUn|random|_hap" > ensGene.gp
      wc -l *.gp
    #	60396 ensGene.gp
  
      # verify it is only on the chroms:
      cut -f2 ensGene.gp | sort | uniq -c | sort -rn
      # 6323 chr1
      # 3965 chr2
      # 3872 chr19
      # 3670 chr11
      # 3538 chr17
      # 3484 chr3
      # 3144 chr12
      # 3099 chr6
      # 2878 chr7
      # 2578 chr5
      # 2541 chr16
      # 2534 chr10
      # 2434 chrX
      # 2401 chr9
      # 2247 chr4
      # 2043 chr14
      # 1967 chr8
      # 1875 chr15
      # 1609 chr20
      # 1363 chr22
      # 942 chr13
      # 891 chr18
      # 690 chr21
      # 308 chrY
  
      # first time:
      # 7967 chr1
      # 5092 chr2
      # 4711 chr19
      # ... etc ...
      # 1178 chr18
      # 954 chr21
      # 452 chrY
  
      genePredSingleCover ensGene.gp stdout | sort > ensGeneNR.gp
      wc -l ensGeneNR.gp
      #	20031 ensGeneNR.gp
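    #	(genePredSingleCover reduces the set to single coverage: where
    #	transcripts overlap it keeps one per locus, hence the drop
    #	from 60396 transcripts to 20031 entries)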
  
      ssh ku
      mkdir /hive/data/genomes/hg19/bed/multiz100way/4d/run
      cd /hive/data/genomes/hg19/bed/multiz100way/4d/run
      mkdir ../mfa
  
      # newer versions of msa_view have a slightly different operation
      # the sed of the gp file inserts the reference species in the chr name
      cat << '_EOF_' > 4d.csh
  #!/bin/csh -fe
  set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
  set r = "/hive/data/genomes/hg19/bed/multiz100way"
  set c = $1
  set infile = $r/anno/result/$2
  set outfile = $3
  cd /dev/shm
  # 'clean' maf, removes all chrom names, leaves only the db name
  perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
  awk -v C=$c '$2 == C {print}' $r/4d/ensGeneNR.gp | sed -e "s/\t$c\t/\thg19.$c\t/" > $c.gp
  set NL=`wc -l $c.gp| gawk '{print $1}'`
  if ("$NL" != "0") then
      $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss
      $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile
  else
      echo "" > $r/4d/run/$outfile
  endif
  rm -f $c.gp $c.maf $c.ss
  '_EOF_'
      # << happy emacs
      chmod +x 4d.csh
  
      ls -1S /hive/data/genomes/hg19/bed/multiz100way/anno/result/*.maf \
  	| sed -e "s#.*multiz100way/anno/result/##" \
          | egrep -E -v "chrM|chrUn|random|_hap" > maf.list
  
      cat << '_EOF_' > template
  #LOOP
  4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      # the tac puts the shorter jobs at the front
      gensub2 maf.list single template stdout | tac > jobList
      para create jobList
      para try ... check
      para -maxJob=5 push
      para time
  # second time:
  # Completed: 24 of 24 jobs
  # CPU time in finished jobs:      43524s     725.41m    12.09h    0.50d  0.001 y
  # IO & Wait Time:                  2537s      42.28m     0.70h    0.03d  0.000 y
  # Average job time:                1919s      31.99m     0.53h    0.02d
  # Longest finished job:            3948s      65.80m     1.10h    0.05d
  # Submission to last job:          4133s      68.88m     1.15h    0.05d
  
  # first time:
  # Completed: 24 of 24 jobs
  # CPU time in finished jobs:      38593s     643.22m    10.72h    0.45d  0.001 y
  # IO & Wait Time:                411016s    6850.26m   114.17h    4.76d  0.013 y
  # Average job time:               18734s     312.23m     5.20h    0.22d
  # Longest finished job:           44666s     744.43m    12.41h    0.52d
  # Submission to last job:         60542s    1009.03m    16.82h    0.70d
  
      # combine mfa files
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/multiz100way/4d
      # remove the broken empty files, size 0 and size 1:
      find ./mfa -type f -size 0 | xargs rm -f
      # most interesting, this did not identify files of size 1:
  #    find ./mfa -type f -size 1
      ls -og mfa | awk '$3 == 1' | awk '{print $NF}' > empty.list
      sed -e "s#^#mfa/##" empty.list | xargs rm -f
    # want comma-less species.list
      time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
  	--aggregate "`cat ../species.list`" mfa/*.mfa | sed s/"> "/">"/ \
  	    > 4d.all.mfa
      # check they are all in there:
      grep "^>" 4d.all.mfa | wc -l
      #   100
  
      # use phyloFit to create tree model (output is phyloFit.mod)
      time nice -n +19 \
  	/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
  	    --EM --precision MED --msa-format FASTA --subst-mod REV \
  		--tree ../tree-commas.nh 4d.all.mfa
      #   real    223m18.088s
      # second time 2014-02-09:
      #	real    212m12.485s
      mv phyloFit.mod all.mod
  
      grep TREE all.mod
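    # for a readable rendering of the fitted tree, the same utility used
    # for the downloads below can pretty-print it:
    grep TREE all.mod | sed -e 's/TREE: //' \
        | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin | less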
  
  #######################################################################
# phastCons 100-way (DONE - 2012-06-12, 2012-08-21 - Hiram)
      #	was unable to split the full chrom MAF files, now working on the
      #	maf files as they were split up during multiz
  
    # split 100way mafs into 10M chunks and generate sufficient statistics
    # files for phastCons
      ssh ku
      mkdir -p /hive/data/genomes/hg19/bed/multiz100way/cons/ss
      mkdir -p /hive/data/genomes/hg19/bed/multiz100way/cons/msa.split
      cd /hive/data/genomes/hg19/bed/multiz100way/cons/msa.split
  
      cat << '_EOF_' > doSplit.csh
  #!/bin/csh -ef
  set c = $1
  set MAF = /hive/data/genomes/hg19/bed/multiz100way/anno/result/$c.maf
  set WINDOWS = /hive/data/genomes/hg19/bed/multiz100way/cons/ss/$c
  set WC = `cat $MAF | wc -l`
  set NL = `grep "^#" $MAF | wc -l`
  if ( -s $2 ) then
      exit 0
  endif
  if ( -s $2.running ) then
      exit 0
  endif
  
  date >> $2.running
  
  rm -fr $WINDOWS
  mkdir $WINDOWS
  pushd $WINDOWS > /dev/null
  if ( $WC != $NL ) then
  /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
      $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
  endif
  popd > /dev/null
  date >> $2
  rm -f $2.running
  '_EOF_'
      # << happy emacs
      chmod +x doSplit.csh
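    # msa_split options, roughly: -w 10000000,0 asks for 10 Mbp windows
    # with no overlap, -I 1000 keeps only windows with at least 1000
    # informative columns, and -B 5000 lets split points move as much as
    # 5000 bases to fall between alignment blocks (see msa_split --help
    # for the exact semantics)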
  
      cat << '_EOF_' > template
  #LOOP
  doSplit.csh $(root1) {check out line+ $(root1).done}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      #	do the easy ones first to see some immediate results
      ls -1S -r ../../anno/result | sed -e "s/.maf//;" > maf.list
  
      gensub2 maf.list single template jobList
      para create jobList
      para try ... check ... etc
      para push
      # running the last big ones on hgwdev - Sat Sep 14 10:30:34 PDT 2013
  # Completed: 64 of 66 jobs
  # Crashed: 2 jobs
  # CPU time in finished jobs:     347730s    5795.49m    96.59h    4.02d  0.011 y
  # IO & Wait Time:                102813s    1713.56m    28.56h    1.19d  0.003 y
  # Average job time:                7040s     117.33m     1.96h    0.08d
  # Longest finished job:           42666s     711.10m    11.85h    0.49d
  # Submission to last job:        150336s    2505.60m    41.76h    1.74d
      # finish the last 23 jobs on hgwdev with more memory.
  #!/bin/sh
  
  # limit 200 Gb
  export M=200000000
  ulimit -S -m $M -v $M
  
  ./doSplit.csh chr21 chr21.done &
  ./doSplit.csh chr22 chr22.done
  wait
  ./doSplit.csh chr19 chr19.done &
  ./doSplit.csh chr20 chr20.done
  wait
  ... etc ...
      # running the final 23 jobs on hgwdev, two at a time:
      # real    1739m11.579s
      # user    2286m40.633s
      # sys     52m30.012s
  
      # second time around chr1 thru chr8:
      time ./lastJobs.sh > lastJobs.log 2>&1
  
      # real    383m21.628s
      # user    1447m51.415s
      # sys     17m44.578s
  
      # Run phastCons
      #	This job is I/O intensive in its output files, beware where this
      #	takes place or do not run too many at once.
      ssh ku
      mkdir -p /hive/data/genomes/hg19/bed/multiz100way/cons/run.cons
      cd /hive/data/genomes/hg19/bed/multiz100way/cons/run.cons
  
      #	there are going to be several different phastCons runs using
      #	this same script.  They trigger off of the current working directory
      #	$cwd:t which is the "grp" in this script.  It is one of:
      #	all glire glirePrimate glirePrimatePlacental
  
      cat << '_EOF_' > doPhast.csh
  #!/bin/csh -fe
  set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
  set c = $1
  set f = $2
  set len = $3
  set cov = $4
  set rho = $5
  set grp = $cwd:t
  set cons = /hive/data/genomes/hg19/bed/multiz100way/cons
  set tmp = $cons/tmp/$f
  mkdir -p $tmp
  set ssSrc = $cons/ss
  set useGrp = "$grp.mod"
  if (-s $cons/$grp/$grp.non-inf) then
    ln -s $cons/$grp/$grp.mod $tmp
    ln -s $cons/$grp/$grp.non-inf $tmp
    ln -s $ssSrc/$c/$f.ss $tmp
  else
    ln -s $ssSrc/$c/$f.ss $tmp
    ln -s $cons/$grp/$grp.mod $tmp
  endif
  pushd $tmp > /dev/null
  if (-s $grp.non-inf) then
    $PHASTBIN/phastCons $f.ss $useGrp \
      --rho $rho --expected-length $len --target-coverage $cov --quiet \
      --not-informative `cat $grp.non-inf` \
      --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
  else
    $PHASTBIN/phastCons $f.ss $useGrp \
      --rho $rho --expected-length $len --target-coverage $cov --quiet \
      --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
  endif
  popd > /dev/null
  mkdir -p pp/$c bed/$c
  sleep 4
  touch pp/$c bed/$c
  rm -f pp/$c/$f.pp
  rm -f bed/$c/$f.bed
  mv $tmp/$f.pp pp/$c
  mv $tmp/$f.bed bed/$c
  rm -fr $tmp
  '_EOF_'
      # << happy emacs
      chmod a+x doPhast.csh
  
      #	this template will serve for all runs
      #	root1 == chrom name, file1 == ss file name without .ss suffix
      cat << '_EOF_' > template
  #LOOP
  ../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
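    # the three numeric arguments land in doPhast.csh as len=45 cov=0.3
    # rho=0.3, i.e. --expected-length 45 --target-coverage 0.3 --rho 0.3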
  
      ls -1S ../ss/chr*/chr* | sed -e "s/.ss$//" > ss.list
  
      # Create parasol batch and run it
      ############################ run for all species
      cd /hive/data/genomes/hg19/bed/multiz100way/cons
      mkdir all
      cd all
      cp -p ../../4d/all.mod ./all.mod
  
      gensub2 ../run.cons/ss.list single ../run.cons/template jobList
      para -ram=8g create jobList
      para try ... check ... push ... etc.
  # Completed: 377 of 377 jobs
  # CPU time in finished jobs:     124523s    2075.38m    34.59h    1.44d  0.004 y
  # IO & Wait Time:                 28923s     482.06m     8.03h    0.33d  0.001 y
  # Average job time:                 407s       6.78m     0.11h    0.00d
  # Longest finished job:             873s      14.55m     0.24h    0.01d
  # Submission to last job:          4792s      79.87m     1.33h    0.06d
      # second time 2014-02-09:
  # Completed: 377 of 377 jobs
  # CPU time in finished jobs:     123749s    2062.49m    34.37h    1.43d  0.004 y
  # IO & Wait Time:                 32295s     538.24m     8.97h    0.37d  0.001 y
  # Average job time:                 414s       6.90m     0.11h    0.00d
  # Longest finished job:             726s      12.10m     0.20h    0.01d
  # Submission to last job:          2332s      38.87m     0.65h    0.03d
  
      # create Most Conserved track
      cd /hive/data/genomes/hg19/bed/multiz100way/cons/all
      time cut -f1 ../../../../chrom.sizes | while read C
  do
      ls -d bed/${C} 2> /dev/null | while read D
      do
          cat ${D}/${C}*.bed
      done | sort -k1,1 -k2,2n \
      | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
  done > tmpMostConserved.bed
      # real    1m31.673s
  
      # -rw-rw-r--  1 346958126 Feb  9 20:19 tmpMostConserved.bed
      # -rw-rw-r--  1 346487128 Sep 15 21:03 tmpMostConserved.bed
  
      time /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed \
          > mostConserved.bed
      # real    1m25.027s
  
      # -rw-rw-r--  1 355755569 Feb  9 20:21 mostConserved.bed
      # -rw-rw-r--  1 355270467 Sep 15 21:07 mostConserved.bed
  
      # load into database
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/multiz100way/cons/all
      time nice -n +19 hgLoadBed hg19 phastConsElements100way mostConserved.bed
      # second time 2014-02-09:
      # Read 10121223 elements of size 5 from mostConserved.bed
      # real    1m57.217s
  
      # Read 10107363 elements of size 5 from mostConserved.bed
      # real    1m56.508s
  
      # Try for 5% overall cov, and 70% CDS cov
      time featureBits hg19 -enrichment refGene:cds phastConsElements100way
      #	--rho 0.3 --expected-length 45 --target-coverage 0.3
      # second time 2014-02-09:
      #	refGene:cds 1.211%, phastConsElements100way 5.603%, both 0.849%,
      #	  cover 70.12%, enrich 12.51x
      #   refGene:cds 1.208%, phastConsElements100way 5.601%, both 0.847%,
      #     cover 70.11%, enrich 12.52x
      #  real    1m18.381s
  
      time featureBits hg19 -enrichment ensGene:cds phastConsElements100way
      #   ensGene:cds 1.271%, phastConsElements100way 5.601%, both 0.860%,
      #      cover 67.67%, enrich 12.08x
      #   real    1m15.636s
  
    # first time:
    time featureBits hg19 -enrichment ensGene:cds phastConsElements100way
      #   ensGene:cds 1.268%, phastConsElements100way 5.601%, both 0.864%,
      #      cover 68.15%, enrich 12.17x
      #   real    1m8.520s
  
    # Create merged posterior probability file and wiggle track data files
      cd /hive/data/genomes/hg19/bed/multiz100way/cons/all
      mkdir downloads
  
      time for D in `ls -d pp/chr* | sed -e 's#pp/##'`
  do
      echo "working: $D"
      find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
  	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
          | gzip -c > downloads/${D}.phastCons100way.wigFix.gz
  done
      # real    29m38.561s
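    # (the sed pairs are a sort trick: '.' and '-' in chunk names like
    # chr1.10000000-20000000.pp temporarily become ' d ' and ' m ' so
    # the start coordinate is a separate field for the numeric sort,
    # then the names are restored before the cat)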
  
      #	encode those files into wiggle data
      time (zcat downloads/*.wigFix.gz \
  	| wigEncode stdin phastCons100way.wig phastCons100way.wib)
      #   Converted stdin, upper limit 1.00, lower limit 0.00
      #   real    13m12.190s
      du -hsc *.wi?
      # second time around 2014-02-09:
      # 2.7G    phastCons100way.wib
      # 273M    phastCons100way.wig
      # 3.0G    total
  
      # first time:
      # 2.7G    ../../cons.0/all/phastCons100way.wib
      # 412M    ../../cons.0/all/phastCons100way.wig
      # 3.1G    total
  
    #	encode into a bigWig file:
    #	(warning: the wigToBigWig process may exceed the default 32 Gb
    #	shell memory limit; set 180 Gb here to avoid that)
  sizeG=188743680
  export sizeG
  ulimit -d $sizeG
  ulimit -v $sizeG
      time (zcat downloads/*.wigFix.gz \
          | wigToBigWig stdin ../../../../chrom.sizes phastCons100way.bw)
      #   real    38m47.061s
      #   -rw-rw-r--  1 5777277074 Feb  9 23:49 phastCons100way.bw
      #   -rw-rw-r--  1 5897970469 Sep 15 21:40 phastCons100way.bw
      bigWigInfo phastCons100way.bw
      # second time around 2014-02-09
  version: 4
  isCompressed: yes
  isSwapped: 0
  primaryDataSize: 3,942,673,355
  primaryIndexSize: 90,344,600
  zoomLevels: 10
  chromCount: 92
  basesCovered: 2,857,876,073
  mean: 0.101765
  min: 0.000000
  max: 1.000000
  std: 0.237072
  
# first time:
version: 4
  isCompressed: yes
  isSwapped: 0
  primaryDataSize: 4,052,086,370
  primaryIndexSize: 137,792,116
  zoomLevels: 10
  chromCount: 92
  basesCovered: 2,832,683,483
  mean: 0.102504
  min: 0.000000
  max: 1.000000
  std: 0.237858
  
      #	if you wanted to use the bigWig file, loading bigWig table:
      #   but we don't use the bigWig file
      mkdir /gbdb/hg19/bbi
      ln -s `pwd`/phastCons100way.bw /gbdb/hg19/bbi
      hgsql hg19 -e 'drop table if exists phastCons100way; \
              create table phastCons100way (fileName varchar(255) not null); \
              insert into phastCons100way values
  	("/gbdb/hg19/bbi/phastCons100way.bw");'
  
      # Load gbdb and database with wiggle.
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/multiz100way/cons/all
      ln -s `pwd`/phastCons100way.wib /gbdb/hg19/multiz100way/phastCons100way.wib
      time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz100way hg19 \
  	phastCons100way phastCons100way.wig
      #   real    0m49.130s
  
      time wigTableStats.sh hg19 phastCons100way
      # real    0m20.097s
  # second time around 2014-02-09
  # db.table      min max mean count sumData
  # hg19.phastCons100way    0 1 0.101765 2857876073 2.90832e+08
  # 0.237072 viewLimits=0:1
  
# first time:
# db.table      min max mean count sumData
  # hg19.phastCons100way 0 1 0.102504 2832683483 2.9036e+08
  #       stdDev viewLimits
  #    0.237858 viewLimits=0:1
  
      #  Create histogram to get an overview of all the data
      ssh hgwdev
      cd /hive/data/genomes/hg19/bed/multiz100way/cons/all
      time nice -n +19 hgWiggle -doHistogram -db=hg19 \
  	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
  	    phastCons100way > histogram.data 2>&1
      #	real    2m38.825s
  
      #	create plot of histogram:
  
      cat << '_EOF_' | gnuplot > histo.png
  set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
  set size 1.4, 0.8
  set key left box
  set grid noxtics
  set grid ytics
  set title " Human Hg19 Histogram phastCons100way track"
  set xlabel " phastCons100way score"
  set ylabel " Relative Frequency"
  set y2label " Cumulative Relative Frequency (CRF)"
  set y2range [0:1]
  set y2tics
  set yrange [0:0.02]
  
  plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
          "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
  '_EOF_'
      #	<< happy emacs
      # complains about font, but makes the png image:
  # Could not find/open font when opening font "arial", using internal non-scalable font
  
  
      display histo.png &
  
  #########################################################################
  # phyloP conservation for 100-way (DONE - 2012-06-15 - 2012-08-21 - Hiram)
  #
  # all vertebrates
  #
      # split SS files into 1M chunks, this business needs smaller files
      #   to complete
  
    # many of these jobs use too much memory to finish on a kluster node;
    # all of this can be run on hgwdev instead
  
      mkdir /hive/data/genomes/hg19/bed/multiz100way/consPhyloP
      cd /hive/data/genomes/hg19/bed/multiz100way/consPhyloP
      mkdir ss run.split
      cd run.split
  
      cat << '_EOF_' > doSplit.csh
  #!/bin/csh -ef
  set c = $1
  set MAF = /hive/data/genomes/hg19/bed/multiz100way/anno/result/$c.maf
  set WINDOWS = /hive/data/genomes/hg19/bed/multiz100way/consPhyloP/ss/$c
  set WC = `cat $MAF | wc -l`
  set NL = `grep "^#" $MAF | wc -l`
  if ( -s $2 ) then
      exit 0
  endif
  if ( -s $2.running ) then
      exit 0
  endif
  
  date >> $2.running
  
  rm -fr $WINDOWS
  mkdir -p $WINDOWS
  pushd $WINDOWS > /dev/null
  if ( $WC != $NL ) then
  /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
      $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000
  endif
  popd > /dev/null
  date >> $2
  rm -f $2.running
  '_EOF_'
  # << happy emacs
  
      #	do the easy ones first to see some immediate results
      ls -1S -r ../../anno/result | sed -e "s/.maf//;" > maf.list
  
    # this needs a {check out line+ $(root1).done} test for verification:
      cat << '_EOF_' > template
  #LOOP
  ./doSplit.csh $(root1) $(root1).done
  #ENDLOOP
  '_EOF_'
  # << happy emacs
  
      gensub2 maf.list single template jobList
    # copy the jobList to runEm.sh, and edit it so the commands run in
    #   the background with wait statements every few commands, keeping
    #   only a small number going at once: no more than four at a time
    #   for the large chroms; the small randoms can run in bunches since
    #   they finish quickly.
      time ./runEm.sh
      # real 1140m47.332s = 19h 0m 47.332s
      # second time around, combination of a ku batch and 8 jobs on hgwdev,
      # last 8 jobs:
      time ./lastBits.sh > lastBits.log 2>&1
      # real    391m46.037s
  
      # run phyloP with score=LRT
      ssh ku
      cd /cluster/data/hg19/bed/multiz100way/consPhyloP
      mkdir run.phyloP
      cd run.phyloP
  
      # Adjust model file base composition background and rate matrix to be
      # representative of the chromosomes in play
      cp -p ../../4d/all.mod ./all.mod
      grep BACKGROUND ../../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
      #	0.513
      /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
  	../../4d/all.mod 0.513 > all.mod
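    #	(fields 3 and 4 of the BACKGROUND line are the C and G
    #	equilibrium frequencies, so the 0.513 handed to modFreqs is
    #	the GC fraction of the 4d model)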
  
      cat << '_EOF_' > doPhyloP.csh
  #!/bin/csh -fex
  set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
  set f = $1
  set ssFile = $1:t
  echo "ssFile: $ssFile"
  set out = $2
  set cName = $f:h
  echo "cName: $cName"
  set n = $f:r:e
  set grp = $cwd:t
  set cons = /hive/data/genomes/hg19/bed/multiz100way/consPhyloP
  set tmp = $cons/tmp/$grp/$f
  rm -fr $tmp
  mkdir -p $tmp
  set ssSrc = "$cons/ss/$cName/$ssFile"
  set useGrp = "$grp.mod"
  ln -s $cons/run.phyloP/$grp.mod $tmp
  pushd $tmp > /dev/null
  echo source: $ssSrc.ss
  $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \
      -i SS $useGrp $ssSrc.ss > $ssFile.wigFix
  popd > /dev/null
  mkdir -p $out:h
  sleep 4
  mv $tmp/$ssFile.wigFix $out
  rm -fr $tmp
  '_EOF_'
      # << happy emacs
      chmod +x doPhyloP.csh
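    # notes on the options: --mode CONACC produces signed scores,
    # positive for conservation and negative for acceleration;
    # --wig-scores emits fixedStep wiggle; --method LRT scores with a
    # likelihood ratio test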
  
      # Create list of chunks
      find ../ss -type f | sed -e "s/.ss$//; s#../ss/##;" > ss.list
  
      # Create template file
      #	file1 == $chr/$chunk/file name without .ss suffix
      cat << '_EOF_' > template
  #LOOP
  ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
  #ENDLOOP
  '_EOF_'
      # << happy emacs
  
      ######################   Running all species  #######################
      # setup run for all species
      mkdir /hive/data/genomes/hg19/bed/multiz100way/consPhyloP/all
      cd /hive/data/genomes/hg19/bed/multiz100way/consPhyloP/all
      rm -fr wigFix
      mkdir wigFix
  
      gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
      para create jobList
      para try ... check ... push ... etc ...
      para -maxJob=100 push
      para time
  # Completed: 3010 of 3010 jobs
  # CPU time in finished jobs:    5672403s   94540.06m  1575.67h   65.65d  0.180 y
  # IO & Wait Time:                 51879s     864.64m    14.41h    0.60d  0.002 y
  # Average job time:                1902s      31.70m     0.53h    0.02d
  # Longest finished job:            2889s      48.15m     0.80h    0.03d
  # Submission to last job:         58824s     980.40m    16.34h    0.68d
  
      ssh hgwdev
    cd /cluster/data/hg19/bed/multiz100way/consPhyloP/all
      mkdir downloads
      for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'`
  do
      echo "working: $D"
      find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
  	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
          | gzip -c > downloads/${D}.phyloP100way.wigFix.gz
  done
      #   real    97m3.368s
  
      time (zcat downloads/*.wigFix.gz \
      | wigEncode stdin phyloP100way.wig phyloP100way.wib > wigEncode.log 2>&1) &
      #   Converted stdin, upper limit 9.87, lower limit -20.00
      #   real    17m1.747s
  
      # if wigToBigWig runs out of memory, increase the shell memory limits:
      #  200Gb == 200*(1024^2) = 209715200
  export sizeG=209715200
  ulimit -d $sizeG
  ulimit -v $sizeG
      time (zcat downloads/*.wigFix.gz \
          | wigToBigWig stdin ../../../../chrom.sizes phyloP100way.bw)
      #   real    61m35.698s
  
      bigWigInfo phyloP100way.bw
      # second time 2014-02-11:
  version: 4
  isCompressed: yes
  isSwapped: 0
  primaryDataSize: 7,386,377,587
  primaryIndexSize: 90,385,252
  zoomLevels: 10
  chromCount: 92
  basesCovered: 2,857,876,073
  mean: 0.105362
  min: -20.000000
  max: 9.873000
  std: 1.026658
  
# first time:
version: 4
  isCompressed: yes
  isSwapped: 0
  primaryDataSize: 7,520,567,316
  primaryIndexSize: 137,824,308
  zoomLevels: 10
  chromCount: 92
  basesCovered: 2,832,683,483
  mean: 0.108207
  min: -20.000000
  max: 9.869000
  std: 1.026550
  
      #	if you wanted to use the bigWig file, loading bigWig table:
      ln -s `pwd`/phyloP100way.bw /gbdb/hg19/bbi
      hgsql hg19 -e 'drop table if exists phyloP100wayAll; \
              create table phyloP100wayAll \
  		(fileName varchar(255) not null); \
              insert into phyloP100wayAll values
  	("/gbdb/hg19/bbi/phyloP100way.bw");'
  
      #	loading the wiggle table:
      ln -s `pwd`/phyloP100way.wib /gbdb/hg19/multiz100way
      time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz100way hg19 \
  	phyloP100wayAll phyloP100way.wig
      #   real    1m14.834s
  
      wigTableStats.sh hg19 phyloP100wayAll
      # second time 2014-02-11:
  # db.table      min max mean count sumData
  # hg19.phyloP100wayAll    -20 9.873 0.105362 2857876073 3.01112e+08
  #	stdDev viewLimits
  #      1.02666 viewLimits=-5.02793:5.23865
      #	that range is: 20 + 9.873 = 29.873 for -hBinSize=0.29873 below
      #   to get 1,000 bins
  
      # first time:
  # db.table      min max mean count sumData
  # hg19.phyloP100wayAll -20 9.869 0.108207 2832683483 3.06515e+08
  #	stdDev viewLimits
  #      1.02655 viewLimits=-5.02454:5.24096
      #	that range is: 20 + 9.869 = 29.869 for -hBinSize=0.29869 below
      #   to get 1,000 bins
  
      #  Create histogram to get an overview of all the data
      time nice -n +19 hgWiggle -doHistogram \
  	-hBinSize=0.29873 -hBinCount=1000 -hMinVal=-20 -verbose=2 \
  	    -db=hg19 phyloP100wayAll > histogram.data 2>&1
      #   real    4m24.071s
  
      #	create plot of histogram:
  # this gnuplot issues an error:
  # Could not find/open font when opening font "arial", using internal non-scalable font
  
      cat << '_EOF_' | gnuplot > histo.png
  set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
  set size 1.4, 0.8
  set key left box
  set grid noxtics
  set grid ytics
  set title " Human Hg19 Histogram phyloP100way track, all 100 vertebrates"
  set xlabel " phyloP100way score, all 100 vertebrates"
  set ylabel " Relative Frequency"
  set y2label " Cumulative Relative Frequency (CRF)"
  set y2range [0:1]
  set y2tics
  set yrange [0:0.2]
  set xrange [-2:2]
  
  plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
          "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
  '_EOF_'
      #	<< happy emacs
  
      display histo.png &
  
  #########################################################################
# construct download files for 100-way (DONE - 2012-06-27 - 2012-08-21 - Hiram)
  mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/multiz100way
  mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/multiz100way/maf
  mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/multiz100way/alignments
  mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/phastCons100way
  mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/phastCons100way/hg19.100way.phastCons
  mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/phyloP100way
  mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/phyloP100way/hg19.100way.phyloP100way
  
      mkdir /hive/data/genomes/hg19/bed/multiz100way/downloads
      cd /hive/data/genomes/hg19/bed/multiz100way/downloads
      mkdir multiz100way phastCons100way phyloP100way
      cd multiz100way
      mkdir maf alignments
      cd maf
      time rsync -a -P ../../../anno/result/ ./
      #   real    322m58.633s
      time gzip *.maf
      #   real    515m4.515s
      time md5sum *.maf.gz > md5sum.txt
      #   real    10m6.351s
      ln -s `pwd`/*.maf.gz `pwd`/md5sum.txt \
          /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/multiz100way/maf
      cd ..
      du -hsc maf ../../anno/result/
      #    68G     maf
      #    722G    ../../anno/result/
      #    789G    total
  
      grep TREE ../../4d/all.mod | sed -e 's/TREE: //' \
         | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
            > hg19.100way.nh
  
     ~/kent/src/hg/utils/phyloTrees/commonNames.sh hg19.100way.nh \
         | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
            > hg19.100way.commonNames.nh
     ~/kent/src/hg/utils/phyloTrees/scientificNames.sh hg19.100way.nh \
         | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
            > hg19.100way.scientificNames.nh
      md5sum *.gz *.nh > md5sum.txt
  
      ln -s `pwd`/*.nh `pwd`/*.txt `pwd`/*.maf.gz \
          /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/multiz100way
  
  #####################################################################
      cd /hive/data/genomes/hg19/bed/multiz100way/downloads/phastCons100way
      mkdir hg19.100way.phastCons
      cd hg19.100way.phastCons
      ln -s ../../../cons/all/downloads/chr*.gz .
      time md5sum *.gz > md5sum.txt
      # real    2m9.045s
  
      ln -s `pwd`/*.gz `pwd`/md5sum.txt \
    /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/phastCons100way/hg19.100way.phastCons
      #   real    6m11.158s
      cd ..
      ln -s ../../cons/all/all.mod hg19.100way.phastCons.mod
      ln -s ../../cons/all/phastCons100way.bw hg19.100way.phastCons.bw
      time md5sum *.mod *.bw > md5sum.txt
      #  real    3m4.462s
      # obtain the README.txt from mm10/phastCons60way and update for this
      #   situation, from:
  # /hive/data/genomes/mm10/bed/multiz60way/downloads/phastCons60way/README.txt
      ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/phastCons100way
  
      #####################################################################
      cd /hive/data/genomes/hg19/bed/multiz100way/downloads/phyloP100way
      mkdir hg19.100way.phyloP100way
      cd hg19.100way.phyloP100way
      ln -s ../../../consPhyloP/all/downloads/chr*.gz .
      time md5sum *.gz > md5sum.txt &
      # real    3m37.813s
  
      ln -s `pwd`/*.gz `pwd`/md5sum.txt \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/phyloP100way/hg19.100way.phyloP100way
  
      cd ..
      ln -s ../../consPhyloP/run.phyloP/all.mod hg19.100way.phyloP100way.mod
  
      ln -s ../../consPhyloP/all/phyloP100way.bw hg19.100way.phyloP100way.bw
  
      time md5sum *.mod *.bw > md5sum.txt &
      #  real    4m44.724s
  
      # obtain the README.txt from mm10/phyloP60way and update for this
      #   situation, from:
  # /hive/data/genomes/mm10/bed/multiz60way/downloads/phyloP60way/README.txt
      ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/phyloP100way
  
      ###########################################################################
      ## create upstream refGene maf files
      cd /hive/data/genomes/hg19/bed/multiz100way/downloads/multiz100way
      # bash script
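    # for each size S, featureBits emits the S-bp upstream region of every
    # Ensembl gene as BED (the perl strips the _up name suffix and adds a
    # 0 score column), then mafFrags extracts the multiz100way alignment
    # for each region, restricted to the species in species.list.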
  time for S in 1000 2000 5000
  do
      echo "making upstream${S}.maf"
      featureBits hg19 ensGene:upstream:${S} -fa=/dev/null -bed=stdout \
          | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
          | /cluster/bin/$MACHTYPE/mafFrags hg19 multiz100way \
                  stdin stdout \
                  -orgs=/hive/data/genomes/hg19/bed/multiz100way/species.list \
          | gzip -c > upstream${S}.maf.gz
      echo "done upstream${S}.maf.gz"
  done
      #   real    603m51.338s
  
      md5sum *.nh *.maf.gz > md5sum.txt
      #   real    10m6.351s
  
      # obtain the README.txt from mm10/multiz60way and update for this
      #   situation, from:
  # /hive/data/genomes/mm10/bed/multiz60way/downloads/multiz60way/README.txt
      ln -s `pwd`/*.nh `pwd`/*.maf.gz `pwd`/*.txt \
          /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/multiz100way
  
  #############################################################################
  # hgPal downloads (DONE - 2013-09-20 - Hiram)
  # hgPal downloads (redo refGene - 2016-10-10 - braney )
  # hgPal downloads (redo knownGene - 2016-10-04 - braney )
  #   FASTA from 100-way for ensGene
  
      ssh hgwdev
      screen -S hg19HgPal
      mkdir /hive/data/genomes/hg19/bed/multiz100way/pal
      cd /hive/data/genomes/hg19/bed/multiz100way/pal
      cat ../species.list | tr '[ ]' '[\n]' > order.list
  
      export mz=multiz100way
      export gp=knownGene
      export db=hg19
      export I=0
      mkdir exonAA exonNuc
      for C in `sort -nk2 ../../../chrom.sizes | cut -f1`
      do
          I=`echo $I | awk '{print $1+1}'`
  	echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &"
  	echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &"
          if [ $I -gt 6 ]; then
              echo "date"
              echo "wait"
              I=0
          fi
      done > $gp.jobs
      echo "date" >> $gp.jobs
      echo "wait" >> $gp.jobs
  
      time sh -x $gp.jobs > $gp.jobs.log 2>&1 &
      #   real    237m38.310s
  
      export mz=multiz100way
      export gp=knownGene
      export db=hg19
      time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
      #   real    4m34.089s
      time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
      #   real    18m41.606s
      du -hsc exonAA exonNuc *.fa.gz
      # 2.5G    exonAA
      # 4.0G    exonNuc
      # 2.5G    knownGene.multiz100way.exonAA.fa.gz
      # 4.0G    knownGene.multiz100way.exonNuc.fa.gz
  
      rm -rf exonAA exonNuc
  
      # we're only distributing exons at the moment
      export mz=multiz100way
      export gp=knownGene
      export db=hg19
      export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
      mkdir -p $pd
      md5sum *.fa.gz > md5sum.txt
      ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
      ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
      ln -s `pwd`/md5sum.txt $pd/
  
      # running up refGene
      # screen control for long running job business
      screen -S alignments100
      cd /hive/data/genomes/hg19/bed/multiz100way/pal
      export mz=multiz100way
      export gp=refGene
      export db=hg19
      export I=0
      mkdir exonAA exonNuc refGene
      hgsql hg19 -Ne "select concat(g.name,'.',d.version),g.* from refGene g, gbCdnaInfo d where g.name = d.acc" | cut -f 1,4- |  splitFileByColumn /dev/stdin refGene -col=2
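    # the query prefixes each refGene name with its GenBank version
    # (name.version); splitFileByColumn then writes one gene file per
    # chromosome (column 2) into refGene/ for mafGene -useFile below.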
      for C in `sort -nk2 ../../../chrom.sizes | cut -f1`
      do
          I=`echo $I | awk '{print $1+1}'`
  	echo "mafGene -chrom=$C -exons -noTrans -useFile $db $mz refGene/$C order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &"
  	echo "mafGene -chrom=$C -exons -useFile $db $mz refGene/$C order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &"
          if [ $I -gt 6 ]; then
              echo "date"
              echo "wait"
              I=0
          fi
      done > $gp.jobs
      echo "date" >> $gp.jobs
      echo "wait" >> $gp.jobs
  
      time sh -x $gp.jobs > $gp.jobs.log 2>&1
      #   real    173m1.000s
  
      export mz=multiz100way
      export gp=refGene
      export db=hg19
      time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
      #   real    3m34.210s
      time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
      #   real    14m4.669s
      du -hsc exonAA exonNuc refGene*.fa.gz
      # 2.0G    exonAA
      # 3.2G    exonNuc
      # 2.0G    refGene.multiz100way.exonAA.fa.gz
      # 3.2G    refGene.multiz100way.exonNuc.fa.gz
      # 11G     total
  
      rm -rf exonAA exonNuc
  
      # we're only distributing exons at the moment
      export mz=multiz100way
      export gp=refGene
      export db=hg19
      export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
      mkdir -p $pd
      md5sum *.fa.gz > md5sum.txt
      ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
      ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
      ln -s `pwd`/md5sum.txt $pd/
  
      ### And knownCanonical
      cd /hive/data/genomes/hg19/bed/multiz100way/pal
      export mz=multiz100way
      export gp=knownCanonical
      export db=hg19
      mkdir exonAA exonNuc ppredAA ppredNuc knownCanonical
  
      cut -f1 ../../../chrom.sizes | while read C
      do
          echo $C
  	hgsql hg19 -N -e "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$C'" > knownCanonical/$C.known.bed
      done
  
      ls knownCanonical/*.known.bed | while read F
      do
        if [ -s $F ]; then
           echo $F | sed -e 's#knownCanonical/##; s/.known.bed//'
        fi
      done | while read C
      do
  	echo "date"
  	echo "mafGene -geneBeds=knownCanonical/$C.known.bed  $db $mz knownGene order.list stdout | \
  	    gzip -c > ppredAA/$C.ppredAA.fa.gz"
  	echo "mafGene -geneBeds=knownCanonical/$C.known.bed -noTrans $db $mz knownGene order.list stdout | \
  	    gzip -c > ppredNuc/$C.ppredNuc.fa.gz"
  	echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons -noTrans $db $mz knownGene order.list stdout | \
  	    gzip -c > exonNuc/$C.exonNuc.fa.gz"
  	echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons $db $mz knownGene order.list stdout | \
  	    gzip -c > exonAA/$C.exonAA.fa.gz"
      done > $gp.$mz.jobs
  
      time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1
      # real    1033m6.639s
  
    rm knownCanonical/*.known.bed
      mz=multiz100way
      gp=knownCanonical
      db=hg19
      zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz &
      zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz &
      zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz &
      zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
  
      rm -rf exonAA exonNuc ppredAA ppredNuc
  
      mz=multiz100way
      gp=knownCanonical
      db=hg19
      pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
      mkdir -p $pd
      ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
      ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
      cd $pd
      md5sum *.fa.gz > md5sum.txt
  
  #########################################################################
  # sperm whale/phyCat1 Lastz run  (DONE - 2013-10-05 braney)
      screen -S phyCat1    # use screen to manage this long running job
      mkdir /hive/data/genomes/hg19/bed/lastzPhyCat1.2013-10-05
      cd  /hive/data/genomes/hg19/bed/lastzPhyCat1.2013-10-05
      cat << '_EOF_' > DEF
  # human vs sperm whale
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: sperm whale PhyCat1
  SEQ2_DIR=/hive/data/genomes/phyCat1/phyCat1.2bit
  SEQ2_LEN=/hive/data/genomes/phyCat1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=50
  
  BASE=/hive/data/genomes/hg19/bed/lastzPhyCat1.2013-10-05
  TMPDIR=/dev/shm
  '_EOF_'
      # << emacs
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > do.log 2>&1
      # real    779m50.178s
      # forgot to load up phyCat1 database for net repeat classification
      # finish load step manually, then:
  
      cat fb.hg19.chainPhyCat1Link.txt
      #  1521042352 bases of 2897316137 (52.498%) in intersection
  
      time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -continue=download -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > download.log 2>&1
      # real    32m10.340s
  
      # create symLink to indicate this is the version to use
      cd /hive/data/genomes/hg19/bed
      ln -s lastzPhyCat1.2013-10-05 lastz.phyCat1
  
      cd /hive/data/genomes/hg19/bed/lastzPhyCat1.2013-10-05
      # filter with doRecipBest.pl
      time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 phyCat1 > rbest.log 2>&1 &
      #   real    59m7.123s
  
      # running the swap
      mkdir /hive/data/genomes/phyCat1/bed/blastz.hg19.swap
      cd /hive/data/genomes/phyCat1/bed/blastz.hg19.swap
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzPhyCat1.2013-10-05/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet > swap.log 2>&1
      #   real    102m56.234s
  
      cat fb.phyCat1.chainHg19Link.txt
      #    1455933862 bases of 2233689186 (65.181%) in intersection
  
      cd /hive/data/genomes/phyCat1/bed
      ln -s blastz.hg19.swap lastz.hg19
  
  
  #########################################################################
  # ClinVar Variants track (2013-10-09 - Max)
  wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz -O - | gunzip > variant_summary.txt
  echo WARNING: For hg19 we remove all chrM entries.
# keep only GRCh37 variants and shorten the name
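# stage by stage: keep rows mapped to the GRCh37 assemblies and reorder
# columns to chrom,start,stop,name,...; temporarily collapse ", " and ";"
# list separators to bare commas; make the stop coordinate half-open for BED
# ($3=$3+1) and trim the name to its first word, falling back to the nsv ID
# when empty; drop the header line, sort by position, drop chrM, and restore
# the ", " separators.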
  less variant_summary.txt  |  tawk '($13=="GRCh37" || $13=="GRCh37.p9") {print "chr"$14,$15,$16,$3,$9,$2,$4,$5,$6,$7,$8,$10,$11,$12,$13,$17,$18,$19,$20,$21,$23,$24}'  | sed -e 's/, /,/g' | sed -e 's/;/,/g' | tawk '{$3=$3+1; split($4, a, " "); $4=a[1]; if ($4=="") {$4=$11}; print}' | grep -v ClinicalSignificance | sort -k1,1 -k2,2n | grep -v chrM | sed -e 's/,/, /g' > clinvarFull.bed
  # separate into two subtracks
  cat clinvarFull.bed | egrep 'copy number (loss|gain)' > clinvarCnv.bed
  cat clinvarFull.bed | grep -v nsv > clinvarMain.bed
  # convert to bb
  cat << '_EOF_' > clinvar.as
  table clinVarBed
  "Browser extensible data (4 fields) plus information about a ClinVar entry"
      (
      string chrom;        "Chromosome (or contig, scaffold, etc.)"
      uint   chromStart;   "Start position in chromosome"
      uint   chromEnd;     "End position in chromosome"
      string name;         "Name of item"
      string rcvAcc;         "ClinVar ID"
      string type;         "Type of Variant"
      string geneId;         "NCBI Entrez Gene ID"
      string geneSym;         "NCBI Entrez Gene Symbol"
    string clinSign;         "Clinical significance"
      string snpId;         "dbSNP ID"
      string nsvId;         "dbVar ID"
      string testedInGtr;         "Genetic Testing Registry"
      lstring phenotype;         "Phenotype identifiers"
      string origin;         "Data origin"
      string assembly;         "Genome assembly"
      string cytogenetic;         "Cytogenetic status"
      string reviewStatus;         "Review status"
      string hgvsCod;         "coding HGVS"
      string hgvsProt;         "protein HGVS"
      string lastEval;         "last evaluation"
      string guidelines;         "guidelines"
      lstring otherIds;         "other identifiers (OMIM)"
      )
  '_EOF_'
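# clinvar.as declares the 4 standard BED fields plus 18 extra columns,
# which is what -type=bed4+18 below refers to.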
  
  bedToBigBed clinvarMain.bed /scratch/data/hg19/chrom.sizes clinvarMain.bb -type=bed4+18 -tab -as=clinvar.as
  bedToBigBed clinvarCnv.bed /scratch/data/hg19/chrom.sizes clinvarCnv.bb -type=bed4+18 -tab -as=clinvar.as
  cp clinvarMain.bb /hive/data/genomes/hg19/bed/clinvar/
  cp clinvarCnv.bb /hive/data/genomes/hg19/bed/clinvar/
  
  #########################################################################
  # LOCUS REFERENCE GENOMIC (LRG) REGIONS AND TRANSCRIPTS (DONE 10/25/19 angie)
  # Redmine #13359, #24285 -- otto-mate To Do #17877
  # previously done 7/7/14, 9/9/16, 5/30/18
      screen -S lrg -t lrg
      set today = `date +%Y_%m_%d`
      mkdir -p /hive/data/genomes/hg19/bed/lrg/$today
      cd /hive/data/genomes/hg19/bed/lrg/$today
      wget ftp://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_public_xml_files.zip
      unzip LRG_public_xml_files.zip
      # The .atree file was useful for getting a handle on the hierarchy and types of nodes:
      # autoDtd LRG_1.xml lrg.dtd lrg.stats -atree=lrg.atree
  
      # Run script to convert LRG*.xml files to BED+ for regions and genePredExt+fa for transcripts:
      ~/kent/src/hg/utils/automation/parseLrgXml.pl GRCh37
      genePredCheck lrgTranscriptsUnmapped.gp
  #Error: lrgTranscriptsUnmapped.gp:765: LRG_7t1 no exonFrame on CDS exon 46
  #checked: 1029 failed: 1
      # If there are complaints e.g. about exonFrame, look for inconsistencies in the
      # affected transcript's coding_region/coordinates vs. exon/intron info in xml.
      # Contact Variation team leader Fiona Cunningham @EBI to resolve in the background
      # (missing exonFrame info doesn't affect our track representation because we end up using
      # psl).  We agreed to disagree about exon 46 of LRG_7t1 because that last coding exon
      # portion is only the stop codon.
  
      # hg19 has patches on hgwdev but not on the RR, and the patches may remain on hgwdev.
      # To avoid confusion, exclude patch sequences for now; if we release patches, rebuild
      # LRG tracks without this part.
      mv lrg.bed lrg.allSeqs.bed
      cut -f 1 ../../../chrom.sizes.initial | grep -Fwf - lrg.allSeqs.bed > lrg.bed
      wc -l lrg*bed
  #   930 lrg.allSeqs.bed
  #   888 lrg.bed
  
      # Load LRG regions:
      bedToBigBed lrg.bed /hive/data/genomes/hg19/chrom.sizes lrg.bb \
        -tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name
      ln -sf `pwd`/lrg.bb /gbdb/hg19/bbi/lrg.bb
      hgBbiDbLink hg19 lrg /gbdb/hg19/bbi/lrg.bb
  
      # Map LRG fixed_annotation transcripts from LRG coords to hg19 coords (HT MarkD):
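    # Three steps: lrgToPsl builds a PSL of the LRG regions on hg19,
    # genePredToFakePsl makes a PSL of each transcript against its LRG
    # sequence, and pslMap composes the two; mrnaToGene then recovers a
    # genePred from the mapped PSL plus the CDS file.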
      lrgToPsl lrg.bed /hive/data/genomes/hg19/chrom.sizes lrg.psl
      pslCheck lrg.psl
  #checked: 888 failed: 0 errors: 0
      awk '{print $10 "\t" $11;}' lrg.psl > lrg.sizes
      genePredToFakePsl -chromSize=lrg.sizes placeholder \
        lrgTranscriptsUnmapped.gp lrgTranscriptsFakePsl.psl lrgTranscripts.cds
      pslMap lrgTranscriptsFakePsl.psl lrg.psl lrgTranscriptsHg19.psl
      mrnaToGene -genePredExt -cdsFile=lrgTranscripts.cds -keepInvalid \
        lrgTranscriptsHg19.psl lrgTranscriptsHg19NoName2.gp
  #Warning: no CDS for LRG_163t1
  #Warning: no CDS for LRG_347t1
      # It's OK if mrnaToGene complains about "no CDS" for a non-coding tx (RefSeq accession NR_*).
      grep -l NR_ LRG_163.xml LRG_347.xml
  #LRG_163.xml
  #LRG_347.xml
  
      # Load PSL, CDS and sequences.
      hgLoadPsl hg19 -table=lrgTranscriptAli lrgTranscriptsHg19.psl
      hgLoadSqlTab hg19 lrgCds ~/kent/src/hg/lib/cdsSpec.sql lrgTranscripts.cds
      hgPepPred hg19 tab lrgCdna lrgCdna.tab
      hgPepPred hg19 tab lrgPep lrgPep.tab
  
      # OPTIONAL (only done for initial track development)
      # For a rough comparison of mapping methods, and to get some extra error-checking on the PSL
      # from the chain code, try this too:
      pslToChain lrg.psl lrg.chain
      chainSwap lrg.chain lrgMap.chain
      liftOver -genePred lrgTranscriptsUnmapped.gp lrgMap.chain lrgTLiftOver.gp noMap
      # The noMap file has a few "Boundary problem" errors because liftOver doesn't have
      # sophisticated handling of exonFrames and indels.  Also, liftOver carries over exonFrames
      # as-is without regard to strand, so they end up being reversed for LRG's on the - strand.
      # That's why we're using MarkD's method above!
      # The resulting genePredExt from that process is missing the name2 column from
      # the original; add it back:
      cut -f 1,12 lrgTranscriptsUnmapped.gp > txName2
      join -j 1 -a 1 -t'	' \
        -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,2.2,1.13,1.14,1.15 \
        lrgTranscriptsHg19NoName2.gp txName2 > lrgTranscriptsHg19.gp
      # Ignore exonFrames and compare:
      awk '$1 != "#" {print $1;}' noMap
      cut -f 1-14 lrgTranscriptsHg19.gp | sed -re 's/unk/none/g;' > t1
      cut -f 1-14 lrgTLiftOver.gp > t2
      diff t1 t2 | awk '{print $1, $2;}' | uniq
      # Hopefully there won't be differences aside from the ones liftOver couldn't map.
      rm t1 t2
  
  
  #########################################################################
  # phastBias gBGC track (2013-10-29 - Galt)
  #   RM #11697 data received from Katie Pollard and Adam Siepel
  #   GC-Biased Gene Conversion in the Human and Chimpanzee Genomes
  #   Data consists of tracts of gBGC regions identified in a BED and bigWig on
  #   hg19.  This data was recreated on hg19, it is not lifted from hg18.
  #   Paper: http://www.plosgenetics.org/article/info%3Adoi%2F10.1371%2Fjournal.pgen.1003684
  #   Data:  http://compgen.bscb.cornell.edu/~mt269/gBGC_tracks/
  #
  cd /hive/data/genomes/hg19/bed
  mkdir phastBias
  cd phastBias
  
  wget -O phastBiasTracts3.bed 'http://compgen.bscb.cornell.edu/~mt269/gBGC_tracks/phastBias_hg19_tracts.bed'
  
  # the bigWig file is 17GB and using wget would take 2.5 hours.
  # since the file is coming from New York, we can use paraFetch to reduce the
  # download time significantly:
  paraFetch -progress 10 3 'http://compgen.bscb.cornell.edu/~mt269/gBGC_tracks/phastBias_hg19_posterior.bw' phastBiasPosteriors3.bw
  # it took 1.4 hours and ran at 3MB/sec. It still saved over an hour, but I was
  # hoping for even better results.  Maybe 10 is too many connections for their
  # server.
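# (sanity check: 17 GB at ~3 MB/sec is 17*1024/3 = ~5800 sec, about 1.6
# hours, roughly in line with the 1.4 hours observed.)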
  
  hgLoadBed hg19 phastBiasTracts3 phastBiasTracts3.bed
  
  ln -s /hive/data/genomes/hg19/bed/phastBias/phastBiasPosteriors3.bw /gbdb/hg19/bbi/
  
  hgBbiDbLink hg19 phastBiasPosteriors3 /gbdb/hg19/bbi/phastBiasPosteriors3.bw
  
  # see hg18.txt makedoc for details on how the .ra and .html files were
  # downloaded and installed at the human level.
  
  #############################################################################
  # EVS Exome Variants from ESP 6500 Exome  (Redmine 9329) 2013-11-08
  # Max previously downloaded all files from http://evs.gs.washington.edu/EVS/ {download tab}
  # to /hive/data/outside/evs
  cd /hive/data/outside/evs
  ll
  -rw-rw-r-- 1 max      protein  132832601 Oct  7 11:12 ESP6500SI-V2-SSA137.dbSNP138-rsIDs.snps_indels.txt.tar.gz
  -rw-rw-r-- 1 max      protein  124718926 Oct  7 11:12 ESP6500SI-V2-SSA137.dbSNP138-rsIDs.snps_indels.vcf.tar.gz
  -rw-rw-r-- 1 max      protein  416261389 Oct  7 11:12 ESP6500SI-V2.coverage.all_sites.txt.tar.gz
  -rw-rw-r-- 1 max      protein   13192522 Oct  7 11:12 ESP6500SI-V2.coverage.seq_blocks.txt.tar.gz
  -rw-rw-r-- 1 max      protein    6065184 Oct  7 11:12 agilent_nimblegen_exome_targets_esp_project.tar.gz
tar -xzf ESP6500SI-V2-SSA137.dbSNP138-rsIDs.snps_indels.vcf.tar.gz
  ll
  total 3205904
  -rw-rw-r-- 1 max      protein  132832601 Oct  7 11:12 ESP6500SI-V2-SSA137.dbSNP138-rsIDs.snps_indels.txt.tar.gz
  -rw-rw-r-- 1 max      protein  124718926 Oct  7 11:12 ESP6500SI-V2-SSA137.dbSNP138-rsIDs.snps_indels.vcf.tar.gz
  -rw-r--r-- 1 tdreszer genecats  93987403 Aug 26 14:29 ESP6500SI-V2-SSA137.updatedRsIds.chr1.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  38335017 Aug 26 14:32 ESP6500SI-V2-SSA137.updatedRsIds.chr10.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  56868902 Aug 26 14:32 ESP6500SI-V2-SSA137.updatedRsIds.chr11.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  49327149 Aug 26 14:32 ESP6500SI-V2-SSA137.updatedRsIds.chr12.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  15843136 Aug 26 14:33 ESP6500SI-V2-SSA137.updatedRsIds.chr13.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  30776581 Aug 26 14:33 ESP6500SI-V2-SSA137.updatedRsIds.chr14.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  31170853 Aug 26 14:33 ESP6500SI-V2-SSA137.updatedRsIds.chr15.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  44270933 Aug 26 14:33 ESP6500SI-V2-SSA137.updatedRsIds.chr16.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  57017413 Aug 26 14:33 ESP6500SI-V2-SSA137.updatedRsIds.chr17.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  14075907 Aug 26 14:33 ESP6500SI-V2-SSA137.updatedRsIds.chr18.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  65266127 Aug 26 14:33 ESP6500SI-V2-SSA137.updatedRsIds.chr19.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  68850292 Aug 26 14:29 ESP6500SI-V2-SSA137.updatedRsIds.chr2.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  25274367 Aug 26 14:33 ESP6500SI-V2-SSA137.updatedRsIds.chr20.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  12146901 Aug 26 14:33 ESP6500SI-V2-SSA137.updatedRsIds.chr21.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  22731631 Aug 26 14:14 ESP6500SI-V2-SSA137.updatedRsIds.chr22.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  54016924 Aug 26 14:29 ESP6500SI-V2-SSA137.updatedRsIds.chr3.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  35294834 Aug 26 14:29 ESP6500SI-V2-SSA137.updatedRsIds.chr4.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  43985991 Aug 26 14:30 ESP6500SI-V2-SSA137.updatedRsIds.chr5.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  45749508 Aug 26 14:30 ESP6500SI-V2-SSA137.updatedRsIds.chr6.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  44366987 Aug 26 14:31 ESP6500SI-V2-SSA137.updatedRsIds.chr7.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  33404379 Aug 26 14:32 ESP6500SI-V2-SSA137.updatedRsIds.chr8.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  40067697 Aug 26 14:32 ESP6500SI-V2-SSA137.updatedRsIds.chr9.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  25130750 Aug 26 14:34 ESP6500SI-V2-SSA137.updatedRsIds.chrX.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats    269593 Aug 26 14:34 ESP6500SI-V2-SSA137.updatedRsIds.chrY.snps_indels.vcf
  -rw-rw-r-- 1 max      protein  416261389 Oct  7 11:12 ESP6500SI-V2.coverage.all_sites.txt.tar.gz
  -rw-rw-r-- 1 max      protein   13192522 Oct  7 11:12 ESP6500SI-V2.coverage.seq_blocks.txt.tar.gz
  -rw-rw-r-- 1 max      protein    6065184 Oct  7 11:12 agilent_nimblegen_exome_targets_esp_project.tar.gz
  # get tabix (samtools)
  cd /cluster/bin
  mkdir samtools
  cd samtools
  # download from http://sourceforge.net/projects/samtools/files/tabix/
  -rw-rw-r-- 1 tdreszer genecats 514507 Nov  8 11:14 samtools-0.1.19.tar.bz2.tar.bz2.tar.bz2
  tar -xjf samtools-0.1.19.tar.bz2.tar.bz2.tar.bz2
  mv samtools-0.1.19 ..
  mv samtools-0.1.19.tar.bz2.tar.bz2.tar.bz2 ../samtools-0.1.19/
  cd ../samtools-0.1.19/
  rm -rf ../samtools/
  make
  cd ..
  mkdir tabix
  cd tabix
  # download from http://sourceforge.net/projects/samtools/files/tabix/
  -rw-rw-r-- 1 tdreszer genecats 54403 Nov  8 11:25 tabix-0.2.6.tar.bz2.tar.bz2.tar.bz2
  tar -xjf tabix-0.2.6.tar.bz2.tar.bz2.tar.bz2
  mv tabix-0.2.6 ..
  mv tabix-0.2.6.tar.bz2.tar.bz2.tar.bz2 ../tabix-0.2.6/
  cd ../tabix-0.2.6/
  rm -rf ../tabix/
  make
  
  # bgzip then tabixify files
  cd /hive/data/outside/evs
  for chr in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
      /cluster/bin/tabix-0.2.6/bgzip ESP6500SI-V2-SSA137.updatedRsIds.chr${chr}.snps_indels.vcf
  done
  for chr in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
      /cluster/bin/tabix-0.2.6/tabix ESP6500SI-V2-SSA137.updatedRsIds.chr${chr}.snps_indels.vcf.gz
  done
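# NOTE FOR NEXT TIME: the two loops could be combined into a single pass,
# e.g. (a sketch, using tabix's vcf preset):
#   for chr in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
#       /cluster/bin/tabix-0.2.6/bgzip ESP6500SI-V2-SSA137.updatedRsIds.chr${chr}.snps_indels.vcf && \
#       /cluster/bin/tabix-0.2.6/tabix -p vcf ESP6500SI-V2-SSA137.updatedRsIds.chr${chr}.snps_indels.vcf.gz
#   done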
  ll
  total 1617920
  -rw-rw-r-- 1 max      protein  132832601 Oct  7 11:12 ESP6500SI-V2-SSA137.dbSNP138-rsIDs.snps_indels.txt.tar.gz
  -rw-rw-r-- 1 max      protein  124718926 Oct  7 11:12 ESP6500SI-V2-SSA137.dbSNP138-rsIDs.snps_indels.vcf.tar.gz
  -rw-rw-r-- 1 tdreszer genecats  13257881 Nov  8 11:34 ESP6500SI-V2-SSA137.updatedRsIds.chr1.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     57058 Nov  8 11:36 ESP6500SI-V2-SSA137.updatedRsIds.chr1.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   5324208 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr10.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     26309 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr10.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   8057109 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr11.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     30394 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr11.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   6951392 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr12.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     31477 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr12.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   2282788 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr13.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     13677 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr13.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   4413259 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr14.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     19363 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr14.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   4451460 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr15.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     20180 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr15.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   6266776 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr16.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     19272 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr16.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   8005893 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr17.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     26404 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr17.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   2011581 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr18.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     11785 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr18.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   9460006 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr19.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     22808 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr19.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   9746625 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr2.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     49239 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr2.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   3560189 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr20.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     14079 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr20.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   1710920 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr21.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats      5965 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr21.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   3211868 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr22.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     10849 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr22.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   7609656 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr3.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     38679 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr3.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   5116880 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr4.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     28097 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr4.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   6118636 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr5.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     29706 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr5.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   6561993 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr6.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     32437 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr6.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   6395663 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr7.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     32184 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr7.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   4816680 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr8.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     24710 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr8.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   5656614 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chr9.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     25772 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chr9.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   3528099 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chrX.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     23453 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chrX.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats     32962 Nov  8 11:35 ESP6500SI-V2-SSA137.updatedRsIds.chrY.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats       811 Nov  8 11:37 ESP6500SI-V2-SSA137.updatedRsIds.chrY.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 max      protein  416261389 Oct  7 11:12 ESP6500SI-V2.coverage.all_sites.txt.tar.gz
  -rw-rw-r-- 1 max      protein   13192522 Oct  7 11:12 ESP6500SI-V2.coverage.seq_blocks.txt.tar.gz
  -rw-rw-r-- 1 max      protein    6065184 Oct  7 11:12 agilent_nimblegen_exome_targets_esp_project.tar.gz
  
  # make symlinks in gbdb:
  mkdir /gbdb/hg19/evs
  ln -s `pwd`/*.vcf.gz* /gbdb/hg19/evs/
  cp /dev/null evsEsp6500.txt
  for chr in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
      echo "/gbdb/hg19/evs/ESP6500SI-V2-SSA137.updatedRsIds.chr${chr}.snps_indels.vcf.gz\tchr${chr}" >> evsEsp6500.txt
  done
  # manually swapped in tabs for \t's, then load
  hgLoadSqlTab hg19 evsEsp6500 ~/kent/src/hg/lib/bbiChroms.sql evsEsp6500.txt
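# NOTE FOR NEXT TIME: printf interprets \t, so the manual tab swap can be
# avoided; a sketch:
#   for chr in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
#       printf '/gbdb/hg19/evs/ESP6500SI-V2-SSA137.updatedRsIds.chr%s.snps_indels.vcf.gz\tchr%s\n' $chr $chr
#   done > evsEsp6500.txt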
  
  # Add evsEsp6500 track to hg19/trackDb.ra
  
  #########################################################################
  # EVS Exome Variants from ESP 6500 Exome UPDATE (Redmine 9329) 2014-03-28
  cd /hive/data/outside/evs/
  mkdir update
  cd /hive/data/outside/evs/update
  wget 'http://evs.gs.washington.edu/evs_bulk_data/ESP6500SI-V2-SSA137.protein-hgvs-update.snps_indels.vcf.tar.gz'
  tar -xzf ESP6500SI-V2-SSA137.protein-hgvs-update.snps_indels.vcf.tar.gz
  ll
  -rw-r--r-- 1 tdreszer genecats 124720615 Mar 28 13:19 ESP6500SI-V2-SSA137.protein-hgvs-update.snps_indels.vcf.tar.gz
  -rw-r--r-- 1 tdreszer genecats  93987280 Feb  6 14:12 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr1.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  38334975 Feb  6 14:12 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr10.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  56868856 Feb  6 14:12 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr11.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  49327119 Feb  6 14:12 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr12.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  15843124 Feb  6 14:13 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr13.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  30776532 Feb  6 14:13 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr14.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  31170814 Feb  6 14:13 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr15.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  44270899 Feb  6 14:13 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr16.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  57017358 Feb  6 14:13 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr17.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  14075899 Feb  6 14:13 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr18.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  65266023 Feb  6 14:13 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr19.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  68850168 Feb  6 14:12 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr2.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  25274351 Feb  6 14:13 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr20.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  12146878 Feb  6 14:13 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr21.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  22731583 Feb  6 14:01 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr22.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  54016869 Feb  6 14:12 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr3.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  35294803 Feb  6 14:12 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr4.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  43985947 Feb  6 14:12 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr5.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  45749446 Feb  6 14:12 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr6.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  44366928 Feb  6 14:12 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr7.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  33404366 Feb  6 14:12 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr8.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  40067651 Feb  6 14:12 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr9.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats  25130716 Feb  6 14:13 ESP6500SI-V2-SSA137.updatedProteinHgvs.chrX.snps_indels.vcf
  -rw-r--r-- 1 tdreszer genecats    269593 Feb  6 14:13 ESP6500SI-V2-SSA137.updatedProteinHgvs.chrY.snps_indels.vcf
  
  # compress and make indexes
  for chr in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
      /cluster/bin/tabix-0.2.6/bgzip ESP6500SI-V2-SSA137.updatedProteinHgvs.chr${chr}.snps_indels.vcf
  done
  for chr in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
      /cluster/bin/tabix-0.2.6/tabix ESP6500SI-V2-SSA137.updatedProteinHgvs.chr${chr}.snps_indels.vcf.gz
  done
  
  ll
  total 507888
  -rw-r--r-- 1 tdreszer genecats 124720615 Mar 28 13:19 ESP6500SI-V2-SSA137.protein-hgvs-update.snps_indels.vcf.tar.gz
  -rw-rw-r-- 1 tdreszer genecats  13258210 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr1.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     57068 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr1.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   5324193 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr10.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     26305 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr10.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   8057163 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr11.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     30370 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr11.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   6951317 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr12.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     31468 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr12.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   2282785 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr13.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     13687 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr13.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   4413068 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr14.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     19361 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr14.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   4451406 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr15.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     20177 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr15.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   6266701 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr16.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     19274 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr16.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   8005745 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr17.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     26399 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr17.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   2011561 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr18.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     11774 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr18.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   9459910 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr19.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     22802 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr19.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   9746724 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr2.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     49265 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr2.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   3560127 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr20.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     14067 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr20.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   1710937 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr21.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats      5960 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr21.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   3211748 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr22.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     10850 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr22.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   7609634 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr3.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     38695 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr3.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   5116781 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr4.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     28085 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr4.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   6118583 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr5.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     29707 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr5.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   6561883 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr6.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     32430 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr6.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   6395768 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr7.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     32174 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr7.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   4816721 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr8.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     24693 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr8.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   5656727 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr9.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     25797 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chr9.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats   3528086 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chrX.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats     23418 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chrX.snps_indels.vcf.gz.tbi
  -rw-rw-r-- 1 tdreszer genecats     32962 Mar 28 13:30 ESP6500SI-V2-SSA137.updatedProteinHgvs.chrY.snps_indels.vcf.gz
  -rw-rw-r-- 1 tdreszer genecats       811 Mar 28 13:31 ESP6500SI-V2-SSA137.updatedProteinHgvs.chrY.snps_indels.vcf.gz.tbi
  
  cd ..
  mkdir older
  mv *.vcf.gz older
mv *.vcf.gz.tbi older
  mv *.tar.gz older
  mv *.txt older
  mv update/* .
  rmdir update
  
  # make new symlinks in gbdb:
  rm /gbdb/hg19/evs/*.vcf.gz*
  ln -s `pwd`/*.vcf.gz* /gbdb/hg19/evs/
  
  # Update hg19.evsEsp6500 table
  cp /dev/null evsEsp6500.txt
  for chr in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
      echo "/gbdb/hg19/evs/ESP6500SI-V2-SSA137.updatedProteinHgvs.chr${chr}.snps_indels.vcf.gz\tchr${chr}" >> evsEsp6500.txt
  done
  # manually swapped in tabs for \t's, then load
  hgLoadSqlTab hg19 evsEsp6500 ~/kent/src/hg/lib/bbiChroms.sql evsEsp6500.txt
  
  ########################################################################
  # UCSF BRAIN METHYLATION (DONE 2013-11-26 Pauline)
  
      mkdir /hive/data/genomes/hg19/bed/ucsfBrainMethyl
      cd /hive/data/genomes/hg19/bed/ucsfBrainMethyl
      wget --timestamping 'ftp://wangftp.wustl.edu/ucsfBrainMethyl/bigWig/ucsfChipSeqH3K4me3BrainCoverage.bigWig'
      wget --timestamping 'ftp://wangftp.wustl.edu/ucsfBrainMethyl/bigWig/ucsfMedipSeqBrainCoverage.bigWig'
      wget --timestamping 'ftp://wangftp.wustl.edu/ucsfBrainMethyl/bigWig/ucsfMedipSeqBrainCpG.bigWig'
      wget --timestamping 'ftp://wangftp.wustl.edu/ucsfBrainMethyl/bigWig/ucsfMreSeqBrainCpG.bigWig'
      wget --timestamping 'ftp://wangftp.wustl.edu/ucsfBrainMethyl/bigWig/ucsfRnaSeqBrainAllCoverage.bigWig'
      wget --timestamping 'ftp://wangftp.wustl.edu/ucsfBrainMethyl/bigWig/ucsfRnaSeqBrainSmartCoverage.bigWig'
  
      mkdir /gbdb/hg19/bbi/ucsfBrainMethyl/
  
      ln -s `pwd`/ucsfChipSeqH3K4me3BrainCoverage.bigWig /gbdb/hg19/bbi/ucsfBrainMethyl/ucsfChipSeqH3K4me3BrainCoverage.bigWig
      ln -s `pwd`/ucsfMedipSeqBrainCoverage.bigWig  /gbdb/hg19/bbi/ucsfBrainMethyl/ucsfMedipSeqBrainCoverage.bigWig
      ln -s `pwd`/ucsfMedipSeqBrainCpG.bigWig  /gbdb/hg19/bbi/ucsfBrainMethyl/ucsfMedipSeqBrainCpG.bigWig
      ln -s `pwd`/ucsfMreSeqBrainCpG.bigWig  /gbdb/hg19/bbi/ucsfBrainMethyl/ucsfMreSeqBrainCpG.bigWig
      ln -s `pwd`/ucsfRnaSeqBrainAllCoverage.bigWig  /gbdb/hg19/bbi/ucsfBrainMethyl/ucsfRnaSeqBrainAllCoverage.bigWig
      ln -s `pwd`/ucsfRnaSeqBrainSmartCoverage.bigWig  /gbdb/hg19/bbi/ucsfBrainMethyl/ucsfRnaSeqBrainSmartCoverage.bigWig
  
      hgBbiDbLink hg19 ucsfChipSeqH3K4me3BrainCoverage /gbdb/hg19/bbi/ucsfBrainMethyl/ucsfChipSeqH3K4me3BrainCoverage.bigWig
      hgBbiDbLink hg19 ucsfMedipSeqBrainCoverage /gbdb/hg19/bbi/ucsfBrainMethyl/ucsfMedipSeqBrainCoverage.bigWig
      hgBbiDbLink hg19 ucsfMedipSeqBrainCpG /gbdb/hg19/bbi/ucsfBrainMethyl/ucsfMedipSeqBrainCpG.bigWig
      hgBbiDbLink hg19 ucsfMreSeqBrainCpG /gbdb/hg19/bbi/ucsfBrainMethyl/ucsfMreSeqBrainCpG.bigWig
      hgBbiDbLink hg19 ucsfRnaSeqBrainAllCoverage /gbdb/hg19/bbi/ucsfBrainMethyl/ucsfRnaSeqBrainAllCoverage.bigWig
      hgBbiDbLink hg19 ucsfRnaSeqBrainSmartCoverage /gbdb/hg19/bbi/ucsfBrainMethyl/ucsfRnaSeqBrainSmartCoverage.bigWig
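    # equivalently, a single loop could drive all six link/load pairs; a sketch:
    # for bw in ucsfChipSeqH3K4me3BrainCoverage ucsfMedipSeqBrainCoverage \
    #     ucsfMedipSeqBrainCpG ucsfMreSeqBrainCpG ucsfRnaSeqBrainAllCoverage \
    #     ucsfRnaSeqBrainSmartCoverage; do
    #     ln -s `pwd`/$bw.bigWig /gbdb/hg19/bbi/ucsfBrainMethyl/$bw.bigWig
    #     hgBbiDbLink hg19 $bw /gbdb/hg19/bbi/ucsfBrainMethyl/$bw.bigWig
    # done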
  
# used the existing trackDb entry for the lifted track, and was able to
# override the track type bedgraph -> bigWig.
  
  ########################################################################
  # Bing BLAT track (DONE 2013-12-06 Max)
  # requires the pubs tools in ~max/projects/pubs and also on github as "pubMunch"
  lftp ftp://ftp.research.microsoft.com/pub/BobD/ -e 'mget *'
  # indexing
  pubConvBing -i /hive/data/outside/pubs/bing/
  # conversion on ku
  pubConvBing /hive/data/outside/pubs/bing/ /hive/data/inside/pubs/text/bing/
  # mapping,chaining,table-creation to hg19 on ku
  pubMap bing annots-tables --onlyDb=hg19
  # load into mysql
  pubMap bing load
  
  
  ########################################################################
  # CNV Developmental Delay track (DONE 2014-11-21 Steve)
  
      mkdir /hive/data/genomes/hg19/bed/cnvDevDelay
      cd /hive/data/genomes/hg19/bed/cnvDevDelay
  
  # NOTE FOR NEXT TIME: dbVar reorganized their ftp directories.  Should there be a next time,
  # the files are now at
  # ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/gvf/nstd100.GRCh37.variant_call.gvf.gz
  # ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/gvf/nstd100.GRCh37.variant_region.gvf.gz
  # ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/gvf/nstd54.GRCh37.variant_call.gvf.gz
  # ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/gvf/nstd54.GRCh37.variant_region.gvf.gz
  
  wget --timestamping 'ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd100_Coe_et_al_2014/gvf/nstd100_Coe_et_al_2014.GRCh37.remap.all.germline.ucsc.gvf.gz'
  wget --timestamping 'ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd54_Cooper_et_al_2011/gvf/nstd54_Cooper_et_al_2011.GRCh37.remap.all.germline.ucsc.gvf.gz'
  
cp ~/kent/src/hg/utils/automation/gvfToBed8Attrs.pl .
  mv gvfToBed8Attrs.pl gvfToBed8AttrsCase.pl
  cp gvfToBed8AttrsCase.pl gvfToBed8AttrsControl100.pl
  cp gvfToBed8AttrsCase.pl gvfToBed8AttrsControl54.pl
  
  # made three local copies of Angie's gvf conversion script - one to include
  # only case individuals from nstd100, one to include only control individuals
  # from nstd100 and one to include only control individuals from nstd54
  
# had to add an additional elsif statement to the nstd100 scripts to filter
# on the sample_name field:
  
  #  } elsif ($tag eq "sample_name") {
  #    $sample_name = $val;
  #  }
  
# added one of the following filter lines to each script (at line 33 or 35):
  
  # next if ($sample_name eq "Unknown"); # keep only "case" individuals from nstd100
  # next if ($sample_name ne "Unknown"); # keep only "control" individuals from nstd100
  # next if ($phenotype ne "not_reported"); # keep only "control" individuals from nstd54
  
zcat nstd100_Coe_et_al_2014.GRCh37.remap.all.germline.ucsc.gvf.gz | ./gvfToBed8AttrsCase.pl > cnvDevDelayAllCase.bed
zcat nstd100_Coe_et_al_2014.GRCh37.remap.all.germline.ucsc.gvf.gz | ./gvfToBed8AttrsControl100.pl > cnvDevDelayAllControl.bed
zcat nstd54_Cooper_et_al_2011.GRCh37.remap.all.germline.ucsc.gvf.gz | ./gvfToBed8AttrsControl54.pl >> cnvDevDelayAllControl.bed
  
  hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \
          -allowStartEqualEnd hg19 cnvDevDelayCase cnvDevDelayAllCase.bed
  
  hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \
          -allowStartEqualEnd hg19 cnvDevDelayControl cnvDevDelayAllControl.bed
  
      checkTableCoords hg19 cnvDevDelayCase
      checkTableCoords hg19 cnvDevDelayControl
  
  
  #############################################################################
  # LIFTOVER TO Hg38 (DONE - 2013-12-31 - Hiram )
      mkdir /hive/data/genomes/hg19/bed/blat.hg38noMask.2013-12-31
      cd /hive/data/genomes/hg19/bed/blat.hg38noMask.2013-12-31
      # experiments were done with this process to determine if the
      # target/query sequences need to be masked or not.  They do not
      # need to be masked, the same result is obtained on masked or
      # unmasked sequence.
      # -debug run to create run dir and scripts, the partition script
      # was modified to use the unmasked hg19/hg38 sequences
      doSameSpeciesLiftOver.pl -debug -stop=net -buildDir=`pwd` \
        -bigClusterHub=ku \
         -dbHost=hgwdev -workhorse=hgwdev \
            -ooc=/hive/data/genomes/hg19/11.ooc hg19 hg38
    # Turns out the chain step will not construct the proper set of files
    # (pslParts.lst) in -debug mode because it cannot; the chain step has
    # to be run for real:
      doSameSpeciesLiftOver.pl -continue=chain -buildDir=`pwd` \
        -bigClusterHub=ku \
         -dbHost=hgwdev -workhorse=hgwdev \
            -ooc=/hive/data/genomes/hg19/11.ooc hg19 hg38 > chain.log 2>&1
  
      # verify the convert link on the browser is now active from hg19 to hg38
  
  ##############################################################################
  # hg19 <-> hg38 difference tracks (DONE - 2013-12-28 - Hiram)
      # note: the procedure for this is in the hg38.txt file under
      # this same heading.  The end result is the loading of the table:
  
      cd /hive/data/genomes/hg19/bed/liftOverHg38
      hgLoadBed hg19 hg38ContigDiff hg19.itemRgb.bed
  
  
  ##########################################################################
  # NEANDERTAL AND DENISOVA METHYLATION (DONE 8/27/14 angie)
      # RM #13439
      mkdir /hive/data/genomes/hg19/bed/neandertalMethylation
      cd /hive/data/genomes/hg19/bed/neandertalMethylation
      wget http://carmelab.huji.ac.il/data/Reconstructed_Methylation_Neandertal.zip
      unzip Reconstructed_Methylation_Neandertal.zip
      # The file has scores in e-notation ("4.108367e+001"); use perl to add "chr"
      # and convert scores to regular floating point notation.
      # Also, the submitters requested a green-to-red score display (0 = green, 100 = red)
      # like the ENCODE HAIB Methyl-RRBS track, so make it a bed 9 with the ENCODE coloring.
      # For the bed5 score, scale up 0-100 to 0-1000.
      perl -we '\
          @colorScale = (0x00FF00, 0x37FF00, 0x69FF00, 0x9BFF00, 0xCDFF00, 0xFFFF00, \
                         0xFFCD00, 0xFF9B00, 0xFF6900, 0xFF3700, 0xFF0000); \
          while (<>) { \
            chomp;  ($c, $s, $e, $score) = split; \
            $chr = "chr".$c; \
            $score = $score + 0.0; \
            $bedScore = int($score * 10); \
            $colorIdx = int(@colorScale * $score / 100); \
            $colorIdx-- if ($colorIdx >= @colorScale); \
            $color = $colorScale[$colorIdx]; \
            print join("\t", $chr, $s, $e, $score, $bedScore, "+", $s, $e, $color) . "\n"; \
          }' \
        Recon_Meth_Altai_Neandertal.txt \
          > neandertalMethylation.bed
      hgLoadBed hg19 neandertalMethylation neandertalMethylation.bed
      # clean up
      rm Recon_Meth_Altai_Neandertal.txt bed.tab
      gzip neandertalMethylation.bed &
  
  
      mkdir /hive/data/genomes/hg19/bed/denisovaMethylation
      cd /hive/data/genomes/hg19/bed/denisovaMethylation
      wget http://carmelab.huji.ac.il/data/Reconstructed_Methylation_Denisovan.zip
      unzip Reconstructed_Methylation_Denisovan.zip
      # Apply the same transform as above:
      perl -we '\
          @colorScale = (0x00FF00, 0x37FF00, 0x69FF00, 0x9BFF00, 0xCDFF00, 0xFFFF00, \
                         0xFFCD00, 0xFF9B00, 0xFF6900, 0xFF3700, 0xFF0000); \
          while (<>) { \
            chomp;  ($c, $s, $e, $score) = split; \
            $chr = "chr".$c; \
            $score = $score + 0.0; \
            $bedScore = int($score * 10); \
            $colorIdx = int(@colorScale * $score / 100); \
            $colorIdx-- if ($colorIdx >= @colorScale); \
            $color = $colorScale[$colorIdx]; \
            print join("\t", $chr, $s, $e, $score, $bedScore, "+", $s, $e, $color) . "\n"; \
          }' \
        Recon_Meth_Denisovan.txt > denisovaMethylation.bed
      hgLoadBed hg19 denisovaMethylation denisovaMethylation.bed
      # clean up
    rm Recon_Meth_Denisovan.txt bed.tab
      gzip denisovaMethylation.bed &
  
  
  ##############################################################################
  # DBSNP B141 / SNP141 (DONE 9/10/14)
      # Redmine #13309
      mkdir -p /hive/data/outside/dbSNP/141/human_hg19
      cd /hive/data/outside/dbSNP/141/human_hg19
      # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
      # to find the subdir name to use as orgDir below (human_9606 in this case).
      # Then click into that directory and look for file names like
      #    b(1[0-9][0-9])_*_([0-9]+_[0-9])
      # -- use the first num for build and the second num_num for buildAssembly.
      # jkStuff/liftContigs.lft maps NCBI contig names to chroms; use that for liftUp.
      #
      # Some trial and error was required to get the config.ra just right --
      # the b* filenames don't include buildAssembly!
      # patch contigs needed to be filtered out:
      cat > config.ra <<EOF
  db hg19
  orgDir human_9606
  build 141
  buildAssembly GRCh37p13
  liftUp /hive/data/genomes/hg19/jkStuff/liftContigs.lft
  refAssemblyLabel GRCh37.p13
  EOF
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
  
      # It fails with a list of unliftable contigs -- look at dbSnpContigsNotInUcsc.txt
      # to make sure they're all patch contigs as expected, then start over, ignoring those:
      cut -f 2 cantLiftUpSeqNames.txt > patchContigs.txt
      cat >> config.ra <<EOF
  ignoreDbSnpContigsFile patchContigs.txt
  EOF
      # Continue at loadDbSnp.  There are always errors in snpNcbiToUcsc (translate step)
      # as it encounters new weird inputs; fix as necessary.  This time I also expanded
      # checking for overlapping items from insertions only to all types of variants.
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp \
        >& do.log &
      tail -f do.log
  
      # After final snpNcbiToUcsc tweaking:
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue translate \
        >>& do.log & tail -f do.log
  
      # Fasta is missing for 3259 rs's -- and unfortunately, they all seem to
      # be clinically associated so people will be interested in them.
      # dbSNP has not re-dumped rs_fasta as requested, so use batch_query to get
      # the fasta, and... back to the beginning with new fasta's.
      zcat snp141Errors.bed.gz \
      | grep 'Missing observed' \
      | cut -f 4 \
      | sort -u \
        > idsNoFasta.txt
      # Upload that file here and select FASTA as output format:
      #   http://www.ncbi.nlm.nih.gov/projects/SNP/dbSNP.cgi?list=rsfile
      # Wait for email with link to results.
      # 13 IDs are rejected as not in dbSNP (this used to explain most missing fasta):
  #397518414
  #397518415
  #397518416
  #398124647
  #398124648
  #398124649
  #398124650
  #431905487
  #431905488
  #431905489
  #431905490
  #431905491
  #431905492
      # Download the batch query results:
      wget ftp://ftp.ncbi.nlm.nih.gov/snp/batch/140818195028.gz
      # Yep, looks like rs_fasta.  Move it into the rs_fasta directory with a name that
      # will be picked up by "zcat rs_fasta/rs_ch*.fas.gz":
      mv 140818195028.gz rs_fasta/rs_chBatchQuery.fas.gz
      # Now continue from the addToDbSnp step.
      # NOTE: I actually combined this addToDbSnp run with the strand-fixing run below.
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue addToDbSnp \
        >>& do.log & tail -f do.log
  
    # b141 was released with some known issues, including incorrect strand for some items.
      # Correct locally:
    wget ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/misc/known_issues/b141/rs_with_changed_orientation.bcp
      tail -n +2 rs_with_changed_orientation.bcp \
      | perl -wne 'my ($snpId, $old, undef, $new) = split("\t"); \
                   if ($old != $new) { \
                     print "update b141_SNPContigLoc_GRCh37p13 set orientation = $new where snp_id = $snpId;\n"; \
                   }' > fixRsOrientation.sql
  #...except the files were wrong, see https://ncbijira.ncbi.nlm.nih.gov/browse/VR-28
  
      # 8/5/14: dbSNP re-released a subset of the FTP files... patch those in and rebuild.
      # I made a new subdir firstBuild and moved a bunch of files there so I can compare.
      cd /hive/data/outside/dbSNP/141/human_hg19
      mv data data.orig
      mkdir data
      cd data
      ln -s ../data.orig/* .
      rm b141_SNPContigLoc_GRCh37p13.bcp.gz b141_SNPContigLocusId_GRCh37p13.bcp.gz \
         b141_SNPMapInfo_GRCh37p13.bcp.gz
      foreach f (b141_SNPContigLoc_GRCh37p13.bcp.gz b141_SNPContigLocusId_GRCh37p13.bcp.gz \
                 b141_SNPMapInfo_GRCh37p13.bcp.gz)
        wget ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b141_GRCh37p13/database/organism_data/update_2014_Jul25/$f
      end
  
      hgsql hg19snp141 -e 'drop table b141_SNPContigLoc_GRCh37p13; \
                           drop table b141_SNPContigLocusId_GRCh37p13; \
                           drop table b141_SNPMapInfo_GRCh37p13;'
      # Recreate those tables using schema/table.sql.
  
      # Run the parts of loadDbSnp.csh for those specific tables:
      # ---------- begin loadDbSnp.csh excerpt ----------
      foreach t (b141_SNPContigLocusId_GRCh37p13 b141_SNPMapInfo_GRCh37p13)
      zcat /hive/data/outside/dbSNP/141/human_hg19/data/$t.bcp.gz | grep -vwFf '/hive/data/outside/dbSNP/141/human_hg19/patchContigs.txt' \
        | perl -wpe 's/(\d\d:\d\d:\d\d)\.\d+/$1/g; s/\t(\t|\n)/\t\\N$1/g; s/\t(\t|\n)/\t\\N$1/g;' \
          > tmp.tab
        hgLoadSqlTab -oldTable hg19snp141 $t placeholder tmp.tab
        rm tmp.tab
      end
      hgsql hg19snp141 -e \
        'alter table b141_SNPContigLocusId_GRCh37p13 add index (ctg_id); \
         alter table b141_SNPMapInfo_GRCh37p13 add index (snp_id);'
  
      # b141_SNPContigLoc_GRCh37p13 is huge, and we want only the reference contig mappings.
      # Keep lines only if they have a word match to some reference contig ID.
      # That allows some false positives from coord matches; clean those up afterward.
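    # (A false positive here is a line whose other numeric fields happen to match
    # some ctg_id; the ContigLocFix join on the real ctg_id column below removes them.)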
      zcat /hive/data/outside/dbSNP/141/human_hg19/data/b141_ContigInfo_GRCh37p13.bcp.gz \
      | cut -f 1 | sort -n > b141_ContigInfo_GRCh37p13.ctg_id.txt
      zcat /hive/data/outside/dbSNP/141/human_hg19/data/b141_SNPContigLoc_GRCh37p13.bcp.gz \
      | grep -Fwf b141_ContigInfo_GRCh37p13.ctg_id.txt \
      | perl -wpe 's/(\d\d:\d\d:\d\d)\.\d+/$1/g; s/\t(\t|\n)/\t\\N$1/g; s/\t(\t|\n)/\t\\N$1/g;' \
      | hgLoadSqlTab -oldTable hg19snp141 b141_SNPContigLoc_GRCh37p13 placeholder stdin
      # Get rid of those false positives:
      hgsql hg19snp141 -e 'alter table b141_SNPContigLoc_GRCh37p13 add index (ctg_id);'
      hgsql hg19snp141 -e 'create table ContigLocFix select cl.* from b141_SNPContigLoc_GRCh37p13 as cl, b141_ContigInfo_GRCh37p13 as ci where cl.ctg_id = ci.ctg_id;'
      hgsql hg19snp141 -e 'alter table ContigLocFix add index (ctg_id);'
      hgsql hg19snp141 -e 'drop table b141_SNPContigLoc_GRCh37p13; \
                           rename table ContigLocFix to b141_SNPContigLoc_GRCh37p13;'
      hgsql hg19snp141 -e 'alter table b141_SNPContigLoc_GRCh37p13 add index (snp_id);'
      # ---------- end loadDbSnp.csh excerpt ----------
  
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=addToDbSnp \
        >>& do.log & tail -f do.log
  
      # That completed OK. Now, try to identify the remaining >300k strand-swap errors
      # that should be fixed in hg19.snp141.
      cd /hive/data/outside/dbSNP/141/human_hg19
      # First, make some files for dbSNP, directly from their download files.
      # To prepare for join, '.'-concat location, rs# ID, ori and allele fields of b138 SNPContigLoc:
      zcat data/b138_SNPContigLoc.bcp.gz \
      | awk -F"\t" '{print $3"."$4"."$5".rs"$2"\t"$15"\t"$16;}'
      | sort \
        > hg19_b138_locOriAl.txt
      # Likewise for b141 on GRCh37:
      zcat data/b141_SNPContigLoc_GRCh37p13.bcp.gz \
      | awk -F"\t" '{print $3"."$4"."$5".rs"$2"\t"$15"\t"$16;}' \
      | sort \
        > hg19_b141_locOriAl.txt
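    # Each line is now keyed as contig.start.end.rsID, followed by the orientation
    # and allele columns, e.g. (made-up values): 224514624.10176.10177.rs12345 0 C/T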
      # Join to find items with identical coords and rs#ID.  Columns output:
      # 1: coords.rsID (same in both input files, column IDs 1.1 and 2.1)
      # 2: b138 ori (1.2)
      # 3: b138 allele (1.3)
      # 4: b141 ori (2.2)
      # 5: b141 allele (2.3)
      join -o1.1,1.2,1.3,2.2,2.3 hg19_b138_locOriAl.txt hg19_b141_locOriAl.txt \
        > hg19_b138_b141_locOriAl.txt
      # Now find changes of orientation:
      awk '$2 != $4 {print;}' hg19_b138_b141_locOriAl.txt \
        > hg19_b138_b141_locOriAl_strandChange.txt
      gzip hg19_b138_b141_locOriAl_strandChange.txt
    # There are also 379 items whose allele changed, all length-2 items
      # switching between insertion and 2-base change (dbSNP uses 2-base regions for
      # both types):
      zcat hg19_b138_b141_locOriAl_strandChange.txt.gz \
      | awk '$3 != $5' \
        > hg19_b138_b141_locOriAl_strandChange_alChange.txt
  
      # I sent hg19_b138_b141_locOriAl_strandChange.txt.gz and
      # hg19_b138_b141_locOriAl_strandChange_alChange.txt to dbSNP
      # (again see https://ncbijira.ncbi.nlm.nih.gov/browse/VR-28)
  
    # Most, but not all, of the changes were errors.  ObservedMismatch is a solid
      # indicator: if b138 doesn't have it but b141 does, it's an error in b141,
      # but if b138 has it and b141 doesn't, then b141 is OK and shouldn't be swapped.
      # Unfortunately for C/G, A/T or insertion snps, the only way to tell for sure is
      # to realign the flanking sequences... if the reported strand is incorrect, then
      # the alignment is crap.  So far all of the C/G and A/T examples that I have
      # looked at are wrong in b141, but I bet there are b141-OK ones hiding in there.
      # I want to fix what's broken in dbSNP, but not break anything new!
  
      # Repeat the analysis for dbSNP, but now use hg19 coords and include exceptions
      # (specifically, whether ObservedMismatch is present):
      zcat ../../138/human/snp138.bed.gz \
      | awk -F"\t" '{if ($18 ~ /ObservedMismatch/) { gotObs = "y"; } else { gotObs = "n"; } \
                     print $1"."$2"."$3"."$4 "\t" $6 "\t" $7 "\t" $9 "\t" gotObs;}' \
      | sort \
        > hg19_snp138_locOriEtc.txt
      zcat snp141.bed.gz \
      | awk -F"\t" '{if ($18 ~ /ObservedMismatch/) { gotObs = "y"; } else { gotObs = "n"; } \
                     print $1"."$2"."$3"."$4 "\t" $6 "\t" $7 "\t" $9 "\t" gotObs;}' \
      | sort \
        > hg19_snp141_locOriEtc.txt
      join -o 1.1,1.2,1.3,1.4,1.5,2.2,2.3,2.4,2.5 \
        hg19_snp138_locOriEtc.txt hg19_snp141_locOriEtc.txt \
        > hg19_snp138_snp141_locOriEtc.txt
      awk '$2 != $6 {print;}' hg19_snp138_snp141_locOriEtc.txt \
        > hg19_snp138_snp141_locOriEtc_strandChange.txt
      wc -l hg19_snp138_snp141_locOriEtc_strandChange.txt
  #335220 hg19_snp138_snp141_locOriEtc_strandChange.txt
  
      # What's the breakdown of ObservedMismatch in b138 vs b141?:
      awk '{print $5, $9;}' hg19_snp138_snp141_locOriEtc_strandChange.txt | sort | uniq -c
  #  65294 n n
  # 269435 n y
  #    479 y n
  #     12 y y
    # Only 12 have a strand change but still get OM in both; take a look:
      awk '$5 == "y" && $9 == "y"' hg19_snp138_snp141_locOriEtc_strandChange.txt
  #chr11.72552605.72552606.rs187553889 + T C/G y - T C/G y
  #chr12.120990391.120990392.rs189978208 + G A/T y - G A/T y
  #chr12.121858512.121858513.rs183189219 + G A/T y - G A/T y
  #chr12.71286640.71286641.rs190625971 + G A/T y - G A/T y
  #chr12.72025810.72025811.rs182418005 + G A/T y - G A/T y
  #chr2.43768454.43768455.rs189409390 + C A/T y - C A/T y
  #chr2.43965624.43965625.rs183733596 + G A/T y - G A/T y
  #chr3.122545749.122545750.rs187447068 + A C/G y - A C/G y
  #chr3.122545908.122545909.rs182585191 + G A/T y - G A/T y
  #chr3.123512559.123512560.rs191098134 + G A/T y - G A/T y
  #chr6.31324909.31324910.rs188104024 + A C/G y - A C/G y
  #chr8.95565652.95565653.rs189964963 + T C/G y - T C/G y
      # rs187553889's flanking sequences map it to the same location as rs77419620,
      # chr11 | 72535846 | 72535847.  So it's in the completely wrong place in both 138 & 141.
      # rs189978208 flanks -> location of rs186748904... whose flanks in turn map to
      # location of rs184077395!  zomg.
      # rs183189219 flanks -> location of rs185279212... whose flanks -> rs77636759!
      # rs190625971 flanks -> loc of rs185820511... flanks -> rs73341069
      # rs182418005 flanks -> loc of rs75047971 (16 bases left)
      # rs189409390 flanks -> loc of rs33979934
      # rs183733596 flanks -> loc of rs192959614
      # rs187447068 flanks -> loc of rs184298957 (2 bases left) whose flanks -> loc of rs114504955
      # rs182585191 flanks -> loc of rs190197603 (2 bases left)
      # rs191098134 flanks -> loc of rs187262203 (2 bases left)
      # rs188104024 flanks -> loc of rs1050462 on chr6 (1 base right), chr6_cox_hap2
      # rs189964963 flanks -> loc of rs185255459 (2 bases left)
      # All of those rs's have a single submitter: ILLUMINA|Cardio-Metabo_Chip
      # Reported that in VR-28.
  
      # Compare Ming's strand-change list with mine:
    grep -v ^# firstBuild/b141_rs_orientation_flip_from_b138_list.txt \
      | awk -F"|" '{print "rs"$4}' \
      | sort \
        > ids_ming_b138_b141_strandChange.txt
      perl -wpe 's/^.*(rs\d+) .*/$1/' hg19_snp138_snp141_locOriEtc_strandChange.txt \
      | sort \
        > ids_hg19_snp138_snp141_locOriEtc_strandChange.txt
      # Ming's list but not mine (map to different locations, looks OK in b141)
      comm -23 ids_ming_b138_b141_strandChange.txt ids_hg19_snp138_snp141_locOriEtc_strandChange.txt | wc -l
  #255
      # Mine but not Ming's:
      comm -13 ids_ming_b138_b141_strandChange.txt ids_hg19_snp138_snp141_locOriEtc_strandChange.txt | wc -l
  #18422
  
      # I think the way to go is to swap the strand unless b138 had ObservedMismatch.
      # To do it right, we need to modify the b141_SNPContigLoc_GRCh37p13 table and
      # then continue from addToDbSnp.
      # First make a list of snp_id's that need their strand swapped:
      awk '$5 == "n"' hg19_snp138_snp141_locOriEtc_strandChange.txt \
      | perl -wpe 's/.*\.rs(\d+).*/$1/' \
        > idsToSwapStrand.txt
      awk '$5 == "n"' hg19_snp138_snp141_locOriEtc_strandChange.txt \
      | perl -wpe 's/^.*rs(\d+) ([+-]) .*// || die; \
        ($snp_id, $strand) = ($1, $2); \
        $ori = ($strand eq "-") ? 1 : 0; \
        $_ = "update b141_SNPContigLoc_GRCh37p13 set orientation = $ori where snp_id = $snp_id;\n"' \
        > fixOrientation.sql
      hgsql hg19snp141 < fixOrientation.sql
  
      # Now continue from addToDbSnp:
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=addToDbSnp \
        >>& do.log & tail -f do.log
  # *** All done!
  
      # Upon examination of SNPs with weight > 1, it turned out that *none* of them
      # actually have MultipleAlignments - nor did they in snp138!  I think they were
      # assigned weight > 1 incorrectly - possibly due to GRCh37 alt/patch matches.
      # I asked dbSNP about it on 8/22 but no reply:
      #   https://ncbijira.ncbi.nlm.nih.gov/browse/VR-31
      # Meanwhile, this convinces me that the weights should all be 1:
      zcat snp141Mult.bed.gz | grep MultipleAlignments | wc -l
  #0
      zcat snp141Mult.bed.gz | cut -f 4 | sort > idsInMult.txt
      zcat ../../138/human/snp138Mult.bed.gz  | cut -f 4 | sort > idsIn138Mult.txt
      grep -Fwf idsInMult.txt idsIn138Mult.txt > idsInMultAlso138.txt
      wc -l idsInMultAlso138.txt
  #0 idsInMultAlso138.txt
      # So I am going to tweak all of their weights to 1, because with weight > 1 they
      # are not displayed by default!  We should drop the snp141Mult table because
      # there should be nothing in it -- they really did prune all multiple-mappers!
      mv snp141.bed.gz snp141BadWeights.bed.gz
      zcat snp141BadWeights.bed.gz \
      | awk -F"\t" 'BEGIN{OFS="\t";} {$17 = 1; print;}' \
      | gzip -c \
        > snp141.bed.gz
      hgLoadBed -tab -onServer -tmpDir=$TMPDIR -allowStartEqualEnd -type=bed6+ \
        hg19 snp141 -sqlTable=snp141.sql snp141.bed.gz
  
      # Now rebuild snp141{Common,Flagged} which were missing thousands of SNPs due to incorrect
      # weights.  Fortunately, I don't need to redo ortho alleles or masked sequences
      # because they filter out MultipleAlignments ignoring weight.
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=filter -stop=filter \
        >>& do.log & tail -f do.log
  # (before: snp141Common had 13764550 rows, snp141Flagged had 80472)
  # (after:  snp141Common has 13780063 rows, snp141Flagged has 87410)
      zcat snp141Mult.bed.gz | wc -l
  #0
      hgsql hg19 -e 'drop table snp141Mult;'
  
  
  ##############################################################################
  # SNP141 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 8/29/14 angie)
      # Redmine #13309
      mkdir /hive/data/genomes/hg19/bed/snp141Ortho
      cd /hive/data/genomes/hg19/bed/snp141Ortho
    # Filter snp141 to keep only uniquely mapped biallelic SNVs (class=single, length=1):
      zcat /hive/data/outside/dbSNP/141/human_hg19/snp141.bed.gz \
      | awk '$18 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
      | sort -u \
        > snp141ExcludeIds.txt
      wc -l snp141ExcludeIds.txt
  #304685 snp141ExcludeIds.txt
      # Glom all human info that we need for the final table onto the
      # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
      zcat /hive/data/outside/dbSNP/141/human_hg19/snp141.bed.gz \
      | awk '$3-$2 == 1 && $11 == "single" {print;}' \
      | grep -vFwf snp141ExcludeIds.txt \
      | awk 'BEGIN{OFS="\t";} \
          {print $1, $2, $3, \
                 $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
                 0, $6;}' \
        > snp141ForLiftOver.bed
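    # Rows now look something like (made-up rs):
    # chr1    10176   10177   rs12345|chr1|10176|10177|A/C|A|+    0   +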
  
      # Map coords to chimp using liftOver.
      mkdir run.liftOChimp
      cd run.liftOChimp
      mkdir split out
      splitFile ../snp141ForLiftOver.bed 10000 split/chunk
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro4.over.chain.gz \
          \{check out exists out/panTro4.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
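    # Each jobList line ends up looking like (schematic chunk name):
    # liftOver split/chunkNNN /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro4.over.chain.gz {check out exists out/panTro4.chunkNNN.bed} out/hg19.chunkNNN.unmapped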
      ssh ku
      cd /hive/data/genomes/hg19/bed/snp141Ortho/run.liftOChimp
      para make jobList
  #Completed: 5408 of 5408 jobs
  #CPU time in finished jobs:     312849s    5214.15m    86.90h    3.62d  0.010 y
  #IO & Wait Time:                 22692s     378.20m     6.30h    0.26d  0.001 y
  #Average job time:                  62s       1.03m     0.02h    0.00d
  #Longest finished job:             213s       3.55m     0.06h    0.00d
  #Submission to last job:          1287s      21.45m     0.36h    0.01d
  
      # Map coords to orangutan using liftOver.
      mkdir ../run.liftOPon
      cd ../run.liftOPon
      mkdir out
      ln -s ../run.liftOChimp/split .
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
          \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      para make jobList
  #Completed: 5408 of 5408 jobs
  #CPU time in finished jobs:     635602s   10593.37m   176.56h    7.36d  0.020 y
  #IO & Wait Time:                 23934s     398.89m     6.65h    0.28d  0.001 y
  #Average job time:                 122s       2.03m     0.03h    0.00d
  #Longest finished job:             451s       7.52m     0.13h    0.01d
  #Submission to last job:          1572s      26.20m     0.44h    0.02d
  
      # Map coords to macaque using liftOver.
      mkdir ../run.liftOMac
      cd ../run.liftOMac
      mkdir out
      ln -s ../run.liftOChimp/split .
      cp /dev/null jobList
      foreach f (split/chunk*)
        echo liftOver $f \
          /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac3.over.chain.gz \
          \{check out exists out/rheMac3.$f:t.bed\} out/hg19.$f:t.unmapped \
          >> jobList
      end
      para make jobList
  #Completed: 5408 of 5408 jobs
  #CPU time in finished jobs:     783935s   13065.58m   217.76h    9.07d  0.025 y
  #IO & Wait Time:                 28922s     482.03m     8.03h    0.33d  0.001 y
  #Average job time:                 150s       2.51m     0.04h    0.00d
  #Longest finished job:             506s       8.43m     0.14h    0.01d
  #Submission to last job:          1888s      31.47m     0.52h    0.02d
  
      cd /hive/data/genomes/hg19/bed/snp141Ortho
      # Concatenate the chimp results, sorting by chimp pos in order to
      # efficiently access 2bit sequence in getOrthoSeq.  The output of
      # that is then sorted by the glommed human info field, so that we
      # can use join to combine chimp and macaque results in the next step.
      # Ditto for macaque and orangutan.  Each command pipe takes ~15 minutes:
      sort -k1,1 -k2n,2n run.liftOChimp/out/panTro4.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro4/panTro4.2bit \
      | sort > panTro4.orthoGlom.txt
      sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
      | sort > ponAbe2.orthoGlom.txt
      sort -k1,1 -k2n,2n run.liftOMac/out/rheMac3.chunk*.bed \
      | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac3/rheMac3.2bit \
      | sort > rheMac3.orthoGlom.txt
      wc -l panTro4.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac3.orthoGlom.txt
  #   50837337 panTro4.orthoGlom.txt
  #   48476085 ponAbe2.orthoGlom.txt
  #   43632613 rheMac3.orthoGlom.txt
  
    # Use the glommed name field as a key to join up chimp, orang and macaque
    # allele data.  Include glommed name from both files because if only
      # file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
      # in the orthoGlom fields from each file, which are in the same order
      # as the chimp and macaque columns of hg18.snp128OrthoPanTro2RheMac2.
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        panTro4.orthoGlom.txt ponAbe2.orthoGlom.txt \
      | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
              else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
        > tmp.txt
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        tmp.txt rheMac3.orthoGlom.txt \
      | perl -wpe 'chomp; \
          ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
          $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
          ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
            split(/\|/, $glomKey); \
          $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
          $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
          print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                           $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
          s/^.*$//;' \
      | sort -k1,1 -k2n,2n > snp141OrthoPt4Pa2Rm3.bed
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
        -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
        hg19 snp141OrthoPt4Pa2Rm3 snp141OrthoPt4Pa2Rm3.bed
  #Read 52562974 elements of size 22 from snp141OrthoPt4Pa2Rm3.bed
  
      # Cleanup:
      rm -r run*/split tmp.txt *.orthoGlom.txt
      gzip snp141ExcludeIds.txt snp141ForLiftOver.bed snp141OrthoPt4Pa2Rm3.bed &
  
  
  ############################################################################
  # SNPMASKED SEQUENCE FOR SNP141 (DONE 8/29/14 angie)
      # Redmine #13309
      mkdir /hive/data/genomes/hg19/snp141Mask
      cd /hive/data/genomes/hg19/snp141Mask
      # Identify rsIds with various problems -- we will exclude those.
      zcat /hive/data/outside/dbSNP/141/human_hg19/snp141.bed.gz \
      | awk '$18 ~ /MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved/ {print $4;}' \
        | sort -u \
        > snp141ExcludeRsIds.txt
      zcat /hive/data/outside/dbSNP/141/human_hg19/snp141.bed.gz \
      | grep -vFwf snp141ExcludeRsIds.txt \
        > snp141Cleaned.bed
      wc -l snp141Cleaned.bed
  #59287443 snp141Cleaned.bed
  
      # Substitutions:
      mkdir substitutions
      snpMaskSingle snp141Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout diffObserved.txt \
      | faSplit byname stdin substitutions/
  #Masked 52908561 snps in 52908552 out of 3095693983 genomic bases
  #/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3095693983 (difference is 41467281)
      # Check that 41467281 is the total #bases in sequences with nothing in snp141Cleaned:
      grep -Fw single snp141Cleaned.bed | cut -f 1 | uniq > /data/tmp/1
      grep -vwf /data/tmp/1 ../chrom.sizes \
      | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
  #41467281
      # warnings about differing observed strings at same base position:
      wc -l diffObserved.txt
  #16 diffObserved.txt
      # -- small beans, and dbSNP is aware of thousands of SNPs that have clustering issues.
      # Make sure that sizes are identical, first diffs are normal -> IUPAC,
      # and first diffs' case is preserved:
      foreach f (substitutions/chr*.fa)
        faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ"
      end
  #chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 10176 (m != a)
  #chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 60522 (K != T)
  #...
  #(output OK -- ambiguous bases replacing [agct] at SNP positions)
      foreach f (substitutions/chr*.fa)
        echo $f:t:r
        mv $f $f:r.subst.fa
      end
      # Fire off a bunch of gzip jobs in parallel:
      ls -1 substitutions/*.fa | split -l 5
      foreach f (x??)
        gzip `cat $f` &
      end
      # Wait for backgrounded gzip jobs to complete
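    wait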
      rm x??
  
      # Insertions & deletions not done.  To date we have only offered substs for download.
      # If there is user demand, use template from snp131 above.
  
      # Clean up and prepare for download:
      gzip snp141Cleaned.bed &
      foreach d (substitutions)
        pushd $d
          md5sum *.gz > md5sum.txt
          cp /hive/data/genomes/hg19/snp138Mask/$d/README.txt .
        popd
      end
      # Edit the README.txt.
  
      # Create download links on hgwdev.
      mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp141Mask
      ln -s /hive/data/genomes/hg19/snp141Mask/substitutions/* \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp141Mask/
  
  
  ############################################################################
  # peptideAtlas peptides mapped to the genome
  
  # Aug 2014 build
# Build #433, based on Ensembl v75
  
  mkdir -p /hive/data/genomes/hg19/bed/peptideAtlas/2014-Aug
  cd /hive/data/genomes/hg19/bed/peptideAtlas/2014-Aug
  
  wget --no-check-certificate -c "https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetPeptides?organism_name=homo%20sapiens&apply_action=QUERY&output_mode=tsv" -O peptideatlas_9606.tsv
  
  wget http://www.peptideatlas.org/builds/human/201408/coordinate_mapping.txt -O peptideAtlas.201408.tsv
  wc -l peptideAtlas.201408.tsv
  # 12858738
  
  grep -v UNKNOWN peptideAtlas.201408.tsv > ens.txt
   wc -l ens.txt
  # 5449240 ens.txt
  
# tripled from 2013! (the 2013 build's ens.txt had 1396070 lines)
  
  # 42% mapped (compared to 30% prev) -- 5.4M total mapped
  
  perl toBed.pl < ens.txt > ens.bed
  grep -v chrH ens.bed | grep -v chrG | sed 's/chrMT/chrM/'| sort | uniq > ens.clean.bed
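# (drops chrHSCHR* alt-haplotype and chrGL* scaffold mappings, which don't match
# UCSC chrom names, and renames chrMT to chrM)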
  wc -l ens.clean.bed
  # 4868641 ens.clean.bed
  
  # To preview, load just the regions
  
  # bedDetail 6
  cat << 'EOF' > peptideAtlas.sql
  CREATE TABLE peptideAtlas2014 (
      chrom varchar(255) not null,   # Reference sequence chromosome or scaffold
      chromStart int unsigned not null,   # Start position in chromosome
      chromEnd int unsigned not null,     # End position in chromosome
      name varchar(255) not null, # Short Name of item
      id varchar(255) not null,   # ID to be used in URL to link back
      description longblob not null, # Long description of item for the details page
      #Indices
      INDEX(chrom, chromStart)
  );
  'EOF'
  sort -k1,1 -k2,2n ens.clean.bed > ens.sorted.bed
  hgLoadSqlTab hg19 peptideAtlasRegions2014 peptideAtlas.sql ens.sorted.bed
  
  
  # To link up exons, we need the full database
  
  wget http://www.peptideatlas.org/builds/human/201408/atlas_build_433.mysql.gz
  hgsql '' -e 'create database peptideAtlas2014'
  zcat atlas_build_433.mysql.gz | hgsql peptideAtlas2014
  
  csh scanFull.csh | grep TABLE
  #TABLE:    peptide : 1021823
  #TABLE:    peptide_instance : 1021823
  #TABLE:    peptide_instance_search_batch : 18156629
  #TABLE:    peptide_mapping : 12821066
  
  csh getSamples.csh > samples.tab
  
tail -n +2 samples.tab | hgLoadSqlTab hgFixed peptideAtlasSample433 ~/kent/src/hg/lib/peptideAtlasSample.sql stdin
  
  # sample Ids aren't correct when reloaded into the DB -- they sent me this to use instead:
  wget http://www.peptideatlas.org/builds/human/201408/peptide_instance_sample.tar.gz
tar xvfz peptide_instance_sample.tar.gz
  
  # exploratory
  
  hgsql -N peptideAtlas2014 -e 'select min(peptide_length), max(peptide_length) from peptide'
  +------+------+
  |    7 |   83 |
  +------+------+
  
  hgsql -N peptideAtlas2014 -e 'select peptide_length from peptide order by peptide_length' | textHistogram stdin -binSize=3 -minVal=7 -maxBinCount=30 -log
    7 *********************************************************** 187407
   10 ************************************************************ 237387
   13 *********************************************************** 198221
   16 ********************************************************** 141932
   19 ******************************************************** 94676
   22 ***************************************************** 61945
   25 *************************************************** 39636
   28 ************************************************* 25360
   31 ********************************************** 14571
   34 ******************************************** 8632
   37 ***************************************** 5069
   40 *************************************** 2992
   43 ************************************ 1677
   46 ********************************* 1000
   49 ******************************* 568
   52 **************************** 330
   55 ************************* 174
   58 *********************** 126
   61 ******************* 51
   64 **************** 30
   67 *************** 20
   70 ********** 8
   73 ********* 7
   76  1
   79 *** 2
   82  1
  
  
  hgsql peptideAtlas2014 -e "select peptide_instance_id as peptide, matched_biosequence_id as bioseq, chromosome as chrom, start_in_chromosome as chromStart, end_in_chromosome as chromEnd, strand from peptide_mapping where chromosome='X' group by peptide, chromStart, chromEnd, strand order by peptide, chromStart, bioseq" > chrX_mappings.tab
  
  # max #mappings = 203
  
  hgsql peptideAtlas2014 -e "select peptide.peptide_accession as accession, peptide.peptide_sequence as sequence, peptide.SSRCalc_relative_hydrophobicity as hydrophobicity, peptide_instance.empirical_proteotypic_score as proteotypicScore, peptide_instance.n_samples as sampleCount from peptide, peptide_instance where peptide_instance.peptide_id=peptide.peptide_id order by peptide.peptide_accession" > peptides.tab
  
  hgsql peptideAtlas2014 -e "select peptide.peptide_accession as accession, peptide.peptide_length as length, peptide.peptide_sequence as sequence, peptide.SSRCalc_relative_hydrophobicity as hydrophobicity, peptide_instance.empirical_proteotypic_score as proteotypicScore, peptide_instance.n_samples as sampleCount from peptide, peptide_instance where peptide_instance.peptide_id=peptide.peptide_id order by peptide.peptide_accession" > peptides.len.tab
  
  wc -l peptides.tab
  #1021823 peptides.tab
  
  # create protein fasta file of sequences for download
  awk 'NR != 1 {printf(">%s\n%s\n", $1, $2)}' peptides.tab > peptides.fa
  mkdir downloads
  mv peptides.fa downloads
  cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg19
  mkdir peptideAtlas
  cd peptideAtlas
  ln -s /hive/data/genomes/hg19/bed/peptideAtlas/2014-Aug/downloads/peptides.fa .
  md5sum peptides.fa > md5sum.txt
  cat > README.txt << 'EOF'
  This directory contains peptide sequences from the August 2014 Human Build of PeptideAtlas (peptideatlas.org).
  'EOF'
  
  #tail -n +2 peptides.tab | \
      #hgLoadSqlTab hgFixed peptideAtlas2014Peptides ~/kent/src/hg/lib/peptideAtlasPeptide.sql stdin
  
  
  # BUG FIX: RM #15998
  # Reload peptideAtlas2014Peptides table with columns in the right order
  # 2015-09-08 (kate)
  tail -n +2 peptides.tab | awk '{OFS="\t"}{ print $1, $2, $5, $3, $4}' | \
      hgLoadSqlTab hgFixed peptideAtlas2014PeptidesFixed ~/kent/src/hg/lib/peptideAtlasPeptide.sql stdin
  hgsql hgFixed -e 'alter table peptideAtlas2014Peptides rename to peptideAtlas2014Peptides_old'
  hgsql hgFixed -e 'alter table peptideAtlas2014PeptidesFixed rename to peptideAtlas2014Peptides'
  
  
# extract mappings with some metadata (biosequence matched (protein and gene))
  # use number of samples where the peptide was observed as the score (to be scaled)
  # filter out patches (which we do not annotate), and contig-based mappings
  
  hgsql peptideAtlas2014 -e "select concat('chr', peptide_mapping.chromosome) as chr, peptide_mapping.start_in_chromosome-1 as chromStart, peptide_mapping.end_in_chromosome as chromEnd, peptide.peptide_accession as name, peptide_instance.n_samples as score, peptide_mapping.strand as strand, biosequence.biosequence_name as biosequence, biosequence.biosequence_gene_name as gene from peptide_mapping, peptide, peptide_instance, biosequence where peptide_mapping.chromosome not like '%PATCH' and peptide_mapping.chromosome not like '%CTG%' and peptide_mapping.chromosome not like '%TEST' and peptide_mapping.chromosome <> '0' and peptide.peptide_id=peptide_instance.peptide_id and peptide_mapping.peptide_instance_id=peptide_instance.peptide_instance_id and peptide_mapping.matched_biosequence_id=biosequence.biosequence_id order by chr, biosequence, chromStart" > mappings.atlas.bed
  
  # lift (actually, drop) alt haplotype mappings to 0-based UCSC hap chrom coords
  tail -n +2 mappings.atlas.bed | sed 's/chrHSCHR/HSCHR/' | \
      liftUp mappings.lifted.bed /hive/data/genomes/hg19/jkStuff/ensGene.haplotype.lift carry stdin
  
  # translate contig names to UCSC chromosome names
  
  sed \
      -e 's/GL000191.1/1_gl000191_random/' \
      -e 's/GL000194.1/4_gl000194_random/' \
      -e 's/GL000195.1/7_gl000195_random/' \
      -e 's/GL000201.1/9_gl000201_random/' \
      -e 's/GL000205.1/17_gl000205_random/' \
      -e 's/GL000209.1/19_gl000209_random/' \
      -e 's/GL000213.1/Un_gl000213/' \
      -e 's/GL000218.1/Un_gl000218/' \
      -e 's/GL000219.1/Un_gl000219/' \
      -e 's/GL000222.1/Un_gl000222/' \
      -e 's/GL000223.1/Un_gl000223/' \
      -e 's/GL000242.1/Un_gl000242/' \
      -e 's/MT/M/' \
          mappings.lifted.bed > mappings.bed
  
  # link up exons and remove dups (peptides mapped to multiple targets at the same genomic location)
  
  # experiments
  # %splitFileByColumn mappings.bed mappings
  # creates mappings/chr*.bed
  
  # ?? blat -t=dnax -q=prot -out=pslx /hive/data/genomes/hg19/hg19.2bit/ peptides.fa peptides.psl
  # fed some sequences (test.fa) to webBlat... it finds more matches, but doesn't extend across
  # exon boundaries as well (w/ default config).  In my sample, the positions corresponding
  # to peptideAtlas positions were 100% matches.
  # NOTE: many sequences are shorter than blat's required 14
  
# link exons and de-dup
  
  # gitted in kent/src/hg/makeDb/outside/peptideAtlas
  linkPeptideMappings.pl mappings.bed > mappings.bed12
  
  wc -l mappings.bed mappings.bed12
  #  5145435 mappings.bed
  #  1189307 mappings.bed12
  
  bedSort mappings.bed12 mappings.sorted.bed12
  hgLoadBed hg19 peptideAtlas2014 mappings.bed12
  # Read 1189306 elements of size 12 from mappings.bed12
  
  # NOTE: consider using pslMap to lift these to hg38
  
  # count number of peptide sequences in sample dump
  awk -F',' '{print $2}' peptide_inst*tsv | sort | uniq | wc -l
  # 1021824
  
  # check if peptides match translated genome (if so we can display w/o loading in main table)
  sequenceForBed -db=hg19 -keepName -bedIn=mappings.bed12 -fastaOut=mappings.fa
  faTrans mappings.fa peptides.fa
  faToTab -type=protein peptides.fa stdout | sort | uniq > peptides.fromMappings.tab
  tail -n +2 peptides.tab | awk '{print $1,"\t", $2}' | diff - peptides.fromMappings.tab
  # yes -- they are exact matches
  
  
  ##############
  # experiments
  
featureBits hg19 peptideAtlas2014
  # 15624786 bases of 2897316137 (0.539%) in intersection
  
  featureBits hg19 -enrichment refGene:cds peptideAtlas2014
  # refGene:cds 1.212%, peptideAtlas2014 0.539%, both 0.533%, cover 44.00%, enrich 81.58x
  
  # alternate mappings for the peptides, using FenyoLab PGx
  # Drag/Drop peptides.fa to web tool
  
  mkdir pgx
  cd pgx
  bedSort peptideAtlas.pgx.bed peptideAtlas.pgx.sorted.bed
  wc -l mappings.sorted.bed12 pgx/*.sorted.bed
  #1189306 mappings.sorted.bed12
  #1111981 pgx/peptideAtlas.pgx.sorted.bed
  
  hgLoadBed hg19 peptideAtlas2014Pgx peptideAtlas.pgx.sorted.bed
  
  
  hgsql hg19 -e 'select distinct(name) from peptideAtlas2014 order by name' > peptides.mapped.txt
  hgsql hg19 -e 'select distinct(accession) from hgFixed.peptideAtlas2014Peptides order by accession' > peptides.all.txt
  
  # diff, awk, sed > peptides.unmapped.txt
  
  wc -l peptides.unmapped.txt
  # 20498
  
  
  ##############################################################################
  # DBSNP B142 / SNP142 (DONE 10/14/15)
  # originally done 11/03/14
  # 11/17/14: Allele dump file was outdated at time of download -- it was re-dumped a couple weeks
  #           after release.  snp142 was missing some frequencies due to incomplete join with Allele.
  #           Rebuilding all tables that have allele frequencies (just the track tables & exceptions).
  # 2/17/15: User reported missing validation info (#14836); Jonathan found that SNP.bcp.gz
  #          had been updated 1/30/15, argh.  Rebuilding parts that depend on the SNP table.
  # 10/14/15: Zero out the allele freq columns for snp142 rows on the '-' strand that include
  #           1000GENOMES in submitters, because they are incorrect (Bug #16204).
  #           Regenerate snp142{Common,Flagged,Mult}.
      # Redmine #14189
      mkdir -p /hive/data/outside/dbSNP/142/human_hg19
      cd /hive/data/outside/dbSNP/142/human_hg19
      # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/
      # to find the subdir name to use as orgDir below (human_9606_b142_GRCh37p13 in this case).
      # Go to that subdirectory, then to database/organism_data/ and look for files
      # whose names start with b142_* and may or may not end with a suffix that identifies
      # the build assembly version or some annotation version.  If there is a suffix shared
      # by all b142_* files, add that to config.ra as the "buildAssembly".
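    # Here all b142_* files share the suffix _105 (e.g. b142_SNPContigLoc_105.bcp.gz),
    # so buildAssembly is 105.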
      cat > config.ra <<EOF
  db hg19
  orgDir human_9606_b142_GRCh37p13
  build 142
  buildAssembly 105
  liftUp /hive/data/genomes/hg19/jkStuff/liftContigs.lft
  refAssemblyLabel GRCh37.p13
  EOF
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >>& do.log & tail -f do.log
  
      # It fails with a list of unliftable contigs -- look at dbSnpContigsNotInUcsc.txt
      # to make sure they're all patch contigs as expected, then start over, ignoring those:
      cut -f 2 cantLiftUpSeqNames.txt > patchContigs.txt
      cat >> config.ra <<EOF
  ignoreDbSnpContigsFile patchContigs.txt
  EOF
      # Continue at loadDbSnp.  Stop after loadDbSnp to identify SNPs whose sequences didn't
      # make it into the rs_fasta dump files; we can fetch those separately from dbSNP's
      # batch query.
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp -stop=loadDbSnp \
        >>& do.log &
      tail -f do.log
  
      # Compare rs IDs in rs_fasta with b142_SNPContigLoc_105 to see which IDs were
      # omitted from the rs_fasta dump.
      zcat rs_fasta/rs*.fas.gz \
      | perl -wne 'if (/^>/) { s/^>gnl\|dbSNP\|(rs\d+) .*/$1/ || die; print; }' \
      | sort -u > rsFastaIds.txt
      hgsql hg19snp142 -NBe 'select concat("rs", snp_id) from b142_SNPContigLoc_105' \
      | sort -u > contigLocIds.txt
      comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
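    # (comm -13 prints lines unique to the second file, i.e. IDs that are in
    # ContigLoc but have no sequence in rs_fasta.)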
      wc -l missingFromRsFasta.txt
  #6 missingFromRsFasta.txt
      # A lot fewer than in snp141!
      # Fetch fasta for those ids from dbSNP's batch_query.
      # Go to this URL:
      #   http://www.ncbi.nlm.nih.gov/projects/SNP/dbSNP.cgi?list=rsfile
      # Enter your email address, select FASTA as output format, and upload missingFromRsFasta.txt
      # Wait for email with link to results, then download the results:
  
      # NOTE: after waiting >1 hour with no email from dbSNP, since there are only 6 this time,
    # I manually checked them on dbSNP's web site.  All 6 had a message like this:
  #rs483352939 was deleted on Sep 29, 2014 because its subsnp_id was deleted:
      # so rs_fasta is actually complete this time and those 6 in SNPContigLoc can be dropped!
      # Here's what I would have done if actual fasta sequences were returned:
      wget -O rs_fasta/rs_chBatchQuery.fas.gz $fileFromDbSnpEmail
      # Now continue from the addToDbSnp step.
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue addToDbSnp \
        >>& do.log & tail -f do.log
  # *** All done!
      # Wow, it almost never runs so smoothly; there are usually some formatting weirdnesses
      # that gum up snpNcbiToUcsc in the translate step.
      # However, as in b141, there are a bunch of items with weight > 1 but not MultipleAlignments.
      # I updated https://ncbijira.ncbi.nlm.nih.gov/browse/VR-31 with a comment.
      # Fix them up locally:
      mv snp142.bed.gz snp142BadWeights.bed.gz
      zcat snp142BadWeights.bed.gz \
      | awk -F"\t" 'BEGIN{OFS="\t";} $18 !~ /MultipleAlignments/ {$17 = 1;} {print;}' \
      | gzip -c \
         > snp142.bed.gz
      hgLoadBed -tab -onServer -tmpDir=$TMPDIR -allowStartEqualEnd -type=bed6+ \
        hg19 snp142 -sqlTable=snp142.sql snp142.bed.gz
  #Read 113128211 elements of size 25 from snp142.bed.gz
  
      # Now rebuild snp142{Common,Flagged} which were missing thousands of SNPs due to incorrect
      # weights.
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=filter -stop=filter \
        >>& do.log & tail -f do.log
  # *** All done!  (through the 'filter' step)
  
  
      # NOTE: from here on out is exploratory stuff -- I ended up ignoring the extra files.
      # Download some of dbSNP's misc files for 142 to see if we can/should work them in:
      wget ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/misc/1kg_70K_phase_update.txt.gz
      wget ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/misc/1kg_70K_phase_update.txt.README
      gunzip 1kg_70K_phase_update.txt.gz
      # The first version of that file had some incorrect entries; reported to dbSNP
      # (https://ncbijira.ncbi.nlm.nih.gov/browse/VR-32); re-downloaded after fixes.
      # column headers from README:
  #subsnp_id|snp_id|loc_snp_id|old_chr|old_pos|old_ref_allele|old_alt_allele|new_chr|new_pos|new_ref|new_alt|
      # First line moves the position 8 bases to the left:
  #1367645320|576104692|PHASE3_V1_2774|1|723761|-|GAGAGAGG|1|723753|-|GAGAGAGG|
      # Look for allele changes:
    cut -d\| -f 2,4- --output-delimiter='	' 1kg_70K_phase_update.txt \
      | awk '$4 != $8 || $5 != $9' \
      | head
  #532008114       1       5366489 -       TTTTG   1       5366485 -       TTTGT
  #543956793       1       6266877 AG      -       1       6266874 GA      -
  #558079733       1       7056752 GAA     -       1       7056750 AAG     -
  #566669620       1       8121175 -       AATAATAATAA     1       8121167 -       AATAATAAAAT
      # Yep, looks like left-shifting of indel alignments.  I think the first one could be
      # shifted even further left (5366474 TTTTG) but maybe I got my eyes crossed.
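    # (Left-shift example: the first line above moves rs576104692's GAGAGAGG
    # insertion from 1:723761 to 1:723753 -- within a repeat, the same insertion
    # can be placed at either end, and dbSNP now reports the leftmost placement.)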
  
      # I found 32 cases of an insertion turning into an SNV while flank alignment
      # indicates insertion - https://ncbijira.ncbi.nlm.nih.gov/browse/VR-33
      # Sent the output from this:
      awk -F\| '$6 == "-" && $10 != "-"' 1kg_70K_phase_update.txt
  
      # Do any of these change allele length?
    cut -d\| -f 2,4- --output-delimiter='	' 1kg_70K_phase_update.txt \
      | sed -re 's/\t-\t/\t\t/g' \
      | awk -F"\t" 'length($4) != length($8) || length($5) != length($9)' \
      | sed -re 's/\t\t/\t-\t/g' \
      | head
  #561146841       1       10948353        -       AAAT    1       10948349        -       AAATAAATAAAT
  #561146841       1       10948353        -       AAATAAATAAAT    1       10948349        -       AAATAAATAAATAAAT
  #561146841       1       10948353        -       AAATAAATAAATAAAT        1       10948349        -       AAATAAATAAATAAATAAAT
  #558127253       1       27541302        -       TTTA    1       27541298        -       TTTATTTA
  #561583565       1       28001357        -       TTTTC   1       28001352        -       TTTTCTTTTC
      # Ah, 561146841 is all jumbled there because it has multiple alt alleles --
      # it should simply move 4 bases to the left.
      # BLAT doesn't find rs558127253's flanking seqs! But it's messed up by mult alleles too.
      # Blatting reminds me that changing the position should also mean changing the flanking
      # sequences, and there's not enough info to do that reliably.  So I think I won't apply
      # the changes after all.
  
      # 11/17/14: Allele dump file was outdated at time of download -- it was re-dumped a couple weeks
      # after release.  snp142 was missing some frequencies due to incomplete join with Allele.
      # Rebuilding all tables that have allele frequencies (just the track tables & exceptions).
      # First download the updated Allele dump file:
      cd /hive/data/outside/dbSNP/142/shared
      rm Allele.bcp.gz
      wget --timestamping --no-verbose ftp://ftp.ncbi.nih.gov/snp/database/shared_data/Allele.bcp.gz
      cd /hive/data/outside/dbSNP/142/human_hg19
      # Clear out old data from Allele table, then reload with new dump file as in loadDbSnp.csh:
      hgsql hg19snp142 -e 'delete from Allele'
      zcat /hive/data/outside/dbSNP/142/shared/Allele.bcp.gz \
      | perl -wpe 's/(\d\d:\d\d:\d\d)\.\d+/$1/g; s/\t(\t|\n)/\t\\N$1/g; s/\t(\t|\n)/\t\\N$1/g;' \
      | hgLoadSqlTab -oldTable hg19snp142 Allele placeholder stdin
      hgsql hg19snp142 -e 'alter table Allele add index (allele_id);'
      # Now update ucscAlleleFreq as in addToDbSnp.csh:
      /cluster/home/angie/kent/src/hg/utils/automation/snpAddTGPAlleleFreq.pl hg19snp142 \
        -contigLoc=b142_SNPContigLoc_105 -deDupTGP \
        > ucscAlleleFreq.txt
      hgLoadSqlTab hg19snp142 ucscAlleleFreq{,.sql,.txt}
      # Now re-run from bigJoin onward.
      mkdir `cat workingDir`
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=bigJoin \
        >>& do.log & tail -f do.log
  
      #########################################################################
      # 1/23/15: Jonathan noticed that NonIntegerChromCount's count in snp142ExceptionDesc
      # was much higher than the number of snp142 rows with NonIntegerChromCount; turns out
    # it was tallied per allele, not per variant.  Fixed; regenerate:
      cd /hive/data/outside/dbSNP/142/human_hg19
      snpNcbiToUcsc -snp132Ext -par=par.bed.gz ucscNcbiSnp.bed.gz /hive/data/genomes/hg19/hg19.2bit \
        snp142RedoExceptionDesc
      # Takes ~45min.  Make sure there are no changes to snp142.bed:
      zcat snp142.bed.gz > /data/tmp/snp142.bed
      cmp /data/tmp/snp142.bed snp142RedoExceptionDesc.bed
      # No output -- good.
      rm /data/tmp/snp142.bed
      mv snp142ExceptionDesc.tab.gz snp142ExceptionDesc.tab.bak.gz
      mv snp142RedoExceptionDescExceptionDesc.tab snp142ExceptionDesc.tab
      hgLoadSqlTab hg19 snp142ExceptionDesc $HOME/kent/src/hg/lib/snp125ExceptionDesc.sql \
        snp142ExceptionDesc.tab
      rm snp142RedoExceptionDesc*
  
      #########################################################################
      # 2/17/15: User reported missing validation info (#14836); Jonathan found that SNP.bcp.gz
      #          had been updated 1/30/15, argh.  Rebuilding parts that depend on the SNP table.
      # First get a baseline, so we know whether the updated file has fixed the problem:
    hgsql hg19 -e 'select count(*) from snp142 where valid = "unknown"'
  #| 107373843 |
  
      cd /hive/data/outside/dbSNP/142/human_hg19/data
      mv SNP.bcp.gz SNP.orig.bcp.gz
      # dbSNP updated only the GRCh38 copy of the file, but SNP.bcp.gz has assembly-independent
      # info so just link to the GRCh38 copy (see hg38/variation.txt):
      ln -s ../../human_hg38/data/SNP.bcp.gz .
    # Empty out hg19snp142.SNP, because our loading command uses -oldTable (the table
    # was already created from table.sql):
      hgsql hg19snp142 -e 'delete from SNP'
      # from loadDbSnp.csh:
      zcat /hive/data/outside/dbSNP/142/human_hg19/data/SNP.bcp.gz \
      | perl -wpe 's/(\d\d:\d\d:\d\d)\.\d+/$1/g; s/\t(\t|\n)/\t\\N$1/g; s/\t(\t|\n)/\t\\N$1/g;' \
      | hgLoadSqlTab -oldTable hg19snp142 SNP placeholder stdin
      # Make sure there's no need to (re)create the index on snp_id as in loadDbSnp.csh.
      hgsql hg19snp142 -e 'show index from SNP'
  #| SNP   |          1 | snp_id   |            1 | snp_id      | A         |   112745696 |     NULL | NULL   | YES  | BTREE      |         |               |
      # yep, the index is still there.
      # Now re-run from bigJoin onward.
      mkdir -p `cat workingDir`
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=bigJoin -stop=filter \
        >>& do.log & tail -f do.log
      # Make sure the updated file has fixed the problem (i.e. has << 107373843 missing 'valid' vals):
    hgsql hg19 -e 'select count(*) from snp142 where valid = "unknown"'
  #| 24180022 |
      # 24M out of 113M is significantly less than 107M/113M, but still -- that's a lot!!
      # By way of comparison, snp138 has -- wow, 20M out of 65M!  Never realized it was that bad.
  
      #########################################################################
      # 10/14/15: Zero out the allele freq columns for snp142 rows on the '-' strand that include
      #           1000GENOMES in submitters, because they are incorrect (Bug #16204).
      #           Regenerate snp142{Common,Flagged,Mult}.
      cd /hive/data/outside/dbSNP/142/human_hg19
      hgsql hg19 -e 'update snp142 \
                       set alleleFreqCount=0, alleles="", alleleNs="", alleleFreqs="" \
                       where strand = "-" and find_in_set("1000GENOMES", submitters);'
      mv snp142.bed.gz snp142.150219.bed.gz
      zcat snp142.150219.bed.gz \
      | awk -F"\t" 'BEGIN{OFS="\t";} \
          ($6 == "-" && $20 ~ /1000GENOMES/) { $21=0; $22 = ""; $23 = ""; $24 = ""; } {print;}' \
      > snp142.bed
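    # ($6=strand, $20=submitters; $21-$24 are alleleFreqCount, alleles, alleleNs
    # and alleleFreqs, the same columns zeroed by the SQL update above.)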
      # Make sure the changes are as intended:
      zcat snp142.150219.bed.gz | diff - snp142.bed | less
      gzip snp142.bed
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=filter -stop=filter \
        >>& do.log & tail -f do.log
  # *** All done!  (through the 'filter' step)
  
  
  ##############################################################################
  # SNP142 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 11/05/14 angie)
      # Redmine #13309
      mkdir /hive/data/genomes/hg19/bed/snp142Ortho
      cd /hive/data/genomes/hg19/bed/snp142Ortho
    # Filter snp142 to keep only uniquely mapped biallelic SNVs (class=single, length=1):
      zcat /hive/data/outside/dbSNP/142/human_hg19/snp142.bed.gz \
      | awk '$18 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
      | sort -u \
        > snp142ExcludeIds.txt
      wc -l snp142ExcludeIds.txt
  #905216 snp142ExcludeIds.txt
      # Glom all human info that we need for the final table onto the
      # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
      zcat /hive/data/outside/dbSNP/142/human_hg19/snp142.bed.gz \
      | awk '$3-$2 == 1 && $11 == "single" {print;}' \
      | grep -vFwf snp142ExcludeIds.txt \
      | awk 'BEGIN{OFS="\t";} \
          {print $1, $2, $3, \
                 $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
                 0, $6;}' \
        > snp142ForLiftOver.bed
  
      # Do a cluster run to use liftOver to get the other species' coords
      # and get the species' "allele" (reference assembly base) at that location.
      # End with a lexical sort because we're going to join these files later.
    cat > liftOne.csh <<'EOF'
  #!/bin/csh -ef
  set chunkFile = $1
  set db = $2
  set outFile = $3
  set Db = `echo $db | perl -wpe 's/(\S+)/\u$1/'`
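# $Db is $db with the first letter capitalized (e.g. panTro4 -> PanTro4),
# to match the hg19To$Db liftOver file name: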
  set liftOverFile = /hive/data/genomes/hg19/bed/liftOver/hg19To$Db.over.chain.gz
  set other2bit = /hive/data/genomes/$db/$db.2bit
  liftOver $chunkFile $liftOverFile stdout /dev/null \
  | $HOME/kent/src/hg/snp/snpLoad/getOrthoSeq.pl $other2bit \
  | sort > $outFile
  'EOF'
      chmod a+x liftOne.csh
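    # Each cluster job then runs something like (schematic chunk name):
    #   ../liftOne.csh split/chunkNNN panTro4 out/panTro4.chunkNNN.bed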
  
      # Map coords to chimp using liftOver.
      mkdir run.liftOver
      cd run.liftOver
      mkdir split out
      # NOTE FOR NEXT TIME: make it 20000
      splitFile ../snp142ForLiftOver.bed 10000 split/chunk
      cp /dev/null jobList
      foreach chunkFile (split/chunk*)
        set chunk = $chunkFile:t:r
        foreach db (panTro4 ponAbe2 rheMac3)
          echo ../liftOne.csh $chunkFile $db \{check out exists out/$db.$chunk.bed\} \
            >> jobList
        end
      end
      ssh ku
      screen -S ortho -t ortho
      cd /hive/data/genomes/hg19/bed/snp142Ortho/run.liftOver
      para make jobList
  #Completed: 30927 of 30927 jobs
  #CPU time in finished jobs:    5106791s   85113.19m  1418.55h   59.11d  0.162 y
  #IO & Wait Time:                 96849s    1614.14m    26.90h    1.12d  0.003 y
  #Average job time:                 168s       2.80m     0.05h    0.00d
  #Longest finished job:            1024s      17.07m     0.28h    0.01d
  #Submission to last job:          6281s     104.68m     1.74h    0.07d
  
      cd /hive/data/genomes/hg19/bed/snp142Ortho
      # Join the results for each chunk:
      cat > joinOne.csh <<'EOF'
  #!/bin/csh -ef
  set chimpFile = $1
  set orangFile = $2
  set macFile = $3
  set outFile = $4
  set tmpFile = `mktemp`
      # Use the glommed name field as a key to join up chimp, orang and macaque
      # allele data.  Include glommed name from both files because if only
      # file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
      # in the orthoGlom fields from each file, which are in the same order
      # as the chimp and macaque columns of hg18.snp128OrthoPanTro2RheMac2.
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        $chimpFile $orangFile \
      | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
              else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
        > $tmpFile
      join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
        -a 1 -a 2 -e '?' \
        $tmpFile $macFile \
      | perl -wpe 'chomp; \
          ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
          $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
          ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
            split(/\|/, $glomKey); \
          $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
          $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
          print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                           $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                           $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                           $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
          s/^.*$//;' \
          > $outFile
       rm $tmpFile
  'EOF'
      chmod a+x joinOne.csh
      mkdir /hive/data/genomes/hg19/bed/snp142Ortho/run.join
      cd /hive/data/genomes/hg19/bed/snp142Ortho/run.join
      mkdir out
      ln -s ../run.liftOver/split .
      cp /dev/null jobList
      foreach f (split/chunk*)
        set chunk = $f:t
        echo ../joinOne.csh ../run.liftOver/out/{panTro4,ponAbe2,rheMac3}.$chunk.bed \
          \{check out exists out/$chunk.bed\} \
          >> jobList
      end
      para make jobList
  #Completed: 10309 of 10309 jobs
  #CPU time in finished jobs:       2779s      46.32m     0.77h    0.03d  0.000 y
  #IO & Wait Time:                 44119s     735.32m    12.26h    0.51d  0.001 y
  #Average job time:                   5s       0.08m     0.00h    0.00d
  #Longest finished job:              52s       0.87m     0.01h    0.00d
  #Submission to last job:          1250s      20.83m     0.35h    0.01d
  
    # Back on hgwdev, sort all of the joined results together (~45mins) & load table.
      cd /hive/data/genomes/hg19/bed/snp142Ortho
      sort -k1,1 -k2n,2n run.join/out/chunk*.bed > snp142OrthoPt4Pa2Rm3.bed
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
        -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
        hg19 snp142OrthoPt4Pa2Rm3 snp142OrthoPt4Pa2Rm3.bed
  #Read 100654051 elements of size 22 from snp142OrthoPt4Pa2Rm3.bed
  
      # Cleanup:
      rm -r run*/out run*/split
      gzip snp142ExcludeIds.txt snp142ForLiftOver.bed snp142OrthoPt4Pa2Rm3.bed &
  
  
  ##############################################################################
  # SNPMASKED SEQUENCE FOR SNP142 (DONE 11/5/14 angie)
      # Redmine #13309
      mkdir /hive/data/genomes/hg19/snp142Mask
      cd /hive/data/genomes/hg19/snp142Mask
      # Identify rsIds with various problems -- we will exclude those.
      zcat /hive/data/outside/dbSNP/142/human_hg19/snp142.bed.gz \
      | awk '$18 ~ /MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved/ {print $4;}' \
        | sort -u \
        > snp142ExcludeRsIds.txt
      zcat /hive/data/outside/dbSNP/142/human_hg19/snp142.bed.gz \
      | grep -vFwf snp142ExcludeRsIds.txt \
        > snp142Cleaned.bed
      wc -l snp142Cleaned.bed
  #110419467 snp142Cleaned.bed
  
      # Substitutions:
      mkdir substitutions
      snpMaskSingle snp142Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout diffObserved.txt \
      | faSplit byname stdin substitutions/
  #Masked 102937577 snps in 102937575 out of 3137079239 genomic bases
  #/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3137079239 (difference is 82025)
      # Check that 82025 is the total #bases in sequences with nothing in snp142Cleaned:
      grep -Fw single snp142Cleaned.bed | cut -f 1 | uniq > /data/tmp/1
      grep -vwf /data/tmp/1 ../chrom.sizes \
      | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
  #82025
      # warnings about differing observed strings at same base position:
      wc -l diffObserved.txt
  #2 diffObserved.txt
      # peanuts!  good.
      # Make sure that sizes are identical, first diffs are normal -> IUPAC,
      # and first diffs' case is preserved:
      foreach f (substitutions/chr*.fa)
        faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ"
      end
  #chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 10107 (y != c)
  #chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 60493 (R != A)
  #...
  #(output OK -- ambiguous bases replacing [agct] at SNP positions)
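    # (IUPAC ambiguity codes: Y = C/T, R = A/G; the lowercase y shows that the
    # soft-masked case of the original base is preserved.)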
      foreach f (substitutions/chr*.fa)
        echo $f:t:r
        mv $f $f:r.subst.fa
      end
      # Fire off a bunch of gzip jobs in parallel:
      ls -1 substitutions/*.fa | split -l 5
      foreach f (x??)
        gzip `cat $f` &
      end
      # Wait for backgrounded gzip jobs to complete
      rm x??
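    # For next time, a simpler sketch (assuming GNU xargs with -P support; not
    # part of this run) that avoids the split/x?? temp files:
    #   ls substitutions/*.fa | xargs -n 5 -P 8 gzip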
  
      # Insertions & deletions not done.  To date we have only offered substs for download.
      # If there is user demand, use template from snp131 above.
  
      # Clean up and prepare for download:
      gzip snp142Cleaned.bed &
      foreach d (substitutions)
        pushd $d
          md5sum *.gz > md5sum.txt
          cp /hive/data/genomes/hg19/snp141Mask/$d/README.txt .
        popd
      end
      # Edit the README.txt.
  
      # Create download links on hgwdev.
      mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp142Mask
      ln -s /hive/data/genomes/hg19/snp142Mask/substitutions/* \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp142Mask/
  
  
  ##############################################################################
  # UniProt annotations (DONE 11/10/14 max) RM 13688
  #
  # the uniprot parser is part of the publications pipeline, see github.com/maximilianh/pubMunch
  ~max/projects/pubs/tools/pubParseDb uniprot 9606
  ~/kent/src/hg/utils/uniprotMutations/makeUniProtToHg.sh hg19
  uniprotLift 9606 hg19 uniProtTohg19.psl
hgBbiDbLink hg19 spMut /gbdb/hg19/bbi/spMut.bb
  hgBbiDbLink hg19 spStruct /gbdb/hg19/bbi/spStruct.bb
  hgBbiDbLink hg19 spAnnot /gbdb/hg19/bbi/spAnnot.bb
  
  
  ##############################################################################
  # ExAC (Exome Aggregation Consortium) Variants & Calling Regions (DONE 3/30/15 angie)
  
      set release = 0.3
      mkdir -p /hive/data/genomes/hg19/bed/ExAC/$release
      cd /hive/data/genomes/hg19/bed/ExAC/$release
      set sitesVcfGz = ExAC.r$release.sites.vep.vcf.gz
      wget ftp://ftp.broadinstitute.org/pub/ExAC_release/release$release/$sitesVcfGz
      wget ftp://ftp.broadinstitute.org/pub/ExAC_release/release$release/README.\*
      wget ftp://ftp.broadinstitute.org/pub/ExAC_release/release$release/exome_calling_regions.v1.interval_list
  
      # The VCF includes not only regular chromosomes, but also some alts; translate their
      # sequence names into hg19's.
      zcat $sitesVcfGz | head -200 | grep ^##contig \
      | perl -wpe 's/^.*ID=([\w.]+),length=(\d+).*/$2\t$1/ || die $_;' | sort > vcfSizeChrom
      tawk '{print $2, $1;}' ../../../chrom.sizes | sort > hg19SizeChrom
      join -o 1.2,2.2 vcfSizeChrom hg19SizeChrom > vcfToHg19
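    # vcfToHg19 maps each VCF contig name to its hg19 name, one pair per line,
    # e.g. "1 chr1" and "GL000202.1 chr11_gl000202_random".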
      # Any missing?
    join -a 1 -e DOH -o 1.2,2.2 vcfSizeChrom hg19SizeChrom | grep DOH
  #MT DOH
  #NC_007605 DOH
      # Unfortunately, they used the rCRS mitochondrion and we didn't so no chrM.
      # NC_007605 is Epstein-Barr virus.
      # Swap in as many hg19 seq names as we can:
      set sitesVcfUcsc = ExAC.r$release.sites.vep.hg19.vcf
      set sitesVcfGzUcsc = ExAC.r$release.sites.vep.hg19.vcf.gz
      zcat $sitesVcfGz \
      | perl -we 'open($V2H, "vcfToHg19") || die; \
          while (<$V2H>) { chomp; ($v, $h) = split; $v2h{$v} = $h; } \
          close($V2H); \
          while(<>) { \
            if (! /^#/) { \
              m/^([\w.]+)\t/ || die; \
              $chr = $v2h{$1}; \
              s/^([\w.]+)/$chr/ if ($chr); \
            } \
            print; \
          }' \
        > /data/tmp/$sitesVcfUcsc
      # Re-compress with bgzip and build tabix index (.tbi)
      pushd /data/tmp
      set tabixBin = /hive/data/outside/tabix/tabix-0.2.5/tabix-0.2.5
      $tabixBin/bgzip $sitesVcfUcsc
      $tabixBin/tabix -p vcf $sitesVcfGzUcsc
      popd
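    # (tabix requires BGZF compression as produced by bgzip, not plain gzip;
    # BGZF is still gzip-compatible, so zcat can read the result.)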
      cp -p /data/tmp/${sitesVcfGzUcsc}* . \
      && rm /data/tmp/${sitesVcfGzUcsc}*
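    # Optional spot check (a sketch, not part of the original run): the first
    # column of the renamed VCF should now use hg19 names for every contig that
    # has one (MT and NC_007605 keep their original names):
    zcat $sitesVcfGzUcsc | grep -v '^#' | cut -f1 | uniq | head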
  
      mkdir -p /gbdb/hg19/ExAC
      ln -s `pwd`/*.vcf.gz* /gbdb/hg19/ExAC/
      # skip this step: instead of making tiny sql table, use bigDataUrl in trackDb
  #    hgBbiDbLink hg19 exacVariants /gbdb/hg19/ExAC/$sitesVcfGzUcsc
  
      # Make a chromosomes line for trackDb:
      awk '{print $2}' vcfToHg19 | sort | xargs echo | sed -e 's/ /,/g'
  #chr1,chr10,chr11,chr11_gl000202_random,chr12,chr13,chr14,chr15,chr16,chr17,chr17_gl000203_random,chr17_gl000204_random,chr17_gl000205_random,chr17_gl000206_random,chr18,chr18_gl000207_random,chr19,chr19_gl000208_random,chr19_gl000209_random,chr1_gl000191_random,chr1_gl000192_random,chr2,chr20,chr21,chr21_gl000210_random,chr22,chr3,chr4,chr4_gl000193_random,chr4_gl000194_random,chr5,chr6,chr7,chr7_gl000195_random,chr8,chr8_gl000196_random,chr8_gl000197_random,chr9,chr9_gl000198_random,chr9_gl000199_random,chr9_gl000200_random,chr9_gl000201_random,chrUn_gl000211,chrUn_gl000212,chrUn_gl000213,chrUn_gl000214,chrUn_gl000215,chrUn_gl000216,chrUn_gl000217,chrUn_gl000218,chrUn_gl000219,chrUn_gl000220,chrUn_gl000221,chrUn_gl000222,chrUn_gl000223,chrUn_gl000224,chrUn_gl000225,chrUn_gl000226,chrUn_gl000227,chrUn_gl000228,chrUn_gl000229,chrUn_gl000230,chrUn_gl000231,chrUn_gl000232,chrUn_gl000233,chrUn_gl000234,chrUn_gl000235,chrUn_gl000236,chrUn_gl000237,chrUn_gl000238,chrUn_gl000239,chrUn_gl000240,chrUn_gl000241,chrUn_gl000242,chrUn_gl000243,chrUn_gl000244,chrUn_gl000245,chrUn_gl000246,chrUn_gl000247,chrUn_gl000248,chrUn_gl000249,chrX,chrY
  
      # How many SNVs & indels in the file?
      zcat $sitesVcfGz \
      | grep -v ^# \
      | perl -we 'while(<>) { \
                    @w = split; \
                    if (length($w[3]) != 1) { \
                      # if reference length is not 1, it is definitely an indel. \
                      $indels++; \
                    } else { \
                      # Check each alt allele \
                      $foundAlt = 0; \
                      @alts = split /,/, $w[4]; \
                      foreach $alt (@alts) { \
                        if (length($alt) != 1) { \
                          $foundAlt = 1; \
                          last; \
                        } \
                      } \
                      if ($foundAlt) { \
                        $indels++; \
                      } else { \
                        $snvs++; \
                      } \
                    } \
                  } \
                  print "\nsnvs: $snvs\nindels: $indels\n";'
  #snvs: 8754548
  #indels: 607770
      # Actually a slight decrease from r0.2.
  
      # Now make a bigBed6 track for the exome-calling regions.
      # Looks like a Picard interval file, so 1-based:
      # http://gatkforums.broadinstitute.org/discussion/1319/collected-faqs-about-interval-lists
      cut -f 1 exome_calling_regions.v1.interval_list | uniq
      # Header, regular chroms, MT which we'll have to strip.
      perl -wne \
        'next if (/^(@|MT)/); # skip header lines and chrM \
         chomp; \
         ($c, $start, $end, $strand, $name) = split("\t"); \
         $start--; \
         print join("\t", "chr$c", $start, $end, $name, 0, $strand) . "\n"; \
        ' exome_calling_regions.v1.interval_list \
        > exacCallingRegions.bed
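    # For example, a hypothetical 1-based interval-list row
    # "1<TAB>30366<TAB>30503<TAB>+<TAB>target_1" becomes the 0-based BED6 row
    # "chr1<TAB>30365<TAB>30503<TAB>target_1<TAB>0<TAB>+".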
      bedToBigBed exacCallingRegions.bed /hive/data/genomes/hg19/chrom.sizes \
        -type=bed6 -tab exacCallingRegions.bb
      ln -s `pwd`/exacCallingRegions.bb /gbdb/hg19/ExAC/
  
      # make a chromosomes line for trackDb:
      cut -f 1 exacCallingRegions.bed | uniq | xargs echo | sed -e 's/ /,/g'
  #chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY
  
  
  ##############################################################################
  ##############################################################################
  # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
  ##############################################################################
  
  ##############################################################################
  # GTEX Genotype-Tissue Expression
  #
  # Loading Jan 2014 (V4) data. (2015-Mar kate)
  #
  # Added scores to BED table. (2016-Mar kate)
  
# Data and sample info (metadata) loaded from http://www.gtexportal.org to
  # tables in hgFixed.  Methods here:
  # kent/src/hg/makeDb/doc/gtex/V4.txt
  
  # Load BED15ish table with tissue medians for each gene.  Coordinates are the
# UCSC known genes canonical transcript for the Ensembl gene reported by GTEx.
  
  cd /hive/data/genomes/hg19/bed
  mkdir gtex; cd gtex
  
# Map GENCODE to known genes for canonical transcript selection
  hgMapToGene hg19 wgEncodeGencodeCompV19 knownGene knownToGencodeV19
  
  hgsql hg19 -Ne 'select count(*) from knownToGencodeV19'
  # 75623
  # compared to 80184 for hg38 V20 GENCODE
  
  # reloaded 9/23/15 to repair tissue ordering problem
  
  # reload and add scores (2016-mar kate)
  #hgGtexGeneBed hg19 -gencodeVersion=V19 gtexGene >&! log.txt
  
  # GTEX gene models are based on GENCODE V18.  Need a knownToGencodeV18 table to make this work
  #hgGtexGeneBed hg19 -gencodeVersion=V18 gtexGene >&! log.v18.txt
  
  set gencode = V24lift37
  # use latest attribute annotation
  ~/kent/src/hg/makeDb/outside/hgGtexGeneBed/hgGtexGeneBed \
          hg19 -noLoad -gtexVersion=V4 -gencodeVersion=$gencode gtexGeneV4 >&! log.V4.txt
  
  # Max score: 178212.765625
  # 1094 not found
  
  wc -l gtexGeneV4.tab
  # 54899 gtexGeneV4.tab
  
  # add scores (see hg38/gtex.txt for background)
  
  set bedScore = ~/kent/src/utils/bedScore/bedScore
  $bedScore -col=10 -minScore=0 -log -method=encode gtexGeneV4.tab gtexGeneV4.bed
  
  # table looks OK, load it
  set lib = ~/kent/src/hg/lib
  hgLoadBed hg19 -noBin -tab -type=bed6+4 \
          -as=$lib/gtexGeneBed.as -sqlTable=$lib/gtexGeneBed.sql -renameSqlTable \
                  gtexGeneBedNewV4 gtexGeneV4.bed
  
  # Load exon data
  cd /hive/data/outside/GTEx/2014-01-17
  cd exonReads
  alias hgGtex ~kate/kent/src/hg/makeDb/outside/hgGtex/hgGtex
  hgGtex -exon -noLoad -tab=output gtex V4 \
      GTEx_Analysis_V4_RNA-seq_RNA-SeQCv1.1.8_exon_reads.txt \
      ../GTEx_Data_2014-01-17_Annotations_SampleAttributesDS.txt \
      ../GTEx_Data_2014-01-17_Annotations_SubjectPhenotypes_DS.txt \
      ../portal/gtexColorTissue.dec.tab
  
  cd output
  hgLoadSqlTab hgFixed gtexExonTissueMedian ~/kent/src/hg/lib/gtexTissueMedian.sql \
          gtexExonTissueMedian.tab
  
  alias hgGtexExonBed ~kate/kent/src/hg/makeDb/outside/hgGtexExonBed/hgGtexExonBed
  hgGtexExonBed hg19 gtex ../gencode/gencode.v18.genes.patched_contigs_exons.txt
  
  # rename  and split out whole genes (exon _0).  NOTE: these include a few on chrM (called chrMT)
  foreach f (*.tab)
      set r = $f:r
      set t = `echo $r | sed 's/gtexTissueExonMedian//'`
      grep -v '_0' $f | sed 's/chrMT/chrM/' > gtexTissueExonMedian$t.bed
      grep '_0' $f | sed 's/chrMT/chrM/' > gtexTissueExon0Median$t.bed
  end
  
  # add scores (100-1000)
mkdir -p scored
bedScore -method=reg -uniform -col=7 -minScore=100 gtexTissueExonMedian*.bed scored/
bedScore -method=reg -uniform -col=7 -minScore=100 gtexTissueExon0Median*.bed scored/
cd scored
  
  # load
  cat > load.csh << 'EOF'
  foreach f (gtexTissueExon0Median*.bed)
      set r = $f:r
      hgLoadBed hg19 $r $f -sqlTable=/cluster/home/kate/kent/src/hg/lib/bed6FloatVal.sql -renameSqlTable -tab
  end
  foreach f (gtexTissueExonMedian*.bed)
      set r = $f:r
      hgLoadBed hg19 $r $f -sqlTable=/cluster/home/kate/kent/src/hg/lib/bed6FloatVal.sql -renameSqlTable -tab
  end
  'EOF'
  
  csh load.csh >&! load.log &
  cd ..
  
  # redo trackDb
  
  hgGtexExonBed hg19 -trackDb -bright gtex ../gencode/gencode.v18.genes.patched_contigs_exons.txt
  
  #############################################################################
  # GTEx V6 (October 2015) Kate
  # Create BED from hgFixed tables (see doc/gtex)
  # Reload with scores (see hg38/gtex.txt)
  
  cd /hive/data/outside/gtex/V6
  
  # Load gene models (Gencode V19 transcript union from GTEx)
  gunzip gencode.v19.genes.patched_contigs.gtf.gz
  
  # NOTE FOR NEXT TIME: hg19 now has chrMT, so leave "chrMT" as-is instead of changing to chrM.
  # optional: liftOver from chrMT (NC_012920) coords to chrM (NC_001807).
  
  awk '$1 !~ /^#/ {print "chr"$0}' gencode.v19.genes.patched_contigs.gtf | sed 's/chrMT/chrM/' | \
          gtfToGenePred stdin gencodeV19.hg19.genePred -infoOut=gtexGeneModelInfoV6.tab
  hgLoadGenePred hg19 gtexGeneModelV6 gencodeV19.hg19.genePred
  
  # Get transcript for each gene
  #tail -n +2 gtexGeneModelInfoV6.tab | awk '{printf("%s\t%s\n", $1, $9)}' > gtexGeneTranscriptsV6.tab
  #hgLoadSqlTab hgFixed gtexTranscriptV6 ~/kent/src/hg/lib/gtexTranscript.sql gtexGeneTranscriptsV6.tab
  
  # Load BED table
  cd /hive/data/genomes/hg19/bed
  mkdir -p gtex
  cd gtex
  
  set gencode = V24lift37
# use latest attribute annotation
  ~/kent/src/hg/makeDb/outside/hgGtexGeneBed/hgGtexGeneBed \
          hg19 -noLoad -gtexVersion=V6 -gencodeVersion=$gencode gtexGeneV6 >&! log.txt
  #Max score: 219385.906250
  # 925 genes not found in GencodeAttrs table
  wc -l gtexGeneV6.tab
  # 55393 gtexGeneV6.tab
  
  # add scores (see hg38/gtex.txt for background)
  
  set bedScore = ~/kent/src/utils/bedScore/bedScore
  $bedScore -col=10 -minScore=0 -log -method=encode gtexGeneV6.tab gtexGeneV6.bed
  
  # table looks OK, load it
  set lib = ~/kent/src/hg/lib
  
  hgLoadBed hg19 -noBin -tab -type=bed6+4 \
          -as=$lib/gtexGeneBed.as -sqlTable=$lib/gtexGeneBed.sql -renameSqlTable \
                  gtexGeneBedNewV6 gtexGeneV6.bed
  
  
  #~/kent/src/hg/makeDb/outside/hgGtexGeneBed/hgGtexGeneBed \
          #hg19 -gtexVersion=V6 -gencodeVersion=V19 gtexGeneV6 >&! log.txt
  
  #############################################################################
  # 1000 GENOMES PHASE 3 VARIANT CALLS (DONE 10/23/15 angie - table fixed 3/1/16)
      # It was kind of a pain to get the Aspera command line client ascp this time around --
      # turns out it's included in their web browser plugin package, which I didn't think to
      # obtain because I need a command-line tool not a web browser plugin.  Anyway,
      # for VCF as opposed to BAM, ftp worked just fine.
      screen -S phase3 -t phase3
      mkdir -p /hive/data/genomes/hg19/bed/1000Genomes/phase3
      cd /hive/data/genomes/hg19/bed/1000Genomes/phase3
      set relDir = ftp://ftp-trace.ncbi.nlm.nih.gov/1000genomes/ftp/release/20130502
      wget $relDir/README\*
      # Integrated call panel files:
      wget $relDir/inte\*
      # File of related individuals who were removed just before release, leaving
      # behind 31 alt-count=0 variant calls:
      wget $relDir/2014\*
      # Now the big compressed VCFs!
      wget --timestamping $relDir/ALL.chr\*.phase3\*.genotypes.vcf.gz
      # Recompute tabix index files instead of downloading
      foreach f (ALL*.vcf.gz)
        echo $f
        tabix -p vcf $f
      end
      # Install the files
      mkdir /gbdb/hg19/1000Genomes/phase3
      ln -s `pwd`/*.vcf.gz* /gbdb/hg19/1000Genomes/phase3/
      cp /dev/null tgpPhase3.txt
      foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
        set file = ALL.chr$c.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
        if ($c == "X") then
          set file = ALL.chr$c.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz
        endif
        if ($c == "Y") then
          set file = ALL.chr$c.phase3_integrated_v1b.20130502.genotypes.vcf.gz
        endif
        echo "/gbdb/hg19/1000Genomes/phase3/$file\tchr$c" >> tgpPhase3.txt
      end
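    # Each line of tgpPhase3.txt pairs a file path with its sequence name, e.g.:
    # /gbdb/hg19/1000Genomes/phase3/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz<TAB>chr1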
      # hgBbiDbLink doesn't support the seq column so use hgLoadSqlTab:
      hgLoadSqlTab hg19 tgpPhase3 ~/kent/src/hg/lib/bbiChroms.sql tgpPhase3.txt
      # Make a chromosomes line for trackDb:
      hgsql hg19 -NBe 'select seqName from tgpPhase3' | xargs echo | sed -e 's/ /,/g'
  #chr1,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr2,chr20,chr21,chr22,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chrX,chrY
  
  
  ############################################################################
  # 1000 GENOMES PHASE 3 PAIRED-END ACCESSIBLE REGIONS (DONE 4/7/15 angie)
      screen -S acc -t acc
      mkdir /hive/data/genomes/hg19/bed/1000Genomes/phase3Mapability
      cd /hive/data/genomes/hg19/bed/1000Genomes/phase3Mapability
      set relDir = ftp://ftp-trace.ncbi.nlm.nih.gov/1000genomes/ftp/release/20130502/supporting/accessible_genome_masks/
      wget $relDir/20141020.pilot_mask.whole_genome.bed
      wget $relDir/20141020.strict_mask.whole_genome.bed
      # We don't use the 4th name column which has 'pilot' for every item in the pilot file
      # and 'strict' for every item in the strict file.
      foreach t (pilot strict)
        cut -f 1-3 20141020.${t}_mask.whole_genome.bed > tmp.bed
        bedToBigBed -type=bed3 tmp.bed \
          /hive/data/genomes/hg19/chrom.sizes \
          20141020.${t}_mask.whole_genome.bb
      end
      rm tmp.bed
      ln -s `pwd`/2014*.bb /gbdb/hg19/1000Genomes/phase3/
      # Use trackDb's bigDataUrl setting instead of hgBbiDbLink tiny table
      # Make a chromosomes line for trackDb:
      bigBedInfo -chroms /gbdb/hg19/1000Genomes/phase3/20141020.strict_mask.whole_genome.bb \
      | egrep '^[[:space:]]+chr' | awk '{print $1;}' | xargs echo | sed -e 's/ /,/g'
  #chr1,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr2,chr20,chr21,chr22,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chrX,chrY
  
  
  ############################################################################
  # LASTZ human/hg19 vs. rat/rn6 - (DONE - 2015-06-08 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzRn6.2015-06-08
      cd /hive/data/genomes/hg19/bed/lastzRn6.2015-06-08
  
      cat << '_EOF_' > DEF
  # human vs rat
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
  BLASTZ_O=400
  BLASTZ_E=30
  
  # TARGET: human hg19
  SEQ1_DIR=/hive/data/genomes/hg19/hg19.2bit
  SEQ1_LEN=/hive/data/genomes/hg19/chrom.sizes
  SEQ1_CHUNK=40000000
  SEQ1_LIMIT=10
  SEQ1_LAP=10000
  
  # QUERY: rat rn6
  SEQ2_DIR=/hive/data/genomes/rn6/rn6.2bit
  SEQ2_LEN=/hive/data/genomes/rn6/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=10
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzRn6.2015-06-08
  TMPDIR=/dev/shm
  '_EOF_'
      # << happy emacs
  
      time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
          -chainMinScore=3000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
              -syntenicNet) > do.log 2>&1
      # real    193m46.956s
  
      cat fb.hg19.chainRn6Link.txt
      # 924289029 bases of 2897316137 (31.902%) in intersection
  
      time (doRecipBest.pl -buildDir=`pwd` hg19 rn6) > rbest.log 2>&1 &
      # real    32m6.635s
  
      # and for the swap:
      mkdir /hive/data/genomes/rn6/bed/blastz.hg19.swap
      cd /hive/data/genomes/rn6/bed/blastz.hg19.swap
  
      time (doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzRn6.2015-06-08/DEF \
          -swap -chainMinScore=3000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
              -syntenicNet) > swap.log 2>&1
      #  real    77m36.480s
  
      cat fb.rn6.chainHg19Link.txt
      # 949880616 bases of 2729860805 (34.796%) in intersection
  
      time (doRecipBest.pl -buildDir=`pwd` rn6 hg19) > rbest.log 2>&1
      # real    32m19.569s
  
  #########################################################################
  # GWIPS-viz Ribo-seq - (DONE - 2014-12-16 - Steve)
  # contact Audrey Michel (audreymannion@gmail.com)
  # redmine #14066
  
  obtained bigWig file from shared Google drive
  https://drive.google.com/a/soe.ucsc.edu/folderview?id=0B_xvV_5tXzOGQ1h5NEh4bnhNTDg&usp=sharing_eid
  
  cp Riboseq_track.bw /gbdb/hg19/bbi/gwipsvizRiboseq.bw
  
  added entry to /trackDb/human/trackDb.ra:
  
  track gwipsvizRiboseq
  type bigWig
  shortLabel GWIPS-viz Riboseq
  longLabel Ribosome Profiling from GWIPS-viz
  group expression
  visibility hide
  maxHeightPixels 100:32:8
  viewLimits 0:2000
  autoScale off
  html gwipsvizRiboseq
  
  
  #########################################################################
  # altLocations (DONE - 2015-07-10 - Galt)
  #
  # I used hapRegions instead of altSeqHaplotype because it has fewer records
  # and also the corrected chromStart (-1).
  # I am trying to have a standard name for the table, used in all assemblies with alternate haplotypes (not in patches).
  # hgTracks and hgc expect the table to have this name.
  
  hgsql hg19
  create table altLocations as (select bin,chrom,chromStart,chromEnd,name from hapRegions);
  
  #I also created this in top-level trackDb.ra:
  
  track altLocations
  type bed 4
  group map
  color 32,32,190
  shortLabel Alt Haplotypes
  longLabel Alternate Haplotypes to Reference Sequence Correspondence
  url /cgi-bin/hgTracks?position=$$
  urlLabel Corresponding position:
  
  
  ##############################################################################
  # DBSNP B144 / SNP144 (DONE 9/9/15)
      # Originally done 7/16/15
      # Updated 9/3/15 with additional alt mappings from dbSNP, see below
      # Redmine #15493
      mkdir -p /hive/data/outside/dbSNP/144/human_hg19
      cd /hive/data/outside/dbSNP/144/human_hg19
      # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/
      # to find the subdir name to use as orgDir below (human_9606_b144_GRCh37p13 in this case).
      # Go to that subdirectory, then to database/organism_data/ and look for files
      # whose names start with b144_* and may or may not end with a suffix that identifies
      # the build assembly version or some annotation version.  If there is a suffix shared
      # by all b144_* files, add that to config.ra as the "buildAssembly".
      cat > config.ra <<EOF
  db hg19
  orgDir human_9606_b144_GRCh37p13
  build 144
  buildAssembly 105
  liftUp /hive/data/genomes/hg19/jkStuff/liftContigs.lft
  refAssemblyLabel GRCh37.p13
  EOF
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >>& do.log & tail -f do.log
  
      # It fails saying there are sequences it won't be able to lift because the
      # liftUp file, jkStuff/liftContigs.lft, doesn't cover all of the contigs in
      # b144_SNPContigLoc_GRCh37p13.  So combine it with the auto-generated suggested.lft
      # (but don't use the suggested mitochondrial mapping, deal with that separately
    # because hg19 uses the non-standard NC_001807 chrM sequence):
      grep -v NC_012920 suggested.lft \
      | sort -u -k4,4 -k1n,1n /hive/data/genomes/hg19/jkStuff/liftContigs.lft - \
        > liftUp.lft
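    # (A .lft lift file has five columns: offset, oldName, oldSize, newName,
    # newSize; the sort keys on newName (-k4,4) and then numerically on offset
    # (-k1n,1n), so -u keeps one entry per contig placement.)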
  
      # Change the liftUp setting in config.ra
      cat > config.ra <<EOF
  db hg19
  orgDir human_9606_b144_GRCh37p13
  build 144
  buildAssembly 105
  liftUp liftUp.lft
  refAssemblyLabel GRCh37.p13
  EOF
  
      # It also gives a list of unliftable contigs -- look at cantLiftUpSeqNames.txt
      # to find patch contigs, then start over, ignoring those:
      grep patch cantLiftUpSeqNames.txt | cut -f 2 > patchContigs.txt
      cat >> config.ra <<EOF
  ignoreDbSnpContigsFile patchContigs.txt
  EOF
  
      # Continue at loadDbSnp.
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp \
        >>& do.log & tail -f do.log
  
    # While that's running, compare rs IDs in rs_fasta with b144_SNPContigLoc_105 to see
      # which IDs (if any) were omitted from the rs_fasta dump.
      zcat rs_fasta/rs*.fas.gz \
      | perl -wne 'if (/^>/) { s/^>gnl\|dbSNP\|(rs\d+) .*/$1/ || die; print; }' \
      | sort -u > rsFastaIds.txt
    zcat data/b144_SNPContigLoc_105.bcp.gz \
      | awk '{print "rs" $2;}' \
      | sort -u > contigLocIds.txt
      comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
      wc -l missingFromRsFasta.txt
  #7 missingFromRsFasta.txt
      cat missingFromRsFasta.txt
  #rs730882053
  #rs730882172
  #rs730882173
  #rs730882174
  #rs730882175
  #rs730882176
  #rs730882177
      # All 7 of those were deleted because of withdrawn ss's, so no need to fetch fasta
      # for those.
      # For future reference, here's how to do it if necessary next time:
      # Fetch fasta for those ids from dbSNP's batch_query.
      # Go to this URL:
      #   http://www.ncbi.nlm.nih.gov/projects/SNP/dbSNP.cgi?list=rsfile
      # Enter your email address, select FASTA as output format, and upload missingFromRsFasta.txt
      # Wait for email with link to results -- don't close that tab! -- then download the results:
      # wget -O rs_fasta/rs_chBatchQuery.fas.gz $fileFromDbSnpEmail
  
      # Check on the progress of the script.  If we did get sequence from dbSNP, and if the
      # script has progressed past the loadDbSnp step and into the addToDbSnp step, then kill
      # it and continue from addToDbSnp to pick up the added sequence.
      # This time it was still in the loadDbSnp step, so no need to kill it.
  # *** All done!
  
      # 9/3/2015: Sarah Hunt at EBI alerted dbSNP that there seemed to be a lot
      # fewer mappings to alt sequences than in 142; dbSNP made new download files
      # that seem to be an updated b144_SNPContigLoc trimmed to contain only the
      # rows on alt sequences.
      cd /hive/data/outside/dbSNP/144/human_hg19/data
      wget ftp://ftp.ncbi.nih.gov/snp/temp/EBI/b144_alt/b144_SNPContigLoc_alt_only_105.bcp.gz
      gunzip b144_SNPContigLoc_alt_only_105.bcp.gz
      gunzip b144_SNPContigLoc_105.bcp.gz
      # Check sorting so we can merge-sort (quicker):
      sort -c -k 2n,2n -k3n,3n -k4n,4n -k5n,5n b144_SNPContigLoc_105.bcp
      # No output, good.
      sort -c -k 2n,2n -k3n,3n -k4n,4n -k5n,5n b144_SNPContigLoc_alt_only_105.bcp
  #sort: b144_SNPContigLoc_alt_only_105.bcp:201: disorder: rs      1707    224515579       1098938 1098938                 1098937 1098939 2               144     2015-04-14 09:01:00.0   128     0       T               1.0
      # Sort the new file, fortunately much smaller than the old file:
      sort -k 2n,2n -k3n,3n -k4n,4n -k5n,5n b144_SNPContigLoc_alt_only_105.bcp \
        > b144_SNPContigLoc_alt_only_105_sorted.bcp
      # Merge-sort the files, discarding lines with the same values in columns 2-5:
      sort -m -u -k 2n,2n -k3n,3n -k4n,4n -k5n,5n b144_SNPContigLoc_105.bcp \
        b144_SNPContigLoc_alt_only_105_sorted.bcp \
        > b144_SNPContigLoc_105_merged.bcp
      wc -l b144_SNPContigLoc_105.bcp b144_SNPContigLoc_alt_only_105_sorted.bcp \
        b144_SNPContigLoc_105_merged.bcp
  #  153282211 b144_SNPContigLoc_105.bcp
  #    1831401 b144_SNPContigLoc_alt_only_105_sorted.bcp
  #  153654053 b144_SNPContigLoc_105_merged.bcp
      # How many of the 1831401 alt mappings were already in b144_SNPContigLoc_105.bcp?
      expr 153282211 + 1831401 - 153654053
  #1459559
      # How many new mappings were added?
      expr 153654053 - 153282211
  #371842
      # Make sure they're really unique in columns 2-5:
      cut -f 2-5 b144_SNPContigLoc_105_merged.bcp | sort -k1n,1n -k2n,2n -k3n,3n -k4n,4n > 1
      uniq 1 > 2
      cmp 1 2
      rm 1 2
      # Install the merged file in place of the original b144_SNPContigLoc_105.bcp.gz
      gzip b144_SNPContigLoc_105_merged.bcp &
      mv b144_SNPContigLoc_105{,_orig}.bcp
      gzip b144_SNPContigLoc_105_orig.bcp &
      ln -s b144_SNPContigLoc_105_merged.bcp.gz b144_SNPContigLoc_105.bcp.gz
      rm b144_SNPContigLoc_alt_only_105_sorted.bcp
      # Run the pipeline again when the gzip of b144_SNPContigLoc_105_merged.bcp is complete.
      cd /hive/data/outside/dbSNP/144/human_hg19
      hgsql hg19 -NBe 'select chrom, count(*) from snp144 group by chrom order by chrom' \
        > chromCounts.orig
      mkdir origFiles
      mv snp144*.{bed,tab}*.gz do.log origFiles/
      hgsql '' -e 'drop database hg19snp144'
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log &
      tail -f do.log
      # While that's reloading the database, see if we need to fetch more fasta:
      zcat data/b144_SNPContigLoc_105.bcp.gz \
      | awk '{print "rs" $2;}' \
      | sort -u > contigLocIds.txt
      comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
      wc -l missingFromRsFasta.txt
  #34 missingFromRsFasta.txt
      # It was only 7 before, so let's get sequence from dbSNP again.
      #   http://www.ncbi.nlm.nih.gov/projects/SNP/dbSNP.cgi?list=rsfile
      # Enter your email address, select FASTA as output format, and upload missingFromRsFasta.txt
      # Wait for email with link to results -- don't close that tab!
      # Excerpt from email:
  #dbSNP BATCH QUERY REPORT: FASTA
  #============================================================
  #Total number of Submitted ID:           34
  #============================================================
  #Total number of ID processed:           34
  #============================================================
  #These rs have been merged (submitted_rs -> new_rs)
  #________________________________________________
      # Download the gzipped fasta file and move/rename it to rs_chBatchQuery.fas.gz.
      faSize rs_chBatchQuery.fas.gz
  #4134 bases (34 N's 4100 real 4100 upper 0 lower) in 34 sequences in 1 files
      # Good -- unlike hg38, they all came through -- even the 7 that were deleted before.
      # There was no pre-existing rs_chBatchQuery.fas.gz so move this one into rs_fasta/.
      mv rs_chBatchQuery.fas.gz rs_fasta/
      tail -f do.log
      hgsql hg19 -NBe 'select chrom, count(*) from snp144 group by chrom order by chrom' \
        > chromCounts.merged
      sdiff chromCounts.{orig,merged} | less
      # The _hap* sequences have more mappings now, good.
  
  
  ##############################################################################
  # SNP144 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 9/9/15 angie)
      # Originally done 7/16/15
      # Redmine #15493
      screen -S ortho -t ortho
      ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg19 144 -debug
  # *** Steps were performed in /hive/data/genomes/hg19/bed/snp144Ortho.2015-09-09
      cd /hive/data/genomes/hg19/bed/snp144Ortho.2015-09-09
      ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg19 144 \
        >>& do.log & tail -f do.log
  # *** All done!
  
  
  ##############################################################################
  # SNPMASKED SEQUENCE FOR SNP144 (DONE 9/9/15 angie)
      # Originally done 7/16/15
      # Redmine #15493
      screen -S mask -t mask
      ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg19 144 -debug
  # *** Steps were performed in /hive/data/genomes/hg19/snp144Mask.2015-09-09
      cd /hive/data/genomes/hg19/snp144Mask.2015-09-09
      ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg19 144 \
        >>& do.log & tail -f do.log
  # *** All done!
  
  
  ##############################################################################
  # mafSnp view for 100 way
  #
  
  cd /cluster/data/hg19/bed/mafSnp
  cp /cluster/data/hg19/bed/multiz100way/species.list species.lst
  echo "select * from ensGene" | hgsql hg19 | tail -n +2 | cut -f 2-16 |   genePredSingleCover stdin stdout > ensSingle.gp
  hgLoadGenePred -genePredExt hg19 ensSingle ensSingle.gp
  mkdir syn nonsyn
  sort -nk 2 /cluster/data/hg19/chrom.sizes | awk '{print $1}' | while read chrom
  do
  echo /cluster/home/braney/bin/x86_64/mafGene -exons -chrom=$chrom hg19 multiz100way ensSingle species.lst nonsyn/$chrom.nonsyn.faa
  echo /cluster/home/braney/bin/x86_64/mafGene -uniqAA -exons -chrom=$chrom hg19 multiz100way ensSingle species.lst syn/$chrom.syn.faa
  done > jobs
  
  sh -x jobs
  
  cat syn/*.faa | /cluster/home/braney/bin/x86_64/paSNP species.lst stdin stdout | sed 's/:/ /' | sed 's/-/ /' | awk '{print $1, $2-1, $3, $4, 1819, "+", $2-1, $3, "0,255,0", 1, $3 - ($2 - 1), 0}' > syn.bed
  cat nonsyn/*.faa | /cluster/home/braney/bin/x86_64/paSNP species.lst stdin stdout | sed 's/:/ /' | sed 's/-/ /' | awk '{print $1, $2-1, $3, $4, 1583, "+", $2-1, $3, "255,0,0", 1, $3-($2 - 1), 0}' > nonsyn.bed
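# (paSNP output, as reconstructed from the pipeline above, looks like
# "chrom:start-end species ..." with 1-based starts; the seds split the
# position token into separate fields and awk shifts the start to 0-based,
# emitting BED12 with synonymous sites green (0,255,0) and nonsynonymous
# sites red (255,0,0), species name in the BED name field.)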
  rm output.bed
  for i in `cat species.lst`
  do
  echo $i
  grep -wh "$i" nonsyn.bed syn.bed | sort -k 1 -T . | bedSmash stdin chrom.sizes stdout >>  output.bed
  done
  
  awk '{print $1,$2,$3,$4,$5}' output.bed > load.bed
  
  out=mafSnp100way
  hgLoadBed hg19 $out load.bed
  
  ##############################################################################
# LASTZ Zebrafish DanRer10 (DONE - 2015-09-18 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzDanRer10.2015-09-18
      cd /hive/data/genomes/hg19/bed/lastzDanRer10.2015-09-18
  
      printf "%s\n" \
  '# human vs zebrafish
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
  BLASTZ_H=2000
  BLASTZ_Y=3400
  BLASTZ_L=6000
  BLASTZ_K=2200
  BLASTZ_Q=/scratch/data/blastz/HoxD55.q
  
  # TARGET: Human hg19
  SEQ1_DIR=/scratch/data/hg19/nib
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=80
  
  # QUERY: zebrafish danRer10
  SEQ2_DIR=/hive/data/genomes/danRer10/danRer10.2bit
  SEQ2_LEN=/hive/data/genomes/danRer10/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=100
  
  BASE=/hive/data/genomes/hg19/bed/lastzDanRer10.2015-09-18
  TMPDIR=/dev/shm' > DEF
  
      #	establish a screen to control this job
      screen
      time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
  	-chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > do.log 2>&1
      #	real    2282m56.714s
      # chaining hg19 manually (ran out of memory on ku nodes):
      #  real    166m31.444s
      # -rw-rw-r-- 1 991457195 Sep 24 14:55 chr19.nib:chr19:.chain
      # continuing:
      time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
  	-continue=chainMerge -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) \
        > chainMerge.log 2>&1
      # real    43m9.651s
  
      cat fb.hg19.chainDanRer10Link.txt
      #	83602869 bases of 2897316137 (2.886%) in intersection
  
      time (doRecipBest.pl -buildDir=`pwd` hg19 danRer10) > rbest.log 2>&1 &
      # real    7m40.903s
  
      #	running the swap
      mkdir /hive/data/genomes/danRer10/bed/blastz.hg19.swap
      cd /hive/data/genomes/danRer10/bed/blastz.hg19.swap
      time (doBlastzChainNet.pl -verbose=2 \
  	/hive/data/genomes/hg19/bed/lastzDanRer10.2015-09-18/DEF \
  	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
  	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
  	-swap) > swap.log 2>&1
      #	real    43m15.435s
  
      cat fb.danRer10.chainHg19Link.txt
      #	83856287 bases of 1369683683 (6.122%) in intersection
  
      time (doRecipBest.pl -buildDir=`pwd` danRer10 hg19) > rbest.log 2>&1
      # real    8m8.619s
  
  ##############################################################################
  # DBSNP B146 / SNP146 (DONE 3/11/16 angie)
      # Redmine #16777
      mkdir -p /hive/data/outside/dbSNP/146/human_hg19
      cd /hive/data/outside/dbSNP/146/human_hg19
      # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/
      # to find the subdir name to use as orgDir below (human_9606_b146_GRCh37p13 in this case).
      # Go to that subdirectory, then to database/organism_data/ and look for files
      # whose names start with b146_* and may or may not end with a suffix that identifies
      # the build assembly version or some annotation version.  If there is a suffix shared
      # by all b146_* files, add that to config.ra as the "buildAssembly".
      # Since this build is on GRCh37.p13 like b144 above, use the liftUp.lft file
      # and ignoreDbSnpContigsFile constructed for b144.
      cat > config.ra <<EOF
  db hg19
  orgDir human_9606_b146_GRCh37p13
  build 146
  buildAssembly 105
  liftUp ../../144/human_hg19/liftUp.lft
  refAssemblyLabel GRCh37.p13
  ignoreDbSnpContigsFile ../../144/human_hg19/patchContigs.txt
  EOF
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >>& do.log & tail -f do.log
  
      # While that's running, compare rs IDs in rs_fasta with b146_SNPContigLoc_105 to see
      # which IDs (if any) were omitted from the rs_fasta dump.
      zcat rs_fasta/rs*.fas.gz \
      | perl -wne 'if (/^>/) { s/^>gnl\|dbSNP\|(rs\d+) .*/$1/ || die; print; }' \
      | sort -u > rsFastaIds.txt
      zcat data/b146_SNPContigLoc_105.bcp.gz \
      | awk '{print "rs" $2;}' \
      | sort -u > contigLocIds.txt
      comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
      wc -l missingFromRsFasta.txt
  #2 missingFromRsFasta.txt
      cat missingFromRsFasta.txt
  #rs796052131
  #rs796052132
      # Both of those get "invalid snp_id value", so no need to fetch fasta for them.
      # For future reference, here's how to do it if necessary next time:
      # Fetch fasta for those ids from dbSNP's batch_query.
      # Go to this URL:
      #   http://www.ncbi.nlm.nih.gov/projects/SNP/dbSNP.cgi?list=rsfile
      # Enter your email address, select FASTA as output format, and upload missingFromRsFasta.txt
      # Wait for email with link to results -- don't close that tab! -- then download the results:
      # wget -O rs_fasta/rs_chBatchQuery.fas.gz $fileFromDbSnpEmail
  
      # Check on the progress of the script.  If we did get sequence from dbSNP, and if the
      # script has progressed past the loadDbSnp step and into the addToDbSnp step, then kill
      # it and continue from addToDbSnp to pick up the added sequence.
      # This time it was still in the loadDbSnp step, so no need to kill it.
  
      # 3/10/16 - I fixed a bug in doDbSnp.pl that caused a few thousand alt-only SNPs
      # to be lost due to wrongly ignoring the rs_fasta/rs_chAltOnly.fas.gz file, and
      # re-ran from the first step using rs_fasta data onward.
      # I edited addToDbSnp.csh to wrap an "if (0) then ... endif" around all steps except the
      # last step that creates the ucscGnl table from rs_fasta headers.
      mkdir -p `cat workingDir`
      csh -efx addToDbSnp.csh >>& do.log & tail -f do.log
      # When that completed, I continued running from the bigJoin step.
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue bigJoin >>& do.log &
      tail -f do.log
  # *** All done!
  
  
  ##############################################################################
# SNP146 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 3/11/16 angie)
      # Redmine #16777
      screen -S ortho -t ortho
      ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg19 146 -debug
  # *** Steps were performed in /hive/data/genomes/hg19/bed/snp146Ortho.2016-03-11
      cd /hive/data/genomes/hg19/bed/snp146Ortho.2016-03-11
      ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg19 146 \
        >>& do.log & tail -f do.log
  # *** All done!
  
  
  ##############################################################################
  # SNPMASKED SEQUENCE FOR SNP146 (DONE 3/11/16 angie)
      # Redmine #16777
      screen -S mask -t mask
      ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg19 146 -debug
  # *** Steps were performed in /hive/data/genomes/hg19/snp146Mask.2016-03-11
      cd /hive/data/genomes/hg19/snp146Mask.2016-03-11
      ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg19 146 \
        >>& do.log & tail -f do.log
  # *** All done!
  
  
  ##############################################################################
  # Patents (26 Feb 2016, Max)
  
  # first prep the meta data
  cd /hive/data/genomes/hg19/bed/patents/data/
  sort sequence_details.csv -S10G -k7,7 -t$'\t' --parallel=20 > sequence_details.csv.s7
sort patent_details.csv -S10G -k1b,1 -t$'\t' --parallel=20 > patent_details.csv.s1
join -1 7 -2 1 sequence_details.csv.s7 patent_details.csv.s1 -t $'\t' -o '1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.2 2.3 2.4 2.5' > seqAndPatentJoined.tab
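# (The join matches column 7 of sequence_details.csv.s7 -- presumably the
# patent document ID -- against column 1 of patent_details.csv.s1, keeping the
# seven sequence columns plus four patent metadata columns per output line.)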
  sort --parallel=40 seqAndPatentJoined.tab -k1,1 -S10G > seqAndPatentJoined.s1.tab
  
  # create list of high-throughput patents with >100 sequences
  # list is cross-organism!
  cat seqAndPatentJoined.s1.tab | cut -f1,8 | cut -f2 | tabUniq -rs > patentSeqCounts.tab
  less patentSeqCounts.tab | tawk '($2>=100)' | cut -f1 > htPatents.txt
  
  # now summarize the documents to one line per sequence
  # takes two hours
  time patSeqSummarizeDocs seqAndPatentJoined.s1.tab > seqAndPatentSummary.tab
  
  # now convert the SAM files
  cd /hive/data/genomes/hg19/bed/patents/data/
  samtools view -S -t ensGenomeHg19/Homo_sapiens.GRCh37.75.dna.toplevel.fa.fai Homo_sapiens.GRCh37.75.s90c50.sam -h | grep -v ^MT > hg19.sam
  # convert to bed
  function sam2psl_pierre() { java -Dfile.encoding=UTF8 -Xmx500m    -cp "/cluster/bin/jvarkit/htsjdk-1.133/dist/commons-jexl-2.1.1.jar:/cluster/bin/jvarkit/htsjdk-1.133/dist/commons-logging-1.1.1.jar:/cluster/bin/jvarkit/htsjdk-1.133/dist/htsjdk-1.133.jar:/cluster/bin/jvarkit/htsjdk-1.133/dist/snappy-java-1.0.3-rc3.jar:/cluster/bin/jvarkit/dist-1.133/sam2psl.jar" com.github.lindenb.jvarkit.tools.misc.SamToPsl $*; }
  sam2psl_pierre hg19.sam 2> /dev/null > hg19.psl
  pslToBed hg19.psl hg19.bed
  # careful: this text includes tab characters
  sed -ri 's/_(16|0)	/	/g' hg19.bed
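# GNU sed also understands \t escapes, so an equivalent command without
# literal tab characters would be: sed -ri 's/_(16|0)\t/\t/g' hg19.bed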
  # sort by name
  # The -S10G parameter is only supported in newer sort versions
  # if it complains, just remove it. It will just take longer.
  time sort -k4,4 -S10G --parallel=20 hg19.bed > hg19.s4.bed
  
  # convert the hg19 bed to bigBed
  cd /hive/data/genomes/hg19/bed/patents/hg19
  join -t $'\t' -1 4 -2 1 ../data/hg19.s4.bed ../data/seqAndPatentSummary.tab -o '1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 1.12 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 2.10 2.11 2.12' | patSeqFilterBulkAndAnnotate ../data/htPatents.txt patBulk.bed patNonBulk.bed -c ../data/seqCounts.tab
  bedSort patNonBulk.bed patNonBulk.bed
  bedSort patBulk.bed patBulk.bed
  bedToBigBed patNonBulk.bed /cluster/data/genomes/hg19/chrom.sizes patNonBulk.bb -tab -as=../patSummary.as -type=bed12+
  bedToBigBed patBulk.bed /cluster/data/genomes/hg19/chrom.sizes patBulk.bb -tab -as=../patSummary.as -type=bed12+
  hgBbiDbLink hg19 patBulk /gbdb/hg19/bbi/patBulk.bb
  hgBbiDbLink hg19 patNonBulk /gbdb/hg19/bbi/patNonBulk.bb
  
  #########################################################################
# killer whale/orcOrc1 Lastz run  (DONE - 2016-06-03 - Hiram)
      # note: incorrect date on this directory name, should be 2016-06-03
  
      mkdir /hive/data/genomes/hg19/bed/lastzOrcOrc1.2016-07-03
      cd  /hive/data/genomes/hg19/bed/lastzOrcOrc1.2016-07-03
      printf '# human vs killer whale
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
  BLASTZ_M=254
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  
  # QUERY: killer whale OrcOrc1
  SEQ2_DIR=/hive/data/genomes/orcOrc1/orcOrc1.2bit
  SEQ2_LEN=/hive/data/genomes/orcOrc1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=20
  
  BASE=/hive/data/genomes/hg19/bed/lastzOrcOrc1.2016-07-03
  TMPDIR=/dev/shm
  ' > DEF
  
      time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet) > do.log 2>&1
      # real    747m9.241s
  
      cat fb.hg19.chainOrcOrc1Link.txt
      #  1507066837 bases of 2897316137 (52.016%) in intersection
  
      time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` hg19 orcOrc1) \
        > rbest.log 2>&1 &
      # real    517m17.718s
  
      # running the swap
      mkdir /hive/data/genomes/orcOrc1/bed/blastz.hg19.swap
      cd /hive/data/genomes/orcOrc1/bed/blastz.hg19.swap
      time (doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzOrcOrc1.2016-07-03/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet) > swap.log 2>&1
      #   real    121m53.369s
  
      cat fb.orcOrc1.chainHg19Link.txt
      #    1445188424 bases of 2249582125 (64.243%) in intersection
  
      time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` orcOrc1 hg19) \
        > rbest.log 2>&1 &
      #   real    463m9.924s
  
  
  #########################################################################
  # DBSNP B147 / SNP147 (DONE 7/29/16 angie)
  # Initially done 7/1/16.  Updated 7/28/16 to fix func code bug (see below).
      # Redmine #17209
      mkdir -p /hive/data/outside/dbSNP/147/human_hg19
      cd /hive/data/outside/dbSNP/147/human_hg19
      # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/
      # to find the subdir name to use as orgDir below (human_9606_b147_GRCh37p13 in this case).
      # Go to that subdirectory, then to database/organism_data/ and look for files
      # whose names start with b147_* and may or may not end with a suffix that identifies
      # the build assembly version or some annotation version.  If there is a suffix shared
      # by all b147_* files, add that to config.ra as the "buildAssembly".
      # Since this build is on GRCh37.p13 like b144 above, use the liftUp.lft file
      # and ignoreDbSnpContigsFile constructed for b144.
      cat > config.ra <<EOF
  db hg19
  orgDir human_9606_b147_GRCh37p13
  build 147
  buildAssembly 105
  liftUp ../../144/human_hg19/liftUp.lft
  refAssemblyLabel GRCh37.p13
  ignoreDbSnpContigsFile ../../144/human_hg19/patchContigs.txt
  EOF
      # dbSNP dumped only GRCh37-specific files, not the whole complement of files
      # that we usually download.  So make some links to hg38 download files,
      # run doDbSnp.pl -debug to make scripts, run a subset of the commands in the
      # download & loadDbSnp steps instead of running those steps as usual.
      rmdir rs_fasta
      ln -s ../human_hg38/rs_fasta .
      rmdir schema
      ln -s ../human_hg38/schema .
      cd data
      ln -s ../../human_hg38/data/[A-Z]*.bcp.gz .
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -debug
  
      # Adapted from ../download_human_hg19_147.csh:
      set orgDir = human_9606_b147_GRCh37p13
      set ftpOrgData = ftp://ftp.ncbi.nih.gov/snp/organisms/$orgDir/database/organism_data
      wget --timestamping --no-verbose $ftpOrgData/b147_SNPContigLoc_105.bcp.gz
      wget --timestamping --no-verbose $ftpOrgData/b147_SNPContigLocusId_105.bcp.gz
      wget --timestamping --no-verbose $ftpOrgData/b147_ContigInfo_105.bcp.gz
      wget --timestamping --no-verbose $ftpOrgData/b147_SNPMapInfo_105.bcp.gz
      cd ..
  
      # Adapted from loadDbSnp.csh, but just copying non-GRCh37 tables from hg38snp147:
      set tmpDir = `mktemp -d $TMPDIR/doDbSnp.pl.translate.XXXXXX`
      chmod 775 $tmpDir
      cd $tmpDir
      echo $tmpDir > /hive/data/outside/dbSNP/147/human_hg19/workingDir
      hgsql -e 'create database hg19snp147'
      # Ugh, the schema has _107 for GRCh38 but GRCh37 is 105, so substitute
      sed -e 's/_107/_105/' /hive/data/outside/dbSNP/147/human_hg19/schema/table.sql \
      | hgsql hg19snp147
      # Get these empty tables out of the way -- copy from hg38snp147 using create view:
      foreach table (Allele Batch SNP SNPAlleleFreq SNPAlleleFreq_TGP SNPSubSNPLink \
                     SNP_bitfield SubSNP)
        hgsql hg19snp147 -e "drop table if exists $table;"
        hgsql hg19snp147 -e "create view $table as select * from hg38snp147.$table;"
      end
      foreach t (b147_ContigInfo_105 b147_SNPContigLocusId_105 b147_SNPMapInfo_105)
        zcat /hive/data/outside/dbSNP/147/human_hg19/data/$t.bcp.gz  | grep -vwFf '/hive/data/outside/dbSNP/147/human_hg19/../../144/human_hg19/patchContigs.txt'\
        | perl -wpe 's/(\d\d:\d\d:\d\d)\.\d+/$1/g; s/\t(\t|\n)/\t\\N$1/g; s/\t(\t|\n)/\t\\N$1/g;' \
          > tmp.tab
        hgLoadSqlTab -oldTable hg19snp147 $t placeholder tmp.tab
        rm tmp.tab
      end
      hgsql hg19snp147 -e \
        'alter table b147_ContigInfo_105 add index (ctg_id); \
         alter table b147_SNPContigLocusId_105 add index (ctg_id); \
         alter table b147_SNPMapInfo_105 add index (snp_id);'
      # Make sure there are no orient != 0 contigs among those selected.
      set badCount = `hgsql hg19snp147 -NBe \
                        'select count(*) from b147_ContigInfo_105 where orient != 0;'`
      if ($badCount > 0) then
        echo "found $badCount contigs in b147_ContigInfo_105 with orient != 0"
      endif
  #found 695 contigs in b147_ContigInfo_105 with orient != 0
      # Uh-oh -- looks like some HuRef and other assembly contigs weren't grepped out...
      hgsql hg19snp147 -NBe 'select count(*) from b147_ContigInfo_105 where orient != 0 and group_label = "HuRef";'
  #695
      # So delete those manually and continue.
      hgsql hg19snp147 -NBe 'delete from b147_ContigInfo_105 where group_label != "GRCh37.p13";'
      # OK, now back to stuff from loadDbSnp.csh:
  
      # b147_SNPContigLoc_105 is huge, and we want only the reference contig mappings.
      # Keep lines only if they have a word match to some reference contig ID.
      # That allows some false positives from coord matches; clean those up afterward.
      hgsql hg19snp147 -NBe 'select ctg_id from b147_ContigInfo_105' \
      | sort -n > b147_ContigInfo_105.ctg_id.txt
      zcat /hive/data/outside/dbSNP/147/human_hg19/data/b147_SNPContigLoc_105.bcp.gz \
      | grep -Fwf b147_ContigInfo_105.ctg_id.txt \
      | perl -wpe 's/(\d\d:\d\d:\d\d)\.\d+/$1/g; s/\t(\t|\n)/\t\\N$1/g; s/\t(\t|\n)/\t\\N$1/g;' \
      | hgLoadSqlTab -oldTable hg19snp147 b147_SNPContigLoc_105 placeholder stdin
      # Get rid of those false positives:
      hgsql hg19snp147 -e 'alter table b147_SNPContigLoc_105 add index (ctg_id);'
      hgsql hg19snp147 -e 'create table ContigLocFix select cl.* from b147_SNPContigLoc_105 as cl, b147_ContigInfo_105 as ci where cl.ctg_id = ci.ctg_id;'
      hgsql hg19snp147 -e 'alter table ContigLocFix add index (ctg_id);'
      hgsql hg19snp147 -e 'drop table b147_SNPContigLoc_105; \
                           rename table ContigLocFix to b147_SNPContigLoc_105;'
      hgsql hg19snp147 -e 'alter table b147_SNPContigLoc_105 add index (snp_id);'
  
      # Now continue with business as usual at the addToDbSnp step.
      cd /hive/data/outside/dbSNP/147/human_hg19
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra --continue addToDbSnp >>& do.log &
      tail -f do.log
  
      # While that's running, compare rs IDs in rs_fasta with b147_SNPContigLoc_105 to see
      # which IDs (if any) were omitted from the rs_fasta dump.
      ln -s ../human_hg38/rsFastaIds.txt .
      # zcat rs_fasta/rs*.fas.gz \
      # | perl -wne 'if (/^>/) { s/^>gnl\|dbSNP\|(rs\d+) .*/$1/ || die; print; }' \
      # | sort -u > rsFastaIds.txt
      zcat data/b147_SNPContigLoc_105.bcp.gz \
      | awk '{print "rs" $2;}' \
      | sort -u > contigLocIds.txt
      comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
      wc -l missingFromRsFasta.txt
  #1 missingFromRsFasta.txt
      cat missingFromRsFasta.txt
  #rs869025304
      # According to http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=rs869025304 it's
      # a pathogenic allele... better include it.
      # Since there's only one, instead of batch query, just get its fasta from the web page.
      cat > rs_fasta/rs_chBatchQuery.fas <<EOF
  >gnl|dbSNP|rs869025304 rs=869025304|pos=51|len=101|taxid=9606|mol="genomic"|class=in-del|alleles="TT/CCGTATAGCTGG"|build=147|allele origin=C(maternal)/+.-----(germline)|suspect=?|clinsig=Pathogenic
  TCCAAGGGAT ATCTTCTAAC CATACCGATG ATTCTCCGTA TAGCTGGTCT
  N
  TCCACCTTAT CATCCTTCCT CTGAGAAGTA TGAAAACACT AAGGTAAGGC
  EOF
      # uh-oh -- "class=in-del" needed to be edited to "class=2" (see ../shared/SnpClassCode.bcp.gz)
      # in order to match the download files that we parse.
      gzip rs_fasta/rs_chBatchQuery.fas
      # Now start over from addToDbSnp...
      kill %1
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra --continue addToDbSnp >>& do.log &
      tail -f do.log
  # *** All done!
  
      # 7/28/16 -- Chris Lee found some NR_'s and XR_'s in snp147CodingDbSnp where they don't
      # belong.  Turns out that dbSnp was assigning cds-indel to any deletion with length
      # multiple of 3 that intersected an exon, whether the transcript was coding or not.
      # Some affected variants will end up with both cds-indel and ncRNA because they hit
      # both coding and noncoding transcripts.
      # This does not affect ortho alleles or SNP-masked sequences.
      # Fix the values in hg19snp147.b147_SNPContigLocusId_105 and rebuild from ucscFunc onwards.
      # For rows in which dbSNP has assigned cds-indel (45) to a non-coding transcript (NR_ or
      # XR_), assign ncRNA (30) instead.
      cd /hive/data/outside/dbSNP/147/human_hg19
      hgsql hg19snp147 -e 'update b147_SNPContigLocusId_105 set fxn_class = 30 \
                             where fxn_class = 45 and mrna_acc like "_R\_%";'
      # Comment out all of the parts of addToDbSnp.csh except for the part that creates ucscFunc
      # and run the modified script.
      mkdir -p `cat workingDir`
      csh -efx ./addToDbSnp.csh >>& do.log & tail -f do.log
      # Re-run the pipeline from bigJoin onward.
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra --continue bigJoin >>& do.log &
      tail -f do.log
  # *** All done!
  
  
  ##############################################################################
  # SNP147 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 7/2/16 angie)
      # Redmine #17209
      screen -S ortho -t ortho
      ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg19 147 -debug
  # *** Steps were performed in /hive/data/genomes/hg19/bed/snp147Ortho.2016-07-01
      cd /hive/data/genomes/hg19/bed/snp147Ortho.2016-07-01
      ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg19 147 \
        >>& do.log & tail -f do.log
  # *** All done!
  
  
  ##############################################################################
  # SNPMASKED SEQUENCE FOR SNP147 (DONE 7/1/16 angie)
      # Redmine #17209
      screen -S mask -t mask
      ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg19 147 -debug
  # *** Steps were performed in /hive/data/genomes/hg19/snp147Mask.2016-07-01
      cd /hive/data/genomes/hg19/snp147Mask.2016-07-01
      ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg19 147 \
        >>& do.log & tail -f do.log
  # *** All done!
  
  
  ##############################################################################
  # SNP147 BED4 BIGBED FOR VAI (DONE 10/13/16 angie)
      # Redmine #18222 -- probably should fold this into dbSNP pipeline...
      screen -S snp -t snp
      cd /hive/data/outside/dbSNP/147/human_hg19
      # Make bigBed with just the first 4 columns of snp147
      zcat snp147.bed.gz \
      | cut -f 1-4 \
      | sort -k1,1 -k2n,2n \
        > $TMPDIR/hg19.snp147.bed4
    bedToBigBed $TMPDIR/hg19.snp147.bed4 /hive/data/genomes/hg19/chrom.sizes snp147.bed4.bb
      # Install in location that hgVai will check:
      mkdir -p /gbdb/hg19/vai
      ln -s `pwd`/snp147.bed4.bb /gbdb/hg19/vai/
      # Clean up
      rm $TMPDIR/hg19.snp147.bed4
  
  
  ##############################################################################
  # Add GTEx to Gene Sorter (2016-08-18 kate)
  # See hg/near/makeNear.doc
  
  ##############################################################################
  # Minke whale/balAcu1 Lastz run (DONE - 2016-08-18 - Hiram)
  
      mkdir /hive/data/genomes/hg19/bed/lastzBalAcu1.2016-08-18
      cd  /hive/data/genomes/hg19/bed/lastzBalAcu1.2016-08-18
      printf '# human vs Minke whale
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
  BLASTZ_M=254
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: Minke whale BalAcu1
  SEQ2_DIR=/hive/data/genomes/balAcu1/balAcu1.2bit
  SEQ2_LEN=/hive/data/genomes/balAcu1/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=30
  
  BASE=/hive/data/genomes/hg19/bed/lastzBalAcu1.2016-08-18
  TMPDIR=/dev/shm
  ' > DEF
  
      time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet) > do.log 2>&1
      # real    1085m48.306s
  
      cat fb.hg19.chainBalAcu1Link.txt
      #  1538575107 bases of 2897316137 (53.103%) in intersection
  
      time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` hg19 balAcu1) \
        > rbest.log 2>&1 &
      # real    449m19.809s
  
      # running the swap
      mkdir /hive/data/genomes/balAcu1/bed/blastz.hg19.swap
      cd /hive/data/genomes/balAcu1/bed/blastz.hg19.swap
      time (doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzBalAcu1.2016-08-18/DEF \
          -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet) > swap.log 2>&1
      #   real    121m53.369s
  
      cat fb.balAcu1.chainHg19Link.txt
      #    1488153903 bases of 2286657046 (65.080%) in intersection
  
      time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` balAcu1 hg19) \
        > rbest.log 2>&1 &
      #   real    437m16.453s
  
  #########################################################################
  # Crispr track. See ../crisprTrack/README.txt (2016-09-15 max)
  # Command: doCrispr.sh hg19 knownGene
  ##############################################################################
  
  
  ##############################################################################
  # DBSNP B149 / SNP149 (DONE 3/23/17 angie)
      # Redmine #18330
      mkdir -p /hive/data/outside/dbSNP/149/human_hg19
      cd /hive/data/outside/dbSNP/149/human_hg19
      # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/
    # to find the subdir name to use as orgDir below (human_9606_b149_GRCh37p13 in this case).
      # Go to that subdirectory, then to database/organism_data/ and look for files
      # whose names start with b149_* and may or may not end with a suffix that identifies
      # the build assembly version or some annotation version.  If there is a suffix shared
      # by all b149_* files, add that to config.ra as the "buildAssembly".
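    # For example, a hedged way to eyeball the shared suffix (FTP layout assumed
    # from the note above):
    #   curl -sl ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b149_GRCh37p13/database/organism_data/ \
    #     | grep '^b149_' | head
    # Here the shared suffix is "_105", hence "buildAssembly 105" below.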
    # Since this build is on GRCh37.p13 like b144 above, use the liftUp.lft file
    # and ignoreDbSnpContigsFile constructed for b144.
    cat > config.ra <<EOF
  db hg19
  orgDir human_9606_b149_GRCh37p13
  build 149
  buildAssembly 105
  liftUp ../../144/human_hg19/liftUp.lft
  refAssemblyLabel GRCh37.p13
  ignoreDbSnpContigsFile ../../144/human_hg19/patchContigs.txt
  EOF
      # dbSNP does not make a complete set of download files for hg19 anymore, so
      # do a "debug" run and link some files from hg38 before running the script.
      # (hard link so wget doesn't get confused)
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -debug
      ln ../human_hg38/data/{S*,B*} data/
      # Edit the download script: put if(0) around download files that dbSNP doesn't dump:
      vi ../download_human_hg19_149.csh
      # Run the edited script manually:
      csh -efx ../download_human_hg19_149.csh >& do.log & tail -f do.log
      # Now continue the usual way at the next step after download:
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue loadDbSnp >>& do.log &
      tail -f do.log
  
    # While that's running, compare rs IDs in rs_fasta with b149_SNPContigLoc_105 to see
    # which IDs (if any) were omitted from the rs_fasta dump.
      zcat rs_fasta/rs*.fas.gz \
      | perl -wne 'if (/^>/) { s/^>gnl\|dbSNP\|(rs\d+) .*/$1/ || die; print; }' \
      | sort -u > rsFastaIds.txt
      zcat data/b149_SNPContigLoc_105.bcp.gz \
      | awk '{print "rs" $2;}' \
      | sort -u > contigLocIds.txt
      wc -l rsFastaIds.txt contigLocIds.txt
  # 154206854 rsFastaIds.txt
  # 153907040 contigLocIds.txt
      comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
      wc -l missingFromRsFasta.txt
#0 missingFromRsFasta.txt
      # Good!  If there are some missing in the future see "DBSNP B147" above for instructions.
  
      # Yay, script completed:
  # *** All done!  (through the 'bigBed' step)
  
  
  ##############################################################################
  # SNP149 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 3/24/17 angie)
      # Redmine #18330
      screen -S ortho -t ortho
      ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg19 149 -debug
  # *** Steps were performed in /hive/data/genomes/hg19/bed/snp149Ortho.2017-03-24
      cd /hive/data/genomes/hg19/bed/snp149Ortho.2017-03-24
      ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg19 149 \
        >>& do.log & tail -f do.log
  # *** All done!
  
  
  ##############################################################################
  # SNPMASKED SEQUENCE FOR SNP149 (DONE 3/24/17 angie)
      # Redmine #18330
      screen -S mask -t mask
      ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg19 149 -debug
  # *** Steps were performed in /hive/data/genomes/hg19/snp149Mask.2017-03-24
      cd /hive/data/genomes/hg19/snp149Mask.2017-03-24
      ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg19 149 \
        >>& do.log & tail -f do.log
  # *** All done!
  
  #########################################################################
  # COSMIC v81 DONE Chris Eisenhart 2017-05-11
# Make a new COSMIC track for hg19
  mkdir /hive/data/outside/cosmic/hg19/v81
  cd /hive/data/outside/cosmic/hg19/v81
  
  # Get the new data
  sftp ceisenha@ucsc.edu@sftp-cancer.sanger.ac.uk
  # Login to SFTP server then run these commands
  get /files/grch37/cosmic/v81/CosmicMutantExport.tsv.gz
  
  # Get the schema from V80
  cp ~/kent/src/hg/lib/cosmicNew.as .
  
# Remove the 'NS' fields; matching the trailing \t keeps ENST transcript IDs (which contain 'NS') intact.
  zcat CosmicMutantExport.tsv.gz | sed 's/NS\t/\t/g' > cosMut.tsv
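# Spot check (optional): ENST accessions should survive the sed intact.
grep -m1 -o 'ENST[0-9]*' cosMut.tsv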
  
  # Use a script to convert to bed format.
  cosmicToBed cosMut.tsv cosMut.bed
# 126756 lines were skipped for not having genomic coordinates.
  
  # Sort and convert to big bed using the .as file.
  sort -k1,1 -k2,2n cosMut.bed > sCosMut.bed
bedToBigBed -type=bed4+31 -as=cosmicNew.as sCosMut.bed /hive/data/genomes/hg19/chrom.sizes cosMutHg19V81.bb -tab
  
  # Link it up so the outside world can see it.
  cd /gbdb/hg19/cosmic/
  ln -s /hive/data/outside/cosmic/hg19/v81/cosMutHg19V81.bb .
  
  
  ###########################################################################
  # Chimp Lastz run (DONE - 2017-05-04 - Hiram)
      screen -S hg19PanTro5      # use a screen to manage this longish running job
      mkdir /hive/data/genomes/hg19/bed/lastzPanTro5.2017-05-04
      cd /hive/data/genomes/hg19/bed/lastzPanTro5.2017-05-04
  
      # always set the BLASTZ program so we know what version was used
      printf "# human vs chimp
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_O=600
  BLASTZ_E=150
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_L=4500
  BLASTZ_T=2
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  #    A    C    G    T
  #    90 -330 -236 -356
  #  -330  100 -318 -236
  #  -236 -318  100 -330
  #  -356 -236 -330   90
  
  # TARGET: Human Hg19
  SEQ1_DIR=/scratch/data/hg19/hg19.2bit
  SEQ1_LEN=/scratch/data/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: Chimp PanTro5
  SEQ2_DIR=/hive/data/genomes/panTro5/panTro5.2bit
  SEQ2_LEN=/hive/data/genomes/panTro5/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=100
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzPanTro5.2017-05-04
  TMPDIR=/dev/shm
  " > DEF
  
      time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
          -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet) > do.log 2>&1
      # real    219m12.394s
  
      cat fb.hg19.chainPanTro5Link.txt
      # 2824333913 bases of 2897316137 (97.481%) in intersection
  
      # filter with doRecipBest.pl
      time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          hg19 panTro5) > rbest.log 2>&1
      # real    192m19.595s
  
      # running the swap
      mkdir /hive/data/genomes/panTro5/bed/blastz.hg19.swap
      cd /hive/data/genomes/panTro5/bed/blastz.hg19.swap
      time (doBlastzChainNet.pl -verbose=2 \
          -swap /hive/data/genomes/hg19/bed/lastzPanTro5.2017-05-04/DEF \
          -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet) > swap.log 2>&1
      # real    111m15.418s
  
      cat fb.panTro5.chainHg19Link.txt
      # 2929713252 bases of 3132620660 (93.523%) in intersection
  
      time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
          panTro5 hg19) > rbest.log 2>&1
      # real    279m26.262s
  
  #############################################################################
# Interactions track
  cd /hive/data/genomes/hg19/bed/interactions/
  
  # convert the protein interaction databases
  cd ppi
  
  cd iref
# sorry, I don't know exactly how I downloaded this file;
# the files are available at http://irefindex.org/wiki/index.php?title=README_MITAB2.6_for_iRefIndex
  psiMtabToTab iref 9606.mitab.08122013.txt > ../iref.tab
  cat All.mitab.08122013.txt | cut -f9 | sed -e 's/pubmed://g' | tr '|' '\n' | grep -v pmids | grep -v ^-$ | uniq | sort | uniq > allPmids.txt
  
  cd ../argdb
  python parse_shortened.py | tawk '{print "argdb"NR, "gene", "", "AR", "gene", "", $1, "regulation", "", "argdb", $3, "interaction", $2}' | cat headers.tab - > ../argdb.tab
  cd ..
  
  ggPpiToTab quickgo go/associations.tsv > go.tab
  ggPpiToTab corum corum/allComplexes.csv > corum.tab
  ggPpiToTab negatome negatome/manual.txt > negatome.tab
  
  # convert the pathway databases
  
cd ../pathways
  ggPpiToTab reactome reactome/homo_sapiens.interactions.txt > reactome.tab
ggGpmlToTab wikiPathways/gpml/*.gpml > wiki.tab
  ggKgmlToTab kegg/*.xml > kegg.tab
  ggPidToTab pid/NCI-Nature_Curated.xml pid/BioCarta.xml > pid.tab
  cd ..
  
  # the text/ directory file msr.tab was created by adding 'msr' in front of every line
  # of text/msr3/pubmed_evts_140805.txt
  
  # build the big ggLink table from information in the subdirectories
  # and write the output to mysql/
  ggTables build pathways/ ppi/ text/ mysql/
  # add the document/abstract information
  ggTables docs mysql/
  # load mysql/ into hgFixed
  ggTables load hgFixed
  
  # == CREATE THE GENE LOCATIONS ==
  cd geneModels
  # === hg19 ===
  
  # get best transcript for gencode basic
  mysql hg19 -NBe 'select chrom, txStart, txEnd, name from wgEncodeGencodeBasicV19' > gencode19.bed
  mysql hg19 -NBe 'select distinct name, name2 from wgEncodeGencodeBasicV19' > gencode19.names.tab
  bedBestTranscript gencode19.bed gencode19.names.tab -m > gencode19.best.bed
  
  # get best transcript for refGene
  mysql hg19 -NBe 'select chrom, txStart, txEnd, name from refGene' > refGene.bed
  mysql hg19 -NBe 'select distinct name, name2 from refGene' > refGene.names.tab
  bedBestTranscript refGene.bed refGene.names.tab -m > refGene.best.bed
  
  # get UCSC knownGene canonical transcripts
  hgsql hg19 -NBe 'select chrom, chromStart, chromEnd, geneSymbol from knownCanonical JOIN kgXref ON kgId=transcript' > knownCanon.best.bed
  
  # add all gencode comprehensive models, but only as a last resort
  mysql hg19 -NBe 'select chrom, txStart, txEnd, name from wgEncodeGencodeCompV19' > gencodeComp19.bed
  mysql hg19 -NBe 'select distinct name, name2 from wgEncodeGencodeCompV19' > gencodeComp19.names.tab
  bedBestTranscript -m gencodeComp19.bed gencodeComp19.names.tab > gencodeComp19.best.bed
  
  # finally, even add the pseudogenes
  mysql hg19 -NBe 'select chrom, txStart, txEnd, name from wgEncodeGencodePseudoGeneV19' > gencodePseudo19.bed
  mysql hg19 -NBe 'select distinct name, name2 from wgEncodeGencodePseudoGeneV19' > gencodePseudo19.names.tab
  bedBestTranscript -m gencodePseudo19.bed gencodePseudo19.names.tab > gencodePseudo19.best.bed
  
  # now take the first line for each gene from all files, in this order
bedNamePickFirst gencode19.best.bed refGene.best.bed gencodeComp19.best.bed knownCanon.best.bed gencodePseudo19.best.bed | cut -f-4 | bedSort stdin stdout > bestGenes.hg19.bed
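# Optional check: the pick-first step should leave one location per gene name
# (column 4 as built above); expect no output here.
cut -f4 bestGenes.hg19.bed | sort | uniq -d | head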
  
# == END GENE LOCATIONS ==
cd ..

# create the bigBed files
  ggTables bigBed mysql/ bigBed/ geneModels/bestGenes.hg19.bed hg19
ln -s `pwd`/bigBed/geneInteractions.hg19.bb /gbdb/hg19/bbi/interactions.bb
  #############################################################################
  
  ##############################################################################
  # DBSNP B150 / SNP150 (DONE 5/6/17 angie)
      # Redmine #19202
      mkdir -p /hive/data/outside/dbSNP/150/human_hg19
      cd /hive/data/outside/dbSNP/150/human_hg19
      # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/
    # to find the subdir name to use as orgDir below (human_9606_b150_GRCh37p13 in this case).
      # Go to that subdirectory, then to database/organism_data/ and look for files
      # whose names start with b150_* and may or may not end with a suffix that identifies
      # the build assembly version or some annotation version.  If there is a suffix shared
      # by all b150_* files, add that to config.ra as the "buildAssembly".
      # Since this build is on GRCh37.p13 like b144 above, use the liftUp.lft file
      # and ignoreDbSnpContigsFile constructed for b144.
      cat > config.ra <<EOF
  db hg19
  orgDir human_9606_b150_GRCh37p13
  build 150
  buildAssembly 105
  liftUp ../../144/human_hg19/liftUp.lft
  refAssemblyLabel GRCh37.p13
  ignoreDbSnpContigsFile ../../144/human_hg19/patchContigs.txt
  EOF
      # dbSNP does not make a complete set of download files for hg19 anymore, so
      # do a "debug" run and link some files from hg38 before running the script.
      # (hard link so wget doesn't get confused)
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -debug
      ln ../human_hg38/data/{S*,B*} data/
      # Edit the download script: put if(0) around download files that dbSNP doesn't dump.
      # They changed database/organism_data to database/data/organism_data so edit that too.
      vi ../download_human_hg19_150.csh
      # Run the edited script manually:
      csh -efx ../download_human_hg19_150.csh >& do.log & tail -f do.log
      # Now continue the usual way at the next step after download:
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue loadDbSnp >>& do.log &
      tail -f do.log
  
    # While that's running, compare rs IDs in rs_fasta with b150_SNPContigLoc_105 to see
    # which IDs (if any) were omitted from the rs_fasta dump.
      zcat rs_fasta/rs*.fas.gz \
      | perl -wne 'if (/^>/) { s/^>gnl\|dbSNP\|(rs\d+) .*/$1/ || die; print; }' \
      | sort -u > rsFastaIds.txt
      zcat data/b150_SNPContigLoc_105.bcp.gz \
      | awk '{print "rs" $2;}' \
      | sort -u > contigLocIds.txt
      wc -l rsFastaIds.txt contigLocIds.txt
  # 325658303 rsFastaIds.txt
  # 231449737 contigLocIds.txt
      comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
      wc -l missingFromRsFasta.txt
#0 missingFromRsFasta.txt
      # Good!  If there are some missing in the future see "DBSNP B147" above for instructions.
  
      # Yay, script completed:
  # *** All done!  (through the 'bigBed' step)
  
  
  ##############################################################################
  # SNP150 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 5/8/17, REDONE 7/7/17 angie)
      # Redmine #19202
    # Redone 7/6/17 to use latest chimp & macaque assemblies
      screen -S ortho -t ortho
      ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg19 150 -debug
  # *** Steps were performed in /hive/data/genomes/hg19/bed/snp150Ortho.2017-07-06
      cd /hive/data/genomes/hg19/bed/snp150Ortho.2017-07-06
      ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg19 150 \
        >>& do.log & tail -f do.log
      # Oops, liftOver cluster run crashed because there was no hg19ToRheMac8 liftover file yet -
      # added below.
      ssh ku
      cd /hive/data/genomes/hg19/bed/snp150Ortho.2017-07-06/run.liftOver
      para push -retries=5
      # wait for completion
      para time >>& ../do.log
      logout
      # Continue at next step
      cd /hive/data/genomes/hg19/bed/snp150Ortho.2017-07-06
      ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg19 150 \
        -buildDir `pwd` -continue join \
        >>& do.log & tail -f do.log
  # *** All done!
  
  
  ##############################################################################
  # SNPMASKED SEQUENCE FOR SNP150 (DONE 5/8/17 angie)
      # Redmine #19202
      screen -S mask -t mask
      ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg19 150 -debug
  # *** Steps were performed in /hive/data/genomes/hg19/snp150Mask.2017-05-08
      cd /hive/data/genomes/hg19/snp150Mask.2017-05-08
      ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg19 150 \
        >>& do.log & tail -f do.log
  # *** All done!
  
  
  ##############################################################################
# LASTZ human/hg19 Rhesus macaque/rheMac8 - (DONE - 2017-07-08 - Angie)
      mkdir /hive/data/genomes/hg19/bed/lastzRheMac8.2017-07-06
      cd /hive/data/genomes/hg19/bed/lastzRheMac8.2017-07-06
  
      printf '# human vs macaca mulatta
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  BLASTZ_O=600
  BLASTZ_E=150
  # other parameters from panTro2 vs hg18 lastz on advice from Webb
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human hg19
  SEQ1_DIR=/hive/data/genomes/hg19/hg19.2bit
  SEQ1_LEN=/hive/data/genomes/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: Macaca mulatta rheMac8
  SEQ2_DIR=/hive/data/genomes/rheMac8/rheMac8.2bit
  SEQ2_LEN=/hive/data/genomes/rheMac8/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=600
  SEQ2_LAP=0
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzRheMac8.2017-07-06
  TMPDIR=/dev/shm
  ' > DEF
  
      time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -syntenicNet -fileServer=hgwdev \
          -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > do.log 2>&1
      # real    243m19.420s
  
      cat fb.hg19.chainRheMac8Link.txt
      # 2508296679 bases of 2897316137 (86.573%) in intersection
  
      time (doRecipBest.pl -buildDir=`pwd` hg19 rheMac8) > rbest.log 2>&1 &
      # real    534m6.491s
  
      # and for the swap:
      mkdir /hive/data/genomes/rheMac8/bed/blastz.hg19.swap
      cd /hive/data/genomes/rheMac8/bed/blastz.hg19.swap
  
      time (doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzRheMac8.2017-07-06/DEF \
          -swap -chainMinScore=5000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
              -syntenicNet) > swap.log 2>&1
      # real    268m2.274s
  
      cat fb.rheMac8.chainHg19Link.txt
      # 2495454040 bases of 3142093174 (79.420%) in intersection
  
      time (doRecipBest.pl -buildDir=`pwd` rheMac8 hg19) > rbest.log 2>&1
      # real    1088m7.851s
  
  ##############################################################################
  # Snpedia (DONE - 2016-07-03 - Max) - pushed in July 2017
  cd /hive/data/genomes/hg19/bed/snpedia
  wget http://www.snpedia.com/files/gbrowse/SNPedia.gff
  python download.py
  python filterPages.py > goodPages.txt
  python makeBed.py
  hgLoadBed hg19 snpedia snpedia.bed
  hgLoadSqlTab hg19 snpediaHtml snpediaHtml.sql snpedia.htmlTab
  ##############################################################################
  # Gtex transcript track hg38->hg19 coordinate conversion (DONE - 2017-07-27 - Chris)
  mkdir /hive/data/outside/gtex/barChartTrack/hg19backLift
  cd /hive/data/outside/gtex/barChartTrack/hg19backLift
  
  hgsql hg19 -e "select * from wgEncodeGencodeCompV24lift37" | awk '{print $2"\t"$5"\t"$6}' > foo
  cut -f 1 foo | cut -f 1 -d "." > foo2
  cut -f 2,3,4 foo > foo3
  paste foo2 foo3 > foo4
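# (foo4 now holds: transcript ID without version, hg19 txStart, hg19 txEnd --
#  the join key plus coordinates matched against the hg38 barChart bed below.)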
  
  # NOTE: CHRISL10-05-2021 - Re-run to fix float parse bug:
  cp ../sortedGtexTransExp.bed sortedGtexTransExpFixed.bed
  cut -f 1,2,3 sortedGtexTransExpFixed.bed > foo5Fixed
  cut -f 4 sortedGtexTransExpFixed.bed > foo6Fixed
  cut -f 5- sortedGtexTransExpFixed.bed > foo7Fixed
  paste foo6Fixed foo5Fixed > foo8Fixed
  paste foo8Fixed foo7Fixed > foo9Fixed
  
  join <(sort foo4) <(sort foo9Fixed) > foo10Fixed
  
  cat foo10Fixed | awk '{print $4"\t"$2"\t"$3"\t"$1"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$14}' > foo11Fixed
  
  # Sort and convert into a bigBed file.
  sort -k1,1 -k2,2n foo11Fixed > sortedGtexTransExpHg19Fixed.bed
  # remove final line with header info:
  vim sortedGtexTransExpHg19Fixed.bed
  bedToBigBed -as=$HOME/kent/src/hg/lib/barChartBed.as -type=bed6+5 sortedGtexTransExpHg19Fixed.bed /hive/data/genomes/hg19/chrom.sizes gtexTranscExprHg19.bb
  # END NOTE: CHRISL10-05-2021
  
  # Link the files into gbdb
# (2017-07-30 kate)
  cd /gbdb/hg19/gtex
  ln -s /hive/data/outside/gtex/barChartTrack/hg19backLift/gtexTranscExprHg19.bb gtexTranscExpr.bb
  
  ##############################################################################
  # cosmicRegions (DONE - 2017-08-03 - Chris)
# Make a new COSMIC track for hg19 v82
  mkdir /hive/data/outside/cosmic/hg19/v82
  cd /hive/data/outside/cosmic/hg19/v82
  
  # Get the new data
  sftp ceisenha@ucsc.edu@sftp-cancer.sanger.ac.uk
  # Login to SFTP server (make an account online) then run this command
  get /files/grch37/cosmic/v82/CosmicMutantExport.tsv.gz
  
  # Get the schema from V81
  cp ../v81/cosmicNew.as .
  
# Remove the 'NS' fields; matching the trailing \t keeps ENST transcript IDs (which contain 'NS') intact.
  zcat CosmicMutantExport.tsv.gz | sed 's/NS\t/\t/g' > cosMut.tsv
  
  # Use a script to convert to bed format.
  cosmicToBed cosMut.tsv cosMut.bed
# 128966 lines were skipped for not having genomic coordinates.
  
  # Sort and convert to big bed using the .as file.
  sort -k1,1 -k2,2n cosMut.bed > sCosMut.bed
  bedToBigBed -type=bed8+31 -as=cosmicNew.as sCosMut.bed /hive/data/genomes/hg19/chrom.sizes cosMutHg19V82.bb -tab -extraIndex=name,cosmLabel
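# The -extraIndex fields support name lookups later if needed; for example
# (the COSM id below is hypothetical):
#   bigBedNamedItems cosMutHg19V82.bb COSM12345 stdout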
  
  # Link it up so the outside world can see it.
  cd /gbdb/hg19/cosmic/
  ln -s /hive/data/outside/cosmic/hg19/v82/cosMutHg19V82.bb .
  ##############################################################################
  # snpedia (DONE - 2017-09-06 - Max)
  # see ../hg38/snpedia.txt
  #########################################################################
  # LASTZ human/hg19 Gorilla/gorGor5 - (DONE - 2017-11-08 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzGorGor5.2017-11-08
      cd /hive/data/genomes/hg19/bed/lastzGorGor5.2017-11-08
  
      printf '# human vs gorilla
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
  BLASTZ_T=2
  BLASTZ_O=600
  BLASTZ_E=150
  BLASTZ_M=254
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
  #       A     C     G     T
  # A    90  -330  -236  -356
  # C  -330   100  -318  -236
  # G  -236  -318   100  -330
  # T  -356  -236  -330    90
  
  # TARGET: Human hg19
  SEQ1_DIR=/hive/data/genomes/hg19/hg19.2bit
  SEQ1_LEN=/hive/data/genomes/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: gorilla gorGor5
  SEQ2_DIR=/hive/data/genomes/gorGor5/gorGor5.2bit
  SEQ2_LEN=/hive/data/genomes/gorGor5/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=50
  
  BASE=/hive/data/genomes/hg19/bed/lastzGorGor5.2017-11-08
  TMPDIR=/dev/shm
  ' > DEF
  
      time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
          -chainMinScore=3000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
              -syntenicNet) > do.log 2>&1
      # real    151m54.643s
  
      cat fb.hg19.chainGorGor5Link.txt
      # 2805809292 bases of 2897316137 (96.842%) in intersection
  
      time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` hg19 gorGor5) \
          > rbest.log 2>&1 &
      # real    101m0.033s
  
      cat fb.hg19.chainRBestGorGor5Link.txt
      # 2704000429 bases of 2897316137 (93.328%) in intersection
  
    # this syntenic procedure is a work in progress to get it into the
    # primary doBlastzChainNet.pl script - Mon Nov 13 18:42:27 PST 2017
      # create and load up syntenic chainNet:
  cd /hive/data/genomes/hg19/bed/lastzGorGor5.2017-11-08/axtChain
  netToAxt hg19.gorGor5.syn.net.gz hg19.gorGor5.all.chain.gz \
  /scratch/data/hg19/hg19.2bit /hive/data/genomes/gorGor5/gorGor5.2bit stdout \
    | axtSort stdin stdout \
       | axtToChain stdin /hive/data/genomes/hg19/chrom.sizes \
          /hive/data/genomes/gorGor5/chrom.sizes stdout \
              | gzip -c > hg19.gorGor5.syn.chain.gz
  hgLoadChain -tIndex hg19 chainSynGorGor5 hg19.gorGor5.syn.chain.gz
  netFilter -minGap=10 hg19.gorGor5.syn.net.gz \
    | hgLoadNet -verbose=0 hg19 netSynGorGor5 stdin
  cd /hive/data/genomes/hg19/bed/lastzGorGor5.2017-11-08
  featureBits hg19 chainSynGorGor5Link > fb.hg19.chainSynGorGor5Link.txt 2>&1
  cat fb.hg19.chainSynGorGor5Link.txt
  # 2868720234 bases of 3049335806 (94.077%) in intersection
  
      # and for the swap:
      mkdir /hive/data/genomes/gorGor5/bed/blastz.hg19.swap
      cd /hive/data/genomes/gorGor5/bed/blastz.hg19.swap
  
      time (doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzGorGor5.2017-11-08/DEF \
          -swap -chainMinScore=3000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
              -syntenicNet) > swap.log 2>&1
      #  real    96m1.816s
  
      cat fb.gorGor5.chainHg19Link.txt
      # 2762099017 bases of 3080431298 (89.666%) in intersection
  
      time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` gorGor5 hg19) \
         > rbest.log 2>&1
      # real    106m51.054s
  
      cat fb.gorGor5.chainRBestHg19Link.txt
  # 2708063590 bases of 3080431298 (87.912%) in intersection
  
  ##############################################################################
  # GTEx cis eQTL tracks from GTEx release V6p  (DONE 10/25/17 kate)
      # Redmines #15646, #20236
  
  # This data published in Oct 12 Nature special issue:
  #       Genetic effects on gene expression across human tissues, doi:10.1038/nature24277
  
  # Data sources:
#       * GTEx V6p LDACC files, posted to the GTEx portal downloads page:
#               1. eQTL identifications, with effect sizes and p-values, by LDACC, downloaded from the GTEx portal
#               2. GTEx variant IDs, mapped to dbSNP
  #       * GTEx V6p cis-eQTL '95% credible set' by Caviar analysis (package 2)
  #               Provided via dropbox, 2/27/17 by Farhad Hormozdiari, at Eleazar Eskin lab, UCLA
  #
  # The track is based on the CAVIAR analysis 95% credible set, with effect sizes from the LDACC analysis.
  # Advising on data sources and track display from Casey Brown, U Penn, and JK
  
  ########
  # Download LDACC cis-eQTL calls from GTEx portal
  
  cd /hive/data/outside/GTEx/V6p
  mkdir eQtl; cd eQtl
  mkdir cis; cd cis
  
  wget http://gtexportal.org/static/datasets/gtex_analysis_v6p/single_tissue_eqtl_data/README_eQTL_v6p.txt
  wget http://gtexportal.org/static/datasets/gtex_analysis_v6p/single_tissue_eqtl_data/GTEx_Analysis_v6p_eQTL.tar
tar xvf GTEx_Analysis_v6p_eQTL.tar
  cd GTEx_Analysis_v6p_eQTL
  gunzip *.gz
  
  # using these files (effect size and p-value for snp/gene pairs):
  # <tissue>_Analysis.v6p.signif_snpgene_pairs.txt.gz
  
  #variant_id      gene_id tss_distance    pval_nominal    slope   slope_se        slope_fpkm      slope_fpkm_se   pval_nominal_threshold       min_pval_nominal        pval_beta
  #1_739528_G_A_b37        ENSG00000239906.1       599189  0.000201091     -0.583341       0.154452        -0.473059   0.114987 0.000234483     4.42635e-06     0.00461693
  
  ########
  # Download GTEx SNPs reference -- map imputed SNP's to dbSNP rsID's
  
  cd /hive/data/outside/GTEx/V6p/eQtl
  wget http://www.gtexportal.org/static/datasets/gtex_analysis_v6/reference/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_chr1-22+X_genot_imput_info04_maf01_HWEp1E6_variant_id_lookup.txt.gz
  gunzip *.gz
  
  wc -l *variant*txt
  11959406 GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_chr1-22+X_genot_imput_info04_maf01_HWEp1E6_variant_id_lookup.txt
  # ~12M variants
  
  ########
  # Set up build area to process CAVIAR dataset
  
  mkdir /hive/data/outside/GTEx/V6p/eQtl/Caviar2
  cd /hive/data/outside/GTEx/V6p/eQtl/Caviar2
  
  tar xvfz CAVIAR_output_checked.tar.gz
  
  find CAVIAR_output -print | head
  #...
  #CAVIAR_output/Pancreas_Analysis/out/file_ENSG00000141258.8_Pancreas_Analysis_post
  #CAVIAR_output/Pancreas_Analysis/out/file_ENSG00000136982.5_Pancreas_Analysis_set
  #CAVIAR_output/Pancreas_Analysis/out/file_ENSG00000197345.8_Pancreas_Analysis_set
  #CAVIAR_output/Pancreas_Analysis/out/file_ENSG00000172543.3_Pancreas_Analysis_hist
  
  # A directory for each tissue, with files per gene, named out/file_<geneId>_<tis>_Analysis_<type>
  #               with types:
  #       set:   Variants causal with probability of 95% (95% credible set)
  #       post:  Probability that a variant is causal
  #       hist:  Probability that gene has 0, 1, 2... 6 variants
  
  head CAVIAR_output/Pancreas_Analysis/out/file_ENSG00000108947.4_Pancreas_Analysis_set
  
  #17_7581271_C_T_b37
  #17_7596802_C_T_b37
# Format: chrom_pos_ref_alt_build (e.g. 17_7581271_C_T_b37 = chr17:7581271 C>T on b37/GRCh37)
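# For reference, a one-liner sketch to turn such IDs into 1-base BED intervals
# (assumes the chrom_pos_ref_alt_build format above; indel ends would need care):
#   echo 17_7581271_C_T_b37 | awk -F_ '{printf "chr%s\t%d\t%d\t%s>%s\n", $1, $2-1, $2, $3, $4}'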
  
  head CAVIAR_output/Pancreas_Analysis/out/file_ENSG00000108947.4_Pancreas_Analysis_post
  
  #17_7439510_A_AT_b37     0.0039097
  #17_7571452_G_A_b37      0.00049001
  
  cat CAVIAR_output/Pancreas_Analysis/out/file_ENSG00000108947.4_Pancreas_Analysis_hist
  #2.81096e-13 0.227802 0.69627 0.0737158 0.00217832 3.3476e-05 3.35718e-07
  
  #NOTE: no files for 9 tissues:
  #* Bladder, Brain-Amygdala, Brain-Spinal cord, Brain-Substantia nigra, Cervix - Ectocervix, Cervix - Endocervix, Fallopian Tube, Kidney, Minor Salivary Gland.
# This leaves 44 tissues, the 'eQTL tissues' listed on the GTEx portal.  The tissues above
# were rejected for lacking sufficient genotyped samples (<70).
  
  ########
  # Pull GTEx tissue and gene info to files for input to track builder
  
  hgsql hgFixed -e 'select * from gtexTissue' > gtexTissue.tab
  hgsql hg19 -Ne 'select name, geneId from gtexGene' > gtexGeneName.tab
  
  ########
  # Create link to variant mapping file
  
  ln -s ../GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_chr1-22+X_genot_imput_info04_maf01_HWEp1E6_variant_id_lookup.txt gtexVariant.tab
  
  ########
  # Run parser and create BED files to load
  
  mkdir UCSC_output
  set bin = ~/kent/src/hg/makeDb/outside/gtexEqtl
  perl $bin/gtexEqtlMakeBeds.pl >& makeEqtlBeds.log &
  
  ########
  # Load 'cluster' summary track
  
  cd UCSC_output
  bedSort gtexEqtlGeneCluster.bed gtexEqtlGeneCluster.sorted.bed
  set dir = /cluster/home/kate/kent/src/hg/lib
  hgLoadBed hg19 -noSort -tab -renameSqlTable -allowStartEqualEnd \
          -type=bed5+11 -as=$dir/gtexEqtlCluster.as -sqlTable=$dir/gtexEqtlCluster.sql \
          gtexEqtlCluster gtexEqtlGeneCluster.sorted.bed
  #Read 1622952 elements of size 16 from gtexEqtlGeneCluster.sorted.bed
  
# This caused a load problem: one instance in the entire file where tissues were not combined, ugh:
#chr21   45650008        45650009        rs56323213      5       ENSG00000160223.12      ICOSLG  -10840  -0.484  -       0       1       skinExposed,    -0.484, 7.064,  0.005,
#chr21   45650008        45650008        rs145424134     8       ENSG00000160223.12      ICOSLG  -10841  -0.070  -       0       1       esophagusMuscular,      -0.070, 5.106,  0.008,
  
  # refine generated trackDb.gtexEqtl.ra file and install in makeDb/trackDb/human/hg19
  
  ########
  # Load 44 per-tissue tracks: gtexEqtlTissue<tissueName>
csh $bin/gtexEqtlLoadTissues.csh UCSC_output >&! loadTissuesV2.log &
  
#NOTE: V2 was a second release that followed immediately after the first release (which was
#  timed to coincide with the Nature paper publication).  V2 revised the schema (added Ensembl
#  gene ID and additional summary fields) and color conventions.
  
  ###########################################################################
  # HGMD (updated 12/10/19 max)
  # HGMD (updated 01/25/18 max)
  # HGMD (updated 12/12/20 max)
  # got hgmd from Frank Schacherer Frank.Schacherer@qiagen.com and Rupert Yip Rupert.Yip@qiagen.com
  # see also the file hg38/hgmd.txt
  year=2020
  cd /hive/data/genomes/hg19/bed/hgmd
cat /hive/data/outside/hgmd/$year.4-hgmd-public_hg19.tsv | grep -v \# \
  | tawk '{if ($5=="I") {start=$4-1; end=$4+1; col="100,100,100"} else if ($5=="D") {start=$4-1; end=$4; col="170,170,170"} else {start=$4-1; end=$4; col="0,0,0"}; print "chr"$3,start,end,$2":"$1,0,".",start,end,col,$2,$1,$5}' \
  | sed -e 's/M$/substitution/' \
  | sed -e 's/I$/insertion (between the two basepairs, sequence not provided by HGMD)/' \
  | sed -e 's/D$/deletion (endpoint not provided by HGMD)/' \
  | sed -e 's/X$/insertion-deletion (endpoint not provided by HGMD)/' \
  | sed -e 's/R$/regulatory variant/' \
  | sed -e 's/S$/splicing variant/' \
  | sort -k1,1 -k2,2n > hgmd.bed
  bedToBigBed hgmd.bed /hive/data/genomes/hg19/chrom.sizes hgmd.bb -type=bed9+ -as=hgmd.as -tab
  ln -s /hive/data/genomes/hg19/bed/hgmd/hgmd.bb /gbdb/hg19/bbi/hgmd.bb
  hgBbiDbLink hg19 hgmd /gbdb/hg19/bbi/hgmd.bb
  # Forgot, finally done Oct 24: also updated hgBeacon
  bigBedToBed /gbdb/hg19/bbi/hgmd.bb /tmp/temp.bed
  python2 /usr/local/apache/cgi-bin/hgBeacon -f hgmd /tmp/temp.bed hgmd
  # Forgot, finally done June 26: updated GBIB as qateam
  scp /gbdb/hg19/bbi/hgmd.bb hgdownload:/usr/local/apache/gbib/prot/
  # next restrict RefSeq down to HGMD subset
  
  # addition of HGMD-restricted subset, Max, Jan 29 2019, updated Dec 10 2019
  cd /hive/data/genomes/hg19/bed/ncbiRefSeq.p13.2020-10-27/
  year=2019
  # change in 2019: ignore the version numbers, otherwise only 1815 transcripts left, big update by HGMD in 2019
  # adding "." so NM_123 doesn't match NM_123123
  cat /hive/data/outside/hgmd/$year.4-hgmd-public_hg38.tsv | cut -f7 | cut -d. -f1 | sort -u | awk '{print $1"."}' > hgmdTranscripts.txt
  cat process/hg19.curated.gp.gz | fgrep -f hgmdTranscripts.txt - > hgmd.curated.gp
  hgLoadGenePred -genePredExt hg19 ncbiRefSeqHgmd hgmd.curated.gp
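# Optional check (the scratch file names below are ad hoc): every transcript kept
# by the fgrep should be in the HGMD accession list; expect no output.
cut -f1 hgmd.curated.gp | cut -d. -f1 | sort -u > keptAcc.txt
cut -d. -f1 hgmdTranscripts.txt | sort -u > wantAcc.txt
comm -23 keptAcc.txt wantAcc.txt | head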
wc -l hgmd.curated.gp
# 7965 hgmd.curated.gp in 2019
# 8971 hgmd.curated.gp in 2020
  
  # now continue the process at ../hg38/hgmd.txt
  
  #############################################################################
  # LASTZ human/hg19 vs. pig/susScr11 - (DONE - 2018-04-02 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzSusScr11.2018-04-02
      cd /hive/data/genomes/hg19/bed/lastzSusScr11.2018-04-02
  
      printf '# human vs pig
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
  BLASTZ_O=400
  BLASTZ_E=30
  BLASTZ_M=254
  # default BLASTZ_Q score matrix:
  #       A     C     G     T
  # A    91  -114   -31  -123
  # C  -114   100  -125   -31
  # G   -31  -125   100  -114
  # T  -123   -31  -114    91
  
  # TARGET: Human hg19
  SEQ1_DIR=/hive/data/genomes/hg19/hg19.2bit
  SEQ1_LEN=/hive/data/genomes/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: pig susScr11
  SEQ2_DIR=/hive/data/genomes/susScr11/susScr11.2bit
  SEQ2_LEN=/hive/data/genomes/susScr11/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=2
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzSusScr11.2018-04-02
  TMPDIR=/dev/shm
  '> DEF
  
      time ($HOME/kent/src/hg/utils/automation/bigBlastzChainNet.pl `pwd`/DEF -verbose=2 \
          -chainMinScore=3000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
              -trackHub -syntenicNet) > do.log 2>&1
      # real    638m46.019s
  
      cat fb.hg19.chainSusScr11Link.txt
      # 1427668193 bases of 2897316137 (49.276%) in intersection
      cat fb.hg19.chainSynSusScr11Link.txt
      # 1375005074 bases of 2897316137 (47.458%) in intersection
  
      # testing -trackHub option
      time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \
  	-trackHub hg19 susScr11) > rbest.log 2>&1 &
      # real    691m17.717s
  
      cat fb.hg19.chainRBest.SusScr11.txt
      # 1342171392 bases of 2897316137 (46.325%) in intersection
  
      # and for the swap:
      mkdir /hive/data/genomes/susScr11/bed/blastz.hg19.swap
      cd /hive/data/genomes/susScr11/bed/blastz.hg19.swap
  
      # testing -trackHub option
      time ($HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzSusScr11.2018-04-02/DEF \
          -swap -chainMinScore=3000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
              -trackHub -syntenicNet) > swap.log 2>&1
      #  real    107m7.364s
  
      cat fb.susScr11.chainHg19Link.txt
      # 1386496715 bases of 2472073034 (56.086%) in intersection
      cat fb.susScr11.chainSynHg19Link.txt
      # 1353158526 bases of 2472073034 (54.738%) in intersection
  
      # testing -trackHub option
      time ($HOME/kent/src/hg/utils/automation/doRecipBest.pl \
   -load -trackHub -workhorse=hgwdev -buildDir=`pwd` susScr11 hg19) \
  	> rbest.log 2>&1
      # real    610m45.624s
  
      cat fb.susScr11.chainRBest.Hg19.txt
      # 1342604720 bases of 2472073034 (54.311%) in intersection
  
  #########################################################################
  # DBSNP B151 / SNP151 (DONE 4/16/18 angie)
      # Redmine #21010
      mkdir -p /hive/data/outside/dbSNP/151/human_hg19
      cd /hive/data/outside/dbSNP/151/human_hg19
      # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/
      # to find the subdir name to use as orgDir below (human_9606_b151_GRCh37p13 in this case).
      # Go to that subdirectory, then to database/organism_data/ and look for files
      # whose names start with b151_* and may or may not end with a suffix that identifies
      # the build assembly version or some annotation version.  If there is a suffix shared
      # by all b151_* files, add that to config.ra as the "buildAssembly".
      # Since this build is on GRCh37.p13 like b144 above, use the liftUp.lft file
      # and ignoreDbSnpContigsFile constructed for b144.
      cat > config.ra <<EOF
  db hg19
  orgDir human_9606_b151_GRCh37p13
  build 151
  buildAssembly 105
  liftUp ../../144/human_hg19/liftUp.lft
  refAssemblyLabel GRCh37.p13
  ignoreDbSnpContigsFile ../../144/human_hg19/patchContigs.txt
  EOF
      # dbSNP does not make a complete set of download files for hg19 anymore, so
      # do a "debug" run and link some files from hg38 before running the script.
      # (hard link so wget doesn't get confused)
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -debug
      ln ../human_hg38/data/{S*,B*} data/
      # Edit the download script: put if(0) around download files that dbSNP doesn't dump.
      vi ../download_human_hg19_151.csh
      # Run the edited script manually:
      csh -efx ../download_human_hg19_151.csh >& do.log & tail -f do.log
      # Now continue the usual way at the next step after download:
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue loadDbSnp >>& do.log &
      tail -f do.log
  
    # While that's running, compare rs IDs in rs_fasta with b151_SNPContigLoc_105 to see
    # which IDs (if any) were omitted from the rs_fasta dump.
      zcat rs_fasta/rs*.fas.gz \
      | perl -wne 'if (/^>/) { s/^>gnl\|dbSNP\|(rs\d+) .*/$1/ || die; print; }' \
      | sort -u > rsFastaIds.txt
      zcat data/b151_SNPContigLoc_105.bcp.gz \
      | awk '{print "rs" $2;}' \
      | sort -u > contigLocIds.txt
      wc -l rsFastaIds.txt contigLocIds.txt
  #  611582085 rsFastaIds.txt
  #  648992551 contigLocIds.txt
      comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
      wc -l missingFromRsFasta.txt
  #49191042 missingFromRsFasta.txt
      # Doh!  Way too many for a batch request.  So weird... hg19 rs_fasta download files
      # are dated more recently (March 4-6) than hg38 (Feb 22-24).
      # Looks like I should use the hg38 fasta, it has only 45 missing (see hg38/variation.txt):
      comm -13 ../human_hg38/rsFastaIds.txt contigLocIds.txt | wc -l
  #45
      mv rs_fasta rs_fasta_missing_49M
      mkdir rs_fasta
      ln ../human_hg38/rs_fasta/* rs_fasta
      tail -f do.log
      # A mysql load command in the loadDbSnp step failed, so I edited loadDbSnp.csh to continue
      # from the failed command.
    csh -efx loadDbSnp.csh >>& do.log & tail -f do.log
      # Continue from next step:
      ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue addToDbSnp >>& do.log &
      tail -f do.log
      # Yay, script completed:
  # *** All done!  (through the 'bigBed' step)
  
  
  ##############################################################################
  # SNP151 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 4/18/18 angie)
      # Redmine #21010
      screen -S ortho -t ortho
      ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg19 151 -debug
  # *** Steps were performed in /hive/data/genomes/hg19/bed/snp151Ortho.2018-04-17
      cd /hive/data/genomes/hg19/bed/snp151Ortho.2018-04-17
      ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg19 151 \
        >>& do.log & tail -f do.log
  # *** All done!
  
  
  ##############################################################################
  # SNPMASKED SEQUENCE FOR SNP151 (DONE 4/17/18 angie)
      # Redmine #21010
      screen -S mask -t mask
      ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg19 151 -debug
  # *** Steps were performed in /hive/data/genomes/hg19/snp151Mask.2018-04-17
      cd /hive/data/genomes/hg19/snp151Mask.2018-04-17
      ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg19 151 \
        >>& do.log & tail -f do.log
  # *** All done!
  
  
  ##############################################################################
# LASTZ human/hg19 vs. bonobo/panPan2 - (DONE - 2018-05-01 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzPanPan2.2018-05-01
      cd /hive/data/genomes/hg19/bed/lastzPanPan2.2018-05-01
  
      printf '# human vs bonobo
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz
  BLASTZ_O=400
  BLASTZ_E=30
  BLASTZ_M=254
  # default BLASTZ_Q score matrix:
  #       A     C     G     T
  # A    91  -114   -31  -123
  # C  -114   100  -125   -31
  # G   -31  -125   100  -114
  # T  -123   -31  -114    91
  
  # TARGET: Human hg19
  SEQ1_DIR=/hive/data/genomes/hg19/hg19.2bit
  SEQ1_LEN=/hive/data/genomes/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  SEQ1_LIMIT=5
  
  # QUERY: bonobo panPan2
  SEQ2_DIR=/hive/data/genomes/panPan2/panPan2.2bit
  SEQ2_LEN=/hive/data/genomes/panPan2/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=100
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzPanPan2.2018-05-01
  TMPDIR=/dev/shm
  ' > DEF
  
      time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
          -chainMinScore=3000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
              -trackHub -syntenicNet) > do.log 2>&1
      # real    638m46.019s
  
      cat fb.hg19.chainPanPan2Link.txt
      # 2771380606 bases of 2897316137 (95.653%) in intersection
      cat fb.hg19.chainSynPanPan2Link.txt
      # 2730967003 bases of 2897316137 (94.259%) in intersection
  
      # testing -trackHub option
      time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \
  	hg19 panPan2) > rbest.log 2>&1 &
      # real    157m46.388s
  
      cat fb.hg19.chainRBest.PanPan2.txt
      # 2657287479 bases of 2897316137 (91.715%) in intersection
  
      # and for the swap:
      mkdir /hive/data/genomes/panPan2/bed/blastz.hg19.swap
      cd /hive/data/genomes/panPan2/bed/blastz.hg19.swap
  
      time (doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzPanPan2.2018-05-01/DEF \
          -swap -chainMinScore=3000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
              -trackHub -syntenicNet) > swap.log 2>&1
      #  real    181m44.494s
  
      cat fb.panPan2.chainHg19Link.txt
      # 2671932133 bases of 2725937399 (98.019%) in intersection
      cat fb.panPan2.chainSynHg19Link.txt
      # 2657217983 bases of 2725937399 (97.479%) in intersection
  
      time (doRecipBest.pl -load -workhorse=hgwdev \
  	-buildDir=`pwd` panPan2 hg19) > rbest.log 2>&1
      # real    132m52.206s
  
      cat fb.panPan2.chainRBest.Hg19.txt
      # 2661021780 bases of 2725937399 (97.619%) in intersection
  
  ##############################################################################
  # GNOMAD VARIANTS AND COVERAGE PLOTS (DONE - 2018-05-09 - Jonathan)
  # (Amended to add exome parsing 2018-06-18)
  # Redmine #18951
  
  # Fetch supporting files from gnomad.broadinstitute.org/downloads.
  FETCHDIR=/hive/data/outside/gnomAD
  mkdir -p $FETCHDIR
  cd $FETCHDIR
  GNOMADVER=2.0.2
  gsutil -m cp -r gs://gnomad-public/release/$GNOMADVER/vcf .
  gsutil -m cp -r gs://gnomad-public/release/$GNOMADVER/coverage .
  
  # Start with the VCF files
  cd vcf/genomes
  WORKDIR=/hive/data/genomes/hg19/bed/gnomad
  mkdir -p $WORKDIR
  
# Some things misbehave with the .bgz extension (zcat command-line autocompletion, for one),
# so we rename the soft links to .gz when we create them.
# Leaving the WORKDIR link creation in place for the moment, but it's a bit pointless;
# only the gbdb links are used in the end.
  mkdir -p /gbdb/hg19/gnomAD/vcf
  mkdir -p $WORKDIR/vcf
  for fil in `ls $FETCHDIR/vcf/genomes/gnomad.genomes.r$GNOMADVER.sites.chr*.vcf.bgz`;
      do BASE=`basename $fil .bgz`; ln -s $fil $WORKDIR/vcf/$BASE.gz; ln -s $fil.tbi $WORKDIR/vcf/$BASE.gz.tbi;
      ln -s $fil /gbdb/hg19/gnomAD/vcf/$BASE.gz; ln -s $fil.tbi /gbdb/hg19/gnomAD/vcf/$BASE.gz.tbi; done
  
  for fil in `ls $FETCHDIR/vcf/exomes/gnomad.exomes.r$GNOMADVER.sites.vcf.bgz`;
      do BASE=`basename $fil .bgz`; ln -s $fil $WORKDIR/vcf/$BASE.gz; ln -s $fil.tbi $WORKDIR/vcf/$BASE.gz.tbi;
      ln -s $fil /gbdb/hg19/gnomAD/vcf/$BASE.gz; ln -s $fil.tbi /gbdb/hg19/gnomAD/vcf/$BASE.gz.tbi; done
  
# Dump a list of annotated chromosomes for the gnomadGenomesVariants table.  Exome variants, all
# being in a single file, are just handled via a bigDataUrl in trackDb.  We still need an
# exomesChrList for building the coverage wigs, though.
  ls /gbdb/hg19/gnomAD/vcf/*.sites.chr*.gz | perl -pe 'chomp; s/^.*sites\.(chr\w+)\.vcf.*$/$_\t$1\n/' > $WORKDIR/genomesChrList
  hgLoadSqlTab hg19 gnomadGenomesVariants ~/kent/src/hg/lib/bbiChroms.sql $WORKDIR/genomesChrList
  ls $FETCHDIR/coverage/exomes/*.coverage.txt.gz | perl -pe 'chomp; s/^.*\.(chr\w+)\.coverage\.txt\.gz$/$_\t$1\n/' > $WORKDIR/exomesChrList
  
  # Now to make the coverage wig files
  mkdir -p $WORKDIR/coverage
  cd $FETCHDIR/coverage/genomes
  
  # makeWigs.pl is a short script to split coverage files into a collection of wiggles.
  # The script appends to these wiggle files because coverage is provided per-chromosome,
  # so makeWigs is run separately on each per-chrom coverage file and the results must be
  # joined together.
  
  cat > $WORKDIR/coverage/makeWigs.pl << EOF
  #!/bin/env perl
  use strict;
  use warnings;
  
  my \$version = shift;
  my \$chrom = shift;
  my \$sourceDir = shift;
  my \$destDir = shift;
  my \$genomesOrExomes = shift;
  
  open IN, "zcat \$sourceDir/gnomad.\$genomesOrExomes.r\$version.\$chrom.coverage.txt.gz |" ||
      die "Could not open coverage file on chromosome \$chrom";
  open (OUTMEAN, ">>", "\$destDir/gnomad.\$genomesOrExomes.coverage.mean.wig");
  open (OUTMEDIAN, ">>", "\$destDir/gnomad.\$genomesOrExomes.coverage.median.wig");
  open (OUT1, ">>", "\$destDir/gnomad.\$genomesOrExomes.coverage.depth1.wig");
  open (OUT5, ">>", "\$destDir/gnomad.\$genomesOrExomes.coverage.depth5.wig");
  open (OUT10, ">>", "\$destDir/gnomad.\$genomesOrExomes.coverage.depth10.wig");
  open (OUT15, ">>", "\$destDir/gnomad.\$genomesOrExomes.coverage.depth15.wig");
  open (OUT20, ">>", "\$destDir/gnomad.\$genomesOrExomes.coverage.depth20.wig");
  open (OUT25, ">>", "\$destDir/gnomad.\$genomesOrExomes.coverage.depth25.wig");
  open (OUT30, ">>", "\$destDir/gnomad.\$genomesOrExomes.coverage.depth30.wig");
  open (OUT50, ">>", "\$destDir/gnomad.\$genomesOrExomes.coverage.depth50.wig");
  open (OUT100, ">>", "\$destDir/gnomad.\$genomesOrExomes.coverage.depth100.wig");
  
  my \$skipHeader = <IN>;
  my \$prevPos = -10;
  while (my \$line = <IN>)
  {
      chomp(\$line);
      my @a = split /\s+/, \$line;
      my \$pos = \$a[1];
      if (\$pos != \$prevPos + 1)
      {
          print OUTMEAN "fixedStep chrom=\$chrom start=\$pos step=1\n";
          print OUTMEDIAN "fixedStep chrom=\$chrom start=\$pos step=1\n";
          print OUT1 "fixedStep chrom=\$chrom start=\$pos step=1\n";
          print OUT5 "fixedStep chrom=\$chrom start=\$pos step=1\n";
          print OUT10 "fixedStep chrom=\$chrom start=\$pos step=1\n";
          print OUT15 "fixedStep chrom=\$chrom start=\$pos step=1\n";
          print OUT20 "fixedStep chrom=\$chrom start=\$pos step=1\n";
          print OUT25 "fixedStep chrom=\$chrom start=\$pos step=1\n";
          print OUT30 "fixedStep chrom=\$chrom start=\$pos step=1\n";
          print OUT50 "fixedStep chrom=\$chrom start=\$pos step=1\n";
          print OUT100 "fixedStep chrom=\$chrom start=\$pos step=1\n";
      }
      printf OUTMEAN "%s\n", \$a[2];
      printf OUTMEDIAN "%s\n", \$a[3];
      printf OUT1   "%s\n", \$a[4];
      printf OUT5   "%s\n", \$a[5];
      printf OUT10  "%s\n", \$a[6];
      printf OUT15  "%s\n", \$a[7];
      printf OUT20  "%s\n", \$a[8];
      printf OUT25  "%s\n", \$a[9];
      printf OUT30  "%s\n", \$a[10];
      printf OUT50  "%s\n", \$a[11];
    printf OUT100 "%s\n", \$a[12];
    # remember this position so a new fixedStep header starts only at coverage gaps
    \$prevPos = \$pos;
}
  EOF
  
  chmod +x $WORKDIR/coverage/makeWigs.pl
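# Optional spot check of the script on one chromosome before the full loops
# (the /tmp/wigTest scratch dir is hypothetical; the script appends, so start it empty):
mkdir -p /tmp/wigTest
$WORKDIR/coverage/makeWigs.pl $GNOMADVER chr21 `pwd` /tmp/wigTest genomes
head -3 /tmp/wigTest/gnomad.genomes.coverage.mean.wig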
  
  for CHR in `cut -f 2 $WORKDIR/genomesChrList`;
      do $WORKDIR/coverage/makeWigs.pl $GNOMADVER $CHR `pwd` $WORKDIR/coverage genomes;
      done
  
  cd $FETCHDIR/coverage/exomes
  for CHR in `cut -f 2 $WORKDIR/exomesChrList`;
      do $WORKDIR/coverage/makeWigs.pl $GNOMADVER $CHR `pwd` $WORKDIR/coverage exomes;
      done
  
  # Convert to bigWigs
  for WIG in `ls $WORKDIR/coverage/*.wig`;
      do wigToBigWig $WIG /hive/data/genomes/hg19/chrom.sizes $WORKDIR/coverage/`basename $WIG .wig`.bw;
      done
  
  # No need to keep the wigs after the bigWigs are built
  for WIG in `ls $WORKDIR/coverage/*.wig`;
      do if [ -e $WORKDIR/coverage/`basename $WIG .wig`.bw ]; then rm $WIG; fi
      done
  
  mkdir -p /gbdb/hg19/gnomAD/coverage
  for BW in `ls $WORKDIR/coverage/*.bw`;
      do ln -s $BW /gbdb/hg19/gnomAD/coverage/`basename $BW`;
      done
  
  ##############################################################################
  # TEST SAME-SPECIES LIFTOVER (IN PROGRESS 6/1/18 angie)
  
      mkdir /hive/data/genomes/hg19/bed/blat.hg38.2018-06-01
      cd /hive/data/genomes/hg19/bed/blat.hg38.2018-06-01
      # Stop before the load step -- I just want to compare the chain & net results with
      # chainBridge in the mix.
      $HOME/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl \
        -stop=net -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/hg19/11.ooc hg19 hg38 >>& do.log & tail -f do.log
  #TODO: compare results with original.
  
  
  ##############################################################################
  # GNOMAD pLI AND MISSENSE (IN PROGRESS 10/2/18 ChrisL)
  # Redmine #20394 and #22061
  
      mkdir /hive/data/outside/gnomAD/pLI
      # files downloaded as redmine attachments (#22061) and then scp'd to above location
      # files provided by Anne O'Donnell-Luria: odonnell@broadinstitute.org
  
      # remove track line (which has weird characters anyways)
      tail -n +2 mis_z_20170508.bed > misZ.bed
      tail -n +2 pLI_20170508.bed > pLI.bed
  
      # make sure every field exists:
      tawk '{print NF}' misZ.bed | sort -u
      # 5
      tawk '{print NF}' pLI.bed | sort -u
      # 5
  
      # I predict there will be more work on these to get colors and other features
      # correct. For now just turn them into bigBeds:
      cat << EOF > pLI.as
  table pLI
  "BED5 with the pLI score as the name and a score for proper shading"
      (
      string chrom;       "Reference sequence chromosome or scaffold"
      uint   chromStart;  "Start position in chromosome"
      uint   chromEnd;    "End position in chromosome"
      string name;        "pLI score in range 0 - 1"
      uint   score;       "Score for item shading"
      )
  EOF
  
      cat << EOF > misZ.as
  table misZ
  "BED5 with the missense Z-score as the name and a score for proper shading"
      (
      string chrom;       "Reference sequence chromosome or scaffold"
      uint   chromStart;  "Start position in chromosome"
      uint   chromEnd;    "End position in chromosome"
      string name;        "Missense Z-score score"
      uint   score;       "Score for item shading"
      )
  EOF
  
      chromSizes=/hive/data/genomes/hg19/chrom.sizes
      sort -k1,1 -k2,2n pLI.bed > pLI.bed.sorted
      sort -k1,1 -k2,2n misZ.bed > misZ.bed.sorted
      bedToBigBed -tab -type=bed5 -as=pLI.as pLI.bed.sorted $chromSizes pLI.bb
      bedToBigBed -tab -type=bed5 -as=misZ.as misZ.bed.sorted $chromSizes misZ.bb
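    # Optional: eyeball the item counts in the new bigBeds.
    bigBedInfo pLI.bb | grep itemCount
    bigBedInfo misZ.bb | grep itemCount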
      mkdir /gbdb/hg19/gnomAD/pLI/
      ln -s `pwd`/*.bb /gbdb/hg19/gnomAD/pLI/
  
      # TODO: Ask Anne for feedback and changes, especially on color
  
  ##############################################################################
  # crispr 10K shoulders (DONE - 2018-11-09 - Hiram)
      time (~/kent/src/hg/utils/automation/doCrispr.pl \
       -stop=load -buildDir=`pwd` -smallClusterHub=hgwdev-101 hg19 ncbiRefSeq) \
  	> do.log 2>&1
      # real    5896m44.643s
  
      # hive cleaning 2021-04-26
      time (~/kent/src/hg/utils/automation/doCrispr.pl \
       -continue=cleanup -buildDir=`pwd` -smallClusterHub=hgwdev hg19 \
          -fileServer=hgwdev -bigClusterHub=ku \
            -workhorse=hgwdev) > cleanup.log 2>&1 &
  
  ##############################################################################
  # LASTZ human/hg19 vs. chimp/panTro6 - (DONE - 2018-12-14 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzPanTro6.2018-12-14
      cd /hive/data/genomes/hg19/bed/lastzPanTro6.2018-12-14
  
      printf '# human vs chimp
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_O=600
  BLASTZ_E=150
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_L=4500
  BLASTZ_T=2
  BLASTZ_Q=/hive/data/staging/data/blastz/human_chimp.v2.q
  #    A    C    G    T
  #    90 -330 -236 -356
  #  -330  100 -318 -236
  #  -236 -318  100 -330
  #  -356 -236 -330   90
  
  # TARGET: Human Hg19
  SEQ1_DIR=/hive/data/genomes/hg19/hg19.2bit
  SEQ1_LEN=/hive/data/genomes/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: Chimp PanTro6
  SEQ2_DIR=/hive/data/genomes/panTro6/panTro6.2bit
  SEQ2_LEN=/hive/data/genomes/panTro6/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=100
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzPanTro6.2018-12-14
  TMPDIR=/dev/shm
  ' > DEF
  
      time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
          -chainMinScore=5000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=hgwdev-101 -bigClusterHub=ku \
              -syntenicNet) > do.log 2>&1
      # real    131m22.214s
  
      cat fb.hg19.chainPanTro6Link.txt
      # 2918066802 bases of 2991694177 (97.539%) in intersection
      cat fb.hg19.chainSynPanTro6Link.txt
      # 2899302671 bases of 2991694177 (96.912%) in intersection
  
      time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \
  	hg19 panTro6) > rbest.log 2>&1 &
      # real    59m20.269s
  
      cat fb.hg19.chainRBest.PanTro6.txt
      # 2740453344 bases of 2991694177 (91.602%) in intersection
  
      # and for the swap:
      mkdir /hive/data/genomes/panTro6/bed/blastz.hg19.swap
      cd /hive/data/genomes/panTro6/bed/blastz.hg19.swap
  
      time (doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzPanTro6.2018-12-14/DEF \
          -swap -chainMinScore=5000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=hgwdev-101 -bigClusterHub=ku \
              -syntenicNet) > swap.log 2>&1
      #  real    68m21.941s
  
      cat fb.panTro6.chainHg19Link.txt
      # 2810896564 bases of 3018592990 (93.119%) in intersection
  
      cat fb.panTro6.chainSynHg19Link.txt
      # 2796543348 bases of 3018592990 (92.644%) in intersection
  
      time (doRecipBest.pl -load -workhorse=hgwdev \
  	-buildDir=`pwd` panTro6 hg19) > rbest.log 2>&1
      # real    58m58.852s
  
      cat fb.panTro6.chainRBest.Hg19.txt
      # 2745994828 bases of 3018592990 (90.969%) in intersection
  
  ##############################################################################
  # LASTZ human/hg19 vs. orangutan/ponAbe3 - (DONE - 2018-12-14 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzPonAbe3.2018-12-14
      cd /hive/data/genomes/hg19/bed/lastzPonAbe3.2018-12-14
  
      printf '# human vs orangutan
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_O=600
  BLASTZ_E=150
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_L=4500
  BLASTZ_T=2
  BLASTZ_Q=/hive/data/staging/data/blastz/human_chimp.v2.q
  #    A    C    G    T
  #    90 -330 -236 -356
  #  -330  100 -318 -236
  #  -236 -318  100 -330
  #  -356 -236 -330   90
  
  # TARGET: Human Hg19
  SEQ1_DIR=/hive/data/genomes/hg19/hg19.2bit
  SEQ1_LEN=/hive/data/genomes/hg19/chrom.sizes
  SEQ1_CHUNK=10000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: Orangutan PonAbe3
  SEQ2_DIR=/hive/data/genomes/ponAbe3/ponAbe3.2bit
  SEQ2_LEN=/hive/data/genomes/ponAbe3/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=100
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzPonAbe3.2018-12-14
  TMPDIR=/dev/shm
  ' > DEF
  
      time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
          -chainMinScore=5000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=hgwdev-101 -bigClusterHub=ku \
              -syntenicNet) > do.log 2>&1
      # real    130m5.904s
  
      cat fb.hg19.chainPonAbe3Link.txt
      # 2832921438 bases of 2991694177 (94.693%) in intersection
      cat fb.hg19.chainSynPonAbe3Link.txt
      # 2809245575 bases of 2991694177 (93.901%) in intersection
  
      time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \
  	hg19 ponAbe3) > rbest.log 2>&1 &
      # real    76m2.846s
  
      cat fb.hg19.chainRBest.PonAbe3.txt
      # 2638828970 bases of 2991694177 (88.205%) in intersection
  
      # and for the swap:
      mkdir /hive/data/genomes/ponAbe3/bed/blastz.hg19.swap
      cd /hive/data/genomes/ponAbe3/bed/blastz.hg19.swap
  
      time (doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzPonAbe3.2018-12-14/DEF \
          -swap -chainMinScore=5000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=hgwdev-101 -bigClusterHub=ku \
              -syntenicNet) > swap.log 2>&1
      #  real    62m32.858s
  
      cat fb.ponAbe3.chainHg19Link.txt
      # 2690870339 bases of 3043444524 (88.415%) in intersection
  
      cat fb.ponAbe3.chainSynHg19Link.txt
      # 2675805099 bases of 3043444524 (87.920%) in intersection
  
      time (doRecipBest.pl -load -workhorse=hgwdev \
  	-buildDir=`pwd` ponAbe3 hg19) > rbest.log 2>&1
      # real    76m24.498s
  
      cat fb.ponAbe3.chainRBest.Hg19.txt
      # 2641865423 bases of 3043444524 (86.805%) in intersection
  
  #############################################################################
  # genomenom mastermind track, Max, Feb 2019
  cd /hive/data/genomes/hg19/bed/mastermind/
wget 'https://mastermind.genomenon.com/cvr/download?format=csv' -O mastermind.2018.11.26.csv.zip
unzip mastermind.2018.11.26.csv.zip
  mv mastermind_cited_variants_reference-2018.11.26-csv/ 2018-11-26
  hgsql hg19 -NB -e 'select alias, chrom from chromAlias where source = "refseq";' > chromAlias.tab
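# chromAlias.tab maps RefSeq sequence accessions to UCSC chrom names; the assumption
# here is that mastermindToBed.py picks it up from the current directory to translate
# the RefSeq-style names in the CSV into UCSC coordinates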
  python ~/kent/src/hg/makeDb/mastermind/mastermindToBed.py 2018-11-26/mastermind_cited_variants_reference-2018.11.26.csv
  bedSort mastermind.bed mastermind.bed
  bedToBigBed -type=bed9+ -as=~/kent/src/hg/makeDb/mastermind/mastermind.as -tab mastermind.bed /hive/data/genomes/hg19/chrom.sizes  mastermind.bb
  ln -s `pwd`/mastermind.bb /gbdb/hg19/bbi/mastermind.bb
  ##############################################################################
  # DGV GOLD (DATABASE OF GENOMIC VARIANTS GOLD STANDARD) (DONE 5/06/19 ChrisL)
  # Redmine #23371
  ##############################################################################
      TODAY=`date +%y%m%d`
      mkdir -p /hive/data/genomes/hg19/bed/dgv/$TODAY
      cd /hive/data/genomes/hg19/bed/dgv/$TODAY
      wget http://dgv.tcag.ca/dgv/docs/DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3
  
      # GFF3 with the 9th field full of extra info that we need to recreate the blocks
      # as seen at the DGV website. See note-6 in the redmine (23371) for an example
      # of the different cnv representations (1, 2, or 3 blocks).
  
      # what sub-fields are in the 9th field:
      head -1 DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3 | cut -f9 | tr ';' '\n' | cut -d'=' -f1
      # ID
      # Name
      # variant_type
      # variant_sub_type
      # outer_start
      # inner_start
      # inner_end
      # outer_end
      # inner_rank
      # num_variants
      # variants
      # num_studies
      # Studies
      # num_platforms
      # Platforms
      # number_of_algorithms
      # algorithms
      # num_samples
      # samples
      # Frequency
      # PopulationSummary
      # num_unique_samples_tested
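
    # the same trick pulls a single attribute's values for a quick look
    # (illustration only, not part of the build):
    cut -f9 DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3 | tr ';' '\n' \
        | grep '^inner_rank=' | cut -d'=' -f2 | sort | uniq -c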
  
      # and how many unique CNV regions?
      cut -f9 DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3 | tr ';' '\t' | cut -f1 | cut -d'=' -f2 | sort -u | wc -l
      # 38185
      wc -l DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3
      # 114555 DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3
      calc 114555 /3
      # 114555 /3 = 38185.000000
  
      # run script to process the bedlines out of each of the gff lines
      ~/kent/src/hg/utils/automation/translateDgvGold.py DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3 | sort -k1,1 -k2,2n | uniq > dgvGold.bed12
  
      cat << EOF > dgvGold.as
  table dgvGold
  "Database of Genomic Variants Gold Standard Curated Variants"
      (
      string chrom; "Chromosome name"
    uint chromStart; "Outer (maximal) start boundary of CNV"
    uint chromEnd; "Outer (maximal) end boundary of CNV"
      string name; "Name from gff"
      uint score; "Not used"
      char[1] strand; "Not used"
      uint thickStart; "Same as chromEnd"
      uint thickEnd; "Same as chromEnd"
      uint reserved;  "Color of item. Blue for gain and red for loss"
      int blockCount; "Number of blocks"
      int[blockCount] blockSizes; "Size of each block"
      int[blockCount] chromStarts; "Start position of each block relative to chromStart"
      string dgvID; "Name of CNV from DGV"
      string variant_type; "CNV"
      string variant_sub_type; "Gain or Loss"
    int inner_rank; "Rank used to assign the blocks"
      int num_variants; "Number of variants coalesced to form the entire region"
      string[num_variants] variants; "Supporting variants"
      int num_studies; "Number of studies"
      string[num_studies] Studies; "Study names in 'Name Year' format"
      int num_platforms; "Number of sequencing platforms"
      string[num_platforms] Platforms; "Sequencing platform names"
      int number_of_algorithms; "Number of CNV detection algorithms"
      string[number_of_algorithms] algorithms; "CNV detection algorithms used"
      int num_samples; "Number of samples"
      string[num_samples] samples; "Sample names"
      string Frequency; "Overall frequency of variants across all studies"
      string PopulationSummary; "Populations tested across all studies"
    int num_unique_samples_tested; "Number of unique samples tested"
      )
  EOF
  
      CHROMSIZES=/hive/data/genomes/hg19/chrom.sizes
      bedToBigBed -type=bed12+17 -as=dgvGold.as -tab -extraIndex=name dgvGold.bed12 $CHROMSIZES dgvGold.bb
      bigBedInfo dgvGold.bb
      # version: 4
      # fieldCount: 29
      # hasHeaderExtension: yes
      # isCompressed: yes
      # isSwapped: 0
      # extraIndexCount: 0
      # itemCount: 38,185
      # primaryDataSize: 30,841,362
      # primaryIndexSize: 6,892
      # zoomLevels: 8
      # chromCount: 24
      # basesCovered: 580,564,080
      # meanDepth (of bases covered): 3.668451
      # minDepth: 1.000000
      # maxDepth: 81.000000
      # std of depth: 5.825349
  
      # link into gbdb
      mkdir -p /gbdb/hg19/dgv
      ln -s `pwd`/dgvGold.bb /gbdb/hg19/dgv/
  
  ##############################################################################
# LASTZ human/hg19 vs. rhesus macaque/rheMac10 - (DONE - 2019-07-09 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzRheMac10.2019-07-09
      cd /hive/data/genomes/hg19/bed/lastzRheMac10.2019-07-09
  
      printf '# human vs macaca mulatta
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz
  # maximum M allowed with lastz is only 254
  BLASTZ_M=254
  BLASTZ_Q=/hive/data/staging/data/blastz/human_chimp.v2.q
  BLASTZ_O=600
  BLASTZ_E=150
  # other parameters from panTro2 vs hg18 lastz on advice from Webb
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_T=2
  
  # TARGET: Human Hg19
  SEQ1_DIR=/hive/data/genomes/hg19/hg19.2bit
  SEQ1_LEN=/hive/data/genomes/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: Macaca Mulatta RheMac10
  SEQ2_DIR=/hive/data/genomes/rheMac10/rheMac10.2bit
  SEQ2_LEN=/hive/data/genomes/rheMac10/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=600
  SEQ2_LAP=0
  SEQ2_IN_CONTIGS=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzRheMac10.2019-07-09
  TMPDIR=/dev/shm
  ' > DEF
      # << happy emacs
  
      time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
          -syntenicNet -fileServer=hgwdev \
          -chainMinScore=5000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > do.log 2>&1
      # real    85m20.849s
  
      cat fb.hg19.chainRheMac10Link.txt
      # 2604364199 bases of 2991694177 (87.053%) in intersection
      cat fb.hg19.chainSynRheMac10Link.txt
      # 2570126144 bases of 2991694177 (85.909%) in intersection
  
      time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` hg19 rheMac10) > rbest.log 2>&1 &
      # real    112m12.228s
  
      cat fb.hg19.chainRBest.RheMac10.txt
      # 2421810991 bases of 2991694177 (80.951%) in intersection
  
      # and for the swap:
      mkdir /hive/data/genomes/rheMac10/bed/blastz.hg19.swap
      cd /hive/data/genomes/rheMac10/bed/blastz.hg19.swap
  
      time (doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzRheMac10.2019-07-09/DEF \
          -swap -chainMinScore=5000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
              -syntenicNet) > swap.log 2>&1
      #  real    56m29.163s
  
      cat fb.rheMac10.chainHg19Link.txt
      # 2483311773 bases of 2936892733 (84.556%) in intersection
      cat fb.rheMac10.chainSynHg19Link.txt
      # 2461925388 bases of 2936892733 (83.828%) in intersection
  
      time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` rheMac10 hg19) > rbest.log 2>&1
      # real    112m15.825s
  
      cat fb.rheMac10.chainRBest.Hg19.txt
      # 2423781773 bases of 2936892733 (82.529%) in intersection
  
  #########################################################################
  # 2019-12-02: AVADA fulltext variants, Max
  cd /hive/data/genomes/hg19/bed/avada
  wget http://bejerano.stanford.edu/AVADA/avada_v1.00_2016.vcf.gz
  python ~/kent/src/hg/makeDb/avada/toBed.py > avada.bed
  cat avada.bed | sort -k10 > avada.s.bed
  cp /hive/data/inside/pubs/text/medline/articles.db /dev/shm/max/
  bedAppendPaperInfo avada.s.bed avadaPapers.bed  --db /dev/shm/max --geneIdx=9
  bedSort avadaPapers.bed avadaPapers.bed
  cat avadaPapers.bed | uniq > avadaPapers.u.bed
  bedToBigBed avadaPapers.u.bed /hive/data/genomes/hg19/chrom.sizes -as=${HOME}/kent/src/hg/makeDb/avada/avada.as  -tab avada.bb -type=bed9+
  rm /dev/shm/max/articles.db
  cd /gbdb/hg19/bbi
  ln -s /hive/data/genomes/hg19/bed/avada/avada.bb
  
  #########################################################################
  # Hi-C example track drawing from Rao 2014 - (DONE - 2019-10-07 - Jonathan)
  mkdir -p /hive/data/genomes/hg19/bed/hic
  cd /hive/data/genomes/hg19/bed/hic
  
  # Files are located on GEO at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63525.  I grabbed the hic files
  # and used a subset of them (the combined files that haven't been filtered).
  wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE63nnn/GSE63525/suppl/GSE63525_GM12878_insitu_primary%2Breplicate_combined.hic
  wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE63nnn/GSE63525/suppl/GSE63525_HMEC_combined.hic
  wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE63nnn/GSE63525/suppl/GSE63525_HUVEC_combined.hic
  wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE63nnn/GSE63525/suppl/GSE63525_IMR90_combined.hic
  wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE63nnn/GSE63525/suppl/GSE63525_K562_combined.hic
  wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE63nnn/GSE63525/suppl/GSE63525_KBM7_combined.hic
  wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE63nnn/GSE63525/suppl/GSE63525_NHEK_combined.hic
  
  mkdir -p /gbdb/hg19/bbi/hic
  cd /gbdb/hg19/bbi/hic
  ln -s /hive/data/genomes/hg19/bed/hic/*.hic .
  #########################################################################
  
  # Illumina GDA  (DONE braney 2019-10-16)
  cd /cluster/data/hg19/bed/gda
  wget "http://webdata.illumina.com.s3-website-us-east-1.amazonaws.com/downloads/productfiles/global-diversity-array/infinium-global-diversity-array-8-d1-csv.zip"
  unzip *.zip
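# CSV layout assumed from the kit manifest: $10=chrom, $11=1-based position,
# $2=marker name, $21=strand, $4=allele info; bedClip drops off-chromosome coordinates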
  awk 'BEGIN {FS=","; OFS="\t"} { print "chr" $10, $11-1, $11, $2,0, $21,$4}' *.csv | sort -k1,1 -k2,2n | bedClip stdin /cluster/data/hg19/chrom.sizes gda.bed
  hgLoadSqlTab hg19 snpArrayIlluminaGDA gda.sql gda.bed
  
  #########################################################################
  # ncbiRefSeq.p13 update (DONE - 2019-11-21 - Hiram)
  
      mkdir /hive/data/genomes/hg19/bed/ncbiRefSeq.p13.2019-11-21
      cd /hive/data/genomes/hg19/bed/ncbiRefSeq.p13.2019-11-21
  
      # running step wise just to be careful
      time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev \
        -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
        refseq vertebrate_mammalian Homo_sapiens \
        GCF_000001405.25_GRCh37.p13 hg19) > download.log 2>&1
      # real    2m27.137s
  
      time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
        -continue=process -bigClusterHub=ku -dbHost=hgwdev \
        -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
        refseq vertebrate_mammalian Homo_sapiens \
        GCF_000001405.25_GRCh37.p13 hg19) > process.log 2>&1
      # real    2m9.450s
  
      ### BEFORE loading this updated table
      cat fb.ncbiRefSeq.hg19.txt
      #  85414465 bases of 2991694177 (2.855%) in intersection
  
      featureBits -enrichment hg19 refGene ncbiRefSeq
   # refGene 3.002%, ncbiRefSeq 2.855%, both 2.690%, cover 89.59%, enrich 31.38x
  
      featureBits -enrichment hg19 ncbiRefSeq refGene
   # ncbiRefSeq 2.855%, refGene 3.002%, both 2.690%, cover 94.21%, enrich 31.38x
  
      featureBits -enrichment hg19 ncbiRefSeqCurated refGene
   # ncbiRefSeqCurated 2.855%, refGene 3.002%, both 2.690%, cover 94.21%, enrich 31.38x
  
      featureBits -enrichment hg19 refGene ncbiRefSeqCurated
   # refGene 3.002%, ncbiRefSeqCurated 2.855%, both 2.690%, cover 89.59%, enrich 31.38x
  
      time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
        -continue=load -bigClusterHub=ku -dbHost=hgwdev \
        -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
        refseq vertebrate_mammalian Homo_sapiens \
        GCF_000001405.25_GRCh37.p13 hg19) > load.log 2>&1
      # real    0m21.982s
  
      ### AFTER loading this updated table
      cat fb.ncbiRefSeq.hg19.txt
      # 93708953 bases of 2991694177 (3.132%) in intersection
  
      featureBits -enrichment hg19 refGene ncbiRefSeq
   # refGene 3.002%, ncbiRefSeq 3.132%, both 2.983%, cover 99.35%, enrich 31.72x
  
      featureBits -enrichment hg19 ncbiRefSeq refGene
   # ncbiRefSeq 3.132%, refGene 3.002%, both 2.983%, cover 95.23%, enrich 31.72x
  
      featureBits -enrichment hg19 ncbiRefSeqCurated refGene
   # ncbiRefSeqCurated 3.132%, refGene 3.002%, both 2.983%, cover 95.23%, enrich 31.72x
  
      featureBits -enrichment hg19 refGene ncbiRefSeqCurated
   # refGene 3.002%, ncbiRefSeqCurated 3.132%, both 2.983%, cover 99.35%, enrich 31.72x
  
  #########################################################################
  # CADD (DONE - 2019-12-05 - Max)
  wget https://krishna.gs.washington.edu/download/CADD/bigWig/CADD_GRCh37-v1.4.bw -O /hive/data/genomes/hg19/bed/cadd/CADD-v1.4.bw
  ln -s /hive/data/genomes/hg19/bed/cadd/CADD-v1.4.bw /gbdb/hg19/bbi/CADD-v1.4.bw
  #########################################################################
  
  #########################################################################
  # gnomAD 2 pLI and other loss of function metrics (DONE - 2019-12-10 - Chris)
  
  ### hg19 gnomad v2.1.1 gene/transcript constraint data ###
  cd /hive/data/outside/gnomAD.2/
  mkdir constraint
  cd constraint
  
  # use gsutil to copy files:
  gsutil cp gs://gnomad-public/release/2.1.1/constraint/*.txt.bgz .
  Copying gs://gnomad-public/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz...
  Copying gs://gnomad-public/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz...
  Copying gs://gnomad-public/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.downsamplings.txt.bgz...
  | [3 files][205.6 MiB/205.6 MiB]
  Operation completed over 3 objects/205.6 MiB.
  
  transcriptFile=gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz
  geneFile=gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz
  
  hgsql -Ne "select name from wgEncodeGencodeCompV19" hg19 | tr '.' '\t' | cut -f1 > hg19.gencodeV19.transcripts
  cut -f11 pliByGene.bed | sort > pliByGene.transcripts
  cut -f11 pliByTranscripts.bed | sort > pliByTranscripts.transcripts
  wc -l *.transcripts
  # 189020 hg19.gencodeV19.transcripts
  #  19704 pliByGene.transcripts
  #  80950 pliByTranscripts.transcripts
  # 289674 total
  
  # check that v19 has all the transcripts:
  comm -12 hg19.gencodeV19.transcripts pliByGene.transcripts | wc -l
  19704
  comm -12 hg19.gencodeV19.transcripts pliByTranscripts.transcripts | wc -l
  80950
  rm hg19.gencodeV19.transcripts
  
  # ok safe to use v19 exon boundaries, just need to drop the version numbers:
  hgsql -Ne "select * from wgEncodeGencodeCompV19" hg19 | cut -f2- | genePredToBed | sed -Ee 's/\.[0-9]+//' | sort -k4 > hg19.gencodeCompV19.bed12
  
  gzip -cd $geneFile | tail -n +2 \
      | tawk '{print $75,$76,$77,$64,$65,$1,$2,$3,$4,$5,$33,$12,$13,$14,$32,$17,$20,$21,$24,$25,$26,$27,$28,$29,$30}' \
      | sort -k7 | join -t $'\t' -1 4 -2 7 hg19.gencodeCompV19.bed12 - \
      | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=false 2>genes.chromMismatches
  
  gzip -cd $transcriptFile | tail -n +2 \
      | tawk '{print $76,$77,$78,$65,$66,$1,$2,$4,$5,$6,$34,$13,$14,$15,$33,$18,$21,$22,$25,$26,$27,$28,$29,$30,$31}' \
      | sort -k7 | join -t $'\t' -1 4 -2 7 hg19.gencodeCompV19.bed12 - \
      | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=true 2>transcripts.chromMismatches
  sort -k1,1 -k2,2n pliByTranscript.tab > pliByTranscript.bed
  sort -k1,1 -k2,2n missenseByTranscript.tab > missenseByTranscript.bed
  sort -k1,1 -k2,2n pliByGene.tab > pliByGene.bed
  sort -k1,1 -k2,2n missenseByGene.tab > missenseByGene.bed
  
  # make pli .as file:
  #  table pliMetrics
  #  "bed12+6 for displaying gnomAD haploinsufficiency prediction scores"
  #      (
  #      string chrom;      "Reference sequence chromosome or scaffold"
  #      uint   chromStart; "Start position in chromosome"
  #      uint   chromEnd;   "End position in chromosome"
  #      string name;       "ENST or ENSG Name"
  #      uint   score;      "pLI score between 0-1000"
  #      char[1] strand;    "strand of transcript"
  #      uint thickStart;   "Start of where display is thick"
  #      uint thickEnd;     "End of where display should be thick"
  #      uint itemRgb;    "Color of item"
  #      int blockCount;   "Number of exons"
  #      int[blockCount] blockSizes;  "Size of each exon"
  #      int[blockCount] blockStarts; "0-based start position of each exon"
  #      string _mouseOver;  "Mouseover label"
  #      float _loeuf;      "LOEUF value for filters"
  #      float _pli;        "pLI value for filters"
  #      string geneName;   "Associated Gene symbol"
  #      string synonymous; "Synonymous metrics"
#      string pLoF;       "Predicted Loss of Function metrics"
  #      )
  
  # make missense .as file:
  #table missenseMetrics
  #"bed12+5 for displaying gnomAD missense prediction scores"
  #    (
  #    string chrom;      "Reference sequence chromosome or scaffold"
  #    uint   chromStart; "Start position in chromosome"
  #    uint   chromEnd;   "End position in chromosome"
  #    string name;       "ENST or ENSG Name"
  #    uint   score;      "pLI score between 0-1000, or  -1 for NA"
  #    char[1] strand;    "strand of transcript"
  #    uint thickStart;   "Start of where display is thick"
  #    uint thickEnd;     "End of where display should be thick"
  #    uint itemRgb;    "Color of item"
  #    int blockCount;   "Number of exons"
  #    int[blockCount] blockSizes;  "Size of each exon"
  #    int[blockCount] chromStarts; "0-based start position of each exon"
  #    string _mouseOver;  "Mouseover label"
  #    float _zscore;         "Z-score value for filters"
  #    string geneName;   "Gene symbol"
  #    string synonymous; "Synonymous metrics"
  #    string missense;   "Missense metrics"
  #    )
  
  sizes=/hive/data/genomes/hg19/chrom.sizes
  bedToBigBed -type=bed12+6 -as=pliMetrics.as -tab -extraIndex=name,geneName pliByGene.bed $sizes pliByGene.bb
  bedToBigBed -type=bed12+6 -as=pliMetrics.as -tab -extraIndex=name,geneName pliByTranscript.bed $sizes pliByTranscript.bb
  bedToBigBed -type=bed12+5 -as=missenseMetrics.as -tab -extraIndex=name,geneName missenseByGene.bed $sizes missenseByGene.bb
bedToBigBed -type=bed12+5 -as=missenseMetrics.as -tab -extraIndex=name,geneName missenseByTranscript.bed $sizes missenseByTranscript.bb
  cd /gbdb/hg19/gnomAD/pLI/
  ln -s /hive/data/outside/gnomAD.2/constraint/pliByGene.bb
  ln -s /hive/data/outside/gnomAD.2/constraint/pliByTranscript.bb
  ln -s /hive/data/outside/gnomAD.2/constraint/missenseByGene.bb
  ln -s /hive/data/outside/gnomAD.2/constraint/missenseByTranscript.bb
  ##############################################################################
# 2020-01-13: Add size filter to dgvMerged and dgvSupporting tracks (ChrisL)
  cd /hive/data/genomes/hg19/bed/dgv/160810
  zcat dgvMerged.bed.gz | tawk '{print $0, $3-$2}' > dgvMergedWithSize.bed
  zcat dgvSupporting.bed.gz | tawk '{print $0, $3-$2}' > dgvSupportingWithSize.bed
  cat dgvPlusSize.as
  # table dgvPlus
  # "Database of Genomic Variants incorporating dbVar, July 2013 and later"
  #     (
  #     string chrom;       "Reference sequence chromosome or scaffold"
  #     uint   chromStart;  "Start position in chromosome"
  #     uint   chromEnd;    "End position in chromosome"
  #     string name;        "ID of merged variant or supporting variant"
  #     uint   score;       "Score from 0-1000 (placeholder for BED 9+ format)"
  #     char[1] strand;     "+ or - (placeholder for BED 9+ format)"
  #     uint thickStart;    "Same as chromStart (placeholder for BED 9+ format)"
  #     uint thickEnd;      "Same as chromStart (placeholder for BED 9+ format)"
  #     uint itemRgb;   "Item R,G,B color."
  #     string varType;     "Type of variation"
  #     string reference;   "Literature reference for the study that included this variant"
  #     uint pubMedId;      "For linking to pubMed abstract of reference"
  #     lstring method;     "Brief description of method"
  #     lstring platform;    "Sequencing platform (if specified)"
  #     string mergedVariants; "If this is a supporting variant, ID of merged variant"
  #     lstring supportingVariants; "If this is a merged variant, IDs of supporting variants"
  #     uint sampleSize;    "Number of samples in study"
  #     uint observedGains; "Number of samples with copy number gains"
  #     uint observedLosses; "Number of samples with copy number losses"
  #     lstring cohortDescription; "Description of sample population for the study"
  #     lstring genes;      "Genes overlapping this variant"
  #     lstring samples;    "Sample IDs if available"
  #     uint _size;         "Genomic Size of variant"
  #     )
  bedToBigBed -tab -as=dgvPlusSize.as -type=bed9+14 dgvMergedWithSize.bed  /hive/data/genomes/hg19/chrom.sizes dgvMerged.bb
  # pass1 - making usageList (24 chroms): 1054 millis
  # pass2 - checking and writing primary data (392583 records, 23 fields): 8495 millis
  bedToBigBed -tab -as=dgvPlusSize.as -type=bed9+14 dgvSupportingWithSize.bed  /hive/data/genomes/hg19/chrom.sizes dgvSupporting.bb
  # pass1 - making usageList (25 chroms): 2577 millis
  # pass2 - checking and writing primary data (6668715 records, 23 fields): 27271 millis
  cd /gbdb/hg19/dgv/
  ln -s /hive/data/genomes/hg19/bed/dgv/160810/dgvMerged.bb
  ln -s /hive/data/genomes/hg19/bed/dgv/160810/dgvSupporting.bb
  ##############################################################################
  # gnomAD Missense Constraint Scores track - Jan 15 2020 - ChrisL
  
      cd /hive/data/outside/gnomAD.2/constraint/
      mkdir missense
  
      gsutil cat gs://gnomad-public/legacy/exacv1_downloads/release1/regional_missense_constraint/README_fordist_mpc_values > README.txt
  
      # so the v2 files are what we want since they are most recent:
      gsutil cp gs://gnomad-public/legacy/exacv1_downloads/release1/regional_missense_constraint/fordist_constraint_official_mpc_values_v2.txt.* .
  
    # this per-base file is not what we want for a bigBed, although it could make a nice bigWig:
      chrom	pos	ref	alt	gene_name	ENST	ENSG	CCDS	Consequence	HGVSc	HGVSp	Amino_acids	context	SIFT	PolyPhen	obs_exp	mis_badness	fitted_score	MPC
      1	69094	G	A	OR4F5	ENST00000335137	ENSG00000186092	CCDS30547.1	missense_variant	ENST00000335137.3:c.4G>A	ENSP00000334393.3:p.Val2Met	V/M	GGT	deleterious(0.01)	possibly_damaging(0.828)	0.115421521859	0.359925753267	0.689727513402	2.7340307082
  
      # After reading the paper: https://www.biorxiv.org/content/10.1101/148353v1.full
      # it looks like what I want is in supp table 4, which is an excel sheet ugh
      wget https://www.biorxiv.org/highwire/filestream/44323/field_highwire_adjunct_files/2/148353-3.xlsx
      xlsx2csv -a -d 'tab' 148353-3.xlsx 148353-3
      ls 148353-3
      # Info.csv  Table_S4.csv
  
      # Table_S4.csv is where it's at:
    # head -3 148353-3/Table_S4.csv
      transcript  gene    chr amino_acids genomic_start   genomic_end obs_mis exp_mis obs_exp chisq_diff_null region_name
      ENST00000337907.3   RERE    1   1-507   8716356 8424825 97  197.9807    0.489947    51.505535   RERE_1
      ENST00000337907.3	RERE	1	508-1567	8424824	8415147	355	438.045275	0.810419	15.743847	RERE_2
  
      # now I need to get this into exons somehow
    hgsql -Ne "select * from wgEncodeGencodeCompV19" hg19 | cut -f2- | genePredToBed > hg19.gencodeV19.txt
      bedToPsl /hive/data/genomes/hg19/chrom.sizes hg19.gencodeV19.txt v19.psl
      # pslMap would work here but since I don't know how to make a psl for RERE:1-507 I can't supply
      # the input psl that pslMap needs. thus I'll need a new util
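    # the coordinate math the new util must do (sketch; the real logic is in
    # aaToGenomic.py): map an amino-acid range to a CDS bp range, 0-based half-open,
    # then walk the exon blocks to turn CDS offsets into genomic chunks
    echo "1-507" | awk -F'-' '{printf "cdsStart=%d cdsEnd=%d\n", ($1-1)*3, $2*3}'
    # cdsStart=0 cdsEnd=1521   e.g. RERE_1 covers the first 1521 coding bases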
  
      # first trim the utrs from v19:
      ~/kent/src/hg/makeDb/gnomad/trimUtrs.py hg19.gencodeV19.txt trimmedUtrs.txt
      # 99448 transcript added to transcript dict
      # are these correct?
      bedToExons trimmedUtrs.txt my.gencode.exonsOnly
      bedToExons -cdsOnly hg19.gencodeV19.txt gencode.exonsOnly
      # the awk removes the non-coding transcripts
      diff <(cut -f1-4 gencode.exonsOnly | tawk '{if ($3 != $2) print}' | sort -k4) <(cut -f1-4 trimmedUtrs.txt | sort -k4)
      # no diffs so we're good
  
      # now chop up exons according to the amino acids:
      ~/kent/src/hg/makeDb/gnomad/aaToGenomic.py trimmedUtrs.txt 148353-3/Table_S4.csv > aaToBed.out
      # make autoSql file, regular bed12 plus one for the gene name and one for the chi square value
      # table missenseConstraint
      # "Parts of transcripts shaded according to how well that region of the transcript tolerates missense variation."
      #     (
      #     string chrom;      "Chromosome (or contig, scaffold, etc.)"
      #     uint   chromStart; "Start position in chromosome"
      #     uint   chromEnd;   "End position in chromosome"
      #     string name;       "Name of item"
      #     uint   score;      "Score from 0-1000"
      #     char[1] strand;    "+ or -"
      #     uint thickStart;   "Start of where display should be thick (start codon)"
      #     uint thickEnd;     "End of where display should be thick (stop codon)"
      #     uint reserved;     "RGB color of item"
      #     int blockCount;    "Number of blocks"
      #     int[blockCount] blockSizes; "Comma separated list of block sizes"
      #     int[blockCount] chromStarts; "Start positions relative to chromStart"
      #     string geneName;     "Name of corresponding gene"
      #     int observed;      "Number of observed missense variants"
      #     float expected;      "Number of expected missense variants"
      #     float obs_exp;      "Observed/expected score"
      #     float chisq;       "Chi-Squared Difference"
      #     string _mouseOver; "MouseOver label"
      #     )
  
      # now we can make the bigBed:
      ./aaToGenomic.py trimmedUtrs.txt 148353-3/Table_S4.csv | sort -k1,1 -k2,2n > missenseConstrained.bed
      bedToBigBed -tab -type=bed12+6 -as=missenseBed.as missenseConstrained.bed /hive/data/genomes/hg19/chrom.sizes missenseConstrained.bb
      # pass1 - making usageList (24 chroms): 5 millis
      # pass2 - checking and writing primary data (6507 records, 17 fields): 134 millis
    # only a subset of all genes have data:
      cut -f13 missenseConstrained.bed | sort | uniq | wc -l
      # 2700
    ln -s `pwd`/missenseConstrained.bb /gbdb/hg19/gnomAD/missenseConstrained.bb
  
  ##############################################################################
  # adding RefSeq Select to NCBIRefSeq, Max, Feb 17 2020
  cd /hive/data/genomes/hg19/bed/ncbiRefSeq.p13.2019-11-21
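# pull the transcript accessions tagged "RefSeq Select" or "MANE Select" from the GFF
# attributes (the Name= values, skipping protein NP_ accessions), then subset the
# curated genePred down to those transcripts: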
  zcat download/*_genomic.gff.gz | egrep 'tag=(RefSeq|MANE) Select'  | cut -f9- | tr ';' '\n' | grep Name= | grep -v NP_ | cut -d= -f2 | sort -u > refseqSelectTranscripts.txt
  cat process/hg19.curated.gp | fgrep -f refseqSelectTranscripts.txt - > refseqSelect.curated.gp
  hgLoadGenePred -genePredExt hg19 ncbiRefSeqSelect refseqSelect.curated.gp
  wc -l refseqSelect.curated.gp
  21436 refseqSelect.curated.gp
  ##############################################################################
  # doseSensitivity (WORKING - 2020-02-06, Ana, Hiram)
  
    mkdir /hive/data/genomes/hg19/bed/doseSensitivity
    cd /hive/data/genomes/hg19/bed/doseSensitivity
  
  fetch original files from ClinGen:

  ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/clingen/
  
    wget --timestamping \
  ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/clingen/ClinGen_haploinsufficiency_gene_GRCh37.bed
  
    wget --timestamping \
  ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/clingen/ClinGen_triplosensitivity_gene_GRCh37.bed
  
  
  -rw-rw-rw- 1 202784 Jan 28 16:21 ClinGen_gene_curation_list_GRCh37.tsv
  -rw-rw-rw- 1  45142 Jan 28 16:21 ClinGen_haploinsufficiency_gene_GRCh37.bed
  -rw-rw-rw- 1  48750 Jan 28 16:22 ClinGen_triplosensitivity_gene_GRCh37.bed
  
    # create perl script for processing: toUcsc.pl
  
  ###########################################################################
  #!/usr/bin/env perl
  
  use strict;
  use warnings;
  
  my $argc = scalar(@ARGV);
  
  if ($argc != 1) {
    printf STDERR "usage: ./toUcsc.pl originalFile.bed > newFile.bed\n";
    exit 255;
  }
  
  my @mouseOver;
  
  $mouseOver[0] = "not yet evaluated";
  $mouseOver[1] = "no evidence for dosage pathogenicity";
  $mouseOver[2] = "little evidence for dosage pathogenicity";
  $mouseOver[3] = "some evidence for dosage pathogenicity";
  $mouseOver[4] = "sufficient evidence for dosage pathogenicity";
  $mouseOver[5] = "gene associated with autosomal recessive phenotype";
  $mouseOver[6] = "haploinsufficiency unlikely";
  
  my $inFile = shift;
  
  open (FH, "grep -v track $inFile|") or die "can not read $inFile";
  while (my $line = <FH>) {
    chomp $line;
  # printf STDERR "# %s\n", $line if ($line =~ m/Not/);
    my @a = split('\s+', $line, 5);
    my $selectMouseOver = 0;
    my $color = "0,255,0";
    my $score = $a[4];
    if ($a[4] =~ m/Not/) {
      $color = "128,128,128";
      $score = 0;
      $selectMouseOver = 0;
    } elsif (0 == $a[4]) {
      $color = "252,79,89";
      $selectMouseOver = 1;
    } elsif (1 == $a[4]) {
      $color = "209,45,51";
      $selectMouseOver = 2;
    } elsif (2 == $a[4]) {
      $color = "160,48,51";
      $selectMouseOver = 3;
    } elsif (3 == $a[4]) {
      $color = "109,51,43";
      $selectMouseOver = 4;
    } elsif (30 == $a[4]) {
      $color = "109,51,43";
      $selectMouseOver = 5;
    } elsif (40 == $a[4]) {
      $color = "0,0,255";
      $selectMouseOver = 6;
    } else {
      printf STDERR "%s\n", $line;
      die "unrecognized column 5 value ?";
    }
    printf "%s\t%d\t%d\t%s\t%d\t+\t%d\t%d\t%s\t%s\t%d - %s\n",
      $a[0], $a[1], $a[2], $a[3], $score, $a[1], $a[2], $color, $a[4], $score, $mouseOver[$selectMouseOver];
  }
  close (FH);
  ###########################################################################
  
    # convert the original 'bed 5' data to bed 9+2 with perl script:
  
    ./toUcsc.pl ClinGen_haploinsufficiency_gene_GRCh37.bed \
           | sort -k1,1 -k2,2n > haploInsufficiency.bed
    ./toUcsc.pl ClinGen_triplosensitivity_gene_GRCh37.bed \
           | sort -k1,1 -k2,2n > triploSensitivity.bed
  
    # convert bed to bigBed:
  # using the doseSensitivity.as file:
  
  table doseSensitivity
  "ClinGen dosage sensitivity bed 9 plus original dosageScore"
      (
      string chrom;      "Chromosome (or contig, scaffold, etc.)"
      uint   chromStart; "Start position in chromosome"
      uint   chromEnd;   "End position in chromosome"
      string name;       "Name of item"
      uint   score;      "Score from 0-1000"
      char[1] strand;    "+ or -"
      uint thickStart;   "Start of where display should be thick (start codon)"
      uint thickEnd;     "End of where display should be thick (stop codon)"
      uint itemRgb;      "color indicates dosage score"
      string dosageScore; "dosage score from ClinGen"
      string mouseOver;  "description for the score meaning"
      )
  
  
    bedToBigBed -tab -type=bed9+2 -as=doseSensitivity.as \
       haploInsufficiency.bed ../../chrom.sizes haploInsufficiency.bb
    bedToBigBed -tab -type=bed9+2 -as=doseSensitivity.as \
       triploSensitivity.bed ../../chrom.sizes triploSensitivity.bb
  
    # measure data in bigBed files:
    bigBedInfo haploInsufficiency.bb | sed -e 's/^/# /;'
  # version: 4
  # fieldCount: 11
  # hasHeaderExtension: yes
  # isCompressed: yes
  # isSwapped: 0
  # extraIndexCount: 0
  # itemCount: 1,396
  # primaryDataSize: 42,693
  # primaryIndexSize: 6,388
  # zoomLevels: 6
  # chromCount: 24
  # basesCovered: 160,842,763
  # meanDepth (of bases covered): 1.001796
  # minDepth: 1.000000
  # maxDepth: 2.000000
  # std of depth: 0.042340
  
    bigBedInfo triploSensitivity.bb | sed -e 's/^/# /;'
  # version: 4
  # fieldCount: 11
  # hasHeaderExtension: yes
  # isCompressed: yes
  # isSwapped: 0
  # extraIndexCount: 0
  # itemCount: 1,396
  # primaryDataSize: 39,932
  # primaryIndexSize: 6,388
  # zoomLevels: 6
  # chromCount: 24
  # basesCovered: 160,842,763
  # meanDepth (of bases covered): 1.001796
  # minDepth: 1.000000
  # maxDepth: 2.000000
  # std of depth: 0.042340
  
  
    # symlinks to /gbdb/hg19:
  
    ln -s `pwd`/haploInsufficiency.bb \
       /gbdb/hg19/doseSensitivity/clinGenHaploInsufficiency.bb
    ln -s `pwd`/triploSensitivity.bb \
     /gbdb/hg19/doseSensitivity/clinGenTriploSensitivity.bb
  
    # trackDb composite perhaps in ClinVar CNVs composite
  
  ##############################################################################
  # NCBI ReMap alignments (DONE 2020-02-11 Angie)
  # RM 24449
      mkdir /hive/data/genomes/hg19/bed/chainHg38ReMap
      cd /hive/data/genomes/hg19/bed/chainHg38ReMap
      wget ftp://ftp.ncbi.nlm.nih.gov/pub/remap/Homo_sapiens/current/GCF_000001405.25_GRCh37.p13/GCF_000001405.39_GRCh38.p13/GCF_000001405.25-GCF_000001405.39.gff
      # We will need to substitute all the RefSeq chrom and contig IDs with our own names.
      # The same alt contig can appear in both assemblies with the same name, so replace
      # hg38 names at the beginning of the line and hg19 names after "Target=".
      hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \
      | sed -re 's/\./\\./;' \
      | awk '{print "s/^" $1 "\\b/" $2 "/;";}' \
        > hg19.hg38.chromAlias.sed
      hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \
      | sed -re 's/\./\\./;' \
      | awk '{print "s/Target=" $1 "\\b/Target=" $2 "/;";}' \
        >> hg19.hg38.chromAlias.sed
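    # each generated line is a word-boundary substitution, e.g. (illustration):
    #   s/^NC_000001\.11\b/chr1/;                 hg38 names at line start
    #   s/Target=NC_000001\.10\b/Target=chr1/;    hg19 names after "Target="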
      sed -f hg19.hg38.chromAlias.sed GCF_000001405.25-GCF_000001405.39.gff \
      | gff3ToPsl -dropQ /hive/data/genomes/{hg38,hg19}/chrom.sizes stdin stdout \
      | pslPosTarget stdin stdout \
      | sort -k14,14 -k16n,16n > remap.hg19.hg38.psl
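    # optional sanity check: no RefSeq accessions should survive the substitution
    grep -c 'NC_\|NT_\|NW_' remap.hg19.hg38.psl
    # expect 0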
      # Convert to chain for browser display.  Some of the remap chains have minScore < 1000 and
      # by default would be dropped by chainScore... use -minScore=0 to prevent that.
      time pslToChain remap.hg19.hg38.psl stdout \
      | chainScore -minScore=0 stdin /hive/data/genomes/{hg19/hg19.2bit,hg38/hg38.2bit} \
          remap.hg19.hg38.chain
  #real    5m55.241s
      hgLoadChain hg19 -tIndex chainHg38ReMap remap.hg19.hg38.chain
  #Loading 5315 chains into hg19.chainHg38ReMap
      # Chaining the ReMap alignments makes it a lot easier to see when separate alignments
      # to the same sequence are in the same order and orientation.
      time axtChain -psl -linearGap=medium -verbose=0 remap.hg19.hg38.psl \
        /hive/data/genomes/hg19/hg19.2bit /hive/data/genomes/hg38/hg38.2bit \
        remap.axtChain.hg19.hg38.chain
  #real    1m41.773s
      hgLoadChain hg19 -tIndex chainHg38ReMapAxtChain remap.axtChain.hg19.hg38.chain
  #Loading 2141 chains into hg19.chainHg38ReMapAxtChain
  
  
  ##############################################################################
  # gnomAD Structural Variants v2.1 - ChrisL
  # Redmine #24179
  
      cd /hive/data/outside/gnomAD.2/
    mkdir structuralVariants
    cd structuralVariants
  
      gsutil cp gs://gnomad-public/papers/2019-sv/gnomad_v2.1_sv.*.bed* .
      gsutil cp gs://gnomad-public/papers/2019-sv/gnomad_v2.1_sv.*.vcf* .
  
      for f in *.bed.gz; do out=${f/.bed.gz/}; zcat $f | tail -n +2 | cut -f1-4,18-28,31,32,36-38,41-42 | tawk '{$1="chr"$1; print $0}'> $out.bed4Plus; done
      # variant types:
      zcat gnomad_v2.1_sv.sites.bed.gz | cut -f32 | sort | uniq -c
        52604 BND y
         4778 CPX x
            8 CTX y
       169635 DEL x
        49571 DUP x
       109025 INS x
          748 INV x
         1108 MCNV x
            1 SVTYPE
  
    # add colors based on the gnomAD website and get into proper bed9+
    chromSizes=/hive/data/genomes/hg19/chrom.sizes
    for f in *.bed4Plus; do out=${f/.bed4Plus/}; bedClip -truncate $f $chromSizes stdout | ./gnomadSvToUcsc.awk | sort -k1,1 -k2,2n > $out.bed9Plus; done

    for f in *.bed9Plus; do out=${f/.bed9Plus/}; bedToBigBed -tab -type=bed9+19 -as=gnomadSv.as -extraIndex=name $f $chromSizes $out.bb; done
      mkdir -p /gbdb/hg19/gnomAD/structuralVariants/
      cd /gbdb/hg19/gnomAD/structuralVariants/
      cp -s /hive/data/outside/gnomAD.2/structuralVariants/*.bb .
  
  ##############################################################################
  # NCBI regions that are problematic for sequencing, Mon Nov 18 05:06:17 PST 2019, Max
  
  mkdir /hive/data/genomes/hg19/bed/specialRegions/orig
  cd /hive/data/genomes/hg19/bed/specialRegions/orig
  # download and convert to Excel
  wget ftp://ftp.ncbi.nlm.nih.gov/variation/get-rm/highly_homologous_genes/Table_S1_List1_NGS_Dead_Zone_exon_level.xlsx
  wget ftp://ftp.ncbi.nlm.nih.gov/variation/get-rm/highly_homologous_genes/Table_S2_List2_NGS_Problem_List_High_Stringency_exon_level.xlsx
  wget ftp://ftp.ncbi.nlm.nih.gov/variation/get-rm/highly_homologous_genes/Table_S3_List3_NGS_Problem_List_Low_Stringency_exon_level.xlsx
  wget ftp://ftp.ncbi.nlm.nih.gov/variation/get-rm/highly_homologous_genes/Table_S4_List4_Sanger_Dead_Zone_exon_level.xlsx
  in2csv Table_S1_List1_NGS_Dead_Zone_exon_level.xlsx | csvformat -T > Table_S1_List1_NGS_Dead_Zone_exon_level.tsv
  in2csv  Table_S2_List2_NGS_Problem_List_High_Stringency_exon_level.xlsx | csvformat -T > Table_S2_List2_NGS_Problem_List_High_Stringency_exon_level.tsv
  in2csv Table_S3_List3_NGS_Problem_List_Low_Stringency_exon_level.xlsx | csvformat -T > Table_S3_List3_NGS_Problem_List_Low_Stringency_exon_level.tsv
  in2csv Table_S4_List4_Sanger_Dead_Zone_exon_level.xlsx | csvformat -T > Table_S4_List4_Sanger_Dead_Zone_exon_level.tsv
  
  cd ..
  # tabToBed is from https://github.com/maximilianh/maxtools
  tabToBed orig/Table_S1_List1_NGS_Dead_Zone_exon_level.tsv bed/deadZone.bed as/deadZone.as -t bed3+
  bedToBigBed -tab bed/deadZone.bed /hive/data/genomes/hg19/chrom.sizes bb/deadZone.bb -tab -type=bed3+ -as=as/deadZone.as
  
  tabToBed orig/Table_S2_List2_NGS_Problem_List_High_Stringency_exon_level.tsv bed/ngsProblemHigh.bed as/ngsProblemHigh.as -t bed3+
  bedToBigBed -tab bed/ngsProblemHigh.bed /hive/data/genomes/hg19/chrom.sizes bb/ngsProblemHigh.bb -tab -type=bed3+ -as=as/ngsProblemHigh.as
  
  tabToBed orig/Table_S3_List3_NGS_Problem_List_Low_Stringency_exon_level.tsv bed/ngsProblemLow.bed as/ngsProblemLow.as -t bed3+
  bedToBigBed -tab bed/ngsProblemLow.bed /hive/data/genomes/hg19/chrom.sizes bb/ngsProblemLow.bb -tab -type=bed3+ -as=as/ngsProblemLow.as
  
  tabToBed orig/Table_S4_List4_Sanger_Dead_Zone_exon_level.tsv bed/sangerDeadZone.bed as/sangerDeadZone.as -t bed3+
  bedToBigBed -tab bed/sangerDeadZone.bed /hive/data/genomes/hg19/chrom.sizes bb/sangerDeadZone.bb -tab -type=bed3+ -as=as/sangerDeadZone.as
  
  # the GIAB BED filter files
  cd orig
  wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterABQD.bed.gz
  wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterAlign.bed.gz
  wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterConflicting.bed.gz
  wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterCov.bed.gz
  wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterHapNoVar.bed.gz
  wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterMap.bed.gz
  wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterSSE.bed.gz
  wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterlt2Datasets.bed.gz
  gunzip *.gz
  cd ..
  for i in orig/*.bed; do out=`echo $i | sed -e 's|orig/VQSRv2.18_||g'`; out=`basename $out .bed`; echo $out; chromToUcsc -a hg19.chromAlias.tsv -i $i -o bed/$out.bed; done
for i in bed/filter*.bed; do echo $i; bedSort $i $i; bedToBigBed $i /hive/data/genomes/hg19/chrom.sizes bb/`basename $i .bed`.bb -type=bed3; done
  cd /gbdb/hg19/bbi/special;
  for i in /hive/data/genomes/hg19/bed/specialRegions/bb/filter*.bb;  do ln -s $i; done
  bedSort orig/hg19-blacklist.v2.bed orig/hg19-blacklist.v2.bed
  bedToBigBed orig/hg19-blacklist.v2.bed /hive/data/genomes/hg19/chrom.sizes  bb/encBlacklist.bb -tab
  
  #############################################################################
  # GTEx V8 (April 2020) Kate
  # Create BED from hgFixed tables (see doc/gtex)
  
  cd /hive/data/outside/gtex/V8/rnaSeq
  
# Lift GTEx LDACC gene models (GENCODE V26 isoforms collapsed to single gene model)
# from hg38 annotation by GTEx LDACC
  
  set chain = /hive/data/genomes/hg38/bed/liftOver/hg38ToHg19.over.chain.gz
  liftOver -genePred gencodeV26.hg38.genePred $chain gencodeV26.hg19.lifted.genePred \
                  gencodeV26.hg19.unmapped
  # 1300 gencodeV26.hg19.unmapped
  # (was 925 in V6 lift hg19 to hg38)
  
  # TODO: Consider transmap
  
  hgLoadGenePred hg19 gtexGeneModelV8 gencodeV26.hg19.lifted.genePred
  
  # Load BED table
  cd /hive/data/genomes/hg19/bed/gtex
  mkdir -p V8
  cd V8
  
  set gencode = V26lift37
  ~/kent/src/hg/makeDb/outside/hgGtexGeneBed/hgGtexGeneBed \
          hg19 -noLoad -gtexVersion=V8 -gencodeVersion=$gencode gtexGeneV8 -verbose=2 >&! log.txt &
  
  # Max score: 267400.000000
  
  wc -l gtexGeneV8.tab
  #54481 gtexGeneV8.tab
  
  # 1070 genes not found in GencodeAttrs table
  # e.g.
  Can't find geneId ENSG00000278267 in wgEncodeGencodeAttrsV26lift37
  #from Ensembl page for MIR6859-1:
  #Stable ID ENSG00000278267 not present in GRCh37.
  
  # 650 genes not found in modelHash
  # e.g.
  Can't find gene ENSG00000279928.2 in modelHash
  # From Ensembl
  Gene: FO538757.1 ENSG00000279928
  There is no ungapped mapping of this gene onto the GRCh37 assembly.
  Stable ID ENSG00000279928 not present in GRCh37.
  
  #Max score: 219385.906250
  wc -l gtexGeneV8.tab
  # 54481 gtexGeneV8.tab
  # 55393 gtexGeneV6.tab
  
  # Add scores (see hg38/gtex.txt for background)
  
  set bedScore = ~/kent/src/utils/bedScore/bedScore
  $bedScore -col=10 -minScore=0 -log -method=encode gtexGeneV8.tab gtexGeneBedV8.bed
  textHistogram -real -autoScale=14 -log -col=5 gtexGeneBedV8.bed
  0.000000 ************************************************************ 20189
  71.428643 **************************************************** 5512
  142.857286 **************************************************** 5401
  214.285929 *************************************************** 4587
  285.714571 *************************************************** 4399
  357.143214 *************************************************** 4672
  428.571857 **************************************************** 5390
  500.000500 ************************************************* 3073
  571.429143 ***************************************** 904
  642.857786 ********************************* 248
  714.286429 *************************** 80
  785.715071 ************** 10
  857.143714 ******** 4
  928.572357 *************** 12
  
  # table looks OK, load it
  set lib = ~/kent/src/hg/lib
  hgLoadBed hg19 -noBin -tab -type=bed6+4 \
          -as=$lib/gtexGeneBed.as -sqlTable=$lib/gtexGeneBed.sql -renameSqlTable \
                  gtexGeneV8 gtexGeneBedV8.bed
  # Read 54481 elements of size 10 from gtexGeneBedV8.bed
  
  # TODO: Add to gene sorter
  
  #############################################################################
  # Build ncbiRefSeqGenomicDiff (DONE - 2020-06-10 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/ncbiRefSeqAnomalies.p13
      cd /hive/data/genomes/hg19/bed/ncbiRefSeqAnomalies.p13
  
      db=hg19
      pre=ncbiRefSeqGenomicDiff
      buildDir=/hive/data/genomes/hg19/bed/ncbiRefSeq.p13.2020-01-21
      asmId=GCF_000001405.25_GRCh37.p13
  
      time (zcat $buildDir/process/$asmId.rna.cds.gz \
          | egrep '[0-9]+\.\.[0-9]+' \
          | pslMismatchGapToBed -cdsFile=stdin -db=$db -ignoreQNamePrefix=X \
              $buildDir/process/$asmId.$db.psl.gz \
              /hive/data/genomes/$db/$db.2bit \
              $buildDir/$db.rna.fa \
              $pre)
  
  # pslMismatchGapToBed: NM_001365372.1 gapIx 9 shifted right 74 bases, but next block size is only 38; report to NCBI
  # pslMismatchGapToBed: NM_001288811.1 gapIx 1 shifted left 6 bases, but previous block size is only 5; report to NCBI
  
  #  real    0m21.265s
  
   bedToBigBed -type=bed9+ -tab -as=$HOME/kent/src/hg/lib/txAliDiff.as $pre.bed \
          /hive/data/genomes/$db/chrom.sizes $pre.bb
  # pass1 - making usageList (180 chroms): 77 millis
  # pass2 - checking and writing primary data (27362 records, 20 fields): 234 millis
      ln -sf `pwd`/$pre.bb /gbdb/hg19/ncbiRefSeq/$pre.bb
  
  #############################################################################
  # clinvarSubLolly track  DONE BRANEY 12/14/2020
  mkdir /cluster/data/hg19/bed/clinvarSubLolly
  cd /cluster/data/hg19/bed/clinvarSubLolly
  bigBedToBed /gbdb/hg19/bbi/clinvar/clinvarMain.bb stdout | tawk '{print $40, $1,$2,$2+1,$4}' | sort  -S 40g > sort.main.bed
  hgsql hg19 -Ne "select varId,clinSign,scv from clinvarSub" | sort  -S 40g  > clinvarSubSub.txt
  
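# join the main ClinVar variants (keyed on the variant ID, field 40 of the main bigBed)
# with the per-submission rows, then shape and color them via the makeFranklin and
# assignColors tawk scripts: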
  join -t $'\t' sort.main.bed clinvarSubSub.txt | tawk '{print $2,$3,$4,$5,$6,$1, $7}' | sort -S 40g -k1,1 -k2,2n -k5,5 | tawk -f makeFranklin   | tawk -f assignColors > tmp1
  # add the line break after v409
  #tawk '{print $1":"$2 + 1"-"$3" <br>Variants (submissions):"$11}' tmp1 > tmp2
  tawk '{print $1":"$2 + 1"-"$3" Variants (submissions):"$11}' tmp1 > tmp2
  paste tmp1 tmp2 > bigBedInput.bed
  
  bedToBigBed -as=$HOME/kent/src/hg/lib/clinvarSubLolly.as -type=bed9+5 -tab bigBedInput.bed /cluster/data/hg19/chrom.sizes clinvarSubLolly.bb
  mkdir -p /gbdb/hg19/clinvarSubLolly
  ln -s `pwd`/clinvarSubLolly.bb /gbdb/hg19/clinvarSubLolly/clinvarSubLolly.bb
  
  bigBedToBed /gbdb/hg19/bbi/clinvar/clinvarMain.bb stdout | tawk '{print $40, $1,$2,$2+1,$4,$13,$15,$18,$19}' | sort  -S 40g > sort.main.bed
  hgsql hg19 -Ne "select * from clinvarSub" | sort  -S 40g  > clinvarSubSub.txt
  join -t $'\t' sort.main.bed clinvarSubSub.txt | tawk '{print $2,$3,$4,$5,0,"+",0,0,"0,0,0",$6,$20,$8, $9,$1,$10,$7,$11,$12,$13,$14,$15,$16,$17,$18,$19,$21}' | sort -S 40g -k1,1 -k2,2n | tawk -f assignScore > bigBedInput.bed
  
  bedToBigBed -as=clinvarSubBB.as -type=bed9+11 -tab bigBedInput.bed /cluster/data/hg19/chrom.sizes clinvarSub.bb
  ln -s `pwd`/clinvarSub.bb /gbdb/hg19/clinvarSubLolly/clinvarSub.bb
  
  
  #############################################################################
  # Trios for Genome In a Bottle - DONE 08/04/2020 ChrisL
  # see ~/kent/src/hg/makeDb/giab/make.txt
  
  #############################################################################
  # COVID GWAS from  COVID-19 Host Genetics Initiative  Sep 2020  Kate
  # see ~kent/src/hg/makeDb/doc/covid/covidHgiGwas.txt
  
  #############################################################################
  # gnomAD PEXT scores
  
  # PEXT data:
  # The baselevel is the sum of the expression value for all transcripts touching that base
  # The annotation-level is the sum of the expression of transcripts on which a variant has a
  # given annotation
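# toy illustration of the baselevel definition (not build data): three transcripts
# touching the same base with expression 0.2, 0.5 and 0.3 give a baselevel of 1.0
printf '0.2\n0.5\n0.3\n' | awk '{sum += $1} END {print sum}'
# 1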
  
  # download the pext data:
  wget https://storage.googleapis.com/gnomad-public/papers/2019-tx-annotation/pre_computed/all.possible.snvs.tx_annotated.GTEx.v7.021520.tsv.bgz
  wget https://storage.googleapis.com/gnomad-public/papers/2019-tx-annotation/gnomad_browser/all.baselevel.021620.tsv.bgz
  
  # these files are humongous even with compression:
  ls -lh all.*
  -rw-rw-r-- 1 chmalee genecats 307M Apr  8 21:54 all.baselevel.021620.tsv.bgz
  -rw-rw-r-- 1 chmalee genecats 6.7G Feb 14  2020 all.possible.snvs.tx_annotated.GTEx.v7.021520.tsv.bgz
  
  # how large are these files?
  time zcat all.base* | wc -l
  35305149
  
  real    1m11.964s
  user    1m8.725s
  sys 0m10.385s
  
  # this is theoretically all coding bases in gencode v19, split by tissue then run buildPext.py to
  # make one bigWig per tissue:
  mkdir run
  seq 4 57 | parallel -j10 'zcat all.baselevel.021620.tsv.bgz | cut -f1-3,{} | gzip -c > run/tissue{}.pext.gz'
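# (columns 4-57 are the 54 per-tissue value columns; each job keeps the first three
# position/ID columns alongside one tissue column, an assumption from the cut above)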
  
  # overlapping exons in coding regions causes problems, don't output any scores
  # for those regions
  seq 4 57 | parallel --joblog run.log -j20 './buildPext.py run/tissue{}.pext.gz -o split'
  tail -n +2 run.log | cut -f4 | awk '{sum += $1}END{print sum/NR}'
  452.034
  
  # Turn into bigWigs:
  find split/ -name "*.bed" | parallel -j15 'sort -k1,1 -k2,2n {} | cut -f1-3,5 > {.}.bedGraph'
  find split/ -name "*.bedGraph" | parallel -j15 'bedGraphToBigWig {} /hive/data/genomes/hg19/chrom.sizes {.}.bw'
  mkdir -p /gbdb/hg19/gnomAD/pext
  ln -s `pwd`/split/*.bw /gbdb/hg19/gnomAD/pext/
  
  #############################################################################
  # update 2020-10-27 (DONE - Hiram - 2020-10-27)
  
    mkdir /hive/data/genomes/hg19/bed/ncbiRefSeq.p13.2020-10-27
    cd /hive/data/genomes/hg19/bed/ncbiRefSeq.p13.2020-10-27
  
    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev \
        -fileServer=hgwdev -smallClusterHub=hgwdev -workhorse=hgwdev \
        GCF_000001405.25_GRCh37.p13 hg19) > do.log 2>&1 &
    # real    6m47.005s
  
    cat fb.ncbiRefSeq.hg19.txt
    # 93720294 bases of 2991710746 (3.133%) in intersection
  
  #############################################################################
  # test update 2021-05-17 (DONE - Hiram - 2021-05-17)
  
    mkdir /hive/data/genomes/hg19/bed/ncbiRefSeq.p13.2021-05-17
    cd /hive/data/genomes/hg19/bed/ncbiRefSeq.p13.2021-05-17
  
    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev \
        -fileServer=hgwdev -smallClusterHub=hgwdev -workhorse=hgwdev \
        GCF_000001405.25_GRCh37.p13 hg19) > do.log 2>&1 &
    # real    7m42.506s
  
    cat fb.ncbiRefSeq.hg19.txt
    # 95470885 bases of 2991710746 (3.191%) in intersection
  
  #############################################################################
  # Covid-19 rare mutations, Max, Fri Oct 30 08:40:34 PDT 2020
  # received table from qzhang02@rockefeller.edu, wrote to UCSC.txt
  cd /hive/data/genomes/hg19/bed/covidMuts/
  dos2unix UCSC.txt
  cat UCSC.txt | tawk '{$1="chr"$1; chrom=$1; start=$2; rsId=$3; ref=$4; alt=$5; zygo=$6; gene=$7; genotype=$8; inh=$9; end=$2+length(ref); print chrom, start, end, ref">"alt, "0", ".", start, end, "0,0,0", "1", length(ref), "0", ref, alt, rsId, zygo, gene, genotype, inh;}' | grep -v chrchr > covidMuts.bed
  bedSort covidMuts.bed covidMuts.bed
bedToBigBed -tab covidMuts.bed ../../chrom.sizes covidMuts.bb -as=covidMuts.as -type=bed12+
  
  #############################################################################
  # gnomAD v2.1.1 update, ChrisL 12-2-2020
  #############################################################################
  # See /hive/data/inside/gnomAD/v2.1.1/run.sh for more information, listed
  # here are the important steps:
  WORKDIR=/hive/data/inside/gnomAD/v2.1.1/
  cd $WORKDIR
  db="hg19"
  cd $db
  
  time parallel -j15 --joblog exomes.run.log --plus "vcfToBed -fields=${fields} {} exomes/{/..}.bed" ::: /hive/data/outside/gnomAD.2/v2.1.1/exomes/*.bgz
  # real    16m42.939s
  # user    172m26.966s
  # sys 1m41.186s
  
# now turn into a single bed, working from the parent dir as with genomes below
cd ..
time cat hg19/exomes/*.bed | ./gnomadVcfBedToBigBed stdin stdout | sort -k1,1 -k2,2n > gnomad.v2.1.1.exomes.bed
  # real    21m44.331s
  # user    20m24.018s
  # sys 3m5.405s
  time bedToBigBed -type=bed9+50 -tab -as=exomes.as gnomad.v2.1.1.exomes.bed /hive/data/genomes/hg19/chrom.sizes exomes.bb
  # pass1 - making usageList (24 chroms): 11485 millis
  # pass2 - checking and writing primary data (17209972 records, 57 fields): 339555 millis
  #
  # real    6m45.792s
  # user    6m7.880s
  # sys 0m11.924s
  
  # same for genomes
  cd $db
  time parallel -j15 --joblog genomes.run.log --plus "vcfToBed -fields=${fields} {} genomes/{/..}.bed" ::: /hive/data/outside/gnomAD.2/v2.1.1/genomes/*.bgz
  # real  134m40.184s
  # user    1559m44.664s
  # sys 12m0.858s
  cd ..
  time cat hg19/genomes/*.bed | ./gnomadVcfBedToBigBed stdin stdout | sort -k1,1 -k2,2n > gnomad.v2.1.1.genomes.bed
  # real    199m48.619s
  # user    186m49.769s
  # sys 29m12.841s
  
# no South Asian variants in the genomes file, so it has fewer fields; change the type:
  time bedToBigBed -type=bed9+47 -tab -as=genomes.as gnomad.v2.1.1.genomes.bed /hive/data/genomes/hg19/chrom.sizes genomes.bb
  # pass1 - making usageList (23 chroms): 165336 millis
  # pass2 - checking and writing primary data (253556152 records, 55 fields): 4909106 millis
  #
  # real    89m3.165s
  # user    86m41.554s
  # sys 2m15.722s
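# (optional, not part of the original run) verify the record count matches
# the pass2 line above:
bigBedInfo genomes.bb | grep itemCount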
  
  #############################################################################
# LASTZ Cow bosTau9 (DONE - 2020-12-07 - Hiram)
      mkdir /hive/data/genomes/hg19/bed/lastzBosTau9.2020-12-07
      cd /hive/data/genomes/hg19/bed/lastzBosTau9.2020-12-07
  
      printf '# human vs Cow
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz
  BLASTZ_T=2
  BLASTZ_O=400
  BLASTZ_E=30
  BLASTZ_M=254
  # default BLASTZ_Q score matrix:
  #       A     C     G     T
  # A    91  -114   -31  -123
  # C  -114   100  -125   -31
  # G   -31  -125   100  -114
  # T  -123   -31  -114    91
  
  # TARGET: human hg19
  SEQ1_DIR=/hive/data/genomes/hg19/hg19.2bit
  SEQ1_LEN=/hive/data/genomes/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  
  # QUERY: Cow bosTau9
  SEQ2_DIR=/hive/data/genomes/bosTau9/bosTau9.2bit
  SEQ2_LEN=/hive/data/genomes/bosTau9/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LIMIT=10
  SEQ2_LAP=0
  
  BASE=/hive/data/genomes/hg19/bed/lastzBosTau9.2020-12-07
  TMPDIR=/dev/shm
  ' > DEF
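    # (optional, not part of the original run) a quick check that the query
    # 2bit and chrom.sizes agree before launching the cluster run:
    twoBitInfo /hive/data/genomes/bosTau9/bosTau9.2bit stdout | sort \
        | diff - <(sort /hive/data/genomes/bosTau9/chrom.sizes)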
  
      time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
          -chainMinScore=3000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
              -syntenicNet) > do.log 2>&1
      # real    239m35.175s
  
      cat fb.hg19.chainBosTau9Link.txt
      # 1407432462 bases of 2991710746 (47.044%) in intersection
  
      cat fb.hg19.chainSynBosTau9Link.txt
      # 1354159575 bases of 2991710746 (45.264%) in intersection
  
      time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` hg19 bosTau9) > rbest.log 2>&1 &
      #	real    274m55.811s
  
      sed -e 's/^/    # /;' fb.hg19.chainRBest.BosTau9.txt
      #	1290531802 bases of 2991710746 (43.137%) in intersection
  
      #   running the swap
      mkdir /hive/data/genomes/bosTau9/bed/blastz.hg19.swap
      cd /hive/data/genomes/bosTau9/bed/blastz.hg19.swap
      time (doBlastzChainNet.pl -verbose=2 \
          /hive/data/genomes/hg19/bed/lastzBosTau9.2020-12-07/DEF \
          -swap  -syntenicNet -workhorse=hgwdev \
  	-smallClusterHub=hgwdev -bigClusterHub=ku \
          -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1
      #   real    72m28.826s
  
      cat fb.bosTau9.chainHg19Link.txt
      #   1342159887 bases of 2715853792 (49.419%) in intersection
      cat fb.bosTau9.chainSynHg19Link.txt
      #	1305558878 bases of 2715853792 (48.072%) in intersection
  
      time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` bosTau9 hg19) > rbest.log 2>&1 &
      # real    297m58.059s
  
      sed -e 's/^/    # /;' fb.bosTau9.chainRBest.Hg19.txt
      # 1291348753 bases of 2715853792 (47.549%) in intersection
  
  #############################################################################
  # Exome Probesets composite track
  # Tue Jan  5 02:25:06 PST 2021 Made by Ana, Tiana, Pranav, Beagan, reviewed and committed by Max
  # Download data for hg19:
  cd /hive/data/genomes/hg19/bed/exomeProbesets
  We made tracks for the main Exome Kit Vendors: IDT, Twist Biosciences, MGI, Agilent, Roche, and Illumina.
  
  Note: IDT, Agilent and Roche have bed files for the Probes and for the Target Regions. Twist, MGI, and Illumina have bed files for the Target Regions (but not for Probes).
  
Data was downloaded on a Windows desktop and copied to hgwdev:
scp <file.bed> ana@hgwdev.gi.ucsc.edu:/hive/data/genomes/hg19/bed/exonArrays/raw/idt
  
  # IDT Datasets:
  
  Track: IDT - xGen Exome Research Panel Probes
  Download: https://sfvideo.blob.core.windows.net/sitefinity/docs/default-source/supplementary-product-info/xgen-exome-research-panel-probesbe255a1532796e2eaa53ff00001c1b3c.bed?sfvrsn=425c3407_7&download=true
  File name: xgen-exome-research-panel-probes-hg19.bed
  
  Track: IDT - xGen Exome Research Panel Target Regions
  Download: https://sfvideo.blob.core.windows.net/sitefinity/docs/default-source/supplementary-product-info/xgen-exome-research-panel-targetsae255a1532796e2eaa53ff00001c1b3c.bed?sfvrsn=435c3407_7&download=true
  File name: xgen-exome-research-panel-targets-hg19.bed
  
  Track: IDT - xGen Exome Research Panel V2 Probes
  Download: https://sfvideo.blob.core.windows.net/sitefinity/docs/default-source/supplementary-product-info/xgen-exome-research-panel-v2-probes-hg1952a5791532796e2eaa53ff00001c1b3c.bed?sfvrsn=1dd1707_6&download=true
  File name: xgen-exome-research-panel-v2-probes-hg19.bed
  
  Track: IDT - xGen Exome Research Panel V2 Target Regions
  Download: https://sfvideo.blob.core.windows.net/sitefinity/docs/default-source/supplementary-product-info/xgen-exome-research-panel-v2-targets-hg1902a5791532796e2eaa53ff00001c1b3c.bed?sfvrsn=6dd1707_10&download=true
  File name: xgen-exome-research-panel-v2-targets-hg19.bed
  
  # Twist Biosciences Datasets:
  
  Track: Twist - RefSeq Exome Panel Target Regions
  Download: https://www.twistbioscience.com/sites/default/files/resources/2019-09/Twist_Exome_RefSeq_targets_hg19_0.bed
  File name: Twist_Exome_RefSeq_targets_hg19_0.bed
  
  Track: Twist - Core Exome Panel Target Regions
  Download: https://www.twistbioscience.com/sites/default/files/resources/2018-09/Twist_Exome_Target_hg19.bed
  File name: Twist_Exome_Target_hg19.bed
  
  Track: Twist - Comprehensive Exome Panel Target Regions
  Download: https://www.twistbioscience.com/sites/default/files/resources/2020-09/Twist_ComprehensiveExome_targets_hg19.bed
  File name: Twist_ComprehensiveExome_targets_hg19.bed
  
  Track: Twist - Exome 2.1
  Download: curl https://www.twistbioscience.com/sites/default/files/resources/2021-10/hg38_Twist_exome_2_1_annotated_targets.bed | cut -f1-3 > TwistExome21.bed
  bedToBigBed TwistExome21.bed ../../chrom.sizes TwistExome21.bb -type=bed3
  
  # Updated on Feb 1, 2021 (max): got a corrected file from Tina Han <than@twistbioscience.com>
To stay consistent, I kept the original filename TwistExome21.bb even though this
is not the 2.1 version anymore. Confusingly, Twist went from 2.0 to 2.1 and then to 2.0.2 (!!),
so 2.0.2 is more recent than 2.1.
Original filename from Tina was: hg19_exome_v2.0.2_targets_sorted_validated.annotated.bed.gz
  
  mv TwistExome21.bb TwistExome21.old.bb
  cut TwistExome202.bed -f1-3 > TwistExome202.cut.bed
  bedToBigBed TwistExome202.cut.bed ../../chrom.sizes TwistExome21.bb -type=bed3
  
  
  # MGI Datasets:
  
  Track: MGI - Easy Exome Capture V4 Target Regions
  Download: https://en.mgitech.cn/Uploads/Temp/file/20191225/5e03126e808a0.zip
  File name: MGI_Exome_Capture_V4.bed
  
  Track: MGI - Easy Exome Capture V5 Target Regions
  Download: https://en.mgitech.cn/Uploads/Temp/file/20191225/5e0312a7be43e.zip
  File name: MGI_Exome_Capture_V5.bed
  
  # Agilent Datasets:
  Download for all Agilent files: https://earray.chem.agilent.com/suredesign/ - Password needed (from Ana)
  
  to get them:
  go to https://earray.chem.agilent.com/suredesign/ then select Find Designs -> CGH -> Agilent Catalog
  (HT to Daniel)
  
  Track: Agilent - SureSelect Clinical Research Exome Covered by Probes
  File name: S06588914_Covered.bed
  
  Track: Agilent - SureSelect Clinical Research Exome Target Regions
  File name: S06588914_Regions.bed
  
  Track: Agilent - SureSelect Clinical Research Exome V2 Covered by Probes
  File name: S30409818_Covered.bed
  
  Track: Agilent - SureSelect Clinical Research Exome V2 Target Regions
  File name: S30409818_Regions.bed
  
  Track: Agilent - SureSelect Focused Exome Covered by Probes
  File name: S07084713_Covered.bed
  
  Track: Agilent - SureSelect Focused Exome Target Regions
  File name: S07084713_Regions.bed
  
  Track: Agilent - SureSelect All Exon V4 Covered by Probes
  File name: S03723314_Covered.bed
  
  Track: Agilent - SureSelect All Exon V4 Target Regions
  File name: S03723314_Regions.bed
  
  Track: Agilent - SureSelect All Exon V4 + UTRs Covered by Probes
  File name: S03723424_Covered.bed
  
  Track: Agilent - SureSelect All Exon V4 + UTRs Target Regions
  File name: S03723424_Regions.bed
  
  Track: Agilent - SureSelect All Exon V5 Covered by Probes
  File name: S04380110_Covered.bed
  
  Track: Agilent - SureSelect All Exon V5 Target Regions
  File name: S04380110_Regions.bed
  
  Track: Agilent - SureSelect All Exon V5 + UTRs Covered by Probes
  File name: S04380219_Covered.bed
  
  Track: Agilent - SureSelect All Exon V5 + UTRs Target Regions
  File name: S04380219_Regions.bed
  
  Track: Agilent - SureSelect All Exon V6 r2 Covered by Probes
  File name: S07604514_Covered.bed
  
  Track: Agilent - SureSelect All Exon V6 r2 Target Regions
  File name: S07604514_Regions.bed
  
  Track: Agilent - SureSelect All Exon V6 + COSMIC r2 Covered by Probes
  File name: S07604715_Covered.bed
  
  Track: Agilent - SureSelect All Exon V6 + COSMIC r2 Target Regions
  File name: S07604715_Regions.bed
  
  Track: Agilent - SureSelect All Exon V6 + UTR r2 Covered by Probes
  File name: S07604624_Covered.bed
  
  Track: Agilent - SureSelect All Exon V6 + UTR r2 Target Regions
  File name: S07604624_Regions.bed
  
  Track: Agilent - SureSelect All Exon V7 Covered by Probes
  File name: S31285117_Covered.bed
  
  Track: Agilent - SureSelect All Exon V7 Target Regions
  File name: S31285117_Regions.bed
  
  # Roche Datasets:
  
  Track: Roche - KAPA HyperExome Capture Probe Footprint
  Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/design-files/KAPA%20HyperExome%20Design%20files%20hg19.zip
  File name: KAPA_HyperExome_hg19_capture_targets.bed
  
  Track: Roche - KAPA HyperExome Primary Target Regions
  Download:
  https://sequencing.roche.com/content/dam/rochesequence/worldwide/design-files/KAPA%20HyperExome%20Design%20files%20hg19.zip
  File name: KAPA_HyperExome_hg19_primary_targets.bed
  
  Track: Roche - SeqCap EZ Exome V3 Capture Probe Footprint
  Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/SeqCapEZ_Exome_v3.0_Design_Annotation_files.zip
  File name: SeqCap_EZ_Exome_v3_hg19_capture_targets.bed
  
  Track: Roche - SeqCap EZ Exome V3 Primary Target Regions
  Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/SeqCapEZ_Exome_v3.0_Design_Annotation_files.zip
  File name: SeqCap_EZ_Exome_v3_hg19_primary_targets.bed
  
  Track: Roche - SeqCap EZ Exome V3 + UTR Capture Probe Footprint
  Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/Exome_UTR_Design_Annotation_Files.zip
  File name: SeqCap_EZ_ExomeV3_Plus_UTR_hg19_capture_annotated.bed
  
  Track: Roche - SeqCap EZ Exome V3 + UTR Primary Target Regions
  Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/Exome_UTR_Design_Annotation_Files.zip
  File name: SeqCap_EZ_ExomeV3_Plus_UTR_hg19_primary_annotated.bed
  
  Track: Roche - SeqCap EZ MedExome Capture Probe Footprint
  Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/MedExome_design_files.zip
  File name: SeqCap_EZ_MedExome_hg19_capture_targets.bed
  
  Track: Roche - SeqCap EZ MedExome Empirical Target Regions
  Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/MedExome_design_files.zip
  File name: SeqCap_EZ_MedExome_hg19_empirical_targets.bed
  
  Track: Roche - SeqCap EZ MedExome + Mito Capture Probe Footprint
  Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/MedExomePlusMito_design_files.zip
  File name: SeqCap_EZ_MedExomePlusMito_hg19_capture_targets.bed
  
  Track: Roche - SeqCap EZ MedExome + Mito Empirical Target Regions
  Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/MedExomePlusMito_design_files.zip
  File name: SeqCap_EZ_MedExomePlusMito_hg19_empirical_targets.bed
  
  # Illumina Datasets:
  
  Track: Illumina - Nextera DNA Exome V1.2 Target Regions
  Download: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/nextera-dna-exome/nextera-dna-exome-targeted-regions-manifest-bed.zip
  File name: nextera-dna-exome-targeted-regions-manifest-v1-2.bed
  
  Track: Illumina - Nextera Rapid Capture Exome Target Regions
  Download: https://support.illumina.com/softwaredownload.html?assetId=d2c2bc7e-75e5-4f20-bfb7-780839390565&assetDetails=nexterarapidcapture_exome_targetedregions.bed - Password needed (from Ana)
  File name: nexterarapidcapture_exome_targetedregions.bed
  
  Track: Illumina - Nextera Rapid Capture Exome V1.2 Target Regions
  Download: https://support.illumina.com/softwaredownload.html?assetId=197e4b2b-161d-4576-a52f-1204833567c5&assetDetails=nexterarapidcapture_exome_targetedregions_v1.2.bed - Password needed (from Ana)
  File name: nexterarapidcapture_exome_targetedregions_v1.2.bed
  
  Track: Illumina - Nextera Rapid Capture Expanded Exome Target Regions
  Download: https://support.illumina.com/softwaredownload.html?assetId=f020d708-dad9-44e4-8c7c-439add28536c&assetDetails=nexterarapidcapture_expandedexome_targetedregions.bed - Password needed (from Ana)
  File name: nexterarapidcapture_expandedexome_targetedregions.bed
  
  Track: Illumina - TruSeq DNA Exome V1.2 Target Regions
  Download: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/truseq/truseq-dna-exome/truseq-dna-exome-targeted-regions-manifest-v1-2-bed.zip
  File name: truseq-dna-exome-targeted-regions-manifest-v1-2.bed
  
  Track: Illumina - TruSeq Rapid Exome V1.2 Target Regions
  Download: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/truseq/truseq-rapid-exome-targeted-regions-manifest-v1-2-bed.zip
  File name: truseq-rapid-exome-targeted-regions-manifest-v1-2.bed
  
  Track: Illumina - TruSight ONE V1.1 Target Regions
  Download: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/trusight/trusight-one-file-for-ucsc-browser-v1-1.zip
  File name: TruSight_One_v1.1.bed
  
  Track: Illumina - TruSight ONE Expanded V2.0 Target Regions
  Download: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/nextera/nextera-flex-for-enrichment/trusight-one-expanded-targeted-regions-v2-0.zip
  File name: TSOne_Expanded_Final_TargetedRegions_v2
  
  Track: Illumina - TruSight Exome Target Regions
  Download: https://support.illumina.com/content/dam/illumina-support/documents/documentation/chemistry_documentation/trusight/trusight_exome_manifest_a.bed
  File name: trusight_exome_manifest_a.bed
  
  Track: Illumina - AmpliSeq Exome Panel Target Regions
  Download: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/ampliseq-for-illumina/ampliseq-for-illumina-exome-panel-manifest-file-bed.zip
  File name: Exome.dna_manifest.20180509.bed
  
  # Converting bed files for hg19:
  
All files were converted from bed to bigBed following the Genome Browser documentation. All of the files underwent the steps below, with the exception of a few files described afterwards. (NOTE: the documentation includes a step to remove any header lines -- only a couple of files had headers, and those were simply removed in vi/vim.)
  
  1. Sort all bed files
  sort -k1,1 -k2,2n unsorted.bed > input.bed
  
  2. fetchChromSizes (run once)
  fetchChromSizes hg19 > hg19.chrom.sizes
  
Note: this only needs to be run once, since one hg19.chrom.sizes file can be used for all bedToBigBed runs.
  
  3. bedToBigBed for all files
  bedToBigBed input.bed hg19.chrom.sizes myBigBed.bb
  
  Here's an example using the MGI Exome Capture V4 file:
  
  sort -k1,1 -k2,2n MGI_Exome_Capture_V4.bed > sorted_MGI_Exome_Capture_V4.bed
  
  fetchChromSizes hg19 > hg19.chrom.sizes
  
  bedToBigBed sorted_MGI_Exome_Capture_V4.bed hg19.chrom.sizes MGI_Exome_Capture_V4.bb
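As a sketch, the same three steps can be looped over all the vendor files (illustrative only; the actual conversions were run file by file as above):

for f in *.bed; do
    sort -k1,1 -k2,2n "$f" > "sorted_$f"
    bedToBigBed "sorted_$f" hg19.chrom.sizes "${f%.bed}.bb"
done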
  
  --
  
The following files from Roche had long entries in column 4, causing rows that were too long for bedToBigBed, so column 4 was cut from those input bed files. (Note: column 4 held just the Ensembl and CCDS IDs, which did not provide any other substantial information.)
  
  We ran the command
  
  > cut -f1,2,3
  
for all such files. Here's an example for the Roche - KAPA HyperExome Capture Probe Footprint file:

cut -f1,2,3 sorted-KAPA_HyperExome_hg19_capture_targets.bed > sorted-cut-KAPA_HyperExome_hg19_capture_targets.bed
  
  
  #############################################################################
  # haploinsufficiency from DECIPHER - DONE 3/18/2021 Jonathan
  
  # Download latest predictions list from https://decipher.sanger.ac.uk/about/downloads/data
  mkdir -p /hive/data/outside/decipher/haploinsufficiency
  cd /hive/data/outside/decipher/haploinsufficiency
  wget https://decipher.sanger.ac.uk/files/downloads/HI_Predictions_Version3.bed.gz
  filePath=`pwd`/HI_Predictions_Version3.bed.gz
  
  # zcat | head shows the file is nearly ready to go, but could benefit from a bit of reorganization
  # (also floating point score values don't work for some bed processors)
  
  mkdir -p /hive/data/genomes/hg19/bed/decipherHaplo
  cd /hive/data/genomes/hg19/bed/decipherHaplo
  
  printf 'chomp;
  @fields = split /\t/;
  ($gene, $score, $pct) = split /\|/, $fields[3];
  $fields[3] = $gene;
  $fields[4] = 0;
  $rgb = $fields[8];
  $rgb =~ s/^(\d+),(\d+),0$/$1,$2,$1/;  # change red/green to magenta/green
  $fields[8] = $rgb;
  push @fields, ($pct, $score);
  push @fields, ("$gene, HI: $pct");
  print join ("\t", @fields) . "\n";
  ' > parse.pl
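# For reference (derived from the split above): a column-4 value like
# "GENE|0.123|4.5%" becomes name=GENE with score forced to 0, the percentage
# and raw score appended as extra fields, and "GENE, HI: 4.5%" as the label.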
  
  zcat $filePath | tail -n +2 | perl -nf parse.pl | bedSort stdin HI_Predictions.bed
  
  bedToBigBed HI_Predictions.bed -type=bed9+2 -as=$HOME/kent/src/hg/lib/haploinsufficiency.as -tab ../../chrom.sizes haploinsufficiency.bb
  
  mkdir -p /gbdb/hg19/bbi/haploins/
  cd /gbdb/hg19/bbi/haploins/
  ln -s /hive/data/genomes/hg19/bed/decipherHaplo/haploinsufficiency.bb .
  
  
  #############################################################################
  # skinSoleBoldo JimK 01-14-2020
  # This describes how we got the skinSoleBoldo data set into the
  # Genome Browser from the Cell Browser.
  #############################################################################
  
  # Create working directory and go there
  mkdir /hive/data/genomes/hg19/bed/singleCell/skinSoleBoldo
  cd /hive/data/genomes/hg19/bed/singleCell/skinSoleBoldo
  
  # Create output dir for binaries
  mkdir bbi
  
# Downloaded files from the UCSC Cell Browser like so:
  wget https://cells.ucsc.edu/aging-human-skin/meta.tsv
  wget https://cells.ucsc.edu/aging-human-skin/exprMatrix.tsv.gz
  
  # Get the first line (fields) out of meta.tsv and also make stats on it
  head -1 meta.tsv > meta.fields
  tabInfo meta.tsv -vals=20 > meta.20
  
  
  # Make a bunch of smaller matrices by clustering columns.  Mostly we'll use the cluster one
  # but some of the others are good to look at sometimes too.  This is the time consuming step.
  mkdir clust
  matrixClusterColumns -makeIndex=clust/exprMatrix.ix  exprMatrix.tsv.gz meta.tsv \
      Celltype clust/cell_type.matrix bbi/cell_type.stats \
      subj clust/donor.matrix bbi/donor.stats \
      age	clust/age.matrix bbi/age.stats \
      Celltype_and_Age clust/age_cell_type.matrix bbi/age_cell_type.stats
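# (note) after the matrix and meta files, matrixClusterColumns takes triples of
# <meta field to cluster on> <output matrix> <output stats>, one per grouping.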
  
  # Get the first column (the genes) out of expression matrix.
  cut -f 1 clust/cell_type.matrix > gene.lst
  
  # Figure out the geneset they used and generate mapping file
  gencodeVersionForGenes gene.lst /hive/data/inside/geneSymVerTx.tsv -bed=mapping.bed
  # best is gencodeV19 as sym on hg19 with 21217 of 21353 (99.3631%) hits
  
  # Turn some into barChart, and then bigBarChart
  foreach s (cell_type donor age age_cell_type)
      matrixToBarChartBed clust/$s.matrix mapping.bed clust/$s.bed -stats=bbi/$s.stats -trackDb=clust/$s.ra
      bedSort clust/$s.bed clust/$s.bed
      bedToBigBed clust/$s.bed /hive/data/genomes/hg19/chrom.sizes bbi/$s.bb -type=bed6+3 -as=/cluster/home/kent/src/hg/lib/simpleBarChartBed.as
  end
  
  # Make up special colors for cell_type.  First manually create two column
  # file that relates at least some of sample labels to cell types we have colors for.
  # Call this file clust/cell_type.labels.
  matrixClusterColumns clust/cell_type.matrix clust/cell_type.labels cluster clust/cell_type.unnormed clust/cell_type.restats
  matrixNormalize column sum clust/cell_type.unnormed clust/cell_type.ref
  
  # Use same colors for sample
  foreach s (cell_type donor age age_cell_type)
      hcaColorCells clust/cell_type.ref ../typeColors.tsv clust/$s.matrix clust/$s.refStats -trackDb=clust/$s.colors -stats=bbi/$s.stats
  end
  
  # Link files needed by browser at runtime to the /gbdb dir
  mkdir /gbdb/hg19/bbi/skinSoleBoldo
  foreach s (cell_type donor age age_cell_type)
      ln -s /hive/data/genomes/hg19/bed/singleCell/skinSoleBoldo/bbi/$s.bb /gbdb/hg19/bbi/skinSoleBoldo/
      ln -s /hive/data/genomes/hg19/bed/singleCell/skinSoleBoldo/bbi/$s.stats /gbdb/hg19/bbi/skinSoleBoldo/
  end
  
  # Add the bits from clust/*.ra and clust/*.colors to hg19/trackDb.ra and you should be good.
  rm -f tracks.ra
  foreach s (cell_type donor age age_cell_type)
      grep -v barChartColors clust/$s.ra >>tracks.ra
      cat clust/$s.colors >> tracks.ra
      echo transformFunc NONE >> tracks.ra
      echo barChartLimit 2 >> tracks.ra
      echo "" >> tracks.ra
  end
  
  
  #############################################################################
  # fetalGeneAtlas JimK 01-19-2021
  ############################################################################
  # This is the RNA-seq part of the data set described in
  # "A human cell atlas of fetal gene expression" by Cao, Day et al
  # Science 13 Nove 2020.   This was imported from Cell Browser
  
  # Create directory for work.
  
  mkdir -p /hive/data/genomes/hg19/bed/singleCell/fetalGeneAtlas
  cd /hive/data/genomes/hg19/bed/singleCell/fetalGeneAtlas
  
  # Create output dir for binaries
  mkdir bbi
  
  # link in in files from cell browser
  ln -s /hive/data/inside/cells/datasets/fetal-gene-atlas/all/meta.tsv .
  ln -s /hive/data/inside/cells/datasets/fetal-gene-atlas/all/exprMatrix.tsv.gz .
  
  # Get the first line (fields) out of meta.tsv and also make stats on it
  head -1 meta.tsv > meta.fields
  tabInfo meta.tsv -vals=20 > meta.20
  
  
  # Make a bunch of smaller matrices by clustering columns.  Mostly we'll use the cluster one
  # but some of the others are good to look at sometimes too.  This is the time consuming step.
  mkdir clust
  matrixClusterColumns -makeIndex=clust/exprMatrix.ix  exprMatrix.tsv.gz meta.tsv \
      Main_cluster_name clust/cell_type.matrix bbi/cell_type.stats \
      Assay clust/Assay.matrix bbi/Assay.stats \
      Experiment_batch clust/Experiment_batch.matrix bbi/Experiment_batch.stats \
      Fetus_id clust/donor.matrix bbi/donor.stats \
      Organ clust/Organ.matrix bbi/Organ.stats \
      Organ_cell_lineage clust/Organ_cell_lineage.matrix bbi/Organ_cell_lineage.stats \
      RT_group clust/RT_group.matrix bbi/RT_group.stats \
      sex clust/sex.matrix bbi/sex.stats
  
  # Get the first column (the genes) out of expression matrix.
  cut -f 1 clust/cell_type.matrix > gene.lst
  
  
  # Figure out the geneset they used and generate mapping file
  gencodeVersionForGenes gene.lst /hive/data/inside/geneSymVerTx.tsv -bed=mapping.bed
  # best is gencodeV19 as id on hg19 with 60284 of 63562 (94.8428%) hits
  
  
  # Turn some into barChart, and then bigBarChart
  foreach s (cell_type Assay Experiment_batch donor Organ Organ_cell_lineage RT_group sex)
      matrixToBarChartBed clust/$s.matrix mapping.bed clust/$s.bed -stats=bbi/$s.stats -trackDb=clust/$s.ra
      bedSort clust/$s.bed clust/$s.bed
      bedToBigBed clust/$s.bed /hive/data/genomes/hg19/chrom.sizes bbi/$s.bb -type=bed6+3 -as=/cluster/home/kent/src/hg/lib/simpleBarChartBed.as
  end
  
  # Make up special colors for cell_type.  First manually create two column
  # file that relates at least some of sample labels to cell types we have colors for.
  # Call this file cell_type.labels.
  matrixClusterColumns clust/cell_type.matrix cell_type.labels cluster clust/cell_type.unnormed clust/cell_type.restats
  matrixNormalize column sum clust/cell_type.unnormed clust/cell_type.ref
  #hcaColorCells clust/cell_type.ref ../typeColors.tsv clust/cell_type.matrix clust/cell_type.refStats -trackDb=clust/cell_type.colors -stats=bbi/cell_type.stats
  
  # Use same colors for some others
  foreach s (cell_type Assay Experiment_batch donor Organ Organ_cell_lineage RT_group sex)
      hcaColorCells clust/cell_type.ref ../typeColors.tsv clust/$s.matrix clust/$s.refStats -trackDb=clust/$s.colors -stats=bbi/$s.stats
  end
  
  # Link files needed by browser at runtime to the /gbdb dir
  mkdir /gbdb/hg19/bbi/fetalGeneAtlas
  foreach s (cell_type Assay Experiment_batch donor Organ Organ_cell_lineage RT_group sex)
      ln -s /hive/data/genomes/hg19/bed/singleCell/fetalGeneAtlas/bbi/$s.bb /gbdb/hg19/bbi/fetalGeneAtlas/
      ln -s /hive/data/genomes/hg19/bed/singleCell/fetalGeneAtlas/bbi/$s.stats /gbdb/hg19/bbi/fetalGeneAtlas/
  end
  
  
  hgBbiDbLink hg19 fetalGeneAtlasCellTypes /gbdb/hg19/bbi/fetalGeneAtlas/cell_type.bb
  hgBbiDbLink hg19 fetalGeneAtlasDonor /gbdb/hg19/bbi/fetalGeneAtlas/donor.bb
  hgBbiDbLink hg19 fetalGeneAtlasAssay /gbdb/hg19/bbi/fetalGeneAtlas/Assay.bb
  hgBbiDbLink hg19 fetalGeneAtlasExperiment /gbdb/hg19/bbi/fetalGeneAtlas/Experiment_batch.bb
  hgBbiDbLink hg19 fetalGeneAtlasOrgan /gbdb/hg19/bbi/fetalGeneAtlas/Organ.bb
  hgBbiDbLink hg19 fetalGeneAtlasOrganCellLineage /gbdb/hg19/bbi/fetalGeneAtlas/Organ_cell_lineage.bb
hgBbiDbLink hg19 fetalGeneAtlasRtGroup /gbdb/hg19/bbi/fetalGeneAtlas/RT_group.bb
  hgBbiDbLink hg19 fetalGeneAtlasSex /gbdb/hg19/bbi/fetalGeneAtlas/sex.bb
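# (note) hgBbiDbLink creates a small fileName table in hg19 whose single row
# points at the bigBed under /gbdb; that table is how the browser finds the track.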
  
  # Add the bits from clust/*.ra and clust/*.colors to hg19/trackDb.ra and you should be good.
  foreach s (cell_type Assay Experiment_batch donor Organ Organ_cell_lineage RT_group sex)
      echo >> clust/$s.ra
  end
  cat clust/*.ra > tracks.ra
  
  #############################################################################
  # CADD, max Wed Feb 10 06:08:01 PST 2021
  # note for the next release / archiving, procedure as per team meeting and Chris Lee:
  # don't forget to copy the new files to /usr/local/apache/htdocs-hgdownload/goldenPath/archive/hg19/cadd/<version>
  # and update the "current" symlink
  # also add a section "Archived Data" to the docs page with a link to the hgdownload archive directory
  # If motivated, also add a hub.txt
  cd /hive/data/genomes/hg19/bed/cadd/
  wget https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh37/InDels.tsv.gz
  wget https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh37/whole_genome_SNVs.tsv.gz
  time python ~/kent/src/hg/makeDb/cadd/caddToBed.py InDels.tsv.gz
  time python ~/kent/src/hg/makeDb/cadd/caddToWig.py
wigToBigWig t.wig ../../chrom.sizes t.bw &
wigToBigWig c.wig ../../chrom.sizes c.bw &
wigToBigWig a.wig ../../chrom.sizes a.bw &
wigToBigWig g.wig ../../chrom.sizes g.bw &
  bedToBigBed ins.bed ../../chrom.sizes ins.bb -type=bed9+ -tab -as=${HOME}/kent/src/hg/makeDb/cadd/cadd.as
  bedToBigBed del.bed ../../chrom.sizes del.bb -type=bed9+ -tab -as=${HOME}/kent/src/hg/makeDb/cadd/cadd.as
  rm -f *.wig *.bed
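# (optional, not part of the original run) spot-check the outputs:
bigWigInfo a.bw
bigBedInfo ins.bb | grep itemCount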
  ##############################################################################
#  update sno/miRNA TRACK (DONE - 2021-02-18 - Hiram)
      # last release for GRCh37 was version 20: 2013-06-13
  
      # The data in this track is out of date so update the track.
      mkdir /hive/data/genomes/hg19/bed/wgRna-2013-06-13
      cd /hive/data/genomes/hg19/bed/wgRna-2013-06-13
  
      wget --timestamping \
           ftp://mirbase.org/pub/mirbase/20/genomes/hsa.gff3
  
      # examine chromosome list:
      grep -v "^#" hsa.gff3 | cut -f1 | sort | uniq -c > chr.list
  
      # Only select the primary transcripts, make coords 0-based so they match
      # gencode and refseq tracks, and remove ID and Alias entries in column 9:
      grep -v "^#" hsa.gff3 | grep -c "miRNA_primary_transcript"
      # 1871
      tawk '{if ($3 == "miRNA_primary_transcript") {$4-=1; print;}}' hsa.gff3 \
          | grep -v '^#' | tr ';' '\t' | tr '=' '\t' | cut -f1-8,14 \
             > hsa.primaryCleaned.gff3
      wc -l hsa.primaryCleaned.gff3
    # 1871
  
      # now get into bed format, with type="miRNA"
      tawk '{print $1, $4, $5, $9, 0, $7, 0, 0, "miRNA";}' \
         hsa.primaryCleaned.gff3 > miRNA.bed
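    # columns written: chrom, chromStart, chromEnd, name, score, strand,
    # thickStart, thickEnd, type -- matching the wgRna table schema (minus bin,
    # which hgLoadBed adds).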
  
      # get snoRNA entries from current wgRNA table:
      hgsql -Ne "select * from wgRna where type != 'miRNA'" hg19 \
        | cut -f2- > wgRna.other.bed
  
          # combine and load:
      cat miRNA.bed wgRna.other.bed > wgRna2021-02-18.bed
      hgLoadBed -tab -renameSqlTable -verbose=4 \
          -sqlTable=$HOME/kent/src/hg/lib/wgRna.sql \
          -as=$HOME/kent/src/hg/lib/wgRna.as hg19 wgRnaNew wgRna2021-02-18.bed
  # ### kent source version 410 ###
  # Reading wgRna2021-02-18.bed
  # Read 2273 elements of size 9 from wgRna2021-02-18.bed
  # Loading hg19
  
      # compare old and new tables:
      hgsql -Ne "select type, count(*) from wgRna group by type" hg19
  +---------+-----+
  | CDBox   | 269 |
  | HAcaBox | 112 |
  | miRNA   | 939 |
  | scaRna  |  21 |
  +---------+-----+
      hgsql -Ne "select type, count(*) from wgRnaNew group by type" hg19
  +---------+------+
  | CDBox   |  269 |
  | HAcaBox |  112 |
  | miRNA   | 1871 |
  | scaRna  |   21 |
  +---------+------+
      # compared to hg38:
      hgsql -Ne "select type, count(*) from wgRna group by type" hg38
  +---------+------+
  | CDBox   |  269 |
  | HAcaBox |  112 |
  | miRNA   | 1918 |
  | scaRna  |   21 |
  +---------+------+
  
      # backup old table and compare to new one before rename. Should be only miRNA update:
      hgsql -Ne "select * from wgRna" hg19 | cut -f2- > wgRna.backup
      comm -23 <(sort -k1 -k2n wgRna2021-02-18.bed) <(sort -k1 -k2n wgRna.backup) | cut -f9 | sort -u
  miRNA
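    # comm -23 keeps lines unique to the new bed, so only miRNA rows changed;
    # the CDBox/HAcaBox/scaRna entries carried over untouched.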
  
      # rename wgRnaNew table:
      hgsqlSwapTables -dropTable3 hg19 wgRnaNew wgRna wgRnaOld
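    # (note) this renames wgRna out of the way to wgRnaOld and wgRnaNew to
    # wgRna in one step; -dropTable3 drops any pre-existing wgRnaOld first.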
  
  ##############################################################################
  # Add Revel track, Max, Thu Apr 15 03:33:49 PDT 2021
  cd /hive/data/genomes/hg19/bed/revel
  aria2c https://rothsj06.u.hpc.mssm.edu/revel_grch38_all_chromosomes.csv.zip
  unzip revel_grch38_all_chromosomes.csv.zip
# make sure that there are no weird chroms and no weird multi-nucleotide changes
  cat revel_grch38_all_chromosomes.csv  | tr ',' '\t' | cut -f5 | grep -v alt | awk '{print length($1)}' | uniq > lens.tmp &
  cut -d, revel_grch38_all_chromosomes.csv -f1 | uniq | sort | uniq -c > chroms.tmp &
  pigz revel_grch38_all_chromosomes.csv
  time python revelToWig.py revel_grch38_all_chromosomes.csv.gz hg19
  wigToBigWig t.wig ../../chrom.sizes t.bw &
  wigToBigWig c.wig ../../chrom.sizes c.bw  &
  wigToBigWig a.wig ../../chrom.sizes a.bw &
  wigToBigWig g.wig ../../chrom.sizes g.bw &
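# (note) as with CADD above, the four wiggles t/c/a/g hold the score for
# mutating each reference base to that alternate allele, one bigWig per alt.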
  ##############################################################################
  # LASTZ human/hg19 Gorilla/gorGor6 - (DONE - 2021-06-28 - Gerardo)
      mkdir /hive/data/genomes/hg19/bed/lastzGorGor6.2021-06-28/
      cd /hive/data/genomes/hg19/bed/lastzGorGor6.2021-06-28/
  
      printf '# human vs gorilla
  BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz
  BLASTZ_T=2
  BLASTZ_O=600
  BLASTZ_E=150
  BLASTZ_M=254
  BLASTZ_K=4500
  BLASTZ_Y=15000
  BLASTZ_Q=/hive/data/staging/data/blastz/human_chimp.v2.q
  #       A     C     G     T
  # A    90  -330  -236  -356
  # C  -330   100  -318  -236
  # G  -236  -318   100  -330
  # T  -356  -236  -330    90
  
  # TARGET: Human hg19
  SEQ1_DIR=/hive/data/genomes/hg19/hg19.2bit
  SEQ1_LEN=/hive/data/genomes/hg19/chrom.sizes
  SEQ1_CHUNK=20000000
  SEQ1_LAP=10000
  SEQ1_IN_CONTIGS=0
  
  # QUERY: gorilla gorGor6
  SEQ2_DIR=/hive/data/genomes/gorGor6/gorGor6.2bit
  SEQ2_LEN=/hive/data/genomes/gorGor6/chrom.sizes
  SEQ2_CHUNK=20000000
  SEQ2_LAP=0
  SEQ2_LIMIT=50
  
  BASE=/hive/data/genomes/hg19/bed/lastzGorGor6.2021-06-28
  TMPDIR=/dev/shm
  ' > DEF
  
      time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
          -chainMinScore=3000 -chainLinearGap=medium \
            -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
              -syntenicNet) > do.log 2>&1
      # real    113m7.350s
  
      cat fb.hg19.chainGorGor6Link.txt
      # 2874362387 bases of 2991710746 (96.078%) in intersection
  
      cat fb.hg19.chainSynGorGor6Link.txt
      # 2854800888 bases of 2991710746 (95.424%) in intersection
  
      time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` hg19 gorGor6) > rbest.log 2>&1 &
      # real    67m18.231s
  
      sed -e 's/^/ # /;' fb.hg19.chainRBest.GorGor6.txt
       # 2691356356 bases of 2991710746 (89.960%) in intersection
  
      #   running the swap
      mkdir /hive/data/genomes/gorGor6/bed/blastz.hg19.swap
      cd /hive/data/genomes/gorGor6/bed/blastz.hg19.swap
  
      time (doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzGorGor6.2021-06-28/DEF \
          -swap -syntenicNet -workhorse=hgwdev  \
           -smallClusterHub=hgwdev -bigClusterHub=ku \
              -chainMinScore=3000 -chainLinearGap=medium)  > swap.log 2>&1 &
      # real 69m32.058s
  
      cat fb.gorGor6.chainHg19Link.txt
    # 2735990533 bases of 2999027915 (91.229%) in intersection
  
      cat fb.gorGor6.chainSynHg19Link.txt
    # 2726237067 bases of 2999027915 (90.904%) in intersection
  
      time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` gorGor6 hg19) > rbest.log 2>&1 &
      # real    70m15.001s
  
      sed -e 's/^/    # /;' fb.gorGor6.chainRBest.Hg19.txt
      # 2694784811 bases of 2999027915 (89.855%) in intersection
  
  ##############################################################################
  
  # JASPAR Track integration by Daniel 8/20/21
      cd ~/kent/src/hg/makeDb/trackDb 
      vi human/jaspar.ra 
      curl http://expdata.cmmt.ubc.ca/JASPAR/UCSC_tracks/hg19/trackDb.txt >> human/jaspar.ra 
      cd /hive/data/genomes/hg19/bed
      mkdir jaspar
      cd jaspar
      wget http://expdata.cmmt.ubc.ca/JASPAR/downloads/UCSC_tracks/2018/hg19/JASPAR2018_hg19_all_chr.bb
      wget http://expdata.cmmt.ubc.ca/JASPAR/downloads/UCSC_tracks/2020/JASPAR2020_hg19.bb
      mv JASPAR2020_hg19.bb JASPAR2020.bb
      mv JASPAR2018_hg19_all_chr.bb JASPAR2018.bb
      cd /gbdb/hg19
      mkdir jaspar
      cd jaspar
      ln -s /hive/data/genomes/hg19/bed/jaspar/JASPAR2018.bb .
      ln -s /hive/data/genomes/hg19/bed/jaspar/JASPAR2020.bb .
      cd ~/kent/src/hg/makeDb/trackDb
      vi human/jaspar.html  
      curl http://expdata.cmmt.ubc.ca/JASPAR/UCSC_tracks/JASPAR2018_TFBS_help.html >> human/jaspar.html 
  
  ###############################################################################
  
  # JASPAR 2022 track addition by Daniel 10/11/21
  
  cd ~/kent/src/hg/makeDb/trackDb
  curl http://expdata.cmmt.ubc.ca/JASPAR/UCSC_tracks/hg19/trackDb.txt | head -n16 >> human/jaspar.ra
  vi human/jaspar.ra
  cd /hive/data/genomes/hg19/bed/jaspar
  wget http://expdata.cmmt.ubc.ca/JASPAR/downloads/UCSC_tracks/2022/JASPAR2022_hg19.bb
  mv JASPAR2022_hg19.bb JASPAR2022.bb
  cd /gbdb/hg19/jaspar
  ln -s /hive/data/genomes/hg19/bed/jaspar/JASPAR2022.bb .
  cd ~/kent/src/hg/makeDb/trackDb
  vi human/jaspar.html
  
  ###############################################################################
  
  # JASPAR 2022 bigBed update 12/7/21
  cd /hive/data/genomes/hg19/bed/jaspar
  wget http://expdata.cmmt.ubc.ca/JASPAR/downloads/UCSC_tracks/2022/JASPAR2022_hg19.bb
  rm JASPAR2022.bb
  mv JASPAR2022_hg19.bb JASPAR2022.bb
  ls -lh
  ls -lh /gbdb/hg19/jaspar
  
  ###############################################################################
  
  # JASPAR 2022 bigBed update 1/3/22
  cd /hive/data/genomes/hg19/bed/jaspar
  wget http://expdata.cmmt.ubc.ca/JASPAR/downloads/UCSC_tracks/2022/JASPAR2022_hg19.bb
  rm JASPAR2022.bb
  mv JASPAR2022_hg19.bb JASPAR2022.bb
  ls -lh
  ls -lh /gbdb/hg19/jaspar
  
  ###################################################
  
  # PanelApp refs #25568 3/10/22
  cd /hive/data/genomes/hg19/bed 
  mkdir panelApp
  cd panelApp
  wget https://hgwdev.gi.ucsc.edu/~bnguy/panel/hg19/panel_hg19.bb
  wget https://hgwdev.gi.ucsc.edu/~bnguy/panel/hg19/str_hg19.bb
  mv panel_hg19.bb genesPanel.bb
  mv str_hg19.bb STRsPanel.bb
  cd /gbdb/hg19
  mkdir panelApp
  cd panelApp
  ln -s /hive/data/genomes/hg19/bed/panelApp/genesPanel.bb
  ln -s /hive/data/genomes/hg19/bed/panelApp/STRsPanel.bb
  cd ~/kent/src/hg/makeDb/trackDb/human/hg19
  wget https://hgwdev.gi.ucsc.edu/~bnguy/panel/hg19/panelapp.html
  cd ~/kent/src/hg/makeDb/trackDb
  mv human/hg19/panelapp.html human/hg19/panelApp.html
  curl https://hgwdev.gi.ucsc.edu/~bnguy/panel/hg19/trackDb.txt >> trackDb.ra
  vi trackDb.ra
  make alpha DBS=hg19
  
  ###############################################
  
# Probe and Microarray data sets addition 3/11/21
  
 +###############################################
 +# Jarvis, Max, 03/28/22
 +cd /hive/data/genomes/hg19/bed/jarvis
 +# downloaded from https://az.app.box.com/v/jarvis-gwrvis-scores/folder/146728772904 by Luis into orig/
 +cd orig
 +zcat `ls | sort | grep bed` > ../jarvis.bed
 +cd ..   # jarvis.bed was written to the parent dir; chrom.sizes is two levels up
 +bedGraphToBigWig jarvis.bed ../../chrom.sizes jarvis.bw
 +# Go to Brian's cube, push "that was easy" button
 +
+ #############################################################################
+ # chm13 liftover alignments  (2022-03-29 markd)
+ 
+ # preliminary CHM13 <-> hg19 liftOver chains until NCBI produces consensus alignments
 + # provided by Nae-Chyun Chen <naechyun.chen@gmail.com>
+ 
+     mkdir -p /hive/data/genomes/hg19/bed/chm13LiftOver
+     cd /hive/data/genomes/hg19/bed/chm13LiftOver
+ 
+ # Obtain GRCh37 from T2T Globus: team-liftover/v1_nflo/grch37-chm13v2.chain
+ 
 + # rename to better match UCSC conventions and compress
+     mv grch37-chm13v2.chain hg19-chm13v2.over.chain
+     pigz hg19-chm13v2.over.chain
+ 
+ # make NCBI query names as well, since chromAlias doesn't work yet for click-through
 + # note: the tool doc is wrong here; in PSL the query name is column 10, hence -k 10
+     chainToPslBasic hg19-chm13v2.over.chain.gz stdout | chromToUcsc -k 10 -a /hive/data/genomes/asmHubs/genbankBuild/GCA/009/914/755/GCA_009914755.4_CHM13_T2T_v2.0/GCA_009914755.4_CHM13_T2T_v2.0.chromAlias.txt | pslToChain stdin stdout | pigz -c > hg19-chm13v2.ncbi-qnames.over.chain.gz
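 + # (note on the pipeline above) chainToPslBasic converts chain -> PSL,
 + # chromToUcsc -k 10 rewrites the query names (PSL column 10) using the
 + # CHM13 chromAlias table, and pslToChain converts back to chain.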
+ 
+ # build bigChain files:
+     hgLoadChain -noBin -test none bigChain hg19-chm13v2.over.chain.gz 
+     sed 's/\.000000//' chain.tab | awk 'BEGIN {OFS="\t"} {print $2, $4, $5, $11, 1000, $8, $3, $6, $7, $9, $10, $1}' > bigChainIn.tab
+     bedToBigBed -type=bed6+6 -as=${HOME}/kent/src/hg/lib/bigChain.as -tab bigChainIn.tab ../../chrom.sizes hg19-chm13v2.over.chain.bb
+     tawk '{print $1, $2, $3, $5, $4}' link.tab | csort -k1,1 -k2,2n --parallel=64 > bigLinkIn.tab
+     bedToBigBed -type=bed4+1 -as=${HOME}/kent/src/hg/lib/bigLink.as -tab bigLinkIn.tab  ../../chrom.sizes hg19-chm13v2.over.link.bb
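 + # (note) hgLoadChain with -test writes chain.tab/link.tab without loading the
 + # database; the sed/awk steps reshape those into bigChain (bed6+6) and
 + # bigLink (bed4+1) inputs for bedToBigBed.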
+ 
+ # build bigChain with NCBI names files:
+     hgLoadChain -noBin -test none bigChain hg19-chm13v2.ncbi-qnames.over.chain.gz 
+     sed 's/\.000000//' chain.tab | awk 'BEGIN {OFS="\t"} {print $2, $4, $5, $11, 1000, $8, $3, $6, $7, $9, $10, $1}' > bigChainIn.tab
+     bedToBigBed -type=bed6+6 -as=${HOME}/kent/src/hg/lib/bigChain.as -tab bigChainIn.tab ../../chrom.sizes hg19-chm13v2.ncbi-qnames.over.chain.bb
+     tawk '{print $1, $2, $3, $5, $4}' link.tab | csort -k1,1 -k2,2n --parallel=64 > bigLinkIn.tab
+     bedToBigBed -type=bed4+1 -as=${HOME}/kent/src/hg/lib/bigLink.as -tab bigLinkIn.tab  ../../chrom.sizes hg19-chm13v2.ncbi-qnames.over.link.bb
+ 
+     rm *.tab
+    
+ # link to gbdb
+   mkdir -p /gbdb/hg19/bbi/chm13LiftOver
+   ln -sf $(pwd)/*.bb /gbdb/hg19/bbi/chm13LiftOver/
+ 
+ # make downloads, can't add to liftOver directory due to license in that directory
+   mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/chm13LiftOver/
+   ln -sf $(pwd)/hg19-chm13v2.*.chain.gz /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/chm13LiftOver/
+   
+ # push these files
+   /gbdb/hg19/bbi/chm13LiftOver/hg19-chm13v2.ncbi-qnames.over.chain.bb
+   /gbdb/hg19/bbi/chm13LiftOver/hg19-chm13v2.ncbi-qnames.over.link.bb
+   /gbdb/hg19/bbi/chm13LiftOver/hg19-chm13v2.over.chain.bb
+   /gbdb/hg19/bbi/chm13LiftOver/hg19-chm13v2.over.link.bb
+   /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/chm13LiftOver/hg19-chm13v2.ncbi-qnames.over.chain.gz
+   /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/chm13LiftOver/hg19-chm13v2.over.chain.gz
  
 -  
 -#############################################################################