src/hg/makeDb/doc/hg19.txt 1.49
1.49 2009/10/21 18:34:58 hiram
done with phastCons runs for the 46-way
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.48
retrieving revision 1.49
diff -b -B -U 1000000 -r1.48 -r1.49
--- src/hg/makeDb/doc/hg19.txt 16 Oct 2009 17:17:44 -0000 1.48
+++ src/hg/makeDb/doc/hg19.txt 21 Oct 2009 18:34:58 -0000 1.49
@@ -1,6618 +1,7192 @@
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on
# NCBI build 37 (February 2009 freeze) aka:
# GRCh37 - Genome Reference Consortium Human Reference 37
# Assembly Accession: GCA_000001405.1
# "$Id$";
#############################################################################
# NOTE FOR NEXT HUMAN ASSEMBLY (2009-07-29 - Brooke): hg19 contains the wrong
# sequence for chrM. The accession NC_001807 was replaced in GenBank with
# NC_012920, with the note: "This sequence was removed since the accepted
# reference sequence for the Homo sapiens mitochondrion is the rCRS/Mitomap
# sequence, which is now available as the record NC_012920".
# Also, from http://www.mitomap.org/mitoseq.html:
# "IMPORTANT: Do not use NC_001807 as "the rCRS" as it is an African
# (Yoruban) sequence with over 40 variant nucleotides from the rCRS. As of
# July 8, 2009 it has been removed from GenBank as a reference sequence but
# may be found, if needed, as AF347015, one of 53 African sequence deposited
# in Genbank by Ingman et al in 2001."
# Use NC_012920 for the chrM sequence for the next build!
# Download sequence (DONE - 2009-02-04 - Hiram)
# Download the GRCh37 source sequence from NCBI, one directory per
# assembly unit (assembled chromosomes, alternate loci, unlocalized,
# unplaced, placed scaffolds).
mkdir -p /hive/data/genomes/hg19/download
cd /hive/data/genomes/hg19/download
mkdir -p assembled_chromosomes
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=assembled_chromosomes \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/assembled_chromosomes
# the nine alternate haplotype loci each live in their own ALT_REF_LOCI_N dir
mkdir -p alternate_loci
for N in 1 2 3 4 5 6 7 8 9
do
wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=alternate_loci \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/ALT_REF_LOCI_${N}
done
mkdir -p unlocalized_scaffolds
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=unlocalized_scaffolds \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unlocalized_scaffolds
mkdir -p unplaced_scaffolds
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=unplaced_scaffolds \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unplaced_scaffolds
mkdir -p placed_scaffolds
# use the same placeholder FTP password as the other fetches above
# (this command previously recorded a personal email address)
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=placed_scaffolds \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/placed_scaffolds
# Convert NCBI naming (CM accessions) to UCSC chrN naming for the
# assembled chromosomes, producing a renamed .agp and .fa per chromosome.
mkdir ucscChr
cd ucscChr
for F in ../assembled_chromosomes/FASTA/chr*.fa
do
C=`basename $F`
C=${C/.fa}
echo -n "${C} "
# parse the FASTA header line for the UCSC chrom name and the NCBI accession
H=`head -1 "${F}"`
chrN=`echo $H | sed -e "s/.*Homo sapiens chromosome /chr/; s/, .*//"`
A=`echo $H | sed -e "s/. Homo.*//; s/.*gb.//"`
echo $chrN $A
# rewrite the accession to the chrN name in the component AGP
grep -v "^#" ../assembled_chromosomes/AGP/${chrN}.comp.agp \
| sed -e "s/^${A}/${chrN}/" > ${chrN}.agp
# re-header the FASTA with the UCSC name, keep the sequence lines as-is
echo ">${chrN}" > ${chrN}.fa
grep -v "^>" ../assembled_chromosomes/FASTA/${chrN}.fa >> ${chrN}.fa
done
# Build a single AGP covering all scaffolds, renaming GL accessions to
# UCSC names: alternate loci -> chr*_*_hap*, unlocalized -> chrN_gl*_random,
# unplaced -> chrUn_gl*.  The FASTA renaming below must match these names.
rm -f scaffolds.agp
find ../alternate_loci -type f | grep ".agp$" | while read F
do
grep "^GL" $F | sed -e \
"s/^GL000250.1/chr6_apd_hap1/" -e \
"s/^GL000251.1/chr6_cox_hap2/" -e \
"s/^GL000252.1/chr6_dbb_hap3/" -e \
"s/^GL000253.1/chr6_mann_hap4/" -e \
"s/^GL000254.1/chr6_mcf_hap5/" -e \
"s/^GL000255.1/chr6_qbl_hap6/" -e \
"s/^GL000256.1/chr6_ssto_hap7/" -e \
"s/^GL000257.1/chr4_ctg9_hap1/" -e \
"s/^GL000258.1/chr17_ctg5_hap1/"
done > scaffolds.agp
find ../unlocalized_scaffolds -type f | grep ".agp$" \
| while read F
do
# chromosome name comes from the AGP file name, e.g. chr1.unlocalized.scaf.agp
C=`basename ${F}`
C=${C/.unlocalized.scaf.agp}
grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/${C}_gl\1_random/"
done >> scaffolds.agp
find ../unplaced_scaffolds -type f | grep ".agp$" \
| while read F
do
grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/chrUn_gl\1/"
done >> scaffolds.agp
# Build a single FASTA for all scaffolds, with UCSC names that must match
# the scaffolds.agp renaming above.
rm -f scaffolds.fa
find ../alternate_loci -type f | grep ".fa$" | while read F
do
# NOTE: GL000256 is chr6_ssto_hap7 (was mistakenly written hap6 here,
# disagreeing with the AGP renaming above, which uses hap7)
sed -e \
"s/>.*GL000250.*/>chr6_apd_hap1/" -e \
"s/>.*GL000251.*/>chr6_cox_hap2/" -e \
"s/>.*GL000252.*/>chr6_dbb_hap3/" -e \
"s/>.*GL000253.*/>chr6_mann_hap4/" -e \
"s/>.*GL000254.*/>chr6_mcf_hap5/" -e \
"s/>.*GL000255.*/>chr6_qbl_hap6/" -e \
"s/>.*GL000256.*/>chr6_ssto_hap7/" -e \
"s/>.*GL000257.*/>chr4_ctg9_hap1/" -e \
"s/>.*GL000258.*/>chr17_ctg5_hap1/" ${F}
done > scaffolds.fa
find ../unlocalized_scaffolds -type f | grep ".fa$" | while read F
do
# header carries both the GL accession and the chromosome number
sed -e \
"s/^>.*GL\([0-9]*\).* chromosome \([0-9]*\).*/>chr\2_gl\1_random/" ${F}
done >> scaffolds.fa
find ../unplaced_scaffolds -type f | grep ".fa$" | while read F
do
sed -e "s/.*\(GL[0-9]*\).*/\1/; s/GL/>chrUn_gl/" $F
done >> scaffolds.fa
############################################################################
## Create database (DONE - 2009-03-04 - Hiram)
cd /hive/data/genomes/hg19
# makeGenomeDb.pl config.  NOTE: mitoAcc NC_001807 is the wrong chrM --
# see the warning at the top of this file; use NC_012920 next time.
cat << '_EOF_' > hg19.config.ra
# Config parameters for makeGenomeDb.pl:
db hg19
scientificName Homo sapiens
commonName Human
assemblyDate Feb. 2009
assemblyLabel GRCh37 Genome Reference Consortium Human Reference 37 (GCA_000001405.1)
orderKey 14
mitoAcc NC_001807
fastaFiles /hive/data/genomes/hg19/download/ucscChr/*.fa
agpFiles /hive/data/genomes/hg19/download/ucscChr/*.agp
# qualFiles /dev/null
dbDbSpeciesDir human
taxId 9606
'_EOF_'
# << happy emacs
time makeGenomeDb.pl hg19.config.ra > makeGenomeDb.log 2>&1
# real 14m8.958s
featureBits -countGaps hg19 gap
# 239845127 bases of 3137161264 (7.645%) in intersection
featureBits -noRandom -noHap -countGaps hg19 gap
# 234344806 bases of 3095693983 (7.570%) in intersection
# verify featureBits is properly ignoring haps and randoms:
egrep -v "_" chrom.sizes | awk '{sum+=$2;print sum,$0}'
# 3095693983 chrM 16571
# same total as in featureBits
# much later on, discovered that we needed a chrM definition in the
# agp files, added by hand to hg19/M/chrM.agp and hg19/hg19.agp the line:
# chrM 1 16571 1 F NC001807 1 16571 +
# the spaces there are tabs
############################################################################
# running repeat masker (DONE - 2009-03-05 - Hiram)
screen # use screen to manage this day-long job
mkdir /hive/data/genomes/hg19/bed/repeatMasker
cd /hive/data/genomes/hg19/bed/repeatMasker
time doRepeatMasker.pl -bigClusterHub=swarm -buildDir=`pwd` hg19 \
> do.log 2>&1
# real 525m23.521s
cat faSize.rmsk.txt
# 3137161264 bases (239850802 N's 2897310462 real 1431585691
# upper 1465724771 lower) in 93 sequences in 1 files
# %46.72 masked total, %50.59 masked real
featureBits -countGaps hg19 rmsk
# 1465724774 bases of 3137161264 (46.721%) in intersection
# this is odd, 3 bases more in featureBits than were masked ?
# check it out, make a bed file from the featureBits:
featureBits -countGaps -bed=rmsk.bed hg19 rmsk
# went down a sequence of intersections with this idea, but could
# not get it resolved. It appears there are 75 bases in the rmsk
# table that were not masked in the 2bit file ?
# Later on, realized that featureBits does not count lower case N's
# in the "lower" category, but only in the N's category.
# trying a non-split table: drop the per-chrom _rmsk tables first
hgsql -e "show tables;" hg19 | grep _rmsk | while read T
do
hgsql -e "drop table ${T};" hg19
done
hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.fa.out
# hgLoadOut reported (program output, commented so this doc stays sh-clean):
# bad rep range [4385, 4384] line 1348605 of hg19.fa.out
# bad rep range [5563, 5562] line 1563988 of hg19.fa.out
# bad rep range [4539, 4538] line 3111186 of hg19.fa.out
# featureBits still reports 1465724774 bases in rmsk table
# cleaning the hg19.fa.out file:
cp hg19.fa.out hg19.clean.out
# edit hg19.clean.out and remove the three lines:
# 1467 20.7 1.2 17.6 chr14 35056767 35056794 (72292746) + L1ME1 LINE/L1 4385 4384 (1761) 1120962
# 1943 23.8 5.0 12.6 chr15 65775909 65775924 (36755468) + L1MC4 LINE/L1 5563 5562 (2480) 1299299
# 2463 25.1 5.0 11.6 chr3 121291056 121291083 (76731347) + L1M3 LINE/L1 4539 4538 (1608) 2589267
# reload the table
hgsql -e "drop table rmsk;" hg19
hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.clean.out
# try masking with this clean file:
twoBitMask /hive/data/genomes/hg19/hg19.unmasked.2bit hg19.clean.out \
hg19.clean.2bit
twoBitToFa hg19.clean.2bit stdout | faSize stdin > faSize.clean.txt
cat faSize.clean.txt
# this gives the lower by 75 bases result:
# 3137161264 bases (239850802 N's 2897310462 real 1431585763 upper
# 1465724699 lower) in 93 sequences in 1 files
# %46.72 masked total, %50.59 masked real
featureBits -countGaps hg19 rmsk
# 1465724774 bases of 3137161264 (46.721%) in intersection
# is the countGaps interferring ?
featureBits hg19 rmsk
# 1465724774 bases of 2897316137 (50.589%) in intersection
# nope, lets' see what the .out file has:
grep chr hg19.clean.out | sed -e "s/^ *//" | awk '{print $5,$6-1,$7}' \
| sort -k1,1 -k2,2n > hg19.clean.out.bed
featureBits -countGaps hg19 hg19.clean.out.bed
# 1465724774 bases of 3137161264 (46.721%) in intersection
# is it perhaps not masking N's ?
twoBitToFa hg19.clean.2bit stdout | grep n | less
# that does find some lower case n's, find all N's:
findMotif -strand=+ -motif=gattaca -verbose=4 hg19.clean.2bit \
2> findMotif.out
grep "^#GAP" findMotif.out | sed -e "s/#GAP //" > nLocations.bed
# which cover:
featureBits -countGaps hg19 nLocations.bed
# 251299071 bases of 3137161264 (8.010%) in intersection
# overlapping rmsk business with these N locations:
featureBits -countGaps hg19 hg19.clean.out.bed nLocations.bed
# 6494740 bases of 3137161264 (0.207%) in intersection
# and overlapping with gap:
featureBits -countGaps hg19 gap nLocations.bed
# 239845127 bases of 3137161264 (7.645%) in intersection
############################################################################
# running TRF simple repeats (DONE - 2009-03-05 - Hiram)
screen # use screen to manage this day-long job
mkdir /hive/data/genomes/hg19/bed/simpleRepeat
cd /hive/data/genomes/hg19/bed/simpleRepeat
time doSimpleRepeat.pl -bigClusterHub=pk -workhorse=hgwdev \
-smallClusterHub=pk -buildDir=`pwd` hg19 > do.log 2>&1
# real 33m25.815s
# add the TRF mask on top of the RM-cleaned 2bit to produce the final
# masked hg19.2bit (paths are relative to /hive/data/genomes/hg19)
twoBitMask bed/repeatMasker/hg19.clean.2bit \
-add bed/simpleRepeat/trfMask.bed hg19.2bit
twoBitToFa hg19.2bit stdout | faSize stdin > faSize.hg19.2bit.txt
# 3137161264 bases (239850802 N's 2897310462 real 1430387259 upper
# 1466923203 lower) in 93 sequences in 1 files
# %46.76 masked total, %50.63 masked real
############################################################################
# prepare cluster data (DONE - 2009-03-06 - Hiram)
cd /hive/data/genomes/hg19
rm /gbdb/hg19/hg19.2bit
ln -s `pwd`/hg19.2bit /gbdb/hg19/hg19.2bit
# build the 11-mer overused-tile file used by blat/genbank alignments
time blat hg19.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
# Wrote 30675 overused 11-mers to 11.ooc
# real 3m11.302s
mkdir /hive/data/staging/data/hg19
cp -p hg19.2bit /hive/data/staging/data/hg19
cp -p 11.ooc /hive/data/staging/data/hg19
cp -p chrom.sizes /hive/data/staging/data/hg19
# split the genome three ways: primary chroms, haplotypes, scaffolds
mkdir separateChrs
cd separateChrs
grep -v "_" ../chrom.sizes | awk '{print $1}' | while read C
do
twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.chrOnly.2bit
twoBitInfo hg19.chrOnly.2bit stdout | sort -k2,2nr > chrOnly.chrom.sizes
grep "_hap" ../chrom.sizes | awk '{print $1}' | while read C
do
twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.hapOnly.2bit
twoBitInfo hg19.hapOnly.2bit stdout | sort -k2,2nr > hapOnly.chrom.sizes
grep "_" ../chrom.sizes | grep -v "_hap" | awk '{print $1}' | while read C
do
twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.scaffolds.2bit
twoBitInfo hg19.scaffolds.2bit stdout | sort -k2,2nr > scaffolds.chrom.sizes
cp -p *.2bit *.sizes /hive/data/staging/data/hg19
# ask admin to sync this directory: /hive/data/staging/data/hg19/
# to the kluster nodes /scratch/data/hg19/
############################################################################
# running cpgIsland business (DONE - 2009-03-06 - Hiram)
mkdir /hive/data/genomes/hg19/bed/cpgIsland
cd /hive/data/genomes/hg19/bed/cpgIsland
cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
# comment out the following two lines if it compiles cleanly
# some day (there were some other fixups too, adding include lines)
sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c
mv tmp.c cpg_lh.c
make
cd ../../
ln -s hg3rdParty/cpgIslands/cpglh.exe
# cpglh wants hard-masked sequence: one hard-masked fa per chrom
mkdir -p hardMaskedFa
cut -f1 ../../chrom.sizes | while read C
do
echo ${C}
twoBitToFa ../../hg19.2bit:$C stdout \
| maskOutFa stdin hard hardMaskedFa/${C}.fa
done
cut -f1 ../../chrom.sizes > chr.list
# parasol batch: one cpglh job per chromosome
cat << '_EOF_' > template
#LOOP
./runOne $(root1) {check out line results/$(root1).cpg}
#ENDLOOP
'_EOF_'
# << happy emacs
cat << '_EOF_' > runOne
#!/bin/csh -fe
./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
mv /scratch/tmp/$1.$$ $2
'_EOF_'
# << happy emacs
gensub2 chr.list single template jobList
para create jobList
para try
para check ... etc
para time
# Completed: 93 of 93 jobs
# CPU time in finished jobs:        172s       2.86m     0.05h    0.00d  0.000 y
# IO & Wait Time:                  1748s      29.14m     0.49h    0.02d  0.000 y
# Average job time:                  21s       0.34m     0.01h    0.00d
# Longest finished job:              34s       0.57m     0.01h    0.00d
# Submission to last job:            83s       1.38m     0.02h    0.00d
# Transform cpglh output to bed +
catDir results | awk '{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}' > cpgIsland.bed
cd /hive/data/genomes/hg19/bed/cpgIsland
hgLoadBed hg19 cpgIslandExt -tab \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 28226 elements of size 10
# Sorted
# Saving bed.tab
# Loading hg19
############################################################################
# create lift file on unBridged gaps for genbank splits (2009-03-09 - Hiram)
mkdir /hive/data/genomes/hg19/bed/gap
cd /hive/data/genomes/hg19/bed/gap
# the lift file lets the genbank pipeline split chroms at unbridged gaps
gapToLift hg19 hg19.unBridged.lift -bedFile=unBridged.lift.bed
cp -p hg19.unBridged.lift ../../jkStuff
cp -p hg19.unBridged.lift /hive/data/staging/data/hg19
############################################################################
# AUTO UPDATE GENBANK RUN  (DONE - 2009-03-07,13 - Hiram)
# align with latest genbank process.
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add hg19 just after hg18
# (the following lines are genbank.conf file content, not shell commands)
# hg19 - GRCh37 - Genome Reference Consortium Human Reference 37
# Assembly Accession: GCA_000001405.1
hg19.serverGenome = /hive/data/genomes/hg19/hg19.2bit
hg19.clusterGenome = /scratch/data/hg19/hg19.2bit
hg19.ooc = /scratch/data/hg19/11.ooc
hg19.lift = /hive/data/genomes/hg19/jkStuff/hg19.unBridged.lift
# hg19.hapRegions = /hive/data/genomes/hg19/bed/haplotypePos/haplotypePos.psl
hg19.refseq.mrna.native.pslCDnaFilter    = ${finished.refseq.mrna.native.pslCDnaFilter}
hg19.refseq.mrna.xeno.pslCDnaFilter      = ${finished.refseq.mrna.xeno.pslCDnaFilter}
hg19.genbank.mrna.native.pslCDnaFilter   = ${finished.genbank.mrna.native.pslCDnaFilter}
hg19.genbank.mrna.xeno.pslCDnaFilter     = ${finished.genbank.mrna.xeno.pslCDnaFilter}
hg19.genbank.est.native.pslCDnaFilter    = ${finished.genbank.est.native.pslCDnaFilter}
hg19.genbank.est.xeno.pslCDnaFilter      = ${finished.genbank.est.xeno.pslCDnaFilter}
hg19.genbank.est.xeno.load = yes
hg19.refseq.mrna.xeno.load  = yes
hg19.refseq.mrna.xeno.loadDesc = yes
hg19.mgc = yes
hg19.orfeome = yes
hg19.downloadDir = hg19
# hg19.ccds.ncbiBuild = 36.3
# hg19.upstreamGeneTbl = refGene
# hg19.upstreamMaf = multiz28way
# /hive/data/genomes/hg19/bed/multiz28way/species.lst multiz44way
# /hive/data/genomes/hg19/bed/multiz44way/species.list
hg19.genbank.mrna.blatTargetDb = yes
cvs ci -m "Added hg19." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
ssh genbank
screen          #  use a screen to manage this job
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial hg19 &
# logFile: var/build/logs/2009.03.10-20:28:44.hg19.initalign.log
# real 2761m13.680s
# that ran on the swarm with little interference and no problems
# load database when finished
ssh hgwdev
screen	# use screen to manage this long running command
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad hg19 &
# logFile: var/dbload/hgwdev/logs/2009.03.12-21:10:02.dbload.log
# real 369m11.941s
# enable daily alignment and update of hgwdev (DONE - 2009-02-24 - Hiram)
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add hg19 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added hg19 - Human - GRCh37" etc/align.dbs etc/hgwdev.dbs
make etc-update
#########################################################################
# BLATSERVERS ENTRY (DONE - 2009-03-09 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
# two rows: port 17778 with isTrans=1, port 17779 with canPcr=1
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("hg19", "blat13", "17778", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("hg19", "blat13", "17779", "0", "1");' \
hgcentraltest
#	test it with some sequence
############################################################################
# Making download files (DONE - 2009-03-13 - Hiram)
cd /hive/data/genomes/hg19
# builds the goldenPath download hierarchy for this assembly
makeDownloads.pl -allowMissedTrfs -noChromRoot hg19 \
> downloads.log 2>&1
############################################################################
# Venter1 chain, net experiment (DONE - Hiram - 2009-03-15)
# stepwise doBlastzChainNet run: partition, blastz, cat..net, load, synNet
doBlastzChainNet.pl `pwd`/DEF \
-stop=partition -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > partition.log 2>&1
doBlastzChainNet.pl `pwd`/DEF \
-continue=blastz -stop=blastz -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > blastz.log 2>&1
doBlastzChainNet.pl `pwd`/DEF \
-continue=cat -stop=net -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > net.log 2>&1
# real 163m28.438s
# to load, run it in debug, then check the load script
doBlastzChainNet.pl `pwd`/DEF \
-noLoadChainSplit -continue=load -stop=load -bigClusterHub=swarm \
-debug -smallClusterHub=swarm -chainMinScore=1000 \
-chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > load.log 2>&1
# and create a synNet for multiz, run in debug, and examine script
# to make sure it works correctly
doBlastzChainNet.pl `pwd`/DEF \
-syntenicNet -continue=syntenicNet -stop=syntenicNet \
-debug -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > synNet.log 2>&1
# real 31m11.216s
############################################################################
# reset position to chr6 haplotype situation
hgsql -e \
'update dbDb set defaultPos="chr6:28343766-33555363" where name="hg19";' \
hgcentraltest
# reset to a smaller range (2009-04-24 - Brooke)
# this is the SOD1 gene, implicated in Lou Gehrig's disease.
hgsql -e \
'update dbDb set defaultPos="chr21:33,031,597-33,041,570" where name="hg19";' \
hgcentraltest
############################################################################
# Self Lastz run (DONE - 2009-03-19 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
cd /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
# NOTE: the heredoc must be written to the DEF file (the redirect was
# missing here); the doBlastzChainNet.pl commands below read `pwd`/DEF
cat << '_EOF_' > DEF
# human vs human
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from hg18 vs venter1 lastz on advice from Webb
BLASTZ_K=10000
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Human Hg19
SEQ2_DIR=/scratch/data/hg19/hg19.2bit
SEQ2_LEN=/scratch/data/hg19/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
screen # use screen to manage this long-running job
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-workhorse=hgwdev \
-stop=net -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
# cluster difficulties, finished manually, then:
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-continue=cat -workhorse=hgwdev \
-stop=net -smallClusterHub=pk -bigClusterHub=swarm > cat.log 2>&1 &
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-continue=load -debug -workhorse=hgwdev \
-stop=load -smallClusterHub=pk -bigClusterHub=swarm > load.debug.log 2>&1 &
# that indicates it would do:
hgLoadChain -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
# adding -normScore
hgLoadChain -normScore -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
############################################################################
# Chimp Lastz run (DONE - 2009-03-19 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
cd /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
# NOTE: the heredoc must be written to the DEF file (the redirect was
# missing here); the doBlastzChainNet.pl commands below read `pwd`/DEF
cat << '_EOF_' > DEF
# human vs chimp
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Chimp PanTro2
SEQ2_DIR=/scratch/data/panTro2/panTro2.2bit
SEQ2_LEN=/scratch/data/panTro2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
screen # use screen to manage this long-running job
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
# real 173m22.880s
# cluster problems, continuing after lastz done:
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=cat \
-stop=net -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> net.log 2>&1 &
# real 81m20.209s
# continuing with the load and adding syntenicNet
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=load \
-syntenicNet -noLoadChainSplit -chainMinScore=5000 \
-chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> load.log 2>&1 &
# real 47m17.871s
cat fb.hg19.chainPanTro2Link.txt
# 2747983350 bases of 2897316137 (94.846%) in intersection
# running the swap - DONE - 2009-05-24
ssh swarm
mkdir /hive/data/genomes/panTro2/bed/blastz.hg19.swap
cd /hive/data/genomes/panTro2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-swap /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \
> swap.log 2>&1 &
# real 723m41.377s
cat fb.panTro2.chainHg19Link.txt
# 2761343871 bases of 2909485072 (94.908%) in intersection
############################################################################
# Creating the pushQ entry (DONE - 2009-03-20 - Hiram)
mkdir /hive/data/genomes/hg19/pushQ
cd /hive/data/genomes/hg19/pushQ
makePushQSql.pl hg19 > hg19.pushQ.sql 2> make.err
# many complaints about the chain and net tables from the haplotype
# experiments, and this table:
# orfeomeGenes
# which is probably in genbank, and these usual ones:
# hg19 does not have seq
# hg19 does not have extFile
############################################################################
# Determine PAR region of X and Y (DONE - 2009-03-20 - Hiram)
# PAR is identified as the set of clones shared between the chrX and
# chrY AGP files.
mkdir /hive/data/genomes/hg19/bed/parRegion
cd /hive/data/genomes/hg19/bed/parRegion
awk '$5 != "N"' ../../X/chrX.agp | awk '{print $6}' | sort > chrX.cloneList
awk '$5 != "N"' ../../Y/chrY.agp | awk '{print $6}' | sort > chrY.cloneList
# clones present in both lists
comm -12 chrX.cloneList chrY.cloneList > chrXY.par.clone.list
cat chrXY.par.clone.list \
| while read C; do grep "${C}" ../../X/chrX.agp; done \
| sort -k1,1 -k2,2n >> chrX.par.region.agp
cat chrXY.par.clone.list \
| while read C; do grep "${C}" ../../Y/chrY.agp; done \
| sort -k1,1 -k2,2n >> chrY.par.region.agp
awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrY.par.region.agp \
> chrY.par.region.bed
awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrX.par.region.agp \
> chrX.par.region.bed
# use those bed files in custom tracks on hg19 to verify that they
# are two continuous regions with only gaps between these items
# these location extents are: (zero relative)
# chrX 60000 2722842
# chrX 154906585 155260560
# chrY 10000 2649520
# chrY 59034049 59363566
############################################################################
# Gorilla Lastz run (DONE - 2009-03-21,05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
cd /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
# NOTE: the heredoc must be written to the DEF file (the redirect was
# missing here); the doBlastzChainNet.pl command below reads `pwd`/DEF
cat << '_EOF_' > DEF
# human vs gorilla
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Gorilla gorGor1
SEQ2_DIR=/scratch/data/gorGor1/gorGor1.2bit
SEQ2_LEN=/scratch/data/gorGor1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
screen # use screen to manage this long-running job
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> do.log 2>&1 &
cat fb.hg19.chainGorGor1Link.txt
# 1723432141 bases of 2897316137 (59.484%) in intersection
doRecipBest.pl -buildDir=`pwd` hg19 gorGor1 > rbest.log 2>&1
############################################################################
# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR LASTZ (DONE - 2009-04-02 - Hiram)
ssh pk
mkdir /hive/data/genomes/hg19/bed/linSpecRep
cd /hive/data/genomes/hg19/bed/linSpecRep
#	create individual .out files from the master record in ../repeatMasker
mkdir splitOut
cat << '_EOF_' > split.csh
#!/bin/csh -fe
set C = $1
head -3 ../repeatMasker/hg19.clean.out > splitOut/${C}.out
grep "${C} " ../repeatMasker/hg19.clean.out >> splitOut/${C}.out
'_EOF_'
# << happy emacs
cat << '_EOF_' > template
#LOOP
split.csh $(root1) {check out line+ splitOut/$(root1).out}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 ../../chrom.sizes > chrom.list
gensub2 chrom.list single template jobList
para create jobList
para try ... check ... push ... etc...
# Completed: 93 of 93 jobs
# CPU time in finished jobs:        127s       2.12m     0.04h    0.00d  0.000 y
# IO & Wait Time:                 17154s     285.90m     4.76h    0.20d  0.001 y
# Average job time:                 186s       3.10m     0.05h    0.00d
# Longest finished job:             224s       3.73m     0.06h    0.00d
# Submission to last job:           280s       4.67m     0.08h    0.00d
# now, we can date and process each of those .out files
# this really should be a single creation of notInOthers
# These four different ones all end up to be the same anyhow
# the notInMouse becomes notInOthers below and the others are removed.
mkdir dateRepeats
cd dateRepeats
cat << '_EOF_' > mkLSR
#!/bin/csh -fe
rm -f $1.out_mus-musculus_rattus_canis-familiaris_bos-taurus
ln -s ../splitOut/$1.out .
/scratch/data/RepeatMasker/DateRepeats \
$1.out -query human -comp mouse -comp rat -comp dog -comp cow
rm $1.out
mkdir -p ../notInMouse ../notInRat ../notInDog ../notInCow
/cluster/bin/scripts/extractRepeats 1 $1.out_mus*-taurus \
> ../notInMouse/$1.out.spec
/cluster/bin/scripts/extractRepeats 2 $1.out_mus*-taurus \
> ../notInRat/$1.out.spec
/cluster/bin/scripts/extractRepeats 3 $1.out_mus*-taurus \
> ../notInDog/$1.out.spec
/cluster/bin/scripts/extractRepeats 4 $1.out_mus*-taurus \
> ../notInCow/$1.out.spec
'_EOF_'
# << happy emacs
chmod +x mkLSR
cat << '_EOF_' > template
#LOOP
./mkLSR $(path1) {check out line+ $(path1).out_mus-musculus_rattus_canis-familiaris_bos-taurus}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../chrom.list single template jobList
para try ... check ... push ... etc...
para time
# Completed: 93 of 93 jobs
# CPU time in finished jobs:       2441s      40.69m     0.68h    0.03d  0.000 y
# IO & Wait Time:                   332s       5.53m     0.09h    0.00d  0.000 y
# Average job time:                  30s       0.50m     0.01h    0.00d
# Longest finished job:             125s       2.08m     0.03h    0.00d
# Submission to last job:           454s       7.57m     0.13h    0.01d
# NOTE(review): the following 'done' has no matching loop in this section;
# it appears to be a stray completion marker -- confirm before executing
done
# these four types of out.spec results all turn out to be identical
# To check identical
cd /hive/data/genomes/hg19/bed/linSpecRep
find . -name "*.out.spec" | \
while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \
| sort -k1,1n | sort -t"/" -k3,3 | sed -e "s#./notIn.*/##" \
| sort | uniq -c | less
# You will see they are all a count of 4
# Set them up on scratch data and get to all the kluster nodes:
mkdir /hive/data/staging/data/hg19/lineageSpecificRepeats
cd notInMouse
rsync -a --progress ./ /hive/data/staging/data/hg19/lineageSpecificRepeats
cd ..
mv notInMouse notInOthers
# do not need to keep all of these
rm -fr notInRat notInDog notInCow
# We also need the nibs for blastz runs with lineage specific repeats
mkdir /hive/data/genomes/hg19/bed/nibs
cd /hive/data/genomes/hg19/bed/nibs
cut -f1 ../../chrom.sizes | while read C
do
twoBitToFa -seq=${C} ../../hg19.2bit stdout \
| faToNib -softMask stdin ${C}.nib
echo "${C} done"
done
mkdir /hive/data/staging/data/hg19/nib
rsync -a --progress ./ /hive/data/staging/data/hg19/nib
# Ask cluster-admin to sync /scratch/ filesystem to kluster nodes
#############################################################################
# create gc5Base download file (DONE - 2009-04-24 - Hiram)
cd /hive/data/genomes/hg19/bed/gc5Base
# 5-base-window GC percent as wiggle ascii, compressed for download
hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 hg19 \
/cluster/data/hg19/hg19.2bit | gzip -c > hg19.gc5Base.txt.gz
#############################################################################
# Physical Map Contigs - ctgPos (DONE - 2009-04-23 - Hiram)
mkdir /hive/data/genomes/hg19/bed/ctgPos
cd /hive/data/genomes/hg19/bed/ctgPos
# emit ctgPos rows (contig, size, chrom, chromStart, chromEnd) for the
# assembled chromosomes from their AGP files
cat << '_EOF_' > mkCtgPos.sh
AGP="/hive/data/genomes/hg19/download/assembled_chromosomes/AGP"
export AGP
for F in `(cd ${AGP}; ls chr*.agp | grep -v ".comp.agp")`
do
C=${F/.agp/}
grep "^CM" "${AGP}/${F}" | awk '$5 != "N"' | awk '
{
printf "%s\t%d\t%s\t%d\t%d\n", $6, $8-$7+1, "'${C}'", $2-1+$7-1, $2-1+$8
}
'
done
'_EOF_'
# << happy emacs
chmod +x mkCtgPos.sh
./mkCtgPos.sh > ctgPos.tab
# same for the unlocalized (chr*_gl*_random) scaffolds, merging adjacent
# AGP lines that belong to the same GL contig
cat << '_EOF_' > mkRanCtgPos.sh
AGP="/hive/data/genomes/hg19/download/unlocalized_scaffolds/AGP"
export AGP
for F in `(cd ${AGP}; ls chr*.agp)`
do
C=${F/.unlocalized.scaf.agp/}
c=${C/chr/}
export C c
grep "^GL" "${AGP}/${F}" | awk '$5 != "N"' | awk '
BEGIN {
ctgName=""
ctgStart=0
ctgEnd=0
chrom="'${c}'"
ctgNameLower=""
}
{
if (match(ctgName,$1)) {
ctgEnd = $3
} else {
if (length(ctgName) > 0) {
size=ctgEnd - ctgStart
printf "%s\t%d\tchr%s_%s_random\t%d\t%d\n", ctgName, size, chrom, ctgNameLower,
ctgStart, ctgEnd
}
ctgStart = $2 - 1
ctgEnd = $3
ctgName = $1
ctgNameLower = tolower($1)
sub(".1$","",ctgNameLower)
}
}
END {
size=ctgEnd - ctgStart
printf "%s\t%d\tchr%s_%s_random\t%d\t%d\n", ctgName, size, chrom, ctgNameLower,
ctgStart, ctgEnd
}
'
done
'_EOF_'
# << happy emacs
chmod +x mkRanCtgPos.sh
./mkRanCtgPos.sh >> ctgPos.tab
# fetch .sql definition from hg18
chmod 777 .
hgsqldump --all -c --tab=. hg18 ctgPos
chmod 775 .
hgsql hg19 < ctgPos.sql
hgsql -e 'load data local infile "ctgPos.tab" into table ctgPos;' hg19
#############################################################################
# CLONE ENDS - first step for BACEND/CytoBand tracks
# (DONE - 2009-04-28 - Hiram)
mkdir -p /hive/data/genomes/hg19/bed/cloneend/ncbi
cd /hive/data/genomes/hg19/bed/cloneend/ncbi
wget --timestamping \
'ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/9606_clone_ends*.mfa.gz'
wget --timestamping \
'ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/9606_clone_info*.txt.gz'
cd /hive/data/genomes/hg19/bed/cloneend
# seems like the *.mfa files were split just for convenience
# concatenate
for F in ncbi/*.mfa.gz
do
zcat "${F}"
echo "${F}" 1>&2
done | gzip > all.mfa.gz
# that 1>&2 echos to stderr so you can see the file name and not
# interfere with the pipe stdout output to gzip
# Convert the title line of the all.mfa file
# (strip the ">gi|NNN|gb|" / ">gi|NNN|emb|" prefix and the ".N|..." suffix,
# leaving the bare accession as the sequence name)
zcat all.mfa.gz \
| sed -e "s#^>gi.[0-9]*.gb.#>#; s#^>gi.[0-9]*.emb.#>#; s#\.[0-9]|.*##" \
| gzip > cloneEnds.fa.gz
# NOTE(review): the next line looks like a leftover alternative recipe --
# it reads an uncompressed all.mfa (only all.mfa.gz exists at this point)
# and would overwrite the cloneEnds.fa.gz just created; presumably only one
# of the two commands was actually used -- confirm
zcat all.mfa | ./convert.pl | gzip > cloneEnds.fa.gz
# make sure nothing got broken:
faSize all.mfa.gz
# 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
# in 833173 sequences in 1 files
faSize cloneEnds.fa.gz
# 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
# in 833173 sequences in 1 files
# identical numbers
# you can also carefully check the names:
zcat all.mfa.gz | grep "^>" | awk -F'|' '{print $4}' \
| sed -e "s/\.[0-9]$//" | sort > mfa.names
# should be the same as:
zcat cloneEnds.fa.gz | grep "^>" | sed -e "s/>//" | sort > clone.names
# concatenate the text files, too
bash
for F in ncbi/*.txt.gz
do
zcat "${F}"
echo "${F}" 1>&2
done | gzip > all.txt.gz
# generate cloneEndPairs.txt and cloneEndSingles.txt
zcat all.txt.gz >all.txt
$HOME/kent/src/hg/utils/cloneEndParse.pl all.txt
# Reading in end info
# Writing out pair info
# Writing out singleton info
# 302264 pairs and 203094 singles
# examined all the clone names and all the bac end names in these two
# files and compared with business from all.txt to make sure we properly
# classified all of them correctly. We had 833,173 clone sequences,
# and 501,135 bac end names
# faSplit does not function correctly if given a .gz source file
# AND, we need the unzipped file for sequence loading below
gunzip cloneEnds.fa.gz
# split
mkdir splitdir
cd splitdir
faSplit sequence ../cloneEnds.fa 100 cloneEnds
# Check to ensure no breakage:
cat *.fa | faSize stdin
# 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
# in 833173 sequences in 1 files
# same numbers as before
# load sequences
ssh hgwdev
mkdir /gbdb/hg19/cloneend
cd /gbdb/hg19/cloneend
ln -s /hive/data/genomes/hg19/bed/cloneend/cloneEnds.fa .
cd /tmp
hgLoadSeq hg19 /gbdb/hg19/cloneend/cloneEnds.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/hg19/cloneend/cloneEnds.fa
# 833173 sequences
# Updating seq table
# Advisory lock has been released
# All done
##############################################################################
# BACEND SEQUENCE ALIGNMENTS (DONE - 2009-04-28,05-20 - Hiram)
mkdir -p /hive/data/genomes/hg19/bed/bacends/run.blat
cd /hive/data/genomes/hg19/bed/bacends/run.blat
# going to run separate runs for the golden path sequence vs. the
# randoms, haplotypes, chrUn and chrM
partitionSequence.pl 5000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -xdir xdir.sh -lstDir tParts \
| egrep -v "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://; s/:/./" > hg19.list
ls -1S /hive/data/genomes/hg19/bed/cloneend/splitdir/cloneEnds*.fa \
> bacEnds.list
ssh swarm
cd /hive/data/genomes/hg19/bed/bacends/run.blat
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(path2) {check out line+ psl/$(root1)/$(file1).$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
# runOne.csh: blat one 2bit partition (chr.start-end) against one cloneEnds
# split file, then lift the partition-relative psl back to chromosome
# coordinates via a one-line lift file written in /scratch/tmp
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set target = $1
set query = $2
set result = $3
set partSpec = `echo $target | sed -e "s/\./:/"`
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set dir = $result:h
set chr = `echo $target | sed -e "s/\..*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
set tmpFile = `echo $result | sed -e "s#psl/$chr/#/scratch/tmp/#; s/.psl//"`
# echo $tmpFile
# echo "chr: $chr $start $end -> size: $chrSize, range: $range"
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p $dir
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec $query $tmpFile.psl
rm -f $result
liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
# << happy emacs
gensub2 hg19.list bacEnds.list template jobList
para create jobList
# 62034 jobs in batch
# these jobs run quickly, limit them to 250 at a time
para try, check, -maxJob=250 push, etc ...
# Completed: 62034 of 62034 jobs
# CPU time in finished jobs:     506023s    8433.72m   140.56h    5.86d  0.016 y
# IO & Wait Time:                175853s    2930.88m    48.85h    2.04d  0.006 y
# Average job time:                  11s       0.18m     0.00h    0.00d
# Longest finished job:             752s      12.53m     0.21h    0.01d
# Submission to last job:          3533s      58.88m     0.98h    0.04d
# combine the alignments
time pslSort dirs raw.psl temp psl/chr*
# 62034 files in 24 dirs
# Got 62034 files 249 files per mid file
# real    81m2.820s
# -rw-rw-r--   1 13410334441 Apr 29 12:00 raw.psl
# cleanup
rmdir temp
# keep near-best alignments covering most of each read
time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 &
# real    5m55.990s
# Processed 106254032 alignments
# -rw-rw-r--  1 372734361 Apr 29 12:56 bacEnds.psl
wc -l bacEnds.psl
# 2852977 bacEnds.psl
# pair up end alignments into clone placements; classifies into
# pairs / slop / short / long / mismatch / orphan files
time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl \
/cluster/data/hg19/bed/cloneend/cloneEndPairs.txt \
all_bacends bacEnds
# Reading pair file
# Reading psl file
# Creating Pairs
# Writing to files
# real    0m18.851s
# this creates the files:
# -rw-rw-r--  1  21178741 Apr 29 13:00 bacEnds.pairs
# -rw-rw-r--  1   5250873 Apr 29 13:00 bacEnds.orphan
# -rw-rw-r--  1    738045 Apr 29 13:00 bacEnds.short
# -rw-rw-r--  1    463560 Apr 29 13:00 bacEnds.slop
# -rw-rw-r--  1    146369 Apr 29 13:00 bacEnds.mismatch
# -rw-rw-r--  1      3528 Apr 29 13:00 bacEnds.long
# filter and sort
awk '$5 >= 300' bacEnds.pairs | sort -k1,1 -k2,2n > bacEndPairs.bed
awk '$5 >= 300' bacEnds.slop bacEnds.short bacEnds.long \
bacEnds.mismatch bacEnds.orphan | sort -k1,1 -k2,2n > bacEndPairsBad.bed
extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
bacEndPairsBad.bed | headRest 2 stdin | sort -k14,14 -k16,16n \
> bacEndPairs.load.psl
############################################################################
# BACEND Randoms SEQUENCE ALIGNMENTS (DONE - 2009-04-28,05-20 - Hiram)
mkdir -p /hive/data/genomes/hg19/bed/bacends/run.randoms
cd /hive/data/genomes/hg19/bed/bacends/run.randoms
# this separate run for the randoms, haplotypes, chrUn and chrM
partitionSequence.pl 5000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -xdir xdir.sh -lstDir tParts \
| egrep "random|_hap|chrUn" \
| sed -e "s/.*2bit://; s/:/./" > random.list
cat tParts/*.lst | sed -e "s/.*2bit://; s/:/./" >> random.list
ls -1S /hive/data/genomes/hg19/bed/cloneend/splitdir/cloneEnds*.fa \
> bacEnds.list
ssh swarm
cd /hive/data/genomes/hg19/bed/bacends/run.randoms
gensub2 random.list bacEnds.list ../run.blat/template jobList
# very similar runOne.csh script as above, but it doesn't need to do
# the lift
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set target = $1
set query = $2
set result = $3
set partSpec = `echo $target | sed -e "s/\./:/"`
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set dir = $result:h
set chr = `echo $target | sed -e "s/\..*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
set tmpFile = `echo $result | sed -e "s#psl/$chr/#/scratch/tmp/#; s/.psl//"`
# echo $tmpFile
# echo "chr: $chr $start $end -> size: $chrSize, range: $range"
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p $dir
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec $query $tmpFile.psl
rm -f $result
mv $tmpFile.psl $result
echo rm -f $tmpFile.lift
'_EOF_'
# << happy emacs
# NOTE(review): the script's last line echoes the rm command rather than
# running it, so the .lift files are left behind in /scratch/tmp --
# presumably a debugging leftover, harmless since the lift is unused here
# these jobs run fast, do not let too many of them run
para -maxJob=100 try...check...push
para time
# Completed: 6762 of 6762 jobs
# CPU time in finished jobs:      20357s     339.29m     5.65h    0.24d  0.001 y
# IO & Wait Time:                 17839s     297.31m     4.96h    0.21d  0.001 y
# Average job time:                   6s       0.09m     0.00h    0.00d
# Longest finished job:             261s       4.35m     0.07h    0.00d
# Submission to last job:           508s       8.47m     0.14h    0.01d
time pslSort dirs raw.psl temp psl/chr*
# 6762 files in 69 dirs
# Got 6762 files 82 files per mid file
# real    6m37.177s
# 37044 files in 98 dirs
# Got 37044 files 192 files per mid file
# real    32m24.804s
# -rw-rw-r--    1 6487445210 Feb  2 21:08 raw.psl
time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl randomEnds.psl randomReps.psr > pslReps.out 2>&1 &
# real    0m5.761s
# Processed 1254273 alignments
# cleanup
rmdir temp
wc -l randomEnds.psl
# 367567 randomEnds.psl
# same pairing parameters as the main run above
time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose randomEnds.psl \
/cluster/data/hg19/bed/cloneend/cloneEndPairs.txt \
all_bacends bacEnds
# Reading pair file
# Reading psl file
# Creating Pairs
# Writing to files
# real    0m11.221s
# this creates the files:
# -rw-rw-r--  1      0 Apr 29 14:53 bacEnds.slop
# -rw-rw-r--  1      0 Apr 29 14:53 bacEnds.short
# -rw-rw-r--  1      0 Apr 29 14:53 bacEnds.mismatch
# -rw-rw-r--  1      0 Apr 29 14:53 bacEnds.long
# -rw-rw-r--  1 141836 Apr 29 14:53 bacEnds.pairs
# -rw-rw-r--  1 649907 Apr 29 14:53 bacEnds.orphan
##############################################################################
# BacEnds track - both results loaded together (DONE - 2009-04-29 - Hiram)
ssh hgwdev
cd /hive/data/genomes/hg19/bed/bacends
# filter and sort
# keep placements scoring >= 300 from both runs as the good pairs track
awk '$5 >= 300' run.blat/bacEnds.pairs run.randoms/bacEnds.pairs \
| sort -k1,1 -k2,2n > bacEndPairs.bed
awk '$5 >= 300' run.blat/bacEnds.slop run.blat/bacEnds.short \
run.blat/bacEnds.long run.blat/bacEnds.mismatch \
run.blat/bacEnds.orphan run.randoms/bacEnds.slop \
run.randoms/bacEnds.short run.randoms/bacEnds.long \
run.randoms/bacEnds.mismatch run.randoms/bacEnds.orphan \
| sort -k1,1 -k2,2n > bacEndPairsBad.bed
# merge the psl results: keep the 5-line psl header from one file,
# then the sorted union of both alignment sets
head -5 run.blat/bacEnds.psl > bacEnds.psl
headRest 5 run.blat/bacEnds.psl > t.psl
headRest 5 run.randoms/randomEnds.psl >> t.psl
sort -k14,14 -k16,16n t.psl >> bacEnds.psl
extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
bacEndPairsBad.bed | headRest 2 stdin | sort -k14,14 -k16,16n \
> bacEnds.load.psl
# load them into the database
ssh hgwdev
cd /hive/data/genomes/hg19/bed/bacends
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $4}' bacEndPairs.bed | grep " "
awk '{print $5}' bacEndPairs.bed | sort | uniq -c
# result should be the scores, no extraneous strings:
# 156984 1000
#    195 300
#    316 375
#    297 500
#   1476 750
# edit the file and fix it if it has a bad name.
hgLoadBed -notItemRgb hg19 bacEndPairs bacEndPairs.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 208922 elements of size 11
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed -notItemRgb hg19 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 79004 elements of size 11
#hgLoadPsl hg18 -nobin -table=all_bacends bacEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg19 -table=all_bacends bacEnds.load.psl
# one complaint, there appears to be a bogus insert count in one
# of the blat results:
# < 585   797     67      0       3       2       -63     9       79188   +       AQ743980        852     42      846     chr19_gl000208_random   92689   4045    84100   11      14,124,84,496,53,6,20,28,28,10,4,       42,56,180,200,696,750,756,776,804,832,842,      4045,5767,7086,83449,83946,83999,84006,84027,84056,84085,84096,
# Became:
# > 585   797     67      0       3       2       0       9       79188   +       AQ743980        852     42      846     chr19_gl000208_random   92689   4045    84100   11      14,124,84,496,53,6,20,28,28,10,4,       42,56,180,200,696,750,756,776,804,832,842,      4045,5767,7086,83449,83946,83999,84006,84027,84056,84085,84096,
# sanity-check row counts and coverage against the previous two assemblies
hgsql -N -e "select count(*) from all_bacends;" hg19
#	2289275
hgsql -N -e "select count(*) from all_bacends;" hg18
#	1727387
hgsql -N -e "select count(*) from all_bacends;" hg17
#	1729146
nice featureBits hg19 all_bacends
# 230917362 bases of 2897316137 (7.970%) in intersection
nice featureBits hg18 all_bacends
# 227770876 bases of 2881515245 (7.905%) in intersectio
nice featureBits hg17 all_bacends
# 225763317 bases of 2866216770 (7.877%) in intersection
nice featureBits hg19 bacEndPairs
# 236889607 bases of 2897316137 (8.176%) in intersection
nice featureBits hg18 bacEndPairs
# 162690030 bases of 2881515245 (5.646%) in intersection
nice featureBits hg17 bacEndPairs
# 162099487 bases of 2866216770 (5.656%) in intersection
nice featureBits hg19 bacEndPairsBad
# 38344094 bases of 2897316137 (1.323%) in intersection
nice featureBits hg18 bacEndPairsBad
# 37326990 bases of 2881515245 (1.295%) in intersection
nice featureBits hg17 bacEndPairsBad
# 37437558 bases of 2866216770 (1.306%) in intersection
############################################################################
# STS MARKERS (DONE - 2009-04-30 - 2009-05-06 - Hiram)
mkdir /hive/data/outside/ncbi/sts.2009-04
cd /hive/data/outside/ncbi
ln -s sts.2009-04 sts.11
cd /hive/data/outside/ncbi/sts.2009-04
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
wget --timestamping ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
gunzip sts.gz
mv sts dbSTS.fa
# these items are copied in from the previous builds
cp -p /cluster/data/ncbi/sts.10/all.STS.fa ./all.STS.fa.prev
cp -p /cluster/data/ncbi/sts.10/stsInfo2.bed ./stsInfo2.bed.prev
# edit stsInfo2.bed.prev for a
# manual fixup of error that is in the hg18 bed file, replace
# the line for AFM067XA9 to fix bogus long list of aliases to be:
# 22788^IAFM067XA9^I1^IZ66598^I1^IGDB:1221611,^I5^I067XA9,GDB:1221611,W202,Z66598,SWSS2303^I69047^I0^I^ITCTTGGGGTTTAATTGCTTT^ICTTTGCCACAATCTTACACA^I149^IHomo sapiens^I1^I2^I6453,6454,^I0^I^I^I^I0^I0^I^I^I0^I0^IAFM067XA9^Ichr7^I145^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0
# as taken directly out of the hg18.stsInfo2 table which was fixed
# by Bob and Archana
# Convert the title line of the dbSTS.fa file
# Verify that column 3 only contains gb emb dbj
grep "^>" dbSTS.fa | awk -F'|' '{print $3}' | sort | uniq -c
# 39124 dbj
# 57375 emb
# 1212541 gb
# if that is true, this sed will work:
cat dbSTS.fa \
| sed -e "s#^>gi.[0-9]*.gb.#>#; s#^>gi.[0-9]*.emb.#>#; s#^>gi.[0-9]*.dbj.#>#; s#\.[0-9]|.*##" \
> UniSTS.convert.fa
# get accessions
grep ">" UniSTS.convert.fa | sed -e "s/^>//" | sort > UniSTS.acc
# head and tail that to ensure names are reasonable, odd names would
# show up at the beginning or end
wc -l UniSTS.acc
# 1309040 UniSTS.acc
# NOTE: updateStsInfo creates new stsInfo2.bed, all.primers,
# all.STS.fa, stsAlias.bed files
updateStsInfo -verbose=1 -gb=UniSTS.acc stsInfo2.bed.prev all.STS.fa.prev \
UniSTS.sts UniSTS.aliases UniSTS.convert.fa new
# verify the number of aliases is reasonable:
awk '{print $3}' new.alias | sort | uniq -c | sort -rn | less
# 50 D7S831
# 34 CHLC.GATA2B06.465
# 24 CHLC.GATA11E11
# 23 AFM276ZF5
# 23 AFM273YH9
# 22 SHGC-133043
# ... etc ...
# verify there are no unusually long or short lines:
awk '{printf "%d\n", length($0)}' new.info | sort -n | head -3
# 143
# 144
# 144
awk '{printf "%d\n", length($0)}' new.info | sort -n | tail -3
# 552
# 553
# 644
# check for null in the new files:
grep -i null new.*
# if the new files look good, they can become the set to use:
mv new.info stsInfo2.bed
mv new.primers all.primers
mv new.alias stsAlias.bed
mv new.fa all.STS.fa
# get list of all STS id's in the fasta file
sed -n 's/^>\([0-9][0-9]*\) .*/\1/p' all.STS.fa | sort -n > all.STS.id
wc -l all.STS.id
# 100520 total sequences
# in hg18 this was: 93698 total sequences
$HOME/kent/src/hg/stsMarkers/convertPrimerToFA all.primers > all.primers.fa
# check that fasta file for unusual length sequences:
faSize all.primers.fa
# 97815329 bases (83677626 N's 14137703 real 14137703 upper 0 lower) in 317592 sequences in 1 files
# Total size: mean 308.0 sd 279.3 min 40 (dbSTS_144) max 30000 (dbSTS_156892) median 244
# Copy stsInfo2.bed and stsAlias.bed to data directory becuase
# these will be loaded into the database later
mkdir -p /hive/data/genomes/hg19/bed/sts
cp -p stsInfo2.bed /hive/data/genomes/hg19/bed/sts/
cp -p stsAlias.bed /hive/data/genomes/hg19/bed/sts/
# Create sts sequence alignments
mkdir /hive/data/genomes/hg19/bed/sts/split
faSplit sequence all.STS.fa 100 /hive/data/genomes/hg19/bed/sts/split/sts
ssh swarm
mkdir /hive/data/genomes/hg19/bed/sts/run
cd /hive/data/genomes/hg19/bed/sts/run
# going to run separate runs for the golden path sequence vs. the
# randoms, haplotypes, chrUn and chrM
# 40,000,000 chunck sizes, 20,000 overlap
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep -v "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://;" > hg19.list
ls -1S ../split > sts.list
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set query = $2.fa
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec \
../split/${query} -stepSize=5 $tmpFile.psl
/bin/rm -f $result
/cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
# rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
# << happy emacs
chmod +x runOne.csh
gensub2 hg19.list sts.list template jobList
# these jobs run quickly, allow only 100 at a time
para -maxJob=100 create jobList
# 8367 jobs in batch
para try ... check ... push ... etc
# Completed: 8366 of 8366 jobs
# CPU time in finished jobs: 89744s 1495.74m 24.93h 1.04d 0.003 y
# IO & Wait Time: 25467s 424.44m 7.07h 0.29d 0.001 y
# Average job time: 14s 0.23m 0.00h 0.00d
# Longest finished job: 53s 0.88m 0.01h 0.00d
# Submission to last job: 1592s 26.53m 0.44h 0.02d
# and, run the randoms as a separate run:
mkdir /hive/data/genomes/hg19/bed/sts/run.randoms
cd /hive/data/genomes/hg19/bed/sts/run.randoms
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep "tParts|random|_hap|chrUn"
cat tParts/* | sed -e "s/.*2bit://;" > hg19.list
ls -1S ../split > sts.list
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set query = $2.fa
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec \
../split/${query} -stepSize=5 $tmpFile.psl
/bin/rm -f $result
mv $tmpFile.psl $result
/bin/rm -f $tmpFile.psl
'_EOF_'
# << happy emacs
chmod +x runOne.csh
gensub2 hg19.list sts.list template jobList
# these jobs run quickly, allow only 100 at a time
para -maxJob=100 create jobList
# 6486 jobs in batch
para try ... check ... push ... etc
# Completed: 6486 of 6486 jobs
# CPU time in finished jobs: 2206s 36.77m 0.61h 0.03d 0.000 y
# IO & Wait Time: 16505s 275.08m 4.58h 0.19d 0.001 y
# Average job time: 3s 0.05m 0.00h 0.00d
# Longest finished job: 21s 0.35m 0.01h 0.00d
# Submission to last job: 601s 10.02m 0.17h 0.01d
# Compile sts sequence results
ssh hgwdev
cd /hive/data/genomes/hg19/bed/sts/run
time pslSort dirs raw.psl temp psl/chr*
# 8366 files in 89 dirs
# Got 8366 files 91 files per mid file
# real    8m50.714s
# -rw-rw-r--   1 810438277 May  1 11:45 raw.psl
cd /hive/data/genomes/hg19/bed/sts/run.randoms
time pslSort dirs raw.psl temp psl/chr*
# 6486 files in 69 dirs
# Got 6486 files 81 files per mid file
# real    1m42.120s
# -rw-rw-r--   1 18378188 May  1 11:52 raw.psl
rmdir temp
cd /hive/data/genomes/hg19/bed/sts
# merge both raw results, strip psl headers/blank lines, keep near-best
cat run*/raw.psl | egrep -v "^$|^psLayout|^match|^ |^-" \
| pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons stdin \
stsMarkers.psl /dev/null
# Processed 7412166 alignments
# -rw-rw-r--  1 12031760 May  1 11:57 stsMarkers.psl
$HOME/kent/src/hg/stsMarkers/extractPslInfo -h stsMarkers.psl
# creates stsMarkers.psl.initial
# -rw-rw-r--  1  4485053 May  1 12:06 stsMarkers.psl.initial
wc -l stsMarkers.psl.initial
#	101338  stsMarkers.psl.initial
#	this command needs a chrom_names file to work correctly with this
#	new style of layout for hg19:
cd /hive/data/genomes/hg19
cut -f1 chrom.sizes | sed -e "s/chr//" > chrom_names
cd /hive/data/genomes/hg19/bed/sts
$HOME/kent/src/hg/stsMarkers/findAccession.pl -agp stsMarkers.psl.initial \
/cluster/data/hg19
wc -l stsMarkers.psl.initial.acc
#	101338  stsMarkers.psl.initial.acc
sort -k4,4n stsMarkers.psl.initial.acc > stsMarkers.final
# determine found markers (4th field in file)
cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
wc -l stsMarkers.found
#	96472 stsMarkers.found
#	out of 100520 total sequences from:
wc -l /hive/data/outside/ncbi/sts.2009-04/all.STS.id
#	There are lots of duplicates:
wc -l stsMarkers.final
#	101338 stsMarkers.final
#	And a lot of them are just completely haywire:
awk '$3-$2 < 1001' stsMarkers.final | wc -l
#	98382
#	filter out markers that are too long
awk '$3-$2 < 1001' stsMarkers.final > stsMarkers.1K.size.filtered
# alignment of primers
ssh swarm
cd /hive/data/outside/ncbi/sts.2009-04
# convert all.primers to isPcr input (id, left primer, right primer),
# dropping entries with non-ACGT characters or primers shorter than 11
awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
all.primers > all.primers.ispcr
mkdir primerAlign
cd primerAlign
mkdir split
cd split
split -l 5000 ../../all.primers.ispcr primer_
ls > ../primer.list
cd ..
# we need a 10.ooc file for this business
time blat /scratch/data/hg19/hg19.2bit \
/dev/null /dev/null -tileSize=10 -makeOoc=10.ooc -repMatch=1024
# Wrote 146902 overused 10-mers to 10.ooc
# real    19m16.758s
# separate runs for whole genome vs. randoms
mkdir run
cd run
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep -v "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://;" > hg19.list
# runOne.csh: isPcr one primer split against one 2bit partition, lift
# back to chromosome coordinates
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set primer = ../split/$2
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile.psl
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
-ooc=/hive/data/outside/ncbi/sts.2009-04/primerAlign/10.ooc -stepSize=5 \
/scratch/data/hg19/hg19.2bit:$partSpec $primer $tmpFile.psl
/bin/rm -f $result
/cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
# << happy emacs
chmod +x runOne.csh
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 hg19.list ../primer.list template jobList
para create jobList
# 5696 jobs in batch
para try ... check ... push ... etc
# Completed: 5696 of 5696 jobs
# CPU time in finished jobs:     203899s    3398.32m    56.64h    2.36d  0.006 y
# IO & Wait Time:                 22049s     367.48m     6.12h    0.26d  0.001 y
# Average job time:                  40s       0.66m     0.01h    0.00d
# Longest finished job:            5314s      88.57m     1.48h    0.06d
# Submission to last job:          5418s      90.30m     1.50h    0.06d
# Estimated complete:                 0s       0.00m     0.00h    0.00d
# sort and filter the results
cd psl
pslSort dirs raw.psl temp chr*
# 5696 files in 89 dirs
# Got 5696 files 75 files per mid file
# -rw-rw-r--  1 456802973 May  4 13:32 raw.psl
cd ..
mkdir filter
pslQuickFilter -minMatch=26 -maxMismatch=5 \
-maxTinsert=5000 -verbose psl/ filter/
# -rw-rw-r-- 1 50302564 May  4 13:35 raw.psl
# And, for the randoms
mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/runRandoms
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/runRandoms
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://;" > hg19.list
# NOTE(review): hg19.list is overwritten twice below; the final version
# keeps the raw tParts contents without the 2bit-prefix stripping --
# presumably the last command records the corrected recipe -- confirm
cat tParts/* | sed -e "s/.*2bit://;" > hg19.list
cat tParts/* > hg19.list
# randoms version: partitions are whole sequences, so no lift needed
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set primer = ../split/$2
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile.psl
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
-ooc=/hive/data/outside/ncbi/sts.2009-04/primerAlign/10.ooc -stepSize=5 \
/scratch/data/hg19/hg19.2bit:$partSpec $primer $tmpFile.psl
/bin/rm -f $result
mv $tmpFile.psl $result
'_EOF_'
# << happy emacs
chmod +x runOne.csh
# can not use line+ check here, many of them are empty
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 hg19.list ../primer.list template jobList
# they run quickly, limit to 100
para -maxJob=100 create jobList
para try ... check ... push ... etc
# Completed: 4416 of 4416 jobs
# CPU time in finished jobs:       1746s      29.09m     0.48h    0.02d  0.000 y
# IO & Wait Time:                 11407s     190.12m     3.17h    0.13d  0.000 y
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest finished job:               8s       0.13m     0.00h    0.00d
# Submission to last job:           147s       2.45m     0.04h    0.00d
# sort and filter the results
cd psl
pslSort dirs raw.psl temp chr*
# 4416 files in 69 dirs
# Got 4416 files 66 files per mid file
rmdir temp
# -rw-rw-r--  1 9066053 May  4 13:31 raw.psl
# putting the two runs together
mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/psl
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/psl
ln -s ../run/filter/raw.psl run.psl
ln -s ../runRandoms/filter/raw.psl runRandoms.psl
# -rw-rw-r-- 1 50302564 May  4 13:35 run.psl
# -rw-rw-r-- 1   825973 May  4 13:35 runRandoms.psl
cd ..
pslSort dirs primers.psl temp psl
# 2 files in 1 dirs
# Got 2 files 1 files per mid file
# -rw-rw-r--  1 51128110 May  4 13:39 primers.psl
wc -l primers.psl
#	448107 primers.psl
rmdir temp
pslFilterPrimers primers.psl ../all.primers primers.filter.psl
# creates primers.filter.unlifted.psl.notfound.primers
wc -l primers*
#  237962 primers.filter.psl
#   97191 primers.filter.psl.notfound.primers
# see if ePCR can find some of these notfound
ssh swarm
mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/epcr
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/epcr
mkdir split
cd split
split -l 5000 ../../primers.filter.psl.notfound.primers primers_
cd ..
ls -1S split > primers.lst
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| grep -v tParts | sed -e "s/.*2bit://;" > hg19.list
cat tParts/* | sed -e "s/.*2bit://;" >> hg19.list
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set primer = split/$2
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p epcr/$partSpec
/bin/rm -f $tmpFile.psl
twoBitToFa /scratch/data/hg19/hg19.2bit:$partSpec $tmpFile.fa
/cluster/bin/scripts/runEpcr64 $primer $tmpFile.fa $tmpFile.epcr
/bin/rm -f $result
/bin/mv $tmpFile.epcr $result
rm -f $tmpFile.fa $tmpFile.lift $tmpFile.psl $tmpFile.*
'_EOF_'
# << happy emacs
chmod +x runOne.csh
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line epcr/$(file1)/$(root2).epcr}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 hg19.list primers.lst template jobList
para create jobList
# 3160 jobs
para try ... check ... push ... etc ...
# Completed: 3160 of 3160 jobs
# CPU time in finished jobs: 86253s 1437.54m 23.96h 1.00d 0.003 y
# IO & Wait Time: 11196s 186.61m 3.11h 0.13d 0.000 y
# Average job time: 31s 0.51m 0.01h 0.00d
# Longest finished job: 89s 1.48m 0.02h 0.00d
# Submission to last job: 237s 3.95m 0.07h 0.00d
find ./epcr -type f | xargs cat > all.epcr
wc -l all.epcr
# 797286 all.epcr
# convert the coordinates from the partitionSequence.pl to a lift file
awk '{print $1}' all.epcr | sort -u > hg19.partSpec.txt
$HOME/kent/src/hg/stsMarkers/liftFromSpec.pl hg19 hg19.partSpec.txt \
> all.epcr.lift
cat all.epcr | sed -e "s/\.\./ /; s/ */\t/g" \
| liftUp -type=.bed stdout all.epcr.lift error stdin \
| awk '
{
printf "%s %d..%d %d %d\n", $1, $2, $3, $4, $5
}
' > all.epcr.lifted
# first attempt (pslFilterPrimers from PATH, superseded by the
# explicit-path invocation below):
# pslFilterPrimers -epcr=all.epcr.lifted -verbose=1 ../primers.psl
/cluster/home/hiram/bin/x86_64/pslFilterPrimers -epcr=all.epcr.lifted \
-verbose=1 ../primers.psl ../../all.primers epcr.primers.psl
# this took a long time, many hours
# -rw-rw-r-- 1 2785254 May 5 17:28 epcr.not.found
# -rw-rw-r-- 1 27343510 May 5 17:28 epcr.primers.psl
# -rw-rw-r-- 1 1616885 May 5 17:28 epcr.primers.psl.notfound.primers
# first attempt (./epcrToHgPsl.pl, superseded by the epcrToPsl
# invocation below):
# time ./epcrToHgPsl.pl epcr.not.found ../../all.primers
time $HOME/kent/src/hg/stsMarkers/epcrToPsl epcr.not.found \
../../all.primers /hive/data/genomes/hg19
# real 69m38.444s
# -rw-rw-r-- 1 0 May 6 14:18 epcr.not.found.nomatch
# -rw-rw-r-- 1 8369138 May 6 15:26 epcr.not.found.psl
# combining everything together now
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign
sort -u primers.filter.psl epcr/epcr.primers.psl epcr/epcr.not.found.psl \
| sort -k15,15 -k17,17n > primers.final.psl
wc -l primers.final.psl
# 310705 primers.final.psl
time $HOME/kent/src/hg/stsMarkers/fixPrimersQueryGaps.pl \
../all.primers primers.final.psl > primers.final.fix.psl
# real 0m19.580s
wc -l primers.final.fix.psl
# 310705 primers.final.fix.psl
# Extract relevant info, make alignments unique, and create final file to
# be merged with full sequence alignments
$HOME/kent/src/hg/stsMarkers/extractPslInfo -h primers.final.fix.psl
# real 0m15.303s
# -rw-rw-r-- 1 15660447 May 6 15:44 primers.final.fix.psl.initial
wc -l primers.final.fix.psl.initial
# 308210 primers.final.fix.psl.initial
$HOME/kent/src/hg/stsMarkers/findAccession.pl -agp \
primers.final.fix.psl.initial /hive/data/genomes/hg19
wc -l primers.final.fix.psl.initial.acc
# 308210 primers.final.fix.psl.initial.acc
$HOME/kent/src/hg/stsMarkers/getStsId ../stsInfo2.bed \
primers.final.fix.psl.initial.acc | sort -k 4n > primers.final
wc -l primers.final
# 308210 primers.final
# There doesn't appear to be any use for this primers.ids list
# except for curiosity. Check the head and tail of this list to
# verify no garbage is in here. There should just be numbers.
awk '{print $4}' primers.final | sort -n | uniq > primers.ids
wc -l primers.ids
# 290961 primers.ids
# Merge primer and sequence files to create final bed file
# Merge (combineSeqPrimerPos) takes about an hour to run
cd /hive/data/genomes/hg19/bed/sts
time $HOME/kent/src/hg/stsMarkers/combineSeqPrimerPos stsMarkers.final \
/hive/data/outside/ncbi/sts.2009-04/primerAlign/primers.final
# real 0m12.310s
# -rw-rw-r-- 1 15222346 May 6 15:55 stsMarkers_pos.rdb
wc -l stsMarkers_pos.rdb
# 315308 stsMarkers_pos.rdb
time /cluster/bin/scripts/createSTSbed \
/hive/data/outside/ncbi/sts.2009-04/stsInfo2.bed \
stsMarkers_pos.rdb > stsMap.bed
# real 0m31.886s
# -rw-rw-r-- 1 38244880 May 6 16:25 stsMap.bed
wc -l stsMap.bed
# 305914 stsMap.bed
# Set up sequence files
ssh hgwdev
mkdir /gbdb/hg19/sts.11/
ln -s /hive/data/outside/ncbi/sts.11/all.STS.fa \
/gbdb/hg19/sts.11/all.STS.fa
ln -s /hive/data/outside/ncbi/sts.11/all.primers.fa \
/gbdb/hg19/sts.11/all.primers.fa
# Load all files
cd /hive/data/genomes/hg19/bed/sts
hgLoadSeq hg19 /gbdb/hg19/sts.11/all.STS.fa /gbdb/hg19/sts.11/all.primers.fa
# Creating seq.tab file
# Adding /gbdb/hg19/sts.11/all.STS.fa
# 100520 sequences
# Adding /gbdb/hg19/sts.11/all.primers.fa
# 317592 sequences
# Updating seq table
# Advisory lock has been released
# All done
hgsql hg19 < $HOME/kent/src/hg/lib/stsInfo2.sql
hgsql hg19 < $HOME/kent/src/hg/lib/stsAlias.sql
# these files already exist here from previous operations
# cp -p /hive/data/outside/ncbi/sts.11/{stsInfo2.bed,stsAlias.bed} .
hgsql hg19 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
hgsql hg19 -e 'load data local infile "stsAlias.bed" into table stsAlias'
# a couple minutes for each load above
# filter the stsMap.bed to eliminate items longer than 5,000 bases,
# takes out about 850:
awk '$3-$2 < 5001' stsMap.bed | sort -k1,1 -k2,2n \
> stsMap.filtered.5000.bed
hgLoadBed -notItemRgb -noBin -tab \
-sqlTable=$HOME/kent/src/hg/lib/stsMap.sql hg19 stsMap \
stsMap.filtered.5000.bed
# Loaded 305064 elements of size 28
ln -s \
/hive/data/outside/ncbi/sts.2009-04/primerAlign/primers.final.fix.psl \
primers.psl
hgLoadPsl -nobin -table=all_sts_primer hg19 primers.psl
hgLoadPsl -nobin -table=all_sts_seq hg19 stsMarkers.psl
##############################################################################
# FISH CLONES (WORKING - 2009-04-29 - Hiram)
# The STS Marker and BAC End Pairs tracks must be completed prior to
# creating this track.
mkdir /hive/data/outside/ncbi/fishClones/fishClones.2009-04/
cd /hive/data/outside/ncbi/fishClones/fishClones.2009-04/
# Download information from NCBI
# point browser at:
# http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Sequence tag:" to "placed on contig"
# change "Show details on sequence-tag" to "yes"
# change "Download or Display" to "Download table for UNIX"
# press Submit - save as
# /hive/data/outside/ncbi/fishClones/fishClones.2009-04/hbrc.txt
chmod 664 /hive/data/outside/ncbi/fishClones/fishClones.2009-04/hbrc.txt
# Unfortunately the format of this hbrc file has changed since
# last time. The columns have been rearranged, and one important
# column is missing, the contig information. So, let's see if we
# can recover the original format by putting this together with
# some other things we have here.
$HOME/kent/src/hg/fishClones/fixup.hbrc.pl hbrc.txt \
/hive/data/genomes/hg19/bed/fishClones/seq_clone.pmd > fixed.hbrc.txt \
2> dbg
# XXX - need to get this seq_clone.pmd from NCBI, maybe Paul Kitts
# the seq_clone.pmd file was obtained via email from Wonhee Jang
# jang at ncbi.nlm.nih.gov - I have asked for clarification where
# such a file can be fetched without resorting to email.
# Get current clone/accession information
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/clone/reports/clac.out
# also available at:
# http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
# Create initial Fish Clones bed file
ssh kkstore02
mkdir /hive/data/genomes/hg19/bed/fishClones
cd /hive/data/genomes/hg19/bed/fishClones
# Copy previous sts info from fhcrc
cp -p /hive/data/genomes/hg18/bed/fishClones/fhcrc.sts .
# This fhcrc.sts listing doesn't change. It is merely a listing
# of aliases that remain in effect.
# Create cl_acc_gi_len file form cloneend information:
grep -v "^#" /hive/data/genomes/hg19/bed/cloneend/all.txt \
| awk '{gsub(".[0-9]*$", "", $2);
printf "%s\t%s\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$8}' > cl_acc_gi_len
hgsql -N \
-e "select chrom,chromStart,chromEnd,contig from ctgPos;" hg19 \
| sort -k1,1 -k2,2n > ctgPos.bed
hgsql -N \
-e "select chrom,chromStart,chromEnd,frag,0,strand from gold;" hg19 \
| sort -k1,1 -k2,2n > gold.bed
hgsql -N \
-e "select tName,tStart,tEnd,qName,0,strand from all_bacends;" hg19 \
| sort -k1,1 -k2,2n > all_bacends.bed
hgsql -N \
-e "select chrom,chromStart,chromEnd,name,score,strand from bacEndPairs;" hg19 \
| sort -k1,1 -k2,2n > bacEndPairs.bed
ssh hgwdev
# have to be on hgwdev for this since it is going to read from the
# database. Had to work on this program to get it past what is
# evidently a bad entry in hbrc.fixed where columns of information
# are missing for one clone in particular
time fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg19 \
/hive/data/genomes/hg19/bed/ncbiCytoBand/contig/fixed.hbrc.txt \
/hive/data/outside/ncbi/fishClones/fishClones.2009-04/clac.out \
./cl_acc_gi_len \
/hive/data/genomes/hg19/bed/bacends/bacEnds.load.psl \
fishClones
# real 2m4.708s
# Reading Fish Clones file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/hbrc.fixed
# reading fishInfo file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt
# Reading Clone/Acc (clac.out) file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/clac.out
# Reading BAC Ends file ./cl_acc_gi_len
# Reading BAC Ends psl file /hive/data/genomes/hg19/bed/bacends/bacEnds.lifted.psl
# Reading additional STS Marker links fhcrc.sts
# Determining good positions
# findClonePos: determining positions of fish clones
# Writing output file
# ERROR: at line # 170, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# ERROR: at line # 171, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# Load the track
ssh hgwdev
cd /hive/data/genomes/hg19/bed/fishClones
hgLoadBed -notItemRgb -noBin -tab \
-sqlTable=$HOME/kent/src/hg/lib/fishClones.sql \
hg19 fishClones fishClones.bed
# Loaded 9461 elements of size 16
##############################################################################
# CytoBands from Wonhee Jang at NCBI (DONE - 2009-06-10 - Hiram)
mkdir /hive/data/genomes/hg19/bed/ncbiCytoBand
cd /hive/data/genomes/hg19/bed/ncbiCytoBand
# received the following files via email:
ls -ogrt
# -rw-rw-r-- 1 187930 Jun 10 13:53 ideogram
# -rw-rw-r-- 1 672327 Jun 8 09:55 fish.markers.bed
# created cytobands.bed from the ideogram file with:
cat << '_EOF_' > ideoToCytoBand.pl
#!/usr/bin/env perl
# Convert the NCBI "ideogram" file (received via email) into UCSC
# cytoBand lines: chrom, chromStart, chromEnd, name (arm+location), stain.
use strict;
use warnings;

open (FH,"<ideogram") or die "can not read ideogram";
while (my $line = <FH>) {
    next if $line =~ m/^#/;
    chomp $line;
    my ($chr, $arm, $location, $a, $b, $start, $end, $stain) =
        split('\s+',$line);
    # skip sub-band rows whose location ends in a letter (e.g. 11.1a)
    next if ($location =~ m/[a-z]$/);
    # strip DOS carriage returns from the final field
    $stain =~ s/\r//g;
    # NCBI coordinates are 1-based; shift the first band to a 0 start
    $start -= 1 if ($start == 1);
    printf "chr%s\t%d\t%d\t%s%s\t%s\n", $chr, $start, $end, $arm, $location,
        $stain;
}
close (FH);
'_EOF_'
# << happy emacs
chmod +x ideoToCytoBand.pl
./ideoToCytoBand.pl > cytobands.bed
hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBand.sql \
hg19 cytoBand cytobands.bed
hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBandIdeo.sql \
hg19 cytoBandIdeo cytobands.bed
# checking coverage:
featureBits -noRandom -noHap -countGaps hg19 cytoBand
# 3095677412 bases of 3095693983 (99.999%) in intersection
# that is everything except chrM:
echo 3095693983-3095677412 | bc -q
# 16571
##############################################################################
# UCSC to Ensembl chr name mapping (DONE - 2009-05-08 - Hiram)
mkdir /hive/data/genomes/hg19/ensembl
cd /hive/data/genomes/hg19/ensembl
wget --timestamping \
'ftp://ftp.ensembl.org/pub/pre/homo_sapiens/GRCh37/dna/*'
# do not need the repeat masker sequence (although it would be
# interesting to measure to see how it compares)
rm -f *.dna_rm.*
# fortunately we have the same sizes as Ensembl for everything
# (except the haplotypes) and the sizes are unique for each sequence
# so we can relate the names via their sizes
mkdir /hive/data/genomes/hg19/bed/ucscToEnsembl
cd /hive/data/genomes/hg19/bed/ucscToEnsembl
# the toplevel file is a duplicate of everything else
ls /hive/data/genomes/hg19/ensembl/*.fa.gz | grep -v toplevel \
| while read F
do
zcat "${F}"
done | faCount stdin > faCount.txt
cat << '_EOF_' > relateUcscEnsembl.pl
#!/usr/bin/env perl
# Relate UCSC and Ensembl chromosome names via their sequence sizes.
# Every size is unique within each set (enforced by the die statements),
# so an exact size match identifies the same sequence in both sets.
# Output: tab-separated name-translation lines on stdout; special cases
# (haplotypes, chrM) are reported on lines starting with '#'.
use strict;
use warnings;

my %ucscChrs;       # key is size, value is UCSC chr name
open (FH,"<../../chrom.sizes") or die "can not read ../../chrom.sizes";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $size) = split('\s+', $line);
    die "'$line\n'duplicate size in ../../chrom.sizes"
        if (exists($ucscChrs{$size}));
    $ucscChrs{$size} = $chr;
}
close (FH);

my %ensemblChrs;    # key is size, value is Ensembl chr name
open (FH,"<faCount.txt") or die "can not read faCount.txt";
while (my $line = <FH>) {
    next if ($line =~ m/#/);        # faCount header line
    next if ($line =~ m/total/);    # faCount summary line
    chomp $line;
    my ($chr, $size, $rest) = split('\s+', $line, 3);
    die "'$line\n'duplicate size in faCount.txt"
        if (exists($ensemblChrs{$size}));
    $ensemblChrs{$size} = $chr;
}
close (FH);

my %usedUcscChrs;
my %usedEnsemblChrs;
my %ensemblTranslate;   # key is Ensembl name, value is UCSC size
foreach my $size (keys %ucscChrs) {
    if (exists($ensemblChrs{$size})) {
        # exact size match: same sequence under both names
        $usedUcscChrs{$size} = $ucscChrs{$size};
        $usedEnsemblChrs{$size} = $ensemblChrs{$size};
        printf "%s\t%s\t%d\n", $ucscChrs{$size}, $ensemblChrs{$size}, $size;
    } else {
        # no size match: construct the expected Ensembl haplotype name
        # from the UCSC name, e.g. chr6_cox_hap1 -> HSCHR6_MHC_COX
        my $ucscName = $ucscChrs{$size};
        my $ensemblName = "unknown";
        if ($ucscName =~ m/^chr6/) {
            $ucscName =~ s/_hap.//;
            $ucscName =~ s/chr6_/chr6_mhc_/;
            $ensemblName = "HS" . uc($ucscName);
        } elsif ($ucscName =~ m/^chr17_/ || $ucscName =~ m/^chr4_/) {
            $ucscName =~ s/_.*/_1/;
            $ensemblName = "HS" . uc($ucscName);
        } elsif ($ucscName =~ m/^chrM/) {
            print "# no translation for chrM\n";
        } else {
            die "unknown UCSC chr name: $ucscName";
        }
        printf "# ucsc $ucscChrs{$size} -> $ensemblName\n";
        $ensemblTranslate{$ensemblName} = $size;
    }
}
# report Ensembl sequences whose size matched no UCSC sequence, paired
# with the UCSC haplotype recorded above (both sizes shown)
foreach my $size (keys %ensemblChrs) {
    if (!exists($usedEnsemblChrs{$size})) {
        my $ensemblName = $ensemblChrs{$size};
        if (! exists($ensemblTranslate{$ensemblName})) {
            die "can not translate Ensembl name $ensemblName";
        } else {
            my $ucscSize = $ensemblTranslate{$ensemblName};
            printf "%s\t%s\t%d\t%d\n", $ucscChrs{$ucscSize},
                $ensemblChrs{$size}, $ucscSize, $size;
        }
    }
}
# chrM has no size match (see "no translation for chrM" above), so its
# name equivalence is hard-coded here
printf "chrM\tMT\n";
'_EOF_'
# << happy emacs
chmod +x relateUcscEnsembl.pl
./relateUcscEnsembl.pl 2>&1 | grep -v "^#" \
| awk '{printf "%s\t%s\n", $1, $2}' | sort > ucscToEnsembl.tab
cat << '_EOF_' > ucscToEnsembl.sql
# UCSC to Ensembl chr name translation
CREATE TABLE ucscToEnsembl (
ucsc varchar(255) not null, # UCSC chromosome name
ensembl varchar(255) not null, # Ensembl chromosome name
#Indices
PRIMARY KEY(ucsc(21))
);
'_EOF_'
hgsql hg19 < ucscToEnsembl.sql
hgsql hg19 \
-e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl'
awk '{printf "%s\t%d\n", $2, -$1}' ../../jkStuff/ensGene.haplotype.lift \
> ensemblLift.tab
cat << '_EOF_' > ensemblLift.sql
# UCSC offset to Ensembl coordinates
CREATE TABLE ensemblLift (
chrom varchar(255) not null, # Ensembl chromosome name
offset int unsigned not null, # offset to add to UCSC position
#Indices
PRIMARY KEY(chrom(15))
);
'_EOF_'
hgsql hg19 < ensemblLift.sql
hgsql hg19 \
-e 'LOAD DATA LOCAL INFILE "ensemblLift.tab" INTO TABLE ensemblLift'
##############################################################################
# LASTZ MOUSE Mm9 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
cat << '_EOF_' > DEF
# human vs mouse
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/linSpecRep/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Mouse Mm9
SEQ2_DIR=/scratch/data/mm9/nib
SEQ2_SMSK=/scratch/data/mm9/notInOthers
SEQ2_LEN=/scratch/data/mm9/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
cat fb.hg19.chainMm9Link.txt
# 1022734273 bases of 2897316137 (35.299%) in intersection
# and the swap
mkdir /hive/data/genomes/mm9/bed/blastz.hg19.swap
cd /hive/data/genomes/mm9/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzMm9.2009-05-13/DEF \
-swap -noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 131m58.763s
cat fb.mm9.chainHg19Link.txt
# 1013880568 bases of 2620346127 (38.693%) in intersection
#########################################################################
# LASTZ Dog CanFam2 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
cat << '_EOF_' > DEF
# human vs dog
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/linSpecRep/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog CanFam2
SEQ2_DIR=/scratch/data/canFam2/nib
SEQ2_LEN=/scratch/data/canFam2/chrom.sizes
SEQ2_SMSK=/scratch/scratch/data/canFam2/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
cat fb.hg19.chainCanFam2Link.txt
# 1532073507 bases of 2897316137 (52.879%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/canFam2/bed/blastz.hg19.swap
cd /hive/data/genomes/canFam2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13/DEF \
-noLoadChainSplit -swap \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 200m17.158s
cat fb.canFam2.chainHg19Link.txt
# 1480018167 bases of 2384996543 (62.055%) in intersection
#########################################################################
# LASTZ Chicken GalGal3 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
cat << '_EOF_' > DEF
# human vs chicken
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/data/galGal3/nib
SEQ2_LEN=/scratch/data/galGal3/chrom.sizes
SEQ2_SMSK=/scratch/data/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet \
-noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
cat fb.hg19.chainGalGal3Link.txt
# 104053179 bases of 2897316137 (3.591%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/galGal3/bed/blastz.hg19.swap
cd /hive/data/genomes/galGal3/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13/DEF \
-swap \
-noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
# real 16m45.090s
cat fb.galGal3.chainHg19Link.txt
# 91605899 bases of 1042591351 (8.786%) in intersection
#########################################################################
# LASTZ Macaca Mulatta RheMac2 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
cat << '_EOF_' > DEF
# human vs macaca mulatta
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Macaca Mulatta RheMac2
SEQ2_DIR=/scratch/data/rheMac2/rheMac2.2bit
SEQ2_LEN=/scratch/data/rheMac2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 760m22.810s
cat fb.hg19.chainRheMac2Link.txt
# 2397361211 bases of 2897316137 (82.744%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/rheMac2/bed/blastz.hg19.swap
cd /hive/data/genomes/rheMac2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13/DEF \
-swap \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> swap.log 2>&1 &
# real 83m51.483s
cat fb.rheMac2.chainHg19Link.txt
# 2313806886 bases of 2646704109 (87.422%) in intersection
#########################################################################
# LASTZ Rat Rn4 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
cat << '_EOF_' > DEF
# human vs rat
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rat Rn4
SEQ2_DIR=/scratch/data/rn4/nib
SEQ2_SMSK=/scratch/data/rn4/linSpecRep.notInHuman
SEQ2_LEN=/scratch/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 314m18.227s
cat fb.hg19.chainRn4Link.txt
# 952605822 bases of 2897316137 (32.879%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/rn4/bed/blastz.hg19.swap
cd /hive/data/genomes/rn4/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzRn4.2009-05-13/DEF \
-swap -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 188m0.163s
cat fb.rn4.chainHg19Link.txt
# 947862300 bases of 2571531505 (36.860%) in intersection
##############################################################################
# LASTZ Orangutan PonAbe2 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
cat << '_EOF_' > DEF
# human vs orangutan
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Orangutan PonAbe2
SEQ2_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/scratch/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
cat fb.hg19.chainPonAbe2Link.txt
# 2646687531 bases of 2897316137 (91.350%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/ponAbe2/bed/blastz.hg19.swap
cd /hive/data/genomes/ponAbe2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13/DEF \
-swap \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> swap.log 2>&1 &
# real 124m3.610s
cat fb.ponAbe2.chainHg19Link.txt
# 2772351468 bases of 3093572278 (89.617%) in intersection
##############################################################################
# LASTZ Lamprey PetMar1 (DONE - 2009-05-14 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Lamprey
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ2_LIMIT=5
# QUERY: Lamprey petMar1
SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/scratch/data/petMar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
> do.log 2>&1 &
# real 113m20.116s
cat fb.hg19.chainPetMar1Link.txt
# 31347143 bases of 2897316137 (1.082%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/petMar1/bed/blastz.hg19.swap
cd /hive/data/genomes/petMar1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
-swap > swap.log 2>&1 &
# real 59m14.813s
cat fb.petMar1.chainHg19Link.txt
# 26615001 bases of 831696438 (3.200%) in intersection
##############################################################################
# LASTZ Fugu Fr2 (DONE - 2009-05-14 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Fugu
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Fugu fr2
# Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates
SEQ2_DIR=/scratch/data/fr2/fr2.2bit
SEQ2_LEN=/hive/data/genomes/fr2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/fr2/noUn/fr2.scaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/fr2/noUn/fr2.scaffolds.sizes
SEQ2_LIFT=/hive/data/genomes/fr2/jkStuff/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
> do.log 2>&1 &
# real 5797m9.288s
# had a small problem finishing the fundamental batch run, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
> cat.log 2>&1 &
cat fb.hg19.chainFr2Link.txt
# 49309456 bases of 2897316137 (1.702%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/fr2/bed/blastz.hg19.swap
cd /hive/data/genomes/fr2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzFr2.2009-05-14/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
-swap > swap.log 2>&1 &
# real 25m8.491s
cat fb.fr2.chainHg19Link.txt
# 42984130 bases of 393312790 (10.929%) in intersection
##############################################################################
# LASTZ Tetraodon TetNig1 (DONE - 2009-05-14 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
cat << '_EOF_' > DEF
# human vs tetraodon
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Tetraodon TetNig1 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/data/tetNig1/tetNig1.2bit
SEQ2_LEN=/hive/data/genomes/tetNig1/chrom.sizes
SEQ2_CHUNK=410000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# real 166m19.745s
cat fb.hg19.chainTetNig1Link.txt
# 58038079 bases of 2897316137 (2.003%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/tetNig1/bed/blastz.hg19.swap
cd /hive/data/genomes/tetNig1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-swap > swap.log 2>&1 &
# real 29m20.968s
cat fb.tetNig1.chainHg19Link.txt
# 49453375 bases of 342403326 (14.443%) in intersection
##############################################################################
# LASTZ Stickleback GasAcu1 (DONE - 2009-05-14 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Stickleback
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Stickleback gasAcu1
SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit
SEQ2_LEN=/hive/data/genomes/gasAcu1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# real 174m40.659s
cat fb.hg19.chainGasAcu1Link.txt
# 55509003 bases of 2897316137 (1.916%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/gasAcu1/bed/blastz.hg19.swap
cd /hive/data/genomes/gasAcu1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-swap > swap.log 2>&1 &
# real 29m41.433s
cat fb.gasAcu1.chainHg19Link.txt
# 49909819 bases of 446627861 (11.175%) in intersection
##############################################################################
# LASTZ Marmoset CalJac1 (DONE - 2009-05-14,22 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
cat << '_EOF_' > DEF
# human vs. marmoset
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Marmoset (calJac1)
SEQ2_DIR=/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/scratch/data/calJac1/chrom.sizes
SEQ2_LIMIT=200
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# real 214m16.294s
cat fb.hg19.chainCalJac1Link.txt
# 2053025318 bases of 2897316137 (70.860%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 calJac1 > rbest.log 2>&1 &
# real 97m17.207s
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/calJac1/bed/blastz.hg19.swap
cd /hive/data/genomes/calJac1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14/DEF \
-swap \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> swap.log 2>&1 &
# real 162m52.189s
cat fb.calJac1.chainHg19Link.txt
# 2105959656 bases of 2929139385 (71.897%) in intersection
#########################################################################
# LASTZ Tarsier TarSyr1 (DONE - 2009-05-14,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Tarsier
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Tarsier
SEQ2_DIR=/scratch/data/tarSyr1/tarSyr1.2bit
SEQ2_LEN=/scratch/data/tarSyr1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# real 1724m48.032s
# need to load the chain table manually:
# mySQL error 1114: The table 'chainTarSyr1Link' is full
cd /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14/axtChain
wc -l *.tab
# 21882142 chain.tab
# 165017606 link.tab
# 186899748 total
awk '{print length($0)}' link.tab | sort | uniq -c | less
4 23
9 24
27 25
105 26
767 27
1401 28
5020 29
8472 30
24390 31
117666 32
264774 33
776095 34
1632393 35
2672187 36
7125988 37
16831901 38
34905113 39
45218159 40
31570706 41
13746548 42
5868689 43
2460114 44
1118556 45
420826 46
106674 47
36770 48
40719 49
36955 50
19389 51
5571 52
1557 53
61 54
time nice -n +19 hgsql -e "DROP TABLE chainTarSyr1Link;" hg19
cat << '_EOF_' | hgsql hg19
-- Manually recreate chainTarSyr1Link: the automated hgLoadChain step hit
-- MySQL error 1114 "The table 'chainTarSyr1Link' is full" (see above), so
-- raise max_rows/avg_row_length to accommodate the ~165 million link rows
-- counted in link.tab before reloading the data.
CREATE TABLE chainTarSyr1Link (
bin smallint(5) unsigned NOT NULL default 0,
tName varchar(255) NOT NULL default '',
tStart int(10) unsigned NOT NULL default 0,
tEnd int(10) unsigned NOT NULL default 0,
qStart int(10) unsigned NOT NULL default 0,
chainId int(10) unsigned NOT NULL default 0,
KEY tName (tName(16),bin),
KEY chainId (chainId)
) ENGINE=MyISAM max_rows=166000000 avg_row_length=42 pack_keys=1 CHARSET=latin1;
'_EOF_'
# << happy emacs
time nice -n +19 hgsql -e \
"load data local infile \"link.tab\" into table chainTarSyr1Link;" hg19
# real 157m0.230s
# then running the rest of loadUp.csh after the hgLoadChain
# real 26m8.263s
cat fb.hg19.chainTarSyr1Link.txt
# 1385797066 bases of 2897316137 (47.830%) in intersection
# Continuing:
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-continue=download -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> download.log 2>&1 &
# real 48m6.573s
# ran the script on swarm to recover after hive outages
time doRecipBest.pl -buildDir=`pwd` hg19 tarSyr1 > rbest.log 2>&1 &
# real 404m0.201s
time doRecipBest.pl -continue=download -buildDir=`pwd` \
hg19 tarSyr1 > rbest.download.log 2>&1 &
#########################################################################
# LASTZ Bushbaby OtoGar1 (DONE - 2009-05-14,22 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Bushbaby
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Bushbaby otoGar1 - single chunk big enough to run largest scaffold
SEQ2_DIR=/scratch/data/otoGar1/otoGar1.rmsk.2bit
SEQ2_LEN=/hive/data/genomes/otoGar1/chrom.sizes
SEQ2_LIMIT=200
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# real 762m56.055s
cat fb.hg19.chainOtoGar1Link.txt
# 1264492372 bases of 2897316137 (43.644%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 otoGar1 > rbest.log 2>&1 &
# real 271m39.925s
#########################################################################
# LASTZ Mouse lemur MicMur1 (DONE - 2009-05-14,26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Mouse lemur
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Mouse lemur
SEQ2_DIR=/hive/data/genomes/micMur1/bed/repeatMasker/micMur1.rmsk.2bit
SEQ2_LEN=/hive/data/genomes/micMur1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# real 5429m52.082s
# there is one unusual long running job having trouble
# continuing after finishing the lastz run manually:
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-continue=cat -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> cat.log 2>&1 &
# real 388m25.032s
cat fb.hg19.chainMicMur1Link.txt
# 1347792207 bases of 2897316137 (46.519%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 micMur1 > rbest.log 2>&1
# about 4h30m
#########################################################################
# LASTZ Baboon PapHam1 (DONE - 2009-05-20,22 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
cd /hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
cat << '_EOF_' > DEF
# human vs baboon
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Baboon papHam1
SEQ2_DIR=/scratch/data/papHam1/papHam1.2bit
SEQ2_LEN=/scratch/data/papHam1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# forgot that the synNet was not needed here, use recip best as below
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
cat fb.hg19.chainPapHam1Link.txt
# 2399269031 bases of 2897316137 (82.810%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 papHam1 > rbest.log 2>&1
# real 182m0.276s
#########################################################################
# SGP GENES (DONE - 2009-05-22 - Hiram)
mkdir /hive/data/genomes/hg19/bed/sgpGene
cd /hive/data/genomes/hg19/bed/sgpGene
mkdir download
cd download
for C in `cut -f1 ../../../chrom.sizes`
do
echo $C
wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902_x_mm9/SGP/${C}.gtf
wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902_x_mm9/SGP/${C}.prot
done
cd ..
cat download/*.gtf | ldHgGene -gtf -genePredExt hg19 sgpGene stdin
# Read 33994 transcripts in 291782 lines in 1 files
# 33994 groups 85 seqs 1 sources 3 feature types
# 33994 gene predictions
nice -n +19 featureBits -enrichment hg19 refGene:CDS sgpGene
# refGene:CDS 1.181%, sgpGene 1.295%, both 1.011%, cover 85.59%, enrich 66.08x
###########################################################################
# GENEID GENE PREDICTIONS (DONE - 2009-05-22 - Hiram)
ssh hgwdev
mkdir /hive/data/genomes/hg19/bed/geneid
cd /hive/data/genomes/hg19/bed/geneid
mkdir download
cd download
for C in `cut -f1 ../../../chrom.sizes`
do
echo $C
wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902/geneid_v1.3/${C}.gtf
wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902/geneid_v1.3/${C}.prot
done
cd ..
cat download/*.gtf | ldHgGene -gtf -genePredExt hg19 geneid stdin
# Read 33428 transcripts in 277332 lines in 1 files
# 33428 groups 92 seqs 1 sources 3 feature types
# 33428 gene predictions
##########################################################################
## 4-Way Multiz for UCSC Genes construction (DONE - 2009-05-22 - Hiram)
ssh hgwdev
mkdir /hive/data/genomes/hg19/bed/multiz4way
cd /hive/data/genomes/hg19/bed/multiz4way
# extract our 4 organisms from the 44-way on hg18:
ln -s /hive/data/genomes/hg18/bed/multiz44way/44way.4d.nh ./44way.nh
/cluster/bin/phast/tree_doctor \
--prune-all-but hg18,mm9,canFam2,rheMac2 44way.nh \
| sed -e "s/hg18/hg19/" > 4way.nh
# this looks like:
cat 4way.nh
(((hg19:0.032973,rheMac2:0.036199):0.109706,mm9:0.352605):0.020666,canFam2:0.193569);
# Use this specification in the phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to obtain a gif image for htdocs/images/phylo/hg19_4way.gif
/cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt
# Use this output to create the table below
grep -y hg19 4way.distances.txt | sort -k3,3n
#
# If you can fill in all the numbers in this table, you are ready for
# the multiple alignment procedure
#
# featureBits chainLink measures
# chainHg19Link chain linearGap
# distance on hg19 on other minScore
# 1 0.069172 - rhesus rheMac2 (% 82.744) (% xx.xxx) 5000 medium
# 2 0.356914 - dog canFam2 (% 52.879) (% xx.xxx) 3000 medium
# 3 0.495284 - mouse mm9 (% 35.299) (% 38.693) 3000 medium
# using the syntenic nets
cd /cluster/data/hg19/bed/multiz4way
mkdir mafLinks
cd mafLinks
mkdir rheMac2 canFam2 mm9
cd mm9
ln -s ../../../lastz.mm9/mafSynNet/*.maf.gz .
cd ../canFam2
ln -s ../../../lastz.canFam2/mafSynNet/*.maf.gz .
cd ../rheMac2
ln -s ../../../lastz.rheMac2/mafSynNet/*.maf.gz .
# determine what is the newest version of multiz and use that
cd /hive/data/genomes/hg19/bed/multiz4way
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn
# the autoMultiz cluster run
ssh swarm
cd /hive/data/genomes/hg19/bed/multiz4way
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
4way.nh > tmp.nh
echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
mkdir run maf
cd run
# NOTE: you need to set the db and multiz dirname properly in this script
cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
# autoMultiz - cluster job: build the 4-way multiz maf for one chromosome.
# Usage: autoMultiz <chrom> <outputMafPath>
# Stages each species' pairwise maf into local scratch, runs Penn autoMZ
# over them per tree.nh, then copies the combined maf to <outputMafPath>.
set db = hg19
set c = $1
set maf = $2
set binDir = /hive/data/genomes/hg19/bed/multiz4way/penn
# work in node-local scratch to minimize NFS traffic during the cluster run
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/hg19/bed/multiz4way/mafLinks
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
# stage one single-coverage maf per species; the reference itself is skipped
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
# no pairwise alignment exists for this chrom: provide an empty maf stub
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
# autoMZ must find multiz/maf_project on PATH, so prepend the penn bin dir
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
# deliver the result where parasol's "check out" expects it, then clean up
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz
cat << '_EOF_' > template
#LOOP
./autoMultiz $(root1) {check out line+ /hive/data/genomes/hg19/bed/multiz4way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 /cluster/data/hg19/chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
# 93 jobs
para try ... check ... push ... etc ...
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 24282s 404.70m 6.75h 0.28d 0.001 y
# IO & Wait Time: 2362s 39.36m 0.66h 0.03d 0.000 y
# Average job time: 286s 4.77m 0.08h 0.00d
# Longest finished job: 2235s 37.25m 0.62h 0.03d
# Submission to last job: 2241s 37.35m 0.62h 0.03d
# combine results into a single file for loading and gbdb reference
cd /hive/data/genomes/hg19/bed/multiz4way
time nice -n +19 catDir maf > multiz4way.maf
# real 3m27.561s
# makes a 8.5 Gb file:
# -rw-rw-r-- 1 9026080732 May 22 11:11 multiz4way.maf
# Load into database
ssh hgwdev
cd /hive/data/genomes/hg19/bed/multiz4way
mkdir /gbdb/hg19/multiz4way
ln -s /hive/data/genomes/hg19/bed/multiz4way/multiz4way.maf \
/gbdb/hg19/multiz4way
# the hgLoadMaf generates huge tmp files, locate them in /scratch/tmp/
cd /scratch/tmp
time nice -n +19 hgLoadMaf hg19 multiz4way
# real 5m31.883s
# Loaded 5788627 mafs in 1 files from /gbdb/hg19/multiz4way
cd /hive/data/genomes/hg19/bed/multiz4way
time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
-maxSize=50000 hg19 multiz4waySummary multiz4way.maf
# Created 1238721 summary blocks from 11959676 components
# and 5788627 mafs from multiz4way.maf
# real 6m33.936s
#########################################################################
# LASTZ Medaka OryLat2 (DONE - 2009-05-22 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
cd /hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
cat << '_EOF_' > DEF
# Human vs. Medaka
# typical parameters for a genome that is distant from human
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Medaka oryLat2 (40M chunks covers the largest chroms in one gulp)
SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ2_LEN=/hive/data/genomes/oryLat2/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=200
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# real 124m5.298s
cat fb.hg19.chainOryLat2Link.txt
# 53571737 bases of 2897316137 (1.849%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/oryLat2/bed/blastz.hg19.swap
cd /hive/data/genomes/oryLat2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-swap > swap.log 2>&1 &
# real 28m35.174s
cat fb.oryLat2.chainHg19Link.txt
# 46961818 bases of 700386597 (6.705%) in intersection
##############################################################################
# LASTZ Opossum MonDom5 (DONE - 2009-05-23,29 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
cd /hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
cat << '_EOF_' > DEF
# human vs. opossum
# settings for more distant organism alignments
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Opossum monDom5
SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit
SEQ2_LEN=/hive/data/genomes/monDom5/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# One job took a long time to complete, had to run it manually on
# swarm:
# /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
# /scratch/data/hg19/hg19.2bit:chr19:50000000-59128983 \
# /scratch/data/monDom5/monDom5.2bit:chr4:390000000-420000000 \
# ../DEF \
# ../psl/hg19.2bit:chr19:50000000-59128983/hg19.2bit:chr19:50000000-59128983_monDom5.2bit:chr4:390000000-420000000.psl
# took about 48 hours, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-continue=cat > cat.log 2>&1 &
# real 1508m18.471s == about 25h08m
cat fb.hg19.chainMonDom5Link.txt
# 415997117 bases of 2897316137 (14.358%) in intersection
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
# real 20m29.049s
mkdir /hive/data/genomes/monDom5/bed/blastz.hg19.swap
cd /hive/data/genomes/monDom5/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-swap -syntenicNet > swap.log 2>&1 &
# real 297m13.041s
cat fb.monDom5.chainHg19Link.txt
# 406727849 bases of 3501660299 (11.615%) in intersection
##############################################################################
# LASTZ Armadillo DasNov2 (DONE - 2009-05-23,28 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
cd /hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
cat << '_EOF_' > DEF
# Human vs. Armadillo
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Armadillo
SEQ2_DIR=/scratch/data/dasNov2/dasNov2.2bit
SEQ2_LEN=/scratch/data/dasNov2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# finished the lastz run manually after hive maintenance outages
# then, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-continue=cat > cat.log 2>&1 &
# real 458m11.304s
cat fb.hg19.chainDasNov2Link.txt
# 971847303 bases of 2897316137 (33.543%) in intersection
time nice -n +19 doRecipBest.pl -buildDir=`pwd` hg19 dasNov2 \
> rbest.log 2>&1
# time about 6h30m
##############################################################################
# LASTZ Rock Hyrax ProCap1 (DONE - 2009-05-23,26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
cd /hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
cat << '_EOF_' > DEF
# Human vs. Rock Hyrax
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Rock Hyrax
SEQ2_DIR=/scratch/data/proCap1/proCap1.2bit
SEQ2_LEN=/scratch/data/proCap1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# Completed: 997438 of 997438 jobs
# CPU time in finished jobs: 32830587s 547176.45m 9119.61h 379.98d 1.041 y
# IO & Wait Time: 9549484s 159158.07m 2652.63h 110.53d 0.303 y
# Average job time: 42s 0.71m 0.01h 0.00d
# Longest finished job: 1953s 32.55m 0.54h 0.02d
# Submission to last job: 67216s 1120.27m 18.67h 0.78d
# finished lastz run manually, then continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-continue=cat > cat.log 2>&1 &
# real 369m1.678s
cat fb.hg19.chainProCap1Link.txt
# 894221652 bases of 2897316137 (30.864%) in intersection
time nice -n +19 doRecipBest.pl -buildDir=`pwd` hg19 proCap1 \
> rbest.log 2>&1
# real 251m59.549s
##############################################################################
# LASTZ Zebra Finch TaeGut1 (DONE - 2009-05-26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
cd /hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
cat << '_EOF_' > DEF
# human vs Zebra Finch
# distant from Human settings
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Zebra Finch taeGut1 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit
SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/taeGut1/taeGut1.blastz.2bit
SEQ2_CTGLEN=/hive/data/genomes/taeGut1/taeGut1.blastz.sizes
SEQ2_LIFT=/hive/data/genomes/taeGut1/jkStuff/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-qRepeats=windowmaskerSdust > do.log 2>&1 &
cat fb.hg19.chainTaeGut1Link.txt
# real 192m48.479s
# 101295490 bases of 2897316137 (3.496%) in intersection
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet -noLoadChainSplit -chainMinScore=5000 \
-chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-continue=syntenicNet -qRepeats=windowmaskerSdust > synNet.log 2>&1 &
# real 4m10.261s
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/taeGut1/bed/blastz.hg19.swap
cd /hive/data/genomes/taeGut1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26/DEF \
-swap -noLoadChainSplit -chainMinScore=5000 \
-chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-qRepeats=windowmaskerSdust > swap.log 2>&1 &
# real 16m45.080s
cat fb.taeGut1.chainHg19Link.txt
# 95320369 bases of 1222864691 (7.795%) in intersection
##############################################################################
# LASTZ Lizard AnoCar1 (DONE - 2009-05-30,31 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
cd /hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
cat << '_EOF_' > DEF
# human vs lizard
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Lizard anoCar1
SEQ2_DIR=/scratch/data/anoCar1/anoCar1.2bit
SEQ2_LEN=/scratch/data/anoCar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-qRepeats=windowmaskerSdust > do.log 2>&1 &
# real 168m32.016s
cat fb.hg19.chainAnoCar1Link.txt
# 104045950 bases of 2897316137 (3.591%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 anoCar1 > rbest.log 2>&1
# real 45m58.001s
# running syntenic Net 2009-08-27 - Hiram
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-continue=syntenicNet -syntenicNet \
-qRepeats=windowmaskerSdust > syntenicNet.log 2>&1 &
# real 6m13.304s
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/anoCar1/bed/blastz.hg19.swap
cd /hive/data/genomes/anoCar1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-swap -qRepeats=windowmaskerSdust > swap.log 2>&1 &
# real 34m55.857s
cat fb.anoCar1.chainHg19Link.txt
# 89608316 bases of 1741478929 (5.146%) in intersection
##############################################################################
# LASTZ X. tropicalis XenTro2 (DONE - 2009-05-26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
cd /hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
cat << '_EOF_' > DEF
# human vs X. tropicalis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: X. tropicalis xenTro2
SEQ2_DIR=/scratch/data/xenTro2/xenTro2.2bit
SEQ2_LEN=/scratch/data/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 1129m11.568s
# finished the lastz run manually after hive difficulties, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-continue=cat > cat.log 2>&1 &
# time about 1h30m
cat fb.hg19.chainXenTro2Link.txt
# 92015242 bases of 2897316137 (3.176%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/xenTro2/bed/blastz.hg19.swap
cd /hive/data/genomes/xenTro2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-swap > swap.log 2>&1 &
# real 130m53.860s
cat fb.xenTro2.chainHg19Link.txt
# 92070065 bases of 1359412157 (6.773%) in intersection
##############################################################################
# LASTZ Zebrafish DanRer5 (DONE - 2009-05-26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
cd /hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
cat << '_EOF_' > DEF
# human vs. zebrafish
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Zebrafish danRer5
SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
SEQ2_LEN=/scratch/data/danRer5/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=40
BASE=/hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 311m39.817s
cat fb.hg19.chainDanRer5Link.txt
# 74229561 bases of 2897316137 (2.562%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/danRer5/bed/blastz.hg19.swap
cd /hive/data/genomes/danRer5/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-swap > swap.log 2>&1 &
# real 26m54.605s
cat fb.danRer5.chainHg19Link.txt
# 73852780 bases of 1435609608 (5.144%) in intersection
##############################################################################
# LASTZ Platypus OrnAna1 (DONE - 2009-05-26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
cd /hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
cat << '_EOF_' > DEF
# human vs platypus
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Platypus ornAna1
SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit
SEQ2_LEN=/scratch/data/ornAna1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 572m18.808s
cat fb.hg19.chainOrnAna1Link.txt
# 220977689 bases of 2897316137 (7.627%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 ornAna1 > rbest.log 2>&1
# time about 1h32m
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/ornAna1/bed/blastz.hg19.swap
cd /hive/data/genomes/ornAna1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26/DEF \
-swap -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> swap.log 2>&1 &
# real 146m52.638s
cat fb.ornAna1.chainHg19Link.txt
# 207415519 bases of 1842236818 (11.259%) in intersection
##############################################################################
# LASTZ Elephant LoxAfr2 (DONE - 2009-05-27,29 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
cd /hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
cat << '_EOF_' > DEF
# Human vs. Elephant
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Elephant
SEQ2_DIR=/scratch/data/loxAfr2/loxAfr2.2bit
SEQ2_LEN=/scratch/data/loxAfr2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# time about 3h23m
cat fb.hg19.chainLoxAfr2Link.txt
# 1018502258 bases of 2897316137 (35.153%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 loxAfr2 > rbest.log 2>&1
# real 322m37.502s
##############################################################################
# LASTZ Tenrec EchTel1 (DONE - 2009-05-27 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
cd /hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
cat << '_EOF_' > DEF
# Human vs. Tenrec
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tenrec
SEQ2_DIR=/scratch/data/echTel1/echTel1.2bit
SEQ2_LEN=/scratch/data/echTel1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 1153m34.595s
cat fb.hg19.chainEchTel1Link.txt
# 669856841 bases of 2897316137 (23.120%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 echTel1 > rbest.log 2>&1
# time about 7h13m
##############################################################################
# LASTZ Tree Shrew TupBel1 (DONE - 2009-05-27,06-02 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
cd /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
cat << '_EOF_' > DEF
# Human vs. Tree Shrew
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tree Shrew
SEQ2_DIR=/scratch/data/tupBel1/tupBel1.2bit
SEQ2_LEN=/scratch/data/tupBel1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
> do.log 2>&1 &
# real 811m54.095s
# having trouble with pk, finished manually
# XXX there is one job that is taking forever ...
# finished it in pieces on swarm in a few minutes, like this:
mkdir /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27/run.blastz/lastJob
cd /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27/run.blastz/lastJob
#!/bin/sh
# Manually finish the one stuck blastz job by breaking its 10 Mbp target
# window (chr1:100,000,000-110,010,000) into ten smaller pieces:
# 1,010,000 bp windows advanced 1 Mbp per step (the extra 10,000 bp is
# the overlap, matching SEQ1_LAP=10000 in the DEF file).
S=100000000
E=101010000
export S E
for I in 0 1 2 3 4 5 6 7 8 9
do
echo $S $E
# align one target window against query partition part019, writing the
# piece's psl into ./psl/ with the same naming the cluster run would use
/usr/bin/time -p /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
/scratch/data/hg19/nib/chr1.nib:chr1:${S}-${E} ../qParts/part019.lst \
../../DEF psl/chr1.nib:chr1:${S}-${E}_part019.lst.psl
# advance the window by 1 Mbp (awk used here just for integer arithmetic)
nextS=`echo $S | awk '{printf "%d", $1 + 1000000}'`
nextE=`echo $E | awk '{printf "%d", $1 + 1000000}'`
S=$nextS
E=$nextE
done
grep -h "^#" psl/chr* | sort -u > result.psl
grep -h -v "^#" psl/chr* | sort -k14,14 -k16,16n >> result.psl
cp -p result.psl \
../../psl/chr1.nib:chr1:100000000-110010000/chr1.nib:chr1:100000000-110010000_part019.lst.psl
# then, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
-continue=cat > cat.log 2>&1 &
# real 212m22.707s
time doRecipBest.pl -buildDir=`pwd` hg19 tupBel1 > rbest.log 2>&1
# time about 4h22m
##############################################################################
# LASTZ Shrew SorAra1 (DONE - 2009-05-28,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
cd /hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
cat << '_EOF_' > DEF
# Human vs. Shrew
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Shrew
SEQ2_DIR=/scratch/data/sorAra1/sorAra1.2bit
SEQ2_LEN=/scratch/data/sorAra1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# time about 23h26m
cat fb.hg19.chainSorAra1Link.txt
# 572519288 bases of 2897316137 (19.760%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 sorAra1 > rbest.log 2>&1
# real 251m20.055s
##############################################################################
# LASTZ Rabbit OryCun1 (DONE - 2009-05-28,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
cd /hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
cat << '_EOF_' > DEF
# Human vs. Rabbit
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rabbit
SEQ2_DIR=/scratch/data/oryCun1/oryCun1.2bit
SEQ2_LEN=/scratch/data/oryCun1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# time about 23h09m
cat fb.hg19.chainOryCun1Link.txt
# 975693323 bases of 2897316137 (33.676%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 oryCun1 > rbest.log 2>&1
# real 318m1.142s
##############################################################################
# LASTZ Hedgehog EriEur1 (DONE - 2009-05-28,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
cd /hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
cat << '_EOF_' > DEF
# Human vs. Hedgehog
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Hedgehog
SEQ2_DIR=/scratch/data/eriEur1/eriEur1.2bit
SEQ2_LEN=/scratch/data/eriEur1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=500
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# real 2043m33.198s
cat fb.hg19.chainEriEur1Link.txt
# 560965051 bases of 2897316137 (19.362%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 eriEur1 > rbest.log 2>&1
# real 350m17.737s
##############################################################################
# LASTZ Pika OchPri2 (DONE - 2009-05-29,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
cd /hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
cat << '_EOF_' > DEF
# Human vs. Pika
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Pika
SEQ2_DIR=/scratch/data/ochPri2/ochPri2.2bit
SEQ2_LEN=/scratch/data/ochPri2/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 393m42.569s
cat fb.hg19.chainOchPri2Link.txt
# 804516397 bases of 2897316137 (27.768%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 ochPri2 > rbest.log 2>&1
# real 224m47.979s
##############################################################################
# LASTZ Kangaroo Rat DipOrd1 (DONE - 2009-05-29,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
cd /hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
cat << '_EOF_' > DEF
# Human vs. Kangaroo Rat
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Kangaroo Rat
SEQ2_DIR=/scratch/data/dipOrd1/dipOrd1.2bit
SEQ2_LEN=/scratch/data/dipOrd1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 688m47.595s
time doRecipBest.pl -buildDir=`pwd` hg19 dipOrd1 > rbest.log 2>&1
# real 140m42.014s
##############################################################################
# LIFTOVER TO Hg18 (DONE - 2009-06-04 - Hiram )
mkdir /hive/data/genomes/hg19/bed/blat.hg18.2009-06-04
cd /hive/data/genomes/hg19/bed/blat.hg18.2009-06-04
# -debug run to create run dir, preview scripts...
# verifies files can be found
doSameSpeciesLiftOver.pl -debug hg19 hg18
# Real run:
time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
hg19 hg18 > do.log 2>&1
# real 115m26.071s
#############################################################################
# BLASTZ/CHAIN/NET/ETC 11 GENOMES TO HG19 (DONE, Andy 2009-06-06)
ssh hgwdev
cd /hive/data/genomes/hg19/bed
mkdir lastz{SpeTri1,FelCat3,CavPor3,BosTau4,PteVam1,EquCab2,VicPac1,MyoLuc1,TurTru1,ChoHof1}.2009-06-04
ln -s lastzSpeTri1.2009-06-04 lastz.speTri1
ln -s lastzFelCat3.2009-06-04 lastz.felCat3
ln -s lastzCavPor3.2009-06-04 lastz.cavPor3
ln -s lastzBosTau4.2009-06-04 lastz.bosTau4
ln -s lastzPteVam1.2009-06-04 lastz.pteVam1
ln -s lastzEquCab2.2009-06-04 lastz.equCab2
ln -s lastzVicPac1.2009-06-04 lastz.vicPac1
ln -s lastzMyoLuc1.2009-06-04 lastz.myoLuc1
ln -s lastzTurTru1.2009-06-04 lastz.turTru1
ln -s lastzChoHof1.2009-06-04 lastz.choHof1
cat > lastz.speTri1/DEF << 'EOF'
# human vs squirrel
# TARGET: human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: squirrel speTri1
SEQ2_DIR=/hive/data/genomes/speTri1/speTri1.2bit
SEQ2_LEN=/hive/data/genomes/speTri1/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastz.speTri1
TMPDIR=/scratch/tmp
EOF
sed 's/speTri1/felCat3/g; s/squirrel/cat/;' lastz.speTri1/DEF | \
sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/; s/SEQ2_LIMIT=1/SEQ2_LIMIT=3/' \
> lastz.felCat3/DEF
sed 's/speTri1/cavPor3/g; s/squirrel/guinea pig/;' lastz.speTri1/DEF | \
sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/' | \
sed 's/hive\/data\/genomes\/cavPor3/scratch\/data\/cavPor3/' \
> lastz.cavPor3/DEF
sed 's/speTri1/bosTau4/g; s/squirrel/cow/;' lastz.speTri1/DEF | \
sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/; s/SEQ2_LIMIT=1/SEQ2_LIMIT=3/' \
> lastz.bosTau4/DEF
sed 's/speTri1/pteVam1/g; s/squirrel/megabat/;' lastz.speTri1/DEF | \
sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/; s/SEQ2_LIMIT=1/SEQ2_LIMIT=2/' \
> lastz.pteVam1/DEF
sed 's/cavPor3/equCab2/g; s/guinea pig/horse/' lastz.cavPor3/DEF | \
sed 's/SEQ2_LIMIT=1/SEQ2_LIMIT=3/' > lastz.equCab2/DEF
sed 's/equCab2/vicPac1/g; s/horse/alpaca/' lastz.equCab2/DEF > lastz.vicPac1/DEF
sed 's/pteVam1/myoLuc1/g; s/megabat/microbat/' lastz.pteVam1/DEF | \
sed 's/SEQ2_LIMIT=3/SEQ2_LIMIT=2/' > lastz.myoLuc1/DEF
sed 's/equCab2/turTru1/g; s/horse/dolphin/' lastz.equCab2/DEF | \
sed 's/SEQ2_LIMIT=3/SEQ2_LIMIT=2/' > lastz.turTru1/DEF
sed 's/equCab2/choHof1/g; s/horse/sloth/' lastz.equCab2/DEF > lastz.choHof1/DEF
cd andy/
for db in speTri1 felCat3 cavPor3 bosTau4 pteVam1 equCab2 vicPac1 myoLuc1 turTru1 choHof1; do
ln -s ../lastz.${db}/DEF ${db}.DEF
done
screen -S speTri1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium speTri1.DEF >& speTri1.do.log
# [detach screen]
#real 2059m30.699s
screen -S felCat3
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium felCat3.DEF >& felCat3.do.log
# [detach screen]
#real 1574m47.522s
screen -S bosTau4
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium bosTau4.DEF >& bosTau4.do.log
# [detach screen]
#real 1474m54.655s
screen -S pteVam1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium pteVam1.DEF >& pteVam1.do.log
# [detach screen]
#real 1168m33.923s
screen -S equCab2
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium -syntenicNet equCab2.DEF >& equCab2.do.log
# [detach screen]
#real 1662m56.158s
# (included syntenic net)
screen -S vicPac1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium vicPac1.DEF >& vicPac1.do.log
# [detach screen]
#real 1495m48.173s
screen -S turTru1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium turTru1.DEF >& turTru1.do.log
# [detach screen]
#real 1079m17.234s
screen -S choHof1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium choHof1.DEF >& choHof1.do.log
# [detach screen]
#real 1310m49.287s (script and cluster run stopped after halfway...
# pk was too slow... remaining jobs started on swarm)
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium -continue=cat \
choHof1.DEF >& choHof1.doAfterBlastz.log
#real 257m32.701s
screen -S cavPor3
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \
-smallClusterHub=memk -bigClusterHub=pk cavPor3.DEF >& cavPor3.do.log
# [detach screen]
#real 1370m5.258s
# TROUBLE! got to the 'load' step and failed. This one needs a special
# chain table and chainLink table to get loaded.
cd ../lastz.cavPor3/axtChain/
# figure out number of rows and average length
wc -l *.tab
# 27186468 chain.tab
# 240602108 link.tab
randomLines link.tab 10000000 stdout | awk '{print length($0)}' | sort | uniq -c
randomLines chain.tab 1000000 stdout | awk '{print length($0)}' | sort | uniq -c
# about 43 average length for the chainLink and 100 for the chain
sed "s/hgLoadChain.*/hgsqldump hg19 chainSpeTri1Link --no-data --skip-comments | sed \'s\/SpeTri1\/CavPor3\/; s\/TYPE=MyISAM\/ENGINE=MyISAM max_rows=241000000 avg_row_length=43 pack_keys=1 CHARSET=latin1\/\' | hgsql hg19 \n\
hgsqldump hg19 chainSpeTri1 --no-data --skip-comments | sed \'s\/SpeTri1\/CavPor3\/; s\/TYPE=MyISAM\/ENGINE=MyISAM max_rows=27200000 avg_row_length=100 pack_keys=1 CHARSET=latin1\/\' | hgsql hg19 \n\
hgsql hg19 -e \"load data local infile \'chain.tab\' into table chainCavPor3\"\n\
hgsql hg19 -e \"load data local infile \'link.tab\' into table chainCavPor3Link\"\n\
hgsql hg19 -e \"INSERT into history (ix, startId, endId, who, what, modTime, errata) VALUES(NULL,0,0,\'aamp\',\'Loaded 27186468 chains into cavPor3 chain table manually\', NOW(), NULL)\"\
/" loadUp.csh > manualLoadUp.csh
chmod +x manualLoadUp.csh
time nice -n +19 ./manualLoadUp.csh
# [detach screen]
#real 584m4.093s
cd ../../andy/
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \
-smallClusterHub=memk -bigClusterHub=swarm -continue=download \
cavPor3.DEF >& cavPor3.doAfterLoad.log
#real 5m45.122s
# syntenic nets
screen -r bosTau4
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium -syntenicNet \
-continue=syntenicNet bosTau4.DEF >& bosTau4.syn.log
#real 31m48.545s
# reciprocal best choHof1 and cavPor3
screen -r choHof1
time nice -n +19 doRecipBest.pl -buildDir=/hive/data/genomes/hg19/bed/lastz.choHof1 \
-workhorse=hgwdev hg19 choHof1 >& choHof1.doRecip.log
#real 367m52.993s
screen -r cavPor3
time nice -n +19 doRecipBest.pl -buildDir=/hive/data/genomes/hg19/bed/lastz.cavPor3 \
-workhorse=hgwdev hg19 cavPor3 >& cavPor3.doRecip.log
#real 123m3.795s
# reciprocal best small six genome memk run
screen -S recipRun
mkdir recipRun
cd recipRun/
cat > gsub << 'EOF'
#LOOP
./doRecip.sh $(path1)
#ENDLOOP
EOF
cat > doRecip.sh << 'EOF'
#!/bin/csh -ef
set db = $1
/cluster/bin/scripts/doRecipBest.pl -workhorse=`uname -n` -stop=recipBest -buildDir=/hive/data/genomes/hg19/bed/lastz.$db hg19 $db >& $db.recipBest.log
EOF
chmod +x doRecip.sh
cat > db.lst << 'EOF'
speTri1
vicPac1
myoLuc1
turTru1
pteVam1
felCat3
EOF
ssh memk
cd /hive/data/genomes/hg19/bed/andy/recipRun
gensub2 db.lst single gsub jobList
para create jobList
para push
# finished overnight
exit # to hgwdev
for log in *.recipBest.log; do
db=${log%.recipBest.log};
echo $db;
doRecipBest.pl -workhorse=hgwdev -continue=download \
-buildDir=/hive/data/genomes/hg19/bed/lastz.$db \
hg19 $db >& $db.recipBestDownload.log;
done
# swaps for equCab2, felCat3, bostTau4, cavPor3
cd /hive/data/genomes/hg19/bed/andy
screen -r equCab2
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=kkr14u01 \
-chainMinScore=3000 -chainLinearGap=medium -swap equCab2.DEF >& equCab2.doSwap.log
# [detach screen]
#real 486m35.206s
screen -r felCat3
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=kkr14u02 \
-chainMinScore=3000 -chainLinearGap=medium -swap felCat3.DEF >& felCat3.doSwap.log
# [detach screen]
#real 463m5.257s
screen -r bosTau4
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=kkr14u03 \
-chainMinScore=3000 -chainLinearGap=medium -swap bosTau4.DEF >& bosTau4.doSwap.log
# [detach screen]
#real 391m40.132s
screen -r cavPor3
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=hgwdev \
-chainMinScore=3000 -chainLinearGap=medium -swap cavPor3.DEF >& cavPor3.doSwap.log
# [detach screen]
#real 192m39.792s
##########################################################################
# LASTZ Venter's Poodle canFamPoodle1 (DONE - 2009-06-05,10 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzCanFamPoodle1.2009-06-05
cd /hive/data/genomes/hg19/bed/lastzCanFamPoodle1.2009-06-05
cat << '_EOF_' > DEF
# human vs Venter's poodle
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog CanFam2
SEQ2_DIR=/scratch/data/canFamPoodle1/canFamPoodle1.2bit
SEQ2_LEN=/scratch/data/canFamPoodle1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LAP=0
SEQ2_LIMIT=600
BASE=/hive/data/genomes/hg19/bed/lastzCanFamPoodle1.2009-06-05
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noDbNameCheck -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium
# real 5162m58.743s
cat fb.hg19.chainCanFamPoodle1Link.txt
# 898034247 bases of 2897316137 (30.995%) in intersection
# the original canFam2 measured:
# 1532073507 bases of 2897316137 (52.879%) in intersection
time nice -n +19 doRecipBest.pl -buildDir=`pwd` \
hg19 canFamPoodle1 > rbest.log 2>&1 &
# real 811m27.965s
##############################################################################
## 46-Way Multiz (WORKING - 2009-06-09 - Hiram)
mkdir /hive/data/genomes/hg19/bed/multiz46way
cd /hive/data/genomes/hg19/bed/multiz46way
# starting with the 46way tree created from 44 way tree
cat << '_EOF_' > 46way.nh
(((((((((((((((((
((hg19:0.006591,panTro2:0.006639):0.002184,gorGor1:0.009411):0.009942,
ponAbe2:0.018342):0.014256,rheMac2:0.036199):0.021496,papHam1:0.04):0.02,
calJac1:0.066389):0.056911,tarSyr1:0.135169):0.011307,
(micMur1:0.091452,otoGar1:0.128984):0.035463):0.015304,
tupBel1:0.183583):0.004688,(((((mm9:0.083220,rn4:0.090564):0.196605,
dipOrd1:0.209532):0.022555,cavPor3:0.223415):0.009828,
speTri1:0.146894):0.025042,
(oryCun2:0.116009,ochPri2:0.198295):0.100037):0.015355):0.020666,
(((vicPac1:0.105252,(turTru1:0.064182,bosTau4:0.121911):0.025111):0.039691,
((equCab2:0.107726,(felCat3:0.097971,canFam2:0.100888):0.049486):0.006252,
(myoLuc1:0.141155,pteVam1:0.111787):0.033187):0.004179):0.011699,
(eriEur1:0.220580,sorAra1:0.266859):0.056117):0.021065):0.023276,
(((loxAfr3:0.083775,proCap1:0.152633):0.026190,echTel1:0.240221):0.049905,
(dasNov2:0.115179,choHof1:0.096272):0.052373):0.006713):0.132748,
macEug1:0.3):0.1,
monDom5:0.325899):0.072430,ornAna1:0.453916):0.109903,
((galGal3:0.166386,taeGut1:0.170717):0.199763,
anoCar1:0.509545):0.108130):0.166150,xenTro2:0.852482):0.300396,
(((tetNig2:0.224774,fr2:0.205294):0.191836,
(gasAcu1:0.313967,oryLat2:0.478451):0.058404):0.322824,
danRer6:0.731166):0.155214):0.511293,petMar1:0.511293);
'_EOF_'
# << happy emacs
# Use this specification in the phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to obtain a gif image for htdocs/images/phylo/hg19_46way.gif
/cluster/bin/phast/all_dists 46way.nh > 46way.distances.txt
# Use this output to create the table below, with this perl script:
cat << '_EOF_' > sizeStats.pl
#!/usr/bin/env perl
# Summarize each species paired with hg19: phylogenetic distance from
# 46way.distances.txt plus the featureBits chainLink coverage percentage
# on hg19 and, when a swap run exists, on the other genome.  Output is
# the commented table below, sorted by increasing distance.
use strict;
use warnings;
# NOTE(review): "grep -y" is an obsolete synonym for "grep -i"; modern
# GNU grep rejects -y, so this pipeline may need updating on newer hosts.
open (FH, "grep -y hg19 46way.distances.txt | sort -k3,3n|") or
die "can not read 46way.distances.txt";
my $count = 0;
while (my $line = <FH>) {
chomp $line;
# each line: hg19 <otherDb> <distance>
my ($hg19, $D, $dist) = split('\s+', $line);
my $chain = "chain" . ucfirst($D);
# featureBits result file for the hg19-target chain, e.g.
# /hive/data/genomes/hg19/bed/lastz.<db>/fb.hg19.chain<Db>Link.txt
my $B="/hive/data/genomes/hg19/bed/lastz.$D/fb.hg19." .
$chain . "Link.txt";
# field 5 of the featureBits line is "(NN.NNN%)"; strip the parens
my $chainLinkMeasure =
`awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
chomp $chainLinkMeasure;
$chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
$chainLinkMeasure =~ s/\%//;
# the swap measurement only exists if a blastz.hg19.swap run was done
my $swapFile="/hive/data/genomes/${D}/bed/blastz.hg19.swap/fb.${D}.chainHg19Link.txt";
my $swapMeasure = "N/A";
if ( -s $swapFile ) {
$swapMeasure =
`awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
chomp $swapMeasure;
$swapMeasure = 0.0 if (length($swapMeasure) < 1);
$swapMeasure =~ s/\%//;
}
# common-name organism label from the central database
my $orgName=
`hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
chomp $orgName;
if (length($orgName) < 1) {
$orgName="N/A";
}
++$count;
if ($swapMeasure eq "N/A") {
printf "# %02d %.4f - %s %s\t(%% %.3f) (%s)\n", $count, $dist,
$orgName, $D, $chainLinkMeasure, $swapMeasure
} else {
printf "# %02d %.4f - %s %s\t(%% %.3f) (%% %.3f)\n", $count, $dist,
$orgName, $D, $chainLinkMeasure, $swapMeasure
}
}
close (FH);
'_EOF_'
# << happy emacs
chmod +x ./sizeStats.pl
./sizeStats.pl
#
# If you can fill in all the numbers in this table, you are ready for
# the multiple alignment procedure
#
# featureBits chainLink measures
# chainOryLat1Link chain linearGap
# distance on hg19 on other minScore
# 01 0.0132 - Chimp panTro2 (% 94.846) (% 94.908)
# 02 0.0182 - Gorilla gorGor1 (% 59.484) (N/A)
# 03 0.0371 - Orangutan ponAbe2 (% 91.350) (% 89.617)
# 04 0.0692 - Rhesus rheMac2 (% 82.744) (% 87.422)
# 05 0.0945 - Baboon papHam1 (% 82.810) (N/A)
# 06 0.1409 - Marmoset calJac1 (% 70.860) (% 71.897)
# 07 0.2665 - Tarsier tarSyr1 (% 47.830) (N/A)
# 08 0.2696 - Mouse lemur micMur1 (% 46.519) (N/A)
# 09 0.3071 - Bushbaby otoGar1 (% 43.644) (N/A)
# 10 0.3343 - Horse equCab2 (% 57.050) (% 66.774)
# 11 0.3416 - TreeShrew tupBel1 (% 36.156) (N/A)
# 12 0.3451 - Dolphin turTru1 (% 48.398) (N/A)
# 13 0.3500 - Squirrel speTri1 (% 35.713) (N/A)
# 14 0.3611 - Alpaca vicPac1 (% 39.399) (N/A)
# 15 0.3620 - Sloth choHof1 (% 34.377) (N/A)
# 16 0.3653 - Megabat pteVam1 (% 45.414) (N/A)
# 17 0.3732 - Elephant loxAfr3 (% 46.636) (% 42.430)
# 18 0.3740 - Cat felCat3 (% 35.713) (% 61.104)
# 19 0.3769 - Dog canFam2 (% 52.879) (% 62.055)
# 20 0.3809 - Armadillo dasNov2 (% 33.543) (N/A)
# 21 0.3941 - Rabbit oryCun2	(% 44.317) (% 58.405)
# 22 0.3946 - Microbat myoLuc1 (% 33.174) (N/A)
# 23 0.4028 - Cow bosTau4 (% 46.506) (% 50.297)
# 24 0.4363 - Guinea Pig cavPor3 (% 43.680) (N/A)
# 25 0.4421 - Rock hyrax proCap1 (% 30.864) (N/A)
# 26 0.4450 - Kangaroo rat dipOrd1 (% 27.161) (N/A)
# 27 0.4764 - Pika ochPri2 (% 27.768) (N/A)
# 28 0.4811 - Hedgehog eriEur1 (% 19.362) (N/A)
# 29 0.5035 - Tenrec echTel1 (% 23.120) (N/A)
# 30 0.5153 - Mouse mm9 (% 35.299) (% 38.693)
# 31 0.5226 - Rat rn4 (% 32.879) (% 36.860)
# 32 0.5274 - Shrew sorAra1 (% 19.760) (N/A)
# 33 0.6394 - Wallaby macEug1 (% 6.011) (N/A)
# 34 0.7653 - Opossum monDom5 (% 14.358) (N/A)
# 35 0.9657 - Platypus ornAna1 (% 7.627) (% 11.259)
# 36 1.0960 - Chicken galGal3 (% 3.591) (% 8.786)
# 37 1.1003 - Zebra finch taeGut1 (% 3.496) (% 7.795)
# 38 1.2394 - Lizard anoCar1 (% 3.591) (% 5.146)
# 39 1.6403 - X. tropicalis xenTro2 (% 3.176) (% 6.773)
# 40 1.9387 - Stickleback gasAcu1 (% 1.916) (% 11.175)
# 41 1.9634 - Fugu fr2 (% 1.702) (% 10.929)
# 42 1.9746 - Zebrafish danRer6 (% 3.051) (% 6.399)
# 43 1.9829 - Tetraodon tetNig2 (% 1.712) (% 14.194)
# 44 2.1031 - Medaka oryLat2 (% 1.849) (% 6.705)
# 45 2.1108 - Lamprey petMar1 (% 1.082) (% 3.200)
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
46way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.list
cd /hive/data/genomes/hg19/bed/multiz46way
# bash shell syntax here ...
export H=/hive/data/genomes/hg19/bed
mkdir mafLinks
for G in `sed -e "s/hg19 //" species.list`
do
mkdir mafLinks/$G
if [ -s ${H}/lastz.${G}/mafRBestNet/chr1.maf.gz ]; then
echo "$G - recipBest"
ln -s ${H}/lastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G
else
if [ -s ${H}/lastz.${G}/mafSynNet/chr1.maf.gz ]; then
echo "$G - synNet"
ln -s ${H}/lastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G
else
if [ -s ${H}/lastz.${G}/mafNet/chr1.maf.gz ]; then
echo "$G - mafNet"
ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G
else
echo "missing directory lastz.${G}/*Net"
fi
fi
fi
done
# verify the alignment type is correct:
for D in `cat /hive/users/hiram/bigWayHg19/ordered.list`
do
ls -l mafLinks/$D/chr1.maf.gz | awk '{print $NF}'
done
# compare to the list at:
# http://genomewiki.ucsc.edu/index.php/Hg19_Genome_size_statistics
# need to split these things up into smaller pieces for
# efficient kluster run.
cd /hive/data/genomes/hg19/bed/multiz46way
mkdir mafSplit
cd mafSplit
# mafSplitPos splits on gaps or repeat areas that will not have
# any chains, approx 5 Mbp intervals, gaps at least 10,000
mafSplitPos -minGap=10000 hg19 5 stdout | sort -u \
| sort -k1,1 -k2,2n > mafSplit.bed
# There is a splitRegions.pl script here (copied from previous 44way)
# that can create a custom track from this mafSplit.bed file.
# Take a look at that in the browser and see if it looks OK,
# check the number of sections on each chrom to verify none are
# too large. Despite the claim above, it does appear that some
# areas are split where actual chains exist.
# run a small kluster job to split them all
ssh memk
cd /hive/data/genomes/hg19/bed/multiz46way/mafSplit
cat << '_EOF_' > runOne
#!/bin/csh -ef
# Split one species/chrom pairwise maf on the genome-wide mafSplit.bed
# breakpoints.  $1 = species db, $2 = chrom.  When the species has no
# maf for this chrom, emit an empty gzipped placeholder hg19_${C}.00.maf
# so the parasol "check out" requirement is still satisfied.
set G = $1
set C = $2
mkdir -p $G
pushd $G > /dev/null
if ( -s ../../mafLinks/${G}/${C}.maf.gz ) then
# remove stale pieces from any previous run before re-splitting
rm -f hg19_${C}.*.maf
mafSplit ../mafSplit.bed hg19_ ../../mafLinks/${G}/${C}.maf.gz
gzip hg19_${C}.*.maf
else
touch hg19_${C}.00.maf
gzip hg19_${C}.00.maf
endif
popd > /dev/null
'_EOF_'
# << happy emacs
chmod +x runOne
cat << '_EOF_' > template
#LOOP
runOne $(root1) $(root2) {check out line $(root1)/hg19_$(root2).00.maf}
#ENDLOOP
'_EOF_'
# << happy emacs
for G in `sed -e "s/hg19 //" ../species.list`
do
echo $G
done > species.list
cut -f 1 ../../../chrom.sizes > chr.list
gensub2 species.list chr.list template jobList
para -ram=8g create jobList
para try ... check ... push ... etc...
# Completed: 4185 of 4185 jobs
# CPU time in finished jobs: 25547s 425.78m 7.10h 0.30d 0.001 y
# IO & Wait Time: 268664s 4477.73m 74.63h 3.11d 0.009 y
# Average job time: 70s 1.17m 0.02h 0.00d
# Longest finished job: 1234s 20.57m 0.34h 0.01d
# Submission to last job: 3048s 50.80m 0.85h 0.04d
# the autoMultiz cluster run
ssh swarm
cd /hive/data/genomes/hg19/bed/multiz46way/
mkdir splitRun
cd splitRun
mkdir maf run
cd run
mkdir penn
cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn
cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn
cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn
# set the db and pairs directories here
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
# Run penn autoMZ for one split chrom piece: stage every hg19-vs-species
# pairwise maf into a node-local scratch directory, then multiz them all
# together using the 46-way tree.  $1 = chrom piece name, $2 = absolute
# path of the result maf to write back.
set db = hg19
set c = $1
set result = $2
set run = `/bin/pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/hg19/bed/multiz46way/mafSplit
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp > /dev/null
# stage one "sing" maf per species; autoMZ needs a file for every
# species in the tree, so write a bare maf header when none exists
# (or when the gzipped placeholder uncompresses to an empty file)
foreach s (`/bin/sed -e "s/ $db//" species.list`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if (-e $in.gz) then
/bin/zcat $in.gz > $out
if (! -s $out) then
echo "##maf version=1 scoring=autoMZ" > $out
endif
else if (-e $in) then
/bin/ln -s $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
# put the local penn binaries first on PATH for autoMZ's helper programs
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf \
> /dev/null
popd > /dev/null
# copy the result back, then clean up the node-local scratch space
/bin/rm -f $result
/bin/cp -p $tmp/$c.maf $result
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/hg19/bed/multiz46way/splitRun/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
find ../../mafSplit -type f | grep hg19_ | xargs -L 1 basename \
| sed -e "s/.gz//" | sort -u > chr.part.list
gensub2 chr.part.list single template jobList
para -ram=8g create jobList
# initial run experience suggest some of the big jobs reach 8 Gb
# of memory usage, so, tell parasol to limit the number of jobs per
# node to avoid thrashing
para -ram=8g try
para -ram=8g push
# Completed: 504 of 504 jobs
# CPU time in finished jobs: 1342039s 22367.32m 372.79h 15.53d 0.043 y
# IO & Wait Time: 63835s 1063.91m 17.73h 0.74d 0.002 y
# Average job time: 2789s 46.49m 0.77h 0.03d
# Longest finished job: 12625s 210.42m 3.51h 0.15d
# Submission to last job: 15300s 255.00m 4.25h 0.18d
# put the split maf results back together into a single maf file
# eliminate duplicate comments
ssh hgwdev
cd /hive/data/genomes/hg19/bed/multiz46way/splitRun
mkdir ../maf
# the sed edits take out partitioning name information from the comments
# so the multiple parts will condense to smaller number of lines
# this takes almost 2 hours of time, resulting in a bit over 150 Gb,
# almost all chrom files over 1 Gb, up to almost 10 Gb for chr2
# HOWEVER, this is actually not necessary to maintain these comments,
# they are lost during the mafAddIRows
# runOne <chrom>
# Reassemble the split per-part maf files for one chromosome into a
# single ../maf/<chrom>.maf: first header line, then de-duplicated
# comments (with partition names edited out), then all alignment
# records in part order, then the trailing comment line.
# NOTE: create with '>' (not '>>'): appending to an existing runOne
# would duplicate the script body and break it on a re-run.
cat << '_EOF_' > runOne
#!/bin/csh -fe
set C = $1
if ( -s ../maf/${C}.maf.gz ) then
rm -f ../maf/${C}.maf.gz
endif
head -q -n 1 maf/hg19_${C}.*.maf | sort -u > ../maf/${C}.maf
grep -h "^#" maf/hg19_${C}.*.maf | egrep -v "maf version=1|eof maf" | \
sed -e "s#${C}.[0-9][0-9]*#${C}#g; s#_MZ_[^ ]* # #g;" \
| sort -u >> ../maf/${C}.maf
grep -h -v "^#" `ls maf/hg19_${C}.*.maf | sort -t. -k2,2n` >> ../maf/${C}.maf
tail -q -n 1 maf/hg19_${C}.*.maf | sort -u >> ../maf/${C}.maf
'_EOF_'
# << happy emacs
chmod +x runOne
# gensub2 template: one runOne job per chromosome; the check verifies
# the output maf exists and is non-empty.
# NOTE: create with '>' (not '>>') so a re-run does not append a
# duplicate LOOP body to an existing template file.
cat << '_EOF_' > template
#LOOP
runOne $(root1) {check out exists+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 ../../../chrom.sizes > chr.list
ssh encodek
cd /hive/data/genomes/hg19/bed/multiz46way/splitRun
gensub2 chr.list single template jobList
para create jobList
para try ... check ... push ... etc ...
# Completed: 92 of 93 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 412s 6.86m 0.11h 0.00d 0.000 y
# IO & Wait Time: 21187s 353.12m 5.89h 0.25d 0.001 y
# Average job time: 235s 3.91m 0.07h 0.00d
# Longest finished job: 1529s 25.48m 0.42h 0.02d
# Submission to last job: 1542s 25.70m 0.43h 0.02d
# one of the results is completely empty, the grep for results failed
# this file ../maf/chrUn_gl000226.maf only has header comments, no result
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/hg19/multiz46way/maf
cd /hive/data/genomes/hg19/bed/multiz46way/maf
ln -s `pwd`/*.maf /gbdb/hg19/multiz46way/maf
# this generates an immense multiz46way.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /data/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/hg19/multiz46way/maf hg19 multiz46way
# Loaded 33558634 mafs in 93 files from /gbdb/hg19/multiz46way/maf
# real 512m8.053s
# load summary table
time nice -n +19 cat /gbdb/hg19/multiz46way/maf/*.maf \
| $HOME/bin/$MACHTYPE/hgLoadMafSummary hg19 -minSize=30000 -verbose=2 \
-mergeGap=1500 -maxSize=200000 multiz46waySummary stdin
# real 92m30.700s
# flushSummaryBlocks: output 45 blocks
# Created 8766427 summary blocks from 645238409 components and
# 33558634 mafs from stdin
# blocks too small to be used: 29456
# Loading into hg19 table multiz46waySummary...
# Gap Annotation
# prepare bed files with gap info
mkdir /hive/data/genomes/hg19/bed/multiz46way/anno
cd /hive/data/genomes/hg19/bed/multiz46way/anno
mkdir maf run
# most of these will already exist from previous multiple alignments
# remove the echo from in front of the twoBitInfo command to get them
# to run if this loop appears to be correct
for DB in `cat ../species.list`
do
CDIR="/hive/data/genomes/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
cd run
rm -f nBeds sizes
for DB in `sed -e "s/hg19 //" ../../species.list`
do
echo "${DB} "
ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
# the annotation step requires large memory, run on memk nodes
ssh memk
cd /hive/data/genomes/hg19/bed/multiz46way/anno/run
ls ../../maf | sed -e "s/.maf//" > chr.list
cat << '_EOF_' > template
#LOOP
./anno.csh $(root1) {check out line+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
# anno.csh <chrom>
# Annotate one chromosome's maf with iRows (per-species gap/N status)
# using the nBeds/sizes lists prepared above; output goes to ../maf/.
cat << '_EOF_' > anno.csh
#!/bin/csh -fe
set inMaf = ../../maf/$1.maf
set outMaf = ../maf/$1.maf
# remove any stale result so a partial file from a failed run
# cannot be mistaken for a finished one
rm -f $outMaf
mafAddIRows -nBeds=nBeds $inMaf /hive/data/genomes/hg19/hg19.2bit $outMaf
'_EOF_'
# << happy emacs
chmod +x anno.csh
gensub2 chr.list single template jobList
para -ram=30g create jobList
# specify lots of ram to get one job per node
para -ram=30g push
#
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 10371s 172.85m 2.88h 0.12d 0.000 y
# IO & Wait Time: 3365s 56.09m 0.93h 0.04d 0.000 y
# Average job time: 148s 2.46m 0.04h 0.00d
# Longest finished job: 1153s 19.22m 0.32h 0.01d
# Submission to last job: 7402s 123.37m 2.06h 0.09d
ssh hgwdev
rm -fr /gbdb/hg19/multiz46way/maf
mkdir /gbdb/hg19/multiz46way/maf
cd /hive/data/genomes/hg19/bed/multiz46way/anno/maf
ln -s `pwd`/*.maf /gbdb/hg19/multiz46way/maf/
# by loading this into the table multiz46way, it will replace the
# previously loaded table with the unannotated mafs
# huge temp files are made, do them on local disk
cd /data/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/hg19/multiz46way/maf hg19 multiz46way
# real 113m11.709s
# Loaded 33612571 mafs in 93 files from /gbdb/hg19/multiz46way/maf
XXX - done to here
time nice -n +19 cat /gbdb/hg19/multiz46way/maf/*.maf \
| hgLoadMafSummary hg19 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz46waySummary stdin
# with the quality annotated mafs, and mem interference on hgwdev:
# Created 8514381 summary blocks from 600504256 components \
# and 33320838 mafs from stdin
# real 169m56.936s
# with the Irow annotations after the multiz fix:
# Created 8514380 summary blocks from 600499937
# components and 33298894 mafs from stdin
# real 184m42.893s
# user 70m44.431s
# sys 8m7.970s
# Created 8514078 summary blocks from 604683213 components
# and 35125649 mafs from stdin
# real 130m55.115s
# user 71m37.409s
# sys 8m5.110s
# by loading this into the table multiz46waySummary, it will replace
# the previously loaded table with the unannotated mafs
# remove the multiz46way*.tab files in this /data/tmp directory
# -rw-rw-r-- 1 1949221892 Nov 15 14:04 multiz46way.tab
# -rw-rw-r-- 1 417994189 Nov 15 20:57 multiz46waySummary.tab
wc -l multiz46way*.tab
# 33964377 multiz46way.tab
# 8514078 multiz46waySummary.tab
# 42478455 total
rm multiz46way*.tab
# create some downloads
mkdir -p /hive/data/genomes/hg19/bed/multiz46way/download/maf
cd /hive/data/genomes/hg19/bed/multiz46way/download/maf
time cp -p ../../anno/maf/chr*.maf .
# real 72m46.514s
# user 0m1.293s
# sys 5m15.981s
time gzip --rsyncable *.maf
# real 185m37.884s
# user 179m51.161s
# sys 3m48.016s
time md5sum *.gz > md5sum.txt
# real 3m59.009s
# user 1m19.338s
# sys 0m18.976s
##############################################################################
# LASTZ Sea Hare aplCal1 (STARTING - 2009-06-08 - Galt)
mkdir /hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08
cd /hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08
cat << '_EOF_' > DEF
# Human vs. Sea Hare
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ2_LIMIT=5
# QUERY: Sea Hare aplCal1
SEQ2_DIR=/scratch/data/aplCal1/aplCal1.2bit
SEQ2_LEN=/scratch/data/aplCal1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
# (NOTE I SHOULD NOT HAVE USED -qRepeats=windowmaskerSdust)
screen
time nice +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
>& do.log &
# real ?? about one hour but one job hung
# resuming from failure
# edited loadUp.csh, commenting out the first completed step
# and removing the unneeded -qRepeats=windowmaskerSdust
# from the next step, now run it to complete the load step.
/hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08/axtChain/loadUp.csh \
>& continue-loadUp.log&
# continue from step 'download'
time nice +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
-continue download \
>& continue-download.log &
cat fb.hg19.chainAplCal1Link.txt
# 19675762 bases of 2897316137 (0.679%) in intersection
# running the swap - DONE - 2009-06-02
# (NOTE I SHOULD NOT HAVE USED -qRepeats=windowmaskerSdust)
mkdir /hive/data/genomes/aplCal1/bed/blastz.hg19.swap
cd /hive/data/genomes/aplCal1/bed/blastz.hg19.swap
time nice +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
-swap >& swap.log &
# real time not long
# resuming from failure
# edited loadUp.csh, commenting out the first completed step
# and removing the unneeded -tRepeats=windowmaskerSdust
# from the next step, now run it to complete the load step.
/hive/data/genomes/aplCal1/bed/blastz.hg19.swap/axtChain/loadUp.csh \
>& continue-loadUp.log&
time nice +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
-continue download \
-swap >& continue-download.log &
cat fb.aplCal1.chainHg19Link.txt
# 14163455 bases of 619228098 (2.287%) in intersection
#########################################################################
# EXONIPHY Hg19, lifted from hg18 (DONE - 2009-06-19 - Hiram)
# needed for uscsGenes11 building
# create a syntenic liftOver chain file
cd /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06
time nice -n +19 netSyntenic run.chain/hg18.hg19.noClass.net.gz stdout \
| netFilter -syn stdin | netChainSubset -verbose=0 stdin \
run.chain/hg18.hg19.all.chain.gz stdout \
| chainStitchId stdin stdout | gzip -c > hg18.hg19.syn.chain.gz
# memory usage 55492608, utime 3 s/100, stime 3
# real 2m35.613s
# real 5m55.575s
# slightly smaller than the ordinary liftOver chain file:
# -rw-rw-r-- 1 137245 Mar 6 17:37 hg18ToHg19.over.chain.gz
# -rw-rw-r-- 1 96115 Jun 19 14:30 hg18.hg19.syn.chain.gz
# exoniphyHg19.gp is prepared as follows
mkdir /cluster/data/hg19/bed/exoniphy
cd /cluster/data/hg19/bed/exoniphy
hgsql hg18 -e "select * from exoniphy" -N > exoniphyHg18.gp
time nice -n +19 liftOver -genePred exoniphyHg18.gp \
/hive/data/genomes/hg18/bed/blat.hg19.2009-03-06/hg18.hg19.syn.chain.gz \
exoniphyHg19.gp unmapped
wc -l *
# 178162 exoniphyHg18.gp
# 178109 exoniphyHg19.gp
# 106 unmapped
mkdir dump
cd dump
hgsqldump --all -c --tab=. hg18 exoniphy
cd ..
chmod 775 dump
hgsql hg19 < dump/exoniphy.sql
hgsql hg19 \
-e "load data local infile \"exoniphyHg19.gp\" into table exoniphy;"
nice -n +19 featureBits hg19 exoniphy
# 27421336 bases of 2897316137 (0.946%) in intersection
nice -n +19 featureBits hg18 exoniphy
# 27475705 bases of 2881515245 (0.954%) in intersection
#########################################################################
# BIOCYCTABLES NEEDED BY hgGene (DONE - 2009-06-22 - Hiram)
# First register with BioCyc to download their HumanCyc database
# The site will email you the URL for download. Beware, they supply
# a URL to a directory chock a block full of data, almost 7 Gb,
# you only need one file
mkdir /hive/data/outside/bioCyc/090623
cd /hive/data/outside/bioCyc/090623
mkdir download
cd download
wget --timestamping --no-directories --recursive \
"http://bioinformatics.ai.sri.com/ecocyc/dist/flatfiles-52983746/humancyc-flatfiles.tar.Z"
tar xvzf humancyc-flatfiles.tar.Z
mkdir /hive/data/genomes/hg19/bed/bioCyc
cd /hive/data/genomes/hg19/bed/bioCyc
# clean the headers from these files
grep -E -v "^#|^UNIQUE-ID" /hive/data/outside/bioCyc/090623/genes.col \
> genes.tab
# this file isn't consistent in its number of columns
grep -E -v "^#|^UNIQUE-ID" /hive/data/outside/bioCyc/090623/pathways.col \
| awk -F'\t' '{if (140 == NF) { printf "%s\t\t\n", $0; } else { print $0}}' \
> pathways.tab
hgsql hg19 -e 'create database bioCyc090623'
hgLoadSqlTab bioCyc090623 genes ~/src/hg/lib/bioCycGenes.sql ./genes.tab
hgLoadSqlTab bioCyc090623 pathways ~/src/hg/lib/bioCycPathways.sql ./pathways.tab
# Create bioCycMapDesc.tab
hgsql bioCyc090623 -N \
-e 'select UNIQUE_ID, NAME from pathways' | sort -u > bioCycMapDesc.tab
XXX see alternative below
# this kgBioCyc0 thing needs kgXref and other UCSC gene tables to work
# Create bioCycPathway.tab
kgBioCyc0 bioCyc090623 hg19 hg19
hgLoadSqlTab hg19 bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
hgLoadSqlTab hg19 bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
XXX maybe instead do this in the gene build procedure
# from the UCSC genes build procedure
# Do BioCyc Pathways build
mkdir $dir/bioCyc
cd $dir/bioCyc
grep -v '^#' $bioCycPathways > pathways.tab
grep -v '^#' $bioCycGenes > genes.tab
kgBioCyc1 genes.tab pathways.tab $db bioCycPathway.tab bioCycMapDesc.tab
hgLoadSqlTab $tempDb bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
hgLoadSqlTab $tempDb bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
##############################################################################
# nscanGene (2009-06-22 markd)
# nscanGene track from WUSTL
cd /cluster/data/hg19/bed/nscan
wget http://mblab.wustl.edu/~jeltje/hg19_tracks/hg19.updated.gtf
wget http://mblab.wustl.edu/~jeltje/hg19_tracks/hg19.readme
wget -r -np -l 1 http://mblab.wustl.edu/~jeltje/hg19_tracks/hg19_proteins
bzip2 hg19.updated.gtf hg19_proteins/*.fa
# load track
gtfToGenePred -genePredExt hg19.updated.gtf.bz2 stdout| hgLoadGenePred -genePredExt hg19 nscanGene stdin
bzcat hg19_proteins/chr*.fa.bz2 | hgPepPred hg19 generic nscanPep stdin
rm *.tab
# validate same number of transcripts and peptides are loaded
hgsql -Ne 'select count(*) from nscanGene' hg19
hgsql -Ne 'select count(*) from nscanPep' hg19
# validate search expression
hgsql -Ne 'select name from nscanGene' hg19 | egrep -v -e '^chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)?$' |wc -l
#########################################################################
# Phylogenetic tree from 46-way (DONE - 2009-06-25,07-07 - Hiram)
# Extract 4-fold degenerate sites based on
# of RefSeq Reviewed, coding
mkdir /hive/data/genomes/hg19/bed/multiz46way/4d
cd /hive/data/genomes/hg19/bed/multiz46way/4d
hgsql hg19 -Ne \
"select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA'" | cut -f 2-20 \
> refSeqReviewed.gp
wc -l refSeqReviewed.gp
# 14077 refSeqReviewed.gp
genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
wc -l refSeqReviewedNR.gp
# 7951 refSeqReviewedNR.gp
ssh memk
mkdir /hive/data/genomes/hg19/bed/multiz46way/4d/run
cd /hive/data/genomes/hg19/bed/multiz46way/4d/run
mkdir ../mfa
# whole chrom mafs version, using new version of
# uses memory-efficient version of phast, from Melissa Hubisz at Cornell (mjhubisz@gmail.com)
# 4d.csh <chrom> <mafFile> <outMfa>
# Extract 4-fold degenerate sites for one chromosome from the 46-way
# maf, restricted to the non-redundant reviewed RefSeq CDS annotations,
# and write them as a single-tuple fasta alignment for phyloFit.
cat << '_EOF_' > 4d.csh
#!/bin/csh -fe
set r = "/hive/data/genomes/hg19/bed/multiz46way"
set c = $1
set infile = $r/maf/$2
set outfile = $3
cd /scratch/tmp
# 'clean' maf
# strip the ".chrom" suffix from source names ("s hg19.chr1" -> "s hg19")
# so msa_view sees plain species identifiers
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
# keep only the gene annotations on this chromosome
awk -v C=$c '$2 == C {print}' $r/4d/refSeqReviewedNR.gp > $c.gp
set PHASTBIN=/cluster/bin/phast.2008-12-18
# --4d --do-cats 3: pull 4d (third codon position) sites into SS format,
# then flatten to tuple-size 1 for aggregation across chromosomes
$PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss
$PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/$outfile
rm -f $c.gp $c.maf $c.ss
'_EOF_'
# << happy emacs
chmod +x 4d.csh
ls -1S /hive/data/genomes/hg19/bed/multiz46way/maf/*.maf | \
egrep -E -v "chrM|chrUn|random|_hap" | sed -e "s#.*multiz46way/maf/##" \
> maf.list
cat << '_EOF_' > template
#LOOP
4d.csh $(root1) {check in line+ $(path1)} {check out line+ mfa/$(root1).mfa}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 maf.list single template stdout | tac > jobList
XXX - ready to go here - 2009-07-06
rm -fr /cluster/data/hg19/bed/multiz46way/4d/mfa
mkdir /cluster/data/hg19/bed/multiz46way/4d/mfa
para create jobList
para try
para check
para push
# combine mfa files
cd ..
sed -e "s/ /,/g" ../species.list > species.lst
/cluster/bin/phast/msa_view --aggregate `cat species.lst` mfa/*.mfa | \
sed s/"> "/">"/ > 4d.all.mfa
sed -e 's/,macEug1.*//' species.lst > placentals.lst
# XXX this didn't work
/cluster/bin/phast/msa_view --aggregate `cat placentals.lst` mfa/*.mfa | \
sed s/"> "/">"/ > 4d.placentals.mfa
# use phyloFit to create tree model (output is phyloFit.mod)
set PHASTBIN=/cluster/bin/phast.2008-12-18
time $PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA \
--subst-mod REV --tree ../tree-commas.nh 4d.all.mfa
# real 111m23.119s
mv phyloFit.mod phyloFit.all.mod
grep TREE phyloFit.all.mod | sed 's/TREE\:\ //' > tree_4d.46way.nh
sed -e 's/.*,choHof1,//' species.lst > notPlacentals.list
$PHASTBIN/tree_doctor \
--prune=`cat notPlacentals.list` \
tree_4d.46way.nh > tree_4d.46way.placental.nh
#############################################################################
# phastCons 46-way (WORKING - 2009-09-21 - Hiram)
+ # was unable to split the full chrom MAF files, now working on the
+ # maf files as they were split up during multiz
# split 46way mafs into 10M chunks and generate sufficient statistics
# files for # phastCons
ssh memk
mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split
+ cd /hive/data/genomes/hg19/bed/multiz46way/mafSplit
+ ./splitRegions.pl mafSplit.bed > \
+ /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split/region.list
mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/ss
cd /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set c = $1
-set MAF = /hive/data/genomes/hg19/bed/multiz46way/maf/$c.maf
+set MAF = /hive/data/genomes/hg19/bed/multiz46way/splitRun/maf/hg19_$c.maf
set WINDOWS = /hive/data/genomes/hg19/bed/multiz46way/cons/ss/$c
rm -fr $WINDOWS
+# set seq = `egrep "${c}"'$' region.list | awk '{printf "-seq=%s -start=%d
+# -end=%d", $1, $2, $3}'`
+set seq = `egrep "${c}"'$' region.list | awk '{printf "-seq=%s", $1}'`
mkdir $WINDOWS
pushd $WINDOWS > /dev/null
-twoBitToFa -seq=$c /hive/data/genomes/hg19/hg19.2bit hg19.$c.fa
-/cluster/bin/phast/$MACHTYPE/msa_split $MAF -i MAF \
+twoBitToFa ${seq} /hive/data/genomes/hg19/hg19.2bit hg19.$c.fa
+set empty = `faSize hg19.$c.fa | egrep " 0 real 0 upper 0 lower|masked total" | wc -l`
+if ( $empty != 2 ) then
+ /cluster/bin/phast/$MACHTYPE/msa_split $MAF -i MAF \
-M hg19.$c.fa -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
+endif
rm -f hg19.$c.fa
popd > /dev/null
-date >> $c.done
+date >> $2
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
# << happy emacs
# do the easy ones first to see some immediate results
- ls -1S -r ../maf | sed -e "s/.maf//" > maf.list
+ ls -1S -r ../../splitRun/maf | sed -e "s/.maf//; s/hg19_//" > maf.list
gensub2 maf.list single template jobList
para -ram=32g create jobList
para try ... check ... etc
+ # XXX - this did not work
# this takes a really long time. memk was down to 2 usable
# machines - got it finished manually on a combination of hgwdevnew CPUs
# and other machines
# Estimate phastCons parameters
# experimented with this as a parasol job on hgwdevnew to try a number
# of SS files. With a command of:
/cluster/bin/phast/x86_64/phyloFit -i SS ${SS} \
--tree "(((((((((((((((((hg19,panTro2),gorGor1),ponAbe2),rheMac2),calJac1),tarSyr1),(micMur1,otoGar1)),tupBel1),(((((mm9,rn4),dipOrd1),cavPor3),speTri1),(oryCun1,ochPri2))),(((vicPac1,(turTru1,bosTau4)),((equCab2,(felCat3,canFam2)),(myoLuc1,pteVam1))),(eriEur1,sorAra1))),(((loxAfr2,proCap1),echTel1),(dasNov2,choHof1))),monDom4),ornAna1),((galGal3,taeGut1),anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat2)),danRer5)),petMar1)" \
--out-root=$OUT/starting_tree
# running over the input files ../ss/*/*.ss results to
#.../genomes/hg19/bed/multiz46way/cons/startingTree/result/*/starting-tree.mod
# add up the C and G:
find ./result -type f | xargs ls -rt | while read F
do
D=`dirname $F`
echo -n `basename $D`" - "
grep BACKGROUND ${F} | awk '{printf "%0.3f\n", $3 + $4;}'
done
# counting number of species seen in the maf file:
find ./result -type f | xargs ls -rt | while read F
do
D=`dirname $F`
echo -n `basename $D`" - "
grep TREE $F | sed -e \
"s/TREE: //; s/(//g; s/)//g; s/[0-9].[0-9][0-9][0-9][0-9][0-9][0-9]//g; s/://g" | tr ',' '\n' | wc -l
done
# Run phastCons
- # This job is I/O intensive in its output files, thus it is all
- # working over in /scratch/tmp/
+ # This job is I/O intensive in its output files, beware where this
+ # takes place or do not run too many at once.
ssh swarm
mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/run.cons
cd /hive/data/genomes/hg19/bed/multiz46way/cons/run.cons
# there are going to be several different phastCons runs using
# this same script. They trigger off of the current working directory
# $cwd:t which is the "grp" in this script. It is one of:
- # all euarchontogliers placentals
+ # all primates placentals
# doPhast.csh <chrom> <ssFileBase> <expectedLen> <targetCov> <rho>
# Run phastCons on one 10Mb ss window.  The conservation group ("all",
# "primates", "placentals") is taken from the current working directory
# name ($cwd:t), which selects $grp.mod and the optional $grp.non-inf
# species-exclusion list.
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast/x86_64
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set cons = /hive/data/genomes/hg19/bed/multiz46way/cons
set tmp = $cons/tmp/$f
mkdir -p $tmp
set ssSrc = $cons
# link the model, ss window, and (if present) the non-informative
# species list into the per-job tmp directory.
# NOTE: the original script linked $grp.mod and $grp.non-inf twice in
# this branch; the second ln -s fails with "File exists" and, under
# csh -e, aborts the job.  The duplicates are removed here.
if (-s $cons/$grp/$grp.non-inf) then
ln -s $cons/$grp/$grp.mod $tmp
ln -s $cons/$grp/$grp.non-inf $tmp
ln -s $ssSrc/ss/$c/$f.ss $tmp
else
ln -s $ssSrc/ss/$c/$f.ss $tmp
ln -s $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p pp/$c bed/$c
# NOTE(review): the sleep/touch pair presumably gives NFS time to
# settle before the mv below -- confirm before removing
sleep 4
touch pp/$c bed/$c
rm -f pp/$c/$f.pp
rm -f bed/$c/$f.bed
mv $tmp/$f.pp pp/$c
mv $tmp/$f.bed bed/$c
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod a+x doPhast.csh
# this template will serve for all runs
# root1 == chrom name, file1 == ss file name without .ss suffix
cat << '_EOF_' > template
#LOOP
../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
- ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > ss.list
-
# run for all species
cd /hive/data/genomes/hg19/bed/multiz46way/cons
mkdir -p all
cd all
# Using Kate's .mod tree
cp -p ../../4d/46way.all.mod ./all.mod
gensub2 ../run.cons/ss.list single ../run.cons/template jobList
para -ram=8g create jobList
para try ... check ... push ... etc.
-XXX - running Tue Jan 13 22:19:21 PST 2009
-# Completed: 322 of 322 jobs
-# CPU time in finished jobs: 47406s 790.10m 13.17h 0.55d 0.002 y
-# IO & Wait Time: 29902s 498.37m 8.31h 0.35d 0.001 y
-# Average job time: 240s 4.00m 0.07h 0.00d
-# Longest finished job: 354s 5.90m 0.10h 0.00d
-# Submission to last job: 536s 8.93m 0.15h 0.01d
+
+# second run on swarm parasol: the failed jobs have empty bed file results
+# Completed: 575 of 580 jobs
+# Crashed: 5 jobs
+# CPU time in finished jobs: 42049s 700.81m 11.68h 0.49d 0.001 y
+# IO & Wait Time: 19735s 328.92m 5.48h 0.23d 0.001 y
+# Average job time: 107s 1.79m 0.03h 0.00d
+# Longest finished job: 267s 4.45m 0.07h 0.00d
+# Submission to last job: 479s 7.98m 0.13h 0.01d
+
+# first run on hgwdev parasol:
+# Completed: 574 of 579 jobs
+# Crashed: 5 jobs
+# CPU time in finished jobs: 53050s 884.17m 14.74h 0.61d 0.002 y
+# IO & Wait Time: 6633s 110.55m 1.84h 0.08d 0.000 y
+# Average job time: 104s 1.73m 0.03h 0.00d
+# Longest finished job: 248s 4.13m 0.07h 0.00d
+# Submission to last job: 4121s 68.68m 1.14h 0.05d
# create Most Conserved track
- cd /hive/data/genomes/hg19/bed/multiz46way/cons
- cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
- awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
- /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
+ cut -f1 ../../../../chrom.sizes | while read C
+do
+ ls -d bed/${C}.[0-9][0-9] 2> /dev/null | while read D
+ do
+ cat ${D}/${C}*.bed
+ done | awk 'BEGIN{ ID=1 }{printf "%s\t%d\t%d\t%s.%d\t%d\t%s\n", "'${C}'", $2, $3, "'${C}'", ID, $5, $6; ++ID}'
+done > mostConserved.bed
# ~ 1 minute
# load into database
ssh hgwdev
cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
time nice -n +19 hgLoadBed hg19 phastConsElements46way mostConserved.bed
- # Loaded 4878296 elements of size 5
- # real 2m3.414s
+ # Loaded 5868432 elements of size 6
+ # real 1m14.357s
# Try for 5% overall cov, and 70% CDS cov
- # --rho 0.3 --expected-length 45 --target-coverage 0.3
featureBits hg19 -enrichment refGene:cds phastConsElements46way
- # refGene:cds 1.144%, mostConserved.bed 4.973%,
- # both 0.854%, cover 74.62%, enrich 15.01x
-
- # --rho .31 --expected-length 45 --target-coverage .3
- # refGene:cds 1.144%, phastConsElements46way 4.706%,
- # both 0.824%, cover 72.07%, enrich 15.31x
-
# --rho 0.3 --expected-length 45 --target-coverage 0.3
- featureBits hg19 -enrichment knownGene:cds phastConsElements46way
- # knownGene:cds 1.205%, mostConserved.bed 4.973%,
- # both 0.874%, cover 72.55%, enrich 14.59x
-
- # --rho .31 --expected-length 45 --target-coverage .3
- # knownGene:cds 1.205%, phastConsElements46way 4.706%,
- # both 0.844%, cover 70.05%, enrich 14.88x
-
- featureBits hg19 -enrichment refGene:cds phastConsElements28way
- # refGene:cds 1.144%, phastConsElements28way 4.920%,
- # both 0.858%, cover 74.96%, enrich 15.24x
- featureBits hg19 -enrichment knownGene:cds phastConsElements28way
- # knownGene:cds 1.205%, phastConsElements28way 4.920%,
- # both 0.878%, cover 72.88%, enrich 14.81x
+ # refGene:cds 1.186%, phastConsElements46way 5.621%,
+ # both 0.878%, cover 73.98%, enrich 13.16x
# Create merged posterier probability file and wiggle track data files
cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
- cat << '_EOF_' > gzipAscii.sh
+ mkdir downloads
+ cat << '_EOF_' > phastCat.sh
#!/bin/sh
-TOP=`pwd`
-export TOP
+set -beEu -o pipefail
mkdir -p downloads
-
-for D in pp/chr*
+cut -f1 ../../../../chrom.sizes | while read C
do
- C=${D/pp\/}
- out=downloads/${C}.phastCons46way.wigFix.gz
- echo "${D} > ${C}.phastCons46way.wigFix.gz"
- ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \
- gzip > ${out}
+ echo -n "${C} ... working ... "
+ ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
+ do
+ cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
+ done | gzip > downloads/${C}.phastCons46way.wigFix.gz
+ echo "done"
done
'_EOF_'
# << happy emacs
- chmod +x gzipAscii.sh
- time nice -n +19 ./gzipAscii.sh
- # real 30m7.228s
+ chmod +x phastCat.sh
+ time nice -n +19 ./phastCat.sh
+ # real 30m2.623s
# encode those files into wiggle data
zcat downloads/*.wigFix.gz \
| wigEncode stdin phastCons46way.wig phastCons46way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
- # real 22m54.291s
+ # real 18m37.881s
+ du -hsc *.wi?
+ # 2.7G phastCons46way.wib
+ # 271M phastCons46way.wig
+ # 3.0G total
+
+ # encode into a bigWig file:
+ # (warning wigToBigWig process grows to about 36 Gb)
+ # in bash, to avoid the 32 Gb memory limit:
+sizeG=188743680
+export sizeG
+ulimit -d $sizeG
+ulimit -v $sizeG
+ zcat downloads/*.wigFix.gz \
+ | wigToBigWig stdin ../../../../chrom.sizes phastCons46way.bw
+ # real 52m36.142s
+# -rw-rw-r-- 1 21667535139 Oct 20 13:59 phastCons46way.bw
+ mkdir /gbdb/hg19/bbi
+ ln -s `pwd`/phastCons46way.bw /gbdb/hg19/bbi
+ # loading bigWig table:
+ hgsql hg19 -e 'drop table if exists phastCons46way; \
+ create table phastCons46way (fileName varchar(255) not null); \
+ insert into phastCons46way values
+ ("/gbdb/hg19/bbi/phastCons46way.bw");'
+ # Using the bigWig file instead of this database table:
# Load gbdb and database with wiggle.
+# ssh hgwdev
+# cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
+# ln -s `pwd`/phastCons46way.wib /gbdb/hg19/multiz46way/phastCons46way.wib
+# time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
+# phastCons46way phastCons46way.wig
+
+ # Create histogram to get an overview of all the data
ssh hgwdev
cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
- ln -s `pwd`/phastCons46way.wib /gbdb/hg19/multiz46way/phastCons46way.wib
- time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
- phastCons46way phastCons46way.wig
- # real 1m13.681s
+ time nice -n +19 hgWiggle -doHistogram -db=hg19 \
+ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
+ pc46 > histogram.data 2>&1
+ # real 7m37.212s
+
+ # create plot of histogram:
+
+ cat << '_EOF_' | gnuplot > histo.png
+set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
+set size 1.4, 0.8
+set key left box
+set grid noxtics
+set grid ytics
+set title " Human Hg19 Histogram phastCons46way track"
+set xlabel " phastCons46way score"
+set ylabel " Relative Frequency"
+set y2label " Cumulative Relative Frequency (CRF)"
+set y2range [0:1]
+set y2tics
+set yrange [0:0.02]
+
+plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
+ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
+'_EOF_'
+ # << happy emacs
+
+ display histo.png &
+
+ ########################################################################
+ ### Create a phastCons data set for Primates
+
+ # setup primates-only run
+ ssh swarm
+ mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+ # primates-only: exclude all but these for phastCons tree:
+
+ /cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \
+ --prune-all-but=hg19,panTro2,gorGor1,ponAbe2,rheMac2,papHam1,calJac1,tarSyr1,micMur1,otoGar1 \
+ > primates.mod
+ # and place the removed ones in the non-inf file so phastCons will
+ # truly ignore them:
+ echo "tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun2,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr3,proCap1,echTel1,dasNov2,choHof1,macEug1,monDom5,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig2,fr2,gasAcu1,oryLat2,danRer6,petMar1" \
+ > primates.non-inf
+
+ gensub2 ../run.cons/ss.list single ../run.cons/template jobList
+ para -ram=8g create jobList
+ para try ... check ... push ... etc.
+# Completed: 539 of 580 jobs
+# Crashed: 41 jobs
+# CPU time in finished jobs: 19518s 325.30m 5.42h 0.23d 0.001 y
+# IO & Wait Time: 19782s 329.70m 5.50h 0.23d 0.001 y
+# Average job time: 73s 1.22m 0.02h 0.00d
+# Longest finished job: 157s 2.62m 0.04h 0.00d
+# Submission to last job: 1989s 33.15m 0.55h 0.02d
+
+ # the 41 crashed jobs are due to empty bed file results.
+# bed/chrUn_gl000237.00/chrUn_gl000237.00.1-45866.bed is empty
+# ... etc
+
+ # create Most Conserved track
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+ ../all/bedCat.sh > mostConserved.bed
+ featureBits hg19 mostConserved.bed
+ # 146285948 bases of 2897316137 (5.049%) in intersection
+
+ # load into database
+ ssh hgwdev
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+ time nice -n +19 hgLoadBed hg19 phastConsElements46wayPrimates \
+ mostConserved.bed
+ # Loaded 1109918 elements of size 6
+ # real 0m15.498s
+ # verify coverage
+ featureBits hg19 phastConsElements46wayPrimates
+ # 146285948 bases of 2897316137 (5.049%) in intersection
+
+ # --rho 0.3 --expected-length 45 --target-coverage 0.3
+ featureBits hg19 -enrichment refGene:cds phastConsElements46wayPrimates
+ # refGene:cds 1.186%, phastConsElements46wayPrimates 5.049%,
+ # both 0.771%, cover 64.95%, enrich 12.86x
+
+ featureBits hg19 -enrichment knownGene:cds phastConsElements46wayPrimates
+ # knownGene:cds 1.252%, phastConsElements46wayPrimates 5.049%,
+ # both 0.784%, cover 62.65%, enrich 12.41x
+
+ # Create the downloads .pp files, from which the phastCons wiggle data
+ # is calculated
+ # sort by chromName, chromStart so that items are in numerical order
+ # for wigEncode
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+ mkdir downloads
+ cat << '_EOF_' > phastCat.sh
+#!/bin/sh
+
+mkdir -p downloads
+cut -f1 ../../../../chrom.sizes | while read C
+do
+ echo -n "${C} ... working ... "
+ if [ -d "pp/${C}.00" ]; then
+ ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
+ do
+ cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
+ done | gzip > downloads/${C}.phastCons46way.primates.wigFix.gz
+ fi
+ echo "done"
+done
+'_EOF_'
+ # << happy emacs
+ chmod +x ./phastCat.sh
+ time nice -n +19 ./phastCat.sh
+ # real 39m47.189s
+
+    # Create merged posterior probability file and wiggle track data files
+ zcat downloads/chr*.wigFix.gz \
+ | wigEncode stdin phastCons46wayPrimates.wig phastCons46wayPrimates.wib
+ # Converted stdin, upper limit 1.00, lower limit 0.00
+ # real 17m20.601s
+
+ # encode to bigWig
+ # (warning wigToBigWig process grows to about 36 Gb)
+ # in bash, to avoid the 32 Gb memory limit:
+sizeG=188743680
+export sizeG
+ulimit -d $sizeG
+ulimit -v $sizeG
+
+ zcat downloads/*.wigFix.gz \
+ | wigToBigWig stdin ../../../../chrom.sizes phastCons46wayPrimates.bw
+
+ ln -s `pwd`/phastCons46wayPrimates.bw /gbdb/hg19/bbi
+ # loading bigWig table:
+ hgsql hg19 -e 'drop table if exists phastCons46wayPrimates; \
+ create table phastCons46wayPrimates \
+ (fileName varchar(255) not null); \
+ insert into phastCons46wayPrimates values
+ ("/gbdb/hg19/bbi/phastCons46wayPrimates.bw");'
+
+ ## load table with wiggle data
+ ## not done now, using the bigWig file instead
+# ssh hgwdev
+# cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+# ln -s `pwd`/phastCons46wayPrimates.wib \
+# /gbdb/hg19/multiz46way/phastCons46wayPrimates.wib
+# time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
+# phastCons46wayPrimates phastCons46wayPrimates.wig
+ # Instead, temporary load into a table so we can do the histogram
+ ln -s `pwd`/phastCons46wayPrimates.wib /gbdb/hg19/wib/pc46.wib
+ hgLoadWiggle hg19 pc46 phastCons46wayPrimates.wig
# Create histogram to get an overview of all the data
+ time nice -n +19 hgWiggle -doHistogram \
+ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
+ -db=hg19 pc46 > histogram.data 2>&1
+ # real 5m30.086s
+
+ # create plot of histogram:
+
+ cat << '_EOF_' | gnuplot > histo.png
+set terminal png small color \
+ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
+set size 1.4, 0.8
+set key left box
+set grid noxtics
+set grid ytics
+set title " Human Hg19 Histogram phastCons46wayPrimates track"
+set xlabel " phastCons46wayPrimates score"
+set ylabel " Relative Frequency"
+set y2label " Cumulative Relative Frequency (CRF)"
+set y2range [0:1]
+set y2tics
+set yrange [0:0.02]
+
+plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
+ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
+'_EOF_'
+ # << happy emacs
+
+ display histo.png &
+
+ ########################################################################
+ ### Create a phastCons data set for Placentals
+ # setup placental-only run
+ ssh swarm
+ mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/placental
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
+
+ # placental-only: exclude all but these for phastCons tree:
+ /cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \
+ --prune-all-but=hg19,panTro2,gorGor1,ponAbe2,rheMac2,papHam1,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun2,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr3,proCap1,echTel1,dasNov2,choHof1 \
+ > placental.mod
+ # and place the removed ones in the non-inf file so phastCons will
+ # truly ignore them:
+ echo "macEug1,monDom5,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig2,fr2,gasAcu1,oryLat2,danRer6,petMar1" \
+ > placental.non-inf
+
+ gensub2 ../run.cons/ss.list single ../run.cons/template jobList
+ para -ram=8g create jobList
+ para try ... check ... push ... etc.
+# Completed: 562 of 580 jobs
+# Crashed: 18 jobs
+# CPU time in finished jobs: 33874s 564.57m 9.41h 0.39d 0.001 y
+# IO & Wait Time: 12493s 208.21m 3.47h 0.14d 0.000 y
+# Average job time: 83s 1.38m 0.02h 0.00d
+# Longest finished job: 193s 3.22m 0.05h 0.00d
+# Submission to last job: 62872s 1047.87m 17.46h 0.73d
+
+ # The crashed jobs produce zero length bed files: e.g.
+ # bed/chrUn_gl000246.00/chrUn_gl000246.00.1-38144.bed is empty
+
+ # create Most Conserved track
+ ../all/bedCat.sh > mostConserved.bed
+
+ # load into database
ssh hgwdev
- cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
+ time nice -n +19 hgLoadBed hg19 phastConsElements46wayPlacental \
+ mostConserved.bed
+ # Loaded 4785089 elements of size 6
+ # real 0m58.367s
+ # verify coverage
+ featureBits hg19 phastConsElements46wayPlacental
+ # 146457699 bases of 2897316137 (5.055%) in intersection
+    # (stale hg18 result: 119635433 bases of 2881515245 (4.152%) in intersection)
+
+ # --rho 0.3 --expected-length 45 --target-coverage 0.3
+ featureBits hg19 -enrichment refGene:cds phastConsElements46wayPlacental
+ # refGene:cds 1.186%, phastConsElements46wayPlacental 5.055%,
+ # both 0.847%, cover 71.42%, enrich 14.13x
+ featureBits hg19 -enrichment knownGene:cds phastConsElements46wayPlacental
+ # knownGene:cds 1.252%, phastConsElements46wayPlacental 5.055%,
+ # both 0.865%, cover 69.10%, enrich 13.67x
+
+ # Create the downloads .pp files, from which the phastCons wiggle data
+ # is calculated
+ # sort by chromName, chromStart so that items are in numerical order
+ # for wigEncode
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
+ mkdir downloads
+ cat << '_EOF_' > phastCat.sh
+#!/bin/sh
+
+mkdir -p downloads
+cut -f1 ../../../../chrom.sizes | while read C
+do
+ echo -n "${C} ... working ... "
+ if [ -d "pp/${C}.00" ]; then
+ ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
+ do
+ cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
+ done | gzip > downloads/${C}.phastCons46way.placental.wigFix.gz
+ fi
+ echo "done"
+done
+'_EOF_'
+ # << happy emacs
+ chmod +x ./phastCat.sh
+ time nice -n +19 ./phastCat.sh
+
+    # Create merged posterior probability file and wiggle track data files
+ zcat downloads/chr*.wigFix.gz \
+ | wigEncode stdin phastCons46wayPlacental.wig \
+ phastCons46wayPlacental.wib
+ # Converted stdin, upper limit 1.00, lower limit 0.00
+ # real 14m53.395s
+
+ # encode to bigWig
+ # (warning wigToBigWig process grows to about 36 Gb)
+ # in bash, to avoid the 32 Gb memory limit:
+sizeG=188743680
+export sizeG
+ulimit -d $sizeG
+ulimit -v $sizeG
+
+ zcat downloads/*.wigFix.gz \
+ | wigToBigWig stdin ../../../../chrom.sizes phastCons46wayPlacental.bw
+ # real 40m55.568s
+
+ ln -s `pwd`/phastCons46wayPlacental.bw /gbdb/hg19/bbi
+ # loading bigWig table:
+ hgsql hg19 -e 'drop table if exists phastCons46wayPlacental; \
+ create table phastCons46wayPlacental \
+ (fileName varchar(255) not null); \
+ insert into phastCons46wayPlacental values
+ ("/gbdb/hg19/bbi/phastCons46wayPlacental.bw");'
+
+
+ ## load table with wiggle data
+ ## no longer load this data, using the bigWig file instead
+# ssh hgwdev
+# cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
+# ln -s `pwd`/phastCons46wayPlacental.wib \
+# /gbdb/hg19/multiz46way/phastCons46wayPlacental.wib
+# time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
+# phastCons46wayPlacental phastCons46wayPlacental.wig
+
+ # Instead, temporary load into a table so we can do the histogram
+ ln -s `pwd`/phastCons46wayPlacental.wib /gbdb/hg19/wib/pc46.wib
+ hgLoadWiggle hg19 pc46 phastCons46wayPlacental.wig
+
+ # Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
- -db=hg19 phastCons46way > histogram.data 2>&1
- # real 8m6.841s
+ -db=hg19 pc46 > histogram.data 2>&1
+ # real 8m15.623s
+ hgsql -e "drop table pc46;" hg19
+ rm /gbdb/hg19/wib/pc46.wib
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
-set title " Human Hg18 Histogram phastCons46way track"
-set xlabel " phastCons46way score"
+set title " Human Hg19 Histogram phastCons46wayPlacental track"
+set xlabel " phastCons46wayPlacental score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
#########################################################################
# LASTZ Zebrafish DanRer6 (DONE - 2009-07-08,10 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
cd /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
cat << '_EOF_' > DEF
# human vs zebrafish
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Zebrafish danRer6
SEQ2_DIR=/scratch/data/danRer6/danRer6.2bit
SEQ2_LEN=/scratch/data/danRer6/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=40
BASE=/hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 1678m17.827s
# failed during the chain step due to encodek cluster problems
# finish that manually, then:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-continue=chainMerge > chainMerge.log 2>&1 &
# real 167m6.930s
cat fb.hg19.chainDanRer6Link.txt
# 88391631 bases of 2897316137 (3.051%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/danRer6/bed/blastz.hg19.swap
cd /hive/data/genomes/danRer6/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-swap > swap.log 2>&1 &
# real 183m21.102s
cat fb.danRer6.chainHg19Link.txt
# 96424507 bases of 1506896106 (6.399%) in intersection
##############################################################################
# LASTZ Elephant LoxAfr3 (DONE - 2009-07-21,23 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
cd /hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
cat << '_EOF_' > DEF
# Human vs. Elephant
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Elephant
SEQ2_DIR=/scratch/data/loxAfr3/loxAfr3.2bit
SEQ2_LEN=/scratch/data/loxAfr3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 317m32.664s
# broken when it went to chaining on encodek, finish the chain then:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-continue=chainMerge > chainMerge.log 2>&1 &
# real 217m25.159s
# time about 3h23m
cat fb.hg19.chainLoxAfr3Link.txt
# 1351200080 bases of 2897316137 (46.636%) in intersection
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet -continue=syntenicNet -stop=syntenicNet \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> synNet.log 2>&1 &
# real 32m40.554s
time doRecipBest.pl -buildDir=`pwd` hg19 loxAfr3 > rbest.log 2>&1
# real 184m3.435s
mkdir /hive/data/genomes/loxAfr3/bed/blastz.hg19.swap
cd /hive/data/genomes/loxAfr3/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-swap > swap.log 2>&1 &
# real 220m16.839s
cat fb.loxAfr3.chainHg19Link.txt
# 1323201500 bases of 3118565340 (42.430%) in intersection
##############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
# AGILENT PROBES LIFTED FROM HG18 (DONE, 2009-07-28 Andy)
ssh hgwdev
bash
mkdir /hive/data/genomes/hg19/bed/agilentProbes
cd /hive/data/genomes/hg19/bed/agilentProbes
for table in `echo show tables like \'agilent%\' | hgsql hg18 | tail -n +2 | grep -v Probe`; do
echo $table; echo "select * from $table" | hgsql hg18 | \
tail -n +2 | cut -f2- > ${table}.hg18.bed; liftOver ${table}.hg18.bed \
/gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz ${table}.hg19.{bed,unmapped};
hgLoadBed hg19 $table ${table}.hg19.bed;
echo done with $table;
done
for unmap in *.unmapped; do
table=${unmap%.hg19.unmapped}
grep Deleted -A1 $unmap | grep -v Deleted | grep -v "^--" > agilentProbesHg18Unmapped/${table}.deleted.bed
grep Split -A1 $unmap | grep -v Split | grep -v "^--" > agilentProbesHg18Unmapped/${table}.split.bed
grep Partially -A1 $unmap | grep -v Partially | grep -v "^--" > agilentProbesHg18Unmapped/${table}.partiallyDeleted.bed
done
find agilentProbesHg18Unmapped/ -size 0b | xargs rm
rm *hg18.bed *.unmapped bed.tab
gzip *.bed
tar cfz agilentProbesHg18Unmapped.tar.gz agilentProbesHg18Unmapped
cd /usr/local/apache/htdocs/goldenPath/hg19
mkdir agilentProbes
cd agilentProbes/
ln -s /hive/data/genomes/hg19/bed/agilentProbes/agilentProbesHg18Unmapped beds
ln -s /hive/data/genomes/hg19/bed/agilentProbes/agilentProbesHg18Unmapped.tar.gz
##############################################################################
# LASTZ Tetraodon TetNig2 (DONE - 2009-08-10,11 - Hiram)
    # Note: the date/time stamp on this directory is incorrect;
    # it should be 2009-08-10
mkdir /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
cd /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
cat << '_EOF_' > DEF
# human vs tetraodon
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Tetraodon TetNig2 - single chunk big enough to hold single largest item
SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit
SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit
SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes
SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# real 220m36.068s
# forgot the qRepeats for tetNig2
rm axtChain/hg19.tetNig2.net
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=load -qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> load.log 2>&1 &
# real 5m53.096s
cat fb.hg19.chainTetNig2Link.txt
# 49611132 bases of 2897316137 (1.712%) in intersection
# running the swap
mkdir /hive/data/genomes/tetNig2/bed/blastz.hg19.swap
cd /hive/data/genomes/tetNig2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-swap > swap.log 2>&1 &
# real 13m21.591s
# forgot the qRepeats for tetNig2
rm axtChain/tetNig2.hg19.net
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10/DEF \
-continue=load -qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-swap > load.log 2>&1 &
# real 4m7.559s
cat fb.tetNig2.chainHg19Link.txt
# 42910930 bases of 302314788 (14.194%) in intersection
##############################################################################
# dbSNP BUILD 130 - PROVISIONAL REMAPPING TO BUILD 37 (DONE 8/28/09 angie)
# /hive/data/outside/dbSNP/130/ was already set up during the hg18 run --
# just add hg19 coord files and go from there.
cd /hive/data/outside/dbSNP/130/human/data
alias wg wget --timestamping
set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/misc/exchange
# These are provisional files in an ad-hoc format.
wg $ftpSnpDb/README.txt
wg $ftpSnpDb/Remap_36_3_37_1.info
wg $ftpSnpDb/Remap_36_3_37_1.txt.gz
mv README.txt Remap_36_3_37_1_README
zcat Remap_36_3_37_1.txt.gz | wc -l
#18823990
# Use the remapping to transform ../ucscNcbiSnp.bed into one for hg19.
# Useful columns, 1-based: 1=ID, 3=oldChr, 4=oldStart, 5=oldEnd,
# 10=newChr, 11=newStart, 12=newEnd, 13=newLocType, 14=newWeight, 16=newStrand
# For mappings to chr*_random, oldStart and oldEnd are empty -- skip.
# Sort both hg18 snp file and remap file by {rsID,chr,start} to keep them in sync.
mkdir /hive/data/outside/dbSNP/130/human/hg19
cd /hive/data/outside/dbSNP/130/human/hg19
sort -k4n,4n -k1,1 -k2n,2n ../ucscNcbiSnp.bed > /data/tmp/hg18.ucscNcbiSnp.idSorted.bed
zcat ../data/Remap_36_3_37_1.txt.gz \
| sort -t " " -k1n,1n -k3,3 -k4n,4n \
> /data/tmp/Remap_36_3_37_1.txt
perl -we \
'use strict; \
sub nextMap { \
my ($rsId, undef, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, \
$nLocType, $nWt, $nRef, $nStr);\
do { \
($rsId, undef, $oChr, $oStart, $oEnd, undef,undef,undef,undef, \
$nChr, $nStart, $nEnd, $nLocType, $nWt, $nRef, $nStr) = split("\t", <>); \
if (defined $nStr) { \
chomp $nStr; $nStr =~ tr/+-/01/; $oChr = "chr$oChr"; $nChr = "chr$nChr"; \
} \
$oStart--; $oEnd--; $nStart--; $nEnd--; # Yep. 0-based closed vs 1-based closed \
} while (defined $nStr && ($oEnd < 0 || $nChr eq "chrUn")); \
return ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, \
$nLocType, $nWt, $nRef, $nStr); \
} # nextMap \
my ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, $nLocType, $nWt, $nRef, $nStr) = \
&nextMap(); \
my ($rCount, $oCount, $tCount) = 0; \
open(my $oldF, "/data/tmp/hg18.ucscNcbiSnp.idSorted.bed") || die; \
while (my ($chr, $s, $e, $id, $str, $rn,$obs,$mt,$cn,$vn,$ah,$ahse,$fc,$lt,$wt) = \
split("\t", <$oldF>)) { \
my $thisRCount = 0; \
while (defined $oChr && $chr eq $oChr && $s == $oStart && $e == $oEnd && $id == $rsId) { \
print join("\t", $nChr,$nStart,$nEnd,$id,$nStr,$nRef,$obs,$mt,$cn,$vn,$ah,$ahse,$fc, \
$nLocType,$nWt,$nStart) \
. "\n"; \
($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, $nLocType, $nWt, $nRef, $nStr) = \
&nextMap(); \
$thisRCount++; \
} \
if (defined $rsId && $id > $rsId) {warn "Slipped a cog"; last;} \
$tCount += $thisRCount; \
$rCount++ if ($thisRCount > 0); \
$oCount++; \
} \
close($oldF); print STDERR "Replaced $rCount of $oCount inputs ($tCount outputs).\n";' \
/data/tmp/Remap_36_3_37_1.txt \
| sort -k1,1 -k2n,2n -k4,4 \
> /data/tmp/hg19.ucscNcbiSnp.bed
#Replaced 18693260 of 19189750 inputs (18697579 outputs).
#504.562u 27.037s 8:59.57 98.5% 0+0k 0+0io 0pf+0w
wc -l /data/tmp/hg19.ucscNcbiSnp.bed
# 18697579 /data/tmp/hg19.ucscNcbiSnp.bed
# Drum roll please... translate NCBI's encoding into UCSC's, and
# perform a bunch of checks. This is where developer involvement
# is most likely as NCBI extends the encodings used in dbSNP.
cd /hive/data/outside/dbSNP/130/human/hg19
snpNcbiToUcsc /data/tmp/hg19.ucscNcbiSnp.bed /hive/data/genomes/hg19/hg19.2bit \
-1000GenomesRsIds=../data/1000GenomesRsIds.txt snp130
#spaces stripped from observed:
#chr12 6093134 6093134 rs41402545
#Line 8049395 of /data/tmp/hg19.ucscNcbiSnp.bed: Encountered something that doesn't fit observedMixedFormat: GCAACTTCA
#count of snps with weight 0 = 0
#count of snps with weight 1 = 17042465
#count of snps with weight 2 = 345274
#count of snps with weight 3 = 1017906
#count of snps with weight 10 = 291934
#Skipped 1496 snp mappings due to errors -- see snp130Errors.bed
#146.837u 9.867s 4:21.63 59.8% 0+0k 0+0io 0pf+0w
# Comparable to hg18.snp130, with some losses due to coord translation, loss of _randoms,
# and 1496 errors (new locType or refNCBI inconsistent with new size).
expr 18697579 - 291934 - 1496
#18404149
# Move hg19.ucscNcbiSnp.bed from fast tmp to slow (today) hive:
gzip /data/tmp/hg19.ucscNcbiSnp.bed
mv /data/tmp/hg19.ucscNcbiSnp.bed.gz hg19.ucscNcbiSnp.bed.gz
    # Will try to reuse hg18.snp130's giant 18G fasta file, not duplicate it.
# Load up main track tables.
cd /hive/data/outside/dbSNP/130/human/hg19
hgLoadBed -tab -tmpDir=/data/tmp -allowStartEqualEnd \
hg19 snp130 -sqlTable=snp130.sql snp130.bed
#Loaded 18404149 elements of size 17
#115.086u 21.663s 2:32:09.98 1.4% 0+0k 0+0io 1pf+0w
#that is freakishly long -- lots happening today w/db move, hive recovery,...
hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
hg19 snp130Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
snp130Exceptions.bed
#Loaded 1982828 elements of size 5
#10.500u 0.851s 1:13.42 15.4% 0+0k 0+0io 0pf+0w
hgLoadSqlTab hg19 snp130ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
snp130ExceptionDesc.tab
# Load up sequences *from hg18 file*:
hgLoadSqlTab hg19 snp130Seq ~/kent/src/hg/lib/snpSeq.sql ../snp130Seq.tab
# Put in a link where one would expect to find the track build dir...
ln -s /hive/data/outside/dbSNP/130/human/hg19 /hive/data/genomes/hg19/bed/snp130
# Look at the breakdown of exception categories:
cd /hive/data/outside/dbSNP/130/human/hg19
cut -f 5 snp130Exceptions.bed | sort | uniq -c | sort -nr
#1350217 MultipleAlignments
# 495981 ObservedMismatch
# 37603 ObservedTooLong
# 26855 SingleClassTriAllelic
# 24443 FlankMismatchGenomeShorter
# 17927 SingleClassLongerSpan
# 13685 SingleClassZeroSpan
# 6238 FlankMismatchGenomeLonger
# 3016 DuplicateObserved
# 2851 SingleClassQuadAllelic
# 1777 MixedObserved
# 1264 NamedDeletionZeroSpan
# 508 FlankMismatchGenomeEqual
# 329 NamedInsertionNonzeroSpan
# 121 ObservedContainsIupac
# 11 RefAlleleMismatch
# 2 ObservedWrongFormat
#TODO: go through those above (esp snp130Errors.bed) and send some bug reports to dbSNP.
##############################################################################
# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP130 (DONE 8/31/09 angie)
mkdir /hive/data/genomes/hg19/bed/snp130Ortho
cd /hive/data/genomes/hg19/bed/snp130Ortho
    # Following Heather's lead in snp126orthos, filter SNPs to keep
# only those with class=single, length=1, chrom!~random;
# Exclude those with exceptions MultipleAlignments,
# SingleClassTriAllelic or SingleClassQuadAllelic.
# Unlike snp masking, we do not filter for weight -- don't know why.
awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
/hive/data/outside/dbSNP/130/human/hg19/snp130Exceptions.bed \
| sort -u \
> snp130ExcludeIds.txt
awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \
/hive/data/outside/dbSNP/130/human/hg19/snp130.bed \
| grep -vFwf snp130ExcludeIds.txt \
> snp130Simple.bed
#203.193u 9.197s 2:57.40 119.7% 0+0k 0+0io 0pf+0w
wc -l snp130Simple.bed
#12278514 snp130Simple.bed
# Glom all human info that we need for the final table onto the
# name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
awk 'BEGIN{OFS="\t";} \
{print $1, $2, $3, \
$4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
0, $6;}' \
snp130Simple.bed > snp130ForLiftOver.bed
# Map coords to chimp using liftOver.
# I don't know why chimp took so much longer than macaque... the
# chimp .over has fewer chains and fewer bytes than the macaque .over.
mkdir run.liftOChimp
cd run.liftOChimp
mkdir split out
splitFile ../snp130ForLiftOver.bed 25000 split/chunk
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro2.over.chain.gz \
\{check out exists out/panTro2.$f:t.bed\} out/hg19.$f:t.unmapped \
>> jobList
end
ssh swarm
cd /hive/data/genomes/hg19/bed/snp130Ortho/run.liftOChimp
para make jobList
#Completed: 492 of 492 jobs
#CPU time in finished jobs: 51793s 863.22m 14.39h 0.60d 0.002 y
#IO & Wait Time: 3825s 63.75m 1.06h 0.04d 0.000 y
#Average job time: 113s 1.88m 0.03h 0.00d
#Longest finished job: 286s 4.77m 0.08h 0.00d
#Submission to last job: 300s 5.00m 0.08h 0.00d
# Map coords to orangutan using liftOver.
mkdir ../run.liftOPon
cd ../run.liftOPon
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
\{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 492 of 492 jobs
#CPU time in finished jobs: 125656s 2094.26m 34.90h 1.45d 0.004 y
#IO & Wait Time: 5413s 90.22m 1.50h 0.06d 0.000 y
#Average job time: 266s 4.44m 0.07h 0.00d
#Longest finished job: 646s 10.77m 0.18h 0.01d
#Submission to last job: 649s 10.82m 0.18h 0.01d
# Map coords to macaque using liftOver.
mkdir ../run.liftOMac
cd ../run.liftOMac
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
\{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 492 of 492 jobs
#CPU time in finished jobs: 161612s 2693.54m 44.89h 1.87d 0.005 y
#IO & Wait Time: 6218s 103.63m 1.73h 0.07d 0.000 y
#Average job time: 341s 5.69m 0.09h 0.00d
#Longest finished job: 727s 12.12m 0.20h 0.01d
#Submission to last job: 739s 12.32m 0.21h 0.01d
cd /hive/data/genomes/hg19/bed/snp130Ortho
# Concatenate the chimp results, sorting by chimp pos in order to
# efficiently access 2bit sequence in getOrthoSeq. The output of
# that is then sorted by the glommed human info field, so that we
# can use join to combine chimp and macaque results in the next step.
# Ditto for macaque and orangutan. Each command pipe takes ~5 minutes:
sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
| sort > panTro2.orthoGlom.txt
sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
| sort > ponAbe2.orthoGlom.txt
sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
| sort > rheMac2.orthoGlom.txt
wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
# 11428526 panTro2.orthoGlom.txt
# 10861969 ponAbe2.orthoGlom.txt
# 9694237 rheMac2.orthoGlom.txt
# Use the glommed name field as a key to join up chimp and macaque
# allele data. Include glommed name from both files because if only
# file 2 has a line for the key in 2.1, then 1.1 is empty. Then plop
# in the orthoGlom fields from each file, which are in the same order
# as the chimp and macaque columns of snp130OrthoPanTro2RheMac2.
join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
-a 1 -a 2 -e '?' \
panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \
| awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
else { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
> tmp.txt
join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
-a 1 -a 2 -e '?' \
tmp.txt rheMac2.orthoGlom.txt \
| perl -wpe 'chomp; \
($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
$o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
$glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
split(/\|/, $glomKey); \
$o1Start =~ s/^\?$/0/; $o2Start =~ s/^\?$/0/; $o3Start =~ s/^\?$/0/; \
$o1End =~ s/^\?$/0/; $o2End =~ s/^\?$/0/; $o3End =~ s/^\?$/0/; \
print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
$o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
$o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
s/^.*$//;' \
| sort -k1,1 -k2n,2n > snp130OrthoPt2Pa2Rm2.bed
#304.434u 27.118s 4:31.30 122.2% 0+0k 0+0io 0pf+0w
wc -l snp130OrthoPt2Pa2Rm2.bed
#11876029 snp130OrthoPt2Pa2Rm2.bed
cd /hive/data/genomes/hg19/bed/snp130Ortho
hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
-sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
hg19 snp130OrthoPt2Pa2Rm2 snp130OrthoPt2Pa2Rm2.bed
#Loaded 11876029 elements of size 22
#75.442u 8.828s 9:50.27 14.2% 0+0k 0+0io 0pf+0w
# Cleanup fileserver:
cd /hive/data/genomes/hg19/bed/snp130Ortho
gzip snp130Simple.bed snp130ExcludeIds.txt snp130ForLiftOver.bed &
rm -r run*/split tmp.txt *.orthoGlom.txt
-
##############################################################################
-<<<<<<< hg19.txt
# LASTZ Rabbit OryCun2 (DONE - 2009-08-12 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
cd /hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
cat << '_EOF_' > DEF
# Human vs. Rabbit
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rabbit at chunk 20,000,000 all but 36 contigs can fit in a single job
SEQ2_DIR=/scratch/data/oryCun2/oryCun2.2bit
SEQ2_LEN=/scratch/data/oryCun2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/oryCun2/oryCun2.contigs.2bit
SEQ2_CTGLEN=/scratch/data/oryCun2/oryCun2.contigs.sizes
SEQ2_LIFT=/hive/data/genomes/oryCun2/contigs/oryCun2.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 516m41.981s
cat fb.hg19.chainOryCun2Link.txt
# 1283994337 bases of 2897316137 (44.317%) in intersection
# should have run syntenicNet in that first run
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
# about 1 hour
mkdir /hive/data/genomes/oryCun2/bed/blastz.hg19.swap
cd /hive/data/genomes/oryCun2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-swap -syntenicNet > swap.log 2>&1 &
# real 176m35.932s
cat fb.oryCun2.chainHg19Link.txt
# 1260477501 bases of 2604023284 (48.405%) in intersection
##############################################################################
# running syntenicNet on CavPor3 lastz (DONE - 2009-08-27 - Hiram)
cd /hive/data/genomes/hg19/bed/lastzCavPor3.2009-06-04
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
# about 44 minutes
##############################################################################
# loading the lastz tables on cavPor3 - (DONE - 2009-08-28 - Hiram)
# the chain.tab and link.tab files are left over from the failed load
cd /hive/data/genomes/cavPor3/bed/blastz.hg19.swap/axtChain
# find out their sizes, average and total:
awk '{print length($0)}' chain.tab | ave stdin
Q1 92.000000 median 93.000000 Q3 96.000000
average 93.651267
min 64.000000 max 109.000000
count 27186468
total 2546047186.000000
awk '{print length($0)}' link.tab | ave stdin
Q1 45.000000 median 47.000000 Q3 48.000000
average 46.731871
min 22.000000 max 52.000000
count 240602108
total 11243786622.000000
cat << '_EOF_' > chainHg19Link.sql
CREATE TABLE chainHg19Link (
bin smallint(5) unsigned NOT NULL default 0,
tName varchar(255) NOT NULL default '',
tStart int(10) unsigned NOT NULL default 0,
tEnd int(10) unsigned NOT NULL default 0,
qStart int(10) unsigned NOT NULL default 0,
chainId int(10) unsigned NOT NULL default 0,
KEY tName (tName(13),bin),
KEY chainId (chainId)
) ENGINE=MyISAM max_rows=241000000 avg_row_length=50 pack_keys=1 CHARSET=latin1;
'_EOF_'
# << happy emacs
hgsql cavPor3 < chainHg19Link.sql
time hgsql -e \
'load data local infile "link.tab" into table chainHg19Link;' cavPor3
# real 405m15.956s
cd /hive/data/genomes/cavPor3/bed/blastz.hg19.swap/axtChain
# and the net tracks were not loaded:
time netClass -verbose=0 -noAr noClass.net cavPor3 hg19 cavPor3.hg19.net
# real 40m25.078s
netFilter -minGap=10 cavPor3.hg19.net \
| hgLoadNet -verbose=0 cavPor3 netHg19 stdin
# real 33m24.972s (plus the featureBits below)
featureBits cavPor3 chainHg19Link > fb.cavPor3.chainHg19Link.txt 2>&1
cat fb.cavPor3.chainHg19Link.txt
# 1279572660 bases of 2663369733 (48.043%) in intersection
##############################################################################
# DBSNP CODING ANNOTATIONS (DONE 9/1/09 angie)
# Repeat the coord-remapping performed for snp130 on the hg18 coding anno table.
cd /hive/data/outside/dbSNP/130/human/hg19
sed -re 's/\trs([0-9]+)\t/\t\1\t/' ../snp130CodingDbSnp.bed \
| sort -k4n,4n -k1,1 -k2n,2n > /data/tmp/hg18.snp130Coding.idSorted.bed
# reuse /data/tmp/Remap_36_3_37_1.txt mapping file created for snp130 above:
perl -we \
'use strict; \
sub nextMap { \
my ($rsId, undef, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd); \
do { \
($rsId, undef, $oChr, $oStart, $oEnd, undef,undef,undef,undef, \
$nChr, $nStart, $nEnd) = split("\t", <>); \
if (defined $nEnd) { \
$oChr = "chr$oChr"; $nChr = "chr$nChr"; \
} \
$oStart--; $oEnd--; $nStart--; $nEnd--; # Yep. 0-based closed vs 1-based closed \
} while (defined $rsId && ($oEnd < 0 || $nChr eq "chrUn")); \
return ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd); \
} # nextMap \
my ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd) = &nextMap(); \
my ($rCount, $oCount, $tCount) = 0; \
open(my $oldF, "/data/tmp/hg18.snp130Coding.idSorted.bed") || die; \
while (my ($chr, $s, $e, $id, $tx, $frm, $alCount, $funcs, $als, $codons, $peps) = \
split("\t", <$oldF>)) { \
my $thisRCount = 0; \
while (defined $rsId && $rsId < $id) { \
($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd) = &nextMap(); \
} \
while (defined $oChr && $chr eq $oChr && $s == $oStart && $e == $oEnd && $id == $rsId) { \
print join("\t", $nChr, $nStart, $nEnd, "rs$id", $tx, $frm, \
$alCount, $funcs, $als, $codons, $peps); \
($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd) = &nextMap(); \
$thisRCount++; \
} \
$tCount += $thisRCount; \
$rCount++ if ($thisRCount > 0); \
$oCount++; \
} \
close($oldF); print STDERR "Replaced $rCount of $oCount inputs ($tCount outputs).\n";' \
/data/tmp/Remap_36_3_37_1.txt \
| sort -k1,1 -k2n,2n -k4,4 \
> /data/tmp/hg19.snp130Coding.bed
#Replaced 197921 of 279815 inputs (198493 outputs).
#160.824u 1.949s 2:43.01 99.8% 0+0k 0+0io 0pf+0w
hgLoadBed hg19 snp130CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
-renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
/data/tmp/hg19.snp130Coding.bed
#Loaded 198493 elements of size 11
mv /data/tmp/hg19.snp130Coding.bed hg19.snp130CodingDbSnp.bed
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
##########################################################################
# BUILD ALLEN BRAIN TRACK (DONE 09/30/09 kent)
# Make the working directory
ssh hgwdev
cd /cluster/data/hg19/bed
mkdir allenBrain
cd allenBrain
# Remap the probe alignments from mm7 to hg19
zcat /gbdb/mm9/liftOver/mm9ToHg19.over.chain.gz \
| pslMap -chainMapFile -swapMap \
/cluster/data/mm9/bed/allenBrain/allenBrainAli.psl stdin stdout \
| sort -k 14,14 -k 16,16n > unscored.psl
pslRecalcMatch unscored.psl /cluster/data/hg19/hg19.2bit \
/cluster/data/mm9/bed/allenBrain/allenBrainProbes.fa allenBrainAli.psl
# Load the database
hgsql hg19 < ~/kent/src/hg/lib/allenBrainUrl.sql
hgsql hg19 -e 'load data local infile "/cluster/data/mm9/bed/allenBrain/allenBrainUrl.tab" into table allenBrainUrl;'
hgLoadPsl hg19 allenBrainAli.psl
mkdir /gbdb/hg19/allenBrain
ln -s /cluster/data/mm9/bed/allenBrain/allenBrainProbes.fa /gbdb/hg19/allenBrain/allenBrainProbes.fa
hgLoadSeq hg19 /gbdb/hg19/allenBrain/allenBrainProbes.fa
# Make mapping between known genes and allenBrain
hgMapToGene hg19 allenBrainAli -type=psl knownGene knownToAllenBrain
############################################################################
+## Annotate 46-way multiple alignment with gene annotations
+## (DONE - 2008-12-08,23 - Hiram)
+ # Gene frames
+ ## survey all genomes to see what type of gene track to use
+ ssh hgwdev
+ mkdir /hive/data/genomes/hg19/bed/multiz46way/frames
+ cd /hive/data/genomes/hg19/bed/multiz46way/frames
+ #
+ # survey all the genomes to find out what kinds of gene tracks they have
+ cat << '_EOF_' > showGenes.csh
+#!/bin/csh -fe
+foreach db (`cat ../species.list`)
+ echo -n "${db}: "
+ set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
+ foreach table ($tables)
+ if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
+ $table == "knownGene" || $table == "xenoRefGene" ) then
+ set count = `hgsql $db -N -e "select count(*) from $table"`
+ echo -n "${table}: ${count}, "
+ endif
+ end
+ set orgName = `hgsql hgcentraltest -N -e \
+ "select scientificName from dbDb where name='$db'"`
+ set orgId = `hgsql hg19 -N -e \
+ "select id from organism where name='$orgName'"`
+ if ($orgId == "") then
+ echo "Mrnas: 0"
+ else
+ set count = `hgsql hg19 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
+ echo "Mrnas: ${count}"
+ endif
+end
+'_EOF_'
+ # << happy emacs
+ chmod +x ./showGenes.csh
+	# rearrange that output to create three sections:
+ # 1. knownGenes for hg19, mm9, rn4
+ # 2. ensGene for almost everything else
+ # 3. xenoRefGene for calJac1, petMar1, loxAfr3, papHam1, macEug1, oryCun2
+
+ mkdir genes
+ # knownGene
+ for DB in hg19 mm9 rn4
+do
+ hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
+ | genePredSingleCover stdin stdout | gzip -2c \
+ > /scratch/tmp/${DB}.tmp.gz
+ mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+ echo "${DB} done"
+done
+
+ echo "panTro2 gorGor1 ponAbe2 rheMac2 tarSyr1 micMur1 otoGar1 \
+ tupBel1 dipOrd1 cavPor3 speTri1 ochPri2 vicPac1 turTru1 \
+ bosTau4 equCab2 felCat3 canFam2 myoLuc1 pteVam1 eriEur1 sorAra1 \
+ proCap1 echTel1 dasNov2 choHof1 monDom5 ornAna1 galGal3 \
+ taeGut1 anoCar1 xenTro2 tetNig2 fr2 gasAcu1 oryLat2 danRer6" \
+ | sed -e "s/ */ /g" > ensGene.list
+
+
+ # ensGene
+ for DB in panTro2 gorGor1 ponAbe2 rheMac2 tarSyr1 micMur1 otoGar1 \
+ tupBel1 dipOrd1 cavPor3 speTri1 ochPri2 vicPac1 turTru1 \
+ bosTau4 equCab2 felCat3 canFam2 myoLuc1 pteVam1 eriEur1 sorAra1 \
+ proCap1 echTel1 dasNov2 choHof1 monDom5 ornAna1 galGal3 \
+ taeGut1 anoCar1 xenTro2 tetNig2 fr2 gasAcu1 oryLat2 danRer6
+do
+ hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
+ | genePredSingleCover stdin stdout | gzip -2c \
+ > /scratch/tmp/${DB}.tmp.gz
+ mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+ echo "${DB} done"
+done
+
+ echo "calJac1 petMar1 loxAfr3 papHam1 macEug1 oryCun2" > xenoRef.list
+
+ # xenoRefGene
+ for DB in calJac1 petMar1 loxAfr3 papHam1 macEug1 oryCun2
+do
+ hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from xenoRefGene" ${DB} \
+ | genePredSingleCover stdin stdout | gzip -2c \
+ > /scratch/tmp/${DB}.tmp.gz
+ mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+ echo "${DB} done"
+done
+
+ # the following single command doesn't work on any 32 Gb computer,
+ # requires much more memory, turn it into a kluster job, see below ...
+
+ # Create this command with this script:
+ cat << '_EOF_' > mkCmd.sh
+#!/bin/sh
+
+echo "time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames hg19 stdin stdout \\"
+for G in mm9 rn4
+do
+ if [ ! -s genes/${G}.gp.gz ]; then
+ echo "missing genes/${G}.gp.gz"
+ exit 255
+ fi
+ echo -n "${G} genes/${G}.gp.gz "
+done
+echo "\\"
+for D in `sort ensGene.list`
+do
+ if [ ! -s genes/${D}.gp.gz ]; then
+ echo "missing genes/${D}.gp.gz"
+ exit 255
+ fi
+ echo -n "${D} genes/${D}.gp.gz "
+done
+echo "\\"
+for D in `sort xenoRef.list`
+do
+ if [ ! -s genes/${D}.gp.gz ]; then
+ echo "missing genes/${D}.gp.gz"
+ exit 255
+ fi
+ echo -n "${D} genes/${D}.gp.gz "
+done
+echo "\\"
+echo " | gzip > multiz46way.mafFrames.gz) > frames.log 2>&1"
+'_EOF_'
+ # << happy emacs
+ chmod +x ./mkCmd.sh
+
+ time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames hg19 stdin stdout \
+mm9 genes/mm9.gp.gz rn4 genes/rn4.gp.gz \
+panTro2 genes/panTro2.gp.gz gorGor1 genes/gorGor1.gp.gz ponAbe2 genes/ponAbe2.gp.gz rheMac2 genes/rheMac2.gp.gz tarSyr1 genes/tarSyr1.gp.gz micMur1 genes/micMur1.gp.gz otoGar1 genes/otoGar1.gp.gz tupBel1 genes/tupBel1.gp.gz dipOrd1 genes/dipOrd1.gp.gz cavPor3 genes/cavPor3.gp.gz speTri1 genes/speTri1.gp.gz ochPri2 genes/ochPri2.gp.gz vicPac1 genes/vicPac1.gp.gz turTru1 genes/turTru1.gp.gz bosTau4 genes/bosTau4.gp.gz equCab2 genes/equCab2.gp.gz felCat3 genes/felCat3.gp.gz canFam2 genes/canFam2.gp.gz myoLuc1 genes/myoLuc1.gp.gz pteVam1 genes/pteVam1.gp.gz eriEur1 genes/eriEur1.gp.gz sorAra1 genes/sorAra1.gp.gz proCap1 genes/proCap1.gp.gz echTel1 genes/echTel1.gp.gz dasNov2 genes/dasNov2.gp.gz choHof1 genes/choHof1.gp.gz monDom5 genes/monDom5.gp.gz ornAna1 genes/ornAna1.gp.gz galGal3 genes/galGal3.gp.gz taeGut1 genes/taeGut1.gp.gz anoCar1 genes/anoCar1.gp.gz xenTro2 genes/xenTro2.gp.gz tetNig2 genes/tetNig2.gp.gz fr2 genes/fr2.gp.gz gasAcu1 genes/gasAcu1.gp.gz oryLat2 genes/oryLat2.gp.gz danRer6 genes/danRer6.gp.gz \
+calJac1 genes/calJac1.gp.gz petMar1 genes/petMar1.gp.gz loxAfr3 genes/loxAfr3.gp.gz papHam1 genes/papHam1.gp.gz macEug1 genes/macEug1.gp.gz oryCun2 genes/oryCun2.gp.gz \
+ | gzip > multiz46way.mafFrames.gz) > frames.log 2>&1
+
+ # that doesn't work on any 32 Gb computer, requires much more memory
+ # turn it into a kluster job
+ ssh swarm
+ cd /hive/data/genomes/hg19/bed/multiz46way/frames
+ cat << '_EOF_' > runOne
+#!/bin/csh -fe
+
+set C = $1
+set G = $2
+
+cat ../maf/${C}.maf | genePredToMafFrames hg19 stdin stdout \
+ ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
+'_EOF_'
+ # << happy emacs
+ chmod +x runOne
+
+ ls ../maf | sed -e "s/.maf//" > chr.list
+ ls genes | sed -e "s/.gp.gz//" | grep -v hg19 > gene.list
+
+ cat << '_EOF_' > template
+#LOOP
+runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ mkdir parts
+ gensub2 chr.list gene.list template jobList
+ para -ram=8g create jobList
+ para try ... check ... push
+# Completed: 4185 of 4185 jobs
+# CPU time in finished jobs: 72491s 1208.19m 20.14h 0.84d 0.002 y
+# IO & Wait Time: 1462162s 24369.36m 406.16h 16.92d 0.046 y
+# Average job time: 367s 6.11m 0.10h 0.00d
+# Longest finished job: 3165s 52.75m 0.88h 0.04d
+# Submission to last job: 6364s 106.07m 1.77h 0.07d
+
+ # see what it looks like in terms of number of annotations per DB:
+ find ./parts -type f | while read F
+do
+ zcat ${F}
+done | cut -f4 | sort | uniq -c | sort -n > annotation.survey.txt
+ 79191 rn4
+ 108287 petMar1
+ 139581 gorGor1
+ 140487 taeGut1
+ 143058 choHof1
+ 143233 vicPac1
+ 150073 anoCar1
+ 154462 tarSyr1
+ 163930 sorAra1
+ 164575 galGal3
+ 171191 macEug1
+ 174221 felCat3
+ 175831 dasNov2
+ 177622 ornAna1
+ 190729 eriEur1
+ 192285 tupBel1
+ 198052 speTri1
+ 199639 micMur1
+ 201731 papHam1
+ 201961 panTro2
+ 206170 oryCun2
+ 209327 ponAbe2
+ 209504 otoGar1
+ 210860 rheMac2
+ 212533 proCap1
+ 212848 myoLuc1
+ 213146 dipOrd1
+ 213479 calJac1
+ 215995 echTel1
+ 220341 ochPri2
+ 225132 loxAfr3
+ 226689 turTru1
+ 230903 monDom5
+ 232025 pteVam1
+ 232831 equCab2
+ 236945 cavPor3
+ 238167 bosTau4
+ 239857 mm9
+ 255727 canFam2
+ 316850 xenTro2
+ 359507 danRer6
+ 375156 oryLat2
+ 390076 fr2
+ 426532 gasAcu1
+ 434619 tetNig2
+
+ # load the resulting file
+ ssh hgwdev
+ cd /cluster/data/hg19/bed/multiz46way/frames
+ find ./parts -type f | while read F
+do
+ zcat ${F}
+done | sort -k1,1 -k2,2n | hgLoadMafFrames hg19 multiz46wayFrames stdin
+ # real 5m47.840s
+
+ find ./parts -type f | while read F
+do
+ zcat ${F}
+done | sort -k1,1 -k2,2n > multiz46wayFrames.bed
+
+ featureBits -countGaps hg19 multiz46wayFrames.bed
+ # 62315198 bases of 3107677273 (2.005%) in intersection
+ featureBits -countGaps hg19 multiz28wayFrames
+ # 48236360 bases of 3107677273 (1.552%) in intersection
+
+ # enable the trackDb entries:
+# frames multiz46wayFrames
+# irows on
+ # appears to work OK
+
+#############################################################################
# AFFY U133AB (Done - 2009-09-30 - Jim)
# Align probes
ssh swarm
cd /cluster/data/hg19/bed
mkdir -p affyProbes/affyU133/run
cd affyProbes/affyU133/run
mkdir psl
ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
ls -1 /hive/data/outside/affyProbes/HG-U133AB_all.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -fine -ooc=/scratch/data/hg19/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.lst mrna.lst gsub jobList
para create jobList
para try
para check
para push
para time
#Completed: 93 of 93 jobs
#CPU time in finished jobs: 21246s 354.09m 5.90h 0.25d 0.001 y
#IO & Wait Time: 349s 5.82m 0.10h 0.00d 0.000 y
#Average job time: 232s 3.87m 0.06h 0.00d
#Longest finished job: 1650s 27.50m 0.46h 0.02d
#Submission to last job: 1685s 28.08m 0.47h 0.02d
# Do sort, best in genome filter.
# to create affyU133.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyU133.psl /dev/null
rm -r raw.psl psl
# Load probes and alignments into database.
ssh hgwdev
cd /cluster/data/hg19/bed/affyProbes/affyU133
hgLoadPsl hg19 affyU133.psl
hgLoadSeq hg19 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa
##########################################################################
# GNF ATLAS 2 (Done - 2009-09-30 - Jim)
# Align probes from GNF1H chip.
ssh swarm
cd /cluster/data/hg19/bed
mkdir -p geneAtlas2/run/psl
cd geneAtlas2/run
mkdir psl
ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
ls -1 /hive/data/outside/gnf/human/atlas2/gnf1h.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -fine -ooc=/scratch/data/hg19/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.lst mrna.lst gsub jobList
para create jobList
para try
para check
para push
para time
#Completed: 93 of 93 jobs
#CPU time in finished jobs: 3299s 54.98m 0.92h 0.04d 0.000 y
#IO & Wait Time: 330s 5.50m 0.09h 0.00d 0.000 y
#Average job time: 39s 0.65m 0.01h 0.00d
#Longest finished job: 370s 6.17m 0.10h 0.00d
#Submission to last job: 477s 7.95m 0.13h 0.01d
# Do sort, best in genome filter
# to create gnf1h.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyGnf1h.psl /dev/null
rm -r raw.psl psl
# Load probes and alignments from GNF1H into database.
ssh hgwdev
cd /hive/data/genomes/hg19/bed/geneAtlas2
hgLoadPsl hg19 affyGnf1h.psl
hgLoadSeq hg19 /gbdb/hgFixed/affyProbes/gnf1h.fa
grep -v U133B ../affyProbes/affyU133/affyU133.psl \
| sed -e "s/exemplar://; s/consensus://; s/U133A://" \
| sed -e "s/;//" > affyU133A.psl
hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \
affyU133A.psl affyGnf1h.psl
# Loaded 44696 rows of expression data from hgFixed.gnfHumanAtlas2MedianRatio
# Mapped 33186, multiply-mapped 3171, missed 48, unmapped 11510
hgLoadBed hg19 gnfAtlas2 gnfAtlas2.bed
# Loaded 36357 elements of size 15
##########################################################################
# BUILD NIBB IMAGE PROBES (DONE 2009-10-12 JK)
# Make directory on san for cluster job and copy in sequence
ssh swarm
mkdir /hive/data/genomes/hg19/bed/nibbPics
cd /hive/data/genomes/hg19/bed/nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
# Make parasol job dir and sequence list files
mkdir run
cd run
mkdir psl
ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
echo ../nibbImageProbes.fa > mrna.lst
# Create parasol gensub spec file
cat << '_EOF_' > gsub
#LOOP
blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
#ENDLOOP
'_EOF_'
# Create parasol batch
gensub2 genome.lst mrna.lst gsub spec
para create spec
# Do para try/push/time etc.
#Completed: 93 of 93 jobs
#CPU time in finished jobs: 8008s 133.47m 2.22h 0.09d 0.000 y
#IO & Wait Time: 364s 6.07m 0.10h 0.00d 0.000 y
#Average job time: 90s 1.50m 0.03h 0.00d
#Longest finished job: 765s 12.75m 0.21h 0.01d
#Submission to last job: 824s 13.73m 0.23h 0.01d
# Make sort and filter
catDir psl | sort -k 10 \
| pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
| sort -k 14,14 -k 16,16n \
| sed 's#/scratch/data/hg19/nib/chr#chr#' \
| sed 's/.nib//' > ../nibbImageProbes.psl
# Make bed file and copy in stuff
ssh hgwdev
cd /hive/data/genomes/hg19/bed/nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
# Load into database
ln -s /cluster/data/hg19/bed/nibbPics/nibbImageProbes.fa /gbdb/hg19/nibbImageProbes.fa
hgLoadSeq hg19 /gbdb/hg19/nibbImageProbes.fa
hgLoadPsl hg19 nibbImageProbes.psl
##########################################################################
# Initial vgProbeTrack run for hg19 (galt 2009-10-15)
# see visiGene.txt make doc
# uses nibbImageProbes and vgProbeTrack utility
# creates vgAllProbes and knownToVisiGene
# 25931
# updates visiGene.vgPrbAliAll.
# creates and runs hgLoadSeq on /gbdb/hg19/visiGene/*.fa
##########################################################################
# make new grp table to match hg18 (DONE 2009-10-01 kuhn)
# to split regulation from expression
# phenDis group is also missing in hg19
# and add one more column: defaultIsClosed
# get the hg18.grp table into hg19
# copy the hg18.grp table into hg19.grpNew and edit
hgsql hg19
CREATE TABLE grpNew SELECT * FROM hg18.grp;
# 24 rows in set (0.00 sec)
DELETE FROM grpNew WHERE name LIKE "encode%";
DELETE FROM grpNew WHERE name LIKE "remc%";
DELETE FROM grpNew WHERE name LIKE "tcga%";
DELETE FROM grpNew WHERE name LIKE "cancer%";
DELETE FROM grpNew WHERE name LIKE "jk%";
# 10 rows in set (0.00 sec)
# move the new table into place quickly
DROP TABLE grp;
RENAME TABLE grpNew TO grp;
#########################################################################
# BUILD OMIM RELATED GENES TRACK (done 2009-10-13 jk)
ssh hgwdev
cd /hive/data/genomes/hg19/bed
mkdir omimGene
cd omimGene
# download the file morbidmap and genemap from OMIM
mkdir omim
cd omim
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/morbidmap
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/genemap
cat genemap|sed -e 's/|/\t/g' > genemap.tab
autoSql ~/src/hg/lib/omimGeneMap.as x
cat x.sql |sed -e 's/PRIMARY KEY(numbering)/KEY(omimId)/' >omimGeneMap.sql
hgLoadSqlTab -warn hg19 omimGeneMap omimGeneMap.sql genemap.tab
# got warning on 3 records, just ignore them
# Warning: load of omimGeneMap did not go as planned: 12216 record(s), 0 row(s)
rm x.c x.h
cd ..
cat omim/morbidmap|sed -e 's/|/\t/g' > mobidmap.tab
autoSql ~/src/hg/lib/omimMorbidMap.as x
cat x.sql |sed -e 's/PRIMARY KEY(description)/KEY(omimId)/' >omimMorbidMap.sql
hgLoadSqlTab -warn hg19 omimMorbidMap omimMorbidMap.sql mobidmap.tab
# get all UCSC genes (from the knownGene table) that cross-reference to a RefSeq gene
# that has a non-empty OMIM ID according to the refLink table. And use OMIM ID as
# the gene name for this new table. Please note the alignId field still holds the KG ID.
hgsql hg19 -N -e \
'select omimId, kg.* from knownGene kg, knownToRefSeq kr, refLink l where omimId != 0 and mrnaAcc=kr.value and kg.name=kr.name ' \
|cut -f 1,3-13 >o1.tab
# collect more OMIM related genes via the MIM external DB links from UniProt
hgsql hg19 -N -e \
'select extAC, kg.* from knownGene kg, kgXref k, proteome.spXref2 p where spId=p.accession and extDB="MIM" and kg.name=kgId ' \
|cut -f 1,3-13 >o2.tab
# concatenate the above two gene sets and remove duplications.
cat o1.tab o2.tab |sort -u >o3.tab
# load the result into a temp table, fanO3
hgLoadSqlTab hg19 fanO3 ~/src/hg/lib/knownGene.sql o3.tab
# while holding onto the OMIM ID, get the canonical gene (via the knownGene, knowIsoforms,
# and knownCanonical tables) that represent a cluster which contains
# initial OMIM gene in the fanO3 table
hgsql hg19 -N -e \
'select f3.name, kg.* from fanO3 f3, knownGene kg, knownCanonical c, knownIsoforms i where f3.alignId=i.transcript and kg.name=c.transcript and c.clusterId=i.clusterId'\
> o4.tab
# first column is the OMIM ID
cut -f 1 o4.tab >j1.tmp
# col 3-13 is the gene structure of the canonical KG
cut -f 3-13 o4.tab >j2.tmp
# stitch them together and remove duplicates, load the result into fanO4 table
paste j1.tmp j2.tmp |sort -u >fanO4.tab
hgLoadSqlTab hg19 fanO4 ~/src/hg/lib/knownGene.sql fanO4.tab
# finally sort the table and create bed 4 file and load it as the omimGene table
hgsql hg19 -N -e 'select chrom, txStart, txEnd, name from fanO4 order by chrom, txStart, txEnd' |sort -u >omimGene.bed
hgLoadBed hg19 omimGene omimGene.bed
# create and load the omimToKnownCanonical table.
hgsql hg19 -N -e 'select name, alignId from fanO4 order by name'\
> omimToKnownCanonical.tab
hgLoadSqlTab hg19 omimToKnownCanonical \
~/src/hg/lib/omimToKnownCanonical.sql omimToKnownCanonical.tab
# The following clean up could be done.
# hgsql hg19 -e 'drop table fanO3'
# hgsql hg19 -e 'drop table fanO4'
# rm j*.tmp
# rm o1.tab o2.tab o3.tab o4.tab
#########################################################################
# BUILD HPRD DATA FOR KNOWN GENE DETAILS PAGE LINKS (in progress 2009-10-14 jk)
# Make the directory to work in
cd /hive/data/genomes/hg19/bed
mkdir hprd
cd hprd
# Download HPRD_XML_070609.tar.gz from www.hprd.org. Unfortunately this
# requires registration, so can't just wget it.
zcat HPRD_XML_070609.tar.gz | tar -xv
# This will create 20000 or more xxxx.xml files under HPRD_XML_070609
# Create hprdToCdna table
echo HPRD_XML_070609/*.xml | xargs grep entry_cdna > j.cdna
cat j.cdna| sed -e 's/\//\t/' | sed -e 's/.xml/\t/' |\
sed -e 's/<entry_cdna>/\t/' | sed -e 's/<\//\t/'| sed -e 's/\./\t/'| cut -f 2,4|\
grep -v None >hprdToCdna.tab
hgsql hg19 <~/src/hg/lib/hprdToCdna.sql
hgsql hg19 -e 'load data local infile "hprdToCdna.tab" into table hprdToCdna'
# Create hprdToUniProt table
echo 'fgrep -H Swiss HPRD_XML_070609/$1.xml' >do1
ls HPRD_XML_070609 >j
cat j |sed -e 's/.xml/\tdo1/g' >jj
cut -f 1 jj >j.2
cut -f 2 jj >j.1
paste j.1 j.2 >doall
chmod +x do*
./doall >j.out
cat j.out|grep SwissProt | sed -e 's/\//\t/' | sed -e 's/.xml/\t/' | \
sed -e 's/Prot>/\t/' | sed -e 's/<\//\t/'| cut -f 2,4|grep -v None >hprdToUniProt.tab
hgsql hg19 <~/src/hg/lib/hprdToUniProt.sql
hgsql hg19 -e 'load data local infile "hprdToUniProt.tab" into table hprdToUniProt'
# build knownToHprd table
hgsql hg19 -N -e 'select kgId,hprdId from hprdToCdna, kgXref where cdnaId=refseq' >j.kg1
hgsql hg19 -N -e 'select kgId,hprdId from hprdToUniProt, kgXref where uniProtId=spId' >j.kg2
cat j.kg1 j.kg2 | sed 's/_.//' | sort -u >knownToHprd.tab
wc knownToHprd.tab
hgsql hg19 <~/src/hg/lib/knownToHprd.sql
hgsql hg19 -e 'load data local infile "knownToHprd.tab" into table knownToHprd'
hgsql hg19 -e 'select count(*) from knownToHprd'
# 21,516 records created
# remove temporary files.
rm j*