src/hg/makeDb/doc/hg19.txt 1.23

1.23 2009/06/02 21:43:00 hiram
the swap lastz are done for those genomes that are public on the RR
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.22
retrieving revision 1.23
diff -b -B -U 1000000 -r1.22 -r1.23
--- src/hg/makeDb/doc/hg19.txt	2 Jun 2009 18:35:57 -0000	1.22
+++ src/hg/makeDb/doc/hg19.txt	2 Jun 2009 21:43:00 -0000	1.23
@@ -1,4058 +1,4044 @@
 # for emacs: -*- mode: sh; -*-
 
 # This file describes how we made the browser database on
 # NCBI build 37 (February 2009 freeze) aka:
 #	GRCh37 - Genome Reference Consortium Human Reference 37
 #	Assembly Accession: GCA_000001405.1
 
 #	"$Id$";
 
 #############################################################################
 # Download sequence (DONE - 2009-02-04 - Hiram)
     mkdir -p /hive/data/genomes/hg19/download
     cd /hive/data/genomes/hg19/download
     mkdir -p assembled_chromosomes
     wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
         --directory-prefix=assembled_chromosomes \
         -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
 ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/assembled_chromosomes
 
     mkdir -p alternate_loci
 for N in 1 2 3 4 5 6 7 8 9
 do
 wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
     --directory-prefix=alternate_loci \
         -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
 ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/ALT_REF_LOCI_${N}
 done
 
     mkdir -p unlocalized_scaffolds
     wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
         --directory-prefix=unlocalized_scaffolds \
 	    -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
 ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unlocalized_scaffolds
 
     mkdir -p unplaced_scaffolds
     wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
         --directory-prefix=unplaced_scaffolds \
 	    -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
 ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unplaced_scaffolds
 
     mkdir -p placed_scaffolds
     wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
         --directory-prefix=placed_scaffolds \
 	    -nH --ftp-user=anonymous --ftp-password=hiram@soe.ucsc.edu \
 ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/placed_scaffolds
 
     mkdir ucscChr
     cd ucscChr
     for F in ../assembled_chromosomes/FASTA/chr*.fa
 do
     C=`basename $F`
     C=${C/.fa}
     echo -n "${C} "
     H=`head -1 "${F}"`
     chrN=`echo $H | sed -e "s/.*Homo sapiens chromosome /chr/; s/, .*//"`
     A=`echo $H | sed -e "s/. Homo.*//; s/.*gb.//"`
     echo $chrN $A
     grep -v "^#" ../assembled_chromosomes/AGP/${chrN}.comp.agp \
         | sed -e "s/^${A}/${chrN}/" > ${chrN}.agp
     echo ">${chrN}" > ${chrN}.fa
     grep -v "^>" ../assembled_chromosomes/FASTA/${chrN}.fa >> ${chrN}.fa
 done
 
     rm -f scaffolds.agp
     find ../alternate_loci -type f | grep ".agp$" | while read F
 do
     grep "^GL" $F | sed -e \
 "s/^GL000250.1/chr6_apd_hap1/" -e \
 "s/^GL000251.1/chr6_cox_hap2/" -e \
 "s/^GL000252.1/chr6_dbb_hap3/" -e \
 "s/^GL000253.1/chr6_mann_hap4/" -e \ 
 "s/^GL000254.1/chr6_mcf_hap5/" -e \
 "s/^GL000255.1/chr6_qbl_hap6/" -e \
 "s/^GL000256.1/chr6_ssto_hap7/" -e \
 "s/^GL000257.1/chr4_ctg9_hap1/" -e \
 "s/^GL000258.1/chr17_ctg5_hap1/"
 done > scaffolds.agp
 
     find ../unlocalized_scaffolds -type f | grep ".agp$" \
 | while read F
 do
     C=`basename ${F}`   
     C=${C/.unlocalized.scaf.agp}
     grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/${C}_gl\1_random/"
 done >> scaffolds.agp
 
     find ../unplaced_scaffolds -type f | grep ".agp$" \
 | while read F
 do
     grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/chrUn_gl\1/"
 done >> scaffolds.agp
 
     rm -f scaffolds.fa
     find ../alternate_loci -type f | grep ".fa$" | while read F
 do  
     sed -e \
 "s/>.*GL000250.*/>chr6_apd_hap1/" -e \
 "s/>.*GL000251.*/>chr6_cox_hap2/" -e \
 "s/>.*GL000252.*/>chr6_dbb_hap3/" -e \
 "s/>.*GL000253.*/>chr6_mann_hap4/" -e \
 "s/>.*GL000254.*/>chr6_mcf_hap5/" -e \
 "s/>.*GL000255.*/>chr6_qbl_hap6/" -e \
 "s/>.*GL000256.*/>chr6_ssto_hap6/" -e \
 "s/>.*GL000257.*/>chr4_ctg9_hap1/" -e \
 "s/>.*GL000258.*/>chr17_ctg5_hap1/" ${F}
 done > scaffolds.fa
 
     find ../unlocalized_scaffolds -type f | grep ".fa$" | while read F
 do
     sed -e \
 "s/^>.*GL\([0-9]*\).* chromosome \([0-9]*\).*/>chr\2_gl\1_random/" ${F}
 done >> scaffolds.fa
 
     find ../unplaced_scaffolds -type f | grep ".fa$" | while read F
 do
     sed -e "s/.*\(GL[0-9]*\).*/\1/; s/GL/>chrUn_gl/" $F
 done >> scaffolds.fa
 
 
 ############################################################################
 ## Create database (DONE - 2009-03-04 - Hiram)
     cd /hive/data/genomes/hg19
     cat << '_EOF_' > hg19.config.ra
 # Config parameters for makeGenomeDb.pl:
 db hg19
 scientificName Homo sapiens
 commonName Human
 assemblyDate Feb. 2009
 assemblyLabel GRCh37 Genome Reference Consortium Human Reference 37 (GCA_000001405.1)
 orderKey 14
 mitoAcc NC_001807
 fastaFiles /hive/data/genomes/hg19/download/ucscChr/*.fa
 agpFiles /hive/data/genomes/hg19/download/ucscChr/*.agp
 # qualFiles /dev/null
 dbDbSpeciesDir human
 taxId	9606
 '_EOF_'
     # << happy emacs
 
     time makeGenomeDb.pl hg19.config.ra > makeGenomeDb.log 2>&1
     #	real    14m8.958s
      featureBits -countGaps hg19 gap
     #	239845127 bases of 3137161264 (7.645%) in intersection
     featureBits -noRandom -noHap -countGaps hg19 gap
     #	234344806 bases of 3095693983 (7.570%) in intersection
     #	verify featureBits is properly ignorning haps and randoms:
     egrep -v "_" chrom.sizes | awk '{sum+=$2;print sum,$0}'
     #	3095693983 chrM 16571
     #	same total as in featureBits
 
     #	much later on, discovered that we needed a chrM definition in the
     #	agp files, added by hand to hg19/M/chrM.agp and hg19/hg19.agp the line:
 # chrM    1       16571   1       F       NC001807        1       16571   +
     #	the spaces there are tabs
 
 ############################################################################
 # running repeat masker (DONE - 2009-03-05 - Hiram)
     screen # use screen to manage this day-long job
     mkdir /hive/data/genomes/hg19/bed/repeatMasker
     cd /hive/data/genomes/hg19/bed/repeatMasker
     time doRepeatMasker.pl -bigClusterHub=swarm -buildDir=`pwd` hg19 \
 	> do.log 2>&1
     #	real    525m23.521s
     cat faSize.rmsk.txt
     #	3137161264 bases (239850802 N's 2897310462 real 1431585691
     #	upper 1465724771 lower) in 93 sequences in 1 files
     #	%46.72 masked total, %50.59 masked real
     featureBits -countGaps hg19 rmsk
     #	1465724774 bases of 3137161264 (46.721%) in intersection
     #	this is odd, 3 bases more in featureBits than were masked ?
     #	check it out, make a bed file from the featureBits:
     featureBits -countGaps -bed=rmsk.bed hg19 rmsk
     #	went down a sequence of intersections with this idea, but could
     #	not get it resolved.  It appears there are 75 bases in the rmsk
     #	table that were not masked in the 2bit file ?
     #	Later on, realized that featureBits does not count lower case N's
     #	in the "lower" category, but only in the N's category.
 
     #	trying a non-split table:
     hgsql -e "show tables;" hg19 | grep _rmsk | while read T
 do
     hgsql -e "drop table ${T};" hg19
 done
     hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.fa.out
 bad rep range [4385, 4384] line 1348605 of hg19.fa.out 
 bad rep range [5563, 5562] line 1563988 of hg19.fa.out 
 bad rep range [4539, 4538] line 3111186 of hg19.fa.out 
     #	featureBits still reports 1465724774 bases in rmsk table
     #	cleaning the hg19.fa.out file:
     cp hg19.fa.out hg19.clean.out
     # edit hg19.clean.out and remove the three lines:
 # 1467  20.7  1.2 17.6  chr14     35056767 35056794 (72292746) +  L1ME1          LINE/L1               4385 4384 (1761) 1120962
 # 1943  23.8  5.0 12.6  chr15     65775909 65775924 (36755468) +  L1MC4          LINE/L1               5563 5562 (2480) 1299299
 # 2463  25.1  5.0 11.6  chr3      121291056 121291083 (76731347) +  L1M3           LINE/L1               4539 4538 (1608) 2589267
 
     #	reload the table
     hgsql -e "drop table rmsk;" hg19
     hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.clean.out
 
     #	try masking with this clean file:
     twoBitMask /hive/data/genomes/hg19/hg19.unmasked.2bit hg19.clean.out \
 	hg19.clean.2bit
     twoBitToFa hg19.clean.2bit stdout | faSize stdin > faSize.clean.txt
     cat faSize.clean.txt
     #	this gives the lower by 75 bases result:
     #	3137161264 bases (239850802 N's 2897310462 real 1431585763 upper
     #	1465724699 lower) in 93 sequences in 1 files
     #	%46.72 masked total, %50.59 masked real
     featureBits -countGaps hg19 rmsk
     #	1465724774 bases of 3137161264 (46.721%) in intersection
     #	is the countGaps interferring ?
     featureBits hg19 rmsk
     #	1465724774 bases of 2897316137 (50.589%) in intersection
     #	nope, lets' see what the .out file has:
     grep chr hg19.clean.out | sed -e "s/^  *//" | awk '{print $5,$6-1,$7}' \
 	| sort -k1,1 -k2,2n > hg19.clean.out.bed
     featureBits -countGaps hg19 hg19.clean.out.bed
     #	1465724774 bases of 3137161264 (46.721%) in intersection
     #	is it perhaps not masking N's ?
     twoBitToFa hg19.clean.2bit stdout | grep n | less
     #	that does find some lower case n's, find all N's:
     findMotif -strand=+ -motif=gattaca -verbose=4 hg19.clean.2bit \
 	2> findMotif.out
     grep "^#GAP" findMotif.out | sed -e "s/#GAP //" > nLocations.bed
     #	which cover:
     featureBits -countGaps hg19 nLocations.bed
     #	251299071 bases of 3137161264 (8.010%) in intersection
     #	overlapping rmsk business with these N locations:
     featureBits -countGaps hg19 hg19.clean.out.bed nLocations.bed
     #	6494740 bases of 3137161264 (0.207%) in intersection
     #	and overlapping with gap:
     featureBits -countGaps hg19 gap nLocations.bed
     #	239845127 bases of 3137161264 (7.645%) in intersection
 
 ############################################################################
 # running TRF simple repeats (DONE - 2009-03-05 - Hiram)
     screen # use screen to manage this day-long job
     mkdir /hive/data/genomes/hg19/bed/simpleRepeat
     cd /hive/data/genomes/hg19/bed/simpleRepeat
     time doSimpleRepeat.pl -bigClusterHub=pk -workhorse=hgwdev \
 	-smallClusterHub=pk -buildDir=`pwd` hg19 > do.log 2>&1
     #	real    33m25.815s
 
     twoBitMask bed/repeatMasker/hg19.clean.2bit \
 	-add bed/simpleRepeat/trfMask.bed hg19.2bit
     twoBitToFa hg19.2bit stdout | faSize stdin > faSize.hg19.2bit.txt
 # 3137161264 bases (239850802 N's 2897310462 real 1430387259 upper
 # 1466923203 lower) in 93 sequences in 1 files
 # %46.76 masked total, %50.63 masked real
 
 ############################################################################
 #	prepare cluster data (DONE - 2009-03-06 - Hiram)
     cd /hive/data/genomes/hg19
     rm /gbdb/hg19/hg19.2bit
     ln -s `pwd`/hg19.2bit /gbdb/hg19/hg19.2bit
 
     time blat hg19.2bit \
 	/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
     #	Wrote 30675 overused 11-mers to 11.ooc
     #	real    3m11.302s
 
     mkdir /hive/data/staging/data/hg19
     cp -p hg19.2bit /hive/data/staging/data/hg19
     cp -p 11.ooc /hive/data/staging/data/hg19
     cp -p chrom.sizes /hive/data/staging/data/hg19
 
     mkdir separateChrs
     cd separateChrs
     grep -v "_" ../chrom.sizes | awk '{print $1}' | while read C
 do
     twoBitToFa -seq="${C}" ../hg19.2bit stdout
 done | faToTwoBit stdin hg19.chrOnly.2bit
     twoBitInfo hg19.chrOnly.2bit stdout | sort -k2,2nr > chrOnly.chrom.sizes
 
     grep "_hap" ../chrom.sizes | awk '{print $1}' | while read C
 do
     twoBitToFa -seq="${C}" ../hg19.2bit stdout
 done | faToTwoBit stdin hg19.hapOnly.2bit
     twoBitInfo hg19.hapOnly.2bit stdout | sort -k2,2nr > hapOnly.chrom.sizes
 
     grep "_" ../chrom.sizes | grep -v "_hap" | awk '{print $1}' | while read C
 do
     twoBitToFa -seq="${C}" ../hg19.2bit stdout
 done | faToTwoBit stdin hg19.scaffolds.2bit
     twoBitInfo hg19.scaffolds.2bit stdout | sort -k2,2nr > scaffolds.chrom.sizes
 
     cp -p *.2bit *.sizes /hive/data/staging/data/hg19
 
     # ask admin to sync this directory: /hive/data/staging/data/hg19/
     #	to the kluster nodes /scratch/data/hg19/
 
 ############################################################################
 # running cpgIsland business (DONE - 2009-03-06 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/cpgIsland
     cd /hive/data/genomes/hg19/bed/cpgIsland
     cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
     cd hg3rdParty/cpgIslands
     # comment out the following two lines if it compiles cleanly
     # some day  (there were some other fixups too, adding include lines)
     sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c
     mv tmp.c cpg_lh.c
     make
     cd ../../ 
     ln -s hg3rdParty/cpgIslands/cpglh.exe
     mkdir -p hardMaskedFa
     cut -f1 ../../chrom.sizes | while read C
 do
     echo ${C}
     twoBitToFa ../../hg19.2bit:$C stdout \
 	| maskOutFa stdin hard hardMaskedFa/${C}.fa
 done
 
     cut -f1 ../../chrom.sizes > chr.list
     cat << '_EOF_' > template
 #LOOP
 ./runOne $(root1) {check out line results/$(root1).cpg}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     cat << '_EOF_' > runOne
 #!/bin/csh -fe
 ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
 mv /scratch/tmp/$1.$$ $2
 '_EOF_'
     # << happy emacs
 
     gensub2 chr.list single template jobList
     para create jobList
     para try
     para check ... etc
     para time
 # Completed: 93 of 93 jobs
 # CPU time in finished jobs:        172s       2.86m     0.05h    0.00d  0.000 y
 # IO & Wait Time:                  1748s      29.14m     0.49h    0.02d  0.000 y
 # Average job time:                  21s       0.34m     0.01h    0.00d
 # Longest finished job:              34s       0.57m     0.01h    0.00d
 # Submission to last job:            83s       1.38m     0.02h    0.00d
 
     # Transform cpglh output to bed +
     catDir results | awk '{
 $2 = $2 - 1;
 width = $3 - $2;
 printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
        $1, $2, $3, $5,$6, width,
        $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
 }' > cpgIsland.bed
 
     cd /hive/data/genomes/hg19/bed/cpgIsland
     hgLoadBed hg19 cpgIslandExt -tab \
       -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
 
 # Reading cpgIsland.bed
 # Loaded 28226 elements of size 10
 # Sorted
 # Saving bed.tab
 # Loading hg19
 
 ############################################################################
 # create lift file on unBridged gaps for genbank splits (2009-03-09 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/gap
     cd /hive/data/genomes/hg19/bed/gap
     gapToLift hg19 hg19.unBridged.lift -bedFile=unBridged.lift.bed
     cp -p hg19.unBridged.lift ../../jkStuff
     cp -p hg19.unBridged.lift /hive/data/staging/data/hg19
 
 ############################################################################
 # AUTO UPDATE GENBANK RUN  (DONE - 2009-03-07,13 - Hiram)
     # align with latest genbank process.
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # edit etc/genbank.conf to add hg19 just after hg18
 
 # hg19 - GRCh37 - Genome Reference Consortium Human Reference 37
 #       Assembly Accession: GCA_000001405.1
 hg19.serverGenome = /hive/data/genomes/hg19/hg19.2bit
 hg19.clusterGenome = /scratch/data/hg19/hg19.2bit
 hg19.ooc = /scratch/data/hg19/11.ooc
 hg19.lift = /scratch/data/hg19/hg19.unBridged.lift
 # hg19.hapRegions = /hive/data/genomes/hg19/bed/haplotypePos/haplotypePos.psl
 hg19.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
 hg19.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
 hg19.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
 hg19.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
 hg19.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter}
 hg19.genbank.est.xeno.pslCDnaFilter   = ${finished.genbank.est.xeno.pslCDnaFilter}
 hg19.genbank.est.xeno.load = yes
 hg19.refseq.mrna.xeno.load  = yes
 hg19.refseq.mrna.xeno.loadDesc = yes
 hg19.mgc = yes
 hg19.orfeome = yes
 hg19.downloadDir = hg19
 # hg19.ccds.ncbiBuild = 36.3
 # hg19.upstreamGeneTbl = refGene
 # hg19.upstreamMaf = multiz28way
 # /hive/data/genomes/hg19/bed/multiz28way/species.lst multiz44way
 # /hive/data/genomes/hg19/bed/multiz44way/species.list
 hg19.genbank.mrna.blatTargetDb = yes
 
     cvs ci -m "Added hg19." etc/genbank.conf
     # update /cluster/data/genbank/:
     make etc-update
 
     ssh genbank
     screen		#	use a screen to manage this job
     cd /cluster/data/genbank
     time nice -n +19 bin/gbAlignStep -initial hg19 &
     #	logFile: var/build/logs/2009.03.10-20:28:44.hg19.initalign.log
     #	real    2761m13.680s
     #	that ran on the swarm with little interference and no problems
 
     # load database when finished
     ssh hgwdev
     screen	# use screen to manage this long running command
     cd /cluster/data/genbank
     time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad hg19 &
     # logFile: var/dbload/hgwdev/logs/2009.03.12-21:10:02.dbload.log
     #	real    369m11.941s
 
     # enable daily alignment and update of hgwdev (DONE - 2009-02-24 - Hiram)
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # add hg19 to:
         etc/align.dbs
         etc/hgwdev.dbs
     cvs ci -m "Added hg19 - Human - GRCh37" etc/align.dbs etc/hgwdev.dbs
     make etc-update
 
 #########################################################################
 #  BLATSERVERS ENTRY (DONE - 2009-03-09 - Hiram)
 #	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev
 
     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("hg19", "blat13", "17778", "1", "0"); \
 	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("hg19", "blat13", "17779", "0", "1");' \
 	    hgcentraltest
     #	test it with some sequence
 
 ############################################################################
 # Making download files (DONE - 2009-03-13 - Hiram)
     cd /hive/data/genomes/hg19
     makeDownloads.pl -allowMissedTrfs -noChromRoot hg19 \
 	> downloads.log 2>&1
 ############################################################################
 # Venter1 chain, net experiment (DONE - Hiram - 2009-03-15)
 doBlastzChainNet.pl `pwd`/DEF \
         -stop=partition -bigClusterHub=swarm \
         -smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
         -workhorse=hgwdev -fileServer=hgwdev > partition.log 2>&1
 
 doBlastzChainNet.pl `pwd`/DEF \
         -continue=blastz -stop=blastz -bigClusterHub=swarm \
         -smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
         -workhorse=hgwdev -fileServer=hgwdev > blastz.log 2>&1
 
 doBlastzChainNet.pl `pwd`/DEF \
         -continue=cat -stop=net -bigClusterHub=swarm \
         -smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
         -workhorse=hgwdev -fileServer=hgwdev > net.log 2>&1
 real    163m28.438s
 
     # to load, run it in debug, then check the load script
 doBlastzChainNet.pl `pwd`/DEF \
 	-noLoadChainSplit -continue=load -stop=load -bigClusterHub=swarm \
 	-debug -smallClusterHub=swarm -chainMinScore=1000 \
 	-chainLinearGap=medium \
 	-workhorse=hgwdev -fileServer=hgwdev > load.log 2>&1
 
     # and create a synNet for multiz, run in debug, and examine script
     #	to make sure it works correctly
 doBlastzChainNet.pl `pwd`/DEF \
 	-syntenicNet -continue=syntenicNet -stop=syntenicNet \
 	-debug -bigClusterHub=swarm \
 	-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
 	-workhorse=hgwdev -fileServer=hgwdev > synNet.log 2>&1
     #	real    31m11.216s
 
 ############################################################################
 # reset position to chr6 haplotype situation
     hgsql -e \
 'update dbDb set defaultPos="chr6:28343766-33555363" where name="hg19";' \
 	hgcentraltest
 
 # reset to a smaller range (2009-04-24 - Brooke)
 # this is the SOD1 gene, implicated in Lou Gehrig's disease.
 
     hgsql -e \
 'update dbDb set defaultPos="chr21:33,031,597-33,041,570" where name="hg19";' \
         hgcentraltest
 
 ############################################################################
 # Self Lastz run (DONE - 2009-03-19 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
     cd /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
     cat << '_EOF_'
 # human vs human
 BLASTZ=lastz
 # maximum M allowed with lastz is only 255
 BLASTZ_M=254
 # lastz does not like the O= and E= lines in the matrix file 
 #       this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
 BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
 # and place those items here
 BLASTZ_O=600
 BLASTZ_E=150
 # other parameters from hg18 vs venter1 lastz on advice from Webb
 BLASTZ_K=10000
 BLASTZ_Y=15000
 BLASTZ_T=2
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_IN_CONTIGS=0
     
 # QUERY: Human Hg19
 SEQ2_DIR=/scratch/data/hg19/hg19.2bit
 SEQ2_LEN=/scratch/data/hg19/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 SEQ2_IN_CONTIGS=0
     
 BASE=/hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     screen # use screen to manage this long-running job
     time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
 	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
 	-workhorse=hgwdev \
 	-stop=net -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
     #	cluster difficulties, finished manually, then:
     time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
 	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
 	-continue=cat -workhorse=hgwdev \
 	-stop=net -smallClusterHub=pk -bigClusterHub=swarm > cat.log 2>&1 &
 
     time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
 	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
 	-continue=load -debug -workhorse=hgwdev \
 	-stop=load -smallClusterHub=pk -bigClusterHub=swarm > load.debug.log 2>&1 &
     #	that indicates it would do:
     hgLoadChain -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
     #	adding -normScore
     hgLoadChain -normScore -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
 
 ############################################################################
 # Chimp Lastz run (DONE - 2009-03-19 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
     cd /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
     cat << '_EOF_'
 # human vs chimp
 BLASTZ=lastz
 # maximum M allowed with lastz is only 254
 BLASTZ_M=254
 # lastz does not like the O= and E= lines in the matrix file
 #       this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
 BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
 # and place those items here
 BLASTZ_O=600
 BLASTZ_E=150
 # other parameters from panTro2 vs hg18 lastz on advice from Webb
 BLASTZ_K=4500
 BLASTZ_Y=15000
 BLASTZ_T=2
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_IN_CONTIGS=0
 
 # QUERY: Chimp PanTro2
 SEQ2_DIR=/scratch/data/panTro2/panTro2.2bit
 SEQ2_LEN=/scratch/data/panTro2/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 SEQ2_IN_CONTIGS=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     screen # use screen to manage this long-running job
     time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
     #	real    173m22.880s
     #	cluster problems, continuing after lastz done:
     time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=cat \
 	-stop=net -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
 	> net.log 2>&1 &
     #	real    81m20.209s
     #	continuing with the load and adding syntenicNet
     time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=load \
 	-syntenicNet -noLoadChainSplit -chainMinScore=5000 \
 	-chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
 	> load.log 2>&1 &
     #	real    47m17.871s
     cat fb.hg19.chainPanTro2Link.txt 
     #	2747983350 bases of 2897316137 (94.846%) in intersection
 
-    #	running the swap
+    #	running the swap - DONE - 2009-05-24
     ssh swarm
     mkdir /hive/data/genomes/panTro2/bed/blastz.hg19.swap
     cd /hive/data/genomes/panTro2/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-swap /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \
 	> swap.log 2>&1 &
     #	real    723m41.377s
     cat fb.panTro2.chainHg19Link.txt 
     #	2761343871 bases of 2909485072 (94.908%) in intersection
 
 ############################################################################
 # Creating the pushQ entry (DONE - 2009-03-20 - Hiram)
     mkdir /hive/data/genomes/hg19/pushQ
     cd /hive/data/genomes/hg19/pushQ
     makePushQSql.pl hg19 > hg19.pushQ.sql 2> make.err
     # many complaints about the chain and net tables from the haplotype
     #	experiments, and this table:
     #	orfeomeGenes
     #	which is probably in genbank, and these usual ones:
     #	hg19 does not have seq
     #	hg19 does not have extFile
 
 ############################################################################
 # Determine PAR region of X and Y (DONE - 2009-03-20 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/parRegion
     cd /hive/data/genomes/hg19/bed/parRegion
     awk '$5 != "N"' ../../X/chrX.agp | awk '{print $6}' | sort > chrX.cloneList
     awk '$5 != "N"' ../../Y/chrY.agp | awk '{print $6}' | sort > chrY.cloneList
     comm -12 chrX.cloneList chrY.cloneList > chrXY.par.clone.list
     cat chrXY.par.clone.list \
 	| while read C; do grep "${C}" ../../X/chrX.agp; done \
 	| sort -k1,1 -k2,2n >> chrX.par.region.agp
     cat chrXY.par.clone.list \
 	| while read C; do grep "${C}" ../../Y/chrY.agp; done \
 	| sort -k1,1 -k2,2n >> chrY.par.region.agp
     awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrY.par.region.agp \
 	> chrY.par.region.bed
     awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrX.par.region.agp \
 	> chrX.par.region.bed
     #	use those bed files in custom tracks on hg19 to verify that they
     #	are two continuous regions with only gaps between these items
     #	these location extents are: (zero relative)
     #	chrX 60000 2722842
     #	chrX 154906585 155260560
     #	chrY 10000 2649520
     #	chrY 59034049 59363566
 
 ############################################################################
 # Gorilla Lastz run (DONE - 2009-03-21,05-13 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
     cd /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
     cat << '_EOF_'
 # human vs gorilla
 BLASTZ=lastz
 # maximum M allowed with lastz is only 254
 BLASTZ_M=254
 # lastz does not like the O= and E= lines in the matrix file
 #       this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
 BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
 # and place those items here
 BLASTZ_O=600
 BLASTZ_E=150
 # other parameters from panTro2 vs hg18 lastz on advice from Webb
 BLASTZ_K=4500
 BLASTZ_Y=15000
 BLASTZ_T=2
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=100000000
 SEQ1_LAP=10000
 SEQ1_IN_CONTIGS=0
 
 # QUERY: Gorilla gorGor1
 SEQ2_DIR=/scratch/data/gorGor1/gorGor1.2bit
 SEQ2_LEN=/scratch/data/gorGor1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=300
 SEQ2_LAP=0
 SEQ2_IN_CONTIGS=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     screen # use screen to manage this long-running job
     time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
 	> do.log 2>&1 &
     cat fb.hg19.chainGorGor1Link.txt 
     #	1723432141 bases of 2897316137 (59.484%) in intersection
     doRecipBest.pl -buildDir=`pwd` hg19 gorGor1 > rbest.log 2>&1
 
 ############################################################################
 # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR LASTZ (DONE - 2009-04-02 - Hiram)
     ssh pk
     mkdir /hive/data/genomes/hg19/bed/linSpecRep
     cd /hive/data/genomes/hg19/bed/linSpecRep
     #	create individual .out files from the master record in ../repeatMasker
     mkdir splitOut
     cat << '_EOF_' > split.csh
 #!/bin/csh -fe
 set C = $1
 head -3 ../repeatMasker/hg19.clean.out > splitOut/${C}.out
 grep "${C} " ../repeatMasker/hg19.clean.out >> splitOut/${C}.out
 '_EOF_'
     # << happy emacs
 
     cat << '_EOF_' > template
 #LOOP
 split.csh $(root1) {check out line+ splitOut/$(root1).out}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     cut -f1 ../../chrom.sizes > chrom.list
     gensub2 chrom.list single template jobList
     para create jobList
     para try ... check ... push ... etc...
 # Completed: 93 of 93 jobs
 # CPU time in finished jobs:        127s       2.12m     0.04h    0.00d  0.000 y
 # IO & Wait Time:                 17154s     285.90m     4.76h    0.20d  0.001 y
 # Average job time:                 186s       3.10m     0.05h    0.00d
 # Longest finished job:             224s       3.73m     0.06h    0.00d
 # Submission to last job:           280s       4.67m     0.08h    0.00d
     
     #	now, we can date and process each of those .out files
     #	this really should be a single creation of notInOthers
     #	These four different ones all end up to be the same anyhow
     #	the notInMouse becomes notInOthers below and the others are removed.
     mkdir dateRepeats
     cd dateRepeats
     cat << '_EOF_' > mkLSR
 #!/bin/csh -fe
 rm -f $1.out_mus-musculus_rattus_canis-familiaris_bos-taurus
 ln -s ../splitOut/$1.out .
 /scratch/data/RepeatMasker/DateRepeats \
     $1.out -query human -comp mouse -comp rat -comp dog -comp cow
 rm $1.out
 mkdir -p ../notInMouse ../notInRat ../notInDog ../notInCow
 /cluster/bin/scripts/extractRepeats 1 $1.out_mus*-taurus \
 	> ../notInMouse/$1.out.spec
 /cluster/bin/scripts/extractRepeats 2 $1.out_mus*-taurus \
 	> ../notInRat/$1.out.spec
 /cluster/bin/scripts/extractRepeats 3 $1.out_mus*-taurus \
 	> ../notInDog/$1.out.spec
 /cluster/bin/scripts/extractRepeats 4 $1.out_mus*-taurus \
 	> ../notInCow/$1.out.spec
 '_EOF_'
     #	<< happy emacs
     chmod +x mkLSR
 
     cat << '_EOF_' > template
 #LOOP
 ./mkLSR $(path1) {check out line+ $(path1).out_mus-musculus_rattus_canis-familiaris_bos-taurus}
 #ENDLOOP
 '_EOF_'
     #	<< happy emacs
 
     gensub2 ../chrom.list single template jobList
     para try ... check ... push ... etc...
     para time
 # Completed: 93 of 93 jobs
 # CPU time in finished jobs:       2441s      40.69m     0.68h    0.03d  0.000 y
 # IO & Wait Time:                   332s       5.53m     0.09h    0.00d  0.000 y
 # Average job time:                  30s       0.50m     0.01h    0.00d
 # Longest finished job:             125s       2.08m     0.03h    0.00d
 # Submission to last job:           454s       7.57m     0.13h    0.01d
 
     done
 
     #	these four types of out.spec results all turn out to be identical
     #	To check identical
     cd /hive/data/genomes/hg19/bed/linSpecRep
     find . -name "*.out.spec" | \
 	while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \
 	| sort -k1,1n | sort -t"/" -k3,3 | sed -e "s#./notIn.*/##" \
 	| sort | uniq -c | less
     #	You will see they are all a count of 4
     #	Set them up on scratch data and get to all the kluster nodes:
     mkdir /hive/data/staging/data/hg19/lineageSpecificRepeats
     cd notInMouse
     rsync -a --progress ./ /hive/data/staging/data/hg19/lineageSpecificRepeats
     cd ..
     mv notInMouse notInOthers
     #	do not need to keep all of these
     rm -fr notInRat notInDog notInCow
 
     # We also need the nibs for blastz runs with lineage specific repeats
     mkdir /hive/data/genomes/hg19/bed/nibs
     cd /hive/data/genomes/hg19/bed/nibs
     cut -f1 ../../chrom.sizes | while read C
 do
     twoBitToFa -seq=${C} ../../hg19.2bit stdout \
 	| faToNib -softMask stdin ${C}.nib
     echo "${C} done"
 done
     mkdir /hive/data/staging/data/hg19/nib
     rsync -a --progress ./ /hive/data/staging/data/hg19/nib
 
     # Ask cluster-admin to sync /scratch/ filesystem to kluster nodes
 
 #############################################################################
 # create gc5Base download file (DONE - 2009-04-24 - Hiram)
     cd /hive/data/genomes/hg19/bed/gc5Base
     hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 hg19 \
         /cluster/data/hg19/hg19.2bit | gzip -c > hg19.gc5Base.txt.gz
 
 #############################################################################
 # Physical Map Contigs - ctgPos (DONE - 2009-04-23 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/ctgPos
     cd /hive/data/genomes/hg19/bed/ctgPos
     cat << '_EOF_' > mkCtgPos.sh
 AGP="/hive/data/genomes/hg19/download/assembled_chromosomes/AGP"
 export AGP
 for F in `(cd ${AGP}; ls chr*.agp | grep -v ".comp.agp")`
 do
     C=${F/.agp/}
     grep "^CM" "${AGP}/${F}" | awk '$5 != "N"' | awk '
 {
 printf "%s\t%d\t%s\t%d\t%d\n", $6, $8-$7+1, "'${C}'", $2-1+$7-1, $2-1+$8
 }
 '
 done
 '_EOF_'
     # << happy emacs
     chmod +x mkCtgPos.sh
     ./mkCtgPos.sh > ctgPos.tab
 
     cat << '_EOF_' > mkRanCtgPos.sh
 AGP="/hive/data/genomes/hg19/download/unlocalized_scaffolds/AGP"
 export AGP
 for F in `(cd ${AGP}; ls chr*.agp)`
 do
     C=${F/.unlocalized.scaf.agp/}
     c=${C/chr/}
     export C c
     grep "^GL" "${AGP}/${F}" | awk '$5 != "N"' | awk '
 BEGIN {
     ctgName=""
     ctgStart=0
     ctgEnd=0
     chrom="'${c}'"
     ctgNameLower=""
 }
 {
 if (match(ctgName,$1)) {
     ctgEnd = $3
 } else {
     if (length(ctgName) > 0) {
         size=ctgEnd - ctgStart
 printf "%s\t%d\tchr%s_%s_random\t%d\t%d\n", ctgName, size, chrom, ctgNameLower, 
 ctgStart, ctgEnd
     }
     ctgStart = $2 - 1
     ctgEnd = $3
     ctgName = $1
     ctgNameLower = tolower($1)
     sub(".1$","",ctgNameLower)
 }
 }
 END {
 size=ctgEnd - ctgStart
 printf "%s\t%d\tchr%s_%s_random\t%d\t%d\n", ctgName, size, chrom, ctgNameLower, 
 ctgStart, ctgEnd
 }
 '
 done
 '_EOF_'
     # << happy emacs
     chmod +x mkRanCtgPos.sh
     ./mkRanCtgPos.sh >> ctgPos.tab
 
     #	fetch .sql definition from hg18
     chmod 777 .
     hgsqldump --all -c --tab=. hg18 ctgPos
     chmod 775 .
     hgsql hg19 < ctgPos.sql
     hgsql -e 'load data local infile "ctgPos.tab" into table ctgPos;' hg19
 
 #############################################################################
 # CLONE ENDS - first step for BACEND/CytoBand tracks
 #	(DONE - 2009-04-28 - Hiram)
     mkdir -p /hive/data/genomes/hg19/bed/cloneend/ncbi
     cd /hive/data/genomes/hg19/bed/cloneend/ncbi
 
     wget --timestamping \
 'ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/9606_clone_ends*.mfa.gz'
     wget --timestamping \
 'ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/9606_clone_info*.txt.gz'
 
     cd /hive/data/genomes/hg19/bed/cloneend
     # seems like the *.mfa files were split just for convenience
     # concatenate
 
     for F in ncbi/*.mfa.gz
 do
     zcat "${F}"
     echo "${F}" 1>&2
 done | gzip > all.mfa.gz
     #	that 1>&2 echos to stderr so you can see the file name and not
     #	interfere with the pipe stdout output to gzip
 
     # Convert the title line of the all.mfa file
     zcat all.mfa.gz \
 	| sed -e "s#^>gi.[0-9]*.gb.#>#; s#^>gi.[0-9]*.emb.#>#; s#\.[0-9]|.*##" \
 	    | gzip > cloneEnds.fa.gz
 
     zcat all.mfa | ./convert.pl | gzip > cloneEnds.fa.gz
 
     #	make sure nothing got broken:
     faSize all.mfa.gz
 # 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
 # in 833173 sequences in 1 files
 
     faSize cloneEnds.fa.gz
 # 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
 # in 833173 sequences in 1 files
 
     #	identical numbers
     #	you can also carefully check the names:
     zcat all.mfa.gz | grep "^>" | awk -F'|' '{print $4}' \
 	| sed -e "s/\.[0-9]$//" | sort > mfa.names
     #	should be the same as:
     zcat cloneEnds.fa.gz | grep "^>" | sed -e "s/>//" | sort > clone.names
 
 
     # concatenate the text files, too
     bash
     for F in ncbi/*.txt.gz
 do
     zcat "${F}"
     echo "${F}" 1>&2
 done | gzip > all.txt.gz
 
     # generate cloneEndPairs.txt and cloneEndSingles.txt
     zcat all.txt.gz >all.txt
     $HOME/kent/src/hg/utils/cloneEndParse.pl all.txt
 
     #	Reading in end info
     #	Writing out pair info
     #	Writing out singleton info
     #	302264 pairs and 203094 singles
     #	examined all the clone names and all the bac end names in these two
     #	files and compared with business from all.txt to make sure we properly
     #	classified all of them correctly.  We had 833,173 clone sequences,
     #	and 501,135 bac end names
 
     #	faSplit does not function correctly if given a .gz source file
     #	AND, we need the unzipped file for sequence loading below
     gunzip cloneEnds.fa.gz
     # split
     mkdir splitdir
     cd splitdir
     faSplit sequence ../cloneEnds.fa 100 cloneEnds
     #	Check to ensure no breakage:
     cat *.fa | faSize stdin
 # 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
 # in 833173 sequences in 1 files
     #	same numbers as before
 
     # load sequences
     ssh hgwdev
     mkdir /gbdb/hg19/cloneend
     cd /gbdb/hg19/cloneend
       ln -s /hive/data/genomes/hg19/bed/cloneend/cloneEnds.fa .
     cd /tmp
     hgLoadSeq hg19 /gbdb/hg19/cloneend/cloneEnds.fa
     #  Advisory lock created
     # Creating .tab file
     # Adding /gbdb/hg19/cloneend/cloneEnds.fa
     # 833173 sequences
     # Updating seq table
     # Advisory lock has been released
     # All done
 
 ##############################################################################
 # BACEND SEQUENCE ALIGNMENTS (DONE - 2009-04-28,05-20 - Hiram)
     mkdir -p /hive/data/genomes/hg19/bed/bacends/run.blat
     cd /hive/data/genomes/hg19/bed/bacends/run.blat
     #	going to run separate runs for the golden path sequence vs. the
     #	randoms, haplotypes, chrUn and chrM
     partitionSequence.pl 5000000 20000 /scratch/data/hg19/hg19.2bit \
 	/scratch/data/hg19/chrom.sizes 100 -xdir xdir.sh -lstDir tParts \
 	| egrep -v "tParts|random|_hap|chrUn" \
 	| sed -e "s/.*2bit://; s/:/./" > hg19.list
     ls -1S /hive/data/genomes/hg19/bed/cloneend/splitdir/cloneEnds*.fa \
 	> bacEnds.list
 
     ssh swarm
     cd /hive/data/genomes/hg19/bed/bacends/run.blat
 
     cat > template << '_EOF_'
 #LOOP
 runOne.csh $(file1) $(path2) {check out line+ psl/$(root1)/$(file1).$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
     cat > runOne.csh << '_EOF_'
 #!/bin/csh -fe
 
 set target = $1
 set query = $2
 set result = $3
 set partSpec = `echo $target | sed -e "s/\./:/"`
 set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
 set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
 set range = `echo $start $end | awk '{print $2-$1}'`
 set dir = $result:h
 set chr = `echo $target | sed -e "s/\..*//"`
 set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
 set tmpFile = `echo $result | sed -e "s#psl/$chr/#/scratch/tmp/#; s/.psl//"`
 
 # echo $tmpFile
 # echo "chr: $chr $start $end -> size: $chrSize, range: $range"
 /bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
 /bin/mkdir -p $dir
 /cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
         /scratch/data/hg19/hg19.2bit:$partSpec $query $tmpFile.psl
 rm -f $result
 liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
 rm -f $tmpFile.lift $tmpFile.psl
 '_EOF_'
     # << happy emacs
 
     gensub2 hg19.list bacEnds.list template jobList
     para create jobList
 # 62034 jobs in batch
     # these jobs run quickly, limit them to 250 at a time
     para try, check, -maxJob=250 push, etc ...
 # Completed: 62034 of 62034 jobs
 # CPU time in finished jobs:     506023s    8433.72m   140.56h    5.86d  0.016 y
 # IO & Wait Time:                175853s    2930.88m    48.85h    2.04d  0.006 y
 # Average job time:                  11s       0.18m     0.00h    0.00d
 # Longest finished job:             752s      12.53m     0.21h    0.01d
 # Submission to last job:          3533s      58.88m     0.98h    0.04d
 
     #	combine the alignments
     time pslSort dirs raw.psl temp psl/chr*
     #	62034 files in 24 dirs
     #	Got 62034 files 249 files per mid file
     #	real    81m2.820s
 
     #	-rw-rw-r--  1 13410334441 Apr 29 12:00 raw.psl
     # cleanup
     rmdir temp
 
     time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
                 raw.psl  bacEnds.psl /dev/null > pslReps.out 2>&1 &
     #	real    5m55.990s
     #	Processed 106254032 alignments
     #	-rw-rw-r--  1   372734361 Apr 29 12:56 bacEnds.psl
 
 
     wc -l bacEnds.psl
     #	2852977 bacEnds.psl
 
     time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
 	-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
 	-mismatch -verbose bacEnds.psl \
 	/cluster/data/hg19/bed/cloneend/cloneEndPairs.txt \
 	all_bacends bacEnds
     #	Reading pair file
     #	Reading psl file
     #	Creating Pairs
     #	Writing to files
     #	real    0m18.851s
     #	this creates the files:
     #	-rw-rw-r--  1    21178741 Apr 29 13:00 bacEnds.pairs
     #	-rw-rw-r--  1     5250873 Apr 29 13:00 bacEnds.orphan
     #	-rw-rw-r--  1      738045 Apr 29 13:00 bacEnds.short
     #	-rw-rw-r--  1      463560 Apr 29 13:00 bacEnds.slop
     #	-rw-rw-r--  1      146369 Apr 29 13:00 bacEnds.mismatch
     #	-rw-rw-r--  1        3528 Apr 29 13:00 bacEnds.long
 
     # filter and sort
     awk '$5 >= 300' bacEnds.pairs | sort -k1,1 -k2,2n > bacEndPairs.bed
     awk '$5 >= 300' bacEnds.slop bacEnds.short bacEnds.long \
 	bacEnds.mismatch bacEnds.orphan | sort -k1,1 -k2,2n > bacEndPairsBad.bed
 
     extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
 	bacEndPairsBad.bed | headRest 2 stdin | sort -k14,14 -k16,16n \
 	    > bacEndPairs.load.psl
 
 ############################################################################
 # BACEND Randoms SEQUENCE ALIGNMENTS (DONE - 2009-04-28,05-20 - Hiram)
     mkdir -p /hive/data/genomes/hg19/bed/bacends/run.randoms
     cd /hive/data/genomes/hg19/bed/bacends/run.randoms
     #	this separate run for the randoms, haplotypes, chrUn and chrM
     partitionSequence.pl 5000000 20000 /scratch/data/hg19/hg19.2bit \
 	/scratch/data/hg19/chrom.sizes 100 -xdir xdir.sh -lstDir tParts \
 	| egrep "random|_hap|chrUn" \
 	| sed -e "s/.*2bit://; s/:/./" > random.list
     cat tParts/*.lst | sed -e "s/.*2bit://; s/:/./" >> random.list
 
     ls -1S /hive/data/genomes/hg19/bed/cloneend/splitdir/cloneEnds*.fa \
 	> bacEnds.list
 
     ssh swarm
     cd /hive/data/genomes/hg19/bed/bacends/run.randoms
     gensub2 random.list bacEnds.list ../run.blat/template jobList
     # very similar runOne.csh script as above, but it doesn't need to do
     #	the lift
     cat > runOne.csh << '_EOF_'
 #!/bin/csh -fe
 
 set target = $1
 set query = $2
 set result = $3
 set partSpec = `echo $target | sed -e "s/\./:/"`
 set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
 set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
 set range = `echo $start $end | awk '{print $2-$1}'`
 set dir = $result:h
 set chr = `echo $target | sed -e "s/\..*//"`
 set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
 set tmpFile = `echo $result | sed -e "s#psl/$chr/#/scratch/tmp/#; s/.psl//"`
 
 # echo $tmpFile
 # echo "chr: $chr $start $end -> size: $chrSize, range: $range"
 /bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
 /bin/mkdir -p $dir
 /cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
         /scratch/data/hg19/hg19.2bit:$partSpec $query $tmpFile.psl
 rm -f $result
 mv $tmpFile.psl $result
 echo rm -f $tmpFile.lift
 '_EOF_'
     # << happy emacs
 
     # these jobs run fast, do not let too many of them run
     para -maxJob=100 try...check...push
     para time
 # Completed: 6762 of 6762 jobs
 # CPU time in finished jobs:      20357s     339.29m     5.65h    0.24d  0.001 y
 # IO & Wait Time:                 17839s     297.31m     4.96h    0.21d  0.001 y
 # Average job time:                   6s       0.09m     0.00h    0.00d
 # Longest finished job:             261s       4.35m     0.07h    0.00d
 # Submission to last job:           508s       8.47m     0.14h    0.01d
 
     time pslSort dirs raw.psl temp psl/chr*
     #	6762 files in 69 dirs
     #	Got 6762 files 82 files per mid file
     #	real    6m37.177s
 
     #	37044 files in 98 dirs
     #	Got 37044 files 192 files per mid file
     #	real    32m24.804s
     #	-rw-rw-r--    1 6487445210 Feb  2 21:08 raw.psl
     time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
                 raw.psl randomEnds.psl randomReps.psr > pslReps.out 2>&1 &
     #	real    0m5.761s
     #	Processed 1254273 alignments
 
     # cleanup
     rmdir temp
 
     wc -l randomEnds.psl
     #	367567 randomEnds.psl
 
     time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
 	-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
 	-mismatch -verbose randomEnds.psl \
 	/cluster/data/hg19/bed/cloneend/cloneEndPairs.txt \
 	all_bacends bacEnds
     #	Reading pair file
     #	Reading psl file
     #	Creating Pairs
     #	Writing to files
     #	real    0m11.221s
     #	this creates the files:
     #	-rw-rw-r--  1         0 Apr 29 14:53 bacEnds.slop
     #	-rw-rw-r--  1         0 Apr 29 14:53 bacEnds.short
     #	-rw-rw-r--  1         0 Apr 29 14:53 bacEnds.mismatch
     #	-rw-rw-r--  1         0 Apr 29 14:53 bacEnds.long
     #	-rw-rw-r--  1    141836 Apr 29 14:53 bacEnds.pairs
     #	-rw-rw-r--  1    649907 Apr 29 14:53 bacEnds.orphan
 
 ##############################################################################
 # BacEnds track - both results loaded together (DONE - 2009-04-29 - Hiram)
     ssh hgwdev
     cd /hive/data/genomes/hg19/bed/bacends
     # filter and sort
     awk '$5 >= 300' run.blat/bacEnds.pairs run.randoms/bacEnds.pairs \
 	| sort -k1,1 -k2,2n > bacEndPairs.bed
     awk '$5 >= 300' run.blat/bacEnds.slop run.blat/bacEnds.short \
 	run.blat/bacEnds.long run.blat/bacEnds.mismatch \
 	run.blat/bacEnds.orphan run.randoms/bacEnds.slop \
 	run.randoms/bacEnds.short run.randoms/bacEnds.long \
 	run.randoms/bacEnds.mismatch run.randoms/bacEnds.orphan \
 	    | sort -k1,1 -k2,2n > bacEndPairsBad.bed
 
     head -5 run.blat/bacEnds.psl > bacEnds.psl
     headRest 5 run.blat/bacEnds.psl > t.psl
     headRest 5 run.randoms/randomEnds.psl >> t.psl
     sort -k14,14 -k16,16n t.psl >> bacEnds.psl
     extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
 	bacEndPairsBad.bed | headRest 2 stdin | sort -k14,14 -k16,16n \
 	    > bacEnds.load.psl
 
 
     #	load them into the database
     ssh hgwdev
     cd /hive/data/genomes/hg19/bed/bacends
     #	CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
     awk '{print $4}' bacEndPairs.bed | grep " "
     awk '{print $5}' bacEndPairs.bed | sort | uniq -c
     #	result should be the scores, no extraneous strings:
     #	156984 1000
     #	   195 300
     #	   316 375
     #	   297 500
     #	  1476 750
     #	edit the file and fix it if it has a bad name.
     hgLoadBed -notItemRgb hg19 bacEndPairs bacEndPairs.bed \
                  -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
     #	Loaded 208922 elements of size 11
     # note - this track isn't pushed to RR, just used for assembly QA
     hgLoadBed -notItemRgb hg19 bacEndPairsBad bacEndPairsBad.bed \
                  -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
     #	Loaded 79004 elements of size 11
     #hgLoadPsl hg18 -nobin -table=all_bacends bacEnds.load.psl
     # NOTE: truncates file to 0 if -nobin is used
     hgLoadPsl hg19 -table=all_bacends bacEnds.load.psl
     # one complaint, there appears to be a bogus insert count in one
     #	of the blat results:
 # < 585   797     67      0       3       2       -63     9       79188   +      AQ743980 852     42      846     chr19_gl000208_random   92689   4045    84100  11       14,124,84,496,53,6,20,28,28,10,4,       42,56,180,200,696,750,756,776,804,832,842,      4045,5767,7086,83449,83946,83999,84006,84027,84056,84085,84096,
 Became:
 # > 585   797     67      0       3       2       0       9       79188   +	 AQ743980 852     42      846     chr19_gl000208_random   92689   4045	84100  11       14,124,84,496,53,6,20,28,28,10,4,	42,56,180,200,696,750,756,776,804,832,842,	4045,5767,7086,83449,83946,83999,84006,84027,84056,84085,84096,
 
     hgsql -N -e "select count(*) from all_bacends;" hg19
     #	 2289275
     hgsql -N -e "select count(*) from all_bacends;" hg18
     #	1727387
     hgsql -N -e "select count(*) from all_bacends;" hg17
     #	 1729146
 
     nice featureBits hg19 all_bacends
 # 230917362 bases of 2897316137 (7.970%) in intersection
     nice featureBits hg18 all_bacends
 # 227770876 bases of 2881515245 (7.905%) in intersectio
     nice featureBits hg17 all_bacends
 # 225763317 bases of 2866216770 (7.877%) in intersection
 
     nice featureBits hg19 bacEndPairs
 # 236889607 bases of 2897316137 (8.176%) in intersection
     nice featureBits hg18 bacEndPairs
 # 162690030 bases of 2881515245 (5.646%) in intersection
     nice featureBits hg17 bacEndPairs
 # 162099487 bases of 2866216770 (5.656%) in intersection
 
     nice featureBits hg19 bacEndPairsBad
 # 38344094 bases of 2897316137 (1.323%) in intersection
     nice featureBits hg18 bacEndPairsBad
 # 37326990 bases of 2881515245 (1.295%) in intersection
     nice featureBits hg17 bacEndPairsBad
 # 37437558 bases of 2866216770 (1.306%) in intersection
 
 ############################################################################
 # STS MARKERS (DONE - 2009-04-30 - 2009-05-06 - Hiram)
     mkdir /hive/data/outside/ncbi/sts.2009-04
     cd /hive/data/outside/ncbi
     ln -s sts.2009-04 sts.11
     cd /hive/data/outside/ncbi/sts.2009-04
     wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
     wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
     wget --timestamping ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
     gunzip sts.gz
     mv sts dbSTS.fa
 
     #	these items are copied in from the previous builds
     cp -p /cluster/data/ncbi/sts.10/all.STS.fa ./all.STS.fa.prev
     cp -p /cluster/data/ncbi/sts.10/stsInfo2.bed ./stsInfo2.bed.prev
     #	edit stsInfo2.bed.prev for a
     #	manual fixup of error that is in the hg18 bed file, replace
     #	the line for AFM067XA9 to fix bogus long list of aliases to be:
 # 22788^IAFM067XA9^I1^IZ66598^I1^IGDB:1221611,^I5^I067XA9,GDB:1221611,W202,Z66598,SWSS2303^I69047^I0^I^ITCTTGGGGTTTAATTGCTTT^ICTTTGCCACAATCTTACACA^I149^IHomo sapiens^I1^I2^I6453,6454,^I0^I^I^I^I0^I0^I^I^I0^I0^IAFM067XA9^Ichr7^I145^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0
     #	as taken directly out of the hg18.stsInfo2 table which was fixed
     #	by Bob and Archana
 
     # Convert the title line of the dbSTS.fa file
     #	Verify that column 3 only contains gb emb dbj
     grep "^>" dbSTS.fa | awk -F'|' '{print $3}' | sort | uniq -c 
 #   39124 dbj
 #   57375 emb
 # 1212541 gb
     #	if that is true, this sed will work:
     cat dbSTS.fa \
 	| sed -e "s#^>gi.[0-9]*.gb.#>#; s#^>gi.[0-9]*.emb.#>#; s#^>gi.[0-9]*.dbj.#>#; s#\.[0-9]|.*##" \
 	    > UniSTS.convert.fa
 
     # get accessions
     grep ">" UniSTS.convert.fa | sed -e "s/^>//" | sort > UniSTS.acc
     #	head and tail that to ensure names are reasonable, odd names would
     #	show up at the beginning or end
     wc -l UniSTS.acc
     #	1309040 UniSTS.acc
 
     # NOTE: updateStsInfo creates new stsInfo2.bed, all.primers,
     #   all.STS.fa, stsAlias.bed files
 
     updateStsInfo -verbose=1 -gb=UniSTS.acc stsInfo2.bed.prev all.STS.fa.prev \
 	UniSTS.sts UniSTS.aliases UniSTS.convert.fa new
 
     #	verify the number of aliases is reasonable:
     awk '{print $3}' new.alias | sort | uniq -c | sort -rn | less
     #	50 D7S831
     #	34 CHLC.GATA2B06.465
     #	24 CHLC.GATA11E11
     #	23 AFM276ZF5
     #	23 AFM273YH9
     #	22 SHGC-133043
     #	... etc ...
     #	verify there are no unusually long or short lines:
     awk '{printf "%d\n", length($0)}' new.info | sort -n | head -3
     #	143
     #	144
     #	144
     awk '{printf "%d\n", length($0)}' new.info | sort -n | tail -3
     #	552
     #	553
     #	644
     # check for null in the new files:
     grep -i null new.*
     #	if the new files look good, they can become the set to use:
     mv new.info stsInfo2.bed
     mv new.primers all.primers
     mv new.alias stsAlias.bed
     mv new.fa all.STS.fa
 
     # get list of all STS id's in the fasta file
     sed -n 's/^>\([0-9][0-9]*\) .*/\1/p' all.STS.fa | sort -n >  all.STS.id
     wc -l all.STS.id
     # 100520 total sequences
     # in hg18 this was: 93698 total sequences
     $HOME/kent/src/hg/stsMarkers/convertPrimerToFA all.primers > all.primers.fa
     # check that fasta file for unusual length sequences:
     faSize all.primers.fa
 # 97815329 bases (83677626 N's 14137703 real 14137703 upper 0 lower) in 317592 sequences in 1 files
 # Total size: mean 308.0 sd 279.3 min 40 (dbSTS_144) max 30000 (dbSTS_156892) median 244
 
     # Copy stsInfo2.bed and stsAlias.bed to data directory becuase
     # these will be loaded into the database later
     mkdir -p /hive/data/genomes/hg19/bed/sts
     cp -p stsInfo2.bed /hive/data/genomes/hg19/bed/sts/
     cp -p stsAlias.bed /hive/data/genomes/hg19/bed/sts/
 
     # Create sts sequence alignments
     mkdir /hive/data/genomes/hg19/bed/sts/split
 
     faSplit sequence all.STS.fa 100 /hive/data/genomes/hg19/bed/sts/split/sts
 
     ssh swarm
     mkdir /hive/data/genomes/hg19/bed/sts/run
     cd /hive/data/genomes/hg19/bed/sts/run
 
     #	going to run separate runs for the golden path sequence vs. the
     #	randoms, haplotypes, chrUn and chrM
     #	40,000,000 chunck sizes, 20,000 overlap
     partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
 	/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
 	| egrep -v "tParts|random|_hap|chrUn" \
 	| sed -e "s/.*2bit://;" > hg19.list
     ls -1S ../split > sts.list
 
     cat > template << '_EOF_'
 #LOOP
 runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     cat > runOne.csh << '_EOF_'
 #!/bin/csh -fe
 
 set partSpec = $1
 set query = $2.fa
 set result = $3
 set tmpFile = "/scratch/tmp/$1.$2"
 set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
 set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
 set range = `echo $start $end | awk '{print $2-$1}'`
 set chr = `echo $partSpec | sed -e "s/:.*//"`
 set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
 /bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
 /bin/mkdir -p psl/$partSpec
 /bin/rm -f $tmpFile
 /cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
     /scratch/data/hg19/hg19.2bit:$partSpec \
 	../split/${query} -stepSize=5 $tmpFile.psl
 /bin/rm -f $result
 /cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
 # rm -f $tmpFile.lift $tmpFile.psl
 '_EOF_'
     # << happy emacs
     chmod +x runOne.csh
 
     gensub2 hg19.list sts.list template jobList
     #	these jobs run quickly, allow only 100 at a time
     para -maxJob=100 create jobList
 # 8367 jobs in batch
     para try ... check ... push ... etc
 # Completed: 8366 of 8366 jobs
 # CPU time in finished jobs:      89744s    1495.74m    24.93h    1.04d  0.003 y
 # IO & Wait Time:                 25467s     424.44m     7.07h    0.29d  0.001 y
 # Average job time:                  14s       0.23m     0.00h    0.00d
 # Longest finished job:              53s       0.88m     0.01h    0.00d
 # Submission to last job:          1592s      26.53m     0.44h    0.02d
 
     #	and, run the randoms as a separate run:
     mkdir /hive/data/genomes/hg19/bed/sts/run.randoms
     cd /hive/data/genomes/hg19/bed/sts/run.randoms
     partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
 	/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
 	| egrep "tParts|random|_hap|chrUn"
     cat tParts/* | sed -e "s/.*2bit://;" > hg19.list
     ls -1S ../split > sts.list
     cat > template << '_EOF_'
 #LOOP
 runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     cat > runOne.csh << '_EOF_'
 #!/bin/csh -fe
 
 set partSpec = $1
 set query = $2.fa
 set result = $3
 set tmpFile = "/scratch/tmp/$1.$2"
 /bin/mkdir -p psl/$partSpec
 /bin/rm -f $tmpFile
 /cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
     /scratch/data/hg19/hg19.2bit:$partSpec \
 	../split/${query} -stepSize=5 $tmpFile.psl
 /bin/rm -f $result
 mv $tmpFile.psl $result
 /bin/rm -f $tmpFile.psl
 '_EOF_'
     # << happy emacs
     chmod +x runOne.csh
 
     gensub2 hg19.list sts.list template jobList
     #	these jobs run quickly, allow only 100 at a time
     para -maxJob=100 create jobList
 # 6486 jobs in batch
     para try ... check ... push ... etc
 # Completed: 6486 of 6486 jobs
 # CPU time in finished jobs:       2206s      36.77m     0.61h    0.03d  0.000 y
 # IO & Wait Time:                 16505s     275.08m     4.58h    0.19d  0.001 y
 # Average job time:                   3s       0.05m     0.00h    0.00d
 # Longest finished job:              21s       0.35m     0.01h    0.00d
 # Submission to last job:           601s      10.02m     0.17h    0.01d
 
     # Compile sts sequence results
     ssh hgwdev
     cd /hive/data/genomes/hg19/bed/sts/run
     time pslSort dirs raw.psl temp psl/chr*
     #	8366 files in 89 dirs
     #	Got 8366 files 91 files per mid file
     #	real    8m50.714s
     #	-rw-rw-r--  1 810438277 May  1 11:45 raw.psl
     cd /hive/data/genomes/hg19/bed/sts/run.randoms
     time pslSort dirs raw.psl temp psl/chr*
     #	6486 files in 69 dirs
     #	Got 6486 files 81 files per mid file
     #	real    1m42.120s
     #	-rw-rw-r--  1 18378188 May  1 11:52 raw.psl
 
     rmdir temp
     cd /hive/data/genomes/hg19/bed/sts
     cat run*/raw.psl | egrep -v "^$|^psLayout|^match|^ |^-" \
 	| pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons stdin \
 	stsMarkers.psl /dev/null
     #	Processed 7412166 alignments
     #	-rw-rw-r-- 1 12031760 May  1 11:57 stsMarkers.psl
 
     $HOME/kent/src/hg/stsMarkers/extractPslInfo -h stsMarkers.psl
     # creates stsMarkers.psl.initial
     #	-rw-rw-r-- 1  4485053 May  1 12:06 stsMarkers.psl.initial
     wc -l stsMarkers.psl.initial
     #	101338  stsMarkers.psl.initial
     #	this command needs a chrom_names file to work correctly with this
     #	new style of layout for hg19:
     cd /hive/data/genomes/hg19
     cut -f1 chrom.sizes | sed -e "s/chr//" > chrom_names
     cd /hive/data/genomes/hg19/bed/sts
 
     $HOME/kent/src/hg/stsMarkers/findAccession.pl -agp stsMarkers.psl.initial \
 	/cluster/data/hg19
     wc -l stsMarkers.psl.initial.acc
     #	101338  stsMarkers.psl.initial.acc
 
     sort -k4,4n stsMarkers.psl.initial.acc > stsMarkers.final
 
     # determine found markers (4th field in file)
     cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
     wc -l stsMarkers.found
     #	96472 stsMarkers.found
     #	out of 100520 total sequences from:
     wc -l /hive/data/outside/ncbi/sts.2009-04/all.STS.id
     #	There are lots of duplicates:
     wc -l stsMarkers.final
     #	101338 stsMarkers.final
     #	And a lot of them are just completely haywire:
     awk '$3-$2 < 1001' stsMarkers.final | wc -l
     #	98382
     #	filter out markers that are too long
     awk '$3-$2 < 1001' stsMarkers.final > stsMarkers.1K.size.filtered
 
     #  alignment of primers
     ssh swarm
     cd /hive/data/outside/ncbi/sts.2009-04
     awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
 	    all.primers > all.primers.ispcr
     mkdir primerAlign
     cd primerAlign
     mkdir split
     cd split
     split -l 5000 ../../all.primers.ispcr primer_
     ls > ../primer.list
 
     cd ..
     #	we need a 10.ooc file for this business
     time blat /scratch/data/hg19/hg19.2bit \
 	/dev/null /dev/null -tileSize=10 -makeOoc=10.ooc -repMatch=1024
 # Wrote 146902 overused 10-mers to 10.ooc
 # real    19m16.758s
 
     # separate runs for whole genome vs. randoms
     mkdir run
     cd run
     partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
 	/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
 	| egrep -v "tParts|random|_hap|chrUn" \
 	| sed -e "s/.*2bit://;" > hg19.list
     cat > runOne.csh << '_EOF_'
 #!/bin/csh -fe
 
 set partSpec = $1
 set primer = ../split/$2
 set result = $3
 set tmpFile = "/scratch/tmp/$1.$2"
 set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
 set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
 set range = `echo $start $end | awk '{print $2-$1}'`
 set chr = `echo $partSpec | sed -e "s/:.*//"`
 set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
 /bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
 /bin/mkdir -p psl/$partSpec
 /bin/rm -f $tmpFile.psl
 /cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
     -ooc=/hive/data/outside/ncbi/sts.2009-04/primerAlign/10.ooc -stepSize=5 \
 	/scratch/data/hg19/hg19.2bit:$partSpec $primer $tmpFile.psl
 /bin/rm -f $result
 /cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
 rm -f $tmpFile.lift $tmpFile.psl
 '_EOF_'
     # << happy emacs
     chmod +x runOne.csh
 
     cat > template << '_EOF_'
 #LOOP
 runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     gensub2 hg19.list ../primer.list template jobList
     para create jobList
 # 5696 jobs in batch
     para try ... check ... push ... etc
 # Completed: 5696 of 5696 jobs
 # CPU time in finished jobs:     203899s    3398.32m    56.64h    2.36d  0.006 y
 # IO & Wait Time:                 22049s     367.48m     6.12h    0.26d  0.001 y
 # Average job time:                  40s       0.66m     0.01h    0.00d
 # Longest finished job:            5314s      88.57m     1.48h    0.06d
 # Submission to last job:          5418s      90.30m     1.50h    0.06d
 # Estimated complete:                 0s       0.00m     0.00h    0.00d
 
     #	sort and filter the results
     cd psl
     pslSort dirs raw.psl temp chr*
     #	5696 files in 89 dirs
     #	Got 5696 files 75 files per mid file
     #	-rw-rw-r-- 1 456802973 May  4 13:32 raw.psl
     cd ..
     mkdir filter
     pslQuickFilter -minMatch=26 -maxMismatch=5 \
         -maxTinsert=5000 -verbose psl/ filter/
     #	-rw-rw-r-- 1 50302564 May  4 13:35 raw.psl
 
     #	And, for the randoms
     mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/runRandoms
     cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/runRandoms
     
     partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
 	/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
 	| egrep "tParts|random|_hap|chrUn" \
 	| sed -e "s/.*2bit://;" > hg19.list
     cat tParts/* | sed -e "s/.*2bit://;" > hg19.list
     cat tParts/* > hg19.list
 
     cat > runOne.csh << '_EOF_'
 #!/bin/csh -fe
 
 set partSpec = $1
 set primer = ../split/$2
 set result = $3
 set tmpFile = "/scratch/tmp/$1.$2"
 /bin/mkdir -p psl/$partSpec
 /bin/rm -f $tmpFile.psl
 /cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
     -ooc=/hive/data/outside/ncbi/sts.2009-04/primerAlign/10.ooc -stepSize=5 \
 	/scratch/data/hg19/hg19.2bit:$partSpec $primer $tmpFile.psl
 /bin/rm -f $result
 mv $tmpFile.psl $result
 '_EOF_'
     # << happy emacs
     chmod +x runOne.csh
 
     #	can not use line+ check here, many of them are empty
     cat > template << '_EOF_'
 #LOOP
 runOne.csh $(file1) $(root2) {check out line psl/$(file1)/$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     gensub2 hg19.list ../primer.list template jobList
     #	they run quickly, limit to 100
     para -maxJob=100 create jobList
     para try ... check ... push ... etc
 # Completed: 4416 of 4416 jobs
 # CPU time in finished jobs:       1746s      29.09m     0.48h    0.02d  0.000 y
 # IO & Wait Time:                 11407s     190.12m     3.17h    0.13d  0.000 y
 # Average job time:                   3s       0.05m     0.00h    0.00d
 # Longest finished job:               8s       0.13m     0.00h    0.00d
 # Submission to last job:           147s       2.45m     0.04h    0.00d
 
     #	sort and filter the results
     cd psl
     pslSort dirs raw.psl temp chr*
     #	4416 files in 69 dirs
     #	Got 4416 files 66 files per mid file
     rmdir temp
     #	-rw-rw-r-- 1 9066053 May  4 13:31 raw.psl
 
     #	putting the two runs together
     mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/psl
     cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/psl
     ln -s ../run/filter/raw.psl run.psl
     ln -s ../runRandoms/filter/raw.psl runRandoms.psl
     #	-rw-rw-r-- 1 50302564 May  4 13:35 run.psl
     #	-rw-rw-r-- 1   825973 May  4 13:35 runRandoms.psl
     cd ..
     pslSort dirs primers.psl temp psl
     #	2 files in 1 dirs
     #	Got 2 files 1 files per mid file
     #	-rw-rw-r-- 1 51128110 May  4 13:39 primers.psl
     wc -l primers.psl
     #	448107 primers.psl
     rmdir temp
     pslFilterPrimers primers.psl ../all.primers primers.filter.psl
     # creates primers.filter.unlifted.psl.notfound.primers
     wc -l primers*
     #	237962 primers.filter.psl
     #	97191 primers.filter.psl.notfound.primers
 
     #	see if ePCR can find some of these notfound
     ssh swarm
     mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/epcr
     cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/epcr
 
     mkdir split
     cd split
     split -l 5000 ../../primers.filter.psl.notfound.primers  primers_
     cd ..
     ls -1S split > primers.lst
     partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
 	/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
 	| grep -v tParts | sed -e "s/.*2bit://;" > hg19.list
     cat tParts/* | sed -e "s/.*2bit://;" >> hg19.list
 
     cat > runOne.csh << '_EOF_'
 #!/bin/csh -fe
 
 set partSpec = $1
 set primer = split/$2
 set result = $3
 set tmpFile = "/scratch/tmp/$1.$2"
 set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
 set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
 set range = `echo $start $end | awk '{print $2-$1}'`
 set chr = `echo $partSpec | sed -e "s/:.*//"`
 set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
 /bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
 /bin/mkdir -p epcr/$partSpec
 /bin/rm -f $tmpFile.psl
 twoBitToFa /scratch/data/hg19/hg19.2bit:$partSpec $tmpFile.fa
 /cluster/bin/scripts/runEpcr64 $primer $tmpFile.fa $tmpFile.epcr
 /bin/rm -f $result
 /bin/mv $tmpFile.epcr $result
 rm -f $tmpFile.fa $tmpFile.lift $tmpFile.psl $tmpFile.*
 '_EOF_'
     # << happy emacs
     chmod +x runOne.csh
 
     cat > template << '_EOF_'
 #LOOP
 runOne.csh $(file1) $(root2) {check out line epcr/$(file1)/$(root2).epcr}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     gensub2 hg19.list primers.lst template jobList
     para create jobList
 	# 3160 jobs
     para try ... check ... push ... etc ...
 # Completed: 3160 of 3160 jobs
 # CPU time in finished jobs:      86253s    1437.54m    23.96h    1.00d  0.003 y
 # IO & Wait Time:                 11196s     186.61m     3.11h    0.13d  0.000 y
 # Average job time:                  31s       0.51m     0.01h    0.00d
 # Longest finished job:              89s       1.48m     0.02h    0.00d
 # Submission to last job:           237s       3.95m     0.07h    0.00d
 
     find ./epcr -type f | xargs cat > all.epcr
     wc -l all.epcr
     #	797286 all.epcr
     # convert the coordinates from the partitionSequence.pl to a lift file
     awk '{print $1}' all.epcr | sort -u > hg19.partSpec.txt
     $HOME/kent/src/hg/stsMarkers/liftFromSpec.pl hg19 hg19.partSpec.txt \
 	> all.epcr.lift
     cat all.epcr | sed -e "s/\.\./ /; s/  */\t/g" \
 	| liftUp -type=.bed stdout all.epcr.lift error stdin \
 	| awk '
 {
 printf "%s %d..%d %d %d\n", $1, $2, $3, $4, $5
 }
 ' > all.epcr.lifted
 
     pslFilterPrimers -epcr=all.epcr.lifted -verbose=1 ../primers.psl \
     /cluster/home/hiram/bin/x86_64/pslFilterPrimers -epcr=all.epcr.lifted \
 	-verbose=1 ../primers.psl ../../all.primers epcr.primers.psl
     #	this took a long time, many hours
 # -rw-rw-r--   1  2785254 May  5 17:28 epcr.not.found
 # -rw-rw-r--   1 27343510 May  5 17:28 epcr.primers.psl
 # -rw-rw-r--   1  1616885 May  5 17:28 epcr.primers.psl.notfound.primers
 
     time ./epcrToHgPsl.pl epcr.not.found ../../all.primers \
     time $HOME/kent/src/hg/stsMarkers/epcrToPsl epcr.not.found \
 	../../all.primers /hive/data/genomes/hg19
     #	real    69m38.444s
     #	-rw-rw-r--   1        0 May  6 14:18 epcr.not.found.nomatch
     #	-rw-rw-r--   1  8369138 May  6 15:26 epcr.not.found.psl
 
     #	combining everything together now
     cd /hive/data/outside/ncbi/sts.2009-04/primerAlign
 
     sort -u primers.filter.psl epcr/epcr.primers.psl epcr/epcr.not.found.psl \
                 | sort -k15,15 -k17,17n > primers.final.psl
     wc -l primers.final.psl
     #	310705 primers.final.psl
 
     time $HOME/kent/src/hg/stsMarkers/fixPrimersQueryGaps.pl \
         ../all.primers primers.final.psl > primers.final.fix.psl
     #	real    0m19.580s
     wc -l primers.final.fix.psl
     #	310705 primers.final.fix.psl
 
     # Extract relevant info, make alignments unique, and create final file to
     #	be merged with full sequence alignments
     $HOME/kent/src/hg/stsMarkers/extractPslInfo -h primers.final.fix.psl
     #	real    0m15.303s
     #	-rw-rw-r-- 1 15660447 May  6 15:44 primers.final.fix.psl.initial
     wc -l primers.final.fix.psl.initial
     #	308210 primers.final.fix.psl.initial
     $HOME/kent/src/hg/stsMarkers/findAccession.pl -agp \
 	primers.final.fix.psl.initial /hive/data/genomes/hg19
     wc -l primers.final.fix.psl.initial.acc
     #	308210 primers.final.fix.psl.initial.acc
 
     $HOME/kent/src/hg/stsMarkers/getStsId ../stsInfo2.bed \
 	primers.final.fix.psl.initial.acc | sort -k 4n > primers.final
     wc -l primers.final
     # 308210 primers.final
     #	There doesn't appear to be any use for this primers.ids list
     #	except for curiosity.  Check the head and tail of this list to
     #	verify no garbage is in here.  There should just be numbers.
     awk '{print $4}' primers.final | sort -n | uniq > primers.ids
     wc -l primers.ids
     #	290961 primers.ids
 
     # Merge primer and sequence files to create final bed file
     # Merge (combineSeqPrimerPos) takes about an hour to run
     cd /hive/data/genomes/hg19/bed/sts
     time $HOME/kent/src/hg/stsMarkers/combineSeqPrimerPos stsMarkers.final \
 	/hive/data/outside/ncbi/sts.2009-04/primerAlign/primers.final
     #	real    0m12.310s
     #	-rw-rw-r-- 1 15222346 May  6 15:55 stsMarkers_pos.rdb
     wc -l stsMarkers_pos.rdb
     #	315308 stsMarkers_pos.rdb
 
     time /cluster/bin/scripts/createSTSbed \
 	/hive/data/outside/ncbi/sts.2009-04/stsInfo2.bed  \
 	stsMarkers_pos.rdb > stsMap.bed
     #	real    0m31.886s
     #	-rw-rw-r-- 1 38244880 May  6 16:25 stsMap.bed
     wc -l stsMap.bed
     #	305914 stsMap.bed
 
     # Set up sequence files
     ssh hgwdev
     mkdir /gbdb/hg19/sts.11/
     ln -s /hive/data/outside/ncbi/sts.11/all.STS.fa \
 	/gbdb/hg19/sts.11/all.STS.fa
     ln -s /hive/data/outside/ncbi/sts.11/all.primers.fa \
         /gbdb/hg19/sts.11/all.primers.fa
 
     # Load all files
     cd /hive/data/genomes/hg19/bed/sts
     hgLoadSeq hg19 /gbdb/hg19/sts.11/all.STS.fa /gbdb/hg19/sts.11/all.primers.fa
     #	Creating seq.tab file
     #	Adding /gbdb/hg19/sts.11/all.STS.fa
     #	100520 sequences
     #	Adding /gbdb/hg19/sts.11/all.primers.fa
     #	317592 sequences
     #	Updating seq table
     #	Advisory lock has been released
     #	All done
 
 
     hgsql hg19 < $HOME/kent/src/hg/lib/stsInfo2.sql
     hgsql hg19 < $HOME/kent/src/hg/lib/stsAlias.sql
     #	these files already exist here from previous operations
     # cp -p /hive/data/outside/ncbi/sts.11/{stsInfo2.bed,stsAlias.bed} .
     hgsql hg19 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
     hgsql hg19 -e 'load data local infile "stsAlias.bed" into table stsAlias'
     #	a couple minutes for each load above
     #	filter the stsMap.bed to eliminate items longer than 5,000 bases,
     #	takes out about 850:
     awk '$3-$2 < 5001' stsMap.bed | sort -k1,1 -k2,2n \
 	> stsMap.filtered.5000.bed
 
     hgLoadBed -notItemRgb -noBin -tab \
 	-sqlTable=$HOME/kent/src/hg/lib/stsMap.sql hg19 stsMap \
 	    stsMap.filtered.5000.bed
     #	Loaded 305064 elements of size 28
 
     ln -s \
 /hive/data/outside/ncbi/sts.2009-04/primerAlign/primers.final.fix.psl \
 	primers.psl
 
     hgLoadPsl -nobin -table=all_sts_primer hg19 primers.psl
     hgLoadPsl -nobin -table=all_sts_seq hg19 stsMarkers.psl
 
 ##############################################################################
 # FISH CLONES (WORKING - 2009-04-29 - Hiram)
 # The STS Marker and BAC End Pairs tracks must be completed prior to
 # creating this track.  
 
     mkdir /hive/data/outside/ncbi/fishClones/fishClones.2009-04/
     cd /hive/data/outside/ncbi/fishClones/fishClones.2009-04/
 
 # Download information from NCBI
         # point browser at:
 #   http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
 # change "Sequence tag:" to "placed on contig"
         # change "Show details on sequence-tag" to "yes"
         # change "Download or Display" to "Download table for UNIX"
         # press Submit - save as
 # /hive/data/outside/ncbi/fishClones/fishClones.2009-04/hbrc.txt
     chmod 664 /hive/data/outside/ncbi/fishClones/fishClones.2009-04/hbrc.txt
 
 #	Unfortunately the format of this hbrc file has changed since
 #	last time.  The columns have been rearranged, and one important
 #	column is missing, the contig information.  So, let's see if we
 #	can recover the original format by putting this together with
 #	some other things we have here.
     $HOME/kent/src/hg/fishClones/fixup.hbrc.pl hbrc.txt \
 	/hive/data/genomes/hg19/bed/fishClones/seq_clone.pmd > fixed.hbrc.txt \
 	    2> dbg
 XXX - need to get this seq_clone.pmd from NCBI, maybe Paul Kitts
     #	the seq_clone.pmd file was obtained via email from Wonhee Jang
     #	jang at ncbi.nlm.nih.gov - I have asked for clarification where
     #	such a file can be fetched without resorting to email.
 
 # Get current clone/accession information
     wget --timestamping ftp://ftp.ncbi.nih.gov/repository/clone/reports/clac.out
 http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
 
 # Create initial Fish Clones bed file
     ssh kkstore02
     mkdir /hive/data/genomes/hg19/bed/fishClones
     cd /hive/data/genomes/hg19/bed/fishClones
 
     # Copy previous sts info from fhcrc
     cp -p /hive/data/genomes/hg18/bed/fishClones/fhcrc.sts .
     #	This fhcrc.sts listing doesn't change.  It is merely a listing
     #	of aliases that remain in effect.
 
     #	Create cl_acc_gi_len file form cloneend information:
     grep -v "^#" /hive/data/genomes/hg19/bed/cloneend/all.txt \
     | awk '{gsub(".[0-9]*$", "", $2);
 	printf "%s\t%s\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$8}' > cl_acc_gi_len
 
     hgsql -N \
 	-e "select chrom,chromStart,chromEnd,contig from ctgPos;" hg19 \
 	| sort -k1,1 -k2,2n > ctgPos.bed
     hgsql -N \
 -e "select chrom,chromStart,chromEnd,frag,0,strand from gold;" hg19 \
 	| sort -k1,1 -k2,2n > gold.bed
     hgsql -N \
 -e "select tName,tStart,tEnd,qName,0,strand from all_bacends;" hg19 \
 	| sort -k1,1 -k2,2n > all_bacends.bed
     hgsql -N \
 -e "select chrom,chromStart,chromEnd,name,score,strand from bacEndPairs;" hg19 \
 	| sort -k1,1 -k2,2n > bacEndPairs.bed
 
 
 
     ssh hgwdev
     #	have to be on hgwdev for this since it is going to read from the
     #	database.  Had to work on this program to get it past what is
     #	evidently a bad entry in hbrc.fixed where columns of information
     #	are missing for one clone in particular
     time fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg19 \
 	/hive/data/outside/ncbi/fishClones/fishClones.2009-04/fixed.hbrc.txt \
 	/hive/data/outside/ncbi/fishClones/fishClones.2009-04/clac.out \
          ./cl_acc_gi_len \
          /hive/data/genomes/hg19/bed/bacends/bacEnds.lifted.psl \
             fishClones
     #	real    2m4.708s
 # Reading Fish Clones file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/hbrc.fixed
 # reading fishInfo file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt
 # Reading Clone/Acc (clac.out) file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/clac.out
 # Reading BAC Ends file ./cl_acc_gi_len
 # Reading BAC Ends psl file /hive/data/genomes/hg19/bed/bacends/bacEnds.lifted.psl
 # Reading additional STS Marker links fhcrc.sts
 # Determining good positions
 #	findClonePos: determining positions of fish clones
 # Writing output file
 # ERROR: at line # 170, no cytoband info for chrX:104048913-104206974
 # RP11-79L11
 # ERROR: at line # 171, no cytoband info for chrX:104048913-104206974
 # RP11-79L11
 
     # Load the track
     ssh hgwdev
     cd /hive/data/genomes/hg19/bed/fishClones
     hgLoadBed -notItemRgb -noBin -tab \
         -sqlTable=$HOME/kent/src/hg/lib/fishClones.sql \
 	hg19 fishClones fishClones.bed
     #	Loaded 9461 elements of size 16
 
 ##############################################################################
 # UCSC to Ensembl chr name mapping (DONE - 2009-05-08 - Hiram)
     mkdir /hive/data/genomes/hg19/ensembl
     cd /hive/data/genomes/hg19/ensembl
     wget --timestamping \
 	'ftp://ftp.ensembl.org/pub/pre/homo_sapiens/GRCh37/dna/*'
     #	do not need the repeat masker sequence (although it would be
     #	interesting to measure to see how it compares)
     rm -f *.dna_rm.*
     #	fortunately we have the same sizes as Ensembl for everything
     #	(except the haplotypes) and the sizes are unique for each sequence
     #	so we can relate the names via their sizes
     mkdir /hive/data/genomes/hg19/bed/ucscToEnsembl
     cd /hive/data/genomes/hg19/bed/ucscToEnsembl
     #	the toplevel file is a duplicate of everything else
     ls /hive/data/genomes/hg19/ensembl/*.fa.gz | grep -v toplevel \
 	| while read F
 do
     zcat "${F}"
 done | faCount stdin > faCount.txt
 
     cat << '_EOF_' > relateUcscEnsembl.pl
 #!/usr/bin/env perl
 
 use strict;
 use warnings;
 
 my %ucscChrs;   # key is size, value is UCSC chr name
 
 open (FH,"<../../chrom.sizes") or die "can not read ../../chrom.sizes";
 while (my $line = <FH>) {
     chomp $line;
     my ($chr, $size) = split('\s+', $line);
     die "'$line\n'duplicate size in ../chrom.sizes" if (exists($ucscChrs{$size})
 );
     $ucscChrs{$size} = $chr;
 }
 close (FH);
 
 my %ensemblChrs;        # key is size, value is Ensembl chr name
 
 open (FH,"<faCount.txt") or die "can not read faCount.txt";
 while (my $line = <FH>) {
     next if ($line =~ m/#/);
     next if ($line =~ m/total/);
     chomp $line;
     my ($chr, $size, $rest) = split('\s+', $line, 3);
     die "'$line\n'duplicate size in faCount.txt" if (exists($ensemblChrs{$size})
 );
     $ensemblChrs{$size} = $chr;
 }
 close (FH);
 
 my %usedUcscChrs;
 my %usedEnsemblChrs;
 my %ensemblTranslate; # key is Ensembl name, value is UCSC size
 foreach my $size (keys %ucscChrs) {
     if (exists($ensemblChrs{$size})) {
         $usedUcscChrs{$size} = $ucscChrs{$size};
         $usedEnsemblChrs{$size} = $ensemblChrs{$size};
         printf "%s\t%s\t%d\n", $ucscChrs{$size}, $ensemblChrs{$size}, $size;
     } else {
         my $ucscName = $ucscChrs{$size};
         my $ensemblName = "unknown";
         if ($ucscName =~ m/^chr6/) {
             $ucscName =~ s/_hap.//;
             $ucscName =~ s/chr6_/chr6_mhc_/;
             $ensemblName = "HS" . uc($ucscName);
         } elsif ($ucscName =~ m/^chr17_/ || $ucscName =~ m/^chr4_/) {
             $ucscName =~ s/_.*/_1/;
             $ensemblName = "HS" . uc($ucscName);
         } elsif ($ucscName =~ m/^chrM/) {
             print "# no translation for chrM\n";
         } else {
             die "unknown UCSC chr name: $ucscName";
         }
         printf "# ucsc $ucscChrs{$size} -> $ensemblName\n";
         $ensemblTranslate{$ensemblName} = $size;
     }
 }
 
 foreach my $size (keys %ensemblChrs) {
     if (!exists($usedEnsemblChrs{$size})) {
         my $ensemblName = $ensemblChrs{$size};
         if (! exists($ensemblTranslate{$ensemblName})) {
             die "can not translate Ensembl name $ensemblName";
         } else {
             my $ucscSize = $ensemblTranslate{$ensemblName};
             printf "%s\t%s\t%d\t%d\n", $ucscChrs{$ucscSize}, $ensemblChrs{$size}
 , $ucscSize, $size;
         }
     }
 }
 
 printf "chrM\tMT\n";
 '_EOF_'
     # << happy emacs
     chmod +x relateUcscEnsembl.pl
 
     ./relateUcscEnsembl.pl  2>&1 | grep -v "^#" \
 	| awk '{printf "%s\t%s\n", $1, $2}' | sort > ucscToEnsembl.tab
 
     cat << '_EOF_' > ucscToEnsembl.sql
 # UCSC to Ensembl chr name translation
 CREATE TABLE ucscToEnsembl (
     ucsc varchar(255) not null,        # UCSC chromosome name
     ensembl varchar(255) not null,     # Ensembl chromosome name
               #Indices
     PRIMARY KEY(ucsc(21))
 );
 '_EOF_'
 
     hgsql hg19 < ucscToEnsembl.sql
     hgsql hg19 \
 -e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl'
 
     awk '{printf "%s\t%d\n", $2, -$1}' ../../jkStuff/ensGene.haplotype.lift \
 	> ensemblLift.tab
 
     cat << '_EOF_' > ensemblLift.sql
 # UCSC offset to Ensembl coordinates
 CREATE TABLE ensemblLift (
     chrom varchar(255) not null,      # Ensembl chromosome name
     offset int unsigned not null,     # offset to add to UCSC position 
               #Indices
     PRIMARY KEY(chrom(15))
 );
 '_EOF_'
 
     hgsql hg19 < ensemblLift.sql
     hgsql hg19 \
 -e 'LOAD DATA LOCAL INFILE "ensemblLift.tab" INTO TABLE ensemblLift'
 
 ##############################################################################
 # LASTZ MOUSE Mm9 (DONE - 2009-05-13 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
     cd /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
 
     cat << '_EOF_' > DEF
 # human vs mouse
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_SMSK=/scratch/data/hg19/linSpecRep/lineageSpecificRepeats
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Mouse Mm9
 SEQ2_DIR=/scratch/data/mm9/nib
 SEQ2_SMSK=/scratch/data/mm9/notInOthers
 SEQ2_LEN=/scratch/data/mm9/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
     
 BASE=/hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
     cat fb.hg19.chainMm9Link.txt 
     #	1022734273 bases of 2897316137 (35.299%) in intersection
 
     #	and the swap
     mkdir /hive/data/genomes/mm9/bed/blastz.hg19.swap
     cd /hive/data/genomes/mm9/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzMm9.2009-05-13/DEF \
 	-swap -noLoadChainSplit -syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
     #	real    131m58.763s
     cat fb.mm9.chainHg19Link.txt 
     #	1013880568 bases of 2620346127 (38.693%) in intersection
 
 #########################################################################
 # LASTZ Dog CanFam2 (DONE - 2009-05-13 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
     cd /hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
 
     cat << '_EOF_' > DEF
 # human vs dog
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_SMSK=/scratch/data/hg19/linSpecRep/lineageSpecificRepeats
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Dog CanFam2
 SEQ2_DIR=/scratch/data/canFam2/nib
 SEQ2_LEN=/scratch/data/canFam2/chrom.sizes
 SEQ2_SMSK=/scratch/scratch/data/canFam2/linSpecRep.notInHuman
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
     cat fb.hg19.chainCanFam2Link.txt 
     #	1532073507 bases of 2897316137 (52.879%) in intersection
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/canFam2/bed/blastz.hg19.swap
     cd /hive/data/genomes/canFam2/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13/DEF \
 	-noLoadChainSplit -swap \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
-XXX - running Tue Jun  2 11:19:50 PDT 2009
-    #	real    723m41.377s
+    #	real    200m17.158s
     cat fb.canFam2.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	1480018167 bases of 2384996543 (62.055%) in intersection
 #########################################################################
 # LASTZ Chicken GalGal3 (DONE - 2009-05-13 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
     cd /hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
 
     cat << '_EOF_' > DEF
 # human vs chicken
 # Specific settings for chicken (per Webb email to Brian Raney)
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=10000
 BLASTZ_K=2200
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Human hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
 SEQ2_DIR=/scratch/data/galGal3/nib
 SEQ2_LEN=/scratch/data/galGal3/chrom.sizes
 SEQ2_SMSK=/scratch/data/galGal3/linSpecRep
 SEQ2_CHUNK=200000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-syntenicNet \
 	-noLoadChainSplit \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
     cat fb.hg19.chainGalGal3Link.txt 
     #	104053179 bases of 2897316137 (3.591%) in intersection
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/galGal3/bed/blastz.hg19.swap
     cd /hive/data/genomes/galGal3/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13/DEF \
 	-swap \
 	-noLoadChainSplit \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
-XXX - running Tue Jun  2 11:24:11 PDT 2009
-    #	real    723m41.377s
+    #	real    16m45.090s
     cat fb.galGal3.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	91605899 bases of 1042591351 (8.786%) in intersection
+
 #########################################################################
 # LASTZ Macaca Mulatta RheMac2 (DONE - 2009-05-13 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
     cd /hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
 
     cat << '_EOF_' > DEF
 # human vs macaca mulatta
 BLASTZ=lastz
 # maximum M allowed with lastz is only 254
 BLASTZ_M=254
 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
 # and place those items here
 BLASTZ_O=600
 BLASTZ_E=150
 # other parameters from panTro2 vs hg18 lastz on advice from Webb
 BLASTZ_K=4500
 BLASTZ_Y=15000
 BLASTZ_T=2
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_IN_CONTIGS=0
 
 # QUERY: Macaca Mulatta RheMac2
 SEQ2_DIR=/scratch/data/rheMac2/rheMac2.2bit
 SEQ2_LEN=/scratch/data/rheMac2/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_IN_CONTIGS=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-syntenicNet \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	real    760m22.810s
     cat fb.hg19.chainRheMac2Link.txt 
     #	2397361211 bases of 2897316137 (82.744%) in intersection
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/rheMac2/bed/blastz.hg19.swap
     cd /hive/data/genomes/rheMac2/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13/DEF \
 	-swap \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> swap.log 2>&1 &
-XXX - running Tue Jun  2 11:16:25 PDT 2009
-    #	real    723m41.377s
+    #	real    83m51.483s
     cat fb.rheMac2.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	2313806886 bases of 2646704109 (87.422%) in intersection
 #########################################################################
 # LASTZ Rat Rn4 (DONE - 2009-05-13 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
     cd /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
 
     cat << '_EOF_' > DEF
 # human vs rat
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Rat Rn4
 SEQ2_DIR=/scratch/data/rn4/nib
 SEQ2_SMSK=/scratch/data/rn4/linSpecRep.notInHuman
 SEQ2_LEN=/scratch/data/rn4/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
     
 BASE=/hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-syntenicNet -noLoadChainSplit \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
     #	real    314m18.227s
     cat fb.hg19.chainRn4Link.txt 
     #	952605822 bases of 2897316137 (32.879%) in intersection
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/rn4/bed/blastz.hg19.swap
     cd /hive/data/genomes/rn4/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzRn4.2009-05-13/DEF \
 	-swap -noLoadChainSplit \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
-XXX - running Tue Jun  2 11:21:44 PDT 2009
-    #	real    723m41.377s
+    #	real    188m0.163s
     cat fb.rn4.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	947862300 bases of 2571531505 (36.860%) in intersection
 ##############################################################################
 # LASTZ Orangutan PonAbe2 (DONE - 2009-05-13 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
     cd /hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
 
     cat << '_EOF_' > DEF
 # human vs orangutan
 BLASTZ=lastz
 # maximum M allowed with lastz is only 254
 BLASTZ_M=254
 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
 # and place those items here
 BLASTZ_O=600
 BLASTZ_E=150
 # other parameters from panTro2 vs hg18 lastz on advice from Webb
 BLASTZ_K=4500
 BLASTZ_Y=15000
 BLASTZ_T=2
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_IN_CONTIGS=0
 
 # QUERY: Orangutan PonAbe1
 SEQ2_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
 SEQ2_LEN=/scratch/data/ponAbe2/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_IN_CONTIGS=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-syntenicNet \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	> do.log 2>&1 &
     cat fb.hg19.chainPonAbe2Link.txt 
     #	2646687531 bases of 2897316137 (91.350%) in intersection
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/ponAbe2/bed/blastz.hg19.swap
     cd /hive/data/genomes/ponAbe2/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13/DEF \
 	-swap \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	> swap.log 2>&1 &
-XXX - running Tue Jun  2 11:08:17 PDT 2009
-    #	real    723m41.377s
+    #	real    124m3.610s
     cat fb.ponAbe2.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	2772351468 bases of 3093572278 (89.617%) in intersection
 ##############################################################################
 # LASTZ Lamprey PetMar1 (DONE - 2009-05-14 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
     cd /hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
 
     cat << '_EOF_' > DEF
 # Human vs. Lamprey
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=100000000
 SEQ1_LAP=10000
 SEQ2_LIMIT=5
 
 # QUERY: Lamprey petMar1
 SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
 SEQ2_LEN=/scratch/data/petMar1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=300
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-qRepeats=windowmaskerSdust \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
 	> do.log 2>&1 &
     #	real    113m20.116s
     cat fb.hg19.chainPetMar1Link.txt 
     #	31347143 bases of 2897316137 (1.082%) in intersection
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/petMar1/bed/blastz.hg19.swap
     cd /hive/data/genomes/petMar1/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14/DEF \
 	-qRepeats=windowmaskerSdust \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
 	-swap > swap.log 2>&1 &
-XXX - running Tue Jun  2 11:35:37 PDT 2009
-    #	real    723m41.377s
+    #	real    59m14.813s
     cat fb.petMar1.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	26615001 bases of 831696438 (3.200%) in intersection
 ##############################################################################
 # LASTZ Fugu Fr2 (DONE - 2009-05-14 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
     cd /hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
 
     cat << '_EOF_' > DEF
 # Human vs. Fugu
 # Try "human-fugu" (more distant, less repeat-killed than mammal) params
 # +M=50:
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=5
 
 # QUERY: Fugu fr2
 #       Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates
 SEQ2_DIR=/scratch/data/fr2/fr2.2bit
 SEQ2_LEN=/hive/data/genomes/fr2/chrom.sizes
 SEQ2_CTGDIR=/hive/data/genomes/fr2/noUn/fr2.scaffolds.2bit
 SEQ2_CTGLEN=/hive/data/genomes/fr2/noUn/fr2.scaffolds.sizes
 SEQ2_LIFT=/hive/data/genomes/fr2/jkStuff/liftAll.lft
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=30
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-qRepeats=windowmaskerSdust \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
 	> do.log 2>&1 &
     #	real    5797m9.288s
     #	had a small problem finishing the fundamental batch run, continuing:
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-continue=cat -qRepeats=windowmaskerSdust \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
 	> cat.log 2>&1 &
     cat fb.hg19.chainFr2Link.txt 
     #	49309456 bases of 2897316137 (1.702%) in intersection
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/fr2/bed/blastz.hg19.swap
     cd /hive/data/genomes/fr2/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzFr2.2009-05-14/DEF \
 	-qRepeats=windowmaskerSdust \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
 	-swap > swap.log 2>&1 &
-XXX - running Tue Jun  2 11:30:44 PDT 2009
-    #	real    723m41.377s
+    #	real    25m8.491s
     cat fb.fr2.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	42984130 bases of 393312790 (10.929%) in intersection
+
 ##############################################################################
 # LASTZ Tetraodon TetNig1 (DONE - 2009-05-14 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
     cd /hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
 
     cat << '_EOF_' > DEF
 # human vs tetraodon
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=5
 
 # QUERY: Tetraodon TetNig1 - single chunk big enough to run entire genome
 SEQ2_DIR=/scratch/data/tetNig1/tetNig1.2bit
 SEQ2_LEN=/hive/data/genomes/tetNig1/chrom.sizes
 SEQ2_CHUNK=410000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	> do.log 2>&1 &
     #	real    166m19.745s
     cat fb.hg19.chainTetNig1Link.txt 
     #	58038079 bases of 2897316137 (2.003%) in intersection
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/tetNig1/bed/blastz.hg19.swap
     cd /hive/data/genomes/tetNig1/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	-swap > swap.log 2>&1 &
-XXX - running Tue Jun  2 11:33:01 PDT 2009
-    #	real    723m41.377s
+    #	real    29m20.968s
     cat fb.tetNig1.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	49453375 bases of 342403326 (14.443%) in intersection
 ##############################################################################
 # LASTZ Stickleback GasAcu1 (DONE - 2009-05-14 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
     cd /hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
 
     cat << '_EOF_' > DEF
 # Human vs. Stickleback
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=5
 
 # TARGET: Stickleback gasAcu1
 SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit
 SEQ2_LEN=/hive/data/genomes/gasAcu1/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	> do.log 2>&1 &
     #	real    174m40.659s
     cat fb.hg19.chainGasAcu1Link.txt 
     #	55509003 bases of 2897316137 (1.916%) in intersection
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/gasAcu1/bed/blastz.hg19.swap
     cd /hive/data/genomes/gasAcu1/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	-swap > swap.log 2>&1 &
-XXX - running Tue Jun  2 11:08:17 PDT 2009
-    #	real    723m41.377s
+    #	real    29m41.433s
     cat fb.gasAcu1.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	49909819 bases of 446627861 (11.175%) in intersection
 ##############################################################################
 # LASTZ Marmoset CalJac1 (DONE - 2009-05-14,22 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
     cd /hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
 
     cat << '_EOF_' > DEF
 # human vs. marmoset
 BLASTZ=lastz
 # maximum M allowed with lastz is only 254
 BLASTZ_M=254
 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
 # and place those items here
 BLASTZ_O=600
 BLASTZ_E=150
 # other parameters from panTro2 vs hg18 lastz on advice from Webb
 BLASTZ_K=4500
 BLASTZ_Y=15000
 BLASTZ_T=2
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=5
 
 # QUERY: Marmoset (calJac1)
 SEQ2_DIR=/scratch/data/calJac1/calJac1.2bit
 SEQ2_LEN=/scratch/data/calJac1/chrom.sizes
 SEQ2_LIMIT=200
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-syntenicNet \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	real    214m16.294s
     cat fb.hg19.chainCalJac1Link.txt 
     #	2053025318 bases of 2897316137 (70.860%) in intersection
     time doRecipBest.pl -buildDir=`pwd` hg19 calJac1 > rbest.log 2>&1 &
     #	real    97m17.207s
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/calJac1/bed/blastz.hg19.swap
     cd /hive/data/genomes/calJac1/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14/DEF \
 	-swap \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	> swap.log 2>&1 &
-XXX - running Tue Jun  2 11:17:56 PDT 2009
-    #	real    723m41.377s
+    #	real    162m52.189s
     cat fb.calJac1.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	2105959656 bases of 2929139385 (71.897%) in intersection
 
 #########################################################################
 # LASTZ Tarsier TarSyr1 (DONE - 2009-05-14,30 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
     cd /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
 
     cat << '_EOF_' > DEF
 # Human vs. Tarsier
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=200000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=5
 
 # QUERY: Tarsier
 SEQ2_DIR=/scratch/data/tarSyr1/tarSyr1.2bit
 SEQ2_LEN=/scratch/data/tarSyr1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=50
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	real    1724m48.032s
     #	need to load the chain table manually:
     #	mySQL error 1114: The table 'chainTarSyr1Link' is full
     cd /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14/axtChain
     wc -l *.tab
     #	 21882142 chain.tab
     #	165017606 link.tab
     #	186899748 total
     awk '{print length($0)}' link.tab | sort | uniq -c | less
       4 23
       9 24
      27 25
     105 26
     767 27
    1401 28
    5020 29
    8472 30
   24390 31
  117666 32
  264774 33
  776095 34
 1632393 35
 2672187 36
 7125988 37
 16831901 38
 34905113 39
 45218159 40
 31570706 41
 13746548 42
 5868689 43
 2460114 44
 1118556 45
  420826 46
  106674 47
   36770 48
   40719 49
   36955 50
   19389 51
    5571 52
    1557 53
      61 54
 
     time nice -n +19 hgsql -e "DROP TABLE chainTarSyr1Link;" hg19
 
     cat << '_EOF_' | hgsql hg19
     CREATE TABLE chainTarSyr1Link (
       bin smallint(5) unsigned NOT NULL default 0,
       tName varchar(255) NOT NULL default '',
       tStart int(10) unsigned NOT NULL default 0,
       tEnd int(10) unsigned NOT NULL default 0,
       qStart int(10) unsigned NOT NULL default 0,
       chainId int(10) unsigned NOT NULL default 0,
       KEY tName (tName(16),bin),
       KEY chainId (chainId)
     ) ENGINE=MyISAM max_rows=166000000 avg_row_length=42 pack_keys=1 CHARSET=latin1;
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 hgsql -e \
       "load data local infile \"link.tab\" into table chainTarSyr1Link;" hg19
     #	real    157m0.230s
     #	the running the rest of loadUp.csh after the hgLoadChain
     #	real    26m8.263s
     cat fb.hg19.chainTarSyr1Link.txt 
     #	1385797066 bases of 2897316137 (47.830%) in intersection
     #	Continuing:
     time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-continue=download -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	> download.log 2>&1 &
     #	real    48m6.573s
     #	ran the script on swarm to recover after hive outages
     time doRecipBest.pl -buildDir=`pwd` hg19 tarSyr1 > rbest.log 2>&1 &
     #	real    404m0.201s
     time doRecipBest.pl -continue=download -buildDir=`pwd` \
 	hg19 tarSyr1 > rbest.download.log 2>&1 &
 
 #########################################################################
 # LASTZ Bushbaby OtoGar1 (DONE - 2009-05-14,22 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
     cd /hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
 
     cat << '_EOF_' > DEF
 # Human vs. Bushbaby
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=200000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=5
 
 # QUERY: Bushbaby otoGar1 - single chunk big enough to run largest scaffold
 SEQ2_DIR=/scratch/data/otoGar1/otoGar1.rmsk.2bit
 SEQ2_LEN=/hive/data/genomes/otoGar1/chrom.sizes
 SEQ2_LIMIT=200
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	real    762m56.055s
     cat fb.hg19.chainOtoGar1Link.txt 
     #	1264492372 bases of 2897316137 (43.644%) in intersection
     time doRecipBest.pl -buildDir=`pwd` hg19 otoGar1 > rbest.log 2>&1 &
     #	real    271m39.925s
 
 #########################################################################
 # LASTZ Mouse lemur MicMur1 (DONE - 2009-05-14,26 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
     cd /hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
 
     cat << '_EOF_' > DEF
 # Human vs. Mouse lemur
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=200000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=5
 
 # QUERY: Mouse lemur
 SEQ2_DIR=/hive/data/genomes/micMur1/bed/repeatMasker/micMur1.rmsk.2bit
 SEQ2_LEN=/hive/data/genomes/micMur1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=100
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	> do.log 2>&1 &
     #	real    5429m52.082s
     #	there is one unusual long running job having trouble
     #	continuing after finishing the lastz run manually:
     time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-continue=cat -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	> cat.log 2>&1 &
     #	real    388m25.032s
     cat fb.hg19.chainMicMur1Link.txt 
     #	1347792207 bases of 2897316137 (46.519%) in intersection
     time doRecipBest.pl -buildDir=`pwd` hg19 micMur1 > rbest.log 2>&1
     #	about 4h30m
 
 #########################################################################
 # LASTZ Baboon PapHam1 (DONE - 2009-05-20,22 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
     cd /hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
 
     cat << '_EOF_' > DEF
 # human vs baboon
 BLASTZ=lastz
 # maximum M allowed with lastz is only 254
 BLASTZ_M=254
 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
 # and place those items here
 BLASTZ_O=600
 BLASTZ_E=150
 # other parameters from panTro2 vs hg18 lastz on advice from Webb
 BLASTZ_K=4500
 BLASTZ_Y=15000
 BLASTZ_T=2
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=100000000
 SEQ1_LAP=10000
 SEQ1_IN_CONTIGS=0
 
 # QUERY: Baboon papHam1
 SEQ2_DIR=/scratch/data/papHam1/papHam1.2bit
 SEQ2_LEN=/scratch/data/papHam1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=300
 SEQ2_LAP=0
 SEQ2_IN_CONTIGS=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	forgot that the synNet was not needed here, use recip best as below
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-syntenicNet \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> do.log 2>&1 &
     cat fb.hg19.chainPapHam1Link.txt 
     #	2399269031 bases of 2897316137 (82.810%) in intersection
 
     time doRecipBest.pl -buildDir=`pwd` hg19 papHam1 > rbest.log 2>&1
     #	real    182m0.276s
 
 #########################################################################
 # SGP GENES (DONE - 2009-05-22 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/sgpGene
     cd /hive/data/genomes/hg19/bed/sgpGene
     mkdir download
     cd download
 for C in `cut -f1 ../../../chrom.sizes`
 do
     echo $C
     wget --timestamping \
 http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902_x_mm9/SGP/${C}.gtf
     wget --timestamping \
 http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902_x_mm9/SGP/${C}.prot
 done
 
     cd ..
     cat download/*.gtf | ldHgGene -gtf -genePredExt hg19 sgpGene stdin
 
     #	Read 33994 transcripts in 291782 lines in 1 files
     #	33994 groups 85 seqs 1 sources 3 feature types
     #	33994 gene predictions
     nice -n +19 featureBits -enrichment hg19 refGene:CDS sgpGene
 # refGene:CDS 1.181%, sgpGene 1.295%, both 1.011%, cover 85.59%, enrich 66.08x
 
 ###########################################################################
 # GENEID GENE PREDICTIONS (DONE - 2009-05-22 - Hiram)
     ssh hgwdev
     mkdir /hive/data/genomes/hg19/bed/geneid
     cd /hive/data/genomes/hg19/bed/geneid
     mkdir download
     cd download
     for C in `cut -f1 ../../../chrom.sizes`
     do
 	echo $C
  wget --timestamping \
 http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902/geneid_v1.3/${C}.gtf
     wget --timestamping \
 http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902/geneid_v1.3/${C}.prot
     done
 
     cd ..
     cat download/*.gtf | ldHgGene -gtf -genePredExt hg19 geneid stdin
     #	Read 33428 transcripts in 277332 lines in 1 files
     #	33428 groups 92 seqs 1 sources 3 feature types
     #	33428 gene predictions
 
 ##########################################################################
 ## 4-Way Multiz for UCSC Genes construction (DONE - 2009-05-22 - Hiram)
     ssh hgwdev
     mkdir /hive/data/genomes/hg19/bed/multiz4way
     cd /hive/data/genomes/hg19/bed/multiz4way
 
     #	extract our 4 organisms from the 44-way on hg18:
     ln -s /hive/data/genomes/hg18/bed/multiz44way/44way.4d.nh ./44way.nh
 
     /cluster/bin/phast/tree_doctor \
 	--prune-all-but hg18,mm9,canFam2,rheMac2 44way.nh \
 	| sed -e "s/hg18/hg19/" > 4way.nh
 
     #	this looks like:
     cat 4way.nh
 (((hg19:0.032973,rheMac2:0.036199):0.109706,mm9:0.352605):0.020666,canFam2:0.193569);
 
 
     #	Use this specification in the phyloGif tool:
     #	http://genome.ucsc.edu/cgi-bin/phyloGif
     #	to obtain a gif image for htdocs/images/phylo/hg19_4way.gif
 
     /cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt
     #	Use this output to create the table below
     grep -y hg19 4way.distances.txt | sort -k3,3n
 #
 #	If you can fill in all the numbers in this table, you are ready for
 #	the multiple alignment procedure
 #
 #                         featureBits chainLink measures
 #                                        chainHg19Link   chain    linearGap
 #    distance                      on hg19    on other   minScore
 #  1  0.069172 - rhesus rheMac2 (% 82.744) (% xx.xxx)       5000     medium
 #  2  0.356914 - dog canFam2    (% 52.879) (% xx.xxx)       3000     medium
 #  3  0.495284 - mouse mm9      (% 35.299) (% 38.693)       3000     medium
 
     #	using the syntenic nets
     cd /cluster/data/hg19/bed/multiz4way
     mkdir mafLinks
     cd mafLinks
     mkdir rheMac2 canFam2 mm9
 
     cd mm9
     ln -s ../../../lastz.mm9/mafSynNet/*.maf.gz .
     cd ../canFam2
     ln -s ../../../lastz.canFam2/mafSynNet/*.maf.gz .
     cd ../rheMac2
     ln -s ../../../lastz.rheMac2/mafSynNet/*.maf.gz .
 
     #	determine what is the newest version of multiz and use that
     cd /hive/data/genomes/hg19/bed/multiz4way
     mkdir penn
     cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn
     cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn
     cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn
 
     # the autoMultiz cluster run
     ssh swarm
     cd /hive/data/genomes/hg19/bed/multiz4way
 
     # create species list and stripped down tree for autoMZ
     sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
 	4way.nh > tmp.nh
     echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh
     sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
 
     mkdir run maf
     cd run
 
     #	NOTE: you need to set the db and multiz dirname properly in this script
     cat > autoMultiz << '_EOF_'
 #!/bin/csh -ef
 set db = hg19
 set c = $1
 set maf = $2
 set binDir = /hive/data/genomes/hg19/bed/multiz4way/penn
 set tmp = /scratch/tmp/$db/multiz.$c
 set pairs = /hive/data/genomes/hg19/bed/multiz4way/mafLinks
 rm -fr $tmp
 mkdir -p $tmp
 cp ../{tree.nh,species.lst} $tmp
 pushd $tmp
 foreach s (`cat species.lst`)
     set in = $pairs/$s/$c.maf
     set out = $db.$s.sing.maf
     if ($s == $db) then
 	continue
     endif
     if (-e $in.gz) then
 	zcat $in.gz > $out
     else if (-e $in) then
 	cp $in $out
     else
 	echo "##maf version=1 scoring=autoMZ" > $out
     endif
 end
 set path = ($binDir $path); rehash
 $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
 popd
 cp $tmp/$c.maf $maf
 rm -fr $tmp
 '_EOF_'
     # << happy emacs
     chmod +x autoMultiz
 
 cat  << '_EOF_' > template
 #LOOP
 ./autoMultiz $(root1) {check out line+ /hive/data/genomes/hg19/bed/multiz4way/maf/$(root1).maf}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     cut -f1 /cluster/data/hg19/chrom.sizes > chrom.lst
     gensub2 chrom.lst single template jobList
     para create jobList
     # 93 jobs
     para try ... check ... push ... etc ...
 # Completed: 93 of 93 jobs
 # CPU time in finished jobs:      24282s     404.70m     6.75h    0.28d  0.001 y
 # IO & Wait Time:                  2362s      39.36m     0.66h    0.03d  0.000 y
 # Average job time:                 286s       4.77m     0.08h    0.00d
 # Longest finished job:            2235s      37.25m     0.62h    0.03d
 # Submission to last job:          2241s      37.35m     0.62h    0.03d
 
     #	combine results into a single file for loading and gbdb reference
     cd /hive/data/genomes/hg19/bed/multiz4way
     time nice -n +19 catDir maf > multiz4way.maf
     #	real    3m27.561s
 
     #	makes a 8.5 Gb file:
     #	-rw-rw-r-- 1 9026080732 May 22 11:11 multiz4way.maf
 
     # Load into database
     ssh hgwdev
     cd /hive/data/genomes/hg19/bed/multiz4way
     mkdir /gbdb/hg19/multiz4way
     ln -s /hive/data/genomes/hg19/bed/multiz4way/multiz4way.maf \
 	/gbdb/hg19/multiz4way
     #	the hgLoadMaf generates huge tmp files, locate them in /scratch/tmp/
     cd /scratch/tmp
     time nice -n +19 hgLoadMaf hg19 multiz4way
     #	real    5m31.883s
     #	Loaded 5788627 mafs in 1 files from /gbdb/hg19/multiz4way
 
     cd /hive/data/genomes/hg19/bed/multiz4way
     time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
 	-maxSize=50000 hg19 multiz4waySummary multiz4way.maf
     #	Created 1238721 summary blocks from 11959676 components
     #	and 5788627 mafs from multiz4way.maf
     #	real    6m33.936s
 
 #########################################################################
 # LASTZ Medaka OryLat2 (DONE - 2009-05-22 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
     cd /hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
 
     cat << '_EOF_' > DEF
 # Human vs. Medaka
 # typical parameters for a genome that is distant from human
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=5
 
 # QUERY: Medaka oryLat2 (40M chunks covers the largest chroms in one gulp)
 SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit
 SEQ2_LEN=/hive/data/genomes/oryLat2/chrom.sizes
 SEQ2_CHUNK=40000000
 SEQ2_LIMIT=200
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-qRepeats=windowmaskerSdust \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	> do.log 2>&1 &
     #	real    124m5.298s
     cat fb.hg19.chainOryLat2Link.txt 
     #	53571737 bases of 2897316137 (1.849%) in intersection
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/oryLat2/bed/blastz.hg19.swap
     cd /hive/data/genomes/oryLat2/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22/DEF \
 	-qRepeats=windowmaskerSdust \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	-swap > swap.log 2>&1 &
-XXX - running Tue Jun  2 11:08:17 PDT 2009
-    #	real    723m41.377s
+    #	real    28m35.174s
     cat fb.oryLat2.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	46961818 bases of 700386597 (6.705%) in intersection
 ##############################################################################
 # LASTZ Opossum MonDom5 (DONE - 2009-05-23,29 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
     cd /hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
 
     cat << '_EOF_' > DEF
 # human vs. opossum
 # settings for more distant organism alignments
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=5
 
 # QUERY: Opossum monDom5
 SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit
 SEQ2_LEN=/hive/data/genomes/monDom5/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	One job took a long time to complete, had to run it manually on
     #	swarm:
 # /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
 #	/scratch/data/hg19/hg19.2bit:chr19:50000000-59128983 \
 #	/scratch/data/monDom5/monDom5.2bit:chr4:390000000-420000000 \
 #	../DEF \
 #	../psl/hg19.2bit:chr19:50000000-59128983/hg19.2bit:chr19:50000000-59128983_monDom5.2bit:chr4:390000000-420000000.psl
     #	took about 48 hours, continuing:
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	-continue=cat > cat.log 2>&1 &
     #	real    1508m18.471s ==	about 25h08m
     cat fb.hg19.chainMonDom5Link.txt 
     #	415997117 bases of 2897316137 (14.358%) in intersection
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
     #	real    20m29.049s
 
 ##############################################################################
 # LASTZ Armadillo DasNov2 (DONE - 2009-05-23,28 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
     cd /hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
 
     cat << '_EOF_' > DEF
 # Human vs. Armadillo
 BLASTZ_M=50
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=5
 
 # QUERY: Armadillo
 SEQ2_DIR=/scratch/data/dasNov2/dasNov2.2bit
 SEQ2_LEN=/scratch/data/dasNov2/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=100
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	> do.log 2>&1 &
     #	finished the lastz run manually after hive maintenance outages
     #	then, continuing:
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	-continue=cat > cat.log 2>&1 &
     #	real    458m11.304s
     cat fb.hg19.chainDasNov2Link.txt 
     #	971847303 bases of 2897316137 (33.543%) in intersection
     time nice -n +19 doRecipBest.pl -buildDir=`pwd` hg19 dasNov2 \
 	> rbest.log 2>&1
     #	time about 6h30m
 
 ##############################################################################
 # LASTZ Rock Hyrax ProCap1 (DONE - 2009-05-23,26 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
     cd /hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
 
     cat << '_EOF_' > DEF
 # Human vs. Rock Hyrax
 BLASTZ_M=50
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/hg19.2bit
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=5
 
 # QUERY: Rock Hyrax
 SEQ2_DIR=/scratch/data/proCap1/proCap1.2bit
 SEQ2_LEN=/scratch/data/proCap1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=100
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	> do.log 2>&1 &
 # Completed: 997438 of 997438 jobs
 # CPU time in finished jobs:   32830587s  547176.45m  9119.61h  379.98d  1.041 y
 # IO & Wait Time:               9549484s  159158.07m  2652.63h  110.53d  0.303 y
 # Average job time:                  42s       0.71m     0.01h    0.00d
 # Longest finished job:            1953s      32.55m     0.54h    0.02d
 # Submission to last job:         67216s    1120.27m    18.67h    0.78d
     #	finished lastz run manually, then continuing:
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	-continue=cat > cat.log 2>&1 &
     #	real    369m1.678s
     cat fb.hg19.chainProCap1Link.txt 
     #	894221652 bases of 2897316137 (30.864%) in intersection
     time nice -n +19 doRecipBest.pl -buildDir=`pwd` hg19 proCap1 \
 	> rbest.log 2>&1
     #	real    251m59.549s
 
 ##############################################################################
 # LASTZ Zebra Finch TaeGut1 (DONE - 2009-05-26 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
     cd /hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
 
     cat << '_EOF_' > DEF
 # human vs Zebra Finch
 # distant from Human settings
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=10000
 BLASTZ_K=2200
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Human hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Zebra Finch taeGut1 - single chunk big enough to run entire chrom
 SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit
 SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes
 SEQ2_CTGDIR=/hive/data/genomes/taeGut1/taeGut1.blastz.2bit
 SEQ2_CTGLEN=/hive/data/genomes/taeGut1/taeGut1.blastz.sizes
 SEQ2_LIFT=/hive/data/genomes/taeGut1/jkStuff/liftAll.lft
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_LIMIT=100
 
 BASE=/hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	-qRepeats=windowmaskerSdust > do.log 2>&1 &
     cat fb.hg19.chainTaeGut1Link.txt 
     #	real    192m48.479s
     #	101295490 bases of 2897316137 (3.496%) in intersection
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-syntenicNet -noLoadChainSplit -chainMinScore=5000 \
 	-chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	-continue=syntenicNet -qRepeats=windowmaskerSdust > synNet.log 2>&1 &
     #	real    4m10.261s
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/taeGut1/bed/blastz.hg19.swap
     cd /hive/data/genomes/taeGut1/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26/DEF \
 	-swap -noLoadChainSplit -chainMinScore=5000 \
 	-chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	-qRepeats=windowmaskerSdust > swap.log 2>&1 &
-XXX - running Tue Jun  2 11:25:44 PDT 2009
-    #	real    723m41.377s
+    #	real    real    16m45.080s
     cat fb.taeGut1.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	95320369 bases of 1222864691 (7.795%) in intersection
 
 ##############################################################################
 # LASTZ Lizard AnoCar1 (DONE - 2009-05-30,31 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
     cd /hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
 
     cat << '_EOF_' > DEF
 # human vs lizard
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Human hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Lizard anoCar1
 SEQ2_DIR=/scratch/data/anoCar1/anoCar1.2bit
 SEQ2_LEN=/scratch/data/anoCar1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_LIMIT=50
 
 BASE=/hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-qRepeats=windowmaskerSdust > do.log 2>&1 &
     #	real    168m32.016s
     cat fb.hg19.chainAnoCar1Link.txt 
     #	104045950 bases of 2897316137 (3.591%) in intersection
     time doRecipBest.pl -buildDir=`pwd` hg19 anoCar1 > rbest.log 2>&1
     #	real    45m58.001s
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/anoCar1/bed/blastz.hg19.swap
     cd /hive/data/genomes/anoCar1/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-swap -qRepeats=windowmaskerSdust > swap.log 2>&1 &
-XXX - running Tue Jun  2 11:26:55 PDT 2009
-    #	real    723m41.377s
+    #	real    34m55.857s
     cat fb.anoCar1.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	89608316 bases of 1741478929 (5.146%) in intersection
 ##############################################################################
 # LASTZ X. tropicalis XenTro2 (DONE - 2009-05-26 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
     cd /hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
 
     cat << '_EOF_' > DEF
 # human vs X. tropicalis
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Human hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Lizard anoCar1
 SEQ2_DIR=/scratch/data/xenTro2/xenTro2.2bit
 SEQ2_LEN=/scratch/data/xenTro2/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_LIMIT=100
 
 BASE=/hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	real    1129m11.568s
     #	finished the lastz run manually after hive difficulties, continuing:
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-continue=cat > cat.log 2>&1 &
     #	time about 1h30m
     cat fb.hg19.chainXenTro2Link.txt 
     #	92015242 bases of 2897316137 (3.176%) in intersection
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/xenTro2/bed/blastz.hg19.swap
     cd /hive/data/genomes/xenTro2/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-swap > swap.log 2>&1 &
-XXX - running Tue Jun  2 11:28:06 PDT 2009
-    #	real    723m41.377s
+    #	real    130m53.860s
     cat fb.xenTro2.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	92070065 bases of 1359412157 (6.773%) in intersection
 ##############################################################################
 # LASTZ Zebrafish DanRer5 (DONE - 2009-05-26 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
     cd /hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
 
     cat << '_EOF_' > DEF
 # human vs X. zebrafish
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Human hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Zebrafish danRer5
 SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
 SEQ2_LEN=/scratch/data/danRer5/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_LIMIT=40
 
 BASE=/hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	real    311m39.817s
     cat fb.hg19.chainDanRer5Link.txt 
     #	74229561 bases of 2897316137 (2.562%) in intersection
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/danRer5/bed/blastz.hg19.swap
     cd /hive/data/genomes/danRer5/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-swap > swap.log 2>&1 &
-XXX - running Tue Jun  2 11:31:49 PDT 2009
-    #	real    723m41.377s
+    #	real    26m54.605s
     cat fb.danRer5.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	73852780 bases of 1435609608 (5.144%) in intersection
 ##############################################################################
 # LASTZ Platypus OrnAna1 (DONE - 2009-05-26 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
     cd /hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
 
     cat << '_EOF_' > DEF
 # human vs platypus
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Human hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Platypus ornAna1
 SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit
 SEQ2_LEN=/scratch/data/ornAna1/chrom.sizes
 SEQ2_CHUNK=40000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	real    572m18.808s
     cat fb.hg19.chainOrnAna1Link.txt 
     #	220977689 bases of 2897316137 (7.627%) in intersection
     time doRecipBest.pl -buildDir=`pwd` hg19 ornAna1 > rbest.log 2>&1
     #	time about 1h32m
 
-    #	running the swap
+    #	running the swap - DONE - 2009-06-02
     mkdir /hive/data/genomes/ornAna1/bed/blastz.hg19.swap
     cd /hive/data/genomes/ornAna1/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26/DEF \
 	-swap -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> swap.log 2>&1 &
-XXX - running Tue Jun  2 11:23:00 PDT 2009
-    #	real    723m41.377s
+    #	real    146m52.638s
     cat fb.ornAna1.chainHg19Link.txt 
-    #	2761343871 bases of 2909485072 (94.908%) in intersection
+    #	207415519 bases of 1842236818 (11.259%) in intersection
 
 ##############################################################################
 # LASTZ Elephant LoxAfr2 (DONE - 2009-05-27,29 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
     cd /hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
 
     cat << '_EOF_' > DEF
 # Human vs. Elephant
 BLASTZ_M=50
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Elephant
 SEQ2_DIR=/scratch/data/loxAfr2/loxAfr2.2bit
 SEQ2_LEN=/scratch/data/loxAfr2/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=300
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> do.log 2>&1 &
     # time about 3h23m
     cat fb.hg19.chainLoxAfr2Link.txt 
     #	1018502258 bases of 2897316137 (35.153%) in intersection
 
     time doRecipBest.pl -buildDir=`pwd` hg19 loxAfr2 > rbest.log 2>&1
     #	real    322m37.502s
 
 ##############################################################################
 # LASTZ Tenrec EchTel1 (DONE - 2009-05-27 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
     cd /hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
 
     cat << '_EOF_' > DEF
 # Human vs. Tenrec
 BLASTZ_M=50
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Tenrec
 SEQ2_DIR=/scratch/data/echTel1/echTel1.2bit
 SEQ2_LEN=/scratch/data/echTel1/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	real    1153m34.595s
     cat fb.hg19.chainEchTel1Link.txt 
     #	669856841 bases of 2897316137 (23.120%) in intersection
     time doRecipBest.pl -buildDir=`pwd` hg19 echTel1 > rbest.log 2>&1
     # time about 7h13m
 
 ##############################################################################
-# LASTZ Tree Shrew TupBel1 (WORKING - 2009-05-27 - Hiram)
+# LASTZ Tree Shrew TupBel1 (DONE - 2009-05-27,06-02 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
     cd /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
 
     cat << '_EOF_' > DEF
 # Human vs. Tree Shrew
 BLASTZ_M=50
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Tree Shrew
 SEQ2_DIR=/scratch/data/tupBel1/tupBel1.2bit
 SEQ2_LEN=/scratch/data/tupBel1/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
 	> do.log 2>&1 &
     #	real 811m54.095s
     #	having trouble with pk, finished manually
     #	XXX there is one job that is taking forever ...
     #	finished it in pieces on swarm in a few minutes, like this:
     mkdir /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27/run.blastz/lastJob
     cd /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27/run.blastz/lastJob
 #!/bin/sh
 
 S=100000000
 E=101010000
 export S E
 for I in 0 1 2 3 4 5 6 7 8 9
 do
 echo $S $E
 /usr/bin/time -p /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
 /scratch/data/hg19/nib/chr1.nib:chr1:${S}-${E} ../qParts/part019.lst \
 ../../DEF psl/chr1.nib:chr1:${S}-${E}_part019.lst.psl
 nextS=`echo $S | awk '{printf "%d", $1 + 1000000}'`
 nextE=`echo $E | awk '{printf "%d", $1 + 1000000}'`
 S=$nextS
 E=$nextE
 done
 
     grep -h "^#" psl/chr* | sort -u > result.psl
     grep -h -v "^#" psl/chr* | sort -k14,14 -k16,16n >> result.psl
     cp -p result.psl \
 ../../psl/chr1.nib:chr1:100000000-110010000/chr1.nib:chr1:100000000-110010000_part019.lst.psl
 
     #	then, continuing:
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
 	-continue=cat > cat.log 2>&1 &
     #	real    212m22.707s
     time doRecipBest.pl -buildDir=`pwd` hg19 tupBel1 > rbest.log 2>&1
-XXX - running Tue Jun  2 10:15:24 PDT 2009
+    #	time about 4h22m
 
 ##############################################################################
 # LASTZ Shrew SorAra1 (DONE - 2009-05-28,30 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
     cd /hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
 
     cat << '_EOF_' > DEF
 # Human vs. Shrew
 BLASTZ_M=50
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Shrew
 SEQ2_DIR=/scratch/data/sorAra1/sorAra1.2bit
 SEQ2_LEN=/scratch/data/sorAra1/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	time about 23h26m
     cat fb.hg19.chainSorAra1Link.txt 
     #	572519288 bases of 2897316137 (19.760%) in intersection
 
     time doRecipBest.pl -buildDir=`pwd` hg19 sorAra1 > rbest.log 2>&1
     #	real    251m20.055s
 
 ##############################################################################
 # LASTZ Rabbit OryCun1 (DONE - 2009-05-28,30 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
     cd /hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
 
     cat << '_EOF_' > DEF
 # Human vs. Rabbit
 BLASTZ_M=50
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Rabbit
 SEQ2_DIR=/scratch/data/oryCun1/oryCun1.2bit
 SEQ2_LEN=/scratch/data/oryCun1/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	time about 23h09m
     cat fb.hg19.chainOryCun1Link.txt 
     #	975693323 bases of 2897316137 (33.676%) in intersection
 
     time doRecipBest.pl -buildDir=`pwd` hg19 oryCun1 > rbest.log 2>&1
     #	real    318m1.142s
 
 ##############################################################################
 # LASTZ Hedgehog EriEur1 (DONE - 2009-05-28,30 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
     cd /hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
 
     cat << '_EOF_' > DEF
 # Human vs. Hedgehog
 BLASTZ_M=50
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Hedgehog
 SEQ2_DIR=/scratch/data/eriEur1/eriEur1.2bit
 SEQ2_LEN=/scratch/data/eriEur1/chrom.sizes
 SEQ2_CHUNK=40000000
 SEQ2_LIMIT=500
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	> do.log 2>&1 &
     #	real    2043m33.198s
     cat fb.hg19.chainEriEur1Link.txt 
     #	560965051 bases of 2897316137 (19.362%) in intersection
     time doRecipBest.pl -buildDir=`pwd` hg19 eriEur1 > rbest.log 2>&1
     #	real    350m17.737s
 
 ##############################################################################
 # LASTZ Pika OchPri2 (DONE - 2009-05-29,30 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
     cd /hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
 
     cat << '_EOF_' > DEF
 # Human vs. Pika
 BLASTZ_M=50
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Pika
 SEQ2_DIR=/scratch/data/ochPri2/ochPri2.2bit
 SEQ2_LEN=/scratch/data/ochPri2/chrom.sizes
 SEQ2_CHUNK=40000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	real    393m42.569s
     cat fb.hg19.chainOchPri2Link.txt 
     #	804516397 bases of 2897316137 (27.768%) in intersection
     time doRecipBest.pl -buildDir=`pwd` hg19 ochPri2 > rbest.log 2>&1
     #	real    224m47.979s
 
 ##############################################################################
 # LASTZ Kangaroo Rat DipOrd1 (DONE - 2009-05-29,30 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
     cd /hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
 
     cat << '_EOF_' > DEF
 # Human vs. Kangaroo Rat
 BLASTZ_M=50
 
 # TARGET: Human Hg19
 SEQ1_DIR=/scratch/data/hg19/nib
 SEQ1_LEN=/scratch/data/hg19/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Kangaroo Rat
 SEQ2_DIR=/scratch/data/dipOrd1/dipOrd1.2bit
 SEQ2_LEN=/scratch/data/dipOrd1/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=300
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	real    688m47.595s
     time doRecipBest.pl -buildDir=`pwd` hg19 dipOrd1 > rbest.log 2>&1
     #	real    140m42.014s
 
 ##############################################################################