# for emacs: -*- mode: sh; -*-
# This file describes browser build for the Orangutan
# genome, July 31 2007
#
# "$Id$"
#
######################################################################
## DOWNLOAD SEQUENCE (DONE - 2007-09-06 - Hiram)
ssh kkstore02
mkdir /cluster/store5/ponAbe2
ln -s /cluster/store5/ponAbe2 /cluster/data/ponAbe2
mkdir /cluster/data/ponAbe2/wustl
cd /cluster/data/ponAbe2/wustl
# LaDeana Hillier provided us with a private copy before it got to the
# public site
wget --timestamping \
"http://genome.wustl.edu/pub/user/lhillier/private/P_abelii/orang_070731.tar.gz"
tar xvzf orang_070731.tar.gz
# creates a directory FINAL_070731
ls -ogrt
# drwxrwsr-x 2 4096 Sep 6 03:29 FINAL_070731
# -rw-rw-r-- 1 906641840 Sep 6 05:01 orang_070731.tar.gz
# chrX.agp and chr20.agp had blank lines at the end, remove those manually
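#	the edit was done by hand; an equivalent command would be
#	something like (a sketch, assuming GNU sed is available):
#	sed -i -e '/^$/d' FINAL_070731/chrX.agp FINAL_070731/chr20.agp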
#######################################################################
## create config.ra and run makeGenomeDb.pl (DONE - 2007-09-12 - Hiram)
ssh kkstore02
cd /cluster/data/ponAbe2
cat << '_EOF_' > ponAbe2.config.ra
# Config parameters for makeGenomeDb.pl:
db ponAbe2
scientificName Pongo pygmaeus abelii
commonName Orangutan
assemblyDate Jul. 2007
assemblyLabel WUSTL 3.0
orderKey 31
# clade already on in ponAbe1
# clade mammal
# genomeCladePriority 11
# mitoAcc gi:1743294
mitoAcc X97707.1
fastaFiles /cluster/data/ponAbe2/wustl/FINAL_070731/chr*.fa
agpFiles /cluster/data/ponAbe2/wustl/FINAL_070731/chr*.agp
# qualFiles /dev/null
dbDbSpeciesDir orangutan
'_EOF_'
# << happy emacs
time nice -n +19 ~/kent/src/hg/utils/automation/makeGenomeDb.pl \
ponAbe2.config.ra > makeGenomeDb.out 2>&1 &
# real 19m49.887s
# failed in the makeDb due to chrM vs. AGP confusion. Fixed the code,
# and ran the jkStuff/makeDb.csh manually. Then continuing:
time nice -n +19 ~/kent/src/hg/utils/automation/makeGenomeDb.pl \
-continue=dbDb -stop=dbDb ponAbe2.config.ra > dbDb.out 2>&1 &
# no need to run the trackDb business since we can simply
# copy the ponAbe1 directory in the source tree
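#	a rough sketch of that copy (paths per the usual trackDb layout; the
#	exact edits made afterwards are not recorded here):
#	cd ~/kent/src/hg/makeDb/trackDb/orangutan
#	cp -pR ponAbe1 ponAbe2
#	then edit ponAbe2/description.html to describe this assembly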
##########################################################################
## Repeat masker (DONE - 2007-09-15 - Hiram)
ssh kkstore02
## use screen for this
mkdir /cluster/data/ponAbe2/bed/RepeatMasker
cd /cluster/data/ponAbe2/bed/RepeatMasker
time nice -n +19 ~/kent/src/hg/utils/automation/doRepeatMasker.pl \
-bigClusterHub=pk \
-buildDir=/cluster/data/ponAbe2/bed/RepeatMasker ponAbe2 > do.out 2>&1 &
# real 1831m46.301s
##############################################################################
## simpleRepeat masking (DONE - 2007-09-19 - Hiram)
## create a kki kluster run
ssh kkr1u00
mkdir /iscratch/i/ponAbe2
cd /iscratch/i/ponAbe2
cp -p /cluster/data/ponAbe2/ponAbe2.unmasked.2bit .
cp -p /cluster/data/ponAbe2/chrom.sizes .
twoBitToFa ponAbe2.unmasked.2bit ponAbe2.unmasked.fa
mkdir split
# split sequence into individual chrom sequences
faSplit sequence ponAbe2.unmasked.fa 100 split/s_
rm ponAbe2.unmasked.fa ponAbe2.unmasked.2bit
for R in 2 3 4 5 6 7 8
do
rsync -a --progress /iscratch/i/ponAbe2/ kkr${R}u00:/iscratch/i/ponAbe2/
done
ssh kki
mkdir -p /cluster/data/ponAbe2/bed/simpleRepeat/trf
cd /cluster/data/ponAbe2/bed/simpleRepeat/trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set C = $1:r
set SRC = /iscratch/i/ponAbe2/split/$C.fa
mkdir -p /scratch/tmp/$C
cp -p $SRC /scratch/tmp/$C/$C.fa
pushd /scratch/tmp/$C
/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf $C.fa \
/dev/null -bedAt=$C.bed -tempDir=/scratch/tmp/$C
popd
rm -f $C.bed
cp -p /scratch/tmp/$C/$C.bed .
rm -fr /scratch/tmp/$C
'_EOF_'
# << happy emacs
chmod +x runTrf
cat << '_EOF_' > template
#LOOP
./runTrf $(path1) {check out line $(root1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
ls /iscratch/i/ponAbe2/split > part.list
gensub2 part.list single template jobList
para create jobList
para try ... check ... push ... etc ...
# Completed: 55 of 55 jobs
# CPU time in finished jobs: 57854s 964.23m 16.07h 0.67d 0.002 y
# IO & Wait Time: 473s 7.88m 0.13h 0.01d 0.000 y
# Average job time: 1060s 17.67m 0.29h 0.01d
# Longest finished job: 32538s 542.30m 9.04h 0.38d
# Submission to last job: 32538s 542.30m 9.04h 0.38d
ssh kkstore02
cd /cluster/data/ponAbe2/bed/simpleRepeat/trf
cat s_*.bed > ../simpleRepeat.bed
cd ..
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
ssh hgwdev
cd /cluster/data/ponAbe2/bed/simpleRepeat
time nice -n +19 hgLoadBed ponAbe2 simpleRepeat \
simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 1036457 elements of size 16
nice -n +19 featureBits ponAbe2 simpleRepeat \
> fb.simpleRepeat.ponAbe2.txt 2>&1 &
cat fb.simpleRepeat.ponAbe2.txt
# 120386097 bases of 3093572278 (3.891%) in intersection
# add the trfMask to the rmsk masked sequence to get our final
# masked sequence
ssh kkstore02
cd /cluster/data/ponAbe2
time nice -n +19 cat bed/simpleRepeat/trfMask.bed \
| twoBitMask -add -type=.bed ponAbe2.rmsk.2bit stdin ponAbe2.2bit
# can safely ignore the warning about >= 13 fields
# measure it
time nice -n +19 twoBitToFa ponAbe2.2bit stdout \
| faSize stdin > faSize.ponAbe2.2bit.txt 2>&1
grep masked faSize.ponAbe2.2bit.txt
# %45.68 masked total, %50.89 masked real
## clean up the /iscratch/i/ponAbe2/ directory
ssh kkr1u00
cd /iscratch/i/ponAbe2
rm -fr *
for R in 2 3 4 5 6 7 8
do
rsync -a --progress --delete --stats /iscratch/i/ponAbe2/ kkr${R}u00:/iscratch/i/ponAbe2/
ssh kkr${R}u00 rmdir /iscratch/i/ponAbe2
done
cd ..
rmdir ponAbe2
############################################################################
# BLATSERVERS ENTRY (DONE - 2007-09-19 - Hiram)
# Replaced the ponAbe1 blat server with this sequence
ssh hgwdev
hgsql -e 'update blatServers set db="ponAbe2" where db="ponAbe1"' \
hgcentraltest
# Do not need to do this insert, just the update above
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("ponAbe2", "blat13", "17784", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("ponAbe2", "blat13", "17785", "0", "1");' \
hgcentraltest
# test it with some sequence, i.e. the GABRA3 protein from Hg18
# to establish the defaultPos next:
############################################################################
# Reset default position to GABRA3 gene location (like Hg18)
hgsql -e \
'update dbDb set defaultPos="chrX:152299762-152501081" where name="ponAbe2";' \
hgcentraltest
############################################################################
# SWAP BLASTZ Mouse Mm9 (DONE - 2007-09-20 - Hiram)
# the original
ssh kkstore02
screen # control with a screen
cd /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19
cat fb.mm9.chainPonAbe2Link.txt
# 914561309 bases of 2620346127 (34.902%) in intersection
# And, for the swap
mkdir /cluster/data/ponAbe2/bed/blastz.mm9.swap
cd /cluster/data/ponAbe2/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzPonAbe2.2007-09-19/DEF \
-chainMinScore=3000 -swap -chainLinearGap=medium \
-bigClusterHub=pk > swap.log 2>&1 &
# real 102m23.209s
cat fb.ponAbe2.chainMm9Link.txt
# 948458190 bases of 3093572278 (30.659%) in intersection
# create syntenicNets for 7-way conservation maf use
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzPonAbe2.2007-09-19/DEF \
-continue=syntenicNet -syntenicNet \
-chainMinScore=3000 -swap -chainLinearGap=medium \
-bigClusterHub=pk > synNet.log 2>&1 &
# real 80m44.564s
############################################################################
# GENBANK AUTO UPDATE (DONE - 2008-02-20 - Hiram)
# Create a lift file as per the procedures for Chimp from the AGP:
ssh kolossus
cd /cluster/data/ponAbe2
cat ponAbe2.agp | /cluster/bin/scripts/agpToLift -revStrand \
> jkStuff/genbank.lft
# MAKE 11.OOC FILE FOR BLAT
blat ponAbe2.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
# Wrote 36396 overused 11-mers to 11.ooc
# copy that over to /iscratch/i/ponAbe2/ and get that directory copied
# to all the Iservers
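# the copy itself is not recorded here; roughly, following the rsync
# loops used earlier in this file (a sketch):
ssh kkr1u00
mkdir -p /iscratch/i/ponAbe2
cp -p /cluster/data/ponAbe2/ponAbe2.2bit /cluster/data/ponAbe2/11.ooc \
	/iscratch/i/ponAbe2/
for R in 2 3 4 5 6 7 8
do
    rsync -a --progress /iscratch/i/ponAbe2/ kkr${R}u00:/iscratch/i/ponAbe2/
done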
# align with latest genbank process.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add ponAbe2 just after panTro2
# ponAbe2
# Orangutan
ponAbe2.serverGenome = /cluster/data/ponAbe2/ponAbe2.2bit
ponAbe2.clusterGenome = /iscratch/i/ponAbe2/ponAbe2.2bit
ponAbe2.ooc = /iscratch/i/ponAbe2/11.ooc
ponAbe2.align.unplacedChroms = chrUn,chr*_random
ponAbe2.lift = /cluster/data/ponAbe2/jkStuff/genbank.lft
ponAbe2.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
ponAbe2.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
ponAbe2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
ponAbe2.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
ponAbe2.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
ponAbe2.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter}
ponAbe2.downloadDir = ponAbe2
ponAbe2.genbank.est.xeno.load = no
ponAbe2.refseq.mrna.native.load = yes
ponAbe2.refseq.mrna.xeno.load = yes
ponAbe2.refseq.mrna.xeno.loadDesc = yes
cvs ci -m "Added ponAbe2." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
# Edit src/lib/gbGenome.c to add new species. With these two lines:
# static char *ponAbeNames[] = {"Pongo abelii", "Pongo pygmaeus",
# "Pongo pygmaeus pygmaeus", "Pongo pygmaeus abelii", NULL};
cvs ci -m "Added Pongo pygmaeus abelii (Orangutan)." src/lib/gbGenome.c
make install-server
ssh genbank
screen # control this business with a screen since it takes a while
cd /cluster/data/genbank
# 2008-02-14 12:28 add other Pongo names to gbGenome.c,
# ssh to genbank machine, remove data/aligned/*/ponAbe2
# and start this build again.
# This is a call to a script that will push our jobs out to the cluster
# since it's a big job.
time nice -n +19 bin/gbAlignStep -initial ponAbe2 &
# logFile: var/build/logs/2008.02.14-12:28:36.ponAbe2.initalign.log
# real 702m19.137s
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad ponAbe2
# logFile: var/dbload/hgwdev/logs/2008.02.22-17:08:01.dbload.log
# real 21m12.354s
# enable daily alignment and update of hgwdev (DONE - 2007-11-09 - Hiram)
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add ponAbe2 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added ponAbe2." etc/align.dbs etc/hgwdev.dbs
make etc-update
#########################################################################
#########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (SCRIPT DONE 10/1/07 angie -- README EDITS TODO)
cd /cluster/data/ponAbe2
# Thought this was done in most chrom-based assemblies:
ssh -x kkstore02 splitFileByColumn \
`pwd`/bed/simpleRepeat/trfMask.bed `pwd`/bed/simpleRepeat/trfMaskChrom
# And these aren't quite where expected, so make some symbolic links:
foreach c (5_h2_hap1 \
6_cox_hap1 6_qbl_hap2 6_cox_hap1_random 6_qbl_hap2_random)
set cOldBase = `echo $c | sed -e 's/_random$//;'`
set cNewBase = `echo $c | sed -e 's/_.*//;'`
ln -s `pwd`/$cOldBase/chr$c.agp $cNewBase/chr$c.agp
ln -s `pwd`/$cOldBase/chr$c.fa.out $cNewBase/chr$c.fa.out
end
ln -s bed/RepeatMasker/ponAbe2.fa.out .
makeDownloads.pl ponAbe2 -verbose=2 \
>& jkStuff/downloads.log & tail -f jkStuff/downloads.log
#TODO:
# *** Edit each README.txt to resolve any notes marked with "***":
# /cluster/data/ponAbe2/goldenPath/database/README.txt
# /cluster/data/ponAbe2/goldenPath/bigZips/README.txt
# /cluster/data/ponAbe2/goldenPath/chromosomes/README.txt
# (The htdocs/goldenPath/ponAbe2/*/README.txt "files" are just links to those.)
# *** If you have to make any edits that would always apply to future
# assemblies from the same sequencing center, please edit them into
# ~/kent/src/hg/utils/automation/makeDownloads.pl (or ask Angie for help).
#########################################################################
## genscan run (DONE - 2007-11-08 - Hiram)
## create hard masked sequence
ssh kkstore02
cd /cluster/data/ponAbe2
for C in `cut -f1 chrom.sizes`
do
CN=${C/_*}
CN=${CN/chr}
echo "${CN}/${C}.hard.fa"
twoBitToFa -seq=${C} ponAbe2.2bit stdout \
| maskOutFa stdin hard ${CN}/${C}.hard.fa
ls -ld "${CN}/${C}.hard.fa"
done
# And, make sure there aren't any sequences in this lot that have
# become all N's with no sequence left in them. This drives genscan nuts
faCount ?/*hard.fa ??/*hard.fa > faCount.hard.txt
# the lowest three are:
egrep -v "^#|^total" faCount.hard.txt \
| awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3
# chr6_qbl_hap2_random 44531
# chr5_h2_hap1 25615
# chrM 16079
# this is OK, the numbers mean there is plenty of sequence in those bits
ssh kkr1u00
mkdir /iscratch/i/ponAbe2/hardChunks
cd /iscratch/i/ponAbe2/hardChunks
# creating 4,000,000 sized chunks, the chroms stay together as
# single pieces. The contigs get grouped together into 4,000,000
# sized fasta files. You don't want to break these things up
# because genscan will be doing its own internal 2.4 million
# window on these pieces, and the gene names are going to be
# constructed from the sequence name in these fasta files.
cat /cluster/data/ponAbe2/?/*.hard.fa /cluster/data/ponAbe2/??/*.hard.fa \
| faSplit about stdin 4000000 c_
for R in 2 3 4 5 6 7 8
do
rsync -a --progress ./ kkr${R}u00:/iscratch/i/ponAbe2/hardChunks/
done
ssh hgwdev
mkdir /cluster/data/ponAbe2/bed/genscan
cd /cluster/data/ponAbe2/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on small cluster (more mem than big cluster).
ssh kki
cd /cluster/data/ponAbe2/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
# Since we split on gaps, we have no chunks like that. You can
# verify with faCount on the chunks.
ls -1Sr /iscratch/i/ponAbe2/hardChunks/c_*.fa > genome.list
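# a quick sketch of the faCount check mentioned above, listing the
# chunks with the least non-N sequence (all should be well above zero):
faCount /iscratch/i/ponAbe2/hardChunks/c_*.fa | egrep -v "^#|^total" \
	| awk '{print $1,$2-$7}' | sort -k2,2n | head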
# Create run-time script to operate gsBig in a cluster safe manner
cat << '_EOF_' > runGsBig
#!/bin/csh -fe
set runDir = `pwd`
set srcDir = $1
set inFile = $2
set fileRoot = $inFile:r
mkdir /scratch/tmp/$fileRoot
cp -p $srcDir/$inFile /scratch/tmp/$fileRoot
pushd /scratch/tmp/$fileRoot
/cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000
popd
cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf
cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep
cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt
rm -fr /scratch/tmp/$fileRoot
'_EOF_'
# << happy emacs
chmod +x runGsBig
cat << '_EOF_' > template
#LOOP
runGsBig /iscratch/i/ponAbe2/hardChunks $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 genome.list single template jobList
para create jobList
para try, check, push, check, ...
# Completed: 44 of 48 jobs
# Crashed: 4 jobs
# CPU time in finished jobs: 327299s 5454.99m 90.92h 3.79d 0.010 y
# IO & Wait Time: 1091s 18.18m 0.30h 0.01d 0.000 y
# Average job time: 7463s 124.39m 2.07h 0.09d
# Longest finished job: 93797s 1563.28m 26.05h 1.09d
# Submission to last job: 104573s 1742.88m 29.05h 1.21d
# four jobs will not finish. They don't finish on kolossus either.
# The sequence involved is:
# c_12.fa -> chr8
# c_08.fa -> chr6
# c_06.fa -> chr5
# c_04.fa -> chr4
# So, split up these chroms on a couple of the non-bridged gaps
ssh kkr1u00
cd /iscratch/i/ponAbe2/hardChunks
mkdir ../genscanSplit
# examine the gap sizes to see if there is a good size range to
# break them up on, c_12 turns out to be about 30,000
faGapSizes c_12.fa
# this makes 6 or 7 pieces for each of these
faSplit -minGapSize=29999 -lift=../genscanSplit/c_12.lift gap c_12.fa \
40000000 ../genscanSplit/c_12_
faSplit -minGapSize=29999 -lift=../genscanSplit/c_08.lift gap c_08.fa \
40000000 ../genscanSplit/c_08_
faSplit -minGapSize=29999 -lift=../genscanSplit/c_06.lift gap c_06.fa \
40000000 ../genscanSplit/c_06_
faSplit -minGapSize=29999 -lift=../genscanSplit/c_04.lift gap c_04.fa \
40000000 ../genscanSplit/c_04_
# copy the genscanSplit directory to the other Iservers
cd ../genscanSplit
for R in 2 3 4 5 6 7 8
do
rsync -a --progress ./ kkr${R}u00:/iscratch/i/ponAbe2/genscanSplit/
done
# and run these four items in a small kluster job
# can re-use the runGsBig script from before, just a different
# source directory location
ssh kki
mkdir /cluster/data/ponAbe2/bed/genscan/split
cd /cluster/data/ponAbe2/bed/genscan/split
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
cat << '_EOF_' > template
#LOOP
../runGsBig /iscratch/i/ponAbe2/genscanSplit $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
ls -1Sr /iscratch/i/ponAbe2/genscanSplit/c_*.fa > genome.list
gensub2 genome.list single template jobList
para create jobList
# even after this split, three of these jobs would not complete
# Completed: 22 of 25 jobs
# Crashed: 3 jobs
# CPU time in finished jobs: 17154s 285.89m 4.76h 0.20d 0.001 y
# IO & Wait Time: 77s 1.29m 0.02h 0.00d 0.000 y
# Average job time: 783s 13.05m 0.22h 0.01d
# Longest finished job: 1135s 18.92m 0.32h 0.01d
# Submission to last job: 22398s 373.30m 6.22h 0.26d
# so, take those three jobs and split them up and run them like this
# after all that is done, lifting all results back to their chroms
for F in c_04 c_08 c_12
do
sort -k1,1 -k4,4n gtf/${F}_*.gtf ../secondSplit/${F}_?.gtf | liftUp -type=.gtf stdout split.lift error stdin \
| sed -e "s/${F}_\([0-9][0-9]*\)\./chr${F}.\1/g" \
| sed -e "s/chrc_04/chr4/g; s/chrc_08/chr6/g; s/chrc_12/chr8/g" \
> ${F}.gtf
sort -k1,1 -k2,2n subopt/${F}_*.bed ../secondSplit/${F}_?.bed | liftUp -type=.bed stdout split.lift error stdin \
| sed -e "s/${F}_\([0-9][0-9]*\)\./chr${F}.\1/g" \
| sed -e "s/chrc_04/chr4/g; s/chrc_08/chr6/g; s/chrc_12/chr8/g" \
> ${F}.bed
cat pep/${F}_*.pep ../secondSplit/${F}_?.pep \
| sed -e "s/${F}_\([0-9][0-9]*\)\./chr${F}.\1/g" \
| sed -e "s/chrc_04/chr4/g; s/chrc_08/chr6/g; s/chrc_12/chr8/g" \
> ${F}.pep
echo $F
done
# that includes the secondary splits for three jobs too.
# the result of all this is copied to ../gtf ../pep and ../subopt
# to fully populate the original result, now continuing with the
# processing of all results
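# that copy was roughly (a sketch, run from this split directory):
cp -p c_04.gtf c_08.gtf c_12.gtf ../gtf
cp -p c_04.pep c_08.pep c_12.pep ../pep
cp -p c_04.bed c_08.bed c_12.bed ../subopt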
# cat and lift the results into single files
ssh kkstore02
cd /cluster/data/ponAbe2/bed/genscan
sort -k1,1 -k4,4n gtf/c_*.gtf > genscan.gtf
sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt.bed
cat pep/c_*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/ponAbe2/bed/genscan
ldHgGene ponAbe2 -gtf genscan genscan.gtf
# Read 42190 transcripts in 314789 lines in 1 files
# 42190 groups 54 seqs 1 sources 1 feature types
# 42190 gene predictions
hgPepPred ponAbe2 generic genscanPep genscan.pep
hgLoadBed ponAbe2 genscanSubopt genscanSubopt.bed
# Loaded 517874 elements of size 6
# check the numbers
time nice -n +19 featureBits ponAbe2 genscan
# 52587049 bases of 3093572278 (1.700%) in intersection
###########################################################################
# Blastz Platypus ornAna1 (DONE - 2007-11-20 - Hiram)
ssh kkstore05
screen # use screen to control this job
mkdir /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13
cd /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13
cat << '_EOF_' > DEF
# Orangutan vs. platypus
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50
# TARGET: Orangutan PonAbe2
SEQ1_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ1_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Platypus ornAna1
SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
# real 2585m50.826s
# some of the jobs got confused during kk parasol hub reboots, where they
# think they aren't done, but they actually are. A 'para recover' finds
# nothing to re-run, all the result files are present.
# Completed: 263242 of 263488 jobs
# Crashed: 246 jobs
# CPU time in finished jobs: 48351127s 805852.12m 13430.87h 559.62d 1.533 y
# IO & Wait Time: 7609407s 126823.45m 2113.72h 88.07d 0.241 y
# Average job time: 213s 3.54m 0.06h 0.00d
# Longest finished job: 2146s 35.77m 0.60h 0.02d
# Submission to last job: 259048s 4317.47m 71.96h 3.00d
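# the 'para recover' check mentioned above was along these lines
# (a sketch; run in the batch directory, typically $BASE/run.blastz):
#	para recover jobList jobList.recover
#	wc -l jobList.recover
#	# 0 jobList.recover == nothing left to re-run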
# after verifying all jobs are actually finished, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-continue=cat -bigClusterHub=kk > cat.log 2>&1 &
# real 53m18.757s
cat fb.ponAbe2.chainOrnAna1Link.txt
# 214557704 bases of 3093572278 (6.936%) in intersection
mkdir /cluster/data/ornAna1/bed/blastz.ponAbe2.swap
cd /cluster/data/ornAna1/bed/blastz.ponAbe2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-swap -bigClusterHub=kk > swap.log 2>&1 &
# real 120m20.860s
cat fb.ornAna1.chainPonAbe2Link.txt
# 201368027 bases of 1842236818 (10.931%) in intersection
###########################################################################
# Blastz Marmoset calJac1 (DONE - 2007-11-19 - Hiram)
ssh kkstore02
screen # use screen to control this job
mkdir /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18
cd /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18
cat << '_EOF_' > DEF
# Orangutan vs. Marmoset
BLASTZ_M=50
# TARGET: Orangutan PonAbe2
SEQ1_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ1_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=100
# QUERY: Marmoset calJac1
SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=kk > do.log 2>&1 &
# real 1441m
# Completed: 97520 of 97520 jobs
# CPU time in finished jobs: 21651347s 360855.79m 6014.26h 250.59d 0.687 y
# IO & Wait Time: 10087025s 168117.08m 2801.95h 116.75d 0.320 y
# Average job time: 325s 5.42m 0.09h 0.00d
# Longest finished job: 5645s 94.08m 1.57h 0.07d
# Submission to last job: 67173s 1119.55m 18.66h 0.78d
cat fb.ponAbe2.chainCalJac1Link.txt
# 2310720863 bases of 3093572278 (74.694%) in intersection
# reciprocal best net mafs for 7-way multiz
time nice -n +19 doRecipBest.pl -workhorse=hgwdev ponAbe2 calJac1 \
> rbest.log 2>&1
# this failed at the download step, expects to run on hgwdev
# So, let's see what it makes on hgwdev
ssh hgwdev
cd /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18
time nice -n +19 doRecipBest.pl -continue=download -workhorse=hgwdev \
ponAbe2 calJac1 > rbest.download.log 2>&1 &
# it populates the directory:
# /usr/local/apache/htdocs/goldenPath/ponAbe2/vsCalJac1/reciprocalBest
ssh kkstore02
mkdir /cluster/data/calJac1/bed/blastz.ponAbe2.swap
cd /cluster/data/calJac1/bed/blastz.ponAbe2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=kk > swap.log 2>&1 &
# real 341m54.548s
cat fb.calJac1.chainPonAbe2Link.txt
# 2253236255 bases of 2929139385 (76.925%) in intersection
#########################################################################
## 8-Way Multiz (DONE - 2007-11-27 - Hiram)
##
ssh hgwdev
mkdir /cluster/data/ponAbe2/bed/multiz8way
cd /cluster/data/ponAbe2/bed/multiz8way
# take the 30-way tree from mm9 and eliminate genomes not in
# this alignment
# rearrange to get ponAbe2 on the top of the graph
# paste this tree into the on-line phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to create the image for the tree diagram
# select the 8 organisms from the 30-way recently done on mouse mm9
/cluster/bin/phast/tree_doctor \
--prune-all-but Human_hg18,Mouse_mm9,Chimp_panTro2,Orangutan_ponAbe2,Rhesus_rheMac2,Marmoset_calJac1,Platypus_ornAna1,Opossum_monDom4 \
/cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \
> 8-way.fullNames.nh
# looks something like this:
(((Mouse_mm9:0.325818,
((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037,
Orangutan_ponAbe2:0.020000):0.013037,Rhesus_rheMac2:0.031973):0.036500,
Marmoset_calJac1:0.070000):0.058454):0.263313,
Opossum_monDom4:0.320721):0.088647,Platypus_ornAna1:0.488110);
# rearrange to get Orangutan at the top:
# this leaves us with:
cat << '_EOF_' > ponAbe2.8-way.nh
((((((Orangutan_ponAbe2:0.020000,
(Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037):0.013037,
Rhesus_rheMac2:0.031973):0.036500,
Marmoset_calJac1:0.070000):0.058454,
Mouse_mm9:0.325818):0.263313,
Opossum_monDom4:0.320721):0.088647,
Platypus_ornAna1:0.488110);
'_EOF_'
# << happy emacs
# create a species list from that file:
sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' ponAbe2.8-way.nh \
| sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
| sed -e "s/.*_//; s/:.*//" | sort > species.list
# verify that it has 8 db names in it
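# for example:
wc -l species.list
#	8 species.list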
# create a stripped down nh file for use in autoMZ run
echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' ponAbe2.8-way.nh \
| sed -e "s/  */ /g"` > tree.8.nh
# that looks like, as a single line:
# ((((((ponAbe2 (hg18 panTro2)) rheMac2) calJac1) mm9) monDom4) ornAna1)
# verify all blastz's exists
cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/ponAbe2/bed/multiz8way
foreach db (`cat species.list`)
set bdir = /cluster/data/ponAbe2/bed/blastz.$db
if (-e $bdir/mafRBestNet/chr1.maf.gz) then
echo "$db mafRBestNet"
else if (-e $bdir/mafSynNet/chr1.maf.gz) then
echo "$db mafSynNet"
else if (-e $bdir/mafNet/chr1.maf.gz) then
echo "$db mafNet"
else
echo "$db mafs not found"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./listMafs.csh
# see what it says, the "mafs not found" should only show up on ponAbe2
./listMafs.csh
# calJac1 mafRBestNet
# hg18 mafSynNet
# mm9 mafSynNet
# monDom4 mafNet
# ornAna1 mafNet
# panTro2 mafSynNet
# ponAbe2 mafs not found
# rheMac2 mafSynNet
/cluster/bin/phast/all_dists ponAbe2.8-way.nh > 8way.distances.txt
grep -i ponabe 8way.distances.txt | sort -k3,3n
# Orangutan_ponAbe2 Human_hg18 0.038910
# Orangutan_ponAbe2 Chimp_panTro2 0.040705
# Orangutan_ponAbe2 Rhesus_rheMac2 0.065010
# Orangutan_ponAbe2 Marmoset_calJac1 0.139537
# Orangutan_ponAbe2 Mouse_mm9 0.453809
# Orangutan_ponAbe2 Opossum_monDom4 0.712025
# Orangutan_ponAbe2 Platypus_ornAna1 0.968061
# use the calculated
# distances in the table below to order the organisms and check
# the button order on the browser. Zebrafish ends up before
# tetraodon and fugu on the browser despite its distance.
# And if you can fill in the table below entirely, you have
# succeeded in finishing all the alignments required.
#
# featureBits chainLink measures
# chainPonAbe2Link chain linearGap
# distance on PonAbe2 on other minScore
# 1 0.038910 Human_hg18 (% 91.302) (% 92.892) 3000 medium
# 2 0.040705 Chimp_panTro2 (% 89.431) (% 91.033) 3000 medium
# 3 0.065010 Rhesus_rheMac2 (% 82.623) (% 88.157) 3000 medium
# 4 0.139537 Marmoset_calJac1 (% 74.694) (% 76.925) 3000 medium
# 5 0.453809 Mouse_mm9 (% 30.659) (% 34.902) 3000 medium
# 6 0.712025 Opossum_monDom4 (% 11.306) (% 13.137) 5000 loose
# 7 0.968061 Platypus_ornAna1 (% 6.936) (% 10.931) 5000 loose
# copy net mafs to cluster-friendly storage, splitting chroms
# into 50MB chunks to improve run-time
# NOTE: splitting will be different for scaffold-based reference assemblies
ssh hgwdev
mkdir /cluster/data/ponAbe2/bed/multiz8way/run.split
cd /cluster/data/ponAbe2/bed/multiz8way/run.split
# this works by examining the rmsk table for likely repeat areas
# that won't be used in blastz
mafSplitPos ponAbe2 50 mafSplit.bed
ssh kki
cd /cluster/data/ponAbe2/bed/multiz8way/run.split
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set targDb = "ponAbe2"
set db = $1
set sdir = /san/sanvol1/scratch/$targDb/splitStrictMafNet
mkdir -p $sdir
if (-e $sdir/$db) then
echo "directory $sdir/$db already exists -- remove and retry"
exit 1
endif
set bdir = /cluster/data/$targDb/bed/blastz.$db
if (! -e $bdir) then
echo "directory $bdir not found"
exit 1
endif
mkdir -p $sdir/$db
if (-e $bdir/mafRBestNet) then
set mdir = $bdir/mafRBestNet
else if (-e $bdir/mafSynNet) then
set mdir = $bdir/mafSynNet
else if (-e $bdir/mafNet) then
set mdir = $bdir/mafNet
else
echo "$bdir maf dir not found"
exit 1
endif
echo $mdir
foreach f ($mdir/*)
set c = $f:t:r:r
echo " $c"
nice mafSplit mafSplit.bed $sdir/$db/ $f
end
echo "gzipping $sdir/$db mafs"
nice gzip $sdir/$db/*
echo $mdir > $db.done
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
grep -v ponAbe2 ../species.list > split.list
cat << '_EOF_' > template
#LOOP
doSplit.csh $(path1) {check out line+ $(path1).done}
#ENDLOOP
'_EOF_'
gensub2 split.list single template jobList
para create jobList
# 7 jobs
# start these gently, this is a good load on the san filesystem
para -maxPush=3 push
# wait a while, verify these are running OK
para push
# let that run to a couple completions, a few minutes, then again:
para try
# etc ...
# Completed: 7 of 7 jobs
# CPU time in finished jobs: 6048s 100.81m 1.68h 0.07d 0.000 y
# IO & Wait Time: 593s 9.88m 0.16h 0.01d 0.000 y
# Average job time: 949s 15.81m 0.26h 0.01d
# Longest finished job: 1451s 24.18m 0.40h 0.02d
# Submission to last job: 1471s 24.52m 0.41h 0.02d
# ready for the multiz run
ssh pk
cd /cluster/data/ponAbe2/bed/multiz8way
# actually, the result directory here should be maf.split instead of maf
mkdir -p maf run
cd run
mkdir penn
# use latest penn utilities
P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
cp -p $P/{autoMZ,multiz,maf_project} penn
# list chrom chunks, any db dir will do; better would be for the
# splitter to generate this file
# We temporarily use __ instead of . to delimit chunk in filename
# so we can use $(root) to get basename
find /san/sanvol1/scratch/ponAbe2/splitStrictMafNet -type f \
| while read F; do basename $F; done \
| sed -e 's/.maf.gz//' -e 's/\./__/' | sort -u > chromChunks.list
wc -l chromChunks.list
# 104
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = ponAbe2
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/splitStrictMafNet
rm -fr $tmp
mkdir -p $tmp
cp ../tree.8.nh ../species.list $tmp
pushd $tmp
foreach s (`cat species.list`)
set c2 = `echo $c | sed 's/__/./'`
set in = $pairs/$s/$c2.maf
set out = $db.$s.sing.maf
if ($s == ponAbe2) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.8.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/ponAbe2/bed/multiz8way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << emacs
gensub2 chromChunks.list single template jobList
para create jobList
# the 104 jobs run time:
# Completed: 104 of 104 jobs
# CPU time in finished jobs: 174748s 2912.47m 48.54h 2.02d 0.006 y
# IO & Wait Time: 2952s 49.20m 0.82h 0.03d 0.000 y
# Average job time: 1709s 28.48m 0.47h 0.02d
# Longest finished job: 3340s 55.67m 0.93h 0.04d
# Submission to last job: 4413s 73.55m 1.23h 0.05d
# put the split maf results back together into single chroms
ssh kkstore02
cd /cluster/data/ponAbe2/bed/multiz8way
# here is where the result directory maf should have already been maf.split
mv maf maf.split
mkdir maf
# going to sort out the redundant header garbage to leave a cleaner maf
for C in `ls maf.split | sed -e "s#__.*##" | sort -u`
do
echo ${C}
head -q -n 1 maf.split/${C}__*.maf | sort -u > maf/${C}.maf
grep -h "^#" maf.split/${C}__*.maf | egrep -v "maf version=1|eof maf" | \
sed -e "s#_MZ_[^ ]* # #g; s#__[0-9]##g" | sort -u >> maf/${C}.maf
grep -h -v "^#" maf.split/${C}__*.maf >> maf/${C}.maf
tail -q -n 1 maf.split/${C}__*.maf | sort -u >> maf/${C}.maf
done
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/ponAbe2/multiz8way/maf
ln -s /cluster/data/ponAbe2/bed/multiz8way/maf/*.maf \
/gbdb/ponAbe2/multiz8way/maf
# this generates a large 1 Gb multiz8way.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /scratch/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/ponAbe2/multiz8way/maf ponAbe2 multiz8way
# real 6m37.534s
# Loaded 6981959 mafs in 55 files from /gbdb/ponAbe2/multiz8way/maf
# load summary table
time nice -n +19 cat /gbdb/ponAbe2/multiz8way/maf/*.maf \
| hgLoadMafSummary ponAbe2 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz8waySummary stdin
# Created 1417364 summary blocks from 29928557 components
# and 6981421 mafs from stdin
# real 21m35.057s
# Gap Annotation
# prepare bed files with gap info
ssh kkstore02
mkdir /cluster/data/ponAbe2/bed/multiz8way/anno
cd /cluster/data/ponAbe2/bed/multiz8way/anno
mkdir maf run
# these actually already all exist from previous multiple alignments
for DB in `cat ../species.list`
do
CDIR="/cluster/data/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
cd run
rm -f nBeds sizes
for DB in `grep -v ponAbe2 ../../species.list`
do
echo "${DB} "
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
ssh kki
cd /cluster/data/ponAbe2/bed/multiz8way/anno/run
cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set dir = /cluster/data/ponAbe2/bed/multiz8way
set c = $1
cat $dir/maf/${c}.maf | \
nice mafAddIRows -nBeds=nBeds stdin /cluster/data/ponAbe2/ponAbe2.2bit $2
'_EOF_'
# << happy emacs
chmod +x doAnno.csh
cat << '_EOF_' > template
#LOOP
./doAnno.csh $(root1) {check out line+ /cluster/data/ponAbe2/bed/multiz8way/anno/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 /cluster/data/ponAbe2/chrom.sizes > chrom.list
gensub2 chrom.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 55 of 55 jobs
# CPU time in finished jobs: 2278s 37.97m 0.63h 0.03d 0.000 y
# IO & Wait Time: 9909s 165.15m 2.75h 0.11d 0.000 y
# Average job time: 222s 3.69m 0.06h 0.00d
# Longest finished job: 3340s 55.67m 0.93h 0.04d
# Submission to last job: 3897s 64.95m 1.08h 0.05d
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/anno
mkdir -p /gbdb/ponAbe2/multiz8way/anno/maf
ln -s /cluster/data/ponAbe2/bed/multiz8way/anno/maf/*.maf \
/gbdb/ponAbe2/multiz8way/anno/maf
# by loading this into the table multiz8way, it will replace the
# previously loaded table with the unannotated mafs
# huge temp files are made, do them on local disk
cd /scratch/tmp
time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/ponAbe2/multiz8way/anno/maf \
ponAbe2 multiz8way
# Loaded 7331265 mafs in 55 files from /gbdb/ponAbe2/multiz8way/anno/maf
# real 8m31.092s
cat /cluster/data/ponAbe2/chrom.sizes | \
awk '{if ($2 > 1000000) { print $1 }}' |
while read C
do
echo /gbdb/ponAbe2/multiz8way/anno/maf/$C.maf
done | xargs cat | \
hgLoadMafSummary ponAbe2 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz8waySummary stdin
# Created 1417364 summary blocks from 29928557 components and 7330669
# mafs from stdin
# by loading this into the table multiz8waySummary, it will replace
# the previously loaded table with the unannotated mafs
# remove the multiz8way*.tab files in this /scratch/tmp directory
rm multiz8way*
#############################################################################
## Annotate 8-way multiple alignment with gene annotations
## (DONE - 2007-11-27 - Hiram)
# Gene frames
## survey all genomes to see what type of gene track to use
ssh hgwdev
mkdir /cluster/data/ponAbe2/bed/multiz8way/frames
cd /cluster/data/ponAbe2/bed/multiz8way/frames
# dbs: eriEur1, cavPor2, sorAra1 do not exist, can not look at them
cat << '_EOF_' > showGenes.csh
#!/bin/csh -fe
foreach db (`cat ../species.list`)
echo -n "${db}: "
echo -n "Tables: "
set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
foreach table ($tables)
if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
$table == "knownGene") then
set count = `hgsql $db -N -e "select count(*) from $table"`
echo -n "${table}: ${count}, "
endif
end
set orgName = `hgsql hgcentraltest -N -e \
"select scientificName from dbDb where name='$db'"`
set orgId = `hgsql ponAbe2 -N -e \
"select id from organism where name='$orgName'"`
if ($orgId == "") then
echo "Mrnas: 0"
else
set count = `hgsql ponAbe2 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
echo "Mrnas: ${count}"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./showGenes.csh
# given this output, manually sorted for this display:
# hg18: Tables: ensGene: 43569, knownGene: 56722, mgcGenes: 29037,
# refGene: 25902, Mrnas: 209126
# mm9: Tables: knownGene: 49409, mgcGenes: 22950, refGene: 21146, Mrnas: 242485
# monDom4: Tables: ensGene: 33878, refGene: 163, Mrnas: 398
# ornAna1: Tables: ensGene: 25981, refGene: 3, Mrnas: 141
# panTro2: Tables: ensGene: 32852, refGene: 26179, Mrnas: 1297
# rheMac2: Tables: ensGene: 38561, refGene: 426, Mrnas: 3202
# calJac1: Tables: Mrnas: 952
# ponAbe2: Tables: Mrnas: 2
# from this information, conclusions:
# use knownGene for hg18, mm9
# use ensGene for monDom4, ornAna1, panTro2, rheMac2
# no annotations for calJac1, ponAbe2
mkdir genes
# knownGene
for DB in hg18 mm9
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# ensGene
for DB in monDom4 ornAna1 panTro2 rheMac2
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
ls -og genes
# -rw-rw-r-- 1 2008806 Nov 27 14:37 hg18.gp.gz
# -rw-rw-r-- 1 1965274 Nov 27 14:37 mm9.gp.gz
# -rw-rw-r-- 1 1751726 Nov 27 14:37 monDom4.gp.gz
# -rw-rw-r-- 1 1232719 Nov 27 14:37 ornAna1.gp.gz
# -rw-rw-r-- 1 1980696 Nov 27 14:37 panTro2.gp.gz
# -rw-rw-r-- 1 1935916 Nov 27 14:37 rheMac2.gp.gz
ssh kkstore02
cd /cluster/data/ponAbe2/bed/multiz8way/frames
# leaving out calJac1 ponAbe2
time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames ponAbe2 stdin stdout hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz rheMac2 genes/rheMac2.gp.gz panTro2 genes/panTro2.gp.gz monDom4 genes/monDom4.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz8way.mafFrames.gz) > frames.log 2>&1
# see what it looks like in terms of number of annotations per DB:
zcat multiz8way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
# 219627 hg18
# 221459 panTro2
# 230108 rheMac2
# 238734 mm9
# 412384 ornAna1
# 459521 monDom4
# load the resulting file
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/frames
time nice -n +19 hgLoadMafFrames ponAbe2 multiz8wayFrames \
multiz8way.mafFrames.gz
# real 0m45.837s
# enable the trackDb entries:
# frames multiz8wayFrames
# irows on
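# a rough sketch of the corresponding trackDb.ra stanza (only the
# frames and irows settings come from the note above; the other
# settings shown are typical for a wigMaf conservation track and are
# illustrative only):
#	track multiz8way
#	shortLabel Conservation
#	longLabel 8-Way Multiz Alignment & Conservation
#	group compGeno
#	type wigMaf 0.0 1.0
#	wiggle phastCons8way
#	summary multiz8waySummary
#	frames multiz8wayFrames
#	irows on
#	speciesOrder hg18 panTro2 rheMac2 calJac1 mm9 monDom4 ornAna1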
#############################################################################
# phastCons 8-way (DONE - 2007-10-16 - Hiram)
# split 8way mafs into 10M chunks and generate sufficient statistics
# files for # phastCons
ssh kki
mkdir /cluster/data/ponAbe2/bed/multiz8way/msa.split
cd /cluster/data/ponAbe2/bed/multiz8way/msa.split
mkdir -p /san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/ponAbe2/bed/multiz8way/maf
set WINDOWS = /san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss
pushd $WINDOWS
set c = $1
rm -fr $c
mkdir $c
twoBitToFa -seq=$c /scratch/data/ponAbe2/ponAbe2.2bit /scratch/tmp/ponAbe2.$c.fa
# need to truncate odd-ball scaffold/chrom names that include dots
# as phastCons utils can't handle them
set CLEAN_MAF = /scratch/tmp/$c.clean.maf.$$
perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$c.maf > $CLEAN_MAF
/cluster/bin/phast/$MACHTYPE/msa_split $CLEAN_MAF -i MAF \
-M /scratch/tmp/ponAbe2.$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
rm -f $CLEAN_MAF /scratch/tmp/ponAbe2.$c.fa
popd
date >> $c.done
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
# << happy emacs
# do the easy ones first to see some immediate results
ls -1S -r ../maf | sed -e "s/.maf//" > maf.list
gensub2 maf.list single template jobList
para create jobList
para try ... check ... etc
# push these with a -maxPush number that would put a single job on
# each iserver, you don't want two jobs at a time on each iserver,
# they consume too much memory and cause swapping to the point where
# they slow to a crawl and take hours to finish.
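# e.g. (the iserver count here is an assumption, adjust to the cluster):
#	para -maxPush=8 push
#	# wait for that batch to finish, then push again until all are done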
# Completed: 55 of 55 jobs
# CPU time in finished jobs: 2679s 44.65m 0.74h 0.03d 0.000 y
# IO & Wait Time: 2980s 49.67m 0.83h 0.03d 0.000 y
# Average job time: 103s 1.71m 0.03h 0.00d
# Longest finished job: 716s 11.93m 0.20h 0.01d
# Submission to last job: 10391s 173.18m 2.89h 0.12d
# XXXX Estimates were attempted, not really very useful, instead, as seen
# below, merely take the cons and noncons trees from the mouse 30-way
# Estimate phastCons parameters
# see also:
# http://compgen.bscb.cornell.edu/~acs/phastCons-HOWTO.html
# Create a list of .ss files over 3,000,000 in length
# this is almost everything
cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss
ls -1l chr*/chr*.ss | egrep -v "_hap|chrUn|random" | \
awk '$5 > 3000000 {print $9;}' > ../tuningRun.list
# Set up parasol directory to calculate trees on these 50 regions
ssh kk
mkdir /cluster/data/ponAbe2/bed/multiz8way/treeRun2
cd /cluster/data/ponAbe2/bed/multiz8way/treeRun2
mkdir tree log most
# Tuning this loop should come back to here to recalculate
# Create script that calls phastCons with right arguments
cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set SAN="/san/sanvol1/scratch/ponAbe2/multiz8way/cons"
set SS=$1
set C=$1:h
set F=$1:t
set tmpDir="/scratch/tmp/pA2_$2"
rm -fr $tmpDir
mkdir $tmpDir
mkdir -p log/${C} tree/${C} most/${C}
scp -p hiram@pk:$SAN/ss/$1 $tmpDir/$F
scp -p hiram@pk:$SAN/estimate/starting-tree.mod $tmpDir
pushd $tmpDir
/cluster/bin/phast/$MACHTYPE/phastCons $F starting-tree.mod \
--gc 0.355 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-length 45 --target-coverage 0.3 --most-conserved $F.most \
--quiet --log $F.log --estimate-trees $F.tree
popd
cp -p $tmpDir/$F.log log/$C
cp -p $tmpDir/$F.most most/$C
cp -p $tmpDir/$F.tree.*cons.mod tree/$C
rm -fr $tmpDir
'_EOF_'
# << happy emacs
chmod a+x makeTree.csh
# Create gensub file
cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1) $(num1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Make cluster job and run it
scp -p hiram@pk:/san/sanvol1/scratch/ponAbe2/multiz8way/cons/tuningRun.list .
gensub2 tuningRun.list single template jobList
para create jobList
para try/push/check/etc
# Completed: 50 of 50 jobs
# CPU time in finished jobs: 51439s 857.32m 14.29h 0.60d 0.002 y
# IO & Wait Time: 948s 15.79m 0.26h 0.01d 0.000 y
# Average job time: 1048s 17.46m 0.29h 0.01d
# Longest finished job: 1388s 23.13m 0.39h 0.02d
# Submission to last job: 4376s 72.93m 1.22h 0.05d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ls -1 tree/chr*/*.cons.mod > cons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
--output-average ave.cons.mod > cons_summary.txt
ls -1 tree/chr*/*.noncons.mod > noncons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
--output-average ave.noncons.mod > noncons_summary.txt
sort -k1,1 -k2,2n most/chr*/*.most > mostConserved.bed
wc -l mostConserved.bed
# at --target_coverage .3 and --expected_length 45
# 1159312 most conserved elements
# primates only at --target_coverage .75 and --expected_length 1000
# measuring entropy
# consEntropy <target coverage> <expected lengths>
# ave.cons.mod ave.noncons.mod --NH 9.78
/cluster/bin/phast/$MACHTYPE/consEntropy .3 45 --NH 9.78 \
ave.cons.mod ave.noncons.mod
# ( Solving for new omega: 45.000000 21.095519 13.646969 11.146376
# 10.499722 10.437794 10.437185 )
# Transition parameters: gamma=0.300000, omega=45.000000,
# mu=0.022222, nu=0.009524
# Relative entropy: H=0.415553 bits/site
# Expected min. length: L_min=30.634230 sites
# Expected max. length: L_max=24.942235 sites
# Phylogenetic information threshold: PIT=L_min*H=12.730148 bits
# Recommended expected length: omega=10.437185 sites (for L_min*H=9.780000)
# and checking the most conserved areas, on hgwdev:
# at --target_coverage .3 and --expected_length 45
featureBits -noRandom -noHap ponAbe2 mostConserved.bed
# 221759187 bases of 2723012131 (8.144%) in intersection
featureBits -noRandom -noHap -enrichment ponAbe2 genscan:cds \
mostConserved.bed
# genscan:cds 1.688%, mostConserved.bed 8.144%, both 0.146%,
# cover 8.63%, enrich 1.06x
# primates only at --target_coverage .75 and --expected_length 1000
featureBits -noRandom -noHap ponAbe2 mostConserved.bed
# 524207695 bases of 2723012131 (19.251%) in intersection
featureBits -enrichment -noRandom -noHap ponAbe2 genscan:cds \
mostConserved.bed
# genscan:cds 1.688%, mostConserved.bed 19.251%, both 0.314%,
# cover 18.61%, enrich 0.97x
# Estimates could be made, but more correctly, take the 30-way
# .mod file, and re-use it here.
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way
cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod .
ssh hgwdev
screen # use screen to manage this long running job
cd /cluster/data/ponAbe2/bed/multiz8way/msa.split
# using one of the sections of chr1 as an example
time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \
/san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss/chr1/chr1.50000209-60000000.ss \
--tree \
"((((((ponAbe2,(hg18,panTro2)),rheMac2),calJac1),mm9),monDom4),ornAna1)" \
--out-root starting-tree
# real 1m35.979s
# create a single ss file from a whole chrom maf file
/cluster/bin/phast/$MACHTYPE/msa_split chr1.maf -i MAF -M chr1.fa \
-o SS -r chr1 -w 300000000,0 -I 1000 -B 5000
# run a phyloFit on that large .ss file to find a starting-tree
time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \
chr1.1-229942017.ss --tree \
"((((((ponAbe2,(hg18,panTro2)),rheMac2),calJac1),mm9),monDom4),ornAna1)" \
--out-root starting-tree
# Reading alignment from chr1.1-229942017.ss ...
# Compacting sufficient statistics ...
# Fitting tree model to chr1.1-229942017.ss using REV ...
# Writing model to starting-tree.mod ...
# Done.
# real 5m30.470s
# add up the C and G:
grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.355
# This 0.355 is used in the --gc argument below
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
ssh pk
mkdir -p /cluster/data/ponAbe2/bed/multiz8way/cons/run.cons
cd /cluster/data/ponAbe2/bed/multiz8way/cons/run.cons
# there are going to be several different phastCons runs using
# this same script. They trigger off of the current working directory
# $cwd:t which is the "grp" in this script. It is one of:
# all glires placentals
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.2007-05-04
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/ponAbe2/bed/multiz8way/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/ponAbe2/multiz8way/cons
if (-s $cons/$grp/$grp.non-inf) then
cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf .
cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
else
cp -p $cons/$grp/$grp.mod .
cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 4
touch $san/$grp/pp/$c $san/$grp/bed/$c
rm -f $san/$grp/pp/$c/$f.pp
rm -f $san/$grp/bed/$c/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod a+x doPhast.csh
# it may be better to have the result check file be the .pp file, since
# this .bed file is sometimes empty even though there are .pp results
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
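# a sketch of that alternative check line, should it be preferred
# (exists+ only requires a non-empty output file):
#	../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out exists+ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all/pp/$(root1)/$(file1).pp}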
# Create parasol batch and run it
pushd /san/sanvol1/scratch/ponAbe2/multiz8way/cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/ponAbe2/bed/multiz8way/cons/ss.list
popd
# run for all species
cd ..
mkdir -p all run.cons/all
cd all
/cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
--prune-all-but=ponAbe2,hg18,panTro2,rheMac2,calJac1,mm9,monDom4,ornAna1 \
> all.mod
cd ../run.cons/all
# root1 == chrom name, file1 == ss file name without .ss suffix
# it may be better to have the result check file be the .pp file, since
# this .bed file is sometimes empty even though there are .pp results
# Create template file for "all" run
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../ss.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 368 of 368 jobs
# CPU time in finished jobs: 11954s 199.23m 3.32h 0.14d 0.000 y
# IO & Wait Time: 3688s 61.47m 1.02h 0.04d 0.000 y
# Average job time: 43s 0.71m 0.01h 0.00d
# Longest finished job: 52s 0.87m 0.01h 0.00d
# Submission to last job: 577s 9.62m 0.16h 0.01d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all
time nice -n +19 cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/ponAbe2/bed/multiz8way/cons/all
# load into database
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/cons/all
time nice -n +19 hgLoadBed ponAbe2 phastConsElements8way mostConserved.bed
# Loaded 1096981 elements of size 5
# Try for 5% overall cov, and 70% CDS cov
# We don't have any gene tracks to compare CDS coverage
# --rho .31 --expected-length 45 --target-coverage .3
featureBits ponAbe2 phastConsElements8way
# 134611760 bases of 3093572278 (4.351%) in intersection
# Create merged posterior probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastCons8wayScores
for D in pp/chr*
do
C=${D/pp\/}
out=phastCons8wayScores/${C}.data.gz
echo "${D} > ${C}.data.gz"
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \
gzip > ${out}
done
'_EOF_'
# << happy emacs
chmod +x gzipAscii.sh
time nice -n +19 ./gzipAscii.sh
# real 38m22.760s
# copy the phastCons8wayScores to:
# /cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/phastConsScores
# for hgdownload downloads
# Create merged posterior probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all
time nice -n +19 ls phastCons8wayScores/*.data.gz | xargs zcat \
| wigEncode -noOverlap stdin phastCons8way.wig phastCons8way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 23m55.813s
time nice -n +19 cp -p *.wi? /cluster/data/ponAbe2/bed/multiz8way/cons/all
# real 2m2.107s
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/cons/all
ln -s `pwd`/phastCons8way.wib /gbdb/ponAbe2/multiz8way/phastCons8way.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/ponAbe2/multiz8way ponAbe2 \
phastCons8way phastCons8way.wig
# real 1m27.666s
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/cons/all
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=ponAbe2 phastCons8way > histogram.data 2>&1
# real 5m0.608s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Orangutan PonAbe2 Histogram phastCons8way track"
set xlabel " phastCons8way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
### Create a phastCons data set for primates-only
# XXX - this was only a test, this data was not used
# setup primates-only run
ssh pk
cd /cluster/data/ponAbe2/bed/multiz8way/cons
mkdir primates run.cons/primates
cd primates
# primates-only: exclude all but these for phastCons tree:
/cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
--prune-all-but=ponAbe2,hg18,panTro2,rheMac2,calJac1 \
> primates.mod
# and place the removed ones in the non-inf file so phastCons will
# truly ignore them:
echo "monDom4,ornAna1" > primates.non-inf
cd ../run.cons/primates
# root1 == chrom name, file1 == ss file name without .ss suffix
# it may be better to have the result check file be the .pp file, since
# this .bed file is sometimes empty even though there are .pp results
# Create template file for "all" run
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../ss.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Four of these jobs fail to produce any output:
# chr5_h2_hap1/chr5_h2_hap1.1-191932.bed
# chr6_cox_hap1_random/chr6_cox_hap1_random.1-239941.bed
# chr6_qbl_hap2_random/chr6_qbl_hap2_random.1-120011.bed
# chrM/chrM.1-16499.bed
# this chrM missing one is interesting. It should have something ?
# I don't know. They all have data in their pp files.
# Completed: 364 of 368 jobs
# Crashed: 4 jobs
# CPU time in finished jobs: 11969s 199.49m 3.32h 0.14d 0.000 y
# IO & Wait Time: 3757s 62.61m 1.04h 0.04d 0.000 y
# Average job time: 43s 0.72m 0.01h 0.00d
# Longest finished job: 56s 0.93m 0.02h 0.00d
# Submission to last job: 2362s 39.37m 0.66h 0.03d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates
cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/ponAbe2/bed/multiz8way/cons/primates
# load into database
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/cons/primates
time nice -n +19 hgLoadBed ponAbe2 phastConsElements8wayPrimates \
mostConserved.bed
# Loaded 298598 elements of size 5
# real 0m8.768s
# verify coverage
featureBits ponAbe2 phastConsElements8wayPrimates
# 92315637 bases of 3093572278 (2.984%) in intersection
# Create the downloads .pp files, from which the phastCons wiggle data
# is calculated
    # it currently doesn't matter where this is performed; the san is the
    # same network distance from all machines.
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates
mkdir downloads
for D in pp/chr*
do
C=${D/pp\//}
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \
> downloads/${C}.primates.pp.data.gz
echo $D $C done
done
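    # spot-check one of the gzipped files (assuming the usual fixedStep
    # wiggle output from phastCons) to verify the chunks ended up in
    # coordinate order for wigEncode:
    #   zcat downloads/chr1.primates.pp.data.gz | grep fixedStep | head -3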
    # Create merged posterior probability file and wiggle track data files
cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates
time nice -n +19 ls downloads/chr*.data.gz | xargs zcat \
| wigEncode -noOverlap stdin \
phastCons8wayPrimates.wig phastCons8wayPrimates.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
## load table with wiggle data
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/cons/primates
cp -p /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates/*.wi? .
    ln -s `pwd`/phastCons8wayPrimates.wib \
/gbdb/ponAbe2/multiz8way/phastCons8wayPrimates.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/ponAbe2/multiz8way ponAbe2 \
phastCons8wayPrimates phastCons8wayPrimates.wig
# real 0m44.161s
# Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=ponAbe2 phastCons8wayPrimates > histogram.data 2>&1
# real 5m15.581s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Orangutan PonAbe2 Histogram phastCons8wayPrimates track"
set xlabel " phastCons8wayPrimates score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
#############################################################################
## Create downloads for 8-way business (DONE - 2007-12-27 - Hiram)
ssh kkstore02
mkdir -p /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf
cd /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf
cp -p /cluster/data/ponAbe2/bed/multiz8way/anno/maf/*.maf .
time nice -n +19 gzip -v *.maf
# creating upstream files
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf
# creating upstream files from xenoRefGene, bash script:
DB=ponAbe2
GENE=xenoRefGene
NWAY=multiz8way
export DB GENE NWAY
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
stdin stdout \
-orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \
| gzip -c > upstream${S}.maf.gz
echo "done upstream${S}.maf.gz"
done
#
ssh kkstore02
cd /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf
md5sum *.maf.gz > md5sum.txt
cd ..
ln -s ../../ponAbe2.8-way.nh ./8way.nh
ln -s $HOME/kent/src/hg/makeDb/trackDb/orangutan/ponAbe2/multiz8way.html .
# copy README.txt from mm9 30-way downloads and edit to represent
# this directory
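    # e.g., roughly (the mm9 source path is an assumption, adjust as needed):
    #   cp /usr/local/apache/htdocs/goldenPath/mm9/multiz30way/README.txt .
    #   vi README.txt  # describe ponAbe2, the 8 species and the 8-way tree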
cd ..
mkdir -p phastCons8way/phastConsScores
cd phastCons8way/phastConsScores
cp -p \
/san/sanvol1/scratch/ponAbe2/multiz8way/cons/all/phastCons8wayScores/*.data.gz .
md5sum *.data.gz > md5sum.txt
cd ..
ln -s ../../cons/all/all.mod 8way.mod
# copy README.txt from mm9 30way downloads and edit to represent
# this directory
# enable them for hgdownload rsync transfer
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/ponAbe2
mkdir multiz8way phastCons8way
cd multiz8way
ln -s /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/8way.nh .
ln -s /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/*.html .
ln -s /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/R*.txt .
mkdir maf
cd maf
ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf/*.maf.gz .
ln -s \
        /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf/md5sum.txt .
cd ../../phastCons8way
mkdir phastConsScores
cd phastConsScores
ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/phastConsScores/*.gz .
ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/phastConsScores/md5sum.txt .
cd ..
ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/8way.mod .
ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/README.txt .
###########################################################################
# BLASTZ SWAP Human Hg18 (DONE - 2007-10-05 - Hiram)
ssh kkstore02
screen # use a screen to control this job
# the original blastz
cd /cluster/data/hg18/bed/blastzPonAbe2.2007-10-02
cat fb.hg18.chainPonAbe2Link.txt
# 2676696124 bases of 2881515245 (92.892%) in intersection
# And the swap
mkdir /cluster/data/ponAbe2/bed/blastz.hg18.swap
cd /cluster/data/ponAbe2/bed/blastz.hg18.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/hg18/bed/blastzPonAbe2.2007-10-02/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=pk > swap.log 2>&1 &
# real 123m9.197s
cat fb.ponAbe2.chainHg18Link.txt
# 2824501297 bases of 3093572278 (91.302%) in intersection
# create syntenicNets for 7-way conservation maf use
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/hg18/bed/blastzPonAbe2.2007-10-02/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-continue=syntenicNet -syntenicNet \
-swap -bigClusterHub=pk > synNet.log 2>&1 &
# real 91m18.097s
###########################################################################
# BLASTZ SWAP Chimp PanTro2 (DONE - 2007-10-16 - Hiram)
ssh kkstore04
screen # use screen to control this job
# the original blastz
cd /cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13
cat fb.panTro2.chainPonAbe2Link.txt
# 2648603020 bases of 2909485072 (91.033%) in intersection
# and, for the swap
mkdir /cluster/data/ponAbe2/bed/blastz.panTro2.swap
cd /cluster/data/ponAbe2/bed/blastz.panTro2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=kk > swap.log 2>&1 &
# real 160m51.639s
cat fb.ponAbe2.chainPanTro2Link.txt
# 2766615040 bases of 3093572278 (89.431%) in intersection
# real 2597m24.771s
# create syntenicNets for 7-way conservation maf use
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13/DEF \
-continue=syntenicNet -syntenicNet \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=kk > synNet.log 2>&1 &
# real 131m30.045s
###########################################################################
# BLASTZ SWAP Rhesus rheMac2 (DONE - 2007-10-18 - Hiram)
ssh kkstore01
screen # use screen to control this job
cd /cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16
cat fb.rheMac2.chainPonAbe2Link.txt
# 2333264093 bases of 2646704109 (88.157%) in intersection
# and, for the swap
mkdir /cluster/data/ponAbe2/bed/blastz.rheMac2.swap
cd /cluster/data/ponAbe2/bed/blastz.rheMac2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16/DEF \
-swap -chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=kk > swap.log 2>&1 &
# real 289m9.479s
cat fb.ponAbe2.chainRheMac2Link.txt
# 2556010573 bases of 3093572278 (82.623%) in intersection
# create syntenicNets for 7-way conservation maf use
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16/DEF \
-continue=syntenicNet -syntenicNet \
-swap -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -bigClusterHub=kk > synNet.log 2>&1 &
# real 62m10.816s
###########################################################################
# BLASTZ SWAP Opossum monDom4 (DONE - 2007-11-26 - Hiram)
ssh kkstore04
screen # use screen to control this job
# the original blastz
cd /cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22
cat fb.monDom4.chainPonAbe2Link.txt
# 395887393 bases of 3501643220 (11.306%) in intersection
# and for the swap
mkdir /cluster/data/ponAbe2/bed/blastz.monDom4.swap
cd /cluster/data/ponAbe2/bed/blastz.monDom4.swap
time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \
/cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22/DEF \
-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
> swap.log 2>&1 &
# real 223m38.325s
cat fb.ponAbe2.chainMonDom4Link.txt
# 406409435 bases of 3093572278 (13.137%) in intersection
###########################################################################
## PUSHQ entry creation (DONE - 2007-12-12 - Hiram)
ssh hgwdev
mkdir /cluster/data/ponAbe2/pushQ
cd /cluster/data/ponAbe2/pushQ
# had to touch
# goldenPath/ponAbe2/liftOver/ponAbe2ToPonAbe1.over.chain.gz
# to get it to pass that check. (there isn't any ponAbe1 release)
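    # that workaround, roughly (htdocs location as used elsewhere in this doc):
    #   touch /usr/local/apache/htdocs/goldenPath/ponAbe2/liftOver/ponAbe2ToPonAbe1.over.chain.gz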
/cluster/bin/scripts/makePushQSql.pl -noGenbank ponAbe2 \
> ponAbe2.sql 2> stderr.out
## check the stderr.out for anything that needs to be fixed
## copy ponAbe2.sql to hgwbeta:/tmp
scp ponAbe2.sql hgwbeta:/tmp
## then on hgwbeta
ssh hgwbeta
cd /tmp
### XXX remember to set the pushQ id number in the
# last statement in this SQL file
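    # a quick look at the end of the file first, to verify that id (sketch):
    #   tail -3 ponAbe2.sql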
hgsql qapushq < ponAbe2.sql
###########################################################################
# Blastz Chicken galGal3 (DONE - 2007-12-13 - Hiram)
ssh kkstore03
screen # use screen to control this job
mkdir /cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13
cd /cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13
cat << '_EOF_' > DEF
# Orangutan vs. chicken
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50
# TARGET: Orangutan PonAbe2
SEQ1_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
SEQ1_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/hg/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# testing some changes to the script
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
`pwd`/DEF -chainMinScore=5000 \
-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
# real 407m0.026s
# some of the jobs failed due to out of memory on kk nodes
# did a para recover and ran the last set of jobs on pk
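    # that recovery, roughly (standard parasol recover usage):
    #   para recover jobList recoverJobList
    #   # then run recoverJobList on pk with para create/push as usual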
# then, continuing and testing the new memk kluster:
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-continue=cat `pwd`/DEF -chainMinScore=5000 -smallClusterHub=memk \
-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
    # failed at the kolossus step because kkr1u00, which provides some of
    # the filesystems kolossus mounts, was down.  Got that fixed and
    # finished the netChains.csh script manually.
# real 6m2.271s
# Then, continuing:
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-continue=load `pwd`/DEF -chainMinScore=5000 -smallClusterHub=memk \
-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > load.log 2>&1 &
# real 4m21.364s
cat fb.ponAbe2.chainGalGal3Link.txt
# 134448942 bases of 3093572278 (4.346%) in intersection
mkdir /cluster/data/galGal3/bed/blastz.ponAbe2.swap
cd /cluster/data/galGal3/bed/blastz.ponAbe2.swap
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 /cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13/DEF \
-chainMinScore=5000 -chainLinearGap=loose -smallClusterHub=memk \
-swap -bigClusterHub=kk > swap.log 2>&1 &
cat fb.galGal3.chainPonAbe2Link.txt
# 115578761 bases of 1042591351 (11.086%) in intersection
###########################################################################
# Add quality score track (DONE - 2008-03-10 - Hiram)
ssh kkstore02
cd /cluster/data/ponAbe2/wustl
# use mScore=98 since the "Finished" contigs are not in the qual file
time nice -n +19 zcat contigs.fa.qual.gz \
| qaToQac stdin stdout \
| qacAgpLift -mScore=98 ../ponAbe2.agp stdin ponAbe2.qual.qac
# 3m35.075s
mkdir /cluster/data/ponAbe2/bed/qual
cd /cluster/data/ponAbe2/bed/qual
time nice -n +19 qacToWig -fixed \
/cluster/data/ponAbe2/wustl/ponAbe2.qual.qac stdout \
| wigEncode stdin qual.wig qual.wib
# real 21m33.899s
ssh hgwdev
cd /cluster/data/ponAbe2/bed/qual
rm -f /gbdb/ponAbe2/wib/qual.wib
ln -s /cluster/data/ponAbe2/bed/qual/qual.wib /gbdb/ponAbe2/wib/
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/ponAbe2/wib \
ponAbe2 quality qual.wig
# real 1m30.877s
#############################################################################
# N-SCAN gene predictions (nscanGene) - (2008-03-11 markd)
# obtained N-SCAN predictions from Michael Brent's group
# at WUSTL
cd /cluster/data/ponAbe2/bed/nscan/
wget http://mblab.wustl.edu/predictions/orang_utan/ponAbe2/ponAbe2.gtf
wget http://mblab.wustl.edu/predictions/orang_utan/ponAbe2/ponAbe2.prot.fa
wget http://mblab.wustl.edu/predictions/orang_utan/ponAbe2/readme.html
bzip2 ponAbe2.*
chmod a-w *
# load track
gtfToGenePred -genePredExt ponAbe2.gtf.bz2 stdout | hgLoadGenePred -bin -genePredExt ponAbe2 nscanGene stdin
hgPepPred ponAbe2 generic nscanPep ponAbe2.prot.fa.bz2
rm *.tab
# update trackDb; need a ponAbe2-specific page to describe informants
orangutan/ponAbe2/nscanGene.html (copy from readme.html)
orangutan/ponAbe2/trackDb.ra
# set search regex to
termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9]
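# i.e., the search spec in orangutan/ponAbe2/trackDb.ra looks roughly like
# this (a sketch using the usual trackDb search-spec fields):
#   searchTable nscanGene
#   searchType genePred
#   termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9]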
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
# Create quality download files (DONE - 2008-10-14 - Hiram)
cd /hive/data/genomes/ponAbe2/bed/qual
qacToQa /hive/data/genomes/ponAbe2/wustl/ponAbe2.qual.qac ponAbe2.qa.fa
cat << '_EOF_' > splitByName.pl
#!/usr/bin/env perl
# split a multi-record quality fasta (stdin) into one gzipped file
# per sequence, written as qa/<chrom>.qa.fa.gz
use warnings;
use strict;

my $chrom = "";
while (my $line = <>) {
    if ($line =~ m/^>/) {
        # new header: finish the previous chrom file, start the next one
        close FH if (length($chrom) > 0);
        chomp $line;
        $chrom = $line;
        $chrom =~ s/>//;
        $chrom =~ s/\s+//g;
        open (FH, "| gzip -c > qa/$chrom.qa.fa.gz") or
            die "can not write to qa/$chrom.qa.fa.gz";
        print FH ">$chrom\n";
    } else {
        # quality value lines pass through unchanged
        print FH "$line";
    }
}
close (FH);
'_EOF_'
# << happy emacs
chmod +x ./splitByName.pl
mkdir qa
./splitByName.pl < ponAbe2.qa.fa
cd qa
tar cvzf ../ponAbe2.quality.fa.gz ./*.fa.gz
# check byte counts with these files to make sure they are OK
for F in *.fa.gz; do zcat $F; done | wc
# 172337823 3446754951 10512603041
cd ..
wc ponAbe2.qa.fa
# 172337823 3446754951 10512603041 ponAbe2.qa.fa
# identical counts, OK.
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
+
############################################################################
+# calJac3 Marmoset BLASTZ/CHAIN/NET (DONE - 2010-02-11 - Hiram)
+ screen # use a screen to manage this multi-day job
+ mkdir /hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11
+ cd /hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11
+
+ # same kind of parameters as used in human<->marmoset
+ cat << '_EOF_' > DEF
+# Orangutan vs. marmoset
+BLASTZ=lastz
+# maximum M allowed with lastz is only 254
+BLASTZ_M=254
+BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
+# and place those items here
+BLASTZ_O=600
+BLASTZ_E=150
+BLASTZ_K=4500
+BLASTZ_Y=15000
+BLASTZ_T=2
+
+# TARGET: Orangutan PonAbe2
+SEQ1_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
+SEQ1_LEN=/scratch/data/ponAbe2/chrom.sizes
+SEQ1_CHUNK=20000000
+SEQ1_LAP=10000
+SEQ1_LIMIT=5
+
+# QUERY: Marmoset (calJac3)
+SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
+SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
+SEQ2_LIMIT=50
+SEQ2_CHUNK=10000000
+SEQ2_LAP=0
+
+BASE=/hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << this line keeps emacs coloring happy
+
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -syntenicNet \
+ -chainMinScore=5000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ > do.log 2>&1 &
+ # real 329m55.081s
+ cat fb.ponAbe2.chainCalJac3Link.txt
+ # 2086557592 bases of 3093572278 (67.448%) in intersection
+
+ mkdir /hive/data/genomes/calJac3/bed/blastz.ponAbe2.swap
+ cd /hive/data/genomes/calJac3/bed/blastz.ponAbe2.swap
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ /hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11/DEF \
+ -swap -syntenicNet \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 &
+ # real 146m12.301s
+    cat fb.calJac3.chainPonAbe2Link.txt
+ # 1978857628 bases of 2752505800 (71.893%) in intersection
+############################################################################
+