src/hg/makeDb/doc/petMar1.txt 1.21

1.21 2009/07/21 21:01:45 markd
added transmap update blurb
Index: src/hg/makeDb/doc/petMar1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/petMar1.txt,v
retrieving revision 1.20
retrieving revision 1.21
diff -b -B -U 1000000 -r1.20 -r1.21
--- src/hg/makeDb/doc/petMar1.txt	25 Nov 2008 20:22:45 -0000	1.20
+++ src/hg/makeDb/doc/petMar1.txt	21 Jul 2009 21:01:45 -0000	1.21
@@ -1,1737 +1,1746 @@
 # for emacs: -*- mode: sh; -*-
 
 
 # This file describes browser build for the Sea Lamprey
 # genome, Petromyzon marinus, March 2007, v.3.0 from WUSTL
 #
 #       "$Id$"
 #
 
 ##########################################################################
 ### Fetch sequence       (DONE - 2008-01-14 - Hiram)
     #	determine estimated disk usage, from example genome like this one,
     #	braFlo1 which used about 3.5 Gb, do not need much space, store04
     #	has 42 Gb, plenty of space on store9
     ssh kkstore04
     mkdir /cluster/store9/petMar1
     ln -s /cluster/store9/petMar1 /cluster/data/petMar1
     mkdir /cluster/data/petMar1/downloads
     cd /cluster/data/petMar1/downloads
 
     wget --timestamping \
     'ftp://genome.wustl.edu/pub/organism/Other_Vertebrates/Petromyzon_marinus/assembly/Petromyzon_marinus-3.0/output/*'
 
 ##########################################################################
 #  Fixup the agp file to make a chrUn agp file, add a 1000 base gap
 #	between the supercontigs
 
 ##########################################################################
 #  Run the makeGenomeDb.pl script (DONE - 2008-01-27 - Hiram)
     ssh kkstore04
     cd /cluster/data/petMar1/downloads
     #	eliminate zero-length fragments from the AGP file
     zcat supercontigs.agp.gz | awk '
 {
 size=$3-$2
 if (size >= 0) print
 }
 ' | gzip -c > supers.agp.gz
 
     cd /cluster/data/petMar1
     cat << '_EOF_' > petMar1.config.ra
 # Config parameters for makeGenomeDb.pl:
 db petMar1
 clade vertebrate
 genomeCladePriority 130
 scientificName Petromyzon marinus
 commonName Lamprey
 assemblyDate Mar. 2007
 assemblyLabel WUSTL v.3.0 (March 2007)
 orderKey 480
 mitoAcc 5835065
 fastaFiles /cluster/data/petMar1/downloads/supercontigs.fa.gz
 agpFiles /cluster/data/petMar1/downloads/supers.agp.gz
 # qualFiles none
 dbDbSpeciesDir lamprey
 '_EOF_'
 
     #	can stop at db since trackDb and dbDb were created in first
     #	attempt
     makeGenomeDb.pl -stop=db petMar1.config.ra > makeGenome.log 2>&1 &
     twoBitToFa petMar1.unmasked.2bit stdout | faSize stdin
     #	1135497967 bases (303801529 N's 831696438 real 831696438
     #	upper 0 lower) in 2 sequences in 1 files
 
     #	chrM has a gi:5835065 fragment name, I would rather have NC_001626
     hgsql -e 'update gold set frag="NC_001626" where chrom="chrM";' petMar1
 
 ##########################################################################
 #  Pick up an image from the EPA, usage rights are free
 #	http://www.epa.gov/glnpo/image/restrictions.htm
     mkdir /cluster/data/petMar1/photo
     cd /cluster/data/petMar1/photo
     wget --timestamping 'http://www.epa.gov/glnpo/image/vbig/229.jpg' \
 	-O petromyzon_marinus.epa.jpg
 
 ##########################################################################
 #  Running WindowMasker (DONE - 2008-01-26 - Hiram)
     ssh kolossus
     mkdir /cluster/data/petMar1/bed/WindowMasker
     cd /cluster/data/petMar1/bed/WindowMasker
     /cluster/bin/scripts/doWindowMasker.pl \
 	-buildDir /cluster/data/petMar1/bed/WindowMasker \
 	-workhorse=kolossus -verbose=2 petMar1 > wm.log 2>&1 &
     twoBitToFa petMar1.wmsk.sdust.2bit stdout | faSize stdin
     #	1027258967 bases (195562529 N's 831696438 real 494032631 upper
     #	337663807 lower) in 108241 sequences in 1 files
 
     #	%32.87 masked total, %40.60 masked real
 
     ssh hgwdev
     cd /cluster/data/petMar1/bed/WindowMasker
     hgLoadBed petMar1 windowmaskerSdust windowmasker.sdust.bed.gz
     #	Loaded 4938211 elements of size 3
     featureBits petMar1 windowmaskerSdust
     #	533224658 bases of 831696438 (64.113%) in intersection
 
     #	WM has a bug where it masks the gaps too, don't like that,
     #	fetch the WM track without the gaps, can do this in the
     #	table browser with an intersection of WMask AND ! gap
     #	or via featureBits:
     featureBits petMar1 -not gap -bed=notGap.bed
     #	hint: also works with the gold table instead of notGap
     time nice -n +19 featureBits petMar1 windowmaskerSdust notGap.bed \
 	-bed=stdout | gzip -c > cleanWMask.bed.gz
     #	337663807 bases of 831696438 (40.599%) in intersection
     #	real    182m45.469s
     hgLoadBed petMar1 windowmaskerSdust cleanWMask.bed.gz
     #	Loaded 4919722 elements of size 4
     featureBits petMar1 windowmaskerSdust > fb.petMar1.cleanWMask.txt 2>&1
     #	337663807 bases of 831696438 (40.599%) in intersection
  
 ##########################################################################
 #	Running TRF simple repeats (DONE - 2008-01-26 - Hiram)
     ssh kkstore04
     mkdir /cluster/data/petMar1/bed/simpleRepeat
     cd /cluster/data/petMar1/bed/simpleRepeat
     doSimpleRepeat.pl petMar1 -smallClusterHub=memk \
 	-workhorse=kolossus \
 	-buildDir=/cluster/data/petMar1/bed/simpleRepeat > do.log 2>&1 &
     #	ran into a bug with the nice difficulty on the featureBits
     featureBits petMar1 simpleRepeat > fb.petMar1.simpleRepeat.txt 2>&1
     #	75844864 bases of 831696438 (9.119%) in intersection
     doSimpleRepeat.pl petMar1 -smallClusterHub=memk \
 	-continue=cleanup -workhorse=kolossus \
 	-buildDir=/cluster/data/petMar1/bed/simpleRepeat > cleanup.log 2>&1 &
 
 ##########################################################################
 # Repeat Masker (DONE - 2008-01-26 - Hiram)
     ssh kkstore04
     mkdir /cluster/data/petMar1/bed/RepeatMasker
     cd /cluster/data/petMar1/bed/RepeatMasker
     doRepeatMasker.pl petMar1 -bigClusterHub=kk -workhorse=kolossus \
 	-buildDir=/cluster/data/petMar1/bed/RepeatMasker \
 	-verbose=2 > do.log 2>&1
 ##########################################################################
 # Add WindowMasker and trfMask masking to the 2bit sequenct
 #	(DONE - 2008-01-26 - Hiram)
     ssh kkstore04
     cd /cluster/data/petMar1
     gzip bed/simpleRepeat/trfMask.bed
     zcat bed/WindowMasker/cleanWMask.bed.gz bed/simpleRepeat/trfMask.bed.gz \
 	| twoBitMask -type=.bed petMar1.unmasked.2bit stdin petMar1.2bit
     #	ignore the warning about bed file > 13 fields, that is OK
     twoBitToFa petMar1.2bit stdout | faSize stdin
     #	1027258967 bases (195562529 N's 831696438 real 493479824 upper
     #	338216614 lower) in 108241 sequences in 1 files
     #	%32.92 masked total, %40.67 masked real
 
 ############################################################################
 #  BLATSERVERS ENTRY (DONE - 2008-01-15 - Hiram)
 #	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev
 
     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("petMar1", "blat15", "17788", "1", "0"); \
 	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("petMar1", "blat15", "17789", "0", "1");' \
 	    hgcentraltest
     #	test it with some sequence
 
 ############################################################################
 ## Reset default position to 
 #	Where RHO1 protein is found via blat:
 #	GEMSLWSLVV LAIERYIVIC KPMGNFRFGS THAYMGVAFT WFMALSCAAP PLVGWSR
     ssh hgwdev
     hgsql -e 'update dbDb set defaultPos="Contig11034:8429-10197"
 	where name="petMar1";' hgcentraltest
 
 #########################################################################
 # MAKE 11.OOC FILE FOR BLAT (DONE - 2008-01-29 - Hiram)
     # Use -repMatch=200 (based on size -- for human we use 1024, and 
     # medaka size is ~20% of human judging by gapless oryLat1 vs. hg18 
     # genome sizes from featureBits.
     ssh kkstore04
     cd /cluster/data/petMar1
     blat petMar1.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=petMar1.11.ooc -repMatch=200
     #	Wrote 49155 overused 11-mers to petMar1.11.ooc
 
 ##########################################################################
 # GENBANK AUTO UPDATE (DONE - 2008-01-15 - Hiram)
     # align with latest genbank process.
     ssh hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # edit etc/genbank.conf to add petMar1 just after gasAcu1
 
 # petMar1 (Branchiostoma floridae - Lancelet)
 petMar1.serverGenome = /cluster/data/petMar1/petMar1.2bit
 petMar1.clusterGenome = /cluster/bluearc/scratch/data/petMar1/petMar1.2bit
 petMar1.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
 petMar1.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
 petMar1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
 petMar1.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
 petMar1.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
 petMar1.ooc = /cluster/data/petMar1/petMar1.11.ooc
 petMar1.lift = /cluster/data/petMar1/jkStuff/petMar1.nonBridged.lft
 petMar1.refseq.mrna.native.load = yes
 petMar1.refseq.mrna.xeno.load = yes
 petMar1.genbank.mrna.xeno.load = yes
 petMar1.genbank.est.native.load = yes
 petMar1.downloadDir = petMar1
 petMar1.perChromTables = no
 
     cvs ci -m "Added petMar1." etc/genbank.conf
     # update /cluster/data/genbank/:
     make etc-update
 
     # Edit src/lib/gbGenome.c to add new species.
     cvs ci -m "Added Petromyzon marinus (Sea Lamprey)." src/lib/gbGenome.c
     make install-server
 
     ssh genbank
     screen # control this with a screen so you can get back to it
     cd /cluster/data/genbank
     time nice -n +19 bin/gbAlignStep -initial petMar1 &
     #	logFile: var/build/logs/2008.01.29-11:27:10.petMar1.initalign.log
     #	real    332m36.025s
     #	There was some kind of problem with this alignment, although the
     #	kluster run actually finished:
 # Completed: 366520 of 366520 jobs
 # CPU time in finished jobs:    6854057s  114234.28m  1903.90h   79.33d  0.217 y
 # IO & Wait Time:               2300432s   38340.53m   639.01h   26.63d  0.073 y
 # Average job time:                  25s       0.42m     0.01h    0.00d
 # Longest finished job:            1292s      21.53m     0.36h    0.01d
 # Submission to last job:         21121s     352.02m     5.87h    0.24d
     #	So, continuing:
     time nice -n +19 bin/gbAlignStep -continue=finish -initial petMar1 &
     #	logFile: var/build/logs/2008.01.30-09:18:56.petMar1.initalign.log
     #	real    70m20.669s
 
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
     time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad petMar1
     #	logFile: var/dbload/hgwdev/logs/2008.01.15-16:21:10.dbload.log
     #	real    19m34.853s
 
     # enable daily alignment and update of hgwdev (2008-01-30 - Hiram)
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # add petMar1 to:
         etc/align.dbs
         etc/hgwdev.dbs
     cvs ci -m "Added petMar1 - Petromyzon marinus (lamprey)" \
 	etc/align.dbs etc/hgwdev.dbs
     make etc-update
 
 #########################################################################
 # BLASTZ/CHAIN/NET Medaka oryLat1 (DONE - 2008-01-15 - Hiram)
 #	Try 1 failed, run again with contigs
     ssh kkstore04
     screen # use screen to control this job
     mkdir /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
     cd /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
 
     cat << '_EOF_' > DEF
 # Medaka vs. Lamprey
 
 # using the "close" genome alignment parameters
 #	see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
 BLASTZ_Y=9400
 BLASTZ_L=3000
 BLASTZ_K=3000
 BLASTZ_M=50
 
 # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp)
 #       chrUn in Scaffolds for this alignment run
 SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
 SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LIMIT=30
 SEQ1_LAP=10000
 
 # QUERY: Lamprey petMar1
 SEQ2_DIR=/cluster/bluearc/scratch/data/petMar1/petMar1.2bit
 SEQ2_LEN=/cluster/data/petMar1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=30
 SEQ2_LAP=0
 
 BASE=/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << this line keeps emacs coloring happy
 
     time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-chainMinScore=3000 -chainLinearGap=medium \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=memk > do.log 2>&1 &
     #	abandon this run, ended up using the swap from oryLat1 in:
     #	/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
     #	real    1053m9.027s
     #	netChains failed, during chainStitchId:
     #	q end mismatch 1073742062 vs 1111312799 line 510 of stdin
 
 ############################################################################
 #  Create contigs-only masked 2bit file for non-bridged gap free blastz runs
 #	 (DONE - 2008-01-15 - Hiram)
     ssh kkstore04
     mkdir /cluster/data/petMar1/contigSequence
     cd /cluster/data/petMar1/contigSequence
     cp -p /cluster/data/priPac1/jkStuff/lft2BitToFa.pl ../jkStuff
     ../jkStuff/lft2BitToFa.pl ../petMar1.2bit \
 	../jkStuff/petMar1.nonBridged.lft > supercontigs.fa
     #	this took quite a long time, about 6 hours or so
     #	this doesn't have chrM yet
     faSize supercontigs.fa ../M/chrM.fa
     #	1027258967 bases (195562529 N's 831696438 real 493405059 upper
     #	338291379 lower) in 108241 sequences in 2 files
     #	%32.93 masked total, %40.67 masked real
     cat supercontigs.fa ../M/chrM.fa | faToTwoBit stdin petMar1.supers.2bit
     twoBitToFa petMar1.supers.2bit stdout | faSize stdin
     #	1027258967 bases (195562529 N's 831696438 real 493405059 upper
     #	338291379 lower) in 108241 sequences in 1 files
     #	%32.93 masked total, %40.67 masked real
     twoBitInfo petMar1.supers.2bit stdout | sort -k2nr > super.sizes
     cp -p petMar1.supers.2bit /cluster/bluearc/scratch/data/petMar1
     cd ..
     ln -s contigSequence/super.sizes .
 
 #########################################################################
 # BLASTZ/CHAIN/NET Medaka oryLat1 (DONE - 2008-01-16 - 2008-01-30 - Hiram)
 #	Try 2 with contigs for Lamprey
     ssh kkstore04
     screen # use screen to control this job
     mkdir /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
     cd /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
 
     cat << '_EOF_' > DEF
 # Medaka vs. Lamprey
 
 # using the "close" genome alignment parameters
 #	see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
 BLASTZ_Y=9400
 BLASTZ_L=3000
 BLASTZ_K=3000
 BLASTZ_M=50
 
 # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp)
 #       chrUn in Scaffolds for this alignment run
 SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
 SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LIMIT=30
 SEQ1_LAP=10000
 
 # QUERY: Lamprey petMar1
 SEQ2_DIR=/cluster/bluearc/scratch/data/petMar1/petMar1.2bit
 SEQ2_LEN=/cluster/data/petMar1/chrom.sizes
 SEQ2_CTGDIR=/cluster/bluearc/scratch/data/petMar1/petMar1.supers.2bit
 SEQ2_CTGLEN=/cluster/data/petMar1/super.sizes
 SEQ2_LIFT=/cluster/data/petMar1/jkStuff/petMar1.nonBridged.lft
 SEQ2_CHUNK=10000000
 SEQ2_LIMIT=150
 SEQ2_LAP=0
 
 BASE=/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-16
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << this line keeps emacs coloring happy
 
     time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-chainMinScore=3000 -chainLinearGap=medium \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=kk > do.log 2>&1 &
     #	real    218m14.787s
 # Completed: 69774 of 70854 jobs
 # Crashed: 1080 jobs
 # CPU time in finished jobs:    3950133s   65835.55m  1097.26h   45.72d  0.125 y
 # IO & Wait Time:               1218716s   20311.93m   338.53h   14.11d  0.039 y
 # Average job time:                  74s       1.23m     0.02h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            1629s      27.15m     0.45h    0.02d
 # Submission to last job:         12951s     215.85m     3.60h    0.15d
     #	had some parasol bookeeping problems, it actually finished
     #	the kluster run but got confused.  Checked manually with
     #	para recover -> nothing in new job list, and check files in
     #	psl result directory finding the correct number of files for the
     #	the job count.  Make the run.time file and continuing:
     time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-continue=cat -chainMinScore=3000 -chainLinearGap=medium \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=kk > cat.log 2>&1 &
     #	real    7m47.963s
     #	something failed
     #	had the specification to the lft file incorrect, failed during
     #	chain run.  Fix that, finish the run, now continuing:
     time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-continue=chainMerge -chainMinScore=3000 -chainLinearGap=medium \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=kk > chainMerge.log 2>&1 &
     #	had the same failure as the first time test of this alignment.
     #	something is wrong with netChainSubset, needs to be debugged.
     #	ignore the step to create the "over.chain" file, finish the net
     #	step manually, then continuing:
     time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-continue=load -chainMinScore=3000 -chainLinearGap=medium \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=kk > load.log 2>&1 &
     #	real    3m53.686s
     cat fb.oryLat1.chainPetMar1Link.txt
     #	24990733 bases of 700386597 (3.568%) in intersection
 
     #	And, for the swap:
     mkdir /cluster/data/petMar1/bed/blastz.oryLat1.swap
     cd /cluster/data/petMar1/bed/blastz.oryLat1.swap
     time doBlastzChainNet.pl -verbose=2 -swap \
 	/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-16/DEF \
 	-chainMinScore=3000 -chainLinearGap=medium \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=kk > swap.log 2>&1 &
     #	real    1m3.572s
     #	same problem in netChainSubset trying to make the "over.chain" file.
     #	skip that, finish the chains manually:
     #	real    14m35.340s
     #	then continuing:
     time doBlastzChainNet.pl -verbose=2 -swap \
 	/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-16/DEF \
 	-continue=load -chainMinScore=3000 -chainLinearGap=medium \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=kk > load.log 2>&1 &
     cat fb.petMar1.chainOryLat1Link.txt
     #	23358156 bases of 831696438 (2.808%) in intersection
 
     #	create reciprocal best chains/nets for 9-way maf alignments
     ssh hgwdev
     cd /cluster/data/petMar1/bed/blastz.oryLat1.swap
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 oryLat1 \
 	> rbest.log 2>&1 &
     #	real    22m20.900s
 
 #############################################################################
 # BLASTZ SWAP Lamprey - Lanclet (DONE - 2008-04-12 - Hiram)
     ssh kkstore05
     screen	#	control this job with a screen
     #	the original
     cd /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11
     cat fb.braFlo1.chainPetMar1Link.txt
     #	27418217 bases of 923355587 (2.969%) in intersection
 
     #	and the swap
     mkdir /cluster/data/petMar1/bed/blastz.braFlo1.swap
     cd /cluster/data/petMar1/bed/blastz.braFlo1.swap
     time nice -n +19 doBlastzChainNet.pl \
 	/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
     #	real    194m53.514s
     cat fb.petMar1.chainBraFlo1Link.txt
     #	27373264 bases of 831696438 (3.291%) in intersection
 
     #	create reciprocal best chains/nets for 9-way maf alignments
     ssh hgwdev
     cd /cluster/data/petMar1/bed/blastz.braFlo1.swap
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 braFlo1 \
 	> rbest.log 2>&1 &
     #	real    110m25.613s
 
 #############################################################################
 # BLASTZ SWAP Chicken GalGal3 (DONE - 2008-04-15 - Hiram)
     #	the original
     ssh kkstore03
     cd /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11
     cat fb.galGal3.chainPetMar1Link.txt
     #	20134896 bases of 1042591351 (1.931%) in intersection
 
     #	and the swap
     mkdir /cluster/data/petMar1/bed/blastz.galGal3.swap
     cd /cluster/data/petMar1/bed/blastz.galGal3.swap
     time nice -n +19 doBlastzChainNet.pl \
 	/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-swap -bigClusterHub=memk -verbose=2 > swap.log 2>&1 &
     #	real    33m41.587s
     cat fb.petMar1.chainGalGal3Link.txt
     #	22308118 bases of 831696438 (2.682%) in intersection
 
     #	create reciprocal best chains/nets for 9-way maf alignments
     ssh hgwdev
     cd /cluster/data/petMar1/bed/blastz.galGal3.swap
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 galGal3 \
 	> rbest.log 2>&1 &
     #	real    13m33.139s
 
 #############################################################################
 # BLASTZ SWAP Human hg18 (DONE - 2008-01-30 - Hiram)
     ssh kkstore02
     screen # use screen to control this job
     # the original
     cd /cluster/data/hg18/bed/blastzPetMar1.2008-01-29
     cat fb.hg18.chainPetMar1Link.txt
     #	36042598 bases of 2881515245 (1.251%) in intersection
 
     #	That is OK, now for the swap:
     mkdir /cluster/data/petMar1/bed/blastz.hg18.swap
     cd /cluster/data/petMar1/bed/blastz.hg18.swap
     time doBlastzChainNet.pl -verbose=2 -swap \
 	/cluster/data/hg18/bed/blastzPetMar1.2008-01-29/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=pk > swap.log 2>&1 &
     #	real    60m1.928s
     cat  fb.petMar1.chainHg18Link.txt
     #	26751073 bases of 831696438 (3.216%) in intersection
 
     #	create reciprocal best chains/nets for 9-way maf alignments
     ssh hgwdev
     cd /cluster/data/petMar1/bed/blastz.hg18.swap
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 hg18 \
 	> rbest.log 2>&1 &
     #	real    22m34.274s
 
 #############################################################################
 # BLASTZ SWAP Mouse mm9 (DONE - 2008-04-15 - Hiram)
     ssh kkstore06
     screen # use screen to control this job
     #	the original
     cd /cluster/data/mm9/bed/blastzPetMar1.2008-04-14
     cat fb.mm9.chainPetMar1Link.txt
     #	29113438 bases of 2620346127 (1.111%) in intersection
 
     #	That is OK, now for the swap:
     mkdir /cluster/data/petMar1/bed/blastz.mm9.swap
     cd /cluster/data/petMar1/bed/blastz.mm9.swap
     time doBlastzChainNet.pl -verbose=2 -swap \
 	/cluster/data/mm9/bed/blastzPetMar1.2008-04-14/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-qRepeats=windowmaskerSdust -bigClusterHub=pk > swap.log 2>&1 &
     #	real    33m29.076s
     cat  fb.petMar1.chainMm9Link.txt
     #	26052507 bases of 831696438 (3.132%) in intersection
 
     #	create reciprocal best chains/nets for 9-way maf alignments
     ssh hgwdev
     cd /cluster/data/petMar1/bed/blastz.mm9.swap
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 mm9 \
 	> rbest.log 2>&1 &
     #	real    21m9.320s
 
 #############################################################################
 ## 6-Way Multiz (DONE - 2008-04-17 - Hiram)
 ##
     ssh hgwdev
     mkdir /cluster/data/petMar1/bed/multiz6way
     cd /cluster/data/petMar1/bed/multiz6way
     #	take the 30-way tree from mm9 and eliminate genomes not in
     #	this alignment.  Manually add in petMar1 and braFlo1
     #	arbitrarily somewhere outside of the fishes
     #	rearrange to get petMar1 on the top of the graph
     #	paste this tree into the on-line phyloGif tool:
     #	http://genome.ucsc.edu/cgi-bin/phyloGif
     #	to create the image for the tree diagram
 
     #	select the 6 organisms from the 30-way recently done on mouse mm9
     /cluster/bin/phast/tree_doctor \
 	--prune-all-but Human_hg18,Mouse_mm9,Chicken_galGal3,Medaka_oryLat1 \
 	/cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \
 	> 6-way.fullNames.nh
 
     #	looks something like this:
     (((Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757,
 	Chicken_galGal3:0.488824):0.325954,Medaka_oryLat1:1.077873);
 
     #	Adding Lamprey and Lanclet:
 ((Lamprey_petMar1:1.6,
   (((Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757,
    Chicken_galGal3:0.488824):0.325954,Medaka_oryLat1:1.077873):0.4):0.4,
     Lanclet_braFlo1:2.0);
 
 ((Lamprey_petMar1:1.6,
   (Medaka_oryLat1:1.077873,((Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757,
    Chicken_galGal3:0.488824):0.325954):0.4):0.4,
     Lanclet_braFlo1:2.0);
 
 ((Lamprey_petMar1:1.6,
   (Medaka_oryLat1:1.077873,(Chicken_galGal3:0.488824,
     (Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.325954):0.4):0.4,
     Lanclet_braFlo1:2.0);
 
     #	rearrange to get Marmoset at the top:
     # this leaves us with:
     cat << '_EOF_' > petMar1.6-way.nh
 ((Lamprey_petMar1:1.6,
   (Medaka_oryLat1:1.077873,(Chicken_galGal3:0.488824,
     (Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.325954):0.4):0.4,
     Lanclet_braFlo1:2.0);
 '_EOF_'
     #	<< happy emacs
 
     #	create a species list from that file:
     sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' petMar1.6-way.nh \
         | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
         | sed -e "s/.*_//; s/:.*//" | sort > species.list
     #	verify that has 6 db names in it
     # create a stripped down nh file for use in autoMZ run
     echo \
 `sed 's/[a-zA-Z0-9]*_//g; s/:[012].[0-9]*//g; s/[,;]/ /g' petMar1.6-way.nh \
 	| sed -e "s/  / /g"` > tree.6.nh
     #	that looks like, as a single line:
 # ((petMar1 (oryLat1 (galGal3 (mm9 hg18)))) braFlo1)
 
     # verify all blastz's exists
     cat << '_EOF_' > listMafs.csh
 #!/bin/csh -fe
 cd /cluster/data/petMar1/bed/multiz6way
 foreach db (`grep -v petMar1 species.list`)
     set bdir = /cluster/data/petMar1/bed/blastz.$db
     if (-e $bdir/mafRBestNet/petMar1.$db.rbest.maf.gz) then
 	echo "$db mafRBestNet"
     else if (-e $bdir/mafSynNet/petMar1.$db.net.maf.gz) then
 	echo "$db mafSynNet"
     else if (-e $bdir/mafNet/petMar1.$db.net.maf.gz) then
 	echo "$db mafNet"
     else
 	echo "$db mafs not found"
     endif
 end
 '_EOF_'
     # << happy emacs
     chmod +x ./listMafs.csh
     #	see what it says, the "mafs not found" should only show up on petMar1
     ./listMafs.csh
 # braFlo1 mafRBestNet
 # galGal3 mafRBestNet
 # hg18 mafRBestNet
 # mm9 mafRBestNet
 # oryLat1 mafRBestNet
 
     /cluster/bin/phast/all_dists petMar1.6-way.nh > 6way.distances.txt
     grep -i petmar 6way.distances.txt | sort -k3,3n
 Lamprey_petMar1 Chicken_galGal3 2.814778
 Lamprey_petMar1 Human_hg18      2.923612
 Lamprey_petMar1 Medaka_oryLat1  3.077873
 Lamprey_petMar1 Mouse_mm9       3.122529
 Lamprey_petMar1 Lanclet_braFlo1 4.000000
 
     #	use the calculated
     #	distances in the table below to order the organisms and check
     #	the button order on the browser.  Zebrafish ends up before
     #	tetraodon and fugu on the browser despite its distance.
     #	And if you can fill in the table below entirely, you have
     #	succeeded in finishing all the alignments required.
     #
 #                         featureBits chainLink measures
 #                                       chainCalJac1Link   chain   linearGap
 #                     distance    on PetMar1    on other   minScore
 #  1  Chicken_galGal3 2.814778     (% 2.682)    (% 1.931)   5000      loose
 #  2  Human_hg18      2.923612     (% 3.216)    (% 1.251)   5000      loose
 #  3  Medaka_oryLat1  3.077873     (% 2.808)    (% 3.568)   3000      medium
 #  4  Mouse_mm9       3.122529     (% 3.132)    (% 1.111)   5000      loose
 #  5  Lanclet_braFlo1 4.000000     (% 3.291)    (% 2.969)   5000      loose
 
     # copy net mafs to cluster-friendly storage, splitting chroms
     mkdir mafLinks
     cd mafLinks
     ln -s ../../blastz.*.swap/mafRBestNet/*.rbest.maf.gz .
 
     #	need to split these things up by Contig number for efficient kluster run
     ssh kkstore04
     mkdir -p /san/sanvol1/scratch/petMar1/multiz6way/contigMaf
     cd /scratch/tmp
     #	the 16201 is from petMar/chrom.sizes
     echo "chrM 0 16201" > chrM.bed
 
     for D in `grep -v petMar1 /cluster/data/petMar1/bed/multiz6way/species.list`
 do
     echo ${D}
     zcat \
 /cluster/data/petMar1/bed/multiz6way/mafLinks/petMar1.${D}.rbest.maf.gz \
 	> ${D}.maf
     mkdir /scratch/tmp/${D}
     cd /scratch/tmp/${D}
     mafSplit -verbose=2 /dev/null -byTarget -useSequenceName Contig \
 	../${D}.maf -outDirDepth=2
     mafsInRegion ../chrM.bed 0/0/chrM.maf ../${D}.maf
     rsync -a --progress ./ \
 	/san/sanvol1/scratch/petMar1/multiz6way/contigMaf/${D}
     cd /scratch/tmp
     rm -fr ${D} ${D}.maf
 done
     #	create a run-time list of contigs to operate on, not all contigs
     #	exist in all alignments, but we want all contig names used in any
     #	alignment:
     cd /san/sanvol1/scratch/petMar1/multiz6way/contigMaf
     for D in *
 do
     cd "${D}"
     find . -type f
     cd ..
 done | sort -u > /tmp/6-way.contig.list
     wc -l /tmp/6-way.contig.list
     mkdir /cluster/data/petMar1/bed/multiz6way/splitRun
     cp -p /tmp/6-way.contig.list \
 	/cluster/data/petMar1/bed/multiz6way/splitRun
     #	31488 /tmp/6-way.contig.list
 
     # ready for the multiz run
     ssh pk
     cd /cluster/data/petMar1/bed/multiz6way/splitRun
     mkdir -p maf run
     cd run
     mkdir penn
     # use latest penn utilities
     P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
     cp -p $P/{autoMZ,multiz,maf_project} penn
 
     #	set the db and pairs directories here
     cat > autoMultiz.csh << '_EOF_'
 #!/bin/csh -ef
 set db = petMar1
 set subdir = $1
 set c = $2
 set result = $3
 set resultDir = $result:h
 set run = `pwd`
 set tmp = /scratch/tmp/$db/multiz.$c
 set pairs = /san/sanvol1/scratch/$db/multiz6way/contigMaf
 rm -fr $tmp
 mkdir -p $tmp
 mkdir -p $resultDir
 cp ../../tree.6.nh ../../species.list $tmp
 pushd $tmp
 foreach s (`grep -v $db species.list`)
     set in = $pairs/$s/$subdir/$c.maf
     set out = $db.$s.sing.maf
     if (-e $in.gz) then
 	zcat $in.gz > $out
     else if (-e $in) then
 	cp $in $out
     else
 	echo "##maf version=1 scoring=autoMZ" > $out
     endif
 end
 set path = ($run/penn $path); rehash
 $run/penn/autoMZ + T=$tmp E=$db "`cat tree.6.nh`" $db.*.sing.maf $c.maf
 popd
 cp $tmp/$c.maf $result
 rm -fr $tmp
 rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
 '_EOF_'
 # << happy emacs
     chmod +x autoMultiz.csh
 
     cat  << '_EOF_' > template
 #LOOP
 ./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/petMar1/bed/multiz6way/splitRun/maf/$(dir1)/$(root1).maf}
 #ENDLOOP
 '_EOF_'
     # << emacs
 
     sed -e "s/^\.\///" ../6-way.contig.list \
 	| gensub2 stdin single template jobList
 
     para create jobList
     para try ... check ... push ... etc
 # Completed: 31488 of 31488 jobs
 # CPU time in finished jobs:       5723s      95.38m     1.59h    0.07d  0.000 y
 # IO & Wait Time:                 84949s    1415.82m    23.60h    0.98d  0.003 y
 # Average job time:                   3s       0.05m     0.00h    0.00d
 # Longest finished job:              12s       0.20m     0.00h    0.00d
 # Submission to last job:           900s      15.00m     0.25h    0.01d
 # Estimated complete:                 0s       0.00m     0.00h    0.00d
 
     # put the split maf results back together into a single maf file
     #	eliminate duplicate comments
     ssh kkstore04
     cd /cluster/data/petMar1/bed/multiz6way
     mkdir togetherMaf
     grep "^##maf version" splitRun/maf/0/0/Contig00000.maf \
 	| sort -u > togetherMaf/petMar1.6way.maf
     for F in `find ./splitRun/maf -type f -depth`
 do
     grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \
 	| sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g"
 done | sort -u >> togetherMaf/petMar1.6way.maf
     for F in `find ./splitRun/maf -type f -depth`
 do
     grep -v -h "^#" "${F}"
 done >> togetherMaf/petMar1.6way.maf
 
     grep "^##eof maf" splitRun/maf/0/0/Contig00000.maf \
 	| sort -u >> togetherMaf/petMar1.6way.maf
 
     # load tables for a look
     ssh hgwdev
     mkdir -p /gbdb/petMar1/multiz6way/maf
     ln -s /cluster/data/petMar1/bed/multiz6way/togetherMaf/*.maf \
                 /gbdb/petMar1/multiz6way/maf/multiz6way.maf
     # this generates an immense multiz6way.tab file in the directory
     #	where it is running.  Best to run this over in scratch.
     cd /scratch/tmp
     time nice -n +19 hgLoadMaf \
 	-pathPrefix=/gbdb/petMar1/multiz6way/maf petMar1 multiz6way
     #	real    0m6.011s
     #	Loaded 232205 mafs in 1 files from /gbdb/petMar1/multiz6way/maf
 
     # load summary table *NOT* - no Contigs are long enough to create
     #	anything for the summary table.  This command produces an empty
     #	table.
     time nice -n +19 cat /gbdb/petMar1/multiz6way/maf/*.maf \
 	| hgLoadMafSummary petMar1 -minSize=30000 -mergeGap=1500 \
 	 -maxSize=200000  multiz6waySummary stdin
     #	real    5m58.150s
 
     # Gap Annotation
     # prepare bed files with gap info
     ssh kkstore04
     mkdir /cluster/data/petMar1/bed/multiz6way/anno
     cd /cluster/data/petMar1/bed/multiz6way/anno
     mkdir maf run
 
     #	these actually already all exist from previous multiple alignments
     #	if there are done you will see an ls of them
     #	or else the twoBitInfo command will be echoed, run it if you want
     for DB in `cat ../species.list`
 do
     CDIR="/cluster/data/${DB}"
     if [ ! -f ${CDIR}/${DB}.N.bed ]; then
 	echo "creating ${DB}.N.bed"
 	echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
     else
 	ls -og ${CDIR}/${DB}.N.bed
     fi
 done
 
     cd run
     rm -f nBeds sizes
     for DB in `grep -v petMar1 ../../species.list`
 do
     echo "${DB} "
     ln -s  /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
     echo ${DB}.bed  >> nBeds
     ln -s  /cluster/data/${DB}/chrom.sizes ${DB}.len
     echo ${DB}.len  >> sizes
 done
 
     ssh memk
     #	temporarily copy the petMar1.6way.maf file onto the memk
     #	nodes /scratch/data/petMar1/maf/ directory
     for R in 0 1 2 3 4 5 6 7
 do
     ssh mkr0u${R} mkdir /scratch/data/petMar1/maf
     ssh mkr0u${R} rsync -a --progress \
 /cluster/data/petMar1/bed/multiz6way/togetherMaf/petMar1.6way.maf \
 	/scratch/data/petMar1/maf/
 done
     mkdir /cluster/data/petMar1/bed/multiz6way/anno/splitMaf
     #	need to split up the single maf file into individual
     #	per-scaffold maf files to run annotation on
     cd /cluster/data/petMar1/bed/multiz6way/anno/splitMaf
     #	create bed files to list approximately 1553 scaffolds in
     #	a single list, approximately 33 lists
     cat << '_EOF_' > mkBedLists.pl
 #!/usr/bin/env perl
 
 use strict;
 use warnings;
 
 my $bedCount = 0;
 my $i = 0;
 
 my $bedFile = sprintf("file_%d.bed", $bedCount);
 
 open (BF,">$bedFile") or die "can not write to $bedFile $!";
 
 open (FH,"</cluster/data/petMar1/chrom.sizes") or
         die "can not read /cluster/data/petMar1/chrom.sizes $!";
 while (my $line = <FH>) {
     chomp $line;
     if ( (($i + 1) % 1553) == 0 )  {
         printf "%s\n", $line;
         close (BF);
         ++$bedCount;
         $bedFile = sprintf("file_%d.bed", $bedCount);
         open (BF,">$bedFile") or die "can not write to $bedFile $!";
     }
     ++$i;
     my ($chr, $size) = split('\s+',$line);
     printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr;
 }
 close (FH);
 close (BF);
 '_EOF_'
     # << happy emacs
     chmod +x mkBedLists.pl
     ./mkBedLists.pl
 
     #	now, run a mafsInRegion on each one of those lists
     cat << '_EOF_' > runOne
 #!/bin/csh -fe
 set runDir = "/cluster/data/petMar1/bed/multiz6way/anno/splitMaf"
 set resultDir = $1
 set bedFile = $resultDir.bed
 mkdir -p $resultDir
 mkdir -p /scratch/tmp/petMar1/$resultDir
 pushd /scratch/tmp/petMar1/$resultDir
 mafsInRegion $runDir/$bedFile -outDir . \
         /scratch/data/petMar1/maf/petMar1.6way.maf
 popd
 rsync -q -a /scratch/tmp/petMar1/$resultDir/ ./$resultDir/
 rm -fr /scratch/tmp/petMar1/$resultDir
 rmdir --ignore-fail-on-non-empty /scratch/tmp/petMar1
 '_EOF_'
     # << happy emacs
     chmod +x runOne
 
     cat << '_EOF_' > template
 #LOOP
 ./runOne $(root1)
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     ls file*.bed > runList
     gensub2 runList single template jobList
     para create jobList
     para try ... check ... push ... etc
 # Completed: 70 of 70 jobs
 # CPU time in finished jobs:        255s       4.24m     0.07h    0.00d  0.000 y
 # IO & Wait Time:                  1904s      31.74m     0.53h    0.02d  0.000 y
 # Average job time:                  31s       0.51m     0.01h    0.00d
 # Longest finished job:              72s       1.20m     0.02h    0.00d
 # Submission to last job:           143s       2.38m     0.04h    0.00d
 
     cd /cluster/data/petMar1/bed/multiz6way/anno/run
 
     cat << '_EOF_' > doAnno.csh
 #!/bin/csh -ef
 set outDir = ../maf/$2
 set result = $3
 set input = $1
 mkdir -p $outDir
 cat $input | \
 nice mafAddIRows -nBeds=nBeds stdin /scratch/data/petMar1/petMar1.2bit $result
 '_EOF_'
     # << happy emacs
     chmod +x doAnno.csh
 
     cat << '_EOF_' > template
 #LOOP
 ./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     find ../splitMaf -type f -name "*.maf" > maf.list
     gensub2 maf.list single template jobList
     para create jobList
     para try ... check ... push ... etc.
 # Completed: 31488 of 31488 jobs
 # CPU time in finished jobs:      10711s     178.52m     2.98h    0.12d  0.000 y
 # IO & Wait Time:                 79430s    1323.83m    22.06h    0.92d  0.003 y
 # Average job time:                   3s       0.05m     0.00h    0.00d
 # Longest finished job:              12s       0.20m     0.00h    0.00d
 # Submission to last job:          3237s      53.95m     0.90h    0.04d
 
     ssh kkstore04
     #	put them all back together into a single file
     cd /cluster/data/petMar1/bed/multiz6way/anno
     grep "^##maf version" maf/file_0/Contig0.maf \
 	| sort -u > petMar1.anno.6way.maf
     find ./maf -type f -depth -name "*.maf" | while read F
 do
     grep -v -h "^#" "${F}"
 done >> petMar1.anno.6way.maf
     echo "##eof maf" >> petMar1.anno.6way.maf
 XXX - running 2008-04-23 09:58
     ls -og *.maf
 # -rw-rw-r--   1 285783792 Apr 23 09:58 petMar1.anno.6way.maf
 
     ssh hgwdev
     cd /cluster/data/petMar1/bed/multiz6way/anno
     mkdir -p /gbdb/petMar1/multiz6way/anno
     ln -s `pwd`/petMar1.anno.6way.maf \
                 /gbdb/petMar1/multiz6way/anno/multiz6way.maf
     #	by loading this into the table multiz6way, it will replace the
     #	previously loaded table with the unannotated mafs
     #	huge temp files are made, do them on local disk
     cd /scratch/tmp
     time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/petMar1/multiz6way/anno \
                 petMar1 multiz6way
     #	Loaded 296783 mafs in 1 files from /gbdb/petMar1/multiz6way/anno
     #	real    0m6.690s
 
     #	normally filter this for chrom size > 1,000,000 and only load
     #	those chroms.  But this is a scaffold assembly, load everything.
     #	*NOT* Does not work on petMar1, all contigs are too small,
     #	this makes an empty table.
     hgLoadMafSummary petMar1 -minSize=30000 -mergeGap=1500 \
 	-maxSize=200000  multiz6waySummary \
 	    /gbdb/petMar1/multiz6way/anno/multiz6way.maf
     #	Created 121083 summary blocks from 3410157 components and 749940 mafs
     #	from /gbdb/petMar1/multiz6way/anno/multiz6way.maf
 
     #	by loading this into the table multiz6waySummary, it will replace
     #	the previously loaded table with the unannotated mafs
     #	remove the multiz6way*.tab files in this /scratch/tmp directory
     rm multiz6way*.tab
     #	And, you can remove the previously loaded non-annotated maf file link:
     rm /gbdb/petMar1/multiz6way/maf/multiz6way.maf
     rmdir /gbdb/petMar1/multiz6way/maf
 
 ###########################################################################
 # HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-04-19)
 
 # (auto) establish native host of /cluster/data/<assembly>
     ssh kkstore04
     # bash  if not using bash shell already
 
     mkdir /cluster/data/petMar1/blastDb
     cd /cluster/data/petMar1
     awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
     if test -s 1meg.lst; then
 	twoBitToFa -seqList=1meg.lst  petMar1.unmasked.2bit temp.fa
 	faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
 	rm temp.fa 
     fi
     rm 1meg.lst
 
     awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
     if test -s less1meg.lst; then
 	twoBitToFa -seqList=less1meg.lst  petMar1.unmasked.2bit temp.fa
 	faSplit about temp.fa 1000000 blastDb/y 
 	rm temp.fa 
     fi
     rm less1meg.lst
 
     cd blastDb
     for i in *.fa
     do
 	/cluster/bluearc/blast229/formatdb -i $i -p F
     done
     rm *.fa
     ls *.nsq | wc -l
 # 1018
 
     mkdir -p /san/sanvol1/scratch/petMar1/blastDb
     cd /cluster/data/petMar1/blastDb
     for i in nhr nin nsq; 
     do 
 	echo $i
 	cp *.$i /san/sanvol1/scratch/petMar1/blastDb
     done
 
     mkdir -p /cluster/data/petMar1/bed/tblastn.hg18KG
     cd /cluster/data/petMar1/bed/tblastn.hg18KG
     echo  /san/sanvol1/scratch/petMar1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
     wc -l query.lst
 
 # 1018 query.lst
 
    # we want around 100,000 jobs per gig (0.0001 jobs per base)
    numJobs=`awk '{sum += $2} END {print sum * 0.0001 }' /cluster/data/petMar1/chrom.sizes`
 
    # we want around numJobs jobs
    numGenes=`wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`
    numQueries=`wc query.lst | awk '{print $1}'`
    numKGFiles=`awk -v ng=$numGenes -v nq=$numQueries -v nj=$numJobs 'BEGIN {printf "%d", ng/(nj/nq);exit}' `
 
    mkdir -p /cluster/bluearc/petMar1/bed/tblastn.hg18KG/kgfa
    split -l $numKGFiles /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/petMar1/bed/tblastn.hg18KG/kgfa/kg
    ln -s /cluster/bluearc/petMar1/bed/tblastn.hg18KG/kgfa kgfa
    cd kgfa
    for i in *; do 
      nice pslxToFa $i $i.fa; 
      rm $i; 
      done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut
    ln -s /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cd /cluster/data/petMar1/bed/tblastn.hg18KG
    cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
 #ENDLOOP
 '_EOF_'
 
    cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/cluster/bluearc/blast229/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
     if /cluster/bin/i386/blastToPsl $f.1 $f.2
     then
 	if test -s /cluster/data/petMar1/blastDb.lft
 	then
 	    liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/petMar1/blastDb.lft carry $f.2
 	else
 	    mv $f.2 $f.3
 	fi
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
         if pslCheck -prot $3.tmp
         then                  
             mv $3.tmp $3     
             rm -f $f.1 $f.2 $f.3 $f.4
         fi
         exit 0               
     fi                      
 fi                         
 rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
 exit 1
 '_EOF_'
     # << happy emacs
     chmod +x blastSome
     gensub2 query.lst kg.lst blastGsub blastSpec
     exit 
     
     ssh pk
     cd /cluster/data/petMar1/bed/tblastn.hg18KG
     para create blastSpec
 #    para try, check, push, check etc.
 
     para time
 
 # Completed: 103836 of 103836 jobs
 # CPU time in finished jobs:    9418283s  156971.39m  2616.19h  109.01d  0.299 y
 # IO & Wait Time:                867856s   14464.26m   241.07h   10.04d  0.028 y
 # Average job time:                  99s       1.65m     0.03h    0.00d
 # Longest finished job:             683s      11.38m     0.19h    0.01d
 # Submission to last job:         38789s     646.48m    10.77h    0.45d
 
     ssh kkstore05
     cd /cluster/data/petMar1/bed/tblastn.hg18KG
     mkdir chainRun
     cd chainRun
     tcsh
     cat << '_EOF_' > chainGsub
 #LOOP
 chainOne $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainOne
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
 '_EOF_'
     chmod +x chainOne
     ls -1dS /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
     # do the cluster run for chaining
     ssh memk
     cd /cluster/data/petMar1/bed/tblastn.hg18KG/chainRun
     para create chainSpec
     para try, check, push, check etc.
 
 # Completed: 102 of 102 jobs
 # CPU time in finished jobs:       4354s      72.57m     1.21h    0.05d  0.000 y
 # IO & Wait Time:                 70795s    1179.91m    19.67h    0.82d  0.002 y
 # Average job time:                 737s      12.28m     0.20h    0.01d
 # Longest finished job:            1308s      21.80m     0.36h    0.02d
 # Submission to last job:          2927s      48.78m     0.81h    0.03d
 
     ssh kkstore04
     cd /cluster/data/petMar1/bed/tblastn.hg18KG/blastOut
     for i in kg??
     do
        cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
        echo $i
     done
     sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/petMar1/bed/tblastn.hg18KG/blastHg18KG.psl
     cd ..
     pslCheck blastHg18KG.psl
 
     # load table 
     ssh hgwdev
     cd /cluster/data/petMar1/bed/tblastn.hg18KG
     hgLoadPsl petMar1 blastHg18KG.psl
 
     # check coverage
     featureBits petMar1 blastHg18KG 
 # 7171010 bases of 831696438 (0.862%) in intersection
 
     featureBits petMar1 xenoRefGene:cds blastHg18KG  -enrichment
 # xenoRefGene:cds 0.676%, blastHg18KG 0.862%, both 0.362%, cover 53.51%, enrich 62.06x 
 
     ssh kkstore05
     rm -rf /cluster/data/petMar1/bed/tblastn.hg18KG/blastOut
     rm -rf /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut
 #end tblastn
 
 ###########################################################################
 ## Annotate 6-way multiple alignment with gene annotations
 ##		(DONE - 2008-01-08 - Hiram)
     # Gene frames
     ## given previous survey done for 8-way alignment on Orangutan,
     ## try using the following tables for this gene annotation:
     #	hg18 is in a bit of transition at the moment, so use ensGene
     #	mm9 can use knownGene
     #	xenoMrna for braFlo1, petMar1
     #	ensGene v49 for galGal3, oryLat1
 
     ssh hgwdev
     mkdir /cluster/data/petMar1/bed/multiz6way/frames
     cd /cluster/data/petMar1/bed/multiz6way/frames
     mkdir genes
     # knownGene
     for DB in mm9
 do
     hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/${DB}.tmp.gz
     mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
     echo "${DB} done"
 done
     # ensGene
     for DB in hg18 galGal3 oryLat1
 do
     hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/${DB}.tmp.gz
     mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
     echo "${DB} done"
 done
 
     #	use xenoMrna for braFlo1, petMar1
     for DB in braFlo1 petMar1
 do
 tmpExt=`mktemp temp.XXXXXX`
 tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
 tmpMrna=${DB}.mrna.${tmpExt}
 tmpCds=${DB}.cds.${tmpExt}
 hgsql -N -e 'select xenoMrna.qName,cds.name,xenoMrna.* \
 	   from xenoMrna,gbCdnaInfo,cds \
 	   where (xenoMrna.qName = gbCdnaInfo.acc) and \
 	     (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
 $DB > ${tmpMrnaCds}
 cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
 cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
 mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
 genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
 rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
 mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
 rm -f $tmpExt
 echo "${DB} done"
 done
 
     ls -og genes
 # -rw-rw-r--  1 1040642 Apr 28 14:30 braFlo1.gp.gz
 # -rw-rw-r--  1 1557911 Apr 28 14:29 galGal3.gp.gz
 # -rw-rw-r--  1 2151412 Apr 28 14:29 hg18.gp.gz
 # -rw-rw-r--  1 1965274 Apr 28 14:28 mm9.gp.gz
 # -rw-rw-r--  1 1713692 Apr 28 14:29 oryLat1.gp.gz
 # -rw-rw-r--  1  596014 Apr 28 14:31 petMar1.gp.gz
 
     ssh kkstore04
     cd /cluster/data/petMar1/bed/multiz6way/frames
     #	hg18 is in a bit of transition at the moment, so use ensGene
     #	mm9 can use knownGene
     #	xenoMrna for braFlo1, petMar1
     #	ensGene v49 for galGal3, oryLat1
     #	anything to annotate is in a pair, e.g.: petMar1 genes/petMar1.gp.gz
     time (cat  ../anno/petMar1.anno.6way.maf | nice -n +19 genePredToMafFrames petMar1 stdin stdout petMar1 genes/petMar1.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz galGal3 genes/galGal3.gp.gz oryLat1 genes/oryLat1.gp.gz braFlo1 genes/braFlo1.gp.gz | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1
     # see what it looks like in terms of number of annotations per DB:
     zcat multiz6way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
     #	 37711 braFlo1
     #	 55189 petMar1
     #	 75566 oryLat1
     #	 95715 galGal3
     #	107885 mm9
     #	110835 hg18
 
     #	load the resulting file
     ssh hgwdev
     cd /cluster/data/petMar1/bed/multiz6way/frames
     time nice -n +19 hgLoadMafFrames petMar1 multiz6wayFrames \
 	multiz6way.mafFrames.gz
     #	real    0m7.652s
 
     #	enable the trackDb entries:
 # frames multiz6wayFrames
 # irows on
 #############################################################################
 # phastCons 6-way (DONE - 2008-04-28 - Hiram)
 
     # split 6way mafs into 10M chunks and generate sufficient statistics 
     # files for # phastCons
     ssh memk
     mkdir /cluster/data/petMar1/bed/multiz6way/msa.split
     cd /cluster/data/petMar1/bed/multiz6way/msa.split
     mkdir -p /san/sanvol1/scratch/petMar1/multiz6way/cons/ss
 
     cat << '_EOF_' > doSplit.csh
 #!/bin/csh -ef
 set MAFS = /cluster/data/petMar1/bed/multiz6way/anno/maf
 set WINDOWS = /san/sanvol1/scratch/petMar1/multiz6way/cons/ss
 pushd $WINDOWS
 set resultDir = $1
 set c = $2
 rm -fr $resultDir/$c
 mkdir -p $resultDir
 twoBitToFa -seq=$c /scratch/data/petMar1/petMar1.2bit /scratch/tmp/petMar1.$c.fa
 /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$resultDir/$c.maf -i MAF \
     -M /scratch/tmp/petMar1.$c.fa \
     -o SS -r $resultDir/$c -w 10000000,0 -I 1000 -B 5000
 rm -f /scratch/tmp/petMar1.$c.fa
 popd
 mkdir -p $resultDir
 date > $resultDir/$c.out
 '_EOF_'
     # << happy emacs
     chmod +x doSplit.csh
 
     cat << '_EOF_' > template
 #LOOP
 doSplit.csh $(dir1) $(root1) {check out line+ $(dir1)/$(root1).out}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     #	create list of maf files:
     (cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list
 
     gensub2 maf.list single template jobList
     para create jobList
     para try ... check ... etc
 # Completed: 31488 of 31488 jobs
 # CPU time in finished jobs:       2289s      38.15m     0.64h    0.03d  0.000 y
 # IO & Wait Time:                 82557s    1375.95m    22.93h    0.96d  0.003 y
 # Average job time:                   3s       0.04m     0.00h    0.00d
 # Longest finished job:              40s       0.67m     0.01h    0.00d
 # Submission to last job:          3052s      50.87m     0.85h    0.04d
 
     # take the cons and noncons trees from the mouse 30-way
 
     #	Estimates are not easy to make, probably more correctly,
     #	take the 30-way .mod file, and re-use it here.
     ssh hgwdev
     cd /cluster/data/petMar1/bed/multiz6way
     cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod .
 
     # Run phastCons
     #	This job is I/O intensive in its output files, thus it is all
     #	working over in /scratch/tmp/
     ssh memk
     mkdir -p /cluster/data/petMar1/bed/multiz6way/cons/run.cons
     cd /cluster/data/petMar1/bed/multiz6way/cons/run.cons
 
     #	there are going to be several different phastCons runs using
     #	this same script.  They trigger off of the current working directory
     #	$cwd:t which is the "grp" in this script.  It is one of:
     #	all gliers placentals
     #	Well, that's what it was when used in the Mm9 30-way,
     #	in this instance, there is only the directory "all"
 
     cat << '_EOF_' > doPhast.csh
 #!/bin/csh -fe
 set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin
 set subDir = $1
 set f = $2
 set c = $2:r
 set len = $3
 set cov = $4
 set rho = $5
 set grp = $cwd:t
 set tmp = /scratch/tmp/$f
 set cons = /cluster/data/petMar1/bed/multiz6way/cons
 mkdir -p $tmp
 set san = /san/sanvol1/scratch/petMar1/multiz6way/cons
 if (-s $cons/$grp/$grp.non-inf) then
   cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
   cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
 else
   cp -p $cons/$grp/$grp.mod $tmp
   cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $tmp
 endif
 pushd $tmp > /dev/null
 if (-s $grp.non-inf) then
   $PHASTBIN/phastCons $f.ss $grp.mod \
     --rho $rho --expected-length $len --target-coverage $cov --quiet \
     --not-informative `cat $grp.non-inf` \
     --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
 else
   $PHASTBIN/phastCons $f.ss $grp.mod \
     --rho $rho --expected-length $len --target-coverage $cov --quiet \
     --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
 endif
 popd > /dev/null
 mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir
 sleep 4
 touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir
 rm -f $san/$grp/pp/$subDir/$f.pp
 rm -f $san/$grp/bed/$subDir/$f.bed
 mv $tmp/$f.pp $san/$grp/pp/$subDir
 mv $tmp/$f.bed $san/$grp/bed/$subDir
 rm -fr $tmp
 '_EOF_'
     # << happy emacs
     chmod a+x doPhast.csh
 
     cat << '_EOF_' > template
 #LOOP
 ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/petMar1/multiz6way/cons/all/pp/$(root1)/$(file1).pp}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     # Create parasol batch and run it
     pushd /san/sanvol1/scratch/petMar1/multiz6way/cons
     find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \
 	> /cluster/data/petMar1/bed/multiz6way/cons/ss.list
     popd
 
     # run for all species
     cd ..
     mkdir -p all run.cons/all
     cd all
     /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin/tree_doctor \
     ../../mm9.30way.mod \
 --prune-all-but=petMar1,hg18,galGal3,mm9,oryLat1,braFlo1 \
 	> all.mod
     #	Actually, this doesn't work because petMar1 and braFlo1 are not
     #	in that business.  So, manually place them in the TREE: line in all.mod,
     #	from the manufactured tree above, and the numbers from mm9.30way.mod:
 
 # TREE: ((petMar1:1.6,(oryLat1:1.077873,(galGal3:0.488824,(mm9:0.325818,hg18:0.136019):0.470757):0.325954):0.4):0.4,braFlo1:2.0);
 
     cd ../run.cons/all
 
     #	root1 == chrom name, file1 == ss file name without .ss suffix
     # Create template file for "all" run
     cat << '_EOF_' > template
 #LOOP
 ../doPhast.csh $(lastDir1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/petMar1/multiz6way/cons/all/pp/$(lastDir1)/$(file1).pp}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
     gensub2 ../../ss.list single template jobList
     para create jobList
     para try ... check ... push ... etc.
 # Completed: 10471 of 10471 jobs
 # CPU time in finished jobs:        991s      16.52m     0.28h    0.01d  0.000 y
 # IO & Wait Time:                 68810s    1146.83m    19.11h    0.80d  0.002 y
 # Average job time:                   7s       0.11m     0.00h    0.00d
 # Longest finished job:              10s       0.17m     0.00h    0.00d
 # Submission to last job:          4767s      79.45m     1.32h    0.06d
 
     # create Most Conserved track
     ssh kolossus
     cd /san/sanvol1/scratch/petMar1/multiz6way/cons/all
     find ./bed -type f -name "Contig*.bed" | xargs cat \
 	| sort -k1,1 -k2,2n | \
         awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
             /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
     #	~ 3 minutes
     cp -p mostConserved.bed /cluster/data/petMar1/bed/multiz6way/cons/all
 
     # load into database
     ssh hgwdev
     cd /cluster/data/petMar1/bed/multiz6way/cons/all
     time nice -n +19 hgLoadBed petMar1 phastConsElements6way mostConserved.bed
     #	Loaded 76016 elements of size 5
 
     # Try for 5% overall cov, and 70% CDS cov 
     #	We don't have any gene tracks to compare CDS coverage
     #	--rho .31 --expected-length 45 --target-coverage .3
     featureBits petMar1 phastConsElements6way
     #	23909522 bases of 831696438 (2.875%) in intersection
 
     # Create merged posterier probability file and wiggle track data files
     # currently doesn't matter where this is performed, the san is the same
     # network distance from all machines.
     # sort by chromName, chromStart so that items are in numerical order 
     #  for wigEncode
     cd /san/sanvol1/scratch/petMar1/multiz6way/cons/all
     mkdir -p phastCons6wayScores
 
 for D in `ls -1d pp/file* | sort -t_ -k2n`
 do
     F=${D/pp\/}
     out=phastCons6wayScores/${F}.data.gz
     echo "${D} > ${F}.data.gz"
     ls -S ${D}/*.pp | xargs cat | gzip > ${out}
 done
 
     #	copy the phastCons6wayScores to:
 # /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way/phastConsScores
     #	for hgdownload downloads
 
     # Create merged posterier probability file and wiggle track data files
     # currently doesn't matter where this is performed, the san is the same
     # network distance from all machines.
     cd /san/sanvol1/scratch/petMar1/multiz6way/cons/all
     ls -1 phastCons6wayScores/*.data.gz | sort -t_ -k2n | xargs zcat \
 	| wigEncode stdin phastCons6way.wig phastCons6way.wib
     # Converted stdin, upper limit 1.00, lower limit 0.00
     time nice -n +19 cp -p *.wi? /cluster/data/petMar1/bed/multiz6way/cons/all
     #	real    0m1.729s
 
     # Load gbdb and database with wiggle.
     ssh hgwdev
     cd /cluster/data/petMar1/bed/multiz6way/cons/all
     ln -s `pwd`/phastCons6way.wib /gbdb/petMar1/multiz6way/phastCons6way.wib
     time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/petMar1/multiz6way petMar1 \
 	phastCons6way phastCons6way.wig
     #	real    0m2.068s
     # remove garbage
     rm wiggle.tab
 
     #  Create histogram to get an overview of all the data
     ssh hgwdev
     cd /cluster/data/petMar1/bed/multiz6way/cons/all
     time nice -n +19 hgWiggle -doHistogram \
 	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
 	    -db=petMar1 phastCons6way > histogram.data 2>&1
     #	real    5m0.608s
 
     #	create plot of histogram:
 
     cat << '_EOF_' | gnuplot > histo.png
 set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
 set size 1.4, 0.8
 set key left box
 set grid noxtics
 set grid ytics
 set title " Lamprey PetMar1 Histogram phastCons6way track"
 set xlabel " phastCons6way score"
 set ylabel " Relative Frequency"
 set y2label " Cumulative Relative Frequency (CRF)"
 set y2range [0:1]
 set y2tics
 set yrange [0:0.02]
 
 plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
         "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
 '_EOF_'
     #	<< happy emacs
 
     display histo.png &
 
     #	These trackDb entries turn on the wiggle phastCons data track:
     #	type wigMaf 0.0 1.0
     #	maxHeightPixels 100:40:11
     #	wiggle phastCons6way
     #	spanList 1
     #	autoScaleDefault Off
     #	windowingFunction mean
     #	pairwiseHeight 12
     #	yLineOnOff Off
 
 #############################################################################
 #  Downloads (DONE - 2008-04-29 - Hiram)
     #	Let's see if the downloads will work
     ssh hgwdev
     cd /cluster/data/petMar1
     #	expecting to find repeat masker .out file here:
     ln -s bed/RepeatMasker/petMar1.fa.out .
     gunzip bed/simpleRepeat/trfMask.bed.gz
     time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \
 	-workhorse=hgwdev petMar1 > jkStuff/downloads.log 2>&1
     #	real    7m13.080s
 
     #	failed making upstream sequences:
     #	featureBits calJac1 mgcGenes:upstream:1000 -fa=stdout
     #	setpriority: Permission denied.
     #	the 'nice' from my bash shell causes trouble inside the csh
     #	script which uses nice.  Finish off the install step manually
     #	with the mgcGenes upstreams ...
 
 #############################################################################
 #  PushQ entries (DONE - 2008-04-29 - Hiram)
     ssh hgwdev
     /cluster/data/petMar1
     /cluster/bin/scripts/makePushQSql.pl petMar1 > jkStuff/pushQ.sql
     #	output warnings:
 # petMar1 does not have seq
 # Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
 # and genbank tables) which tracks to assign these tables to:
 #	gbWarn
 #	multiz6wayFrames
 #	phastCons6way
     scp -p jkStuff/pushQ.sql hgwbeta:/tmp/petMar1.sql
     ssh hgwbeta
     cd /tmp
     hgsql qapushq < petMar1.sql
 
 #############################################################################
 #  Create 6-way downloads (DONE - 2008-05-01 - Hiram)
     ssh hgwdev
     mkdir -p /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way
     cd /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way
     #	if these haven't been copied here from before:
     cp -p \
 /san/sanvol1/scratch/petMar1/multiz6way/cons/all/phastCons6wayScores/* .
     ln -s ../../cons/all/all.mod ./6way.mod
     cp /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way/README.txt .
     # edit that README.txt to be correct for this 6-way alignment
     cd ..
     mkdir multiz6way
     cd multiz6way
     cp -p /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way/README.txt .
     # edit that README.txt to be correct for this 6-way alignment
     ssh kkstore04
     cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way
     ln -s ../../petMar1.6-way.nh ./6way.nh
 
     time nice -n +19 gzip -c ../../anno/petMar1.anno.6way.maf \
 	> petMar1.6way.maf.gz
     #	real    0m49.861s
 
     ssh hgwdev
     cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way
     #	creating upstream files from xenoRefGene, bash script:
     cat << '_EOF_' > mkUpstream.sh
 #!/bin/bash
 DB=petMar1
 GENE=xenoRefGene
 NWAY=multiz6way
 export DB GENE
 
 for S in 1000 2000 5000
 do
     echo "making upstream${S}.maf"
     featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
         | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
         | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
                 stdin stdout \
                 -orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \
         | gzip -c > upstream${S}.maf.gz
     echo "done upstream${S}.maf.gz"
 done
 '_EOF_'
     # << happy emacs
     chmod +x ./mkUpstream.sh
     time nice -n +19 ./mkUpstream.sh
     #	real    2m26.254s
 # -rw-rw-r--  1    61861 May  1 16:10 upstream1000.maf.gz
 # -rw-rw-r--  1   104203 May  1 16:11 upstream2000.maf.gz
 # -rw-rw-r--  1   195382 May  1 16:12 upstream5000.maf.gz
 
     #	check the names in these upstream files to ensure sanity:
     zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \
 	| sort | uniq -c | sort -rn | less
     #	should be a list of the other 4 species with a high count,
     #	then xenoRefGene names, e.g.:
     #	206 oryLat1
     #	206 mm9
     #	206 hg18
     #	206 galGal3
     #	206 braFlo1
     #	  8 NM_054045
     #	  7 NM_013550
     #	  5 NM_069752
 
     ssh kkstore04
     cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way
     md5sum *.maf.gz > md5sum.txt
     cd ../phastCons6way
     md5sum *.data.gz *.mod > md5sum.txt
 
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/petMar1/multiz6way
     mkdir /usr/local/apache/htdocs/goldenPath/petMar1/phastCons6way
     cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way
     ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/petMar1/multiz6way
     rm /usr/local/apache/htdocs/goldenPath/petMar1/multiz6way/mkUpstream.sh 
     cd ../phastCons6way
     ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/petMar1/phastCons6way
     #	if your ln -s `pwd`/* made extra links to files you don't want there,
     #	check the goldenPath locations and remove those extra links
 
 #############################################################################
 ############################################################################
 # TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
 
 see doc/builds.txt for specific details.
 ############################################################################
 ############################################################################
 # TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
 
 see doc/builds.txt for specific details.
 
 ############################################################################
 # SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-27 - Hiram
     ssh kkstore01	# not too important since everything moved to hive
     screen # use screen to control this job
     cd /cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26
     cat fb.oryLat2.chainPetMar1Link.txt
     #	41568923 bases of 700386597 (5.935%) in intersection
 
     #	And, for the swap:
     mkdir /cluster/data/petMar1/bed/blastz.oryLat2.swap
     cd /cluster/data/petMar1/bed/blastz.oryLat2.swap
     time doBlastzChainNet.pl -verbose=2 -swap \
 	/cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26/DEF \
 	-chainMinScore=3000 -chainLinearGap=medium \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-smallClusterHub=pk -bigClusterHub=kk > swap.log 2>&1 &
     #	real    52m27.942s
     cat fb.petMar1.chainOryLat2Link.txt
     #	39181422 bases of 831696438 (4.711%) in intersection
 
 ############################################################################
 
 ################################################
 # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
 update genbank.conf:
 petMar1.upstreamGeneTbl = xenoRefGene
 petMar1.upstreamMaf = multiz6way /hive/data/genomes/petMar1/bed/multiz6way/species.list
 
 ############################################################################
 # QUALITY TRACK (DONE - 2008-11-25 - Hiram)
     cd /hive/data/genomes/petMar1/downloads
     qaToQac contigs.fa.quals.gz assembly.qac
     qacAgpLift ../petMar1.agp assembly.qac scaffolds.qac
     mkdir /hive/data/genomes/petMar1/bed/qual
     cd /hive/data/genomes/petMar1/bed/qual
     qacToWig -fixed ../../downloads/scaffolds.qac stdout \
         | wigEncode stdin qual.wig qual.wib
     ln -s `pwd`/qual.wib /gbdb/petMar1/wib
     hgLoadWiggle -pathPrefix=/gbdb/petMar1/wib petMar1 quality qual.wig
 
+############################################################################
+# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)
+
+vertebrate-wide transMap alignments were built  Tracks are created and loaded
+by a single Makefile. This is available from:
+   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
+
+see doc/builds.txt for specific details.
+############################################################################