src/hg/makeDb/doc/braFlo1.txt 1.19

1.19 2009/11/25 21:48:38 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/braFlo1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/braFlo1.txt,v
retrieving revision 1.18
retrieving revision 1.19
diff -b -B -U 1000000 -r1.18 -r1.19
--- src/hg/makeDb/doc/braFlo1.txt	17 Oct 2008 01:06:30 -0000	1.18
+++ src/hg/makeDb/doc/braFlo1.txt	25 Nov 2009 21:48:38 -0000	1.19
@@ -1,1527 +1,1527 @@
 # for emacs: -*- mode: sh; -*-
 
 
 # This file describes browser build for the Lancelet
 # genome, Branchiostoma floridae, March 2006, v.1.0 from JGI
 #
 #       "$Id$"
 #
 
 ##########################################################################
 ### Fetch sequence       (DONE - 2007-03-26 - Hiram)
     ssh kkstore05
     mkdir /cluster/store12/braFlo1
     ln -s /cluster/store12/braFlo1 /cluster/data/braFlo1
     mkdir /cluster/data/braFlo1/downloads
     cd /cluster/data/braFlo1/downloads
     wget --timestamping \
 'ftp://ftp.jgi-psf.org/pub/JGI_data/Branchiostoma_floridae/v1.0/Branchiostoma.fasta.gz'
     wget --timestamping \
 'ftp://ftp.jgi-psf.org/pub/JGI_data/Branchiostoma_floridae/v1.0/proteins.Brafl1.fasta.gz'
     wget --timestamping \
 'ftp://ftp.jgi-psf.org/pub/JGI_data/Branchiostoma_floridae/v1.0/transcripts.Brafl1.fasta.gz'
     gunzip Branchiostoma.fasta.gz
     scaffoldFaToAgp Branchiostoma.fasta
 
 ##########################################################################
 ## Set up initial database (DONE - 2007-03-26 - Hiram)
     cd /cluster/data/braFlo1
     cat << '_EOF_' > braFlo1.config.ra
 # Config parameters for makeGenomeDb.pl:
 db braFlo1
 clade deuterostome
 genomeCladePriority 20
 scientificName Branchiostoma floridae
 commonName Lancelet
 assemblyDate Mar. 2006
 assemblyLabel JGI v.1.0 (March 2006)
 orderKey 799
 mitoAcc 5881414
 fastaFiles /cluster/data/braFlo1/downloads/Branchiostoma.fasta
 agpFiles /cluster/data/braFlo1/downloads/Branchiostoma.agp
 # qualFiles none
 dbDbSpeciesDir lancelet
 '_EOF_'
     time nice -n +19 makeGenomeDb.pl braFlo1.config.ra > makeGenomeDb.out 2>&1
 
     hgsql -e 'INSERT INTO chrM_gold
 (bin, chrom, chromStart, chromEnd, ix, type, frag, fragStart, fragEnd, strand)
 VALUES (585, "chrM", 0, 15083, 1, "F", "NC_000834.1", 0, 15083, "+");' braFlo1
 
     ## done with this sequence
     cd downloads
     gzip Branchiostoma.fasta
 
 ##########################################################################
 ## RepeatMasker (DONE - 2007-03-26 - Hiram)
     ssh kkstore05
     mkdir /cluster/data/braFlo1/bed/RepeatMasker
     cd /cluster/data/braFlo1/bed/RepeatMasker
     time nice -n +10 doRepeatMasker.pl braFlo1 \
 	-buildDir=/cluster/data/braFlo1/bed/RepeatMasker > do.log 2>&1 &
     # about 11 minutes
     cd /cluster/data/braFlo1
     twoBitToFa braFlo1.rmsk.2bit stdout | faSize -showPercent stdin
     #	926386587 bases (95184565 N's 831202022 real
     #	806238402 upper 24963620 lower) in 2 sequences in 1 files
     #	twoBitToFa braFlo1.rmsk.2bit stdout | faSize -showPercent stdin
     #	%2.69 masked total, %3.00 masked real
 
 #########################################################################
 # SIMPLE REPEATS (TRF) (DONE 2007-01-22 - Hiram)
     ssh kolossus
     mkdir /cluster/data/braFlo1/bed/simpleRepeat
     cd /cluster/data/braFlo1/bed/simpleRepeat
     for C in chrM chrUn
 do
     mkdir /scratch/tmp/braFlo1.trf
     twoBitToFa -seq="${C}" ../../braFlo1.unmasked.2bit \
 	/scratch/tmp/braFlo1.trf/${C}.fa
     time nice -n 19 trfBig -trf=/cluster/bin/i386/trf \
 	/scratch/tmp/braFlo1.trf/${C}.fa /dev/null \
 	-bedAt=${C}.bed -tempDir=/scratch/tmp/braFlo1.trf > ${C}.log 2>&1
     rm -fr /scratch/tmp/braFlo1.trf
     echo ${C} done
 done
     #	real    187m42.772s
     #	no repeats found on chrM
     # Make a filtered version for sequence masking:
     awk '{if ($5 <= 12) print;}' chrUn.bed > trfMask.bed
 
   # Load unfiltered repeats into the database:
     ssh hgwdev
     hgLoadBed braFlo1 simpleRepeat \
       /cluster/data/braFlo1/bed/simpleRepeat/chrUn.bed \
       -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
 
 ################################################
 ## WINDOWMASKER (DONE - 2007-03-26 - Hiram)
     ssh kkstore05
     mkdir /cluster/data/braFlo1/bed/WindowMasker
     cd /cluster/data/braFlo1/bed/WindowMasker
     time nice -n +19 ~/kent/src/hg/utils/automation/doWindowMasker.pl braFlo1 \
       -buildDir=/cluster/data/braFlo1/bed/WindowMasker \
 	-workhorse=kolossus > wmRun.log 2>&1 &
     #	real    82m55.387s
 
     # Masking statistics
     twoBitToFa braFlo1.wmsk.sdust.2bit stdout | faSize stdin
     #	926386587 bases (95184565 N's 831202022 real
     #	611909619 upper 219292403 lower) in 2 sequences in 1 files
     #	%23.67 masked total, %26.38 masked real
 
     ssh hgwdev
     time nice -n +19 \
 	hgLoadBed -strict braFlo1 windowmaskerSdust windowmasker.sdust.bed.gz
     #	Loaded 4263263 elements of size 3
 
     #	eliminate the gaps from the masking (WM bug)
     featureBits braFlo1 -not gap -bed=notGap.bed
     #	831202022 bases of 831202022 (100.000%) in intersection
  
     time nice -n +19 featureBits braFlo1 windowmaskerSdust notGap.bed \
         -bed=stdout | gzip -c > cleanWMask.bed.gz
     #	219292403 bases of 831202022 (26.383%) in intersection
 
     #	reload track to get it clean
     hgLoadBed braFlo1 windowmaskerSdust cleanWMask.bed.gz
     #	Loaded 4255796 elements of size 4
     featureBits braFlo1 windowmaskerSdust
     #	219292403 bases of 831202022 (26.383%) in intersection
 
 ##############################################################################
 ## combine trf mask with windowmasker (DONE - 2008-05-23 - Hiram)
     ssh kkstore05
     cd /cluster/data/braFlo1
     zcat bed/WindowMasker/cleanWMask.bed.gz \
 	| twoBitMask braFlo1.unmasked.2bit stdin \
 	    -type=.bed braFlo1.cleanWMSdust.2bit
     cat bed/simpleRepeat/trfMask.bed \
       | twoBitMask -add -type=.bed braFlo1.cleanWMSdust.2bit stdin braFlo1.2bit
     #	can safely ignore the warning about BED file >= 13 fields
     #	check it
     twoBitToFa braFlo1.2bit stdout | faSize stdin
     #	926386587 bases (95184565 N's 831202022 real 611655540 upper
     #	219546482 lower) in 2 sequences in 1 files
     #	%23.70 masked total, %26.41 masked real
 
     #	create gbdb symlink
     ssh hgwdev
     ln -s /cluster/data/braFlo1/braFlo1.2bit /gbdb/braFlo1
 
 #########################################################################
 ## Create masked scaffolds-only 2bit file (DONE - 2007-03-26 - Hiram)
     ssh kkstore05
     mkdir /cluster/data/braFlo1/scaffolds
     cp -p /cluster/data/oryLat1/jkStuff/lft2BitToFa.pl \
 	/cluster/data/braFlo1/jkStuff
     cp -p /cluster/data/oryLat1/jkStuff/agpToLift.pl \
 	/cluster/data/braFlo1/jkStuff
     cd /cluster/data/braFlo1/scaffolds
     ln -s ../Un/chrUn.agp .
     ../jkStiff/agpToLift.pl chrUn.agp > braFlo1.lift
     ../jkStuff/lft2BitToFa.pl ../braFlo1.2bit braFlo1.lift \
 	> chrUn.scaffolds.fa
     twoBitToFa ../braFlo1.2bit chrM.fa
     faToTwoBit chrUn.scaffolds.fa chrM.fa braFlo1UnScaffolds.2bit
     twoBitInfo braFlo1UnScaffolds.2bit stdout | sort -k2nr \
 	> braFlo1UnScaffolds.sizes
     cp -p braFlo1UnScaffolds.sizes braFlo1UnScaffolds.2bit braFlo1.lift \
 	 /san/sanvol1/scratch/braFlo1
 
 #########################################################################
 ## Reload rmsk to take care of empty chrM (DONE - 2007-03-26 - Hiram)
     ssh hgwdev
     cd /cluster/data/braFlo1/bed/RepeatMasker
     hgLoadOut -table=rmsk -nosplit braFlo1 braFlo1.fa.out
     hgsql -e "drop table chrUn_rmsk;" braFlo1
     #	before:
     featureBits braFlo1 rmsk
     #	24973684 bases of 923355587 (2.705%) in intersection
     #	after:
     featureBits braFlo1 rmsk
     #	24973684 bases of 923355587 (2.705%) in intersection
  
 #########################################################################
 ## BLASTZ HUMAN HG18 (DONE - 2007-03-26 - Hiram)
     # measurement on hg18
     ssh kkstore02
     cd /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26
     cat fb.hg18.chainBraFlo1Link.txt
     #	26455595 bases of 2881515245 (0.918%) in intersection
 
     # and now the swap
     mkdir /cluster/data/braFlo1/bed/blastz.hg18.swap
     cd /cluster/data/braFlo1/bed/blastz.hg18.swap
     time doBlastzChainNet.pl -chainMinScore=2000 -chainLinearGap=loose \
 	/cluster/data/hg18/bed/blastz.braFlo1.2007-03-26/DEF \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=pk -verbose=2 \
 	-swap > swap.log 2>&1 &
     #	real    83m46.258s
     cat fb.braFlo1.chainHg18Link.txt
     #	30912893 bases of 923355587 (3.348%) in intersection
 
     #	create reciprocal best chains/nets for 5-way maf alignments
     ssh hgwdev
     cd /cluster/data/braFlo1/bed/blastz.hg18.swap
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 hg18 \
     time nice -n +19 $HOME/kent/src/hg/utils/automation/doRecipBest.pl \
 	braFlo1 hg18 > rbest.log 2>&1 &
     #	real    34m14.843s
     #	real    32m11.695s
     #	This did not work right.  Something went haywire somewhere:
 # Error: braFlo1 rbest net coverage 14190116 != hg18 14270477
 
 #########################################################################
 # MAKE 11.OOC FILE FOR BLAT (DONE - 2007-01-18 - Hiram)
     # Use -repMatch=328 (based on size -- for human we use 1024, and 
     # lancelet size is ~32% of human judging by gapless braFlo1 vs. hg18 
     # genome sizes from featureBits.
     ssh kolossus
     cd /cluster/data/braFlo1
     blat braFlo1.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=jkStuff/11.ooc -repMatch=328
     #	Wrote 9536 overused 11-mers to jkStuff/11.ooc
     cp -p jkStuff/11.ooc /san/sanvol1/scratch/braFlo1
 
 #########################################################################
 # GENBANK AUTO UPDATE (DONE - 2007-03-26,27 - Hiram)
     # align with latest genbank process.
     ssh hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # edit etc/genbank.conf to add braFlo1 just after gasAcu1
 
 # braFlo1 (Branchiostoma floridae - Lancelet)
 braFlo1.serverGenome = /cluster/data/braFlo1/braFlo1.2bit
 braFlo1.clusterGenome = /san/sanvol1/scratch/braFlo1/braFlo1.2bit
 braFlo1.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
 braFlo1.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
 braFlo1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
 braFlo1.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
 braFlo1.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
 braFlo1.ooc = /san/sanvol1/scratch/braFlo1/11.ooc
 braFlo1.lift = /cluster/data/braFlo1/jkStuff/liftAll.lft
 braFlo1.refseq.mrna.native.load = no
 braFlo1.refseq.mrna.xeno.load = yes
 braFlo1.genbank.mrna.xeno.load = yes
 braFlo1.genbank.est.native.load = yes
 braFlo1.downloadDir = braFlo1
 braFlo1.perChromTables = no
 
     cvs ci -m "Added braFlo1." etc/genbank.conf
     # update /cluster/data/genbank/:
     make etc-update
 
     # Edit src/lib/gbGenome.c to add new species.
     cvs ci -m "Added Branchiostoma floridae (lancelet)." src/lib/gbGenome.c
     make install-server
 
     ssh kkstore05
     cd /cluster/data/genbank
     time nice -n +19 bin/gbAlignStep -initial braFlo1 &
     #	logFile: var/build/logs/2007.03.26-21:58:34.braFlo1.initalign.log
     #	real    230m8.123s
 
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
     time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad braFlo1
     #	logFile: var/dbload/hgwdev/logs/2007.03.27-07:52:23.dbload.log
     #	real    6m57.557s
     ## re-load to get the est track after adding that to the .conf file
     #	var/dbload/hgwdev/logs/2007.03.29-17:09:35.dbload.log
     #	real    19m2.168s
 
     # enable daily alignment and update of hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # add braFlo1 to:
         etc/align.dbs
         etc/hgwdev.dbs
     cvs ci -m "Added braFlo1 - Branchiostoma floridae (lancelet)" \
 	etc/align.dbs etc/hgwdev.dbs
     make etc-update
 
 ############################################################################
 ## Adding a photograph (DONE - 2007-03-28 - Hiram)
 ## From Hector Escriva: hector.escriva@obs-banyuls.fr
 # Dr. Clawson
 # All the european teams working with amphioxus have changed to B.
 # lanceolatum species and none of us is still working with B. floridae. We
 # are able here to induce spawnings everyday (on B. lanceolatum) and this
 # is impossible with the american species which limits very much the
 # amount of experiments that one can perform. So I have pictures of B.
 # lanceolatum, which is morphologically the same as floridae (even me I
 # cannot distinghish them). So I send you here a picture of a B.
 # lanceolatum juvenile in which we clearly see the main morphological
 # aspects of this animal. You should however write the correct name of the
 # species in order to be serious.
 # I would also ask you to write in "Photo courtesy of", the name of my
 # technicien who did the picture and myself. Michael Fuentes and Hector Escriva.
 # Let me know if you agree with this picture.
 # Sincerely Hector Escriva
     mkdir /cluster/data/braFlo1/photograph
     cd /cluster/data/braFlo1/photograph
     convert -sharpen 0 -normalize -geometry 350x200 \
 	PostmetamorphicLarvae.jpg Branchiostoma_lanceolatum.jpg
 
 ############################################################################
 #  BLATSERVERS ENTRY (DONE - 2007-04-04 - Hiram)
 #	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev
 
     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("braFlo1", "blat11", "17784", "1", "0"); \
 	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("braFlo1", "blat11", "17785", "0", "1");' \
 	    hgcentraltest
     #	test it with some sequence
 
 ############################################################################
 ## Default position set at IFG-1 (DONE - 2007-04-09 - Hiram)
     ssh hgwdev
     hgsql -e 'update dbDb set defaultPos="chrUn:619013791-619054719"
 	where name="braFlo1";' hgcentraltest
 
 ############################################################################
 #  BLASTZ petMar1 Lamprey (DONE - 2008-04-11 - Hiram)
     ssh kkstore05
     screen	# use a screen to control this multi-day job
     mkdir /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11
     cd /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11
 
     cat << '_EOF_' > DEF
 # Lancelet vs Lamprey
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Lancelet braFlo1 - largest chunk big enough for largest scaffold
 #       Largest scaffold 7,200,735 - 3032 scaffolds + chrM
 SEQ1_DIR=/san/sanvol1/scratch/braFlo1/braFlo1.2bit
 SEQ1_LEN=/san/sanvol1/scratch/braFlo1/chrom.sizes
 SEQ1_CTGDIR=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.2bit
 SEQ1_CTGLEN=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.sizes
 SEQ1_LIFT=/san/sanvol1/scratch/braFlo1/braFlo1.lift
 SEQ1_CHUNK=10000000
 SEQ1_LIMIT=50
 SEQ1_LAP=10000
 
 # QUERY: Lamprey petMar1
 SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
 SEQ2_LEN=/cluster/data/petMar1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=200
 SEQ2_LAP=0
 
 BASE=/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl \
 	/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=kk -verbose=2 > do.log 2>&1 &
 # Completed: 78590 of 78590 jobs
 # CPU time in finished jobs:   10056466s  167607.76m  2793.46h  116.39d  0.319 y
 # IO & Wait Time:               6633099s  110551.65m  1842.53h   76.77d  0.210 y
 # Average job time:                 212s       3.54m     0.06h    0.00d
 # Longest finished job:            4572s      76.20m     1.27h    0.05d
 # Submission to last job:        107694s    1794.90m    29.91h    1.25d
     #	continuing after various interruptions and finished the batch manually
     time nice -n +19 doBlastzChainNet.pl \
 	/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
     #	real    12m43.792s
     cat fb.braFlo1.chainPetMar1Link.txt
     #	27418217 bases of 923355587 (2.969%) in intersection
 
     #	create reciprocal best chains/nets for 5-way maf alignments
     ssh hgwdev
     cd /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 petMar1 \
 	> rbest.log 2>&1 &
     #	real    30m11.990s
 
     #	and the swap
     mkdir /cluster/data/petMar1/bed/blastz.braFlo1.swap
     cd /cluster/data/petMar1/bed/blastz.braFlo1.swap
     time nice -n +19 doBlastzChainNet.pl \
 	/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
     #	real    194m53.514s
     cat fb.petMar1.chainBraFlo1Link.txt
     #	27373264 bases of 831696438 (3.291%) in intersection
 
 ##########################################################################
 ## BLASTZ Chicken GalGal3 swap (DONE - 2008-04-16 - Hiram)
     #	the original
     ssh kkstore03
     cd /cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15
     cat fb.galGal3.chainBraFlo1Link.txt
     #	19795687 bases of 1042591351 (1.899%) in intersection
 
     #	and the swap
     mkdir /cluster/data/braFlo1/bed/blastz.galGal3.swap
     cd /cluster/data/braFlo1/bed/blastz.galGal3.swap
     time nice -n +19 doBlastzChainNet.pl \
 	/cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-swap -bigClusterHub=pk -verbose=2 > swap.log 2>&1 &
     #	real    4m45.054s
     cat fb.braFlo1.chainGalGal3Link.txt
     #	30287175 bases of 923355587 (3.280%) in intersection
 
     #	create reciprocal best chains/nets for 5-way maf alignments
     ssh hgwdev
     cd /cluster/data/braFlo1/bed/blastz.galGal3.swap
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 galGal3 \
 	> rbest.log 2>&1 &
     #	real    24m30.834s
 
 ###########################################################################
 # HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-04-21)
 
 # (auto) establish native host of /cluster/data/<assembly>
     df /cluster/data/braFlo1
     ssh kkstore05
     # bash  if not using bash shell already
 
     mkdir /cluster/data/braFlo1/blastDb
     cd /cluster/data/braFlo1
     awk '{if ($2 > 1000000) print $1}' scaffolds/braFlo1UnScaffolds.sizes > 1meg.lst
     if test -s 1meg.lst; then
 	twoBitToFa -seqList=1meg.lst  scaffolds/braFlo1UnScaffolds.2bit temp.fa
 	faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
 	rm temp.fa 
     fi
     rm 1meg.lst
 
     awk '{if ($2 <= 1000000) print $1}' scaffolds/braFlo1UnScaffolds.sizes > less1meg.lst
     if test -s less1meg.lst; then
 	twoBitToFa -seqList=less1meg.lst  scaffolds/braFlo1UnScaffolds.2bit temp.fa
 	faSplit about temp.fa 1000000 blastDb/y 
 	rm temp.fa 
     fi
     rm less1meg.lst
 
     cd blastDb
     for i in *.fa
     do
 	/cluster/bluearc/blast229/formatdb -i $i -p F
     done
     rm *.fa
     ls *.nsq | wc -l
 # 1063
 
     mkdir -p /san/sanvol1/scratch/braFlo1/blastDb
     cd /cluster/data/braFlo1/blastDb
     for i in nhr nin nsq; 
     do 
 	echo $i
 	cp *.$i /san/sanvol1/scratch/braFlo1/blastDb
     done
 
     mkdir -p /cluster/data/braFlo1/bed/tblastn.hg18KG
     cd /cluster/data/braFlo1/bed/tblastn.hg18KG
     echo  /san/sanvol1/scratch/braFlo1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
     wc -l query.lst
 
 # 1063 query.lst
 
    # we want around 100,000 jobs per gig (0.0001 jobs per base)
    numJobs=`awk '{sum += $2} END {print sum * 0.0001 }' /cluster/data/braFlo1/chrom.sizes`
 
    # we want around numJobs jobs
    numGenes=`wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`
    numQueries=`wc query.lst | awk '{print $1}'`
    numKGFiles=`awk -v ng=$numGenes -v nq=$numQueries -v nj=$numJobs 'BEGIN {printf "%d", ng/(nj/nq);exit}' `
 
    mkdir -p /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/kgfa
    split -l $numKGFiles /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/kgfa/kg
    ln -s /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/kgfa kgfa
    cd kgfa
    for i in *; do 
      nice pslxToFa $i $i.fa; 
      rm $i; 
      done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut
    ln -s /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cd /cluster/data/braFlo1/bed/tblastn.hg18KG
    cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
 #ENDLOOP
 '_EOF_'
 
    cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/cluster/bluearc/blast229/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
     if /cluster/bin/i386/blastToPsl $f.1 $f.2
     then
 	if test -s /cluster/data/braFlo1/blastDb.lft
 	then
 	    liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/braFlo1/blastDb.lft carry $f.2
 	else
 	    mv $f.2 $f.3
 	fi
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
         if pslCheck -prot $3.tmp
         then                  
             mv $3.tmp $3     
             rm -f $f.1 $f.2 $f.3 $f.4
         fi
         exit 0               
     fi                      
 fi                         
 rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
 exit 1
 '_EOF_'
     # << happy emacs
     chmod +x blastSome
     gensub2 query.lst kg.lst blastGsub blastSpec
     exit 
     
     ssh pk
     cd /cluster/data/braFlo1/bed/tblastn.hg18KG
     para create blastSpec
 #    para try, check, push, check etc.
 
     para time
 
 # Completed: 93544 of 93544 jobs
 # CPU time in finished jobs:    8499504s  141658.40m  2360.97h   98.37d  0.270 y
 # IO & Wait Time:                669830s   11163.84m   186.06h    7.75d  0.021 y
 # Average job time:                  98s       1.63m     0.03h    0.00d
 # Longest finished job:             376s       6.27m     0.10h    0.00d
 # Submission to last job:         44890s     748.17m    12.47h    0.52d
 
     ssh kkstore05
     cd /cluster/data/braFlo1/bed/tblastn.hg18KG
     mkdir chainRun
     cd chainRun
     tcsh
     cat << '_EOF_' > chainGsub
 #LOOP
 chainOne $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainOne
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
 '_EOF_'
     chmod +x chainOne
     ls -1dS /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
     # do the cluster run for chaining
     ssh memk
     cd /cluster/data/braFlo1/bed/tblastn.hg18KG/chainRun
     para create chainSpec
     para try, check, push, check etc.
 
 # Completed: 88 of 88 jobs
 # CPU time in finished jobs:        906s      15.09m     0.25h    0.01d  0.000 y
 # IO & Wait Time:                 19435s     323.92m     5.40h    0.22d  0.001 y
 # Average job time:                 231s       3.85m     0.06h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:             322s       5.37m     0.09h    0.00d
 # Submission to last job:           812s      13.53m     0.23h    0.01d
 
     ssh kkstore05
     cd /cluster/data/braFlo1/bed/tblastn.hg18KG/blastOut
     for i in kg??
     do
        cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
        echo $i
     done
     sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/braFlo1/bed/tblastn.hg18KG/preblastHg18KG.psl
     cd ..
     pslCheck preblastHg18KG.psl
     liftUp -nosort -type=".psl" -nohead blastHg18KG.psl /cluster/data/braFlo1/scaffolds/braFlo1.lift carry preblastHg18KG.psl
 
     # load table 
     ssh hgwdev
     cd /cluster/data/braFlo1/bed/tblastn.hg18KG
     hgLoadPsl braFlo1 blastHg18KG.psl
 
     # check coverage
     featureBits braFlo1 blastHg18KG 
 # 12239766 bases of 923355587 (1.326%) in intersection
 
     featureBits braFlo1 xenoRefGene:cds blastHg18KG  -enrichment
 # xenoRefGene:cds 1.206%, blastHg18KG 1.326%, both 0.621%, cover 51.51%, enrich 38.86x
 
     ssh kkstore05
     rm -rf /cluster/data/braFlo1/bed/tblastn.hg18KG/blastOut
     rm -rf /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut
 #end tblastn
 
 ## BLASTZ Mouse mm9 swap (DONE - 2008-04-17 - Hiram)
     #	the original
     ssh kkstore06
     cd /cluster/data/mm9/bed/blastzBraFlo1.2008-04-14
     cat fb.mm9.chainBraFlo1Link.txt
     #	26725980 bases of 2620346127 (1.020%) in intersection
 
     #	That is OK, now for the swap:
     mkdir /cluster/data/braFlo1/bed/blastz.mm9.swap
     cd /cluster/data/braFlo1/bed/blastz.mm9.swap
     time doBlastzChainNet.pl -verbose=2 -swap \
 	/cluster/data/mm9/bed/blastzBraFlo1.2008-04-14/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-qRepeats=windowmaskerSdust -bigClusterHub=kk > swap.log 2>&1 &
     #	real    12m23.402s
     cat  fb.braFlo1.chainMm9Link.txt
     #	31517169 bases of 923355587 (3.413%) in intersection
 
     #	create reciprocal best chains/nets for 5-way maf alignments
     ssh hgwdev
     cd /cluster/data/braFlo1/bed/blastz.mm9.swap
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 mm9 \
 	> rbest.log 2>&1 &
     #	real    35m15.178s
 
 #############################################################################
 ## 5-Way Multiz (DONE - 2008-04-17 - Hiram)
 ##
     ssh hgwdev
     mkdir /cluster/data/braFlo1/bed/multiz5way
     cd /cluster/data/braFlo1/bed/multiz5way
     #	take the 6-way tree from mm9 and eliminate genomes not in
     #	this alignment.  Manually add in petMar1 and braFlo1
     #	arbitrarily somewhere outside of the fishes
     #	rearrange to get braFlo1 on the top of the graph
     #	paste this tree into the on-line phyloGif tool:
     #	http://genome.ucsc.edu/cgi-bin/phyloGif
     #	to create the image for the tree diagram
 
     #	select the 5 organisms from the 6-way on Lamprey petMar1
     /cluster/bin/phast/tree_doctor \
 	--prune-all-but \
 Human_hg18,Mouse_mm9,Chicken_galGal3,Lamprey_petMar1,Lanclet_braFlo1 \
 	/cluster/data/petMar1/bed/multiz6way/petMar1.6-way.nh \
 	> 5-way.fullNames.nh
 
     #	looks something like this:
     ((Lamprey_petMar1:1.600000,(Chicken_galGal3:0.488824,
       (Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.725954):0.400000,
 	Lanclet_braFlo1:2.000000);
 
     #	rearrange to get Lanclet at the top, this leaves us with:
     cat << '_EOF_' > braFlo1.5-way.nh
     (Lanclet_braFlo1:2.0,(Lamprey_petMar1:1.600000,(Chicken_galGal3:0.488824,
       (Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.725954):0.400000);
 '_EOF_'
     #	<< happy emacs
 
     #	create a species list from that file:
     sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' braFlo1.5-way.nh \
         | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
         | sed -e "s/.*_//; s/:.*//" | sort > species.list
     #	verify that has 5 db names in it
     # create a stripped down nh file for use in autoMZ run
     echo \
 `sed 's/[a-zA-Z0-9]*_//g; s/:[012].[0-9]*//g; s/[,;]/ /g' braFlo1.5-way.nh \
 	| sed -e "s/  / /g"` > tree.5.nh
     #	that looks like, as a single line:
 # (braFlo1 (petMar1 (galGal3 (mm9 hg18))))
 
     # verify all blastz's exists
     cat << '_EOF_' > listMafs.csh
 #!/bin/csh -fe
 cd /cluster/data/braFlo1/bed/multiz5way
 foreach db (`grep -v braFlo1 species.list`)
     set bdir = /cluster/data/braFlo1/bed/blastz.$db
     if (-e $bdir/mafRBestNet/chrUn.maf.gz) then
 	echo "$db mafRBestNet"
     else if (-e $bdir/mafSynNet/chrUn.maf.gz) then
 	echo "$db mafSynNet"
     else if (-e $bdir/mafNet/chrUn.maf.gz) then
 	echo "$db mafNet"
     else
 	echo "$db mafs not found"
     endif
 end
 '_EOF_'
     # << happy emacs
     chmod +x ./listMafs.csh
     #	see what it says, these should all be mafRBestNet
     ./listMafs.csh
 # galGal3 mafRBestNet
 # hg18 mafRBestNet
 # mm9 mafRBestNet
 # petMar1 mafRBestNet
 
     /cluster/bin/phast/all_dists braFlo1.5-way.nh > 5way.distances.txt
     grep -i braflo 5way.distances.txt | sort -k3,3n
 Lanclet_braFlo1 Chicken_galGal3 3.614778
 Lanclet_braFlo1 Human_hg18      3.723612
 Lanclet_braFlo1 Mouse_mm9       3.922529
 Lanclet_braFlo1 Lamprey_petMar1 4.000000
 
     #	use the calculated
     #	distances in the table below to order the organisms and check
     #	the button order on the browser.  Zebrafish ends up before
     #	tetraodon and fugu on the browser despite its distance.
     #	And if you can fill in the table below entirely, you have
     #	succeeded in finishing all the alignments required.
     #
 #                         featureBits chainLink measures
 #                                       chainBraFlo1Link   chain   linearGap
 #    distance                     on BraFlo1    on other   minScore
 #  1  3.614778 Chicken_galGal3    (%  3.280)   (%  1.899)   5000     loose
 #  2  3.723612 Human_hg18         (%  3.348)   (%  0.918)   2000     loose
 #  3  3.922529 Mouse_mm9          (%  3.413)   (%  1.020)   5000     loose
 #  4  4.000000 Lamprey_petMar1    (%  2.969)   (%  3.291)   5000     loose
 
     # copy net mafs to cluster-friendly storage, splitting chroms
     #	 XXX - no need to split, there is only chrM and chrUn
     mkdir mafLinks
     cd mafLinks
     for D in galGal3 petMar1 mm9 hg18
 do
     mkdir ${D}
     ln -s /cluster/data/braFlo1/bed/blastz.${D}/mafRBestNet/*.maf.gz ./${D}
 done
 
     #	copy to kluster friendly san for multiz run
     ssh kkstore05
     mkdir -p /san/sanvol1/scratch/braFlo1/multiz5way/mafSrc
     cd /cluster/data/braFlo1/bed/multiz5way/mafLinks
     rsync -a --progress -L ./ /san/sanvol1/scratch/braFlo1/multiz5way/mafSrc
 
     # ready for the multiz run
     ssh memk
     mkdir /cluster/data/braFlo1/bed/multiz5way/cons
     cd /cluster/data/braFlo1/bed/multiz5way/cons
     mkdir -p maf run
     cd run
     mkdir penn
     # use latest penn utilities
     P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
     cp -p $P/{autoMZ,multiz,maf_project} penn
 
     #	set the db and pairs directories here
     cat > autoMultiz.csh << '_EOF_'
 #!/bin/csh -ef
 set db = braFlo1
 set c = $1
 set result = $2
 set resultDir = $result:h
 set run = `pwd`
 set tmp = /scratch/tmp/$db/multiz.$c
 set pairs = /san/sanvol1/scratch/$db/multiz5way/mafSrc
 rm -fr $tmp
 mkdir -p $tmp
 mkdir -p $resultDir
 cp ../../tree.5.nh ../../species.list $tmp
 pushd $tmp
 foreach s (`grep -v $db species.list`)
     set in = $pairs/$s/$c.maf
     set out = $db.$s.sing.maf
     if (-e $in.gz) then
         zcat $in.gz > $out
     else if (-e $in) then
         cp $in $out
     else
         echo "##maf version=1 scoring=autoMZ" > $out
     endif
 end
 set path = ($run/penn $path); rehash
 $run/penn/autoMZ + T=$tmp E=$db "`cat tree.5.nh`" $db.*.sing.maf $c.maf
 popd
 rm -f $result
 cp $tmp/$c.maf $result
 rm -fr $tmp
 rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
 '_EOF_'
 # << happy emacs
     chmod +x autoMultiz.csh
 
     cat  << '_EOF_' > template
 #LOOP
 ./autoMultiz.csh $(root1) {check out line+ /cluster/data/braFlo1/bed/multiz5way/cons/maf/$(root1).maf}
 #ENDLOOP
 '_EOF_'
     # << emacs
 
     echo "chrM.maf" > mafList
     echo "chrUn.maf" >> mafList
     gensub2 mafList single template jobList
 
     para create jobList
     para try ... check ... push ... etc
 # Completed: 2 of 2 jobs
 # CPU time in finished jobs:        375s       6.25m     0.10h    0.00d  0.000 y
 # IO & Wait Time:                    17s       0.28m     0.00h    0.00d  0.000 y
 # Average job time:                 196s       3.27m     0.05h    0.00d
 # Longest finished job:             385s       6.42m     0.11h    0.00d
 # Submission to last job:           385s       6.42m     0.11h    0.00d
 
 
     # load tables for a look
     ssh hgwdev
     mkdir -p /gbdb/braFlo1/multiz5way/maf
     ln -s /cluster/data/braFlo1/bed/multiz5way/cons/maf/*.maf \
                 /gbdb/braFlo1/multiz5way/maf
     # this generates an immense multiz5way.tab file in the directory
     #	where it is running.  Best to run this over in scratch.
     cd /scratch/tmp
     time nice -n +19 hgLoadMaf \
 	-pathPrefix=/gbdb/braFlo1/multiz5way/maf braFlo1 multiz5way
     #	Loaded 220560 mafs in 2 files from /gbdb/braFlo1/multiz5way/maf
     #	real    0m4.151s
 
     # load summary table
     time nice -n +19 cat /gbdb/braFlo1/multiz5way/maf/*.maf \
 	| hgLoadMafSummary braFlo1 -minSize=30000 -mergeGap=1500 \
 	 -maxSize=200000  multiz5waySummary stdin
     #	Created 87678 summary blocks from 366073 components and 220521 mafs from stdin
     #	real    0m4.911s
 
     # Gap Annotation
     # prepare bed files with gap info
     ssh kkstore05
     mkdir /cluster/data/braFlo1/bed/multiz5way/anno
     cd /cluster/data/braFlo1/bed/multiz5way/anno
     mkdir maf run
 
     #	these actually already all exist from previous multiple alignments
     #	if there are done you will see an ls of them
     #	or else the twoBitInfo command will be echoed, run it if you want
     for DB in `cat ../species.list`
 do
     CDIR="/cluster/data/${DB}"
     if [ ! -f ${CDIR}/${DB}.N.bed ]; then
 	echo "creating ${DB}.N.bed"
 	echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
     else
 	ls -og ${CDIR}/${DB}.N.bed
     fi
 done
 
     cd run
     rm -f nBeds sizes
     for DB in `grep -v braFlo1 ../../species.list`
 do
     echo "${DB} "
     ln -s  /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
     echo ${DB}.bed  >> nBeds
     ln -s  /cluster/data/${DB}/chrom.sizes ${DB}.len
     echo ${DB}.len  >> sizes
 done
 
     ssh memk
     cd /cluster/data/braFlo1/bed/multiz5way/anno/run
     cat << '_EOF_' > doAnno.csh
 #!/bin/csh -ef
 set dir = /cluster/data/braFlo1/bed/multiz5way/cons
 set c = $1
 cat $dir/maf/${c}.maf | nice \
     mafAddIRows -nBeds=nBeds stdin /cluster/data/braFlo1/braFlo1.2bit $2
 '_EOF_'
     # << happy emacs
     chmod +x doAnno.csh
 
 
     cat << '_EOF_' > template
 #LOOP
 ./doAnno.csh $(root1) {check out line+ ../maf/$(root1).maf}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     cut -f1 /cluster/data/braFlo1/chrom.sizes > chrom.list
     gensub2 chrom.list single template jobList
     para create jobList
     para try ... check ... push ... etc.
 # Completed: 2 of 2 jobs
 # CPU time in finished jobs:       7240s     120.66m     2.01h    0.08d  0.000 y
 # IO & Wait Time:                     9s       0.16m     0.00h    0.00d  0.000 y
 # Average job time:                3625s      60.41m     1.01h    0.04d
 # Longest finished job:            7241s     120.68m     2.01h    0.08d
 # Submission to last job:          7241s     120.68m     2.01h    0.08d
 
     #	load the results
     ssh hgwdev
     cd /cluster/data/braFlo1/bed/multiz5way/anno
     mkdir -p /gbdb/braFlo1/multiz5way/anno
     ln -s `pwd`/maf/*.maf /gbdb/braFlo1/multiz5way/anno
     #	by loading this into the table multiz5way, it will replace the
     #	previously loaded table with the unannotated mafs
     #	huge temp files are made, do them on local disk
     cd /scratch/tmp
     time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/braFlo1/multiz5way/anno \
                 braFlo1 multiz5way
     #	Loaded 311293 mafs in 2 files from /gbdb/braFlo1/multiz5way/anno
     #	real    0m7.505s
 
     #	normally filter this for chrom size > 1,000,000 and only load
     #	those chroms.  But this is a scaffold assembly, load everything.
     time nice -n +19 cat /gbdb/braFlo1/multiz5way/anno/*.maf \
 	| hgLoadMafSummary braFlo1 -minSize=30000 -mergeGap=1500 \
 	 -maxSize=200000  multiz5waySummary stdin
     #	Created 87678 summary blocks from 366073 components and 311253 mafs from stdin
     #	real    0m6.998s
 
     #	by loading this into the table multiz5waySummary, it will replace
     #	the previously loaded table with the unannotated mafs
     #	remove the multiz5way*.tab files in this /scratch/tmp directory
     rm multiz5way*.tab
     #	And, you can remove the previously loaded non-annotated maf file link:
     rm /gbdb/braFlo1/multiz5way/maf/*.maf
     rmdir /gbdb/braFlo1/multiz5way/maf
 
 ###########################################################################
 ## Annotate 5-way multiple alignment with gene annotations
 ##		(DONE - 2008-05-01 - Hiram)
     # Gene frames
     ## following pattern used in Lamprey petMar1
     #	hg18 is in a bit of transition at the moment, so use ensGene
     #	mm9 can use knownGene
     #	xenoMrna for braFlo1, petMar1
     #	ensGene v49 for galGal3
 
     ssh hgwdev
     mkdir /cluster/data/braFlo1/bed/multiz5way/frames
     cd /cluster/data/braFlo1/bed/multiz5way/frames
     mkdir genes
     # knownGene
     for DB in mm9
 do
     hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/${DB}.tmp.gz
     mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
     echo "${DB} done"
 done
     # ensGene
     for DB in hg18 galGal3
 do
     hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/${DB}.tmp.gz
     mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
     echo "${DB} done"
 done
 
     #	use xenoMrna for petMar1, braFlo1
     for DB in petMar1 braFlo1
 do
 tmpExt=`mktemp temp.XXXXXX`
 tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
 tmpMrna=${DB}.mrna.${tmpExt}
 tmpCds=${DB}.cds.${tmpExt}
 hgsql -N -e 'select xenoMrna.qName,cds.name,xenoMrna.* \
 	   from xenoMrna,gbCdnaInfo,cds \
 	   where (xenoMrna.qName = gbCdnaInfo.acc) and \
 	     (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
 $DB > ${tmpMrnaCds}
 cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
 cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
 mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
 genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
 rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
 mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
 rm -f $tmpExt
 echo "${DB} done"
 done
 
     ls -og genes
 # -rw-rw-r--  1 1040639 May  1 09:53 braFlo1.gp.gz
 # -rw-rw-r--  1 1557911 May  1 09:50 galGal3.gp.gz
 # -rw-rw-r--  1 2151412 May  1 09:50 hg18.gp.gz
 # -rw-rw-r--  1 1965274 May  1 09:50 mm9.gp.gz
 # -rw-rw-r--  1  596023 May  1 09:51 petMar1.gp.gz
 
 
     ssh kkstore05
     cd /cluster/data/braFlo1/bed/multiz5way/frames
     #	hg18 is in a bit of transition at the moment, so use ensGene
     #	mm9 can use knownGene
     #	xenoMrna for braFlo1, braFlo1
     #	ensGene v49 for galGal3, oryLat1
     #	anything to annotate is in a pair, e.g.: braFlo1 genes/braFlo1.gp.gz
     time (cat  ../anno/maf/*.maf | nice -n +19 genePredToMafFrames braFlo1 stdin stdout braFlo1 genes/braFlo1.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz galGal3 genes/galGal3.gp.gz petMar1 genes/petMar1.gp.gz | gzip > multiz5way.mafFrames.gz) > frames.log 2>&1
     # see what it looks like in terms of number of annotations per DB:
     zcat multiz5way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
     #	 44216 petMar1
     #	 71268 braFlo1
     #	107089 galGal3
     #	107857 hg18
     #	111452 mm9
 
     #	load the resulting file
     ssh hgwdev
     cd /cluster/data/braFlo1/bed/multiz5way/frames
     time nice -n +19 hgLoadMafFrames braFlo1 multiz5wayFrames \
 	multiz5way.mafFrames.gz
     #	real    0m7.902s
 
     #	enable the trackDb entries:
 # frames multiz5wayFrames
 # irows on
 #############################################################################
 # phastCons 5-way (DONE - 2008-05-01 - Hiram)
 
     # split 5way mafs into 10M chunks and generate sufficient statistics 
     # files for # phastCons
     ssh memk
     mkdir /cluster/data/braFlo1/bed/multiz5way/msa.split
     cd /cluster/data/braFlo1/bed/multiz5way/msa.split
     mkdir -p /san/sanvol1/scratch/braFlo1/multiz5way/cons/ss
 
     cat << '_EOF_' > doSplit.csh
 #!/bin/csh -ef
 set MAFS = /cluster/data/braFlo1/bed/multiz5way/anno/maf
 set WINDOWS = /san/sanvol1/scratch/braFlo1/multiz5way/cons/ss
 pushd $WINDOWS
 set c = $1
 rm -f $c.out
 twoBitToFa -seq=$c /scratch/data/braFlo1/braFlo1.2bit /scratch/tmp/braFlo1.$c.fa
 /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
     -M /scratch/tmp/braFlo1.$c.fa \
     -o SS -r $c -w 10000000,0 -I 1000 -B 5000
 rm -f /scratch/tmp/braFlo1.$c.fa
 popd
 date > $c.out
 '_EOF_'
     # << happy emacs
     chmod +x doSplit.csh
 
     cat << '_EOF_' > template
 #LOOP
 doSplit.csh $(root1) {check out line+ $(root1).out}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     #	create list of maf files:
     (cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list
 
     gensub2 maf.list single template jobList
     para create jobList
     para try ... check ... etc
 # Completed: 2 of 2 jobs
 # CPU time in finished jobs:        337s       5.61m     0.09h    0.00d  0.000 y
 # IO & Wait Time:                    40s       0.67m     0.01h    0.00d  0.000 y
 # Average job time:                 189s       3.14m     0.05h    0.00d
 # Longest finished job:             375s       6.25m     0.10h    0.00d
 # Submission to last job:           375s       6.25m     0.10h    0.00d
 
     # take the cons and noncons trees from the Lamprey 6-way
 
     #	Estimates are not easy to make, probably more correctly,
     #	take the 6-way .mod file, and re-use it here.
     ssh hgwdev
     cd /cluster/data/braFlo1/bed/multiz5way
     cp -p /cluster/data/petMar1/bed/multiz6way/cons/all/all.mod \
 	./petMar1.6way.mod
 
 
     # Run phastCons
     #	This job is I/O intensive in its output files, thus it is all
     #	working over in /scratch/tmp/
     ssh memk
     mkdir -p /cluster/data/braFlo1/bed/multiz5way/cons/run.cons
     cd /cluster/data/braFlo1/bed/multiz5way/cons/run.cons
 
     #	there are going to be several different phastCons runs using
     #	this same script.  They trigger off of the current working directory
     #	$cwd:t which is the "grp" in this script.  It is one of:
     #	all gliers placentals
     #	Well, that's what it was when used in the Mm9 30-way,
     #	in this instance, there is only the directory "all"
 
     cat << '_EOF_' > doPhast.csh
 #!/bin/csh -fe
 set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin
 set f = $1
 set c = $1:r
 set len = $2
 set cov = $3
 set rho = $4
 set grp = $cwd:t
 set tmp = /scratch/tmp/$f
 set cons = /cluster/data/braFlo1/bed/multiz5way/cons
 mkdir -p $tmp
 set san = /san/sanvol1/scratch/braFlo1/multiz5way/cons
 if (-s $cons/$grp/$grp.non-inf) then
   cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
   cp -p $san/ss/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
 else
   cp -p $cons/$grp/$grp.mod $tmp
   cp -p $san/ss/$f.ss $cons/$grp/$grp.mod $tmp
 endif
 pushd $tmp > /dev/null
 if (-s $grp.non-inf) then
   $PHASTBIN/phastCons $f.ss $grp.mod \
     --rho $rho --expected-length $len --target-coverage $cov --quiet \
     --not-informative `cat $grp.non-inf` \
     --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
 else
   $PHASTBIN/phastCons $f.ss $grp.mod \
     --rho $rho --expected-length $len --target-coverage $cov --quiet \
     --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
 endif
 popd > /dev/null
 mkdir -p $san/$grp/pp $san/$grp/bed
 sleep 4
 touch $san/$grp/pp $san/$grp/bed
 rm -f $san/$grp/pp/$f.pp
 rm -f $san/$grp/bed/$f.bed
 mv $tmp/$f.pp $san/$grp/pp
 mv $tmp/$f.bed $san/$grp/bed
 rm -fr $tmp
 '_EOF_'
     # << happy emacs
     chmod a+x doPhast.csh
 
     cat << '_EOF_' > template
 #LOOP
 ../doPhast.csh $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/braFlo1/multiz5way/cons/all/pp/$(file1).pp}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     # Create parasol batch and run it
     pushd /san/sanvol1/scratch/braFlo1/multiz5way/cons
     find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \
 	> /cluster/data/braFlo1/bed/multiz5way/cons/ss.list
     popd
 
     # run for all species
     cd ..
     mkdir -p all run.cons/all
     cd all
     /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin/tree_doctor \
     ../../petMar1.6way.mod \
 --prune-all-but=braFlo1,hg18,galGal3,mm9,petMar1 \
 	> all.mod
 
     cd ../run.cons/all
 
     #	root1 == chrom name, file1 == ss file name without .ss suffix
     # Create template file for "all" run
     cat << '_EOF_' > template
 #LOOP
 ../doPhast.csh $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/braFlo1/multiz5way/cons/all/pp/$(file1).pp}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
     gensub2 ../../ss.list single template jobList
     para create jobList
     para try ... check ... push ... etc.
 # Completed: 94 of 94 jobs
 # CPU time in finished jobs:       1945s      32.42m     0.54h    0.02d  0.000 y
 # IO & Wait Time:                   810s      13.49m     0.22h    0.01d  0.000 y
 # Average job time:                  29s       0.49m     0.01h    0.00d
 # Longest finished job:              34s       0.57m     0.01h    0.00d
 # Submission to last job:           201s       3.35m     0.06h    0.00d
 
     # create Most Conserved track
     ssh kolossus
     cd /san/sanvol1/scratch/braFlo1/multiz5way/cons/all
     find ./bed -type f -name "chr*.bed" | xargs cat \
 	| sort -k1,1 -k2,2n | \
         awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
             /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
     cp -p mostConserved.bed /cluster/data/braFlo1/bed/multiz5way/cons/all
 
     # load into database
     ssh hgwdev
     cd /cluster/data/braFlo1/bed/multiz5way/cons/all
     time nice -n +19 hgLoadBed braFlo1 phastConsElements5way mostConserved.bed
     #	Loaded 122683 elements of size 5
     #	real    0m2.299s
 
     # Try for 5% overall cov, and 70% CDS cov 
     #	We don't have any gene tracks to compare CDS coverage
     #	--rho .31 --expected-length 45 --target-coverage .3
     featureBits braFlo1 phastConsElements5way
     #	35581251 bases of 923355587 (3.853%) in intersection
 
     # Create merged posterier probability file and wiggle track data files
     # currently doesn't matter where this is performed, the san is the same
     # network distance from all machines.
     # sort by chromName, chromStart so that items are in numerical order 
     #  for wigEncode
     cd /san/sanvol1/scratch/braFlo1/multiz5way/cons/all
     mkdir phastCons5wayScores
 
     cat pp/chrM*.pp | gzip > phastCons5wayScores/chrM.data.gz
     #	this is a bit of a cheat on the sorting order here, but it works.
     ls pp/chrUn*.pp | sort -t. -k2n | xargs cat \
 	| gzip > phastCons5wayScores/chrUn.data.gz
 
     #	copy the phastCons5wayScores to:
     mkdir -p \
 /cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way
     cp -p phastCons5wayScores/chr*.data.gz \
 /cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way
     #	for hgdownload downloads
 
     # Create merged posterier probability file and wiggle track data files
     # currently doesn't matter where this is performed, the san is the same
     # network distance from all machines.
     cd /san/sanvol1/scratch/braFlo1/multiz5way/cons/all
     zcat phastCons5wayScores/chr*.data.gz \
 	| wigEncode stdin phastCons5way.wig phastCons5way.wib
     # Converted stdin, upper limit 1.00, lower limit 0.00
     time nice -n +19 cp -p *.wi? /cluster/data/braFlo1/bed/multiz5way/cons/all
     #	real    0m1.216s
 
     # Load gbdb and database with wiggle.
     ssh hgwdev
     cd /cluster/data/braFlo1/bed/multiz5way/cons/all
     ln -s `pwd`/phastCons5way.wib /gbdb/braFlo1/multiz5way/phastCons5way.wib
     time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/braFlo1/multiz5way braFlo1 \
 	phastCons5way phastCons5way.wig
     #	real    0m2.166s
     # remove garbage
     rm wiggle.tab
 
     #  Create histogram to get an overview of all the data
     ssh hgwdev
     cd /cluster/data/braFlo1/bed/multiz5way/cons/all
     time nice -n +19 hgWiggle -doHistogram \
 	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
 	    -db=braFlo1 phastCons5way > histogram.data 2>&1
     #	real    5m0.608s
 
     #	create plot of histogram:
 
     cat << '_EOF_' | gnuplot > histo.png
 set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
 set size 1.4, 0.8
 set key left box
 set grid noxtics
 set grid ytics
 set title " Lancelet BraFlo1 Histogram phastCons5way track"
 set xlabel " phastCons5way score"
 set ylabel " Relative Frequency"
 set y2label " Cumulative Relative Frequency (CRF)"
 set y2range [0:1]
 set y2tics
 set yrange [0:0.02]
 
 plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
         "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
 '_EOF_'
     #	<< happy emacs
 
     display histo.png &
     #	it is a bit lopsided toward the highly conserved end, but that
     #	is because there is so little bits being used
 
     #	These trackDb entries turn on the wiggle phastCons data track:
     #	type wigMaf 0.0 1.0
     #	maxHeightPixels 100:40:11
     #	wiggle phastCons5way
     #	spanList 1
-    #	autoScaleDefault Off
+    #	autoScale Off
     #	windowingFunction mean
     #	pairwiseHeight 12
     #	yLineOnOff Off
 
 #############################################################################
 #  Downloads (DONE - 2008-05-01 - Hiram)
     #	Let's see if the downloads will work
     ssh hgwdev
     cd /cluster/data/braFlo1
     #	expecting to find repeat masker .out file here:
     ln -s bed/RepeatMasker/braFlo1.fa.out .
     cd bed/simpleRepeat
     mkdir trfMaskChrom
     splitFileByColumn trfMask.bed trfMaskChrom
     touch trfMaskChrom/chrM.bed
     cd /cluster/data/braFlo1
     time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \
 	-workhorse=hgwdev braFlo1 > jkStuff/downloads.log 2>&1
     #	real    11m53.475s
 
     #	failed making upstream sequences:
     #	featureBits calJac1 mgcGenes:upstream:1000 -fa=stdout
     #	setpriority: Permission denied.
     #	the 'nice' from my bash shell causes trouble inside the csh
     #	script which uses nice.  Finish off the install step manually
     #	with the mgcGenes upstreams ...
 
 #############################################################################
 #  PushQ entries (DONE - 2008-05-01 - Hiram)
     ssh hgwdev
     /cluster/data/braFlo1
     /cluster/bin/scripts/makePushQSql.pl braFlo1 > jkStuff/pushQ.sql
     #	output warnings:
 # braFlo1 does not have seq
 # Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
 # and genbank tables) which tracks to assign these tables to:
 #	gbWarn
 #	multiz5wayFrames
 #	phastCons5way
     scp -p jkStuff/pushQ.sql hgwbeta:/tmp/braFlo1.sql
     ssh hgwbeta
     cd /tmp
     hgsql qapushq < braFlo1.sql
 
 #############################################################################
 #  Create 5-way downloads (DONE - 2008-05-01 - Hiram)
     ssh hgwdev
     mkdir -p /cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way
     cd /cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way
     #	if these haven't been copied here from before:
     cp -p \
 /san/sanvol1/scratch/braFlo1/multiz5way/cons/all/phastCons5wayScores/* .
     ln -s ../../cons/all/all.mod ./5way.mod
     cp /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way/README.txt .
     # edit that README.txt to be correct for this 5-way alignment
     cd ..
     mkdir multiz5way
     cd multiz5way
     cp -p /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way/README.txt .
     # edit that README.txt to be correct for this 5-way alignment
     ssh kkstore05
     cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way
     ln -s ../../braFlo1.5-way.nh ./5way.nh
 
     cp -p ../../anno/maf/chr*.maf .
     gzip chr*.maf
 
     ssh hgwdev
     cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way
     #	creating upstream files from xenoRefGene, bash script:
     cat << '_EOF_' > mkUpstream.sh
 #!/bin/bash
 DB=braFlo1
 GENE=xenoRefGene
 NWAY=multiz5way
 export DB GENE
 
 for S in 1000 2000 5000
 do
     echo "making upstream${S}.maf"
     featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
         | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
         | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
                 stdin stdout \
                 -orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \
         | gzip -c > upstream${S}.maf.gz
     echo "done upstream${S}.maf.gz"
 done
 '_EOF_'
     # << happy emacs
     chmod +x ./mkUpstream.sh
     time nice -n +19 ./mkUpstream.sh
     #	real    3m1.228s
 # -rw-rw-r--  1   126900 May  1 16:51 upstream1000.maf.gz
 # -rw-rw-r--  1   224320 May  1 16:52 upstream2000.maf.gz
 # -rw-rw-r--  1   501687 May  1 16:53 upstream5000.maf.gz
 
     #	check the names in these upstream files to ensure sanity:
     zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \
 	| sort | uniq -c | sort -rn | less
     #	should be a list of the other 4 species with a high count,
     #	then xenoRefGene names, e.g.:
     #	364 petMar1
     #	364 mm9
     #	364 hg18
     #	364 galGal3
     #	  7 NM_069006
     #	  7 NM_054045
     #	  7 NM_021059
 
     ssh kkstore05
     cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way
     md5sum *.maf.gz > md5sum.txt
     cd ../phastCons5way
     md5sum *.data.gz *.mod > md5sum.txt
 
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/braFlo1/multiz5way
     mkdir /usr/local/apache/htdocs/goldenPath/braFlo1/phastCons5way
     cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way
     ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/braFlo1/multiz5way
     rm /usr/local/apache/htdocs/goldenPath/braFlo1/multiz5way/mkUpstream.sh 
     cd ../phastCons5way
     ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/braFlo1/phastCons5way
     #	if your ln -s `pwd`/* made extra links to files you don't want there,
     #	check the goldenPath locations and remove those extra links
 
 #############################################################################
 #  add strings of N's to the gap track (DONE - 2008-05-09 - Hiram)
     ssh kkstore05
     mkdir /cluster/data/braFlo1/bed/gap
     cd /cluster/data/braFlo1/bed/gap
     time nice -n +19 findMotif -strand=+ -verbose=4 -motif=gattaca \
 	../../braFlo1.2bit > braFlo1.gaps.txt 2>&1
     #	real    0m16.659s
     grep "^#GAP chrUn" braFlo1.gaps.txt | sed -e "s/#GAP //" > chrUn.gap.bed
     #	what are their sizes like:
     ave -col=5 chrUn.gap.bed
 Q1 50.000000
 median 60.000000
 Q3 704.000000
 average 1004.236543
 min 1.000000
 max 144292.000000
 count 94786
 total 95187565.000000
 standard deviation 4497.943141
     #	don't let this overlap with the gap track already
     ssh hgwdev
     cd /cluster/data/braFlo1/bed/gap
     #	already existing gap track
     featureBits braFlo1 gap
     #	3031000 bases of 923355587 (0.328%) in intersection
     #	prove that it is a complete subset of this new set
     featureBits braFlo1 gap chrUn.gap.bed
     #	3031000 bases of 923355587 (0.328%) in intersection
     #	let's make sure none of the regular gaps run-on into these new
     #	found gaps
     hgsql -e "select chrom,chromStart,chromEnd from chrUn_gap;" braFlo1 \
 	| sort -k1,1 -k2,2n > gap.bed
     awk '{printf "%s\t%s\t%s\n", $1, $2, $3}' chrUn.gap.bed \
 	| sort -k1,1 -k2,2n > new.bed
     #	after working with that for a while, indeed there is a problem
     #	found in the existing gap track.  There are "contig" gaps in the
     #	the track which claim to be 1000 bases, but the DNA sequence
     #	has 1001 N's in that location.  Let's see if that is because the
     #	scaffold itself ends in a single N.  Yes, after some investigation,
     #	it is found there are scaffolds which either begin with Ns, or
     #	or end with N's, and it looks like they are single N's.
     #	there are 30 gap locations where this happens.
     #	So, let's make up a proper exclusive or of the new gaps:
     #	intersect the new gaps with everything that is not in the current
     #	gap track.  Get the regions that are not in the gap track:
     featureBits -not -countGaps braFlo1 gap -chrom=chrUn -bed=notGap.bed
     #	then, find only those bits in the new gaps that intersect that:
     featureBits -countGaps braFlo1 notGap.bed new.bed -bed=newGaps.bed
     #	and profile their sizes:
     awk '{print $3-$2}' newGaps.bed | ave stdin
 Q1 50.000000
 median 51.000000
 Q3 591.000000
 average 1004.037315
 min 1.000000
 max 144292.000000
 count 91785
 total 92155565.000000
 standard deviation 4570.916383
     #	make the new gaps into a properly formatted gap table
     #	currently, the last index number in the gap track is:o
     hgsql -e "select max(ix) from chrUn_gap;" braFlo1
     #	6062
 
     awk '
 BEGIN{ ix=6063 }
 {
 printf "%s\t%d\t%d\t%d\tN\t%d\tfragment\tyes\n", $1, $2, $3, ix, $3-$2
 ++ix
 }' newGaps.bed > newGaps.gap.table
     #	findMotif has made an error on the last one, there is an extra one
     #	at the end that is not legitimate, take it off.
     #	let's get the SQL definition:
     mkdir dump
     cd dump
     hgsqldump -c --tab=. braFlo1 chrUn_gap
     cd ..
     #	and, make the bin column 
     hgLoadBed -noLoad braFlo1 xyt newGaps.gap.table
     #	count before:
     hgsql -e "select count(*) from chrUn_gap;" braFlo1
     #	3031
     #	then, add this business to the existing table:
     hgLoadSqlTab -append braFlo1 chrUn_gap dump/chrUn_gap.sql bed.tab
     #	and after
     hgsql -e "select count(*) from chrUn_gap;" braFlo1
     #	94815
     #	that is the sum of old and new:
     wc -l gap.bed bed.tab
     #	 3031 gap.bed
     #	91784 bed.tab
     #	94815 total
 
 #############################################################################
 # GENBANK reload due to problems found by gbSanity (2008-05-09 markd)
     ssh hgwdev
     cd /cluster/data/genbank
     nice ./bin/gbDbLoadStep3 -drop -initialLoad braFlo1&
 
 #############################################################################
 
 ################################################
 # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
 update genbank.conf:
 braFlo1.upstreamGeneTbl = xenoRefGene
 braFlo1.upstreamMaf = multiz5way /hive/data/genomes/braFlo1/bed/multiz5way/species.list