src/hg/makeDb/doc/fr2.txt 1.23

1.23 2009/07/21 21:01:42 markd
added transmap update blurb
Index: src/hg/makeDb/doc/fr2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/fr2.txt,v
retrieving revision 1.22
retrieving revision 1.23
diff -b -B -U 1000000 -r1.22 -r1.23
--- src/hg/makeDb/doc/fr2.txt	28 Aug 2008 21:18:58 -0000	1.22
+++ src/hg/makeDb/doc/fr2.txt	21 Jul 2009 21:01:42 -0000	1.23
@@ -1,1715 +1,1724 @@
 # for emacs: -*- mode: sh; -*-
 
 
 # This file describes the browser build for the Fugu
 # genome, Takifugu rubripes, October 2004, JGI V4.0 from
 #       the DOE Joint Genome Institute (JGI)
 #       JGI data: ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/v4.0/
 #       Ensembl: http://www.ensembl.org/Takifugu_rubripes/index.html
 #
 #       "$Id$"
 #
 
 ##########################################################################
 ### Fetch sequence       (DONE - 2007-01-22 - Cory McLean and Hiram)
   ssh kkstore02
   mkdir /cluster/store5/fr2
   ln -s /cluster/store5/fr2 /cluster/data/fr2
   cd /cluster/data/fr2
   mkdir jgi
   cd jgi
   cat << '_EOF_' > fetch.sh
 #!/bin/sh
 
 wget --timestamping "ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/v4.0/*"
 
 gunzip fugu.041029.scaffolds.fasta.gz
 
 scaffoldFaToAgp fugu.041029.scaffolds.fasta
 
 gzip fugu.041029.scaffolds.fasta
 '_EOF_'
     # << happy emacs
     chmod +x fetch.sh
     ./fetch.sh
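 
     # (optional sanity check, a sketch) verify the downloaded scaffolds and
     # the AGP generated by scaffoldFaToAgp before building:
     zcat fugu.041029.scaffolds.fasta.gz | faSize stdin
     head fugu.041029.scaffolds.agp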
 
 ##########################################################################
 #  Run the makeGenomeDb.pl script (DONE - 2007-01-22 - Cory and Hiram)
     # prepare for the makeGenomeDb.pl script:
     ssh hgwdev
     cd /cluster/data/fr2
     #   the config.ra script pretty much specifies everything
     cat << '_EOF_' >  config.ra
 db fr2
 scientificName Takifugu rubripes
 assemblyDate Oct. 2004
 assemblyLabel JGI V4.0
 # orderKey = fr1.orderKey - 1
 orderKey 464
 #       NC_004299.1
 mitoAcc 23397366
 fastaFiles /cluster/data/fr2/jgi/fugu.041029.scaffolds.fasta.gz
 dbDbSpeciesDir fugu
 agpFiles /cluster/data/fr2/jgi/fugu.041029.scaffolds.agp
 commonName Fugu
 clade Vertebrate
 genomeCladePriority 110
 '_EOF_'
     # << happy emacs
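 
     # The orderKey above is fr1.orderKey - 1.  A quick way to confirm fr1's
     # current value (a sketch, assuming the usual hgcentraltest access):
     hgsql -h genome-testdb -N -e \
 	'select name,orderKey from dbDb where name="fr1"' hgcentraltest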
 
     makeGenomeDb.pl config.ra > mgdb.out 2>&1
     #	This sequence creates and loads the following files into the
     #	new database fr2
     #	chr*_gap, chr*_gold, chromInfo, gc5Base, grp
     #	And, when you follow the instructions it gives at the end to check in
     #	the trackDb files it creates, and you do a make in your trackDb
     #	hierarchy, you will then create the trackDb and hgFindSpec tables
     #	(with a 'make alpha') or your specific trackDb_logname
     #	hgFindSpec_logname with a simple 'make' with no arguments.
     #	The sequence also adds an entry to the dbDb table to turn on this
     #	organism in the drop-down menus
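 
     #	(a sketch) quick check that the expected tables were created:
     hgsql -e 'show tables' fr2
     hgsql -N -e 'select chrom,size from chromInfo' fr2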
 
     ############################################
     # Checked in trackDb fr2 files to the source tree.  Instructions
     # located in ~/kent/src/product/README.trackDb
     ssh hgwdev
     cd /gbdb/fr2
     ln -s /cluster/data/fr2/fr2.unmasked.2bit ./fr2.2bit
 
 ################################################
 ## WINDOWMASKER (DONE - 2007-01-22 - Cory and Hiram)
   cd /cluster/data/fr2/bed/
   ~/kent/src/hg/utils/automation/doWindowMasker.pl fr2 \
       -workhorse=kolossus > wmRun.log 2>&1 &
 
   # Save the log
   mv wmRun.log WindowMasker.2007-01-22
 
   # Masking statistics
   cd WindowMasker.2007-01-22
   twoBitToFa fr2.wmsk.2bit stdout | faSize stdin
 
   # 400525790 bases (49313545 N's 351212245 real 284686886 upper
   # 66525359 lower)
 
   hgLoadBed -strict fr2 windowmaskerSdust windowmasker.sdust.bed.gz
   # Loaded 1747418 elements of size 3
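 
   # (optional, a sketch) the masked fraction should roughly agree with the
   # faSize numbers above:
   featureBits fr2 windowmaskerSdust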
 
 #########################################################################
 # SIMPLE REPEATS (TRF) (DONE 2007-01-22 - Cory and Hiram)
   ssh kolossus
   mkdir /cluster/data/fr2/bed/simpleRepeat
   cd /cluster/data/fr2/bed/simpleRepeat
   #  Note: this run missed the chrM sequence; recovered below
   time nice -n 19 twoBitToFa ../../fr2.unmasked.2bit stdout \
      | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
      -bedAt=simpleRepeat.bed -tempDir=/tmp > trf.log 2>&1 &
   # ~31m
   # Make a filtered version for sequence masking:
   awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
   splitFileByColumn trfMask.bed trfMaskChrom
 
   # Load unfiltered repeats into the database:
     ssh hgwdev
     hgLoadBed fr2 simpleRepeat \
       /cluster/data/fr2/bed/simpleRepeat/simpleRepeat.bed \
       -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
 
     featureBits fr2 simpleRepeat
     # 9915088 bases of 393312790 (2.521%) in intersection
 
     featureBits fr1 simpleRepeat
     # 6801339 bases of 315518167 (2.156%) in intersection
 
     #	recovery attempt to get chrM masked
   time nice -n 19 twoBitToFa -seq=chrM ../../fr2.unmasked.2bit stdout \
      | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
      -bedAt=chrM.simpleRepeat.bed -tempDir=/tmp > trf.log 2>&1 &
     #	It finds nothing !
     #	make an empty trfMaskChrom file:
     touch trfMaskChrom/chrM.bed
 
 #########################################################################
 ## Add TRF mask to WindowMasker masked sequence
     ssh kkstore02
     cd /cluster/data/fr2
     twoBitMask bed/WindowMasker.2007-01-22/fr2.wmsk.sdust.2bit \
       -add bed/simpleRepeat/trfMask.bed fr2.2bit
 
     # Received ignorable warning:
     # Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it
     # might contain block coordinates, but this program uses only the first three
     # fields (the entire span -- no support for blocks).
 
     # Make this the actual file that the browser's "get DNA" function sees
     ssh hgwdev
     cd /gbdb/fr2/
     rm fr2.2bit
     ln -s /cluster/data/fr2/fr2.2bit fr2.2bit
 
 #########################################################################
 ## Lift our .2bit file against the .lft file to create a scaffold fasta file
     cd /cluster/data/fr2/jkStuff
     cp ../jgi/fugu.041029.scaffolds.lft liftAll.lft
     cp /cluster/data/tetNig1/jkStuff/lft2BitToFa.pl .
     cd ..
     mkdir noUn
     cd noUn
     time ../jkStuff/lft2BitToFa.pl ../fr2.2bit ../jkStuff/liftAll.lft \
          > chrUn.scaffolds.fa
     # real    5m4.520s
 
     twoBitToFa -seq=chrM ../fr2.2bit chrM.fa
     faToTwoBit *.fa fr2.scaffolds.2bit
     twoBitInfo *.2bit stdout | sort -k2nr > fr2.scaffolds.sizes
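 
     #	lft2BitToFa.pl itself is not reproduced here; roughly, it pulls each
     #	scaffold out of the chrom-coordinate 2bit using the lift file.  A
     #	minimal shell equivalent (a sketch only, assuming the standard
     #	5-column .lft format: offset, scaffoldName, scaffoldSize, chrom,
     #	chromSize):
     while read offset name size chrom chromSize
     do
 	end=$((offset + size))
 	twoBitToFa ../fr2.2bit:$chrom:$offset-$end stdout \
 	    | sed -e "s/^>.*/>$name/"
     done < ../jkStuff/liftAll.lft > chrUn.scaffolds.sketch.fa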
 
 
 ##########################################################################
 ## Move the data out to the cluster
     cd /san/sanvol1/scratch/
     mkdir fr2
     cd fr2
     cp -p /cluster/data/fr2/jkStuff/liftAll.lft .
     cp -p /cluster/data/fr2/chrom.sizes .
     cp -p /cluster/data/fr2/fr2.2bit .
     cp -p /cluster/data/fr2/noUn/*2bit .
     cp -p /cluster/data/fr2/noUn/*sizes .
 
 ## Edit the kent/src/hg/makeDb/doc/gasAcu1.txt doc to show what we're going to
 ## do there.
 ##
 ## To display the new chains and nets in the gasAcu1 browser, we had to edit the
 ## trackDb.ra file to include the new chain and net:
 ##
 ## track chainFr2
 ## shortLabel $o_db Chain
 ## longLabel $o_Organism ($o_date/$o_db) Chained Alignments
 ## group compGeno
 ## priority 140
 ## visibility hide
 ## color 100,50,0
 ## altColor 255,240,200
 ## matrix 16 91,-90,-25,-100,-90,100,-100,-25,-25,-100,100,-90,-100,-25,-90,91
 ## spectrum on
 ## type chain fr2
 ## otherDb fr2
 
 ## track netFr2
 ## shortLabel $o_db Net
 ## longLabel $o_Organism ($o_date/$o_db) Alignment Net
 ## group compGeno
 ## priority 140.1
 ## visibility hide
 ## spectrum on
 ## type netAlign fr2 chainFr2
 ## otherDb fr2
 
 
 ## Then we verified that the chain covered a higher percentage of gasAcu1 than fr1 did:
 ## Look into the gasAcu1.txt file to see the commands going on there.
 
 ## Then we needed to update the links to the detail pages of chains and nets: 
 ## /cluster/home/cmclean/kent/src/hg/makeDb/trackDb/chainFr2.html and netFr2.html
 
 ###########################################################
 
 #########################################################################
 # MAKE 11.OOC FILE FOR BLAT (DONE - 2007-01-24 - Hiram and Cory)
     # This will find repeats within the genome that should not be matched
     # against. Uses 11-mers.
     # Use -repMatch=128 (based on size -- for human we use 1024, and
     # fugu size is ~12% of human judging by gapless fr2 vs. hg18
     # genome sizes from featureBits).
 
     featureBits hg18 gap
     featureBits -countGaps hg18 gap
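 
     # (arithmetic sketch behind -repMatch=128; the hg18 gapless size of
     # ~2,881,515,245 is an assumption taken from typical featureBits output)
     awk 'BEGIN {printf "%d\n", 1024 * 393312790 / 2881515245}'
     # ~139, rounded down to the nearest power of two: 128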
     
     ssh kolossus
     blat /cluster/data/fr2/fr2.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=/cluster/data/fr2/11.ooc -repMatch=128
     # Wrote 8898 overused 11-mers to /cluster/data/fr2/11.ooc
 
     cp -p /cluster/data/fr2/11.ooc /cluster/bluearc/fugu/fr2
     cp -p /cluster/data/fr2/jkStuff/liftAll.lft /cluster/bluearc/fugu/fr2
 
 #########################################################################
 # GENBANK AUTO UPDATE (DONE - 2007-01-24 - Hiram and Cory)
     # Make a liftAll.lft that specifies 5M chunks for genbank:
     # Actually not necessary, since our chunks are small enough.  If we had
     # to, would look like this:
 #    ssh kkstore05
 #   cd /cluster/data/fr2
 #   simplePartition.pl fr2.2bit 5000000 /tmp/fr2
 #   cat /tmp/fr2/*/*.lft > jkStuff/liftAll.lft
 #   rm -r /tmp/fr2
 
     # align with latest genbank process.
     ssh hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # edit etc/genbank.conf to add fr2 just after fr1
     # fr2
     fr2.serverGenome = /cluster/data/fr2/fr2.2bit
     fr2.clusterGenome = /cluster/bluearc/fugu/fr2/fr2.2bit
     fr2.ooc = /cluster/bluearc/fugu/fr2/11.ooc
     fr2.align.unplacedChroms = chrUn
     fr2.lift = /cluster/bluearc/fugu/fr2/liftAll.lft
     fr2.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
     fr2.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
     fr2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
     fr2.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
     fr2.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
     fr2.genbank.mrna.xeno.loadDesc = yes
     fr2.refseq.mrna.native.load = no
     fr2.refseq.mrna.xeno.load  = no
 
     cvs ci -m "Added fr2." etc/genbank.conf
     # update /cluster/data/genbank/:
     make etc-update
 
     # Edit src/lib/gbGenome.c to add new species.  Not necessary here since
     # fugu already exists.
     #
     # cvs ci -m "Added Oryzias latipes (medaka)." src/lib/gbGenome.c
     # make install-server
 
 
     cd /cluster/data/genbank
     screen
 
     # This is a call to a script that will push our jobs out to the cluster
     # since it's a big job.  
     nice -n +19 bin/gbAlignStep -initial fr2 &
     # logFile: var/build/logs/2007.01.24-12:09:11.fr2.initalign.log
 
     # We had an error because machine kkr4u02 could not be reached via ssh.  This
     # happened in the middle of the -run subroutine.
     para problems > problems.out 2>&1
     grep host problems.out | sort | uniq -c | sort -n
     # 62021 host: kkr4u02.kilokluster.ucsc.edu
 
     parasol list machines | grep kkr4u02
     # kkr4u02.kilokluster.ucsc.edu
 
     parasol remove machine kkr4u02.kilokluster.ucsc.edu "unable to ssh"
 
     para push -retries=5
     # updated job database on disk
     # Pushed Jobs: 14820
     # Retried jobs: 14820
 
     # We still need to finish the rest of the gbAlignStep since it failed
     # because the -run subroutine did not finish correctly.  We must manually
     # call the rest of the routine.
     nice -n 19 bin/gbAlignStep -continue=finish -initial fr2 &
     # logFile: var/build/logs/2007.01.24-15:48:19.fr2.initalign.log
 
 
 
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
     time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad fr2
 
     # enable daily alignment and update of hgwdev (DONE - 2007-02-05 - Hiram)
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # add fr2 to:
         etc/align.dbs
         etc/hgwdev.dbs
     cvs ci -m "Added fr2." etc/align.dbs etc/hgwdev.dbs
     make etc-update
 
 #########################################################################
 # ENSEMBL GENES (DONE - 2007-01-24 - Hiram and Cory)
 #  Good luck with the biomart interface.  It seems to be different each time
 #  it is used.
     mkdir /cluster/data/fr2/bed/ensembl
     cd /cluster/data/fr2/bed/ensembl
     # Get the ensembl gene data from 
     # http://www.biomart.org/biomart/martview/
     #	
     # Follow this sequence through the pages:
     #	The default dataset will be Human.  In the right side
     #	frame, if you scroll it up and down, you will come to a pull-down
     #	dataset menu where you can select the organism.  There appear to be
     #	two pull-down menus in this frame, one for:
     #	Database: ENSEMBL 42 GENE (SANGER)
     #	and you can select the second:
     #	Dataset: Takifugu rubripes genes (FUGU4)
 
     #	After selecting the Dataset, in the left frame, click on the
     #	"Attributes" (Structures) label, now the right frame changes to radio
     #	buttons, Features, Sequences, Structures
     #	Click the "Structures" button, the three optional buttons can be
     #	expanded to select elements of these:
     #	REGION:
     #	GENE:
     #	EXON:
     #	REGION: has Chromosome checked
     #	GENE: has Ensembl Gene ID and Biotype selected
     #	EXON: has no selections
     #	In the GENE: menu:
     #	Unselect Biotype
     #	and Select
     #	Ensembl Gene ID
     #	Ensembl Transcript ID
     #	External Gene ID
 
     #   Click on the "Filters" section on the left-side frame.
     #   Under GENE in the right-side frame, select Gene type checkbox and
     #   highlight "protein_coding".
     #   Check:  Click "Count" from top buttons, and 22,008/22,409 genes will
     #   be reported.    
 
     #	Then, in the black menu bar above these frames, click "Results";
     #	it shows the first ten rows.  For this organism, there appear to be no
     #	External Gene IDs in the HTML view.  Change the "Display maximum ...
     #	rows as" pull-down to GFF, set the "Export all results to" pull-down to
     #	Compressed web file (notify by email), and press the "Go" button to
     #	download.
     
     #   After retrieving the URL where our data is located
     #   (http://www.biomart.org/biomart/martresults?file=martquery_0124222017_859.txt.gz),
     #   we get it from that place:
     wget http://www.biomart.org/biomart/martresults?file=martquery_0124222017_859.txt.gz \
          -O fr2.ensembl42.gff.gz
 
     # Ensembl gives us coordinates on each scaffold relative to the beginning
     # of *that scaffold.*  We want to have them in chrUn coordinates instead.
     # We use the liftUp program on the Ensembl data, with our liftAll.lft file
     # we created a long time ago, to achieve this, and name it
     # fr2.ensembl42.protein_coding.gff since we only took the protein coding
     # genes.
     zcat fr2.ensembl42.gff.gz | liftUp -type=.gff fr2.ensembl42.protein_coding.gff \
        ../../jkStuff/liftAll.lft error stdin 
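 
     # (optional sanity check, a sketch) no lifted feature should extend past
     # the end of its chromosome:
     awk 'NR==FNR {len[$1]=$2; next} $5+0 > len[$1] {print "out of range:", $0}' \
 	/cluster/data/fr2/chrom.sizes fr2.ensembl42.protein_coding.gff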
 
     # On other files, sometimes we need to massage the input names to match
     # the UCSC naming conventions.  Below is an example, though it was not
     # necessary today.
     # Add "chr" to front of each line in the gene data gtf file to make 
     # it compatible with our software, and liftUp to get scaffolds onto chrUn
     #	The scaffolds and ultracontigs mentioned in this are not the scaffolds
     #	we have in our lift file for chrUn ...  can't use them.
     #    zcat oryLat1.ensembl42.gff.gz | egrep -v "scaffold|ultracontig" \
     #	| sed -e "s/^\([0-9][0-9]*\)/chr\1/" | gzip -c > ensembl42.gff.gz
     #	Verify names OK:
     #   zcat ensembl42.gff.gz | awk '{print $1}' | sort | uniq -c
     #	22938 chr1
     #	15887 chr10
     #	18645 chr11
     #	20162 chr12
     #	21474 chr13
     #	21302 chr14
     #	20148 chr15
     #	22978 chr16
     #	26164 chr17
     #	13671 chr18
     #	17109 chr19
     #	10988 chr2
     #	15705 chr20
     #	21361 chr21
     #	21030 chr22
     #	12984 chr23
     #	19009 chr24
     #	21767 chr3
     #	26204 chr4
     #	24335 chr5
     #	24300 chr6
     #	24329 chr7
     #	23323 chr8
     #	27795 chr9
 
 
     cd /cluster/data/fr2/bed/ensembl
     ldHgGene -gtf -genePredExt fr2 ensGene fr2.ensembl42.protein_coding.gff
     #   Read 22102 transcripts in 407112 lines in 1 files
     #   22102 groups 1 seqs 1 sources 4 feature types
     #   22102 gene predictions
 
     # Since Ensembl genes are a default track, the genome-test browser will
     # already display this table.  However, the link to the Ensembl website is
     # broken because it automatically assumes we are referencing the human
     # genome.  We need to edit the ensGene track entry in the
     # ~/kent/src/hg/makeDb/trackDb/fugu/trackDb.ra file and update the URL to
     # point to the correct spot.  In this case, we want to change the
     # incorrect URL:
     # url http://www.ensembl.org/perl/transview?transcript=$$track genscan
     # to the correct URL:
     # url http://dec2006.archive.ensembl.org/Takifugu_rubripes/transview?transcript=$$
     # Also, we add the following lines at the end
     # urlName gene
     # archive dec2006
 
 
     ## Now we want the proteins derived from these genes, which are offered as
     #  a link at the top of the gene details page.
     # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
     # hgKnownToSuper.  Use ensMart to create it as above, except:
     # for the Attributes, choose the "Features" box, and then In "GENE:"
     # select Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
     # Results, choose txt output and a Compressed web file (notify by email).
     # Save this as
     # ensGtp42.txt.gz
     wget http://www.biomart.org/biomart/martresults?file=martquery_0124225551_471.txt.gz \
         -O ensGtp42.txt.gz
     
     #	Strip the first line, which is merely the column heading labels
     zcat ensGtp42.txt.gz | headRest 1 stdin | sed -e "s/ /\t/g" > ensGtp.txt
 
     # We want to load this table, but unfortunately the gene names are longer
     # than the column widths defined in the standard ensGtp.sql used by the
     # following command:
     #    hgLoadSqlTab fr2 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.txt
     # Instead, we make our own temporary .sql file to handle the insert.
     sed -e "s/20/21/; s/18/21/" ~/kent/src/hg/lib/ensGtp.sql > ensGtp.bigcols.sql;
 
     # And then perform the insert.
     hgLoadSqlTab fr2 ensGtp ensGtp.bigcols.sql ensGtp.txt
     rm ensGtp.bigcols.sql
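 
     # (a sketch) verify the widened columns are long enough for the longest IDs:
     awk '{for (i=1; i<=3; i++) if (length($i) > m[i]) m[i] = length($i)} END {print m[1], m[2], m[3]}' ensGtp.txt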
 
     hgsql -N -e "select count(*) from ensGtp;" fr2
     #   +-------+
     #   | 22102 |
     #   +-------+
     wc -l ensGtp.txt
     #   22102 ensGtp.txt
 
     # Load Ensembl peptides:
     # Get them from ensembl as above in the gene section except for
     # for the Attributes, choose the "Sequences" box, and then 
     # SEQUENCES: Peptide and 
     #	Header Information "Ensembl Transcript ID"
     #	Results output as FASTA
     # Save file as ensPep.fa.gz
 
     # XXX Still waiting for the proteins to be sent to us so that we can add
     # this part.
     # hgPepPred fr2 generic ensPep ensPep.fa
 
 ############################################################################
 #  BLATSERVERS ENTRY (DONE - 2007-01-25 - Hiram)
 #	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev
 
     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("fr2", "blat3", "17786", "1", "0"); \
 	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("fr2", "blat3", "17787", "0", "1");' \
 	    hgcentraltest
     #	test it with some sequence
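     #	e.g. (a sketch) confirm the servers respond before testing in the CGIs:
     gfServer status blat3 17786
     gfServer status blat3 17787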
 
 #########################################################################
 # BLASTZ/CHAIN/NET Hg18 (DONE - 2007-01-26 - Hiram)
     ##  Swap back to fr2
     mkdir /cluster/data/fr2/bed/blastz.hg18.swap
     cd /cluster/data/fr2/bed/blastz.hg18.swap
     time doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/hg18/bed/blastz.fr2.2007-01-24/DEF \
 	-chainMinScore=2000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=pk -swap > swap.log 2>&1 &
     #	real    47m14.554s
 
     ssh hgwdev
     cd /cluster/data/fr2/bed/blastz.hg18.swap
     time nice -n +19 featureBits fr2 chainHg18Link \
 	> fb.fr2.chainHg18Link.txt 2>&1 &
     #	42875664 bases of 393312790 (10.901%) in intersection
 
 ###########################################################################
 # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-01-26)
     ssh kkstore02
     bash # if not using bash shell already
 
     mkdir /cluster/data/fr2/blastDb
     cd /cluster/data/fr2
     zcat jgi/fugu.041029.scaffolds.fasta.gz > temp.fa
     faSplit sequence temp.fa 500 blastDb/
     rm temp.fa
     cd blastDb
     for i in *.fa
     do
 	/cluster/bluearc/blast229/formatdb -i $i -p F
     done
     rm *.fa
 
     mkdir -p /san/sanvol1/scratch/fr2/blastDb
     cd /cluster/data/fr2/blastDb
     for i in nhr nin nsq; 
     do 
 	echo $i
 	cp *.$i /san/sanvol1/scratch/fr2/blastDb
     done
 
     mkdir -p /cluster/data/fr2/bed/tblastn.hg18KG
     cd /cluster/data/fr2/bed/tblastn.hg18KG
     echo  /san/sanvol1/scratch/fr2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
     wc -l query.lst
 # 495 query.lst
 
    # we want around 100000 jobs
    calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(100000/`wc query.lst | awk "{print \\\$1}"`\)
 
 # 36727/(100000/495) = 181.798650
 
    mkdir -p /cluster/bluearc/fr2/bed/tblastn.hg18KG/kgfa
    split -l 180 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/fr2/bed/tblastn.hg18KG/kgfa/kg
    ln -s /cluster/bluearc/fr2/bed/tblastn.hg18KG/kgfa kgfa
    cd kgfa
    for i in *; do 
      nice pslxToFa $i $i.fa; 
      rm $i; 
      done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut
    ln -s /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cd /cluster/data/fr2/bed/tblastn.hg18KG
    cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
 #ENDLOOP
 '_EOF_'
 
    cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/cluster/bluearc/blast229/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
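 # Try the loosest e-value cutoff first; if blastall fails (e.g. output too
 # large), retry with progressively stricter cutoffs before giving up.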
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
     if /cluster/bin/i386/blastToPsl $f.1 $f.2
     then
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2
 
         if pslCheck -prot $3.tmp                                                  
         then                                                                      
             mv $3.tmp $3                                                          
             rm -f $f.1 $f.2 $f.3 $f.4
         fi
         exit 0                                                                    
     fi                                                                            
 fi                                                                                
 rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
 exit 1
 '_EOF_'
     # << happy emacs
     chmod +x blastSome
     gensub2 query.lst kg.lst blastGsub blastSpec
     exit # back to bash
     
     ssh pk
     cd /cluster/data/fr2/bed/tblastn.hg18KG
     para create blastSpec
 #    para try, check, push, check etc.
 
     para time
 
 # Completed: 101475 of 101475 jobs
 # CPU time in finished jobs:    3924977s   65416.29m  1090.27h   45.43d  0.124 y
 # IO & Wait Time:               1118884s   18648.06m   310.80h   12.95d  0.035 y
 # Average job time:                  50s       0.83m     0.01h    0.00d
 # Longest finished job:            3925s      65.42m     1.09h    0.05d
 # Submission to last job:         46557s     775.95m    12.93h    0.54d
 
     ssh kkstore04
     cd /cluster/data/fr2/bed/tblastn.hg18KG
     tcsh
     mkdir chainRun
     cd chainRun
     cat << '_EOF_' > chainGsub
 #LOOP
 chainOne $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainOne
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
 '_EOF_'
     chmod +x chainOne
     ls -1dS /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
     # do the cluster run for chaining
     ssh kk
     cd /cluster/data/fr2/bed/tblastn.hg18KG/chainRun
     para create chainSpec
     para try, check, push, check etc.
 
 # Completed: 205 of 205 jobs
 # CPU time in finished jobs:       2262s      37.70m     0.63h    0.03d  0.000 y
 # IO & Wait Time:                 40989s     683.15m    11.39h    0.47d  0.001 y
 # Average job time:                 211s       3.52m     0.06h    0.00d
 # Longest finished job:             360s       6.00m     0.10h    0.00d
 # Submission to last job:          1492s      24.87m     0.41h    0.02d
 
     ssh kkstore04
     cd /cluster/data/fr2/bed/tblastn.hg18KG/blastOut
     bash # if using another shell
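     # (intent of the filters below) c60: alignments covering >60% of the query
     # protein (psl cols: 11=qSize, 12=qStart, 13=qEnd); u: a single top-scoring
     # alignment per query via pslUniq; m60: alignments with matches/qSize > 0.60.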
     for i in kg??
     do
        cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
        echo $i
     done
     sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/fr2/bed/tblastn.hg18KG/unliftBlastHg18KG.psl
     cd ..
     pslCheck unliftBlastHg18KG.psl
     liftUp blastHg18KG.psl ../../jkStuff/liftAll.lft warn unliftBlastHg18KG.psl
 
     # load table 
     ssh hgwdev
     cd /cluster/data/fr2/bed/tblastn.hg18KG
     hgLoadPsl fr2 blastHg18KG.psl
 
     # check coverage
     featureBits fr2 blastHg18KG 
 # 19761405 bases of 393312790 (5.024%) in intersection
 
     featureBits fr2 ensGene:cds blastHg18KG  -enrichment
 # ensGene:cds 8.216%, blastHg18KG 5.024%, both 4.401%, cover 53.57%, enrich 10.66x
 
     ssh kkstore04
     rm -rf /cluster/data/fr2/bed/tblastn.hg18KG/blastOut
     rm -rf /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut
 #end tblastn
 
 #########################################################################
 # BLASTZ/CHAIN/NET TetNig1 SWAP (DONE - 2007-01-29 - Hiram)
 ##  Align to fr2 scaffolds,
 ##	results lifted to fr2 chrUn coordinates
     ## Swap to fr2
     mkdir /cluster/data/fr2/bed/blastz.tetNig1.swap
     cd /cluster/data/fr2/bed/blastz.tetNig1.swap
     time doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/tetNig1/bed/blastz.fr2.2007-01-25/DEF \
 	-chainMinScore=2000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=pk -swap > swap.log 2>&1 &
     time doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/tetNig1/bed/blastz.fr2.2007-01-25/DEF \
 	-chainMinScore=2000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-continue=net -bigClusterHub=pk -swap > net_swap.log 2>&1 &
     #	real    40m40.471s
 
     ssh hgwdev
     cd /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25
     time nice -n +19 featureBits tetNig1 chainFr2Link \
 	> fb.tetNig1.chainFr2Link.txt 2>&1 
     #	246828605 bases of 342403326 (72.087%) in intersection
 
     cd /cluster/data/fr2/bed/blastz.tetNig1.swap
     time nice -n +19 featureBits fr2 chainTetNig1Link \
 	> fb.fr2.chainTetNig1.txt 2>&1
     #	247086553 bases of 393312790 (62.822%) in intersection
 
 #########################################################################
 # BLASTZ/CHAIN/NET gasAcu1 swap (DONE - 2007-01-23 - Hiram)
 ##  no chrUn in gasAcu1, and align to fr2 scaffolds,
 ##	results lifted to fr2 chrUn coordinates
     ssh kkstore05
     mkdir /cluster/data/fr2/bed/blastz.gasAcu1.swap
     cd /cluster/data/fr2/bed/blastz.gasAcu1.swap
     time doBlastzChainNet.pl \
 	/cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23/DEF \
 	-chainMinScore=2000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-swap -bigClusterHub=pk > swap.log 2>&1 &
     #	real    24m33.761s
 
     ssh hgwdev
     cd /cluster/data/fr2/bed/blastz.gasAcu1.swap
     time nice -n 19 featureBits fr2 chainGasAcu1Link \
 	> fb.fr2.chainGasAcu1Link.txt 2>&1 &
     # 158383996 bases of 393312790 (40.269%) in intersection
 
 #########################################################################
 ##  BLASTZ/CHAIN/NET to gasAcu1 chrUn - the above swap does not include
 ##	gasAcu1 chrUn - thus its browser would be empty for any fr2 alignments.
 ##	This procedure will get fr2 alignments added to the gasAcu1 browser
 ##	for chrUn
     ssh kkstore02
     mkdir /cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31
     cd /cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31
 
     cat << '_EOF_' > DEF
 # Stickleback vs. Fugu, Stickleback chrUn in contigs only, to fugu contigs
 
 # Try "human-fugu" (more distant, less repeat-killed than mammal) params
 # +M=50:
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
     
 # TARGET: Fugu fr2
 #       Align to the scaffolds, results lifted up to chrUn.sdTrf
 #       coordinates
 SEQ1_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
 SEQ1_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
 SEQ1_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
 SEQ1_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
 SEQ1_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
 SEQ1_CHUNK=20000000
 SEQ1_LIMIT=30
 SEQ1_LAP=10000
 
 # QUERY: Stickleback gasAcu1 chrUn only
 #       chrUn in contigs for this alignment run
 #       The largest is 418,000 bases and there are 5,000 of them.
 SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.2bit
 SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.sizes
 SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.2bit
 SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.sizes
 SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift
 SEQ2_CHUNK=1000000
 SEQ2_LIMIT=30
 SEQ2_LAP=0
 
 BASE=/cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31
 TMPDIR=/scratch/tmp
 '_EOF_'
     #  << happy emacs
 
     time doBlastzChainNet.pl DEF \
 	-chainMinScore=2000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-blastzOutRoot /cluster/bluearc/fr2GasAcu1 \
 	-stop=net -bigClusterHub=pk > do.log 2>&1 &
     #	real    73m13.156s
 
     ##	swap back to gasAcu1
     mkdir /cluster/data/gasAcu1/bed/blastz.fr2.swap
     cd /cluster/data/gasAcu1/bed/blastz.fr2.swap
     time doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31/DEF \
 	-chainMinScore=2000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-swap -stop=chainMerge -bigClusterHub=pk > swap.log 2>&1 &
 
     ## Now, with that chain result in hand, place it manually back in with
     ##	the full chroms chains and re-run the nets and so forth.
 
 #########################################################################
 ## swap oryLat1 results back to fr2 (DONE - 2007-01-24 - Hiram)
     mkdir /cluster/data/fr2/bed/blastz.oryLat1.swap
     cd /cluster/data/fr2/bed/blastz.oryLat1.swap
     time doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/oryLat1/bed/blastz.fr2.2007-01-24/DEF \
 	-chainMinScore=2000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-swap -bigClusterHub=pk > swap.log 2>&1 &
 
     ssh hgwdev
     cd /cluster/data/oryLat1/bed/blastz.fr2.2007-01-24
     time nice -n +19 featureBits oryLat1 chainFr2Link \
 	> fb.oryLat1.chainFr2Link.txt 2>&1
     #	177508958 bases of 700386597 (25.344%) in intersection
     cd /cluster/data/fr2/bed/blastz.oryLat1.swap
     time nice -n +19 featureBits fr2 chainOryLat1Link \
 	> fb.fr2.chainOryLat1Link.txt 2>&1
     #	143996507 bases of 393312790 (36.611%) in intersection
 
 #########################################################################
 ## 5-Way Multiz (DONE - 2007-02-03 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/fr2/bed/multiz5way
     cd /cluster/data/fr2/bed/multiz5way
 
     cp /cluster/data/gasAcu1/bed/multiz8way/8way.nh .
 
     /cluster/bin/phast/tree_doctor \
 	--prune human_hg18,mouse_mm8,chicken_galGal3 8way.nh
 
     #	use the output of that to manually construct this tree.
     #	Arbitrarily set 0.2 distances for this added branch
     #	All other distances remain as specified in the 17way.nh
 
     cat << '_EOF_' > 5way.nh
   (((tetraodon_tetNig1:0.199381,fugu_fr2:0.239894):0.2,
     (stickleback_gasAcu1:0.2,medaka_oryLat1:0.2):0.2):0.292961,
 	zebrafish_danRer4:0.782561);
 '_EOF_'
     # << happy emacs
 
     #	Use this specification in the phyloGif tool:
     #	http://genome.ucsc.edu/cgi-bin/phyloGif
     #	to obtain a gif image for htdocs/images/phylo/fr2_5way.gif
 
     /cluster/bin/phast/all_dists 5way.nh > 5way.distances.txt
     #	Use this output to create the table below
     grep -y fr2 5way.distances.txt | sort -k3,3n
 #
 #	If you can fill in all the numbers in this table, you are ready for
 #	the multiple alignment procedure
 #
 #                         featureBits chainLink measures
 #                                                                chain     chain
 #    distance                      on fr2     on other        minScore  linearGap
 #  1  0.4393 - tetraodon tetNig1   (% 62.822) (% 72.087)       2000     loose
 #  2  0.8399 - medaka oryLat1      (% 36.611) (% 25.344)       2000     loose
 #  3  0.8399 - stickleback gasAcu1 (% 40.269) (% 37.574)       2000     loose
 #  4  1.5156 - zebrafish danRer4   (% 20.585) (%  8.543)       5000     loose
 
     cd /cluster/data/fr2/bed/multiz5way
     #	bash shell syntax here ...
     export H=/cluster/data/fr2/bed
     mkdir mafLinks
     for G in oryLat1 tetNig1 gasAcu1 danRer4
     do
 	mkdir mafLinks/$G
 	if [ ! -d ${H}/blastz.${G}/mafNet ]; then
 	echo "missing directory blastz.${G}/mafNet"
 		exit 255
 	fi
 	ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
     done
 
     #	Copy MAFs to some appropriate NFS server for kluster run
     ssh kkstore02
     mkdir /san/sanvol1/scratch/fr2/multiz5way
     cd /san/sanvol1/scratch/fr2/multiz5way
     time rsync -a --copy-links --progress \
 	/cluster/data/fr2/bed/multiz5way/mafLinks/ .
 
     mkdir penn
     cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
     cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
     cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
 
     # the autoMultiz cluster run, there are only 2 jobs, kki is perfect
     ssh kki
     cd /cluster/data/fr2/bed/multiz5way/
 
     # create species list and stripped down tree for autoMZ
     sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
 	5way.nh > tmp.nh
     echo `cat tmp.nh` > tree-commas.nh
     echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
     sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
 
     mkdir run maf
     cd run
 
     #	NOTE: you need to set the db and multiz dirname properly in this script
     cat > autoMultiz << '_EOF_'
 #!/bin/csh -ef
 set db = fr2
 set c = $1
 set maf = $2
 set binDir = /san/sanvol1/scratch/$db/multiz5way/penn
 set tmp = /scratch/tmp/$db/multiz.$c
 set pairs = /san/sanvol1/scratch/$db/multiz5way
 rm -fr $tmp
 mkdir -p $tmp
 cp ../{tree.nh,species.lst} $tmp
 pushd $tmp
 foreach s (`cat species.lst`)
     set in = $pairs/$s/$c.maf
     set out = $db.$s.sing.maf
     if ($s == $db) then
 	continue
     endif
     if (-e $in.gz) then
 	zcat $in.gz > $out
     else if (-e $in) then
 	cp $in $out
     else
 	echo "##maf version=1 scoring=autoMZ" > $out
     endif
 end
 set path = ($binDir $path); rehash
 $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
 popd
 cp $tmp/$c.maf $maf
 rm -fr $tmp
 '_EOF_'
     # << happy emacs
     chmod +x autoMultiz
 
 cat  << '_EOF_' > template
 #LOOP
 autoMultiz $(root1) {check out line+ /cluster/data/fr2/bed/multiz5way/maf/$(root1).maf}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     awk '{print $1}' /cluster/data/fr2/chrom.sizes > chrom.lst
     gensub2 chrom.lst single template jobList
     para create jobList
     # 2 jobs
     para try ... check ... push ... etc ...
 # Completed: 2 of 2 jobs
 # CPU time in finished jobs:       7246s     120.77m     2.01h    0.08d  0.000 y
 # IO & Wait Time:                   357s       5.94m     0.10h    0.00d  0.000 y
 # Average job time:                3802s      63.36m     1.06h    0.04d
 # Longest finished job:            7601s     126.68m     2.11h    0.09d
 # Submission to last job:          7601s     126.68m     2.11h    0.09d
 
     #	combine results into a single file for loading and gbdb reference
     ssh kkstore02
     cd /cluster/data/fr2/bed/multiz5way
     nice -n +19 catDir maf > multiz5way.maf
     #	makes a 1.3 Gb file:
     #	-rw-rw-r--  1 1341786986 Feb  3 19:52 multiz5way.maf
 
 ############################################################################
 # ANNOTATE MULTIZ5WAY MAF AND LOAD TABLES (DONE - 2007-02-03 - Hiram)
 ## re-done 2007-03-27 with corrected nBeds and sizes files - Hiram
     ssh kolossus
     mkdir /cluster/data/fr2/bed/multiz5way/anno
     cd /cluster/data/fr2/bed/multiz5way/anno
     mkdir maf run
     cd run
     rm -f sizes nBeds
     twoBitInfo -nBed /cluster/data/fr2/fr2.{2bit,N.bed}
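     #	(each of the other species' ${DB}.N.bed files is assumed to exist
     #	already; if one is missing, it can be made the same way, e.g.
     #	twoBitInfo -nBed /cluster/data/${DB}/${DB}.{2bit,N.bed})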
 
     for DB in `cat /cluster/data/fr2/bed/multiz5way/species.lst`
     do
         ln -s  /cluster/data/${DB}/chrom.sizes ${DB}.len
         ln -s  /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
 	echo ${DB}.bed  >> nBeds
 	echo ${DB}.len  >> sizes
 	echo $DB
     done
 
     echo '#!/bin/csh -ef' > jobs.csh
     echo date >> jobs.csh
     # do smaller jobs first so you can see some progress immediately:
     for F in `ls -1rS ../../maf/*.maf`
     do
      echo mafAddIRows -nBeds=nBeds -sizes=sizes $F \
       /cluster/data/fr2/fr2.2bit ../maf/`basename $F` >> jobs.csh
      echo "echo $F" >> jobs.csh
     done 
     echo date >> jobs.csh
     chmod +x jobs.csh
     time nice -n +19 ./jobs.csh > jobs.log 2>&1 &
     #	to watch progress;
     tail -f jobs.log
     #	real    165m43.716s
     # Load anno/maf
     ssh hgwdev
     cd /cluster/data/fr2/bed/multiz5way/anno/maf
     mkdir -p /gbdb/fr2/multiz5way/anno/maf
     ln -s /cluster/data/fr2/bed/multiz5way/anno/maf/*.maf \
       /gbdb/fr2/multiz5way/anno/maf
     time nice -n +19 hgLoadMaf \
 	-pathPrefix=/gbdb/fr2/multiz5way/anno/maf fr2 multiz5way
     #	Loaded 1469786 mafs in 2 files from /gbdb/fr2/multiz5way/anno/maf
     #	real    0m41.123s
 
     # Do the computation-intensive part of hgLoadMafSummary on a workhorse 
     # machine and then load on hgwdev:
     ssh kolossus
     cd /cluster/data/fr2/bed/multiz5way/anno/maf
     time cat *.maf | \
     	nice -n +19 hgLoadMafSummary fr2 -minSize=30000 -mergeGap=1500 \
 	-maxSize=200000 -test multiz5waySummary stdin
     #	Created 146928 summary blocks from 3159326 components
     #	and 1469786 mafs from stdin
 
     #	real    0m58.171s
 
     ssh hgwdev
     cd /cluster/data/fr2/bed/multiz5way/anno/maf
     sed -e 's/mafSummary/multiz5waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
       > /tmp/multiz5waySummary.sql
     time nice -n +19 hgLoadSqlTab fr2 multiz5waySummary \
 	/tmp/multiz5waySummary.sql multiz5waySummary.tab
     #	real    0m1.941
 
 #######################################################################
 # MULTIZ5WAY MAF FRAMES (DONE - 2007-02-03 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/fr2/bed/multiz5way/frames
     cd /cluster/data/fr2/bed/multiz5way/frames
     mkdir genes
     # The following is adapted from the gasAcu1 build procedure
 
     #------------------------------------------------------------------------
     # get the genes for all genomes
     # mRNAs with CDS.  single select to get cds+psl, then split that up and
     # create genePred
     # using refGene for danRer4
     for qDB in danRer4
     do
 	geneTbl=refGene
       echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
       hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/$qDB.tmp.gz
       mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
       rm -f $tmpExt
     done
 
     # using genscan for tetNig1
     # using ensGene for gasAcu1, oryLat1 and fr2
     # genePreds; (must keep only the first 10 columns for knownGene)
     for qDB in gasAcu1 oryLat1 fr2 tetNig1
     do
       if [ $qDB = "gasAcu1" -o $qDB = "oryLat1" -o $qDB = "fr2" ]; then
         geneTbl=ensGene
       elif [ $qDB = "tetNig1" ]; then
         geneTbl=genscan
       else
         exit 255
       fi
       echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
       hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/$qDB.tmp.gz
       mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
       rm -f $tmpExt
     done
 
     ###
     
     ssh kkstore02
     cd /cluster/data/fr2/bed/multiz5way/frames
     time cat ../maf/*.maf | nice -n +19 genePredToMafFrames fr2 stdin stdout fr2 genes/fr2.gp.gz gasAcu1 genes/gasAcu1.gp.gz oryLat1 genes/oryLat1.gp.gz tetNig1 genes/tetNig1.gp.gz danRer4 genes/danRer4.gp.gz | gzip > multiz5way.mafFrames.gz
     #	real    0m52.606
 
     ssh hgwdev
     cd /cluster/data/fr2/bed/multiz5way/frames
     time nice -n +19 hgLoadMafFrames fr2 multiz5wayFrames \
 	multiz5way.mafFrames.gz
     #	real    0m20.580s
 
 #########################################################################
 # MULTIZ5WAY DOWNLOADABLES (DONE - 2007-02-05 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/fr2/bed/multiz5way/mafDownloads
     cd /cluster/data/fr2/bed/multiz5way
 
     # upstream mafs 
     #	rebuilt 2007-12-21 to fix difficulty in mafFrags when species.lst
     #	did not have fr2 as the first one
     #	There isn't any refGene table, using ensGene instead
 for S in 1000 2000 5000
 do
     echo "making upstream${S}.maf"
     nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 fr2 \
         ensGene:upstream:${S} -fa=/dev/null -bed=stdout \
         | perl -wpe 's/_up[^\t]+/\t0/' \
         | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags fr2 multiz5way \
                 stdin stdout -orgs=species.lst \
         | gzip -c > mafDownloads/ensGene.upstream${S}.maf.gz
     echo "done ensGene.upstream${S}.maf.gz"
 done
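 
     # (optional, a sketch) spot-check one of the upstream files by counting
     # alignment blocks:
     zcat mafDownloads/ensGene.upstream1000.maf.gz | grep -c '^a '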
 
     ssh kkstore05
     cd /cluster/data/fr2/bed/multiz5way
     ## re-done 2007-03-27 after correction to annotation step - Hiram
     time for M in anno/maf/chr*.maf
     do
 	B=`basename $M`
 	nice -n +19 gzip -c ${M} > mafDownloads/${B}.gz
 	echo ${B}.gz done
     done
     #	real    4m39.440s
 
     cd mafDownloads
     nice -n +19 md5sum *.maf.gz > md5sum.txt
     # Make a README.txt
 
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/fr2/multiz5way
     cd /usr/local/apache/htdocs/goldenPath/fr2/multiz5way
     ln -s /cluster/data/fr2/bed/multiz5way/mafDownloads/{*.gz,*.txt} .
 
 ############################################################################
 # CREATE CONSERVATION WIGGLE WITH PHASTCONS
 #		(DONE - 2007-02-05 - Hiram)
 
 # Estimate phastCons parameters
     ssh kkstore05
     mkdir /cluster/data/fr2/bed/multiz5way/cons
     cd /cluster/data/fr2/bed/multiz5way/cons
 
     # Create a starting-tree.mod based on one 25,000,000-base window of chrUn
     time nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrUn.maf \
 	--refseq ../../../Un/chrUn.fa --in-format MAF \
 	--windows 25000000,1000 --out-format SS \
 	--between-blocks 5000 --out-root s1
     #	real    4m27.989s
 
 
     time nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS \
 	s1.174992629-199991578.ss \
 	--tree "(((tetNig1,fr2),(gasAcu1,oryLat1)),danRer4)" \
 	--out-root starting-tree
     #	As an experiment, ran all of these ss files through this prediction
     #	and computed stats over the summed C and G frequencies of each result:
     ## min    Q1 median   Q3   max     mean  N   sum     stddev
     # 0.45 0.457 0.463 0.469 0.479 0.461941 17 7.853 0.00759621
     #	Using the one closest to the mean: s1.174992629-199991578.ss
 
     rm s1.*.ss
     # add up the C and G:
     grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
     #	0.463
     #	This 0.463 is used in the --gc argument below
 
     ## the fa files are needed for the sequence and they are created during
     #	this loop if they haven't been done before
     # Create SS files on san filesystem
     ssh kkstore05
     mkdir -p  /san/sanvol1/scratch/fr2/cons/ss
     cd  /san/sanvol1/scratch/fr2/cons/ss
     time for C in \
 	`awk '{print $1}' /cluster/data/fr2/chrom.sizes | sed -e "s/chr//"`
     do
 	mkdir -p chr${C}
 	echo msa_split $C
 	nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split \
 	    /cluster/data/fr2/bed/multiz5way/maf/chr${C}.maf \
 	    --refseq /cluster/data/fr2/${C}/chr${C}.fa \
 	    --in-format MAF --windows 2500000,0 --between-blocks 5000 \
 	    --out-format SS --out-root chr${C}/chr${C}
     done &
     #	real    4m2.736s
 
     # Create a random list of 50 1 mb regions
     cd /san/sanvol1/scratch/fr2/cons/ss
     ls -1l chr*/chr*.ss \
 	| awk '$5 > 4000000 {print $9;}' \
 	| randomLines stdin 50 ../randomSs.list
 
     # Set up parasol directory to calculate trees on these 50 regions
     ssh pk
     mkdir /san/sanvol1/scratch/fr2/cons/treeRun1
     cd /san/sanvol1/scratch/fr2/cons/treeRun1
     mkdir tree log
 
     #	When tuning these parameters, come back to here to recalculate
     # Create little script that calls phastCons with right arguments
     #	--target-coverage of 0.20 is about right for mouse, will be
     #	tuned exactly below
     cat > makeTree.csh << '_EOF_'
 #!/bin/csh -fe
 set C=$1:h
 mkdir -p log/${C} tree/${C}
     /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
       /cluster/data/fr2/bed/multiz5way/cons/starting-tree.mod \
       --gc 0.463 --nrates 1,1 --no-post-probs --ignore-missing \
       --expected-length 10 --target-coverage 0.20 \
       --quiet --log log/$1 --estimate-trees tree/$1
 '_EOF_'
     #	<< happy emacs
     chmod a+x makeTree.csh
 
     # Create gensub file
     cat > template << '_EOF_'
 #LOOP
 ./makeTree.csh $(path1)
 #ENDLOOP
 '_EOF_'
     #	<< happy emacs
 
     # Make cluster job and run it
     gensub2 ../randomSs.list single template jobList
     para create jobList
     para try/push/check/etc
 # Completed: 50 of 50 jobs
 # CPU time in finished jobs:       5204s      86.74m     1.45h    0.06d  0.000 y
 # IO & Wait Time:                   204s       3.39m     0.06h    0.00d  0.000 y
 # Average job time:                 108s       1.80m     0.03h    0.00d
 # Longest finished job:             138s       2.30m     0.04h    0.00d
 # Submission to last job:           141s       2.35m     0.04h    0.00d
 
     # Now combine parameter estimates.  We can average the .mod files
     # using phyloBoot.  This must be done separately for the conserved
     # and nonconserved models
     ssh pk
     cd /san/sanvol1/scratch/fr2/cons/treeRun1
     ls -1 tree/chr*/*.cons.mod > cons.list
     /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
 	--output-average ave.cons.mod > cons_summary.txt 2>&1 &
     ls -1 tree/chr*/*.noncons.mod > noncons.list
     /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
 	--output-average ave.noncons.mod > noncons_summary.txt
     cp -p ave.*.mod ..
     cd ..
     cp -p ave.*.mod /cluster/data/fr2/bed/multiz5way/cons
 
     #	measuring entropy
     #	consEntropy <target coverage> <expected length>
     #		 ave.cons.mod ave.noncons.mod --NH 9.78
     #	never stops with the --NH argument
     time /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.7834 \
 	0.20 10 ave.{cons,noncons}.mod
 ## 0.20 10
 ( Solving for new omega: 10.000000 10.467305 10.449210 10.449184 )
 
 Transition parameters: gamma=0.200000, omega=10.000000, mu=0.100000, nu=0.025000
 Relative entropy: H=0.903779 bits/site
 Expected min. length: L_min=10.726004 sites
 Expected max. length: L_max=6.735382 sites
 Phylogenetic information threshold: PIT=L_min*H=9.693936 bits
 Recommended expected length: omega=10.449184 sites (for L_min*H=9.783400)
 
     ssh pk
     # Create cluster dir to do main phastCons run
     mkdir /san/sanvol1/scratch/fr2/cons/consRun1
     cd /san/sanvol1/scratch/fr2/cons/consRun1
     mkdir ppRaw bed
 
     # Create script to run phastCons with right parameters
     #	This job is I/O intensive in its output files, thus it is all
     #	working over in /scratch/tmp/
     cat > doPhast << '_EOF_'
 #!/bin/csh -fe
 mkdir /scratch/tmp/${2}
 cp -p ../ss/${1}/${2}.ss ../ave.{cons,noncons}.mod /scratch/tmp/${2}
 pushd /scratch/tmp/${2} > /dev/null
 /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
    --expected-length 10 --target-coverage 0.20 --quiet \
 	--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
 popd > /dev/null
 mkdir -p ppRaw/${1}
 mkdir -p bed/${1}
 mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
 mv /scratch/tmp/${2}/${2}.bed bed/${1}
 rm /scratch/tmp/${2}/ave.{cons,noncons}.mod
 rm /scratch/tmp/${2}/${2}.ss
 rmdir /scratch/tmp/${2}
 '_EOF_'
     # << happy emacs
     chmod a+x doPhast
 
     #	root1 == chrom name, file1 == ss file name without .ss suffix
     # Create gsub file
     cat > template << '_EOF_'
 #LOOP
 ./doPhast $(root1) $(file1)
 #ENDLOOP
 '_EOF_'
     #	<< happy emacs
 
     # Create parasol batch and run it
     ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
 
     gensub2 in.list single template jobList
     para create jobList
     para try/check/push/etc.
     #	These jobs are very fast and very I/O intensive, even on the san
     #	they will hang it up as they work at full tilt.
 # Completed: 162 of 162 jobs
 # CPU time in finished jobs:       1501s      25.02m     0.42h    0.02d  0.000 y
 # IO & Wait Time:                   924s      15.40m     0.26h    0.01d  0.000 y
 # Average job time:                  15s       0.25m     0.00h    0.00d
 # Longest finished job:              23s       0.38m     0.01h    0.00d
 # Submission to last job:            44s       0.73m     0.01h    0.00d
 
     # combine predictions and transform scores to be in 0-1000 interval
     #	it uses a lot of memory, so on kolossus:
     ssh kolossus
     cd /san/sanvol1/scratch/fr2/cons/consRun1
     #	The sed's and the sort get the file names in chrom,start order
     #	You might like to verify it is correct by first looking at the
     #	list it produces:
     find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
 	| sort -k7,7 -k9,9n \
 	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less
     #	if that looks right, then let it run:
     find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
 	| sort -k7,7 -k9,9n \
 	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
 	| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
 	| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved5Way.bed
     #	~ 16 seconds
     cp -p mostConserved5Way.bed /cluster/data/fr2/bed/multiz5way
 
     # Figure out how much is actually covered by the bed file as so:
     #	Get the non-n genome size from faSize on all chroms:
     ssh kkstore05
     cd /cluster/data/fr2
     faSize Un/chrUn.fa M/chrM.fa
     #	400525790 bases (49313545 N's 351212245 real 284435760
     #	upper 66776485 lower) in 2 sequences in 2 files
 
     cd /san/sanvol1/scratch/fr2/cons/consRun1
     #	The 351212245 comes from the non-n genome as counted above.
     awk '
 {sum+=$3-$2}
 END{printf "%% %.2f = 100.0*%d/351212245\n",100.0*sum/351212245,sum}' \
 	mostConserved5Way.bed
     #	% 15.82 = 100.0*55573943/351212245 --exp-len 10 --tar-cov 0.20
 
     #	Aiming for 70% coverage in
     #	the following featureBits measurement on CDS:
     # Beware of negative scores when the coverage target is set too high.  The
     # lodToBedScore step will output an error on any negative scores.
 
     HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits fr2 \
 	-enrichment ensGene:cds mostConserved5Way.bed
     #	--expected-length 10 --target-coverage 0.20 fr2
     #	ensGene:cds 8.216%, mostConserved5Way.bed 14.130%, both 5.188%, cover
     #	63.14%, enrich 4.47x
 
     # Load most conserved track into database
     ssh hgwdev
     cd /cluster/data/fr2/bed/multiz5way
     # ended up using the set: --expected-length 10 --target-coverage 0.20
     time nice -n +19 hgLoadBed -strict fr2 phastConsElements5way \
 	mostConserved5Way.bed
     #	Loaded 1140341 elements of size 5
     #	real    0m28.545
 
     #	should measure the same as above
     time nice -n +19 \
 	featureBits fr2 -enrichment ensGene:cds phastConsElements5way
     #	At: --expected-length 10 --target-coverage 0.20 fr2
     #	ensGene:cds 8.216%, phastConsElements5way 14.130%, both 5.188%, cover
     #	63.14%, enrich 4.47x
 
     # Create merged posterior probability file and wiggle track data files
     ssh pk
     cd /san/sanvol1/scratch/fr2/cons/consRun1
     # the sed business gets the names sorted by chromName, chromStart
     #	so that everything goes in numerical order into wigEncode
     #	This was verified above to be correct
     time nice -n +19 find ./ppRaw -type f \
 	| sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
 	| sort -k7,7 -k9,9n \
 	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
 	    | $HOME/bin/$MACHTYPE/wigEncode -noOverlap stdin \
 		phastCons5.wig phastCons5.wib
     #	Converted stdin, upper limit 1.00, lower limit 0.00
     #	real    3m49.865s
     #	-rw-rw-r--   1 288320445 Feb  5 13:51 phastCons5.wib
     #	-rw-rw-r--   1  37122305 Feb  5 13:51 phastCons5.wig
     time nice -n +19 cp -p phastCons5.wi? /cluster/data/fr2/bed/multiz5way/
 
     #	prepare compressed copy of ascii data values for downloads
     ssh pk
     cd /san/sanvol1/scratch/fr2/cons/consRun1
     cat << '_EOF_' > gzipAscii.sh
 #!/bin/sh
 
 TOP=`pwd`
 export TOP
 
 mkdir -p phastCons5Scores
 
 for D in ppRaw/chr*
 do
     C=${D/ppRaw\/}
     out=phastCons5Scores/${C}.data.gz
     echo "========================== ${C} ${D}"
     find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
 	| sort -k7,7 -k9,9n \
 	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
 	    gzip > ${out}
 done
 '_EOF_'
     #	<< happy emacs
     chmod +x gzipAscii.sh
     time nice -n +19 ./gzipAscii.sh
     #	real    5m21.400s
 
     #	copy them for downloads
     ssh kkstore05
     #	this directory is actually a symlink from store9 to store8 to
     #	avoid the data full problem on store9
     mkdir /cluster/data/fr2/bed/multiz5way/phastCons5Scores
     cd /cluster/data/fr2/bed/multiz5way/phastCons5Scores
     cp -p  /san/sanvol1/scratch/fr2/cons/consRun1/phastCons5Scores/* .
     # make a README.txt file here, and an md5sum
 
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/fr2/phastCons5Scores
     cd /usr/local/apache/htdocs/goldenPath/fr2/phastCons5Scores
     ln -s /cluster/data/fr2/bed/multiz5way/phastCons5Scores/* .
 
     # Load gbdb and database with wiggle.
     ssh hgwdev
     cd /cluster/data/fr2/bed/multiz5way
     ln -s `pwd`/phastCons5.wib /gbdb/fr2/wib/phastCons5.wib
     # ended up using the set: --expected-length 10 --target-coverage 0.20
     time nice -n +19 hgLoadWiggle fr2 phastCons5 phastCons5.wig
     #	real    0m9.256s
 
     #  Create histogram to get an overview of all the data
     ssh hgwdev
     cd /cluster/data/fr2/bed/multiz5way
     time nice -n +19 hgWiggle -doHistogram \
 	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
 	    -db=fr2 phastCons5 > histogram.data 2>&1
     #	real    0m30.744
 
     #	create plot of histogram:
 
     cat << '_EOF_' | gnuplot > histo.png
 set terminal png small color \
         xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000
 set size 1.4, 0.8
 set key left box
 set grid noxtics
 set grid ytics
 set title " Fugu fr2 Histogram phastCons5 track"
 set xlabel " phastCons5 score - --expected-length 10 --target-coverage 0.20"
 set ylabel " Relative Frequency"
 set y2label " Cumulative Relative Frequency (CRF)"
 set y2range [0:1]
 set y2tics
 set yrange [0:0.02]
 
 plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
         "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
 '_EOF_'
     #	<< happy emacs
 
     display histo.png &
 
     #	The mostConserved can also be characterized by a histogram
     awk '{print $3-$2}' mostConserved5Way.bed > mostCons.txt
     textHistogram -verbose=2 -autoScale=1000 -pValues mostCons.txt \
 	> mostCons.histo.txt
 
     cat << '_EOF_' | gnuplot > mostCons.png
 set terminal png small color \
         xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000
 set size 1.4, 0.8
 set key left box
 set grid noxtics
 set grid ytics
 set title " Fugu fr2 histogram: lengths of mostConserved track elements"
 set xlabel " mostConserved element length - --expected-length 10 --target-coverage 0.20"
 set ylabel " # of elements at this length"
 set y2label " Cumulative Relative Frequency (CRF)"
 set y2range [0:1]
 set xrange [0:200]
 set y2tics
 set boxwidth 2
 set style fill solid
 
 plot "mostCons.histo.txt" using 2:3 title " # of elements" with boxes, \
         "mostCons.histo.txt" using 2:6 axes x1y2 title " CRF" with lines
 '_EOF_'
 #	<< happy emacs
 
 ############################################################################
 ##  Set a more interesting default position, location of CABIN1 gene
 ##	DONE - 2007-02-08 - Hiram
     ssh hgwdev
     hgsql -e \
 'update dbDb set defaultPos="chrUn:29,305,920-29,330,760" where name = "fr2";' \
 	-h genome-testdb hgcentraltest
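     #	verify the new default position (sketch):
     hgsql -h genome-testdb -e \
 	'select name,defaultPos from dbDb where name="fr2";' hgcentraltest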
 
 ############################################################################
 ##  DOWNLOADS - (DONE - 2007-02-12 - 2007-02-16 - Hiram)
     ssh hgwdev
     cd /cluster/data/fr2
     ~/kent/src/hg/utils/automation/makeDownloads.pl fr2 \
 	> makeDownloads.out 2>&1
     #	Doesn't work due to missing RepeatMasker outputs
     #	Create WindowMasker separate files by chrom, for downloads
     ssh kkstore05
     cd /cluster/data/fr2/bed/WindowMasker.2007-01-22
     #	This name change here is so the names created by splitFileByColumn
     #	will be reasonable
     zcat windowmasker.sdust.bed.gz > chr.fr2.WMSdust.bed
     splitFileByColumn chr.fr2.WMSdust.bed chrWM
     #	Creating chrWM/chr1.fr2.WMSdust.bed
     #	Creating chrWM/chr2.fr2.WMSdust.bed
     #	... etc ...
     cd chrWM
     tar cvzf ../chromWMSdust.bed.tar.gz *.bed
     cd ..
     #	Verify this process didn't destroy anything:
     cat chrWM/*.bed | awk '{sum += $3-$2}END{printf "total size: %d\n",sum}'
     #	total size: 115825307
     zcat windowmasker.sdust.bed.gz \
 	| awk '{sum += $3-$2}END{printf "total size: %d\n",sum}'
     #	total size: 115825307
     #	deliver to bigZips
     cd /cluster/data/fr2/goldenPath/bigZips
     ln -s \
     /cluster/data/fr2/bed/WindowMasker.2007-01-22/chromWMSdust.bed.tar.gz .
     #	remove the chromOut.tar.gz file and re-make the md5sum.txt
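     #	(the remove command itself was not recorded; presumably:)
     rm -f chromOut.tar.gz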
     md5sum *gz > md5sum.txt
     #	go back to simpleRepeat and attempt to make a chrM.bed - it turns out
     #	to be an empty result, so leave an empty file there.  Then, before
     #	running makeDownloads.pl again, create two empty .out files so the
     #	script can get past the missing RepeatMasker outputs:
     cd /cluster/data/fr2
     touch Un/chrUn.fa.out
     touch M/chrM.fa.out
     ~/kent/src/hg/utils/automation/makeDownloads.pl fr2 \
 	> makeDownloads.out 2>&1
     cd goldenPath/bigZips
     ln -s \
 	/cluster/data/fr2/bed/WindowMasker.2007-01-22/chromWMSdust.bed.tar.gz .
     # get GenBank native mRNAs
     ssh hgwdev
     cd /cluster/data/genbank
     ./bin/i386/gbGetSeqs -db=fr2 -native \
 	GenBank mrna /cluster/data/fr2/goldenPath/bigZips/mrna.fa
     # get GenBank xeno mRNAs
     ./bin/i386/gbGetSeqs -db=fr2 -xeno \
 	GenBank mrna /cluster/data/fr2/goldenPath/bigZips/xenoMrna.fa
     ssh kkstore05
     cd /cluster/data/fr2/goldenPath/bigZips
     gzip mrna.fa xenoMrna.fa
     md5sum *.gz > md5sum.txt
     #	Edit the README.txt file to be correct
     ssh hgwdev
     cd /usr/local/apache/htdocs/goldenPath/fr2/bigZips
     ln -s /cluster/data/fr2/goldenPath/bigZips/* .
 
 ############################################################################
 ## Fugu Photograph - obtained from Byrappa Venkatesh in email 2007-02-15
 ##		(DONE - 2007-02-16 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/fr2/photograph
     cd /cluster/data/fr2/photograph
     ## original: Byrappa.Venkatesh.Fugu.jpg
     identify By*
     #	Byrappa.Venkatesh.Fugu.jpg JPEG 700x286 DirectClass 43kb 0.000u 0:01
     convert -quality 80 -geometry 300x200 Byrappa.Venkatesh.Fugu.jpg \
 	Takifugu_rubripes.jpg
     ##  check this file into the browser/images CVS source tree and
     ##	copy to /usr/local/apache/htdocs/images
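     ##	a sketch of those two steps (the CVS working-copy location
     ##	~/browser/images is an assumption):
     cp -p Takifugu_rubripes.jpg /usr/local/apache/htdocs/images/
     cp -p Takifugu_rubripes.jpg ~/browser/images/
     cd ~/browser/images
     cvs add -kb Takifugu_rubripes.jpg
     cvs commit -m "Fugu photograph for fr2" Takifugu_rubripes.jpg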
 
 ##########################################################################
 ## RepeatMasker run to cover all bases (DONE - 2007-03-07 - Hiram)
     ssh kkstore02
     mkdir /cluster/data/fr2/bed/RepeatMasker
     cd /cluster/data/fr2/bed/RepeatMasker
     time nice -n +19 doRepeatMasker.pl -verbose=2 -bigClusterHub=kk \
 	-buildDir=/cluster/data/fr2/bed/RepeatMasker fr2 > do.log 2>&1
 
 ###########################################################################
 ## Chicken/Fugu chain/net swap - (DONE - 2007-03-12 - Hiram)
     mkdir /cluster/data/fr2/bed/blastz.galGal3.swap
     cd /cluster/data/fr2/bed/blastz.galGal3.swap
     time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \
 	/cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap > swap.log  2>&1 &
     time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \
 	/cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=net -swap > swap_net.log  2>&1 &
     #	real    3m1.239s
     cat fb.fr2.chainGalGal3Link.txt
     #	36175581 bases of 393312790 (9.198%) in intersection
 
 ###########################################################################
 # Create liftover fr1 to fr2 (DONE - 2007-04-09 - Hiram)
     ssh kkstore02
     mkdir /cluster/data/fr2/bed/blat.fr1
     cd /cluster/data/fr2/bed/blat.fr1
 
     time nice -n +19 doSameSpeciesLiftOver.pl fr1 fr2 -bigClusterHub pk \
 	-buildDir=/cluster/data/fr2/bed/blat.fr1 > do.log 2>&1
 
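     #	quick spot check of the resulting chain (sketch; the BED coordinates
     #	here are made up - use any region known to exist in fr1):
     printf "chrUn\t1000000\t1001000\tspotCheck\n" > /tmp/fr1.spot.bed
     liftOver /tmp/fr1.spot.bed fr1ToFr2.over.chain.gz \
 	/tmp/fr2.spot.bed /tmp/fr1.unMapped.bed
     cat /tmp/fr2.spot.bed /tmp/fr1.unMapped.bed
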
     cp -p fr1ToFr2.over.chain.gz ../liftOver
     cd ../liftOver
     md5sum *.gz > ../../goldenPath/liftOver/md5sum.txt
     ssh hgwdev
     cd /usr/local/apache/htdocs/goldenPath/fr2/liftOver
     ln -s /cluster/data/fr2/bed/liftOver/fr1ToFr2.over.chain.gz .
 
 ##############################################################################
 # SWAP DANRER5 BLASTZ RESULT TO CREATE DANRER5 CHAINS AND NETS TRACKS, 
 # AXTNET, MAFNET AND ALIGNMENT DOWNLOADS 
 # (DONE, 2007-09-19 and 2007-09-22, hartera)
     
     ssh kkstore02
     mkdir /cluster/data/fr2/bed/blastz.swap.danRer5
     cd /cluster/data/fr2/bed/blastz.swap.danRer5
     # blastz parameters used to align fr2 as query to danRer5 as target:
     # BLASTZ_H=2500
     # BLASTZ_M=50
     # BLASTZ_Q=/cluster/data/blastz/HoxD55.q
     # Results for fr2 blastz on danRer5 are in:
     # /cluster/data/danRer5/bed/blastz.fr2.2007-09-18
     time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
         -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
         -qRepeats=windowmaskerSdust -swap \
         /cluster/data/danRer5/bed/blastz.fr2.2007-09-18/DEF \
         >& swap.log &
     # 0.139u 0.098s 14:09.59 0.0%     0+0k 0+0io 0pf+0w
 
     ssh hgwdev
     cat /cluster/data/fr2/bed/blastz.danRer5.swap/fb.fr2.chainDanRer5Link.txt
     # 78259559 bases of 393312790 (19.898%) in intersection
     # look at coverage of ensGene CDS since there is no native RefSeq
     # track for fugu, fr2.
     featureBits fr2 ensGene:cds chainDanRer5Link -enrichment
 # ensGene:cds 8.216%, chainDanRer5Link 19.898%, both 7.130%, cover 86.78%,
 # enrich 4.36x 
     featureBits fr2 ensGene:cds chainDanRer4Link -enrichment
 # ensGene:cds 8.216%, chainDanRer4Link 20.585%, both 7.030%, cover 85.56%,
 # enrich 4.16x 
     featureBits fr2 ensGene:cds netDanRer5 -enrichment
 # ensGene:cds 8.216%, netDanRer5 66.051%, both 7.845%, cover 95.48%, 
 # enrich 1.45x
     featureBits fr2 ensGene:cds netDanRer4 -enrichment
 # ensGene:cds 8.216%, netDanRer4 65.374%, both 7.766%, cover 94.51%, 
 # enrich 1.45x
     # clean up a little (2007-09-22, hartera) 
     ssh kkstore02
     cd /cluster/data/fr2/bed
     mv ./blastz.swap.danRer5/swap.log ./blastz.danRer5.swap
     rm -r blastz.swap.danRer5
     ln -s blastz.danRer5.swap blastz.danRer5
 ############################################################################
 # TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)
 
 Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
 
 see doc/builds.txt for specific details.
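
 a hedged sketch of how a tagged build like this is typically checked out and
 run (that the default make target builds and loads the tracks is an
 assumption, not confirmed here):

    svn co svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
    cd vertebrate.2008-05-20
    make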
 ############################################################################
 ############################################################################
 # TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)
 
 Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
 
 see doc/builds.txt for specific details.
 
 ############################################################################
 # SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-27 - Hiram)
     ssh kkstore04	# not too important since everything moved to the hive
     screen	# use a screen to control this job
     cd /cluster/data/oryLat2/bed/blastz.fr2.2008-08-25
     cat fb.oryLat2.chainFr2Link.txt
     #	180945351 bases of 700386597 (25.835%) in intersection
 
     mkdir /cluster/data/fr2/bed/blastz.oryLat2.swap
     cd /cluster/data/fr2/bed/blastz.oryLat2.swap
     time doBlastzChainNet.pl -verbose=2 -swap \
 	/cluster/data/oryLat2/bed/blastz.fr2.2008-08-25/DEF \
 	-chainMinScore=3000 -chainLinearGap=medium \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
     #	real    24m32.826s
     cat fb.fr2.chainOryLat2Link.txt
     #	153621820 bases of 393312790 (39.058%) in intersection
 
 ############################################################################
+############################################################################
+# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)
+
+Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
+by a single Makefile. This is available from:
+   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
+
+see doc/builds.txt for specific details.
+############################################################################