src/hg/makeDb/doc/bosTau4.txt 1.23
1.23 2010/04/01 17:33:10 hiram
never got susScr1 to work finished with susScr2 lastz
Index: src/hg/makeDb/doc/bosTau4.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/bosTau4.txt,v
retrieving revision 1.22
retrieving revision 1.23
diff -b -B -U 1000000 -r1.22 -r1.23
--- src/hg/makeDb/doc/bosTau4.txt 25 Jan 2010 19:16:14 -0000 1.22
+++ src/hg/makeDb/doc/bosTau4.txt 1 Apr 2010 17:33:10 -0000 1.23
@@ -1,1891 +1,1944 @@
# for emacs: -*- mode: sh; -*-
# Bos Taurus -- Baylor Release 4.0 (Oct 4 2007)
# "$Id$"
#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2008-03-07 - Hiram)
ssh kkstore06
mkdir /cluster/store3/bosTau4
ln -s /cluster/store3/bosTau4 /cluster/data/bosTau4
mkdir /cluster/data/bosTau4/baylor
cd /cluster/data/bosTau4/baylor
for F in README.Btau20070913.txt Contigs/*
do
wget --timestamping \
"ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20070913-freeze/${F}"
done
mkdir chroms
cd chroms
wget --timestamping \
"ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20070913-freeze/LinearScaffolds/*"
# Looking up chrM in Entrez nucleotide search,
# from bosTau3 note: NC_006853 GI:60101824
# search for Bos taurus mitochondrion complete genome
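# to eyeball that record from the command line (a sketch only, not part of
# the build -- makeGenomeDb.pl fetches the mitoAcc itself), NCBI efetch works:
wget -O NC_006853.fa \
"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NC_006853&rettype=fasta&retmode=text"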
# Note, their Contigs files are from their 2006 release.
# Their chrom files are new to this release.
# fixup the Chr case names in the agp file:
sed -e "s/^Chr/chr/" Btau20070913.All.ac.agp > bosTau4.agp
# fixup the contig names in the contigs.fa and qual files, utilize
# the following perl scripts:
cat << '_EOF_' > fixContigNames.pl
#!/usr/bin/env perl
use warnings;
use strict;
my $argc = scalar(@ARGV);
if (1 != $argc) {
print "usage ./fixContigNames.pl Btau20060815.contigs.fa.gz \\\n",
" | gzip > bosTau4.contigs.fa.gz\n";
exit 255;
}
my $fName = shift;
open (FH,"zcat $fName|") or die "can not read $fName $!";
while (my $line = <FH>) {
if ($line =~ m/^>/) {
my @a = split('\|', $line);
$a[3] =~ s/\.[0-9]+//;
print ">$a[3]\n";
} else {
print "$line";
}
}
close (FH)
'_EOF_'
# << happy emacs
cat << '_EOF_' > fixQuals.pl
#!/usr/bin/env perl
use warnings;
use strict;
my $argc = scalar(@ARGV);
if (1 != $argc) {
print "usage ./fixQuals.pl Btau20060815.contigs.fa.qual.gz \\\n",
" | gzip > bosTau4.qual.gz\n";
exit 255;
}
my $fName = shift;
open (FH,"zcat $fName|") or die "can not read $fName $!";
while (my $line = <FH>) {
if ($line =~ m/^>/) {
$line =~ s/\.[0-9]+.*//;
}
print "$line";
}
close (FH)
'_EOF_'
# << happy emacs
# There are extra records in the qual and contigs fa files.
# Only the sequences in the agp file should be used:
grep "^chr" bosTau4.agp | egrep -v "clone|fragment" | cut -f6 \
| sort > agp.names
# run those scripts, extract only the used records, and lift the
# qual scores via the AGP file to a qac file.
chmod +x fixContigNames.pl fixQuals.pl
./fixContigNames.pl Btau20060815.contigs.fa.gz \
| faSomeRecords stdin agp.names stdout | gzip > bosTau4.contigs.fa.gz
./fixQuals.pl Btau20060815.contigs.fa.qual.gz \
| faSomeRecords stdin agp.names stdout \
| qaToQac stdin stdout \
| qacAgpLift bosTau4.agp stdin bosTau4.qual.qac
#############################################################################
# running makeGenomeDb.pl (DONE - 2008-03-07 - Hiram)
ssh kkstore06
cd /cluster/data/bosTau4
cat << '_EOF_' > bosTau4.config.ra
db bosTau4
clade mammal
scientificName Bos Taurus
assemblyDate Oct. 2007
assemblyLabel Baylor Release 4.0
orderKey 236
dbDbSpeciesDir cow
mitoAcc 60101824
agpFiles /cluster/data/bosTau4/baylor/bosTau4.agp
fastaFiles /cluster/data/bosTau4/baylor/bosTau4.contigs.fa.gz
qualFiles /cluster/data/bosTau4/baylor/bosTau4.qual.qac
commonName Cow
'_EOF_'
# << happy emacs
makeGenomeDb.pl bosTau4.config.ra > makeGenomeDb.log 2>&1
#########################################################################
# REPEATMASKER (DONE - 2008-03-07 - Hiram)
ssh kkstore06
screen # use a screen to manage this job
mkdir /cluster/data/bosTau4/bed/repeatMasker
cd /cluster/data/bosTau4/bed/repeatMasker
#
# This was going too slow on memk, on the order of 8 days.
# chilled that memk batch, then switched it to finish on pk
doRepeatMasker.pl -buildDir=/cluster/data/bosTau4/bed/repeatMasker \
-bigClusterHub=memk bosTau4 > do.log 2>&1 &
# continuing
doRepeatMasker.pl -buildDir=/cluster/data/bosTau4/bed/repeatMasker \
-continue=cat -bigClusterHub=pk bosTau4 > cat.log 2>&1 &
time nice -n +19 featureBits bosTau4 rmsk > fb.bosTau4.rmsk.txt 2>&1 &
# 1280927549 bases of 2731830700 (46.889%) in intersection
# RepeatMasker and lib version from do.log:
# RepeatMasker,v 1.20 2008/01/16 18:15:45 angie Exp $
# Jan 11 2008 (open-3-1-9) version of RepeatMasker
# CC RELEASE 20071204;
# Compare coverage to previous assembly:
featureBits bosTau3 rmsk
# 1200525422 bases of 2731807384 (43.946%) in intersection
featureBits bosTau2 rmsk
# 1291561904 bases of 2812203870 (45.927%) in intersection
#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-03-07 - Hiram)
ssh kkstore06
screen # use a screen to manage this job
mkdir /cluster/data/bosTau4/bed/simpleRepeat
cd /cluster/data/bosTau4/bed/simpleRepeat
#
doSimpleRepeat.pl -buildDir=/cluster/data/bosTau4/bed/simpleRepeat \
bosTau4 > do.log 2>&1 &
# after RM run is done, add this mask:
cd /cluster/data/bosTau4
twoBitMask bosTau4.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTau4.2bit
twoBitToFa bosTau4.2bit stdout | faSize stdin
# 2917974530 bases (186161294 N's 2731813236 real
# 1450855409 upper 1280957827 lower) in 11900 sequences in 1 files
# %43.90 masked total, %46.89 masked real
twoBitToFa bosTau4.rmsk.2bit stdout | faSize stdin
# 2917974530 bases (186161294 N's 2731813236 real
# 1451369861 upper 1280443375 lower) in 11900 sequences in 1 files
# %43.88 masked total, %46.87 masked real
#########################################################################
# Create OOC file for genbank runs (DONE - 2008-03-10 - Hiram)
# use same repMatch value as bosTau2
ssh kkstore06
cd /cluster/data/bosTau4
blat bosTau4.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=bosTau4.11.ooc -repMatch=1005
ssh kkr1u00
mkdir /iscratch/i/bosTau4
cd /iscratch/i/bosTau4
cp -p /cluster/data/bosTau4/bosTau4.2bit .
for R in 2 3 4 5 6 7 8
do
rsync -a --progress ./ kkr${R}u00:/iscratch/i/bosTau4/
done
#########################################################################
# Starting Genbank
ssh hgwdev
cd $HOME/kent/src/hg/makeDb/genbank
# edit etc/genbank.conf to add the following entry:
# bosTau4 (B. taurus) 31 chromosomes plus 11869 scaffolds
bosTau4.serverGenome = /cluster/data/bosTau4/bosTau4.2bit
bosTau4.clusterGenome = /iscratch/i/bosTau4/bosTau4.2bit
bosTau4.ooc = /cluster/data/bosTau4/bosTau4.11.ooc
bosTau4.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter}
bosTau4.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
bosTau4.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
bosTau4.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
bosTau4.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter}
bosTau4.lift = no
bosTau4.downloadDir = bosTau4
bosTau4.perChromTables = no
bosTau4.mgc = yes
cvs ci -m "Added bosTau4." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
ssh genbank
screen # control this business with a screen since it takes a while
cd /cluster/data/genbank
# This is a call to a script that will push our jobs out to the cluster
# since it's a big job.
time nice -n +19 bin/gbAlignStep -initial bosTau4 &
# logFile: var/build/logs/2008.03.10-14:14:43.bosTau4.initalign.log
# real 567m7.431s
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad bosTau4
# logFile: var/dbload/hgwdev/logs/2008.03.11-08:48:17.dbload.log
# real 34m55.565s
# enable daily alignment and update of hgwdev (DONE - 2008-03-11 - Hiram)
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add bosTau4 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added bosTau4." etc/align.dbs etc/hgwdev.dbs
make etc-update
############################################################################
# DONE - 2008-03-11 - Hiram
# Reset default position to an area of chr3 with a number of genes
hgsql -e \
'update dbDb set defaultPos="chr3:21083267-21232422" where name="bosTau4";' \
hgcentraltest
# And there was a mistake in the description date entry
hgsql -e \
'update dbDb set description="Oct. 2007" where name="bosTau4";' \
hgcentraltest
#########################################################################
## genscan run (DONE - 2008-03-11 - Hiram)
## create hard masked sequence
ssh kkstore06
cd /cluster/data/bosTau4
mkdir hardMasked
for C in `cut -f1 chrom.sizes`
do
echo "hardMasked/${C}.hard.fa"
twoBitToFa -seq=${C} bosTau4.2bit stdout \
| maskOutFa stdin hard hardMasked/${C}.hard.fa
ls -ld "hardMasked/${C}.hard.fa"
done
# And, make sure there aren't any sequences in this lot that have
# become all N's with no sequence left in them. This drives genscan nuts
echo hardMasked/*.hard.fa | xargs faCount > faCount.hard.txt
# the lowest three are:
egrep -v "^#|^total" faCount.hard.txt \
| awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3
# chrUn.004.9957 0
# chrUn.004.9961 0
# chrUn.004.9996 0
# There are a whole bunch of these, and many with just a few bases.
# Actually, before removing these for genscan, run the cpgIsland
# business first since it can work on them all.
# So, remove any with less than 100 bases of sequence
egrep -v "^#|^total" faCount.hard.txt \
| awk '{size=$2-$7; if (size < 100){printf "hardMasked/%s.hard.fa\n", $1}}' \
| xargs rm
# now get them over to a kluster location
mkdir /san/sanvol1/scratch/bosTau4/hardChunks
cd /san/sanvol1/scratch/bosTau4/hardChunks
# creating 4,000,000 sized chunks, the chroms stay together as
# single pieces. The contigs get grouped together into 4,000,000
# sized fasta files. You don't want to break these things up
# because genscan will be doing its own internal 2.4 million
# window on these pieces, and the gene names are going to be
# constructed from the sequence name in these fasta files.
echo /cluster/data/bosTau4/hardMasked/*.hard.fa | xargs cat \
| faSplit about stdin 4000000 c_
ssh hgwdev
mkdir /cluster/data/bosTau4/bed/genscan
cd /cluster/data/bosTau4/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on small cluster (more mem than big cluster).
ssh memk
cd /cluster/data/bosTau4/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
# Since we split on gaps, we have no chunks like that. You can
# verify with faCount on the chunks.
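# a sketch of that faCount check: any chunk sequence with zero real bases
# (length in column 2 minus the N count in column 7) would print here
faCount /san/sanvol1/scratch/bosTau4/hardChunks/c_*.fa \
    | egrep -v "^#|^total" | awk '$2 - $7 == 0'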
ls -1Sr /san/sanvol1/scratch/bosTau4/hardChunks/c_*.fa > genome.list
# Create run-time script to operate gsBig in a cluster safe manner
cat << '_EOF_' > runGsBig
#!/bin/csh -fe
set runDir = `pwd`
set srcDir = $1
set inFile = $2
set fileRoot = $inFile:r
mkdir /scratch/tmp/$fileRoot
cp -p $srcDir/$inFile /scratch/tmp/$fileRoot
pushd /scratch/tmp/$fileRoot
/cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000
popd
cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf
cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep
cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt
rm -fr /scratch/tmp/$fileRoot
'_EOF_'
# << happy emacs
chmod +x runGsBig
cat << '_EOF_' > template
#LOOP
runGsBig /san/sanvol1/scratch/bosTau4/hardChunks $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 genome.list single template jobList
para create jobList
para try, check, push, check, ...
# Completed: 97 of 97 jobs
# CPU time in finished jobs: 93952s 1565.87m 26.10h 1.09d 0.003 y
# IO & Wait Time: 372s 6.19m 0.10h 0.00d 0.000 y
# Average job time: 972s 16.21m 0.27h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 4870s 81.17m 1.35h 0.06d
# Submission to last job: 7902s 131.70m 2.19h 0.09d
# cat and lift the results into single files
ssh kkstore06
cd /cluster/data/bosTau4/bed/genscan
sort -k1,1 -k4.4n gtf/c_*.gtf > genscan.gtf
sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt.bed
cat pep/c_*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/bosTau4/bed/genscan
ldHgGene bosTau4 -gtf genscan genscan.gtf
# Read 49598 transcripts in 346887 lines in 1 files
# 49598 groups 5059 seqs 1 sources 1 feature types
# 49598 gene predictions
hgPepPred bosTau4 generic genscanPep genscan.pep
hgLoadBed bosTau4 genscanSubopt genscanSubopt.bed
# Loaded 528092 elements of size 6
# let's check the numbers
time nice -n +19 featureBits bosTau4 genscan
# 58224594 bases of 2731830700 (2.131%) in intersection
time nice -n +19 featureBits bosTau3 genscan
# 59251085 bases of 2731807384 (2.169%) in intersection
#########################################################################
# CPGISLANDS (DONE - 2008-03-11 - Hiram)
ssh hgwdev
mkdir /cluster/data/bosTau4/bed/cpgIsland
cd /cluster/data/bosTau4/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
# There was a problem in here, in both cpg.c and cpg_lh.c:
# warning: conflicting types for built-in function 'malloc'
# comment out the lines where malloc is declared to get this to build
# gcc readseq.c cpg_lh.c -o cpglh.exe
cd ../..
ln -s hg3rdParty/cpgIslands/cpglh.exe .
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc. Ignore the warnings.
mkdir results
echo ../../hardMasked/*.hard.fa | sed -e "s/ /\n/g" | while read F
do
FA=${F/*\/}
C=${FA/.hard.fa/}
echo "./cpglh.exe ${FA} > results/${C}.cpg"
nice -n +19 ./cpglh.exe ${F} > results/${C}.cpg
done > cpglh.out 2>&1 &
# about 5 minutes
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << happy emacs
catDir results \
| awk -f filter.awk | sort -k1,1 -k2,2n > cpgIsland.bed
ssh hgwdev
cd /cluster/data/bosTau4/bed/cpgIsland
hgLoadBed bosTau4 cpgIslandExt -tab \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 37595 elements of size 10
featureBits bosTau4 cpgIslandExt
# 24202824 bases of 2731830700 (0.886%) in intersection
featureBits bosTau3 cpgIslandExt
# 24374280 bases of 2731807384 (0.892%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET BOSTAU4 (DONE - 2008-03-11 - Hiram)
ssh kkstore02
screen # use a screen to manage this multi-day job
mkdir /cluster/data/hg18/bed/blastzBosTau4.2008-03-11
cd /cluster/data/hg18/bed/
ln -s blastzBosTau4.2008-03-11 blastz.bosTau4
cd blastzBosTau4.2008-03-11
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau4
SEQ2_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit
SEQ2_LEN=/cluster/data/bosTau4/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzBosTau4.2008-03-11
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-syntenicNet > do.log 2>&1
# real 611m17.901s
cat fb.hg18.chainBosTau4Link.txt
# 1345348636 bases of 2881515245 (46.689%) in intersection
mkdir /cluster/data/bosTau4/bed/blastz.hg18.swap
cd /cluster/data/bosTau4/bed/blastz.hg18.swap
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/hg18/bed/blastzBosTau4.2008-03-11/DEF \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -syntenicNet > do.log 2>&1
# broken down during netSynteny.csh due to too many open files on
# a chainSplit
# To get them split, set up a job on memk to do the split
# via chainFilter: chainFilter -t=${T} bigFile.chain > ${T}.chain
# Where the targets T are from the list:
# grep "^chain " bigFile.chain | awk '{print $3}' | sort -u
# see full example below in Platypus Blastz
###########################################################################
# Blastz Platypus ornAna1 (DONE - 2008-03-11,14 - Hiram)
ssh kkstore06
screen # use screen to control this job
mkdir /cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11
cd /cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11
cat << '_EOF_' > DEF
# Cow vs. platypus
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50
# TARGET: Cow bosTau4
SEQ1_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit
SEQ1_LEN=/cluster/data/bosTau4/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=300
SEQ1_CHUNK=20000000
SEQ1_LAP=0
# QUERY: Platypus ornAna1
SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit
SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
-chainMinScore=5000 -verbose=2 \
-chainLinearGap=loose -bigClusterHub=kk > do.log 2>&1 &
# real 1233m16.397s
# the pk kluster became free as this job had a long time to go.
# Unsure how long it had to go because the batch became confused
# and couldn't calculate an ETA (it showed waiting=0 jobs).  So, chill
# out the batch, allow the running jobs to complete,
# then go to pk and finish the remaining jobs there.
# There was a difficulty with the para.results file. It lost track
# of 1158 jobs, but they were completed and results exist.
# then continuing:
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
-chainMinScore=5000 -verbose=2 -smallClusterHub=memk \
-continue=cat -chainLinearGap=loose -bigClusterHub=pk > cat.log 2>&1 &
# real 66m36.465s
cat fb.bosTau4.chainOrnAna1Link.txt
# 201799073 bases of 2731830700 (7.387%) in intersection
mkdir /cluster/data/ornAna1/bed/blastz.bosTau4.swap
cd /cluster/data/ornAna1/bed/blastz.bosTau4.swap
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11/DEF \
-chainMinScore=5000 -verbose=2 -smallClusterHub=memk \
-swap -chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1 &
# real 106m17.385s
cat fb.ornAna1.chainBosTau4Link.txt
# 187868681 bases of 1842236818 (10.198%) in intersection
# need to split up the chain file to run the reciprocal net
# there are too many chroms to work with chain split
# do it with chainFilter as a kluster job on memk
ssh memk
mkdir /scratch/tmp/ornAna1
cd /scratch/tmp/ornAna1
for R in 0 1 2 3 4 5 6 7
do
echo $R
ssh mkr0u${R} mkdir /scratch/tmp/ornAna1
ssh mkr0u${R} cp -p /cluster/data/bosTau4/bed/blastz.ornAna1/axtChain/bosTau4.ornAna1.all.chain.gz /scratch/tmp/ornAna1
ssh mkr0u${R} gunzip /scratch/tmp/ornAna1/bosTau4.ornAna1.all.chain.gz
done
zcat /cluster/data/bosTau4/bed/blastz.ornAna1/axtChain/bosTau4.ornAna1.all.chain.gz | grep "^chain " | awk '{print $3}' | sort -u > chain.list
cat << '_EOF_' > splitOne.csh
#!/bin/csh -fe
set tmpDir = /scratch/tmp/ornAna1
set T=$1
chainFilter -t=${T} $tmpDir/bosTau4.ornAna1.all.chain > $tmpDir/${T}.chain
'_EOF_'
# << happy emacs
cat << '_EOF_' > template
#LOOP
./splitOne.csh $(file1)
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 chain.list single template jobList
para create jobList
para try ... check ... push ... time ...
# fetch the results from tmp directories
for R in 0 1 2 3 4 5 6 7
do
ssh mkr0u${R} "(cd /scratch/tmp/ornAna1; cp -p -u chr*.chain /cluster/data/bosTau4/bed/blastz.ornAna1/axtChain/chain)"
done
############################################################################
# BLATSERVERS ENTRY (DONE - 2008-03-12 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("bosTau4", "blat2", "17778", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("bosTau4", "blat2", "17779", "0", "1");' \
hgcentraltest
# test it with some sequence
#########################################################################
## 5-Way Multiz (DONE - 2008-03-18 - Hiram)
##
# the all.chain.gz files were split up via kluster jobs on memk
# in order to get mafSynNet files. Example above in ornAna1 blastz
ssh hgwdev
mkdir /cluster/data/bosTau4/bed/multiz5way
cd /cluster/data/bosTau4/bed/multiz5way
# take the 30-way tree from mm9 and eliminate genomes not in
# this alignment
# rearrange to get bosTau4 on the top of the graph
# paste this tree into the on-line phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to create the image for the tree diagram
# select the 5 organisms from the 30-way recently done on mouse mm9
/cluster/bin/phast/tree_doctor \
--prune-all-but Human_hg18,Mouse_mm9,Dog_canFam2,Platypus_ornAna1,Cow_bosTau3 \
/cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \
| sed -e "s/bosTau3/bosTau4/g" > 5-way.fullNames.nh
# looks something like this:
(((Mouse_mm9:0.325818,Human_hg18:0.126901):0.019763,
(Dog_canFam2:0.156637,Cow_bosTau4:0.163945):0.031326):0.332197,
Platypus_ornAna1:0.488110);
# rearrange to get Cow at the top:
# this leaves us with:
cat << '_EOF_' > bosTau4.5-way.nh
(((Cow_bosTau4:0.163945,Dog_canFam2:0.156637):0.031326,
(Human_hg18:0.126901,Mouse_mm9:0.325818):0.019763):0.332197,
Platypus_ornAna1:0.488110);
'_EOF_'
# << happy emacs
# create a species list from that file:
sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' bosTau4.5-way.nh \
| sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
| sed -e "s/.*_//; s/:.*//" | sort > species.list
# verify that has 5 db names in it
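# quick check, should print 5 and list bosTau4 canFam2 hg18 mm9 ornAna1:
wc -l species.list
cat species.list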
# create a stripped down nh file for use in autoMZ run
echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' bosTau4.5-way.nh \
| sed -e "s/ / /g"` > tree.5.nh
# that looks like, as a single line:
# (((bosTau4 canFam2) (hg18 mm9)) ornAna1)
# verify all blastz's exists
cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/bosTau4/bed/multiz5way
foreach db (`grep -v bosTau4 species.list`)
set bdir = /cluster/data/bosTau4/bed/blastz.$db
if (-e $bdir/mafRBestNet/chr1.maf.gz) then
echo "$db mafRBestNet"
else if (-e $bdir/mafSynNet/chr1.maf.gz) then
echo "$db mafSynNet"
else if (-e $bdir/mafNet/chr1.maf.gz) then
echo "$db mafNet"
else
echo "$db mafs not found"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./listMafs.csh
# see what it says:
./listMafs.csh
# canFam2 mafSynNet
# hg18 mafSynNet
# mm9 mafSynNet
# ornAna1 mafSynNet
/cluster/bin/phast/all_dists bosTau4.5-way.nh > 5way.distances.txt
grep -i bostau 5way.distances.txt | sort -k3,3n
# Cow_bosTau4 Dog_canFam2 0.320582
# Cow_bosTau4 Human_hg18 0.341935
# Cow_bosTau4 Mouse_mm9 0.540852
# Cow_bosTau4 Platypus_ornAna1 1.015578
# use the calculated
# distances in the table below to order the organisms and check
# the button order on the browser.
# And if you can fill in the table below entirely, you have
# succeeded in finishing all the alignments required.
#
# featureBits chainLink measures
# chainCalJac1Link chain linearGap
#                       chainBosTau4Link   chain    linearGap
#       distance        on bosTau4      on other        minScore
# 2 0.341935 Human_hg18 (% 74.810) (% 77.648) 3000 medium
# 3 0.540852 Mouse_mm9 (% 76.925) (% 74.694) 3000 medium
# 4 1.015578 Platypus_ornAna1 (% 77.296) (% 76.308) 3000 medium
# create a coherent set of all the mafs involved in this run
mkdir mafLinks
cd mafLinks
ln -s ../../blastz.canFam2/mafSynNet ./canFam2
ln -s ../../blastz.hg18/mafSynNet ./hg18
ln -s ../../blastz.mm9/mafSynNet ./mm9
ln -s ../../blastz.ornAna1/mafSynNet ./ornAna1
# check data size:
du -hscL *
# 982M canFam2
# 940M hg18
# 483M mm9
# 79M ornAna1
# 2.5G total
# copy net mafs to cluster-friendly storage
ssh kkstore06
cd /cluster/data/bosTau4/bed/multiz5way/mafLinks
mkdir -p /san/sanvol1/scratch/bosTau4/multiz5way/mafs
rsync -a --progress --copy-links ./ \
/san/sanvol1/scratch/bosTau4/multiz5way/mafs/
# create a run-time list of contigs to operate on, not all contigs
# exist in all alignments, but we want all contig names used in any
# alignment:
cd /san/sanvol1/scratch/bosTau4/multiz5way/mafs
for D in *
do
cd "${D}"
find . -type f
cd ..
done | sed -e "s#^./##; s#.maf##" | sort -u > /tmp/5-way.contig.list
wc -l /tmp/5-way.contig.list
# 2321 /tmp/5-way.contig.list
# ready for the multiz run
ssh pk
mkdir /cluster/data/bosTau4/bed/multiz5way/splitRun
cd /cluster/data/bosTau4/bed/multiz5way/splitRun
scp -p kkstore06:/tmp/5-way.contig.list .
mkdir -p maf run
cd run
mkdir penn
# use latest penn utilities
P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
cp -p $P/{autoMZ,multiz,maf_project} penn
# set the db and pairs directories here
cat << '_EOF_' > autoMultiz.csh
#!/bin/csh -ef
set db = bosTau4
set c = $1
set result = $2
set resultDir = $result:h
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz5way/mafs
rm -fr $tmp
mkdir -p $tmp
mkdir -p $resultDir
cp ../../tree.5.nh ../../species.list $tmp
pushd $tmp
foreach s (`grep -v $db species.list`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.5.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $result
rm -fr $tmp
rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/bosTau4/bed/multiz5way/splitRun/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << emacs
gensub2 ../5-way.contig.list single template jobList
para create jobList
para try ... check ... push ... etc
# Completed: 2321 of 2321 jobs
# CPU time in finished jobs: 35680s 594.67m 9.91h 0.41d 0.001 y
# IO & Wait Time: 9398s 156.63m 2.61h 0.11d 0.000 y
# Average job time: 19s 0.32m 0.01h 0.00d
# Longest finished job: 1982s 33.03m 0.55h 0.02d
# Submission to last job: 3983s 66.38m 1.11h 0.05d
# put the split maf results back together into a single maf file
# eliminate duplicate comments
ssh kkstore06
cd /cluster/data/bosTau4/bed/multiz5way
grep "^##maf version" splitRun/maf/chr1.maf \
| sort -u > bosTau4.5way.maf
for F in `find ./splitRun/maf -type f -depth`
do
grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \
| sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g"
done | sort -u >> bosTau4.5way.maf
for F in `find ./splitRun/maf -type f -depth`
do
grep -v -h "^#" "${F}"
done >> bosTau4.5way.maf
grep "^##eof maf" splitRun/maf/chr1.maf \
| sort -u >> bosTau4.5way.maf
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/bosTau4/multiz5way/maf
ln -s /cluster/data/bosTau4/bed/multiz5way/*.maf \
/gbdb/bosTau4/multiz5way/maf/multiz5way.maf
# this generates an immense multiz5way.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /scratch/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/bosTau4/multiz5way/maf bosTau4 multiz5way
# real 3m31.593s
# Loaded 5392296 mafs in 1 files from /gbdb/bosTau4/multiz5way/maf
# load summary table
time nice -n +19 cat /gbdb/bosTau4/multiz5way/maf/*.maf \
| hgLoadMafSummary bosTau4 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz5waySummary stdin
# real 3m41.663s
# Created 779088 summary blocks from 10697864 components and 5164426 mafs from stdin
# Gap Annotation
# prepare bed files with gap info
ssh kkstore06
mkdir /cluster/data/bosTau4/bed/multiz5way/anno
cd /cluster/data/bosTau4/bed/multiz5way/anno
mkdir maf run
# these actually already all exist from previous multiple alignments
# remove the echo in front of the twoBitInfo to actually make it work
for DB in `cat ../species.list`
do
CDIR="/cluster/data/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
cd run
rm -f nBeds sizes
for DB in `grep -v bosTau4 ../../species.list`
do
echo "${DB} "
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
ssh memk
# temporarily copy the bosTau4.5way.maf file onto the memk
# nodes /scratch/data/bosTau4/maf/ directory
for R in 0 1 2 3 4 5 6 7
do
ssh mkr0u${R} rsync -a --progress \
/cluster/data/bosTau4/bed/multiz5way/bosTau4.5way.maf \
/scratch/data/bosTau4/maf/
done
mkdir /cluster/data/bosTau4/bed/multiz5way/anno/splitMaf
# need to split up the single maf file into individual
# per-scaffold maf files to run annotation on
cd /cluster/data/bosTau4/bed/multiz5way/anno/splitMaf
# create bed files to list approximately 1553 scaffolds in
# a single list, approximately 8 lists
cat << '_EOF_' > mkBedLists.pl
#!/usr/bin/env perl
use strict;
use warnings;
my $bedCount = 0;
my $i = 0;
my $bedFile = sprintf("file_%d.bed", $bedCount);
open (BF,">$bedFile") or die "can not write to $bedFile $!";
open (FH,"</cluster/data/bosTau4/chrom.sizes") or
die "can not read /cluster/data/bosTau4/chrom.sizes $!";
while (my $line = <FH>) {
chomp $line;
if ( (($i + 1) % 1553) == 0 ) {
printf "%s\n", $line;
close (BF);
++$bedCount;
$bedFile = sprintf("file_%d.bed", $bedCount);
open (BF,">$bedFile") or die "can not write to $bedFile $!";
}
++$i;
my ($chr, $size) = split('\s+',$line);
printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr;
}
close (FH);
close (BF);
'_EOF_'
# << happy emacs
chmod +x mkBedLists.pl
./mkBedLists.pl
# now, run a mafsInRegion on each one of those lists
cat << '_EOF_' > runOne
#!/bin/csh -fe
set runDir = "/cluster/data/bosTau4/bed/multiz5way/anno/splitMaf"
set resultDir = $1
set bedFile = $resultDir.bed
mkdir -p $resultDir
mkdir -p /scratch/tmp/bosTau4/$resultDir
pushd /scratch/tmp/bosTau4/$resultDir
mafsInRegion $runDir/$bedFile -outDir . \
/scratch/data/bosTau4/maf/bosTau4.5way.maf
popd
rsync -q -a /scratch/tmp/bosTau4/$resultDir/ ./$resultDir/
rm -fr /scratch/tmp/bosTau4/$resultDir
rmdir --ignore-fail-on-non-empty /scratch/tmp/bosTau4
'_EOF_'
# << happy emacs
chmod +x runOne
cat << '_EOF_' > template
#LOOP
./runOne $(root1)
#ENDLOOP
'_EOF_'
# << happy emacs
ls file*.bed > runList
gensub2 runList single template jobList
para create jobList
para try ... check ... push ... etc
# Completed: 8 of 8 jobs
# CPU time in finished jobs: 1137s 18.96m 0.32h 0.01d 0.000 y
# IO & Wait Time: 552s 9.19m 0.15h 0.01d 0.000 y
# Average job time: 211s 3.52m 0.06h 0.00d
# Longest finished job: 599s 9.98m 0.17h 0.01d
# Submission to last job: 599s 9.98m 0.17h 0.01d
cd /cluster/data/bosTau4/bed/multiz5way/anno/run
cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set outDir = ../maf/$2
set result = $3
set input = $1
mkdir -p $outDir
cat $input | \
nice mafAddIRows -nBeds=nBeds stdin /scratch/data/bosTau4/bosTau4.2bit $result
'_EOF_'
# << happy emacs
chmod +x doAnno.csh
cat << '_EOF_' > template
#LOOP
./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
find ../splitMaf -type f -name "*.maf" > maf.list
gensub2 maf.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 2320 of 2320 jobs
# CPU time in finished jobs: 8405s 140.09m 2.33h 0.10d 0.000 y
# IO & Wait Time: 6934s 115.56m 1.93h 0.08d 0.000 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest finished job: 369s 6.15m 0.10h 0.00d
# Submission to last job: 834s 13.90m 0.23h 0.01d
# put the results back together into a single file
ssh kkstore06
cd /cluster/data/bosTau4/bed/multiz5way/anno
grep "^##maf version" maf/file_0/chr1.maf \
| sort -u > bosTau4.anno.5way.maf
find ./maf -type f -depth -name "*.maf" | while read F
do
grep -v -h "^#" "${F}"
done >> bosTau4.anno.5way.maf
echo "##eof maf" >> bosTau4.anno.5way.maf
ssh hgwdev
cd /cluster/data/bosTau4/bed/multiz5way/anno
mkdir -p /gbdb/bosTau4/multiz5way/anno
ln -s `pwd`/bosTau4.anno.5way.maf \
/gbdb/bosTau4/multiz5way/anno/multiz5way.maf
# by loading this into the table multiz5way, it will replace the
# previously loaded table with the unannotated mafs
# huge temp files are made, do them on local disk
cd /scratch/tmp
time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/bosTau4/multiz5way/anno \
bosTau4 multiz5way
# Loaded 6711605 mafs in 1 files from /gbdb/bosTau4/multiz5way/anno
# real 4m54.025s
# normally filter this for chrom size > 1,000,000 and only load
# those chroms. But this is a scaffold assembly, load everything:
time nice -n +19 hgLoadMafSummary bosTau4 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz5waySummary \
/gbdb/bosTau4/multiz5way/anno/multiz5way.maf
# Created 779088 summary blocks from 10697864 components
# and 6412983 mafs from /gbdb/bosTau4/multiz5way/anno/multiz5way.maf
# real 4m6.782s
# by loading this into the table multiz5waySummary, it will replace
# the previously loaded table with the unannotated mafs
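# for reference, a sketch of the chrom-size filter mentioned above (not
# applied here): keep only sequences over 1,000,000 bases and feed just
# their mafs to hgLoadMafSummary
awk '$2 > 1000000 {print $1}' /cluster/data/bosTau4/chrom.sizes \
    > /tmp/chroms.over1M.list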
# remove the multiz5way*.tab files in this /scratch/tmp directory
rm multiz5way*.tab
# And, you can remove the previously loaded non-annotated maf file link:
rm /gbdb/bosTau4/multiz5way/maf/multiz5way.maf
rmdir /gbdb/bosTau4/multiz5way/maf
###########################################################################
## Annotate 5-way multiple alignment with gene annotations
## (DONE - 2008-03-18 - Hiram)
# Gene frames
## given previous survey done for 9-way alignment on Marmoset
## and the appearance of new ensGene tables on everything
# use knownGene for hg18, mm9
# use ensGene for canFam2, ornAna1
# and refGene for bosTau4
ssh hgwdev
mkdir /cluster/data/bosTau4/bed/multiz5way/frames
cd /cluster/data/bosTau4/bed/multiz5way/frames
mkdir genes
# knownGene
for DB in hg18 mm9
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# ensGene
for DB in canFam2 ornAna1
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# refGene
for DB in bosTau4
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from refGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
ls -og genes
# -rw-rw-r-- 1 861210 Mar 18 15:40 bosTau4.gp.gz
# -rw-rw-r-- 1 1865308 Mar 18 15:40 canFam2.gp.gz
# -rw-rw-r-- 1 2008806 Mar 18 15:39 hg18.gp.gz
# -rw-rw-r-- 1 1965274 Mar 18 15:39 mm9.gp.gz
# -rw-rw-r-- 1 1347532 Mar 18 15:40 ornAna1.gp.gz
ssh kkstore06
cd /cluster/data/bosTau4/bed/multiz5way/frames
# anything to annotate is in a pair, e.g.: bosTau4 genes/bosTau4.gp.gz
time (cat ../anno/bosTau4.anno.5way.maf | nice -n +19 genePredToMafFrames bosTau4 stdin stdout bosTau4 genes/bosTau4.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz canFam2 genes/canFam2.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz5way.mafFrames.gz) > frames.log 2>&1
# see what it looks like in terms of number of annotations per DB:
zcat multiz5way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
# 77526 bosTau4
# 110768 ornAna1
# 221524 mm9
# 230010 hg18
# 243396 canFam2
# load the resulting file
ssh hgwdev
cd /cluster/data/bosTau4/bed/multiz5way/frames
time nice -n +19 hgLoadMafFrames bosTau4 multiz5wayFrames \
multiz5way.mafFrames.gz
# real 0m21.968s
# enable the trackDb entries:
# frames multiz5wayFrames
# irows on
#############################################################################
# phastCons 5-way (DONE - 2008-03-19 - Hiram)
# split 5way mafs into 10M chunks and generate sufficient statistics
# files for # phastCons
ssh memk
mkdir /cluster/data/bosTau4/bed/multiz5way/msa.split
cd /cluster/data/bosTau4/bed/multiz5way/msa.split
mkdir -p /san/sanvol1/scratch/bosTau4/multiz5way/cons/ss
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/bosTau4/bed/multiz5way/anno/maf
set WINDOWS = /san/sanvol1/scratch/bosTau4/multiz5way/cons/ss
pushd $WINDOWS
set resultDir = $1
set c = $2
rm -fr $resultDir/$c
mkdir -p $resultDir
twoBitToFa -seq=$c /scratch/data/bosTau4/bosTau4.2bit /scratch/tmp/bosTau4.$c.fa
# need to truncate odd-ball scaffold/chrom names that include dots
# as phastCons utils can't handle them
set TMP = /scratch/tmp/$c.clean.maf.$$
perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$resultDir/$c.maf > $TMP
/cluster/bin/phast/$MACHTYPE/msa_split $TMP -i MAF \
-M /scratch/tmp/bosTau4.$c.fa \
-o SS -r $resultDir/$c -w 10000000,0 -I 1000 -B 5000
rm -f /scratch/tmp/bosTau4.$c.fa
rm -f $TMP
popd
mkdir -p $resultDir
date > $resultDir/$c.out
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
doSplit.csh $(dir1) $(root1) {check out line+ $(dir1)/$(root1).out}
#ENDLOOP
'_EOF_'
# << happy emacs
# create list of maf files:
(cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list
gensub2 maf.list single template jobList
para create jobList
para try ... check ... etc
# Completed: 2320 of 2320 jobs
# CPU time in finished jobs: 1710s 28.50m 0.47h 0.02d 0.000 y
# IO & Wait Time: 6951s 115.85m 1.93h 0.08d 0.000 y
# Average job time: 4s 0.06m 0.00h 0.00d
# Longest finished job: 128s 2.13m 0.04h 0.00d
# Submission to last job: 1048s 17.47m 0.29h 0.01d
# take the cons and noncons trees from the mouse 30-way
# Estimates are not easy to make, probably more correctly,
# take the 30-way .mod file, and re-use it here.
ssh hgwdev
cd /cluster/data/bosTau4/bed/multiz5way
cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod .
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
ssh memk
mkdir -p /cluster/data/bosTau4/bed/multiz5way/cons/run.cons
cd /cluster/data/bosTau4/bed/multiz5way/cons/run.cons
# there are going to be several different phastCons runs using
# this same script. They trigger off of the current working directory
# $cwd:t which is the "grp" in this script. It is one of:
# all gliers placentals
# Well, that's what it was when used in the Mm9 30-way,
# in this instance, there is only the directory "all"
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast/bin
set subDir = $1
set f = $2
set c = $2:r
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/bosTau4/bed/multiz5way/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/bosTau4/multiz5way/cons
if (-s $cons/$grp/$grp.non-inf) then
cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
else
cp -p $cons/$grp/$grp.mod $tmp
cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir
sleep 4
touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir
rm -f $san/$grp/pp/$subDir/$f.pp
rm -f $san/$grp/bed/$subDir/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$subDir
mv $tmp/$f.bed $san/$grp/bed/$subDir
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod a+x doPhast.csh
# Create parasol batch and run it
pushd /san/sanvol1/scratch/bosTau4/multiz5way/cons
find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \
> /cluster/data/bosTau4/bed/multiz5way/cons/ss.list
popd
# run for all species
cd ..
mkdir -p all run.cons/all
cd all
/cluster/bin/phast.cz/tree_doctor ../../mm9.30way.mod \
--prune-all-but=bosTau3,hg18,mm9,canFam2,ornAna1 \
| sed -e "s/bosTau3/bosTau4/" > all.mod
cd ../run.cons/all
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create template file for "all" run
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(lastDir1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/bosTau4/multiz5way/cons/all/pp/$(lastDir1)/$(file1).pp}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../../ss.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 2569 of 2569 jobs
# CPU time in finished jobs: 8636s 143.93m 2.40h 0.10d 0.000 y
# IO & Wait Time: 17371s 289.52m 4.83h 0.20d 0.001 y
# Average job time: 10s 0.17m 0.00h 0.00d
# Longest finished job: 44s 0.73m 0.01h 0.00d
# Submission to last job: 1008s 16.80m 0.28h 0.01d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/bosTau4/multiz5way/cons/all
find ./bed -type f -name "chr*.bed" | xargs cat \
| sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 3 minutes
cp -p mostConserved.bed /cluster/data/bosTau4/bed/multiz5way/cons/all
# load into database
ssh hgwdev
cd /cluster/data/bosTau4/bed/multiz5way/cons/all
time nice -n +19 hgLoadBed bosTau4 phastConsElements5way mostConserved.bed
# Loaded 1005876 elements of size 5
# Try for 5% overall cov, and 70% CDS cov
# We don't have any gene tracks to compare CDS coverage
# --rho .31 --expected-length 45 --target-coverage .3
featureBits bosTau4 phastConsElements5way
# 132010504 bases of 2731830700 (4.832%) in intersection
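# the 70% CDS coverage check mentioned above would look like this once a
# gene track (e.g. refGene from the genbank run) is loaded (a sketch):
featureBits bosTau4 refGene:cds phastConsElements5way -enrichment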
# Create merged posterior probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/bosTau4/multiz5way/cons/all
mkdir -p phastCons5wayScores
for D in `ls -1d pp/file* | sort -t_ -k2n`
do
TOP=`pwd`
F=${D/pp\/}
out=${TOP}/phastCons5wayScores/${F}.data.gz
echo "${D} > ${F}.data.gz"
cd ${D}
find . -name "*.pp" -type f \
| sed -e "s#^./##; s/chrUn.004./chrUn_004_/; s/-/.-./" \
| sort -t '.' -k1,1 -k3.3n \
| sed -e "s/.-./-/; s/chrUn_004_/chrUn.004./" | xargs cat \
| gzip > ${out}
cd "${TOP}"
done
# copy those files to the downloads area:
# /cluster/data/bosTau4/bed/multiz5way/downloads/phastCons5way/phastConsScores
# for hgdownload downloads
cd /san/sanvol1/scratch/bosTau4/multiz5way/cons/all
ls -1 phastCons5wayScores/*.data.gz | sort -t_ -k2n | xargs zcat \
| wigEncode -noOverlap stdin phastCons5way.wig phastCons5way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
time nice -n +19 cp -p *.wi? /cluster/data/bosTau4/bed/multiz5way/cons/all
# real 0m40.875s
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/bosTau4/bed/multiz5way/cons/all
ln -s `pwd`/phastCons5way.wib /gbdb/bosTau4/multiz5way/phastCons5way.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/bosTau4/multiz5way bosTau4 \
phastCons5way phastCons5way.wig
# real 1m5.667s
# remove garbage
rm wiggle.tab
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/bosTau4/bed/multiz5way/cons/all
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=bosTau4 phastCons5way > histogram.data 2>&1
# real 3m37.316s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Cow BosTau4 Histogram phastCons5way track"
set xlabel " phastCons5way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
# These trackDb entries turn on the wiggle phastCons data track:
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# wiggle phastCons5way
# spanList 1
# autoScale Off
# windowingFunction mean
# pairwiseHeight 12
# yLineOnOff Off
#############################################################################
# Downloads (DONE - 2008-01-11 - Hiram)
# Let's see if the downloads will work
ssh hgwdev
cd /cluster/data/bosTau4
# expecting to find repeat masker .out file here:
ln -s bed/repeatMasker/bosTau4.fa.out .
time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \
-workhorse=hgwdev bosTau4 > jkStuff/downloads.log 2>&1
# real 24m3.210s
# failed making upstream sequences:
# featureBits bosTau4 mgcGenes:upstream:1000 -fa=stdout
# setpriority: Permission denied.
# the 'nice' from my bash shell causes trouble inside the csh
# script which uses nice. Finish off the install step manually
# with the mgcGenes upstreams ...
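# a sketch of finishing that install step by hand, assuming the standard
# makeDownloads.pl layout under goldenPath/bosTau4/bigZips:
cd /usr/local/apache/htdocs/goldenPath/bosTau4/bigZips
for S in 1000 2000 5000
do
    featureBits bosTau4 mgcGenes:upstream:${S} -fa=stdout \
        | gzip -c > upstream${S}.fa.gz
done
# then refresh the upstream* entries in md5sum.txt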
#############################################################################
# PushQ entries (DONE - 2008-01-11 - Hiram)
ssh hgwdev
cd /cluster/data/bosTau4
/cluster/bin/scripts/makePushQSql.pl bosTau4 > jkStuff/pushQ.sql
# output warnings:
# bosTau4 does not have seq
# bosTau4 does not have gbMiscDiff
# Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
# and genbank tables) which tracks to assign these tables to:
# genscanPep
#############################################################################
# create download files (DONE - 2008-03-19 - Hiram)
ssh hgwdev
cd /cluster/data/bosTau4
ln -s /cluster/data/bosTau4/bed/repeatMasker/bosTau4.fa.out .
makeDownloads.pl bosTau4 > makeDownloads.log 2>&1
# *EDIT* the README files and ensure they are correct
#############################################################################
# PushQ entries (DONE - 2008-03-19 - Hiram)
ssh hgwdev
cd /cluster/data/bosTau4
/cluster/bin/scripts/makePushQSql.pl bosTau4 > jkStuff/pushQ.sql
# output warnings:
# hgwdev does not have /usr/local/apache/htdocs/goldenPath/bosTau4/liftOver/bosTau4ToBosTau*
# bosTau4 does not have seq
# Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
# and genbank tables) which tracks to assign these tables to:
# genscanPep
# looks like there should be a bosTau3 to bosTau4 liftOver run
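# a sketch of that liftOver run (options assumed, check the script usage):
ssh hgwdev
cd /cluster/data/bosTau3
time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
    -bigClusterHub=pk bosTau3 bosTau4 > doLiftOverToBosTau4.log 2>&1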
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-03-28)
ssh kkstore06
# bash if not using bash shell already
mkdir /cluster/data/bosTau4/blastDb
cd /cluster/data/bosTau4
grep -v chrUn chrom.sizes | awk '{print $1}' > chr.lst
for i in `cat chr.lst`;
do twoBitToFa bosTau4.unmasked.2bit -seq=$i stdout;
done > temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
grep chrUn chrom.sizes | awk '{print $1}' > chr.lst
for i in `cat chr.lst`;
do twoBitToFa bosTau4.unmasked.2bit -seq=$i stdout;
done > temp.fa
faSplit sequence temp.fa 150 blastDb/y
rm temp.fa chr.lst
cd blastDb
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
ls *.nsq | wc -l
# 3440
mkdir -p /san/sanvol1/scratch/bosTau4/blastDb
cd /cluster/data/bosTau4/blastDb
for i in nhr nin nsq;
do
echo $i
cp *.$i /san/sanvol1/scratch/bosTau4/blastDb
done
mkdir -p /cluster/data/bosTau4/bed/tblastn.hg18KG
cd /cluster/data/bosTau4/bed/tblastn.hg18KG
echo /san/sanvol1/scratch/bosTau4/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 3440 query.lst
# we want around 350000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(350000/`wc query.lst | awk '{print $1}'`\)
# 36727/(350000/3440) = 360.973943
mkdir -p /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/kgfa
split -l 361 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/bosTau4/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/bosTau4/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
exit
ssh pk
cd /cluster/data/bosTau4/bed/tblastn.hg18KG
para create blastSpec
# para try, check, push, check etc.
para time
Completed: 350880 of 350880 jobs
CPU time in finished jobs: 27082816s 451380.27m 7523.00h 313.46d 0.859 y
IO & Wait Time: 2334990s 38916.50m 648.61h 27.03d 0.074 y
Average job time: 84s 1.40m 0.02h 0.00d
Longest finished job: 578s 9.63m 0.16h 0.01d
Submission to last job: 96125s 1602.08m 26.70h 1.11d
ssh kkstore06
cd /cluster/data/bosTau4/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh pk
cd /cluster/data/bosTau4/bed/tblastn.hg18KG/chainRun
para create chainSpec
para maxNode 30
para try, check, push, check etc.
# Completed: 99 of 102 jobs
# Crashed: 3 jobs
# CPU time in finished jobs: 113248s 1887.47m 31.46h 1.31d 0.004 y
# IO & Wait Time: 86043s 1434.04m 23.90h 1.00d 0.003 y
# Average job time: 2013s 33.55m 0.56h 0.02d
# Longest finished job: 6139s 102.32m 1.71h 0.07d
# Submission to last job: 10416s 173.60m 2.89h 0.12d
# ran three crashed jobs on kolossus
ssh kkstore06
cd /cluster/data/bosTau4/bed/tblastn.hg18KG/blastOut
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/bosTau4/bed/tblastn.hg18KG/blastHg18KG.psl
cd ..
pslCheck blastHg18KG.psl
# load table
ssh hgwdev
cd /cluster/data/bosTau4/bed/tblastn.hg18KG
hgLoadPsl bosTau4 blastHg18KG.psl
# check coverage
featureBits bosTau4 blastHg18KG
# 40254923 bases of 2731830700 (1.474%) in intersection
featureBits bosTau4 refGene:cds blastHg18KG -enrichment
# refGene:cds 0.429%, blastHg18KG 1.474%, both 0.379%, cover 88.39%, enrich 59.98x
ssh kkstore06
rm -rf /cluster/data/bosTau4/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut
#end tblastn
##########################################################################
# Create 5-way downloads (DONE - 2008-03-28 - Hiram)
ssh hgwdev
mkdir -p /cluster/data/bosTau4/bed/multiz5way/downloads/phastCons5way
cd /cluster/data/bosTau4/bed/multiz5way/downloads/phastCons5way
cp -p \
/san/sanvol1/scratch/bosTau4/multiz5way/cons/all/phastCons5wayScores/* .
ln -s ../../cons/all/all.mod ./5way.mod
cp /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way/README.txt .
# edit that README.txt to be correct for this 5-way alignment
cd ..
mkdir multiz5way
cd multiz5way
cp -p /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way/README.txt .
# edit that README.txt to be correct for this 5-way alignment
ssh kkstore06
cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way
ln -s ../../bosTau4.5-way.nh 5way.nh
time gzip -c ../../anno/bosTau4.anno.5way.maf > bosTau4.5way.maf.gz
# real 34m59.295s
ssh hgwdev
cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way
# creating upstream files from refGene, bash script:
cat << '_EOF_' > mkUpstream.sh
#!/bin/bash
DB=bosTau4
GENE=refGene
NWAY=multiz5way
export DB GENE
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
stdin stdout \
-orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \
| gzip -c > upstream${S}.maf.gz
echo "done upstream${S}.maf.gz"
done
'_EOF_'
# << happy emacs
chmod +x ./mkUpstream.sh
time nice -n +19 ./mkUpstream.sh
-rw-rw-r-- 1 9883443 Mar 28 13:02 upstream1000.maf.gz
-rw-rw-r-- 1 17938570 Mar 28 13:06 upstream2000.maf.gz
-rw-rw-r-- 1 40384656 Mar 28 13:10 upstream5000.maf.gz
#
# check the names in these upstream files to ensure sanity:
zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \
| sort | uniq -c | sort -rn | less
# should be a list of the other 4 species with a high count,
# then refGene names, e.g.:
# 8806 ornAna1
# 8806 mm9
# 8806 hg18
# 8806 canFam2
# 7 NM_001077006
# 3 NM_001113231
# 3 NM_001105381
# 3 NM_001102527
# 3 NM_001102322
# ...
ssh kkstore06
cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way
md5sum *.maf.gz > md5sum.txt
cd ../phastCons5way
md5sum *.data.gz *.mod > md5sum.txt
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/bosTau4/multiz5way
mkdir /usr/local/apache/htdocs/goldenPath/bosTau4/phastCons5way
cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way
ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/bosTau4/multiz5way
cd ../phastCons5way
ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/bosTau4/phastCons5way
# if your ln -s `pwd`/* made extra links to files you don't want there,
# check the goldenPath locations and remove those extra links
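# for example, the mkUpstream.sh helper script from this directory gets
# linked along with the data files; drop links like that:
rm -f /usr/local/apache/htdocs/goldenPath/bosTau4/multiz5way/mkUpstream.sh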
#############################################################################
# N-SCAN gene predictions (nscanGene) - (2008-04-21 markd)
# obtained NSCAN predictions from michael brent's group
# at WUSTL
cd /cluster/data/bosTau4/bed/nscan/
wget http://mblab.wustl.edu/predictions/Cow/bosTau4/bosTau4.gtf
wget http://mblab.wustl.edu/predictions/Cow/bosTau4/bosTau4.prot.fa
wget http://mblab.wustl.edu/predictions/Cow/bosTau4/readme.html
bzip2 bosTau4.*
chmod a-w *
# load track
gtfToGenePred -genePredExt bosTau4.gtf.bz2 stdout | hgLoadGenePred -bin -genePredExt bosTau4 nscanGene stdin
hgPepPred bosTau4 generic nscanPep bosTau4.prot.fa.bz2
rm *.tab
# update trackDb; need a bosTau4-specific page to describe informants
cow/bosTau4/nscanGene.html (copy from readme.html)
cow/bosTau4/trackDb.ra
# set search regex to
termRegex chr[0-9a-zA-Z_]+\.[0-9]+\.[0-9]+(\.[0-9]+)*
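# a sketch of the sort of trackDb.ra stanza and search spec this needs
# (labels, color and grouping here are placeholders):
track nscanGene
shortLabel N-SCAN
longLabel N-SCAN Gene Predictions
group genes
visibility pack
color 34,139,34
type genePred nscanPep

searchTable nscanGene
searchType genePred
termRegex chr[0-9a-zA-Z_]+\.[0-9]+\.[0-9]+(\.[0-9]+)*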
#############################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
# SELF BLASTZ (DONE - 2008-06-27 - Hiram)
# do not want the noise from the contigs
ssh kkstore06
screen # use a screen to manage this multi-day job
mkdir /cluster/data/bosTau4/noContigs
cd /cluster/data/bosTau4/noContigs
cut -f1 ../chrom.sizes | grep -v chrUn | while read C
do
echo "twoBitToFa -seq=${C} ../bosTau4.2bit ${C}.fa"
twoBitToFa -seq=${C} ../bosTau4.2bit ${C}.fa
done
faToTwoBit chr*.fa bosTau4.noContigs.2bit
twoBitToFa bosTau4.noContigs.2bit stdout | faSize stdin
# 2634429662 bases (167456864 N's 2466972798 real
# 1326108891 upper 1140863907 lower) in 31 sequences in 1 files
# %43.31 masked total, %46.25 masked real
grep -v chrUn ../chrom.sizes | ave -col=2 stdin
# count 31
# total 2634429662.000000
twoBitInfo bosTau4.noContigs.2bit stdout | sort -k2nr \
> bosTau4.noContigs.chrom.sizes
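# extra check: the noContigs set should be exactly the non-chrUn chroms
# from the full assembly; no output expected from this comparison
comm -3 <(cut -f1 bosTau4.noContigs.chrom.sizes | sort) \
    <(grep -v chrUn ../chrom.sizes | cut -f1 | sort)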
cp -p bosTau4.noContigs.2bit /cluster/bluearc/scratch/data/bosTau4
cp -p bosTau4.noContigs.chrom.sizes /cluster/bluearc/scratch/data/bosTau4
mkdir /cluster/data/bosTau4/bed/blastzSelf.2008-08-27
cd /cluster/data/bosTau4/bed
ln -s blastzSelf.2008-08-27 blastzSelf
cd blastzSelf.2008-08-27
cat << '_EOF_' > DEF
BLASTZ_M=400
# TARGET: Cow bosTau4
SEQ1_DIR=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.2bit
SEQ1_LEN=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau4
SEQ2_DIR=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.2bit
SEQ2_LEN=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/bosTau4/bed/blastzSelf.2008-08-27
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
cd /cluster/data/bosTau4/bed/blastzSelf.2008-08-27
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
`pwd`/DEF > blastz.out 2>&1 &
# real 3537m49.719s
# had to fix a slight problem in the make downloads step, then:
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
-continue=cleanup `pwd`/DEF > cleanup.out 2>&1 &
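# doBlastzChainNet.pl normally records coverage in
# fb.bosTau4.chainSelfLink.txt during the load step; if that file is
# missing it can be regenerated by hand (the chainSelfLink table name is
# the usual convention for a self run, assumed here):
featureBits bosTau4 chainSelfLink > fb.bosTau4.chainSelfLink.txt 2>&1
cat fb.bosTau4.chainSelfLink.txt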
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
bosTau4.upstreamGeneTbl = mgcGenes
bosTau4.upstreamMaf = multiz5way /hive/data/genomes/bosTau4/bed/multiz5way/species.list
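# quick check that the referenced species list is actually in place
# before the next genbank run picks up this setting:
cat /hive/data/genomes/bosTau4/bed/multiz5way/species.list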
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
############################################################################
#############################################################################
# LIFTOVER TO bosTauMd3 (DONE - 2009-12-16 - galt)
ssh hgwdev
cd /hive/data/genomes/bosTau4
ln -s bosTau4.11.ooc 11.ooc
time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
-bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
bosTau4 bosTauMd3 >& doLiftOverToBosTauMd3.log
pushd /usr/local/apache/htdocs/goldenPath/bosTau4/liftOver/
md5sum bosTau4ToBosTauMd3.over.chain.gz >> md5sum.txt
popd
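# quick sanity check of the new chain with a throwaway interval
# (the coordinates and /tmp file names here are for illustration only):
echo -e "chr1\t1000000\t1001000\ttest1" > /tmp/liftTest.bed
liftOver /tmp/liftTest.bed \
    /usr/local/apache/htdocs/goldenPath/bosTau4/liftOver/bosTau4ToBosTauMd3.over.chain.gz \
    /tmp/liftTest.bosTauMd3.bed /tmp/liftTest.unMapped
cat /tmp/liftTest.bosTauMd3.bed /tmp/liftTest.unMapped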
#########################################################################
-# susScr1 Pig BLASTZ/CHAIN/NET (WORKING - 2010-01-21 - Hiram)
+# susScr1 Pig BLASTZ/CHAIN/NET (DONE - 2010-01-25 - Hiram)
screen # use a screen to manage this multi-day job
- mkdir /hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-21
- cd /hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-21
+ mkdir /hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-25
+ cd /hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-25
cat << '_EOF_' > DEF
# Pig vs. Cow
BLASTZ_M=50
# TARGET: Cow BosTau4
SEQ1_DIR=/scratch/data/bosTau4/bosTau4.2bit
SEQ1_LEN=/scratch/data/bosTau4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=100
# QUERY: Pig SusScr1
SEQ2_DIR=/scratch/data/susScr1/susScr1.2bit
SEQ2_LEN=/scratch/data/susScr1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
-BASE=/hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-21
+BASE=/hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-25
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -syntenicNet \
- -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
+ # real 2377m21.473s
+ # never did get this to work; the run with susScr2 as target and
+ # bosTau4 as query did finish, but with immense chain pileups
+ # this run failed the second time too
+ # XXX - re-running as 01-25 (was 01-21) Mon Jan 25 11:29:32 PST 2010
# real 3811m31.428s
# failed
-XXX - running Thu Jan 21 14:49:43 PST 2010
+ # XXX - running Thu Jan 25 14:49:43 PST 2010
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
`pwd`/DEF -verbose=2 \
-bigClusterHub=memk -chainMinScore=3000 -chainLinearGap=medium \
-syntenicNet > do.log 2>&1
#########################################################################
# lastz swap Rat Rn4 (DONE - 2010-01-22 - Hiram)
# original run
cd /cluster/data/rn4/bed/blastzBosTau4.2010-01-19
cat fb.rn4.chainBosTau4Link.txt
# 649931321 bases of 2571531505 (25.274%) in intersection
mkdir /cluster/data/bosTau4/bed/blastz.rn4.swap
cd /cluster/data/bosTau4/bed/blastz.rn4.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/rn4/bed/blastzBosTau4.2010-01-19/DEF \
-noLoadChainSplit -workhorse=hgwdev -smallClusterHub=memk \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -syntenicNet > swap.log 2>&1 &
# real 77m18.645s
cat fb.bosTau4.chainRn4Link.txt
# 664253901 bases of 2731830700 (24.315%) in intersection
#########################################################################
+# lastz swap Pig SusScr2 (DONE - 2010-03-31 - Hiram)
+ # the primary lastz run on susScr2
+ cd /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27
+ cat fb.susScr2.chainBosTau4Link.txt
+ # 1478903080 bases of 2231298548 (66.280%) in intersection
+
+ # and the swap
+ mkdir /hive/data/genomes/bosTau4/bed/blastz.susScr2.swap
+ cd /hive/data/genomes/bosTau4/bed/blastz.susScr2.swap
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27/DEF \
+ -swap -noLoadChainSplit -syntenicNet \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
+ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
+ # interestingly, this failed on the first step, chainSwap, but the
+ # failure did not stop the script; it continued and produced zero
+ # results all the way to the end.  Running the chainSwap manually:
+ cd /hive/data/genomes/bosTau4/bed/blastz.susScr2.swap/axtChain
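+# ulimit -d and -v take kilobyte units here: 188743680 KB = 180 GB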
+export sizeG=188743680
+ulimit -d $sizeG
+ulimit -v $sizeG
+
+chainSwap /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27/axtChain/susScr2.bosTau4.all.chain.gz stdout \
+| nice chainSort stdin stdout | nice gzip -c > bosTau4.susScr2.all.chain.gz
+
+ # it also runs out of memory during liftOver file creation:
+ export sizeG=188743680
+ ulimit -d $sizeG
+ ulimit -v $sizeG
+
+ netChainSubset -verbose=0 noClass.net bosTau4.susScr2.all.chain.gz stdout \
+ | chainStitchId stdin stdout | gzip -c > bosTau4.susScr2.over.chain.gz
+
+ # and netChains.csh is finished manually with this added:
+# memory limit 160 Gb
+limit datasize 163840m
+limit vmemoryuse 163840m
+
+ # manually run the loadUp.csh with this added:
+# memory limit 160 Gb
+limit datasize 163840m
+limit vmemoryuse 163840m
+ # real 498m5.861s
+
+ cat fb.bosTau4.chainSusScr2Link.txt
+ # 1383557633 bases of 2731830700 (50.646%) in intersection
+
+#########################################################################