src/hg/makeDb/doc/monDom4.txt 1.17

1.17 2009/09/20 17:16:45 markd
transmap build
Index: src/hg/makeDb/doc/monDom4.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/monDom4.txt,v
retrieving revision 1.16
retrieving revision 1.17
diff -b -B -U 1000000 -r1.16 -r1.17
--- src/hg/makeDb/doc/monDom4.txt	1 Jul 2008 16:53:02 -0000	1.16
+++ src/hg/makeDb/doc/monDom4.txt	20 Sep 2009 17:16:45 -0000	1.17
@@ -1,3415 +1,3424 @@
 # for emacs: -*- mode: sh; -*-
 
 
 #	Creating the assembly for Monodelphis domestica
 #	South American, Short-tailed Opossum
 #	http://www.genome.gov/11510687
 #	http://www.genome.gov/12512285
 
 #  NOTE:  this doc may have genePred loads that fail to include
 #  the bin column.  Please correct that for the next build by adding
 #  a bin column when you make any of these tables:
 #
 #  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
 #  +-------------+---------------------------------+
 #  | tableName   | type                            |
 #  +-------------+---------------------------------+
 #  | refGene     | genePred refPep refMrna         |
 #  | xenoRefGene | genePred xenoRefPep xenoRefMrna |
 #  | ensGene     | genePred ensPep                 |
 #  | genscan     | genePred genscanPep             |
 #  +-------------+---------------------------------+
 
 
 #########################################################################
 # DOWNLOAD SEQUENCE (DONE - 2005-08-18 - Hiram)
     ssh kkstore01
     mkdir -p /cluster/store9/monDom4/broad.mit.edu
     ln -s /cluster/store9/monDom4 /cluster/data/monDom4
     cd /cluster/data/monDom4/broad.mit.edu
     time wget --timestamping --force-directories --directory-prefix=. \
         --dont-remove-listing --recursive --level=2 --no-parent \
         --no-host-directories --cut-dirs=5 \
  ftp://ftp.broad.mit.edu/distribution/assemblies/mammals/monodelphis/monDom4/*
     #	That takes a number of hours, ending up with:
 # -rw-rw-r--  1    7492963 Jan  5 18:10 Monodelphis4.0.agp
 # -rw-rw-r--  1       1160 Jan  5 18:10 MappedAssembly.Table
 # -rw-rw-r--  1        716 Jan  5 18:10 ForDistribution.command
 # -rw-rw-r--  1       1215 Jan  5 18:10 BasicStats.out
 # -rw-rw-r--  1       1512 Jan  5 18:11 ReadUsage.Table
 # -rw-rw-r--  1  474573344 Jan  5 18:11 Monodelphis4.0.agp.chromosome.qual.gz
 # -rw-rw-r--  1 1201437533 Jan  5 18:11 Monodelphis4.0.agp.chromosome.fasta.gz
 # -rw-rw-r--  1    7647098 Jan  5 18:12 assembly.agp
 # -rw-rw-r--  1   14841256 Jan  5 18:13 assembly.markup
 # -rw-rw-r--  1    2838523 Jan  5 18:13 assembly.links
 # -rw-rw-r--  1  474281414 Jan  5 18:13 assembly.agp.qual.gz
 # -rw-rw-r--  1 1201095284 Jan  5 18:13 assembly.agp.fasta.gz
 # -rw-rw-r--  1  176159300 Jan  5 18:14 assembly.unplaced
 # -rw-rw-r--  1 1151905015 Jan  5 18:14 assembly.reads.gz
 # -rw-rw-r--  1         43 Jan  5 18:15 source
 # -rw-rw-r--  1  471180166 Jan  5 18:15 contigs.quals.gz
 # -rw-rw-r--  1 1198813602 Jan  5 18:15 contigs.bases.gz
 # -rw-rw-r--  1  757582328 Jan  5 18:16 unplaced.fasta.gz
 # -rw-rw-r--  1 2083924258 Jan  5 18:18 unplaced.qual.gz
 # -rw-rw-r--  1 1159288222 Jan  5 19:17 softmasked_monDom3.tgz
 
     zcat broad.mit.edu/Monodelphis4.0.agp.chromosome.fasta.gz \
 	| faSplit -verbose=2 byname stdin broadSplit/
 
     #	combine the Broad split files into single chroms
 for C in 1 2 3 4 5 6 7 8 X Un
 do
     mkdir -p ${C}
     rm -f ${C}/chr${C}.fa
     echo ">chr${C}" > ${C}/chr${C}.fa
     echo -n "${C}/chr${C}.fa working ... "
     ls broadSplit/${C}.*-*.fa | sort -t"." -k2,2n | while read F
     do
         grep -v "^>" ${F} >> ${C}/chr${C}.fa
     done
     echo "done"
 done
 
     #	create a 2bit file to get a blastz hg18 started
     #	this will be replaced later after RepeatMasker with masked
     #	sequence
     faToTwoBit ?/chr?.fa Un/chrUn.fa monDom4.2bit
     #   generate chrom.sizes list in order by size, biggest first
     #   This listing is going to be used in a variety of procedures
     #   below
     twoBitInfo monDom4.2bit stdout | sort -rn +1 > chrom.sizes
     mkdir /san/sanvol1/scratch/monDom4
     cp -p monDom4.2bit /san/sanvol1/scratch/monDom4
     cp -p chrom.sizes /san/sanvol1/scratch/monDom4
     #	split into 500K chunks for RepeatMasker
     mkdir split500K
 for C in 1 2 3 4 5 6 7 8 X Un
 do
     mkdir split500K/${C}
     faSplit -verbose=2 size ${C}/chr${C}.fa 500000 split500K/${C}/chr${C}_ \
         -lift=split500K/${C}.lft
 done
     mkdir lifts
     cat split500K/*.lft > lifts/lift500K.lft
 
 ###########################################################################
 #  RepeatMasker run (DONE 2006-02-06 - Hiram)
     ssh kk
     mkdir /cluster/data/monDom4/RMRun
     mkdir /cluster/data/monDom4/RMOut
     mkdir /cluster/data/monDom4/scripts
     cd /cluster/data/monDom4/split500K
 for C in 1 2 3 4 5 6 7 8 X Un
 do
     ls -1S ${C}/*.fa
 done > ../RMRun/split.list
 
     cd /cluster/data/monDom4/RMRun
     cat << '_EOF_' > template
 #LOOP
 ../scripts/RMOpossum $(dir1) $(root1) {check out line ../RMOut/$(dir1)/$(root1).out}
 #ENDLOOP
 '_EOF_'
     #	happy emacs
 
     cat << '_EOF_' > ../scripts/RMOpossum
 #!/bin/csh -fe
 /bin/mkdir -p /scratch/tmp/monDom4/$1/$2
 /bin/mkdir -p ../RMOut/$1
 /bin/cp ../split500K/$1/$2.fa /scratch/tmp/monDom4/$1/$2/$2.fa
 pushd /scratch/tmp/monDom4/$1/$2
 /cluster/bluearc/RepeatMasker060120/RepeatMasker -s \
         -species "Monodelphis domestica" $2.fa
 popd
 /bin/cp -p /scratch/tmp/monDom4/$1/$2/$2.fa.out $3
 /bin/rm -fr /scratch/tmp/monDom4/$1/$2/*
 /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1/$2
 /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1
 /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4
 '_EOF_'
     # happy emacs
     chmod +x ../scripts/RMOpossum
 
     gensub2 split.list single template jobList
     para create jobList
     para try ... check ... push ... etc ...
 # Completed: 7170 of 7170 jobs
 # CPU time in finished jobs:   26375191s  439586.52m  7326.44h  305.27d  0.836 y
 # IO & Wait Time:               1584329s   26405.48m   440.09h   18.34d  0.050 y
 # Average job time:                3900s      64.99m     1.08h    0.05d
 # Longest finished job:           12608s     210.13m     3.50h    0.15d
 # Submission to last job:        162734s    2712.23m    45.20h    1.88d
 
     #	lift results up to chrom coordinates
     ssh kkstore01
     cd /cluster/data/monDom4/RMOut
     for C in 1 2 3 4 5 6 7 8 X Un
     do
 	echo "chr${C} working ... "
 	(cat ${C}/chr${C}_000.out ${C}/chr${C}_0000.out 2> /dev/null; \
 	    ls ${C}/chr${C}_*.out | egrep -v "_000.out|_0000.out" \
 		| xargs tail --quiet --lines=+4) | \
 	    liftUp ../${C}/chr${C}.fa.out ../lifts/lift500K.lft error stdin
 	echo "chr${C} done"
     done
     #	the seemingly duplicate mention of _000.out and _0000.out takes
     #	the seemingly duplicate mention of _000.out and _0000.out takes
     #	care of the case where one or the other exists, but both never
     #	exist.  One will not exist and the cat will complain to stderr, hence
     #	the redirection of cat complaints to dev null.
 
 ###########################################################################
 # BLASTZ HUMAN hg18 - to quickly investigate monDom4 to see if it has the same
 # pile up problem that was in monDom2.  Got this started right after the
 # monDom4.2bit file was made from the original Broad chrom fasta
     ssh pk
     mkdir /cluster/store9/monDom4/bed/blastzHg18.2006-02-08
     cd /cluster/store9/monDom4/bed/blastzHg18.2006-02-08
 
     cat << '_EOF_' > DEF
 # human vs. opossum
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 
 # Specific settings for chicken (per Webb email to Brian Raney)
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=10000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET: Human (hg18)
 SEQ1_DIR=/scratch/hg/hg18/nib
 SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
 SEQ1_IN_CONTIGS=0
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Opossum monDom4
 SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
 SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/monDom4/bed/blastzHg18.2006-02-08
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	happy emacs
 
     
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	`pwd`/DEF > blastz.out 2>&1 &
 XXXX - running 2006-02-08 10:06
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=chainMerge `pwd`/DEF > chainMerge.out 2>&1 &
 
 ###########################################################################
 #	Initial database setup (DONE - 2006-02-06 - Hiram)
     ssh hgwdev
     cd /cluster/data/monDom4
     mkdir /gbdb/monDom4
     ln -s /cluster/data/monDom4/monDom4.2bit /gbdb/monDom4
     twoBitInfo monDom4.2bit stdout \
         | awk '{printf "%s\t%s\t/gbdb/monDom4/monDom4.2bit\n", $1, $2}' \
         > chromInfo.tab
 
     hgsql -e "create database monDom4;" mysql
     hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;" \
                 monDom4
     hgsql monDom4 < $HOME/kent/src/hg/lib/chromInfo.sql
     hgsql -e 'load data local infile "chromInfo.tab" into table chromInfo;' \
         monDom4
     hgsql monDom4 -N -e 'select chrom from chromInfo' > chrom.lst
 
     # Enter monDom4 into dbDb and defaultDb so test browser knows about
     # it:
     hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
         defaultPos, active, orderKey, genome, scientificName, \
         htmlPath, hgNearOk, hgPbOk, sourceName) \
         VALUES("monDom4", "Jan 2006", "/gbdb/monDom4", "Opossum", \
         "chr4:344283621-363415496", 1, 34, "Opossum", \
         "Monodelphis domestica", \
         "/gbdb/monDom4/html/description.html", 0, 0, \
         "Broad Inst. monDom4 Jan06");' \
         -h localhost hgcentraltest
     #	do this update later when it is time to make this be the default
     hgsql -e 'update defaultDb set name="monDom4" where genome="Opossum";' \
 	hgcentraltest
     #	update default position to a HOX region:
     hgsql -e \
 'update dbDb set defaultPos="chr8:293325654-293362847" where name="monDom4";' \
 	hgcentraltest
 	chr8:293,325,654-293,362,847
 
     # Make trackDb table so browser knows what tracks to expect
     mkdir /cluster/data/monDom4/html
     ln -s /cluster/data/monDom4/html /gbdb/monDom4/html
     mkdir $HOME/kent/src/hg/makeDb/trackDb/opossum/monDom4
     cd $HOME/kent/src/hg/makeDb/trackDb/opossum
     cvs add monDom4
     cd monDom4
     cp -p /cluster/data/monDom2/html/description.html ./description.html
     #	edit that description to update
     cvs add description.html
     cvs commit
     cd $HOME/kent/src/hg/makeDb/trackDb
     #	add monDom4 to the makefile, then
     make DBS=monDom4
     #	or
     make DBS=monDom4 alpha
     #	to place it on the genome-test browser
 
 #######################################################################
 # SIMPLE REPEAT [TRF] TRACK (DONE - 2006-02-08 - Hiram)
 #   This can be done while the above repeat masker run is working
     ssh kkstore01
     mkdir /cluster/data/monDom4/bed/simpleRepeat
     cd /cluster/data/monDom4
     #	split into 50M chunks for trfBig
     mkdir split50M
 for C in 1 2 3 4 5 6 7 8 X Un
 do
     mkdir split50M/${C}
     faSplit -verbose=2 size ${C}/chr${C}.fa 50000000 split50M/${C}/chr${C}_ \
         -lift=split50M/${C}.lft
 done
     cat split50M/*.lft > lifts/lift50M.lft
     cd split50M
     ls -1S */*.fa > ../bed/simpleRepeat/split.list
 
     #	small kluster for these larger jobs
     ssh kki
     cd /cluster/data/monDom4/bed/simpleRepeat
     mkdir trf
 
     cat << '_EOF_' > template
 #LOOP
 ./runTrf.csh $(dir1) $(root1) {check out line trf/$(root1).bed}
 #ENDLOOP
 '_EOF_'
     #	happy emacs
 
     cat << '_EOF_' > runTrf.csh
 #!/bin/csh -fe
 set workDir = /scratch/tmp/monDom4TRF/$1/$2
 set inputFN = ../../split50M/$1/$2.fa
 set outputFN = $3
 mkdir -p $workDir
 cp -p $inputFN $workDir
 pushd $workDir
 /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $2.fa /dev/null -bedAt=$2.bed -tempDir=/scratch/tmp
 popd
 rm -f $outputFN
 cp -p $workDir/$2.bed $outputFN
 rm -fr $workDir/*
 rmdir --ignore-fail-on-non-empty $workDir
 rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4TRF/$1
 rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4TRF
 '_EOF_'
     #	happy emacs
     chmod +x runTrf.csh
 
     gensub2 split.list single template jobList
     para create jobList
     para try ... check ... push ... etc ...
 # Completed: 77 of 77 jobs
 # CPU time in finished jobs:      16109s     268.48m     4.47h    0.19d  0.001 y
 # IO & Wait Time:                  1199s      19.99m     0.33h    0.01d  0.000 y
 # Average job time:                 225s       3.75m     0.06h    0.00d
 # Longest finished job:            1322s      22.03m     0.37h    0.02d
 # Submission to last job:          1870s      31.17m     0.52h    0.02d
 
     #	lift to chrom coordinates
     ssh kkstore01
     cd /cluster/data/monDom4/bed/simpleRepeat
     find ./trf -type f | xargs cat | \
         liftUp simpleRepeat.bed \
 	    /cluster/data/monDom4/lifts/lift50M.lft \
 		error stdin
 
      # Load into the database:
     ssh hgwdev
     cd /cluster/data/monDom4/bed/simpleRepeat
     hgLoadBed -strict monDom4 simpleRepeat simpleRepeat.bed \
         -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
     #	Loaded 710087 elements of size 16
 
     #   Compare with previous assembly
     time featureBits monDom4 simpleRepeat
     #	68337838 bases of 3501643220 (1.952%) in intersection
     #	real    0m42.137s
     time featureBits monDom2 simpleRepeat
     #	69239555 bases of 3496445653 (1.980%) in intersection
     #	real    1m5.716s
     time featureBits monDom1 simpleRepeat
     #	55000238 bases of 3492108230 (1.575%) in intersection
     #	real    1m49.240s
 
 #######################################################################
 # PROCESS SIMPLE REPEATS INTO MASK (DONE - 2006-02-10 - Hiram)
 # After the simpleRepeats track has been built, make a filtered
 # version 
 # of the trf output: keep trf's with period <= 12:
 #	need these mask files on a per-chrom basis
     ssh kkstore01
     cd /cluster/data/monDom4/bed/simpleRepeat
 
     mkdir chromMask
     for C in 1 2 3 4 5 6 7 8 X Un
     do
 	echo "chr${C} working ..."
 	cat trf/chr${C}_*.bed | liftUp chromMask/chr${C}.bed \
 	    /cluster/data/monDom4/lifts/lift50M.lft \
 		error stdin
 	awk '{if ($5 <= 12) print;}' chromMask/chr${C}.bed \
 		> chromMask/chr${C}_Mask.bed
 	echo "chr${C} done"
     done
 
     #	should be same result as if we had done the whole thing as a
     #	single piece:
     cat chromMask/*_Mask.bed | wc -l
     #	485364
     awk '{if ($5 <= 12) print;}' simpleRepeat.bed | wc -l
     #	485364
 
 #######################################################################
 # MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE - 2006-02-10 - Hiram)
 #	Additional masking was added later to use Damian Keefe's repeat
 #	libraries.
     #	the new style binRange was in development at this time, so using
     #	binaries built today with that new functionality.  Next time
     #	these will be the standard commands
     ssh kkstore01
     cd /cluster/data/monDom4
     #	verify sequence before and after masking to ensure it doesn't
     #	get broken
     time faCount ?/chr?.fa Un/chrUn.fa
 #seq    len     A       C       G       T       N       cpg
 # chr1  748055161 227831941 138649610 138661137 228270299 14642174 3225777
 # chr2  541556283 163736285 100133171 100127159 163985111 13574557 2505214
 # chr3  526135210 161221343  95632761  95680478 160815407 12785221 2208210
 # chr4  430141050 130190442  78748372  78684824 130210194 12307218 1877031
 # chr5  308900514  94866760  56126597  56061671  94769331  7076155 1311015
 # chr6  244784211  73686567  45271636  45267221  73515529  7043258 1216115
 # chr7  262624689  81219469  47052213  47076531  81079019  6197457 1127925
 # chr8  308469712  93087343  56578139  56466687  93245129  9092414 1427329
 # chrX   60718501  16286269  11251258  11264518  16304629  5611827  371051
 # chrUn 174229318 46556476  32785341  32891615  46354738 15641148 1620662
 # total 3605614649 1088682895 662229098 662181841 1088549386 103971429 16890329
     #	real    1m51.555s
 
     cat << '_EOF_' > scripts/maskChroms.sh
 #!/bin/sh
 
 for C in 1 2 3 4 5 6 7 8 X Un
 do
     echo "chr${C} working ..."
     $HOME/bin/x86_64/maskOutFa -soft ${C}/chr${C}.fa \
         bed/simpleRepeat/chromMask/chr${C}_Mask.bed ${C}/chr${C}_masked.fa
     $HOME/bin/x86_64/maskOutFa -softAdd ${C}/chr${C}_masked.fa \
         ${C}/chr${C}.fa.out ${C}/chr${C}_masked.fa
     echo "done"
 done
 '_EOF_'
     #	happy emacs
     chmod +x scripts/maskChroms.sh
     time ./scripts/maskChroms.sh
 
     #	should have the same sequence still
     time faCount ?/chr?_masked.fa Un/chrUn_masked.fa
 #seq    len     A       C       G       T       N       cpg
 chr1  748055161 227831941 138649610 138661137 228270299 14642174  3225777
 chr2  541556283 163736285 100133171 100127159 163985111 13574557  2505214
 chr3  526135210 161221343  95632761  95680478 160815407 12785221  2208210
 chr4  430141050 130190442  78748372  78684824 130210194 12307218  1877031
 chr5  308900514  94866760  56126597  56061671  94769331  7076155  1311015
 chr6  244784211  73686567  45271636  45267221  73515529  7043258  1216115
 chr7  262624689  81219469  47052213  47076531  81079019  6197457  1127925
 chr8  308469712  93087343  56578139  56466687  93245129  9092414  1427329
 chrX   60718501  16286269  11251258  11264518  16304629  5611827   371051
 chrUn 174229318  46556476  32785341  32891615  46354738 15641148  1620662
 total 3605614649 1088682895 662229098 662181841 1088549386 103971429 16890329
 
     #	after verifying they are the same, remove the old unmasked sequence
 for C in 1 2 3 4 5 6 7 8 X Un
 do
     rm ${C}/chr${C}.fa
     mv ${C}/chr${C}_masked.fa ${C}/chr${C}.fa
 done
 
     #	Create new 2bit file with this masked sequence
     ls -og *.2bit
     #	-rw-rw-r--  1 901986358 Feb  8 09:29 monDom4.2bit
     #	browser will not function while this 2bit file does not exist
     rm monDom4.2bit
     faToTwoBit ?/chr?.fa Un/chrUn.fa monDom4.2bit
     #	the file becomes larger:
     ls -og *.2bit
     #	-rw-rw-r--  1 950060406 Feb 10 13:53 monDom4.2bit
 
     #	and load the rmsk tables
     ssh hgwdev
     cd /cluster/data/monDom4
     time $HOME/bin/i386/hgLoadOut -verbose=2 monDom4 \
 	?/chr?.fa.out Un/chrUn.fa.out
 #       hgLoadOut: connected to database: monDom4
 # Processing 1/chr1.fa.out
 # Strange perc. field -460.2 line 561938 of 1/chr1.fa.out
 # Strange perc. field -395.6 line 561938 of 1/chr1.fa.out
 # Loading up table chr1_rmsk
 # Processing 2/chr2.fa.out
 # Strange perc. field -160.2 line 234960 of 2/chr2.fa.out
 # Strange perc. field -238.7 line 234960 of 2/chr2.fa.out
 # bad rep range [1723, 73] line 451681 of 2/chr2.fa.out 
 # Strange perc. field -6.0 line 1104797 of 2/chr2.fa.out
 # Loading up table chr2_rmsk
 # Processing 3/chr3.fa.out
 # bad rep range [2344, 191] line 648571 of 3/chr3.fa.out 
 # Loading up table chr3_rmsk
 # Processing 4/chr4.fa.out
 # Strange perc. field -7.0 line 7733 of 4/chr4.fa.out
 # Loading up table chr4_rmsk
 # Processing 5/chr5.fa.out
 # Strange perc. field -6.9 line 34415 of 5/chr5.fa.out
 # Strange perc. field -1.1 line 44716 of 5/chr5.fa.out
 # Strange perc. field -1.1 line 44720 of 5/chr5.fa.out
 # Strange perc. field -1.1 line 44722 of 5/chr5.fa.out
 # Strange perc. field -1.1 line 44724 of 5/chr5.fa.out
 # Strange perc. field -38.3 line 146729 of 5/chr5.fa.out
 # Strange perc. field -38.3 line 146732 of 5/chr5.fa.out
 # Strange perc. field -38.3 line 146734 of 5/chr5.fa.out
 # Strange perc. field -38.3 line 146737 of 5/chr5.fa.out
 # Strange perc. field -42.5 line 239189 of 5/chr5.fa.out
 # Loading up table chr5_rmsk
 # Processing 6/chr6.fa.out
 # Strange perc. field -0.2 line 87360 of 6/chr6.fa.out
 # Strange perc. field -0.1 line 87360 of 6/chr6.fa.out
 # Strange perc. field -3960.1 line 507698 of 6/chr6.fa.out
 # Strange perc. field -1856.3 line 507698 of 6/chr6.fa.out
 # Loading up table chr6_rmsk
 # Processing 7/chr7.fa.out
 # Loading up table chr7_rmsk
 # Processing 8/chr8.fa.out
 # Loading up table chr8_rmsk
 # Processing X/chrX.fa.out
 # Loading up table chrX_rmsk
 # Processing Un/chrUn.fa.out
 # Strange perc. field -27.6 line 129404 of Un/chrUn.fa.out
 # Strange perc. field -1.8 line 129404 of Un/chrUn.fa.out
 # Strange perc. field -27.6 line 129407 of Un/chrUn.fa.out
 # Strange perc. field -1.8 line 129407 of Un/chrUn.fa.out
 # Strange perc. field -27.6 line 129409 of Un/chrUn.fa.out
 # Strange perc. field -1.8 line 129409 of Un/chrUn.fa.out
 # Strange perc. field -27.6 line 129411 of Un/chrUn.fa.out
 # Strange perc. field -1.8 line 129411 of Un/chrUn.fa.out
 # Strange perc. field -27.6 line 129413 of Un/chrUn.fa.out
 # Strange perc. field -1.8 line 129413 of Un/chrUn.fa.out
 # Loading up table chrUn_rmsk
 # note: 2 records dropped due to repStart > repEnd
 
     #	Note the interesting coincidence of the exact same % for monDom4
     #	and monDom2
     time featureBits monDom4 rmsk
     #	1814099592 bases of 3501643220 (51.807%) in intersection
     #	real    3m57.193s
     #	time featureBits -countGaps monDom4 rmsk
     #	1814099592 bases of 3605614649 (50.313%) in intersection
 
     time featureBits monDom2 rmsk
     #	1811406819 bases of 3496445653 (51.807%) in intersection
     #	real    5m31.388s
     time featureBits monDom1 rmsk
     #	1775646140 bases of 3492108230 (50.847%) in intersection
 
 #######################################################################
 # GC5BASE (DONE - 2006-02-08 - Hiram)
     ssh kkstore01
     mkdir /cluster/data/monDom4/bed/gc5Base
     cd /cluster/data/monDom4/bed/gc5Base
     time hgGcPercent -verbose=2 -wigOut -doGaps -file=stdout -win=5 monDom4 \
         /cluster/data/monDom4 | wigEncode stdin gc5Base.wig gc5Base.wib
 
     ssh hgwdev
     cd /cluster/data/monDom4/bed/gc5Base
     mkdir /gbdb/monDom4/wib
     ln -s `pwd`/gc5Base.wib /gbdb/monDom4/wib
     hgLoadWiggle -verbose=2 monDom4 gc5Base gc5Base.wig
 
 #######################################################################
 # LOAD GAP & GOLD TABLES FROM AGP (DONE - 2006-02-10 - Hiram)
     ssh hgwdev
     cd /cluster/data/monDom4
     hgGoldGapGl -noGl monDom4 broad.mit.edu/Monodelphis4.0.agp
     #   Verify sanity of indexes
     hgsql -e "show index from gold;" monDom4
     hgsql -e "show index from gap;" monDom4
     #   Look at the Cardinality column, it should have some healthy
     #   numbers for all indices
     # For some reason, the indices do not get built correctly --
     # "show index from gap/gold" shows NULL cardinalities for chrom.  
     # Rebuild indices with "analyze table".
     hgsql -e "analyze table gold; analyze table gap;" monDom4
 
 #######################################################################
 #	LINEAGE SPECIFIC REPEATS (DONE - 2006-02-10 - Hiram)
 #	Let's see if DateRepeats can do anything with this Opossum
 #	business.  This was tried, but it doesn't make anything special
 #	for any of the organisms and a -comp of opossum isn't allowed at
 #	this time so we couldn't do this on the other organisms either to 
 #	obtain any specific repeats there.
     ssh kki
     mkdir /cluster/data/monDom4/linSpecRep
     cd /cluster/data/monDom4/linSpecRep
     
     for C in 1 2 3 4 5 6 7 8 X Un
     do
 	ln -s ../${C}/chr${C}.fa.out ./chr${C}.rmsk.out
     done
 
     #	This takes a long time to run as a script, to speed it up it has
     #	been converted to a mini cluster run
     cat << '_EOF_' > template
 #LOOP
 /cluster/bluearc/RepeatMasker060120/DateRepeats $(path1) -query opossum -comp human -comp mouse -comp rat -comp dog -comp dog
 #ENDLOOP
 '_EOF_'
 
     ls *.out > out.list
 
     gensub2 out.list single template jobList
 
     ssh kki
     cd /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec
     ./runArian.sh > jobList
     para create jobList
     para try, ... check ... push ... etc ...
     #	To check if any difference is noted in the various comparison
     #	organisms:
     awk '{if(NF>10){a=NF-4;b=NF-3;c=NF-2;d=NF-1; print $a,$b,$c,$d,$NF}}' \
 	chr2.rmsk.out_homo-sapiens_mus-musculus_rattus_canis-familiaris_canis-familiaris \
 	    | sort | uniq -c
     #	187651 - - - - -
     #	113100 0 0 0 0 0
     #	210909 ? ? ? ? ?
     #	602832 X X X X X
     #	always the same mark for each comp organism
 
 #######################################################################
 #	Distribute files to filesystems for kluster runs
     ssh kkr1u00
     mkdir /iscratch/i/monDom4
     cd /iscratch/i/monDom4
     cp -p /cluster/data/monDom4/monDom4.2bit .
     cp -p /cluster/data/monDom4/chrom.sizes .
 for R in 2 3 4 5 6 7 8
 do
     rsync -a --progress /iscratch/i/monDom4/ kkr${R}u00:/iscratch/i/monDom4/
 done
 
     ssh pk
     mkdir /san/sanvol1/scratch/monDom4
     cd /san/sanvol1/scratch/monDom4
     cp -p /cluster/data/monDom4/monDom4.2bit .
     cp -p /cluster/data/monDom4/chrom.sizes .
 
 #######################################################################
 #  HUMAN BLASTZ - this time with masked sequence
 #	(DONE - 2006-02-10 - 2006-02-15 - Hiram)
     ssh pk
     mkdir /cluster/data/monDom4/bed/blastzHg18.2006-02-10
     cd /cluster/data/monDom4/bed/blastzHg18.2006-02-10
 
     cat << '_EOF_' > DEF
 # opossum vs. human
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 
 # settings for more distant organism alignments
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=10000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET: Opossum monDom4
 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
 SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Human (hg18)
 SEQ2_DIR=/scratch/hg/hg18/nib
 SEQ2_LEN=/scratch/hg/hg18/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/monDom4/bed/blastzHg18.2006-02-10
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-stop=net `pwd`/DEF > blastz.out 2>&1 &
     #	running 2006-02-10
 
 #######################################################################
 #	Additional RepeatMasking - this sequence needs to be
 #	additionally masked with Damian Keefe's libraries
 #		(DONE - 2006-02-11 - Hiram)
     ssh kkr1u00
     mkdir /cluster/data/monDom4/RMExtra
     cd /cluster/data/monDom4/RMExtra
     wget --timestamping --force-directories --directory-prefix=. \
         --dont-remove-listing --recursive --level=2 --no-parent \
         --no-host-directories --cut-dirs=6 \
 ftp://ftp.ebi.ac.uk/pub/databases/ensembl/encode/repeat_libraries/monodelphis/*
     #	picks up:
     #	-rw-r--r--  1  679203 May  9  2005 supplemental.lib
     #	-rw-r--r--  1  737274 May  9  2005 ab_initio.lib
     #	-rw-r--r--  1     452 May  9  2005 README
     cp -p supplemental.lib /iscratch/i/monDom4
     cd /iscratch/i/monDom4
     for R in 2 3 4 5 6 7 8
     do
 	rsync -a --progress /iscratch/i/monDom4/ kkr${R}u00:/iscratch/i/monDom4/
     done
 
     ssh kk
     cd /cluster/data/monDom4/RMExtra
     cat << '_EOF_' > RMExtra.csh
 #!/bin/csh -fe
 /bin/mkdir -p /scratch/tmp/monDom4/$1/$2
 /bin/mkdir -p ../RMOutExtra/$1
 /bin/cp ../split500K/$1/$2.fa /scratch/tmp/monDom4/$1/$2/$2.fa
 pushd /scratch/tmp/monDom4/$1/$2
 /cluster/bluearc/RepeatMasker060120/RepeatMasker -s \
         -lib /iscratch/i/monDom4/supplemental.lib $2.fa
 popd
 /bin/cp -p /scratch/tmp/monDom4/$1/$2/$2.fa.out $3
 /bin/rm -fr /scratch/tmp/monDom4/$1/$2/*
 /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1/$2
 /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1
 '_EOF_'
     #	happy emacs
     chmod +x RMExtra.csh
 
     cat << '_EOF_' > template
 #LOOP
 ./RMExtra.csh $(dir1) $(root1) {check out line ../RMOutExtra/$(dir1)/$(root1).out}
 #ENDLOOP
 '_EOF_'
     #	happy emacs
 
     mkdir ../RMOutExtra
     gensub2 ../RMRun/split.list single template jobList
     para create jobList
     para try ... check ... push ... etc...
     para time
 # Completed: 7170 of 7170 jobs
 # CPU time in finished jobs:    8540127s  142335.45m  2372.26h   98.84d  0.271 y
 # IO & Wait Time:                103372s    1722.87m    28.71h    1.20d  0.003 y
 # Average job time:                1206s      20.09m     0.33h    0.01d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            2326s      38.77m     0.65h    0.03d
 # Submission to last job:         22083s     368.05m     6.13h    0.26d
     cd /cluster/data/monDom4/RMOutExtra
     for C in 1 2 3 4 5 6 7 8 X Un
     do
 	echo "chr${C} working ... "
 	(cat ${C}/chr${C}_000.out ${C}/chr${C}_0000.out 2> /dev/null; \
 	    ls ${C}/chr${C}_*.out | egrep -v "_000.out|_0000.out" \
 		| xargs tail --quiet --lines=+4) | \
 	    liftUp ./chr${C}.fa.out ../lifts/lift500K.lft error stdin
 	echo "chr${C} done"
     done
 
     #	Add the two results together
     cd /cluster/data/monDom4/RMOut
     cp -p /cluster/data/monDom2/scripts/uniqueRMOut.csh ../scripts
     for C in 1 2 3 4 5 6 7 8 X Un
 do
     echo "${C} working ..."
     cp -p ../${C}/chr${C}.fa.out ./chr${C}.tmp.out
     tail +4 ../RMOutExtra/chr${C}.fa.out >> ./chr${C}.tmp.out
     ../scripts/uniqueRMOut.csh ./chr${C}.tmp.out > chr${C}.fa.out
     rm -f ./chr${C}.tmp.out &
     echo "done"
 done
     #	and then add this mask to the existing masked sequence
     for C in 1 2 3 4 5 6 7 8 X Un
 do
     echo "chr${C} working ..."
     rm -f ./chr${C}.added.fa
     maskOutFa -softAdd ../${C}/chr${C}.fa ./chr${C}.fa.out ./chr${C}.added.fa
     echo "done"
 done
 
     #	verify they are not broken compared to what was previous
     #	note the additional masked sequence in the lower case counts
     ssh kkstore01
     cd /cluster/data/monDom4/RMOut
     faSize chr?.added.fa chrUn.added.fa
     # 3605614649 bases (103971429 N's 3501643220 real 1551686450 upper
     # 1949956770 lower) in 10 sequences in 10 files
     faSize ../?/chr?.fa ../Un/chrUn.fa
     # 3605614649 bases (103971429 N's 3501643220 real 1687813668 upper
     # 1813829552 lower) in 10 sequences in 10 files
 
     #	If that is OK, replace the current sequence
     for C in 1 2 3 4 5 6 7 8 X Un
     do
 	rm ../${C}/chr${C}.fa
 	mv chr${C}.added.fa ../${C}/chr${C}.fa
     done
 
     #	And rebuild the 2bit file:
     faToTwoBit ?/chr?.fa Un/chrUn.fa monDom4RMExtra.2bit
     #	and switch it out from under the browser
     rm monDom4.2bit; mv monDom4RMExtra.2bit monDom4.2bit
     #	distribute to /iscratch/i/monDom4 ...
 
 ############################################################################
 #  BLATSERVERS ENTRY (DONE - 2006-02-16 - Hiram)
 #	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev
 
     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("monDom4", "blat17", "17786", "1", "0"); \
 	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("monDom4", "blat17", "17787", "0", "1");' \
 	    hgcentraltest
     #	test it with some sequence
 
 #########################################################################
 # CPGISLANDS (DONE - 2006-02-16 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/monDom4/bed/cpgIsland
     cd /cluster/data/monDom4/bed/cpgIsland
 
     # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
     cvs co hg3rdParty/cpgIslands
     cd hg3rdParty/cpgIslands
     make
     #	gcc readseq.c cpg_lh.c -o cpglh.exe
     cd ../..
     ln -s hg3rdParty/cpgIslands/cpglh.exe .
     
     ssh kkstore01
     cd /cluster/data/monDom4
     mkdir tmp
     # cpglh.exe requires hard-masked (N) .fa's.  
     time for CHR in ?/chr?.fa ??/chr??.fa
     do
 	echo "maskOutFa ${CHR} hard ${CHR}.masked"
 	maskOutFa ${CHR} hard ${CHR}.masked
     done > tmp/hardMask.out 2>&1
     #	about 3 minutes 20 seconds
 
     # There may be warnings about "bad character" for IUPAC ambiguous 
     # characters like R, S, etc.  Ignore the warnings.  
     cd /cluster/data/monDom4/bed/cpgIsland
     for F in ../../*/chr*.fa.masked
     do
 	FA=${F/*\/}
 	C=${FA/.fa.masked/}
 	echo "./cpglh.exe ${FA} > ${C}.cpg"
 	./cpglh.exe ${F} > ${C}.cpg
     done > cpglh.out 2>&1 &
     #	about 4 minutes, there were no warnings
 
     # Transform cpglh output to bed +
     cat << '_EOF_' > filter.awk
 {
 $2 = $2 - 1;
 width = $3 - $2;
 printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
        $1, $2, $3, $5,$6, width,
        $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
 }
 '_EOF_'
     #	happy emacs
     awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed
 
     ssh hgwdev
     cd /cluster/data/monDom4/bed/cpgIsland
     hgLoadBed -strict monDom4 cpgIslandExt -tab -noBin \
       -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
     #	Reading cpgIsland.bed
     #	Loaded 22409 elements of size 10
     featureBits monDom4 cpgIslandExt
     #	14526908 bases of 3501643220 (0.415%) in intersection
 
 #########################################################################
 # ANDY LAW CPGISSLANDS (DONE - 2006-02-16 - Hiram)
     # See notes in makeGalGal2.doc and makeCanFam2.doc
     ssh kkstore01
     mkdir /cluster/data/monDom4/bed/cpgIslandGgfAndy
     cd /cluster/data/monDom4/bed/cpgIslandGgfAndy
 
     #	Build the preProcGgfAndy program in
     #	kent/src/oneShot/preProcGgfAndy into your ~/bin/$MACHTYPE
 
     # Use masked sequence since this is a mammal...
     for F in ../../*/chr*.fa.masked
     do
 	FA=${F/*\/}
 	C=${FA/.fa.masked/}
 	echo preproc and run on masked "${C} ${F}" 1>/dev/stderr
 	~/bin/$MACHTYPE/preProcGgfAndy ${F} \
 	| /cluster/home/angie/ggf-andy-cpg-island.pl \
 	| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g1,$oE) = split("\t"); $s--;
                    $gc=$c+$g1;  $pCpG=(100.0 * 2 * $cpg / $n);
                    $pGc=(100.0 * $gc / $n);
                    $_="'${C}'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" .
                         "$pCpG\t$pGc\t$oE\n";'
     done | sort -k1,1 -k2,2n > cpgIslandGgfAndyMasked.bed
     #	about 3 minutes
 
     # load into database:
     ssh hgwdev
     cd /cluster/data/monDom4/bed/cpgIslandGgfAndy
     sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
       $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
     hgLoadBed -strict monDom4 cpgIslandGgfAndyMasked -tab -noBin \
       -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
     #	Loaded 67504 elements of size 10
     featureBits monDom4 cpgIslandExt
     #	14526908 bases of 3501643220 (0.415%) in intersection
     featureBits monDom4 cpgIslandGgfAndyMasked
     #	41042229 bases of 3501643220 (1.172%) in intersection
     wc -l ../cpgIsland/cpgIsland.bed *bed
     #	22409 ../cpgIsland/cpgIsland.bed
     #	67504 cpgIslandGgfAndyMasked.bed
 
 #######################################################################
 # MAKE 11.OOC FILE FOR BLAT (DONE - 2006-02-24 - Hiram)
     ssh kkstore01
     cd /cluster/data/monDom4
     #	use repMatch of 1228 as this genome is ~ %20 larger than human
     #	1024 + (1024 * 0.2) = 1228
     time blat monDom4.2bit \
 	/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1228
     #	Wrote 43992 overused 11-mers to 11.ooc
     #	real    3m24.793s
 
     #	copy to bluearc filesystem for kluster use
     cp -p 11.ooc /cluster/bluearc/scratch/hg/monDom4
 
 #########################################################################
 # GENBANK auto update (DONE - 2006-02-24 - Hiram)
     # align with revised genbank process. drop xeno ESTs.
     ssh hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     cvs update -d -P etc
     #edit etc/genbank.conf to add monDom4, it is a copy of monDom2 with changes:
 
 # monDom4 (M. domestica)  8 chroms plus X and Un
 monDom4.serverGenome = /cluster/data/monDom4/monDom4.2bit
 monDom4.clusterGenome = /cluster/bluearc/scratch/hg/monDom4/monDom4.2bit
 monDom4.ooc = /cluster/data/monDom4/11.ooc
 monDom4.lift = no
 monDom4.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
 monDom4.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
 monDom4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
 monDom4.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
 monDom4.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
 monDom4.downloadDir = monDom4
 monDom4.refseq.mrna.xeno.load  = yes
 monDom4.refseq.mrna.xeno.loadDesc = yes
 monDom4.mgcTables.default = full
 monDom4.mgcTables.mgc = all
 
     #	check that into CVS, then
     # update /cluster/data/genbank/
     make etc-update
 
     ssh kkstore02
     cd /cluster/data/genbank
     nice -n 19 bin/gbAlignStep -initial monDom4 &
     #	var/build/logs/2006.02.24-22:38:50.monDom4.initalign.log
 
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
     nice -n 19 ./bin/gbDbLoadStep -drop -initialLoad  monDom4 &
     #	var/dbload/hgwdev/logs/2006.02.26-09:36:15.dbload.log
 XXXX - running 2006-02-26 09:36
 
 ##########################################################################
 #  BLASTZ SELF  (DONE - 2006-03-02 - 2006-04-19 - Hiram)
 
     ssh kk
     mkdir /cluster/data/monDom4/bed/blastzSelf.2006-03-02
     cd /cluster/data/monDom4/bed
     ln -s blastzSelf.2006-03-02 blastz.monDom4
     cd blastzSelf.2006-03-02
     
     cat << '_EOF_' > DEF
 # monDom4 vs monDom4
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
 
 BLASTZ=blastz.v7
 
 BLASTZ_M=100
 
 # TARGET: Opossum monDom4
 SEQ1_DIR=/iscratch/i/monDom4/monDom4.2bit
 SEQ1_LEN=/iscratch/i/monDom4/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: MonDom2
 SEQ2_DIR=/iscratch/i/monDom4/monDom4.2bit
 SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/monDom4/bed/blastzSelf.2006-03-02
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=10000 -chainLinearGap=medium \
 	`pwd`/DEF > blastz.out 2>&1 &
     #	This was a difficult run, needed to do the last job on Kolossus:
 # Completed: 133955 of 133956 jobs
 # CPU time in finished jobs:   78602533s 1310042.22m 21834.04h  909.75d  2.492 y
 # IO & Wait Time:              47419875s  790331.25m 13172.19h  548.84d  1.504 y
 # Average job time:                 941s      15.68m     0.26h    0.01d
 # Longest finished job:          111202s    1853.37m    30.89h    1.29d
 # Submission to last job:        521430s    8690.50m   144.84h    6.04d
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=10000 -chainLinearGap=medium \
 	-continue=cat `pwd`/DEF > cat.out 2>&1 &
     #	broke during chaining, the processes are too large.  Running
     #	them individually on hgwdev64 and kolossus, they take many days
     #	each, perhaps a week, still trying to get them done after two
     #	weeks of messing with them.
     #	last one took:
     #	real    7343m55.064s
     #	user    2270m9.968s
     #	sys     5072m16.807s
     #	that's 5 days and 2 hours, about 5 weeks for all:
     #	-rw-rw-r--  1  138817724 Mar 10 17:48 monDom4.2bit:chrX:.chain
     #	-rw-rw-r--  1  500083934 Mar 11 15:06 monDom4.2bit:chr7:.chain
     #	-rw-rw-r--  1  681900033 Mar 11 20:27 monDom4.2bit:chr5:.chain
     #	-rw-rw-r--  1  495478058 Mar 11 20:53 monDom4.2bit:chr6:.chain
     #	-rw-rw-r--  1  948724278 Mar 12 12:55 monDom4.2bit:chrUn:.chain
     #	-rw-rw-r--  1  668565678 Mar 12 18:03 monDom4.2bit:chr8:.chain
     #	-rw-rw-r--  1 1214431534 Mar 23 21:39 monDom4.2bit:chr2:.chain
     #	-rw-rw-r--  1 1755465687 Apr  9 19:31 monDom4.2bit:chr4:.chain
     #	-rw-rw-r--  1 1739564535 Apr 10 15:55 monDom4.2bit:chr1:.chain
     #	-rw-rw-r--  1 2041409562 Apr 16 13:58 monDom4.2bit:chr3:.chain
 
     #	loading the tables on kolossus
     ssh kolossus
     cd /cluster/data/monDom4/bed/blastzSelf.2006-03-02/axtChai
     cat << '_EOF_' > loadUp.csh
 #!/bin/csh -fe
 cd /cluster/data/monDom4/bed/blastzSelf.2006-03-02/axtChain/chain
 foreach c (`awk '{print $1;}' /cluster/bluearc/scratch/hg/monDom4/chrom.sizes`)
     set f = $c.chain
     if (! -e $f) then
       echo no chains for $c
       set f = /dev/null
     endif
     hgLoadChain monDom4 ${c}_chainSelf $f
 end
 '_EOF_'
     #	<< emacs
     chmod +x loadUp.csh
     time ./loadUp.csh
     #	real    211m16.852s
     #	request push of *chainSelf* tables from kolossus to hgwdev
     #	almost 32 Gb of data.
 
     ssh kolossus
     cd /cluster/data/monDom4/bed/blastzSelf.2006-03-02
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=10000 -chainLinearGap=medium \
 	-continue=net `pwd`/DEF > net.out 2>&1 &
 
 ############################################################################
 ##  BLASTZ swap from mm8 alignments (DONE - 2006-02-23 - Hiram)
     ssh pk
     cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
         -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
         `pwd`/DEF > swap.out 2>&1 &
 
     time nice -n +19 featureBits monDom4 chainMm8Link
     #   210933035 bases of 3501643220 (6.024%) in intersection
 
 ############################################################################
 ##  BLASTZ CHICKEN galGal2 (DONE - 2006-03-06 - 2006-03-08 - Hiram)
 
     ssh pk
     mkdir /cluster/data/monDom4/bed/blastzGalGal2.2006-03-06
     cd /cluster/data/monDom4/bed
     ln -s blastzGalGal2.2006-03-06 blastz.galGal2
     cd blastzGalGal2.2006-03-06
 
     cat << '_EOF_' > DEF
 # Opossum vs Chicken
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=10000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Opossum monDom4
 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
 SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Chicken galGal2 - single chunk big enough for whole chroms at once
 SEQ2_DIR=/scratch/hg/galGal2/nib
 SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes
 SEQ2_CHUNK=200000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/monDom4/bed/blastzGalGal2.2006-03-06
 TMPDIR=/scratch/tmp
 '_EOF_'
   # << happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	`pwd`/DEF > blastz.out 2>&1 &
     #	real    1399m47.238s
 
     mkdir /cluster/data/galGal2/bed/blastz.monDom4.swap
     cd /cluster/data/galGal2/bed
     ln -s blastz.monDom4.swap blastz.monDom4
     cd blastz.monDom4.swap
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap /cluster/data/monDom4/bed/blastzGalGal2.2006-03-06/DEF \
 	> swap.out 2>&1 &
 
 
     featureBits monDom4 chainGalGal2Link > fb.monDom4.chainGalGal2Link 2>&1
     cat fb.monDom4.chainGalGal2Link
     #	106807090 bases of 3501643220 (3.050%) in intersection
     featureBits galGal2 chainMonDom4Link > fb.galGal2.chainMonDom4Link 2>&1
     cat fb.galGal2.chainMonDom4Link
     #	95852676 bases of 1054197620 (9.092%) in intersection
 
 ############################################################################
 ##  BLASTZ ZEBRAFISH danRer3 (DONE - 2006-03-06 - 2003-03-08 - Hiram)
     ssh pk
     mkdir /cluster/data/monDom4/bed/blastzDanRer3.2006-03-06
     cd /cluster/data/monDom4/bed
     ln -s blastzDanRer3.2006-03-06 blastz.danRer3
     cd blastzDanRer3.2006-03-06
 
     #	doing only the chroms on zebrafish, leaving out the chrUn and chrNA
 
     cat << '_EOF_' > DEF
 # Opossum vs Zebrafish
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Opossum monDom4
 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
 SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Zebrafish (danRer3)
 #  large enough chunk to do complete chroms at once
 SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib
 SEQ2_LEN=/san/sanvol1/scratch/danRer3/chromNib.sizes
 SEQ2_CHUNK=100000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/monDom4/bed/blastzDanRer3.2006-03-06
 TMPDIR=/scratch/tmp
 '_EOF_'
   # << happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	`pwd`/DEF > blastz.out 2>&1 &
     #	common failure for unknown reason:
     #	startStep: 0, at step 5 net to stopStep 9
     #	netChains: looks like previous stage was not successful
     #		(can't find [monDom4.danRer3.]all.chain[.gz]).
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=net `pwd`/DEF > net.out 2>&1 &
 
     mkdir /cluster/data/danRer3/bed/blastz.monDom4.swap
     cd /cluster/data/danRer3/bed
     ln -s blastz.monDom4.swap blastz.monDom4
     cd blastz.monDom4.swap
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap /cluster/data/monDom4/bed/blastzDanRer3.2006-03-06/DEF \
 	> swap.out 2>&1 &
 
     featureBits monDom4 chainDanRer3Link > fb.monDom4.chainDanRer3Link 2>&1
     cat fb.monDom4.chainDanRer3Link
     #	73187496 bases of 3501643220 (2.090%) in intersection
     featureBits danRer3 chainMonDom4Link > fb.danRer3.chainMonDom4Link 2>&1
     cat fb.danRer3.chainMonDom4Link
     #	65225617 bases of 1630323462 (4.001%) in intersection
 
 ##########################################################################
 # MAKE Human Proteins track (WORKING - 2006-03-24 - Hiram)
     #	we need the split500K sequences to do this work, set them up on
     #	the Iservers
     ssh kkstore01
     cd /cluster/data/monDom4/split500K
     rsync -a --progress `pwd`/ kkr1u00:/iscratch/i/monDom4/split500K/
     ssh kkr1u00
     cd /iscratch/i/monDom4/split500K
     #	create individual blast db's for each sequence
     for C in 1 2 3 4 5 6 7 8 Un X
     do
 	echo ${C} working ...
 	cd ${C}
 	for i in *.fa
 	do
 	    /cluster/bluearc/blast229/formatdb -p F -i $i
 	done
 	cd ..
 	echo ${C} done
     done
     find /iscratch/i/monDom4/split500K -name "*.nsq" \
 	| sed "s/\.nsq//" > target.lst        
 
     #	now sync everything to the other Iserver nodes
     for R in 2 3 4 5 6 7 8
     do
 	rsync -a --progress `pwd`/ kkr${R}u00:/iscratch/i/monDom4/split500K/
     done
 
     ssh kkstore01
     mkdir -p /cluster/data/monDom4/bed/tblastn.hg18KG
     cd /cluster/data/monDom4/bed/tblastn.hg18KG
 
     mkdir kgfa
     # calculate number of protein sequence to give a job count of about
     #	500,000
     echo `cat /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | wc -l` \
 	`cat /iscratch/i/monDom4/split500K/target.lst | wc -l` \
 	| awk '{printf "%d/(500000/%d) = %d\n", $1, $2, $1/(500000/$2)}'
     #	36727/(500000/7170) = 526
     #	split the proteins by that number
 
     split -l 526 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg
 
     cd kgfa
     for i in kg??; do pslxToFa $i $i.fa; rm $i; done
     cd ..
     ls -1S kgfa/*.fa > kg.lst
     #	Want output to go to bluearc to preserve sanity of file server
     rm -rf  /cluster/bluearc/monDom4/bed/tblastn.hg18KG/blastOut
     mkdir -p /cluster/bluearc/monDom4/bed/tblastn.hg18KG/blastOut
     ln -s  /cluster/bluearc/monDom4/bed/tblastn.hg18KG/blastOut .
     for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
 
     ssh kk
     cd /cluster/data/monDom4/bed/tblastn.hg18KG
     cat /iscratch/i/monDom4/split500K/*.lft > subChr.lft
 
     cat << '_EOF_' > template
 #LOOP
 blastSome $(path1) $(path2) {check out exists blastOut/$(root2)/q.$(root1).psl}
 #ENDLOOP
 '_EOF_'
     #	<<	happy emacs
 
     #### !NOTE: Next time run these jobs with a better separation
     #	of I/O spaces, they seem to hangup bluearc while they work.
     #	Plus, they are using /tmp instead of /scratch/tmp/
     #	And they are leaving lots of garbage, the $f.3 file.
     #	Have added $f.3 to the listing below to remove it
 
 
     cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/cluster/bluearc/blast229/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
 if /cluster/bin/i386/blastToPsl $f.1 $f.2
 then
         liftUp -nosort -type=".psl" -pslQ -nohead $f.3 /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2
 	liftUp -nosort -type=".psl" -nohead $3.tmp /cluster/data/monDom4/bed/tblastn.hg18KG/subChr.lft carry $f.3
         mv $3.tmp $3
         rm -f $f.1 $f.2 $f.3
         exit 0
     fi
 fi
 rm -f $f.1 $f.2 $f.3 $3.tmp $f.8
 exit 1
 '_EOF_'
     #	<<	happy emacs
     chmod +x blastSome
 
     cp -p /iscratch/i/monDom4/split500K/target.lst .
     gensub2 target.lst kg.lst template jobList
 
     #	You want to try 10 jobs and see if the following chain
     #	pipeline will work in a reasonable time.  If it doesn't the
     #	jobs are too big.  There were some hippos in the simpleChain
     #	step.
 
     para create jobList
     para try
 
     para push
 
 # Completed: 504218 of 504218 jobs
 # CPU time in finished jobs:   28863259s  481054.31m  8018.57h  334.07d  0.915 y
 # IO & Wait Time:               4255045s   70918.42m  1181.96h   49.25d  0.135 y
 # Average job time:                  66s       1.09m     0.02h    0.00d
 # Longest finished job:             443s       7.38m     0.12h    0.01d
 # Submission to last job:        166648s    2777.47m    46.29h    1.93d
 
     #	This simpleChain step has jobs that are too large.  They could
     #	be broken up even more as was done for the hippos that remained,
     #	see below.
     ssh kk
     cd /cluster/data/monDom4/bed/tblastn.hg18KG
     mkdir chainEm
     cd /cluster/data/monDom4/bed/tblastn.hg18KG/blastOut
     find . -name "*.psl" \
 	> /cluster/data/monDom4/bed/tblastn.hg18KG/chainEm/chain.lst
     cd chainEm
     mkdir psl
 
     cat << '_EOF_' > template
 #LOOP
 ./chainSome.csh $(root1) $(file2) {check out line psl/c.$(file2).$(root1).psl}
 #ENDLOOP
 '_EOF_'
     #	<<	happy emacs
 
     #	you will have to build this simpleChain binary in
     #	hg/mouseStuff/simpleChain/
     #	it will not run compiled as an x86_64 binary, only i386
     cat << '_EOF_' > chainSome.csh
 #!/bin/csh -fe
 cat ../blastOut/${1}/q.${2}_*.psl | \
     $HOME/bin/i386/simpleChain \
         -prot -outPsl -maxGap=250000 stdin psl/c.${2}.${1}.psl
 '_EOF_'
     #	<<	happy emacs
     chmod +x chainSome.csh
 
     ls -1dS ../blastOut/kg?? > chain.dirs
 
     gensub2 chain.dirs ../../../chrom.lst template jobList 
     para create jobList
     para push
 # Completed: 574 of 699 jobs
 # CPU time in finished jobs:   17708635s  295143.92m  4919.07h  204.96d  0.562 y
 # IO & Wait Time:               1589482s   26491.37m   441.52h   18.40d  0.050 y
 # Average job time:               33620s     560.34m     9.34h    0.39d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:          222754s    3712.57m    61.88h    2.58d
 # Submission to last job:        673020s   11217.00m   186.95h    7.79d
 
     #	Some of these ended up as hippos after a few days.
     #	Rebuild a new batch for just the hippos
     ssh kk
     cd /cluster/store9/monDom4/bed/tblastn.hg18KG/chainEm
     mkdir runHippos
     mkdir runHippos/psl
     #	from the running hippos, extract the commands:
     para running | grep command: > splitHippos.sh
     #	edit splitHippos.sh to make those lines read:
     #	./mkHippoList.csh kgaa chr1
     #	where mkHippoList.csh is:
     #	#!/bin/csh -fe
     #	ls ../blastOut/${1}/q.${2}_*.psl
     #	run that:
     ./splitHippos.sh > hippo.list
     #	split by chrom
     grep chr1_ hippo.list > chr1.list
     grep chr2_ hippo.list > chr2.list
     grep chr3_ hippo.list > chr3.list
     grep chr4_ hippo.list > chr4.list
     #	these were the only chroms in the list, then
 
     #	The sequence below does not produce the correct result.
     #	Instead, break up all protein results for each chrom to
     #	individual files.
     ssh hgwdev
     cd /cluster/store9/monDom4/bed/tblastn.hg18KG/chainEm
     #	Do these one at a time since they create huge amounts of data in
     #	/scratch/tmp  - after each one has been sent to kkr1u00, the
     #	data here on hgwdev can be deleted
     cat chr1.list | xargs cat > /scratch/tmp/chr1.psl
     cd /scratch/tmp
     cat << '_EOF_' > splitProteins.pl
 #!/usr/bin/env perl
 
 use strict;
 use warnings;
 
 sub usage() {
     print "usage: sort -k10 chrN.psl | ./splitProteins.pl <chrN>\n";
     print "  where chrN is : chr1, chr2, chr3, etc...\n";
     print "  one at a time please.\n";
     exit 255;
 }
 my $argc = scalar(@ARGV);
 
 if ($argc != 1) { usage; }
 
 my $chr = shift;
 
 print "mkdir -p $chr", "\n";
 print `mkdir -p $chr`, "\n";
 
 my $curProtein="";
 my $linesRead = 0;
 
 while (my $line=<STDIN>) {
     ++$linesRead;
 my ($col1, $col2, $col3, $col4, $col5, $col6, $col7, $col8, $col9, $col10, $rest) = split('\t',$line);
     if ($curProtein ne $col10) {
         if (length($curProtein) > 0) {
         close(FH);
         }
         open(FH,">$chr/$col10.psl") or die "can not open $chr/$col10.psl";
         $curProtein = $col10;
     }
     print FH $line;
     if (0 == ($linesRead % 1000)) {print ".";}
     if (0 == ($linesRead % 70000)) {print " $linesRead \n";}
 }
 close(FH);
 '_EOF_'
     #	<< happy emacs
     chmod +x ./splitProteins.pl
 
     sort -k10 chr1.psl | ./splitProteins.pl chr1
     rsync -a --progress ./chr1/ kkr1u00:/iscratch/i/monDom4/simpleChain/chr1/
     
     #	After the four chroms are there, sync to all I servers
     ssh kkr1u00
     cd /iscratch/i/monDom4/simpleChain
     for R in 2 3 4 5 6 7 8
 do
     rsync -a --progress --stats `pwd`/ \
 	kkr${R}u00:/iscratch/i/monDom4/simpleChain/
 done
     #
     #	while we are here, make up the jobList
     cat << '_EOF_' > mkJobList
 #!/bin/sh
 
 for C in 1 2 3 4
 do
     M=0
     N="00"
     find ./chr${C} -type f | while read F
     do
         BN=`basename ${F}`
         echo "./oneChained.csh chr${C} ${BN} ${N} (check line+ psl/chr${C}/${N}/${BN})"
         M=`echo $M | awk '{printf "%d", $1+1}'`
         if [ "${M}" -gt 999 ]; then
             N=`echo ${N} | awk '{printf "%02d", $1+1}'`
             M=0
         fi
     done
 done
 '_EOF_'
     #	<< happy emacs
     chmod +x mkJobList
     ./mkJobList > jobList
     mkdir /cluster/data/monDom4/bed/tblastn.hg18KG/hippoRun
     cp jobList /cluster/data/monDom4/bed/tblastn.hg18KG/hippoRun
 
     ssh kk
     cd /cluster/data/monDom4/bed/tblastn.hg18KG/hippoRun
     mkdir psl
     cat << '_EOF_' > oneChained.csh
 #!/bin/csh -fe
 mkdir -p psl/${1}/${3}
 cat /iscratch/i/monDom4/simpleChain/${1}/${2} | \
     $HOME/bin/i386/simpleChain \
         -prot -outPsl -maxGap=250000 stdin psl/${1}/${3}/${2}
 '_EOF_'
     #	<< happy emacs
 
     #	XXXX this sequence did not work right, it breaks up individual
     #	XXXX protein results
     #	these were the only chroms in the list, then
     mkdir chr1 chr2 chr3 chr4
     cd chr1
     split -a 3 -l 100 ../chr1.list chr1
     cd ../chr2
     split -a 3 -l 100 ../chr2.list chr2
     cd ../chr3
     split -a 3 -l 100 ../chr3.list chr3
     cd ../chr4
     split -a 3 -l 100 ../chr4.list chr4
     cd ../runHippos
     cat << '_EOF_' > mkJobList.sh
 #!/bin/sh
 
 for C in 1 2 3 4
 do
     for F in `(cd ../chr${C}; ls chr${C}*)`
     do
     echo \
 "./chainHippos.csh ${F} chr${C} {check out line psl/c.chr${C}.${F}.psl}"
     done
 done
 '_EOF_'
     #	<< happy emacs
     chmod +x ./mkJobList.sh
     ./mkJobList.sh > jobList
 
 
     cat << '_EOF_' > chainHippos.csh
 #!/bin/csh -fe
 sed -e "s/^/..\//" ../${2}/${1} | xargs cat | \
     $HOME/bin/i386/simpleChain \
         -prot -outPsl -maxGap=250000 stdin psl/c.${2}.${1}.psl
 '_EOF_'
     #	<< happy emacs
     chmod +x chainHippos.sh
 
     para create jobList
     para try ... check ... etc ....
 
     ######  Continuing with the concatenation of all the results
     ssh kkstore04
     #	running out of disk space on store9, create a parallel hierarchy
     #	on a different disk still on this server:
     mkdir -p /cluster/store8/monDom4/bed/tblastn.hg18KG/sortingResults
     cd /cluster/data/monDom4/bed/tblastn.hg18KG
     ln -s /cluster/store8/monDom4/bed/tblastn.hg18KG/sortingResults \
     	sortingResults
     #	first set of results was in blastOut
     cd blastOut
     time for i in kg??
     do 
 	find ./$i -name "q.*.psl" | xargs cat \
 		| awk '($13 - $12)/$11 > 0.6 {print}' \
     > /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.$i.psl
 	sort -rn \
 /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.$i.psl \
 	    | pslUniq stdin \
 /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/u.$i.psl
 	awk '(($1 / $11) ) > 0.60 {print}' \
 /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.$i.psl \
 	   > /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/m60.$i.psl
 	echo $i
     done
     #	real    199m44.441s
 
     #	And then, adding the hippo results to those:
     cd ../hippoRun/psl
     time for C in 1 2 3 4
     do
 	cd chr${C}
 	for D in *
 	do
 	    find ./${D} -name "*.psl" | xargs cat \
 		| awk '($13 - $12)/$11 > 0.6 {print}' \
 > /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.kg0${C}_${D}.psl
 	    sort -rn \
 /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.kg0${C}_${D}.psl \
 	    | pslUniq stdin \
 /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/u.kg0${C}_${D}.psl
 	awk '(($1 / $11) ) > 0.60 {print}' \
 /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.kg0${C}_${D}.psl \
 > /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/m60.kg0${C}_${D}.psl
 	echo chr${C}/${D}
 	done
 	cd ..
     done
     #	real    14m49.721s
 
 
     cd ../../sortingResults
     sort -u -k 14,14 -k 16,16n -k 18,18n u.*.psl m60* \
 	> /cluster/data/monDom4/bed/tblastn.hg18KG/blastHg18KG.psl
     cd ..
     wc -l blastHg18KG.psl
     # 63264 blastHg18KG.psl
 
     #	what is this for ?
     cat kgfa/*fa | grep ">" | wc
     # 309494  309494 4810327
 
     ssh hgwdev
     cd /cluster/data/monDom4/bed/tblastn.hg18KG
     hgLoadPsl monDom4 blastHg18KG.psl
     time nice -n +19 featureBits monDom4 blastHg18KG
     #	19424941 bases of 3501643220 (0.555%) in intersection
 
     # back to kksilo
     rm -rf blastOut
 
 # End tblastn
 
 ############################################################################
 # SWAP CHAINS/NET RN4 (DONE 3/28/06 angie)
     ssh kkstore01
     mkdir /cluster/data/monDom4/bed/blastz.rn4.swap
     cd /cluster/data/monDom4/bed/blastz.rn4.swap
     doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.monDom4/DEF \
       -workhorse kkr8u00 >& do.log & tail -f do.log
     ln -s blastz.rn4.swap /cluster/data/monDom4/bed/blastz.rn4
     nice featureBits monDom4 chainRn4Link 
     #	198094619 bases of 3501643220 (5.657%) in intersection
 
 
 #######################################################################
 #  CHIMP BLASTZ - (DONE - 2006-03-29 - 2006-04-10 - Hiram)
     ssh pk
     mkdir /cluster/data/monDom4/bed/blastzPanTro2.2006-03-29
     cd /cluster/data/monDom4/bed
     ln -s blastzPanTro2.2006-03-29 blastz.panTro2
     cd blastzPanTro2.2006-03-29
 
     cat << '_EOF_' > DEF
 # opossum vs. chimp
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 
 # settings for more distant organism alignments
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=10000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET: Opossum monDom4
 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
 SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Chimp PanTro2
 SEQ2_DIR=/scratch/hg/panTro2/nib
 SEQ2_LEN=/scratch/hg/panTro2/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/monDom4/bed/blastzPanTro2.2006-03-29
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	`pwd`/DEF > blastz.out 2>&1 &
     # XXXX	running 2006-03-29
 
     # 2 chroms crashed out-of-mem, so run manually on kolossus
     ssh kolossus
     cd /cluster/data/monDom4/bed/blastz.panTro2/axtChain/run
     (chain.csh monDom4.2bit:chr4: chain/monDom4.2bit:chr4:.chain; \
     chain.csh monDom4.2bit:chr3: chain/monDom4.2bit:chr3:.chain) > chain.log &
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
         -continue=chainMerge \
 	`pwd`/DEF > blastz.chainMerge.out 2>&1 &
 
     # failed during load due to mysql restart
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
         -continue=load \
 	`pwd`/DEF > blastz.load.out 2>&1 &
 
     # SWAP ALIGNMENTS
     /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -swap \
         -workhorse=kolossus \
         `pwd`/DEF > swap.log 2>&1 &
 
     # failed on file perm change on liftOver file owned by Hiram
     rm /cluster/data/monDom4/bed/liftOver/monDom4ToPanTro2.over.chain.gz
     rm /cluster/data/panTro2/bed/blastz.monDom4.swap/axtChain/noClass.net
     /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -swap \
         -workhorse=kolossus \
         -continue=net \
         `pwd`/DEF > swap.net.log 2>&1 &
 
 
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap `pwd`/DEF > swap.out 2>&1 &
     # XXXX	running 2006-04-04
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=net -swap `pwd`/DEF > swap.net.out 2>&1 &
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=load -swap `pwd`/DEF > swap.load.out 2>&1 &
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=cleanup -swap `pwd`/DEF > swap.cleanup.out 2>&1 &
 
     time nice -n +19 featureBits panTro2 chainMonDom4Link \
 	> fb.panTro2.chainMonDom4Link 2>&1 &
 
 
 # BLASTZ/CHAIN/NET XENTRO2 (DONE 4/22/06 angie)
     ssh kkstore04
     mkdir /cluster/data/monDom4/bed/blastz.xenTro2.2006-04-20
     cd /cluster/data/monDom4/bed/blastz.xenTro2.2006-04-20
     cat << '_EOF_' > DEF
 # opossum vs. frog
 BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
 
 # Use same params as used for mammal-xenTro1 (see makeXenTro1.doc)
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=8000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Opossum monDom4
 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
 SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Frog xenTro2 - single chunk big enough to run two of the
 #               largest scaffolds in one job
 SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
 SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_LIMIT=100
 
 BASE=/cluster/data/monDom4/bed/blastz.xenTro2.2006-04-20
 '_EOF_'
     # << emacs
     doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/monDom4XenTro2 \
       -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \
       >& do.log & tail -f do.log
     ln -s blastz.xenTro2.2006-04-20 /cluster/data/monDom4/bed/blastz.xenTro2
 
     ssh kolossus
     time nice -n +19 featureBits monDom4 chainXenTro2Link \
 	> fb.monDom4.chainXenTro2Link 2>&1 &
     cat fb.monDom4.chainXenTro2Link
     #	8160051 bases of 3501643220 (2.232%) in intersection
 
 ###########################################################################
 # CREATE OPOSSUM AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS DIRECTORY
 # AND NIB DIRECTORY ON THE SAN FOR BLASTZ CLUSTER RUNS 
 # (DONE, 2006-04-26, hartera)
     # If there are no lineage-specific repeats for opossum and other species
     # then use all repeats.
     ssh pk
     mkdir -p /san/sanvol1/scratch/monDom4/linSpecRep.notInOthers
     foreach f (/cluster/data/monDom4/*/chr*.fa.out)
      cp -p $f \
         /san/sanvol1/scratch/monDom4/linSpecRep.notInOthers/$f:t:r:r.out.spec
     end
     # need nib files when running Blastz with lineage-specific repeats
     # so create these and move to the san.
     ssh kkstore04
     mkdir -p /cluster/data/monDom4/nib
     cd /cluster/data/monDom4
     foreach f (*/chr*.fa)
       echo "Processing $f ..."
       nice faToNib -softMask $f nib/$f:t:r.nib
     end
     # move nibs to the san as taking up a lot of space on /cluster/data/ 
     ssh pk
     mv /cluster/data/monDom4/nib /san/sanvol1/scratch/monDom4/
 
 ##########################################################################
 # QUALITY (DONE - 2006-04-27 - 2006-04-28 - Hiram)
     ssh kkstore04
     #	this is actually a symlink to store8 to distribute the data on
     #	the full filesystems on kkstore04
     mkdir /cluster/data/monDom4/bed/quality
     cd /cluster/data/monDom4/bed/quality
     cp -p /cluster/data/monDom2/bed/quality/qaToWigEncode.pl .
     #	using perl script from previous build
     #	Update the file name in this file:
 my $AGP_FILE="/cluster/data/monDom4/broad.mit.edu/Monodelphis4.0.agp";
 
     #	convert the qual.fa to wiggle single column input and then wigEncode:
 	time nice -n +19 \
 zcat /cluster/data/monDom4/broad.mit.edu/Monodelphis4.0.agp.chromosome.qual.gz \
 	| ./qaToWigEncode.pl /dev/stdin \
 	| $HOME/bin/$MACHTYPE/wigEncode -noOverlap \
 		stdin quality.wig quality.wib
 
     #	Converted stdin, upper limit 63.00, lower limit 0.00
     #	real    96m57.628s
 
     #	Resulting files:
     ls -og quality.wi?
     #	-rw-rw-r--  1 3501643220 Apr 27 18:16 quality.wib
     #	-rw-rw-r--  1  322734978 Apr 27 18:16 quality.wig
     #	Note how the byte size of the .wib file is exactly equal
     #	to the number of bases in a featureBits run without -countGaps
     #	from the rmsk featureBits above:
     #	1814099592 bases of 3501643220 (51.807%) in intersection
     #	This means we have a valid quality score for every base that is
     #	not in a gap, which is what we want.  If this doesn't work out,
     #	you can turn this wiggle track into a bed file for each
     #	contiguous run of data, and then intersect those bed files with
     #	the gap track to see where they overlap.  This happened this
     #	time when a new type of gap "clone" was found to be defined in
     #	the agp file.  Previously there was only "fragment".  The perl
     #	script had to be altered to recognize this new type of gap.
     #	For example, this could be done after the table was loaded:
     ssh kolossus
     cd /tmp
     awk '{print $1}' /cluster/data/monDom4/chrom.sizes | while read C
     do
 	hgWiggle -doBed -db=monDom4 -chr=${C} quality > ${C}.qual.bed
     done
     #	then to intersect:
     for C in 1 2 3 4 5 6 7 8 X Un
     do
 	featureBits -chrom=chr${C} monDom4 gap chr${C}.qual.bed
     done
     #	should all be zero
 
     ssh hgwdev
     cd /cluster/data/monDom4/bed/quality
     ln -s `pwd`/quality.wib /gbdb/monDom4/wib
     time nice -n +19 hgLoadWiggle monDom4 quality quality.wig
     #	real    3m29.459s
     #	verify index:
     hgsql -e "show index from quality;" monDom4
     #	check that Cardinality is a number and is not NULL
 
 
 ####################################################################
 ### swap Hg18 blastz (DONE - 2006-04-28 - Hiram)
 ###
     ssh pk
     mkdir /cluster/data/monDom4/bed/blastz.hg18.swap
     cd /cluster/data/monDom4/bed
     ln -s blastz.hg18.swap blastz.hg18
     cd blastz.hg18.swap
     #	The original DEF file had iscratch directories for the monDom4
     #	sequence, for this one we need san directories.  This is a copy
     #	of the original one with the SEQ2_DIR and _LEN changed:
     cat << '_EOF_' > DEF
 # human vs. opossum
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
 
 BLASTZ=blastz.v7
 
 # settings for more distant organism alignments
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=10000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET: Human (hg18)
 SEQ1_DIR=/scratch/hg/hg18/nib
 SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Opossum monDom4
 SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
 SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/hg18/bed/blastzMonDom4.2006-02-13
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     cd /cluster/data/monDom4/bed/blastz.hg18.swap
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap DEF > swap.out 2>&1 &
     #	real    118m44.093s
 
     ssh kolossus
     cd /cluster/data/monDom4/bed/blastz.hg18.swap
     time nice -n +19 featureBits monDom4 chainHg18Link \
 	> fb.monDom4.chainHg18Link 2>&1
     #	351100848 bases of 3501643220 (10.027%) in intersection
 
 #######################################################################
 ### 7-Way conservation (DONE - 2006-04-28 - 2006-05-02 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/monDom4/bed/multiz7way
     cd /cluster/data/monDom4/bed/multiz7way
     cp /cluster/data/mm8/bed/multiz17way/17way.nh .
 
     /cluster/bin/phast/tree_doctor --prune \
 	chimp_panTro2,macaque_rheMac2,rabbit_oryCun1,cow_bosTau2,dog_canFam2,armadillo_dasNov1,elephant_loxAfr1,tenrec_echTel1,tetraodon_tetNig1,fugu_fr1 \
 	17way.nh > 7way.nh
 
     #	this leaves the tree we are working with here:
     (((((human_hg18:0.054922,
 	(rat_rn4:0.081728,mouse_mm8:0.077017):0.335773):0.285925,
 	monodelphis_monDom4:0.371073):0.189124,
 	chicken_galGal2:0.454691):0.123297,
 	xenopus_xenTro2:0.782453):0.156067,
 	zebrafish_danRer3:0.938628);
 
     /cluster/bin/phast/draw_tree 7way.nh > 7way.ps
     /cluster/bin/phast/all_dists 7way.nh > 7way.distances.txt
     grep -y monDom4 7way.distances.txt | sort -k3,3n
 
     #	Print out that file for reference, and use the calculated
     #	distances in the table below to order the organisms and check
     #	the button order on the browser.  Zebrafish ends up before
     #	tetraodon and fugu on the browser despite its distance.
     #	And if you can fill in the table below entirely, you have
     #	succeeded in finishing all the alignments required.
     #
 #                         featureBits chainLink measures
 #                                      chainMonDom4Link      chain   linearGap
 #     distance                      on MonDom4    on other   minScore
 #  1  0.711920 - human hg18         (% 10.027)  (% 12.385)   5000     loose
 #  2  1.014888 - chicken galGal2    (% 03.050)  (% 09.092)   5000     loose
 #  3  1.069788 - mouse mm8          (% 06.024)  (% 08.245)   5000     loose
 #  4  1.074499 - rat rn4            (% 05.657)  (% 07.784)   5000     loose
 #  5  1.465947 - frog xenTro2       (% 02.232)  (no swap )   5000     loose
 #  6  1.778189 - zebrafish danRer3  (% 02.090)  (% 04.001)   5000     loose
 
     cd /cluster/data/monDom4/bed/multiz7way
     #	bash shell syntax here ...
     export H=/cluster/data/monDom4/bed
     mkdir mafLinks
     for G in hg18 galGal2 mm8 rn4 xenTro2 danRer3
     do
 	mkdir mafLinks/$G
 	if [ ! -d ${H}/blastz.${G}/mafNet ]; then
 	echo "missing directory blastz.${G}/mafNet"
 	fi
 	ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
     done
 
     #	Usually these MAFs are copied to the san for a pk kluster run.
     #	in this case they are small enough that there is no need to copy
     #	them anywhere.  The alignment can be done right here on the
     #	kkstore04 filesystem.
     #	Copy MAFs to some appropriate NFS server for kluster run
     # ssh kkstore04
     # mkdir /san/sanvol1/scratch/monDom4/multiz7way
     # cd /san/sanvol1/scratch/monDom4/multiz7way
     # time rsync -a --copy-links --progress \
     #	    /cluster/data/monDom4/bed/multiz7way/mafLinks/ .
 
     #	We have about 734 Mb of data here, takes ~ 30 seconds to copy
 
     # the autoMultiz cluster run
     ssh kk
 
     mkdir /cluster/data/monDom4/bed/multiz7way/autoMultiz
     cd /cluster/data/monDom4/bed/multiz7way/autoMultiz
     
     mkdir penn
     #	cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
     #	cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
     #	cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
     cp -p /cluster/bin/penn/multiz.v11.i386/multiz-tba/multiz penn
     cp -p /cluster/bin/penn/multiz.v11.i386/multiz-tba/maf_project penn
     cp -p /cluster/bin/penn/multiz.v11.i386/multiz-tba/autoMZ penn
 
 
     # create species list and stripped down tree for autoMZ
     sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
 	../7way.nh > tmp.nh
     echo `cat tmp.nh` > tree-commas.nh
     echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
     sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
 
     mkdir -p maf run
     cd run
 
     #	NOTE: you need to set the db properly in this script
     #	And the binDir and pairs dir
 
     cat > autoMultiz << '_EOF_'
 #!/bin/csh -ef
 set db = monDom4
 set c = $1
 set maf = $2
 set binDir = /cluster/data/monDom4/bed/multiz7way/autoMultiz/penn
 set tmp = /scratch/tmp/$db/multiz.$c
 set pairs = /cluster/data/monDom4/bed/multiz7way/mafLinks
 rm -fr $tmp
 mkdir -p $tmp
 cp ../{tree.nh,species.lst} $tmp
 pushd $tmp
 foreach s (`cat species.lst`)
     set in = $pairs/$s/$c.maf
     set out = $db.$s.sing.maf
     if ($s == $db) then
 	continue
     endif
     if (-e $in.gz) then
 	zcat $in.gz > $out
     else if (-e $in) then
 	cp $in $out
     else
 	echo "##maf version=1 scoring=autoMZ" > $out
     endif
 end
 set path = ($binDir $path); rehash
 $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
 popd
 cp $tmp/$c.maf $maf
 rm -fr $tmp
 '_EOF_'
 # << happy emacs
     chmod +x autoMultiz
 
 cat  << '_EOF_' > template
 #LOOP
 autoMultiz $(root1) {check out line+ ../maf/$(root1).maf}
 #ENDLOOP
 '_EOF_'
 # << happy emacs
 
     awk '{print $1}' /cluster/data/monDom4/chrom.sizes > chrom.lst
     gensub2 chrom.lst single template jobList
     para create jobList
     # 10 jobs
     para try ... check ... push ... etc ...
 # Completed: 10 of 10 jobs
 # CPU time in finished jobs:      38783s     646.38m    10.77h    0.45d  0.001 y
 # IO & Wait Time:                  1184s      19.73m     0.33h    0.01d  0.000 y
 # Average job time:                3997s      66.61m     1.11h    0.05d
 # Longest finished job:            8706s     145.10m     2.42h    0.10d
 # Submission to last job:          8706s     145.10m     2.42h    0.10d
 
     #	combine results into a single file for loading and gbdb reference
     ssh kkstore04
     cd /cluster/data/monDom4/bed/multiz7way
     #	There used to be a mafFilter here with a minScore of 500, but it
     #	turns out that the scores in these maf files are pretty much
     #	useless.  They range from very large negatives to very large
     #	positives.
     time catDir maf > multiz7way.maf
     #	real    1m18.849s
     #	makes a 2 Gb file:
     #	-rw-rw-r--  1 2119554905 May  1 14:23 multiz7way.maf
 
     #	Create per-chrom individual maf files for downloads
     ssh kkstore04
     cd /cluster/data/monDom4/bed/multiz7way
     mkdir mafDownloads
     time for M in maf/chr*.maf
     do
 	B=`basename $M`
 	cp -p ${M} mafDownloads/${B}
 	gzip mafDownloads/${B}
 	echo ${B} done
     done
     #	real    7m21.713s
     #	deliver to downloads
     ssh hgwdev
     ln -s /cluster/data/monDom4/bed/multiz7way/mafDownloads \
 	/usr/local/apache/htdocs/goldenPath/monDom4/multiz7way
 
     # Load into database
     ssh hgwdev
     cd /cluster/data/monDom4/bed/multiz7way
     mkdir /gbdb/monDom4/multiz7way
     ln -s /cluster/data/monDom4/bed/multiz7way/multiz7way.maf \
 	/gbdb/monDom4/multiz7way
     time nice -n +19 hgLoadMaf monDom4 multiz7way
     #	Loaded 2180653 mafs in 1 files from /gbdb/monDom4/multiz7way
     #	real    3m35.280s
 
     time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
 	-maxSize=50000 monDom4 multiz7waySummary multiz7way.maf
     #	Created 1423996 summary blocks from 5463968 components and
     #	2180653 mafs from multiz7way.maf
     #	real    5m40.531s
 
     # create tree image:
     cat << '_EOF_' > species.nh
 (((((human,(rat,mouse)),opossum),chicken),frog),zebrafish)
 '_EOF_'
     /cluster/bin/phast/draw_tree -b -s species.nh > species7.ps
     # photoshop to enhance, reduce the amount of whitespace to make it
     # smaller, then save as jpg
     ln -s /cluster/data/monDom4/bed/multiz7way/species7.jpg \
 	/usr/local/apache/htdocs/images/phylo/monDom4_7way.jpg
     #	the chopping of whitespace can be done with Imagemagick
     #	'display' - use the "transform -> chop" menu, select horizontal
     #	or vertical chop, highlight an area to cut
 
 ############################################################################
 # CREATE CONSERVATION WIGGLE WITH PHASTCONS
 #		(DONE - 2006-05-01 - 2006-05-02 - Hiram)
 
     # Estimate phastCons parameters
     ssh kkstore04
     mkdir /cluster/data/monDom4/bed/multiz7way/cons
     cd /cluster/data/monDom4/bed/multiz7way/cons
 
     # Create a starting-tree.mod based on chr1 (the largest one)
     time /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr1.maf \
 	--refseq ../../../1/chr1.fa --in-format MAF \
 	--windows 100000000,1000 --out-format SS \
 	--between-blocks 5000 --out-root s1
     #	real    5m50.239s
 
     /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
 --tree "(((((hg18,(rn4,mm8)),monDom4),galGal2),xenTro2),danRer3)" \
     --out-root starting-tree
     #	real    1m11.568s
 
     rm s1.*.ss
     # add up the C and G:
     grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
     #	0.398
     #	This 0.398 is used in the --gc argument below
 
     # Create big bad bloated SS files on san filesystem (takes ~ 2h 20m)
     #	Increasing their size this time from 1,000,000 to 10,000,000 to
     #	slow down the phastCons pk jobs
     ssh kkstore04
     mkdir -p  /san/sanvol1/scratch/monDom4/cons/ss
     cd  /san/sanvol1/scratch/monDom4/cons/ss
     time for C in `awk '{print $1}' /cluster/data/monDom4/chrom.sizes`
     do
       if [ -s /cluster/data/monDom4/bed/multiz7way/maf/${C}.maf ]; then
 	mkdir ${C}
 	echo msa_split $C
 	chrN=${C/chr/}
 	chrN=${chrN/_random/}
 	/cluster/bin/phast/$MACHTYPE/msa_split \
 	    /cluster/data/monDom4/bed/multiz7way/maf/${C}.maf \
 	    --refseq /cluster/data/monDom4/${chrN}/${C}.fa \
 	    --in-format MAF --windows 10000000,0 --between-blocks 5000 \
 	    --out-format SS --out-root ${C}/${C}
       fi
     done &
     #	real    28m44.827s
 
     # Create a random list of 50 1 mb regions  (do not use the _randoms)
     cd /san/sanvol1/scratch/monDom4/cons/ss
     ls -1l chr*/chr*.ss | grep -v random | \
 	awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
 
     # Set up parasol directory to calculate trees on these 50 regions
     ssh kk
     mkdir /san/sanvol1/scratch/monDom4/cons/treeRun1
     cd /san/sanvol1/scratch/monDom4/cons/treeRun1
     mkdir tree log
 
     #	Tuning this loop should come back to here to recalculate 
     # Create little script that calls phastCons with right arguments
     #	--target-coverage of 0.20 is about right for mouse, will be
     #	tuned exactly below
     cat > makeTree.csh << '_EOF_'
 #!/bin/csh -fe
 set C=$1:h
 mkdir -p log/${C} tree/${C}
     /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
       /cluster/data/monDom4/bed/multiz7way/cons/starting-tree.mod \
       --gc 0.398 --nrates 1,1 --no-post-probs --ignore-missing \
       --expected-lengths 12 --target-coverage 0.17 \
       --quiet --log log/$1 --estimate-trees tree/$1
 '_EOF_'
 #	<< happy emacs
       chmod a+x makeTree.csh
 
 # Create gensub file
       cat > template << '_EOF_'
 #LOOP
 makeTree.csh $(path1)
 #ENDLOOP
 '_EOF_'
 #	<< happy emacs
 
 # Make cluster job and run it
     gensub2 ../randomSs.list single template jobList
     para create jobList
     para try/push/check/etc
 # Completed: 50 of 50 jobs
 # CPU time in finished jobs:      38161s     636.02m    10.60h    0.44d  0.001 y
 # IO & Wait Time:                   385s       6.41m     0.11h    0.00d  0.000 y
 # Average job time:                 771s      12.85m     0.21h    0.01d
 # Longest finished job:            1672s      27.87m     0.46h    0.02d
 # Submission to last job:          1693s      28.22m     0.47h    0.02d
 
 
 # Now combine parameter estimates.  We can average the .mod files
 # using phyloBoot.  This must be done separately for the conserved
 # and nonconserved models
     ssh kkstore04
     cd /san/sanvol1/scratch/monDom4/cons/treeRun1
     ls -1 tree/chr*/*.cons.mod > cons.list
     /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
 	--output-average ../ave.cons.mod > cons_summary.txt
     ls -1 tree/chr*/*.noncons.mod > noncons.list
     /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
 	--output-average ../ave.noncons.mod > noncons_summary.txt
     cd ..
     cp -p ave.*.mod /cluster/data/monDom4/bed/multiz7way/cons
 
 #	measuring entropy
 #	consEntopy <target coverage> <expected lengths>
 #		 ave.cons.mod ave.noncons.mod --NH 9.78
 #	never stops with the --NH argument
 /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 --NH 9.78 \
 	ave.cons.mod ave.noncons.mod
 # ( Solving for new omega: 12.000000 9.952747 9.669713 9.663148 )
 # Trans parameters: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
 # Relative entropy: H=1.090778 bits/site
 # Expected min. length: L_min=9.400348 sites
 # Expected max. length: L_max=6.597224 sites
 # Phylogenetic information threshold: PIT=L_min*H=10.253692 bits
 # Recommended expected length: omega=9.663148 sites (for L_min*H=9.780000)
 XXXX - working 2006-05-01 17:00
 
 
 ### !!! ***  This one with .17 and 12 is the one that was finally used
 # consEntropy .17 12 ave.cons.mod.4 ave.noncons.mod.4
 # Transition params: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
 # Relative entropy: H=1.478838 bits/site
 # Required length: N=6.753382 sites
 # Total entropy: NH=9.987159 bits
 
 #	SKIP to here passing by the tuning numbers
 ssh kk
 # Create cluster dir to do main phastCons run
 mkdir /san/sanvol1/scratch/monDom4/cons/consRun1
 cd /san/sanvol1/scratch/monDom4/cons
 cp /san/sanvol1/scratch/mm8/cons/elliotsEncode.mod .
 #	This file looks like:
 ALPHABET: A C G T
 ORDER: 0
 SUBST_MOD: REV
 TRAINING_LNL: -988246.132962
 BACKGROUND: 0.295 0.205 0.205 0.295
 RATE_MAT:
 -1.165221    0.315494    0.589884    0.259843
    0.189778   -0.878194    0.208718    0.479698
    0.444622    0.261535   -0.885604    0.179447
    0.234867    0.720815    0.215191   -1.170872
 TREE: (((((((((((((hg18:0.006690,panTro2:0.007571):0.024272,(colobus_monkey:0.015404,(baboon:0.008258,rheMac2:0.028617):0.008519):0.022120):0.023960,(dusky_titi:0.025662,(owl_monkey:0.012151,marmoset:0.029549):0.008236):0.027158):0.066101,(mouse_lemur:0.059024,galago:0.121375):0.032386):0.017073,((rn4:0.081728,monDom4:0.077017):0.229273,oryCun1:0.206767):0.023340):0.023026,(((bosTau2:0.159182,canFam2:0.147731):0.004946,rfbat:0.138877):0.010150,(hedgehog:0.193396,shrew:0.261724):0.054246):0.024354):0.028505,dasNov1:0.149862):0.015994,(loxAfr1:0.104891,echTel1:0.259797):0.040371):0.218400,monDom4:0.371073):0.065268,platypus:0.468116):0.123856,galGal2:0.454691):0.123297,xenTro2:0.782453):0.156067,((tetNig1:0.199381,fr1:0.239894):0.492961,danRer3:0.782561):0.156067);
 
 
     cd /san/sanvol1/scratch/monDom4/cons/consRun1
     mkdir ppRaw bed
 
     # Create script to run phastCons with right parameters
     #	These parameters:
     #	--rho 0.28 --expected-length 14 --target-coverage 0.008 --quiet \
     #	were taken from Kate's 17-way in Hg17, including the
     #	-not-informative panTro2
     #	This job is I/O intensive in its output files, thus it is all
     #	working over in /scratch/tmp/
     cat > doPhast << '_EOF_'
 #!/bin/csh -fe
 mkdir /scratch/tmp/${2}
 cp -p ../ss/${1}/${2}.ss ../elliotsEncode.mod /scratch/tmp/${2}
 pushd /scratch/tmp/${2} > /dev/null
 /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss elliotsEncode.mod \
    --expected-lengths 12 --target-coverage 0.17 --quiet \
 	--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score \
 	    --require-informative 0 > ${2}.pp
 popd > /dev/null
 mkdir -p ppRaw/${1}
 mkdir -p bed/${1}
 mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
 mv /scratch/tmp/${2}/${2}.bed bed/${1}
 rm /scratch/tmp/${2}/elliotsEncode.mod
 rm /scratch/tmp/${2}/${2}.ss
 rmdir /scratch/tmp/${2}
 '_EOF_'
     # << happy emacs
     chmod a+x doPhast
 
     #	root1 == chrom name, file1 == ss file name without .ss suffix
     # Create gsub file
     cat > template << '_EOF_'
 #LOOP
 doPhast $(root1) $(file1)
 #ENDLOOP
 '_EOF_'
     #	<< happy emacs
 
     # Create parasol batch and run it
     ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
 
     gensub2 in.list single template jobList
     para create jobList
     para try/check/push/etc.
     #	These jobs are very fast and very I/O intensive, even on the san
     #	they will hang it up as they work at full tilt.
 # Completed: 366 of 366 jobs
 # CPU time in finished jobs:      29300s     488.33m     8.14h    0.34d  0.001 y
 # IO & Wait Time:                 19643s     327.39m     5.46h    0.23d  0.001 y
 # Average job time:                 134s       2.23m     0.04h    0.00d
 # Longest finished job:             247s       4.12m     0.07h    0.00d
 # Submission to last job:          1880s      31.33m     0.52h    0.02d
 
     # combine predictions and transform scores to be in 0-1000 interval
     #	it uses a lot of memory, so on kolossus:
     ssh kolossus
     cd /san/sanvol1/scratch/monDom4/cons/consRun1
     #	The sed's and the sort get the file names in chrom,start order
     find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
 	| sort -k7,7 -k9,9n \
 	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
 	| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
 	| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
     #	~ 1 minute
     cp -p mostConserved.bed /cluster/data/monDom4/bed/multiz7way
 
     # Figure out how much is actually covered by the bed files as so:
     #	The 2583393846 comes from the non-n genome size,
     #	from faSize on all chroms:
     ssh kkstore04
     cd /cluster/data/monDom4
     faSize ?{,?}/chr*.fa
     #	3605614649 bases (103971429 N's 3501643220 real 1551686450 upper
     #	1949956770 lower) in 10 sequences in 10 files
 
     cd /san/sanvol1/scratch/monDom4/cons/consRun1
     awk '
 {sum+=$3-$2}
 END{printf "%% %.2f = 100.0*%d/3501643220\n",100.0*sum/3501643220,sum}' \
 	mostConserved.bed
     #	--expected-lengths 12 --target-coverage 0.17 --quiet \
     #	% 4.80 = 100.0*168149380/3501643220
 
     ssh kolossus
     cd /san/sanvol1/scratch/monDom4/cons/consRun1
     #	Aiming for %70 coverage in
     #	the following featureBits measurement on CDS:
     # Beware of negative scores when too high.  The logToBedScore
     # will output an error on any negative scores.
 
     #	I don't think this will work for opossum because we don't have
     #	enough refGene:cds coverage:
     time nice -n +19 featureBits monDom4 refGene:cds
     #	34857 bases of 3501643220 (0.001%) in intersection
 
     HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits monDom4 \
 	-enrichment refGene:cds mostConserved.bed
     #	--expected-lengths 12 --target-coverage 0.17 --quiet \
     #	refGene:cds 0.001%, mostConserved.bed 4.802%, both 0.001%, cover
     #	75.13%, enrich 15.65x
 
     # Load most conserved track into database
     ssh hgwdev
     cd /cluster/data/monDom4/bed/multiz7way
     time nice -n +19 hgLoadBed -strict monDom4 \
 	phastConsElements7way mostConserved.bed
     #	real    2m31.673s
     #	Loaded 1882640 elements of size 5
 
     #	should measure the same as above
     time nice -n +19 featureBits monDom4 -enrichment refGene:cds \
 	phastConsElements7way
     #	refGene:cds 0.001%, phastConsElements7way 4.802%, both 0.001%,
     #	cover 75.13%, enrich 15.65x
 
     # Create merged posterier probability file and wiggle track data files
     ssh kkstore04
     cd /san/sanvol1/scratch/monDom4/cons/consRun1
     # the sed business gets the names sorted by chromName, chromStart
     #	so that everything goes in numerical order into wigEncode
     #	You will want to test this to make sure it is sorting correctly,
     #	run this command and see if everything is in order by chrom and
     #	position:
     find ./ppRaw -type f \
 	| sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
 	| sort -k7,7 -k9,9n \
 	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less
 
     time nice -n +19 find ./ppRaw -type f \
 	| sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
 	| sort -k7,7 -k9,9n \
 	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
 	    | $HOME/bin/$MACHTYPE/wigEncode -noOverlap stdin \
 		phastCons7.wig phastCons7.wib
     #	Converted stdin, upper limit 1.00, lower limit 0.00
     #	real    4m4.694s
 
     #	-rw-rw-r--    1 465425363 May  2 12:42 phastCons7.wib
     #	-rw-rw-r--    1  79633105 May  2 12:42 phastCons7.wig
 
     cp -p phastCons7.wi? /cluster/data/monDom4/bed/multiz7way/
 
     #	prepare compressed copy of ascii data values for downloads
     ssh kkstore04
     cd /san/sanvol1/scratch/monDom4/cons/consRun1
     cat << '_EOF_' > gzipAscii.sh
 #!/bin/sh
 
 TOP=`pwd`
 export TOP
 
 mkdir -p phastCons7Scores
 
 for D in ppRaw/chr*
 do
     C=${D/ppRaw\/}
     out=phastCons7Scores/${C}.data.gz
     echo "========================== ${C} ${D}"
     find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
 	| sort -k7,7 -k9,9n \
 	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
 	    gzip > ${out}
 done
 '_EOF_'
     #	<< happy emacs
     chmod +x gzipAscii.sh
     time nice -n +19 ./gzipAscii.sh
     #	real    8m10.959s
 
     #	copy them for downloads
     ssh kkstore04
     mkdir /cluster/data/monDom4/bed/multiz7way/phastCons7Scores
     cd /cluster/data/monDom4/bed/multiz7way/phastCons7Scores
     cp -p  /san/sanvol1/scratch/monDom4/cons/consRun1/phastCons7Scores/* .
 
     ssh hgwdev
     cd /usr/local/apache/htdocs/goldenPath/monDom4
     ln -s /cluster/data/monDom4/bed/multiz7way/phastCons7Scores .
 
     # Load gbdb and database with wiggle.
     ssh hgwdev
     cd /cluster/data/monDom4/bed/multiz7way
     ln -s `pwd`/phastCons7.wib /gbdb/monDom4/wib/phastCons7.wib
     time nice -n +19 hgLoadWiggle monDom4 phastCons7 phastCons7.wig
     #	real    0m52.109s
 
     #  Create histogram to get an overview of all the data
     ssh hgwdev
     cd /cluster/data/monDom4/bed/multiz7way
     time nice -n +19 hgWiggle -doHistogram \
 	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
 	    -db=monDom4 phastCons7 > histogram.data 2>&1
     #	real    9m8.075s
     #	create plot of histogram:
 
     cat << '_EOF_' | gnuplot > histo.png
 set terminal png small color \
         x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
 set size 1.4, 0.8
 set key left box
 set grid noxtics
 set grid ytics
 set title " Opossum MonDom4 Histogram phastCons7 track"
 set xlabel " phastCons7 score"
 set ylabel " Relative Frequency"
 set y2label " Cumulative Relative Frequency (CRF)"
 set y2range [0:1]
 set y2tics
 set yrange [0:0.02]
 
 plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
         "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
 '_EOF_'
     #	<< happy emacs
 
     display histo.png &
 
 #########################################################################
 # Build maf annotation for multiz7way  (DONE - 2006-07-31 - Hiram)
 
     ssh kolossus
     cd /cluster/data/monDom4/bed/multiz7way
     mkdir anno 
     cd anno
     mkdir maf run
     cd run
     rm sizes nBeds
 
     for i in `cat /cluster/data/monDom4/bed/multiz7way/species.lst`
 do
     ln -s  /cluster/data/$i/chrom.sizes $i.len
     ln -s  /cluster/data/$i/$i.N.bed $i.bed
     echo $i.bed  >> nBeds
     echo $i.len  >> sizes
 done
 
     echo "#!/bin/csh -fe" > jobs.csh
     echo date >> jobs.csh
     #	do smaller jobs first, the 'ls -rS' lists in reverse order by size
     for i in `ls -rS ../../maf/*.maf`
 do
 echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/monDom4/monDom4.2bit ../maf/`basename $i` >> jobs.csh
 echo "echo $i" >> jobs.csh
 done 
     echo date >> jobs.csh
 
     chmod +x jobs.csh
     
     #	establish a screen for this long running job
     screen
     cd /cluster/data/monDom4/bed/multiz7way
     time ./jobs.csh > jobs.log 2>&1
     #	real    195m54.071s
     #	user    57m51.373s
     #	sys     128m25.386s
     #	ctrl-a ctrl-d to release
     #	to return to the screen:
     screen -r -d
 
 
     cd /cluster/data/monDom4/bed/multiz7way/anno/maf
     mkdir -p /gbdb/monDom4/multiz7way/anno/maf
     ln -s /cluster/data/monDom4/bed/multiz7way/anno/maf/*.maf \
         /gbdb/monDom4/multiz7way/anno/maf
 
     time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/monDom4/multiz7way/anno/maf \
                 monDom4 multiz7way
 # Loaded 2781002 mafs in 10 files from /gbdb/monDom4/multiz7way/anno/maf
 # real    1m49.236s
 
     time cat *.maf \
 	| nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
 		-maxSize=50000 monDom4 multiz7waySummary stdin
 # Created 1423996 summary blocks from 5463968 components and 2781002 mafs
 #	from stdin
 #	real    2m30.875s
 
 #######################################################################
 # MULTIZ7WAY MAF FRAMES (DONE - 2006-08-01 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/monDom4/bed/multiz7way/frames
     cd /cluster/data/monDom4/bed/multiz7way/frames
     # The following is adapted from the sequence in mm8.txt
 
     #------------------------------------------------------------------------
     # get the genes for all genomes
     # mRNAs with CDS.  single select to get cds+psl, then split that up and
     # create genePred
     # using mrna table as genes:
     mkdir genes
 
     for qDB in danRer3
     do
       tmpExt=`mktemp temp.XXXXXX`
       tmpMrnaCds=${qDB}.mrna-cds.${tmpExt}
       tmpMrna=${qDB}.mrna.${tmpExt}
       tmpCds=${qDB}.cds.${tmpExt}
       echo $qDB
       hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
                    from all_mrna,gbCdnaInfo,cds \
                    where (all_mrna.qName = gbCdnaInfo.acc) and \
                      (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
        ${qDB} > ${tmpMrnaCds}
       cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
       cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
       mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
         stdout \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/$qDB.tmp.gz
       rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
       mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
       rm -f $tmpExt
     done
 
     # using knownGene for rn4 mm8 hg18
     # using refGene for galGal2
     # using mgcGenes for xenTro2
     # using ensGene for monDom4
     # genePreds; (must keep only the first 10 columns for knownGene)
     for qDB in rn4 mm8 hg18 galGal2 xenTro2 monDom4
     do
       if [ $qDB = "xenTro2" ]; then
         geneTbl=mgcGenes
       elif [ $qDB = "galGal2" ]; then
         geneTbl=refGene
       elif [ $qDB = "monDom4" ]; then
         geneTbl=ensGene
       else
         geneTbl=knownGene
       fi
       echo hgsql -N -e 'select * from '"$geneTbl ${qDB}"
       hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/$qDB.tmp.gz
       mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
       rm -f $tmpExt
     done
 
     #------------------------------------------------------------------------
     # create frames
     ssh kkstore04
     cd /cluster/data/monDom4/bed/multiz7way/frames
     rm mkMafFrames.sh
     for C in `awk '{print $1;}' /cluster/data/monDom4/chrom.sizes`
 do
     echo "echo mafFrames/${C}.maf"
     echo genePredToMafFrames monDom4 ../mafDownloads/${C}.maf.gz \
 	mafFrames/${C}.maf \
 	monDom4 genes/monDom4.gp.gz \
 	hg18 genes/hg18.gp.gz \
 	mm8 genes/mm8.gp.gz \
 	rn4 genes/rn4.gp.gz \
 	galGal2 genes/galGal2.gp.gz \
 	danRer3 genes/danRer3.gp.gz \
 	xenTro2 genes/xenTro2.gp.gz
 done > mkMafFrames.sh
 
     chmod +x mkMafFrames.sh
     time nice -n +19 ./mkMafFrames.sh
     #	real    1m10.711s
     gzip mafFrames/*.maf
 
 
     #------------------------------------------------------------------------
     # load the database
     ssh hgwdev
     cd /cluster/data/monDom4/bed/multiz7way/frames
     time nice -n +19 hgLoadMafFrames monDom4 multiz7wayFrames \
 	mafFrames/*.maf.gz
     #	real    0m17.864s
 
     #	The use of this table requires an entry in the trackDb.ra for this:
     #	frames multiz7wayFrames
     #	irows on
 
 ###########################################################################
 # BLASTZ SWAP OF ZEBRAFISH (danRer4) CHAIN AND NET ALIGNMENTS OVER TO 
 # monDom4 AND CREATE AXNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS 
 # (DONE, 2006-04-29, hartera)
     # see also makeDanRer4.doc
     # Remove test tables and directories and start again:
     ssh hgwdev
     cd /cluster/data/monDom4
     foreach c (`cat chrom.lst`)
        hgsql -e "drop table chr${c}_chainDanRer4;" monDom4
        hgsql -e "drop table chr${c}_chainDanRer4Link;" monDom4
        hgsql -e "drop table chr${c}_chainDanRer4NoDyMsk;" monDom4
        hgsql -e "drop table chr${c}_chainDanRer4NoDyMskLink;" monDom4
     end
     hgsql -e "drop table netDanRer4;" monDom4
     hgsql -e "drop table netDanRer4NoDyMsk;" monDom4
     # remove downloads
     rm -r /usr/local/apache/htdocs/goldenPath/monDom4/vsDanRer4
     rm \
     /usr/local/apache/htdocs/goldenPath/monDom4/liftOver/monDom4ToDanRer4.over.chain.gz
     rm /cluster/data/monDom4/bed/liftOver/monDom4ToDanRer4.over.chain.gz
     # remove old Blastz swap
     rm -r /cluster/data/monDom4/bed/blastz.danRer4.swap
     # remove link to old blastz directory
     rm -r /cluster/data/monDom4/bed/blastz.danRer4
 
     # Used the following BLASTZ parameters:
     # BLASTZ_H=2000
     # BLASTZ_Y=3400
     # BLASTZ_L=6000
     # BLASTZ_K=2200
     # BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
     # Used all repeats as lineage-specific repeats.
     # Swap directory is: /cluster/data/monDom4/bed/blastz.danRer4.swap
     ssh pk
     cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28
     nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
         -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
         -swap `pwd`/DEF >& doSwap.log &
     
     featureBits monDom4 chainDanRer4Link 
     # 62249730 bases of 3501643220 (1.778%) in intersection
     featureBits monDom4 chainDanRer3Link 
     # 73187496 bases of 3501643220 (2.090%) in intersection
     featureBits monDom4 netDanRer4
     # 745122163 bases of 3501643220 (21.279%) in intersection
     featureBits monDom4 netDanRer3
     # 672446285 bases of 3501643220 (19.204%) in intersection
     featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer4Link -enrichment
     # refGene:cds 0.001%, chainDanRer4Link 1.807%, both 0.000%, 
     # cover 43.85%, enrich 24.27x 
     featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer3Link -enrichment
     # refGene:cds 0.001%, chainDanRer3Link 2.044%, both 0.000%, 
     # cover 49.24%, enrich 24.09x
 
     # Only 36 RefSeqs in monDom4 refGene so try xenoRefGene (78707 sequences)
     # and mrna (174 sequences):
     featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer4Link -enrichment
     # xenoRefGene:cds 0.820%, chainDanRer4Link 1.807%, both 0.661%, 
     # cover 80.63%, enrich 44.63x
     featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer3Link -enrichment
     # xenoRefGene:cds 0.820%, chainDanRer3Link 2.044%, both 0.577%, 
     # cover 70.45%, enrich 34.47x
     featureBits -chrom=chr1 monDom4 mrna chainDanRer4Link -enrichment
     # mrna 0.004%, chainDanRer4Link 1.807%, both 0.002%, 
     # cover 52.67%, enrich 29.15x
     featureBits -chrom=chr1 monDom4 mrna chainDanRer3Link -enrichment
     # mrna 0.004%, chainDanRer3Link 2.044%, both 0.002%, 
     # cover 54.50%, enrich 26.66x
  
     featureBits -chrom=chr1 monDom4 xenoRefGene:cds netDanRer4 -enrichment
     # xenoRefGene:cds 0.820%, netDanRer4 24.539%, both 0.720%, 
     # cover 87.79%, enrich 3.58x
     featureBits -chrom=chr1 monDom4 xenoRefGene:cds netDanRer3 -enrichment
     # xenoRefGene:cds 0.820%, netDanRer3 22.272%, both 0.640%, 
     # cover 78.12%, enrich 3.51x
     featureBits -chrom=chr1 monDom4 mrna netDanRer4 -enrichment
     # mrna 0.004%, netDanRer4 24.539%, both 0.002%, cover 56.90%, enrich 2.32x
     featureBits -chrom=chr1 monDom4 mrna netDanRer3 -enrichment
     # mrna 0.004%, netDanRer3 22.272%, both 0.002%, cover 62.62%, enrich 2.81x
 
 ##########################################################################
 # PRODUCING GENSCAN PREDICTIONS (WORKING - 2006-05-02 - Hiram)
     #	Want to make up a chrom and contig 2bit file to get proper
     #	names on the genscan predictions.
     ssh kkstore04
     cd /cluster/data/monDom4/Un
     grep chrUn ../broad.mit.edu/Monodelphis4.0.agp > chrUn.broad.agp
     cat chrUn.broad.agp | /cluster/bin/scripts/agpToLift > chrUn_random.lft
     $HOME/kent/src/hg/utils/lft2BitToFa.pl ../monDom4.2bit chrUn_random.lft \
 	> chrUn_random.ctg.fa
 
     #	Create a 2bit file with the full chrom sequences and the
     #	random contigs, all hard masked
     cat ?/chr?.fa Un/chrUn_random.ctg.fa \
 	| maskOutFa stdin hard stdout \
 	    | faToTwoBit stdin monDom4Chroms_RandomContigs.hard.2bit
     #	make sure it still has all the unmasked sequence in it:
     twoBitToFa monDom4Chroms_RandomContigs.hard.2bit stdout \
 	| faSize stdin
     #	3589973501 bases (2038287051 N's 1551686450 real 1551686450
     #	upper 0 lower) in 9949 sequences in 1 files
     twoBitToFa monDom4.2bit stdout | faSize stdin
     #	3605614649 bases (103971429 N's 3501643220 real 1551686450 upper
     #	1949956770 lower) in 10 sequences in 1 files
 
     #	note the 'real' bases are the same, the lowers have become N's
     #	1089350685 + 97171400 = 1186522085 
     #	1949956770 + 103971429 = 2053928199
     #	2053928199 - 2038287051  = 15641148 == N's in gaps between contigs
 
     #	There are a lot of contigs that became completely N's with no
     #	sequence left.  genscan doesn't like that.  We need to get them
     #	out of the analysis.  Pick out the ones with over 18 real bases
     #	left:
     twoBitToFa monDom4Chroms_RandomContigs.hard.2bit stdout \
 	| faCount stdin > chroms_randoms.faCount
     egrep -v "^#|^total" chroms_randoms.faCount \
 	| awk '{if ($2-$7 > 17) {print $1}}' > over18.lst
     #	creating 4,000,000 sized chunks
     mkdir hardChunks
     twoBitToFa monDom4Chroms_RandomContigs.hard.2bit stdout \
 	| faSomeRecords /dev/stdin over18.lst stdout \
 	    | faSplit about stdin 4000000 hardChunks/c_
     rsync -a --progress hardChunks/ /cluster/bluearc/monDom4/hardChunks/
 
 
     ssh hgwdev
     mkdir /cluster/data/monDom4/bed/genscan
     cd /cluster/data/monDom4/bed/genscan
     # Check out hg3rdParty/genscanlinux to get latest genscan:
     cvs co hg3rdParty/genscanlinux
 
     # Run on pk cluster instead of kki since these jobs are going to
     # take quite a while and we don't want to tie up the kki kluster
     # entirely.
     ssh pk
     cd /cluster/data/monDom4/bed/genscan
     # Make 3 subdirectories for genscan to place output files
     mkdir gtf pep subopt
     # Generate a list file, genome.list, of all the hard-masked contigs that 
     # *do not* consist of all-N's (which would cause genscan to blow up)
     #	Since we split on gaps, we have no chunks like that.  You can
     #	verify with faCount on the chunks.
     ls -1S /cluster/bluearc/monDom4/hardChunks/c_*.fa > genome.list
 
     # Create template file, for gensub2.  For example (3-line file):
     cat << '_EOF_' > template
 #LOOP
 /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
 #ENDLOOP
 '_EOF_'
     # << emacs
     gensub2 genome.list single template jobList
     para create jobList
     para try, check, push, check, ...
 XXXX - running 2006-05-04 14:13
 # Completed: 906 of 906 jobs
 # CPU time in finished jobs:     215764s    3596.07m    59.93h    2.50d  0.007 y
 # IO & Wait Time:                  2798s      46.63m     0.78h    0.03d  0.000 y
 # Average job time:                 241s       4.02m     0.07h    0.00d
 # Longest finished job:           57335s     955.58m    15.93h    0.66d
 # Submission to last job:         63345s    1055.75m    17.60h    0.73d
 
     cat gtf/c_*.gtf | liftUp -type=.gtf t.gtf \
 	/cluster/data/monDom4/Un/chrUn_random.lft carry stdin
 
     # Concatenate and lift to chrom results:
     ssh kkstore04
     cd /cluster/data/monDom4/bed/genscan
     for C in `awk '{print $1}' ../../chrom.sizes | sed -e "s/chr//"`
     do
 	mkdir -p ${C}
 	catDir gtf | liftUp ${C}/chr${C}.gtf \
 	    /cluster/data/monDom4/hardChunks.lft error stdin
 	catDir subopt | liftUp ${C}/chr${C}.bed \
 	    /cluster/data/monDom4/hardChunks.lft error stdin
 	catDir pep > ${C}/chr${C}.pep
 	echo "done: chr${C}"
     done
 
     # Load into the database as so:
     ssh hgwdev
     cd /cluster/data/monDom4/bed/genscan
     ldHgGene -bin -gtf monDom4 genscan 1/chr1.gtf
     #	35103 gene predictions
     hgPepPred monDom4 generic genscanPep 1/chr1.pep
     hgLoadBed monDom4 genscanSubopt 1/chr1.bed
     #	Loaded 446953 elements of size 6
 
     #	Compare with other assemblies:
     featureBits monDom4 genscan
     #	48628209 bases of 3496445653 (1.391%) in intersection
     featureBits monDom1 genscan
     #	47170795 bases of 3492108230 (1.351%) in intersection
     featureBits monDom4 genscanSubopt
     #	52714123 bases of 3496445653 (1.508%) in intersection
     featureBits monDom1 genscanSubopt
     #	51805835 bases of 3492108230 (1.484%) in intersection
     featureBits hg17 genscan
     #	55323340 bases of 2866216770 (1.930%) in intersection
     featureBits hg17 genscanSubopt
     #	55986178 bases of 2866216770 (1.953%) in intersection
 
     # Clean up
     cd /san/sanvol1/scratch/monDom4
     rm -fr hardMasked200K
     #	This kolossus clean up was actually left until much later.
     #	I found these copies here to be useful later on.
     ssh kolossus
     cd /scratch/monDom4
     rm -f *
     cd ..
     rmdir monDom4
 
 #########################################################################
 #  creating bigZips downloads (DONE - 2006-05-15 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/monDom4/downloads
     mkdir /cluster/data/monDom4/downloads/bigZips
     mkdir /cluster/data/monDom4/downloads/chromosomes
     cd /cluster/data/monDom4/downloads/chromosomes
     #	There wasn't a README for a previous assembly, these things are
     #	pretty generic, the mm8 one can be used with only slight edits.
     #	Next time use the previous Opossum assembly README.
     cp /usr/local/apache/htdocs/goldenPath/mm8/chromosomes/README.txt .
     #	edit that readme to provide correct references and details
     ln -s `pwd` /usr/local/apache/htdocs/goldenPath/monDom4/chromosomes
 
     ssh kkstore04
     cd /cluster/data/monDom4/downloads/chromosomes
     for C in 1 2 3 4 5 6 7 8 X Un
     do
 	echo "gzip -c ../../${C}/chr${C}.fa > chr${C}.fa.gz"
 	gzip -c ../../${C}/chr${C}.fa > chr${C}.fa.gz
     done
     md5sum *.fa.gz R*.txt > md5sum.txt
 
     cd ../bigZips
     gzip -c ../../broad.mit.edu/Monodelphis4.0.agp > Monodelphis4.0.agp.gz
     cd ../..
     tar cvzf downloads/bigZips/chromFa.tar.gz ?/chr*.fa Un/chrUn.fa
     #	16 minutes
     tar cvzf downloads/bigZips/chromFaMasked.tar.gz ?/chr*.fa.masked \
 	Un/chrUn.fa.masked
     #	8 minutes
     tar cvzf downloads/bigZips/chromOut.tar.gz ?/chr*.fa.out Un/chrUn.fa.out
     cd bed/simpleRepeat
     mkdir trfMask
     cd trfMask
     for C in 1 2 3 4 5 6 7 8 X Un
     do
 	ln -s ../chromMask/chr${C}_Mask.bed chr${C}.bed
     done
     tar --dereference cvzf ../../downloads/bigZips/chromTrf.tar.gz ./trfMask
 
     ssh hgwdev
     cd /cluster/data/monDom4/downloads/bigZips
     cp /usr/local/apache/htdocs/goldenPath/mm8/bigZips/README.txt .
 
     ln -s /cluster/data/monDom4/downloads/bigZips \
 	/usr/local/apache/htdocs/goldenPath/monDom4/bigZips
     cd /usr/local/apache/htdocs/goldenPath/monDom4/bigZips
     ln -s /cluster/data/monDom4/monDom4.2bit .
 
     ssh kkstore04
     cd /cluster/data/monDom4/downloads/bigZips
     md5sum *.gz *.2bit R*.txt > md5sum.txt
 
     
 #########################################################################
 # BLASTZ CHICKEN galGal3 (DONE 5/26/06 angie)
     ssh pk
     mkdir /cluster/data/monDom4/bed/blastzGalGal3.2006-05-25
     cd /cluster/data/monDom4/bed/blastzGalGal3.2006-05-25
     cat << '_EOF_' > DEF
 # opossum vs chicken
 BLASTZ=blastz.v7.x86_64
 
 # Specific settings for chicken (per Webb email to Brian Raney)
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=10000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Opossum monDom4
 SEQ1_DIR=/san/sanvol1/scratch/monDom4/nib
 SEQ1_SMSK=/san/sanvol1/scratch/monDom4/linSpecRep.notInOthers
 SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
 SEQ2_DIR=/san/sanvol1/galGal3/nib
 SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
 SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep
 SEQ2_CHUNK=200000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/monDom4/bed/blastzGalGal3.2006-05-25
 '_EOF_'
     # << emacs
     doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/gg3vsmonDom4 \
       -bigClusterHub=pk -smallClusterHub=pk \
       -chainMinScore=5000 -chainLinearGap=loose \
       >& do.log & tail -f do.log
     ln -s blastzGalGal3.2006-05-25 /cluster/data/monDom4/bed/blastz.galGal3
 
 
 #####################################################################
 #### LOAD ENSEMBL GENES (DONE - 2006-06-21 - Hiram)
    mkdir /cluster/data/monDom4/bed/ensGene
    cd /cluster/data/monDom4/bed/ensGene
 
         Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview
         Choose Ensembl 39 and Monodelphis domestica, click next
 	It displays status in a window on the right, indicating how many
 	entries are here, currently: 20,907
     	The next page is the "filter" step, we do not want any filters,
 	nothing is changed on this page, click next
 	Now we are on the "output" tab, the filter in the window on the right
 	indicates that 20,907 passed the filter.  (there is no filter)
 	Now, on this output page, change the pull-down menu item from
 	its default of "features" to read "structures"
 	All the check-boxes now change.
 	Mark the check box GTF under output format
 	Under Gene Ensemble Attributes,
 	Unselect Biotype
 	Select
 	Ensembl Gene ID
 	Ensembl Transcript ID
 	External Gene ID
 
 	gzip compression
 	and give it a filename: ensGeneMonDom4
 	it will add the .gff.gz suffix
 	press "export"
 
 
     #	They have a chrMT - we don't have the chrM sequence, 
     # Add "chr" to front of each line
     #	 in the gene data gtf file to make 
     # it compatible with ldHgGene and convert remove chrMT name
 
     zcat ensGeneMonDom4.gff.gz \
 	| sed -e "s/^\([0-9XYU][0-9n]*\)/chr\1/; /^MT/d" > ensGene.gtf
     #	Verify names converted OK:
     awk '{print $1}' ensGene.gtf | sort | uniq -c
     #	153247 chr1
     #	126767 chr2
     #	 82637 chr3
     #	 90199 chr4
     #	 47661 chr5
     #	 50469 chr6
     #	 34230 chr7
     #	 55841 chr8
     #	 41868 chrUn
     #	  9965 chrX
 
     ldHgGene -gtf monDom4 ensGene ensGene.gtf
 # Read 33878 transcripts in 692884 lines in 1 files
 #   33878 groups 10 seqs 1 sources 4 feature types
 # 33878 gene predictions
 
     featureBits monDom4 ensGene
     #	32770559 bases of 3501643220 (0.936%) in intersection
 
     #	Let's see if the peptides created from the genome sequence are
     #	different than the peptides that Ensemble says they are making:
 
     #	fetch chrX peptides from the ensGene table, sort them by their
     #	name into single line instances:
     getRnaPred -peptides monDom4 ensGene chrX stdout \
 	| faToTab -type=protein stdin stdout | sort > ensGenePep.chrX.fa
     #	And chrX peptides downloaded from ensmart:
     faToTab -type=protein ensPepMonDom4chrX.fasta.gz stdout \
 	| sort > ensPepchrX.fa
     #	A diff of those two show they are different.  But this is not
     #	correct yet because they don't even have the same names in them
     #	need to try some other options here to get the same sequence
 
 
     #	It turns out it is necessary to have the ensGtp table to enable a link
     #	to Ensemble from a gene's details page
 
     # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
     # hgKnownToSuper.  Use ensMart to create it as above, except:
     # Page 3) Choose the "Features" box. In "Ensembl Attributes", check 
     # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
     # Choose Text, tab-separated as the output format.  Result name ensGtp.
     # Save file as monDom4.ensGtp.txt.gz
     gunzip ensGtp.txt.gz
     hgsql monDom4 < ~/kent/src/hg/lib/ensGtp.sql
     # remove header line from ensGtp.txt
     zcat monDom4.ensGtp.txt.gz \
 	| hgsql monDom4 \
 -e "load data local infile '/dev/stdin' into table ensGtp ignore 1 lines"
 
     #	Obtaining protein sequence from EnsMart
     #	Select "sequences" from the pull-down on the output page
     #	check Peptide in the "Sequences" selection area
     #	and "Ensembl Transcript ID (versioned) in the Transcript
     #	Attributes area
     #	Uncheck all items in the Gene Attributes section.
     #	Text,Fasta output, gzip, file name: monDom4.ensPep
     #	becomes monDom4.ensPep.fasta.gz
     #	The result should be just an ID and a peptide, e.g.:
 # >ENSMODT00000024417.2
 # MSEFWLCFNCCIAEQPQPKRRRRIDRSMIGEPTNFVHTAHVGSGDLFSGMNSVSSIQSQMQSKGGYGGGMPSNGQMQLVETKAG
     #
     #	 monDom4.ensPep.fa.gz
     zcat monDom4.ensPep.fa.gz | \
 	$HOME/kent/src/utils/faToTab/faToTab.pl  /dev/null /dev/stdin \
 	> ensGene.fa.tab
     #	There appear to be more peptides than there were genes before.
     #	I wonder if Ensembl has updated this list since the gene data was
     #	fetched before:
     awk '{print $1}' genePred.tab | sort -u > geneNameList
     awk '{print $1}' ensGene.fa.tab | sort -u > pepNameList
     wc geneNameList pepNameList
     #	33878   33878  711438 geneNameList
     #	33915   33914  712193 pepNameList
     #	67793   67792 1423631 total
     comm -12 geneNameList pepNameList | wc
     #	33878   33878  711438
     #	So create an exclude list to get rid of the extras:
     comm -13 geneNameList pepNameList  > excludeList
     #	and leave them out
     zcat monDom4.ensPep.fa.gz | \
 	$HOME/kent/src/utils/faToTab/faToTab.pl excludeList /dev/stdin \
 	> ensGene.fa.tab
 
     hgPepPred monDom4 tab ensPep ensGene.fa.tab
 
 ##########################################################################
 # N-SCAN gene predictions (nscanGene, nscanPep) - (2006-09-13 markd)
     cd /cluster/data/monDom4/bed/nscan/
 
     # obtained NSCAN predictions from michael brent's group
     # at WUSTL
     wget -nv -r -l 1 -np --accept=gtf http://ardor.wustl.edu/predictions/possum/monDom4/chr_gtf
     wget -nv -r -l 1 -np --accept=fa http://ardor.wustl.edu/predictions/possum/monDom4/chr_ptx
 
     # clean up downloaded directorys and compress files.
     mv ardor.wustl.edu/predictions/opossum/monDom4/* .
     rm -rf ardor.wustl.edu
     gzip chr_*/*
     chmod a-w chr_*/*.gz
 
     # load tracks.  Note that these have *utr features, rather than
     # exon features.  currently ldHgGene creates separate genePred exons
     # for these.
     ldHgGene -bin -gtf -genePredExt monDom4 nscanGene chr_gtf/chr*.gtf.gz
 
     # load protein, add .a suffix to match transcript id
     hgPepPred -suffix=.a monDom4 generic nscanPep chr_ptx/chr*.fa.gz
     rm *.tab
 
     # update trackDb; need a monDom4-specific page to describe informants
     opossum/monDom4/nscanGene.html   (copy from rn4 and edit)
     opossum/monDom4/trackDb.ra
 
     # verify that the search spec matches the naming convention, sometimes
     # transcripts end with .a othertimes .0
     OK!
 
 
 #########################################################################
 # BLASTZ PLATYPUS ornAna1 (DONE 4/3/07 angie)
     ssh pk
     mkdir /cluster/data/monDom4/bed/blastz.ornAna1.2007-04-02
     cd /cluster/data/monDom4/bed/blastz.ornAna1.2007-04-02
     cat << '_EOF_' > DEF
 # opossum vs platypus
 BLASTZ=blastz.v7.x86_64
 
 # Use "mammal-fish" params even though this is mammal-mammal... 
 # pretty distant, hopefully not too many shared undermasked repeats, 
 # and if we get overwhelmed we can back off:
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Opossum monDom4
 SEQ1_DIR=/san/sanvol1/scratch/monDom4/nib
 SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Platypus ornAna1
 SEQ2_DIR=/san/sanvol1/scratch/ornAna1/ornAna1.2bit
 SEQ2_LEN=/san/sanvol1/scratch/ornAna1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=300
 SEQ2_LAP=0
 
 BASE=/cluster/data/monDom4/bed/blastz.ornAna1.2007-04-02
 '_EOF_'
     # << emacs
     doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/monDom4ornAna1 \
       -bigClusterHub=pk -smallClusterHub=pk \
       -chainMinScore=5000 -chainLinearGap=loose \
       >& do.log & tail -f do.log
     ln -s blastz.ornAna1.2007-04-02 /cluster/data/monDom4/bed/blastz.ornAna1
 
 #########################################################################
 # BLASTZ Rhesus rheMac2 (DONE 2007-05-27 braney)
     ssh kolossus
     mkdir /cluster/data/monDom4/bed/blastz.rheMac2
     cd /cluster/data/monDom4/bed/blastz.rheMac2
     cat << '_EOF_' > DEF
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 
 # settings for more distant organism alignments
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=10000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=0
     
 # TARGET: Opossum monDom4
 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
 SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Rhesus (rheMac2)
 SEQ2_DIR=/san/sanvol1/scratch/rheMac2/nib
 SEQ2_LEN=/san/sanvol1/scratch/rheMac2/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/monDom4/bed/blastz.rheMac2
 TMPDIR=/scratch/tmp
     # << emacs
 
     doBlastzChainNet.pl DEF \
       -bigClusterHub=pk  -chainMinScore=5000 -chainLinearGap=loose \
       >& blastz.out 2>&1 &
 
 ############################################################################
 ##  BLASTZ swap from canFam2 alignments (2007-11-10 - markd)
     # /cluster/data/canFam2/bed/blastz.monDom4/DEF1 has incorrect BASE, so
     # create DEF.fixed
     ssh hgwdev
     mkdir /cluster/data/monDom4/bed/blastz.canFam2.swap
     cd /cluster/data/monDom4/bed/blastz.canFam2.swap
     ln -s blastz.canFam2.swap ../blastz.canFam2
 
     (time /cluster/bin/scripts/doBlastzChainNet.pl \
         -swap /cluster/data/canFam2/bed/blastz.monDom4/DEF.fixed) >& swap.out&
 
     nice -n +19 featureBits monDom4 chainCanFam2Link
     # 325604035 bases of 3501643220 (9.299%) in intersection
 
 ##############################################################################
 # Blastz Orangutan ponAbe2 (DONE - 2007-11-26 - Hiram)
     ssh kkstore04
     #	managing disk space on full filesystem
     mkdir /cluster/store8/monDom4/bed/blastzPonAbe2.2007-11-22
     ln -s /cluster/store8/monDom4/bed/blastzPonAbe2.2007-11-22 \
 	/cluster/data/monDom4/bed
     cd /cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22
 
     screen # use screen to control this job
     mkdir /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13
     cd /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13
 
     cat << '_EOF_' > DEF
 # Opossum vs. Orangutan
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_M=50
 
 # TARGET: Opossum MonDom4
 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
 SEQ1_LEN=/cluster/data/monDom4/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=100
 
 # QUERY: Orangutan PonAbe2
 SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
 SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ1_LIMIT=100
 SEQ2_LAP=0
 
 BASE=/cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     # XXX - can run Opossum only on pk since the chroms are so large
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-stop=cat -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
 	> cat.log 2>&1 &
     #	real    1600m18.602s
 # Completed: 134688 of 134688 jobs
 # CPU time in finished jobs:   12106044s  201767.41m  3362.79h  140.12d  0.384 y
 # IO & Wait Time:                851946s   14199.09m   236.65h    9.86d  0.027 y
 # Average job time:                  96s       1.60m     0.03h    0.00d
 # Longest finished job:           11205s     186.75m     3.11h    0.13d
 # Submission to last job:         95155s    1585.92m    26.43h    1.10d
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-continue=chainRun -stop=download -chainLinearGap=loose \
 	-bigClusterHub=pk -verbose=2 \
 	> chainRun.log 2>&1 &
     #	real    2724m19.530s
 
     cat fb.monDom4.chainPonAbe2Link.txt
     #	395887393 bases of 3501643220 (11.306%) in intersection
 
     #	and for the swap
     mkdir /cluster/data/ponAbe2/bed/blastz.monDom4.swap
     cd /cluster/data/ponAbe2/bed/blastz.monDom4.swap
     time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \
 	/cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22/DEF \
 	-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
 	> swap.log 2>&1 &
     #	real    223m38.325s
     cat fb.ponAbe2.chainMonDom4Link.txt
     #	406409435 bases of 3093572278 (13.137%) in intersection
 
 ##############################################################################
 # Blastz Marmoset calJac1 (WORKING - 2007-11-24 - Hiram)
     ssh kkstore04
     screen # use screen to control this job
     #	managing disk space on full filesystem
     mkdir /cluster/store8/monDom4/bed/blastzCalJac1.2007-11-24
     ln -s /cluster/store8/monDom4/bed/blastzCalJac1.2007-11-24 \
 	/cluster/data/monDom4/bed
     cd /cluster/data/monDom4/bed/blastzCalJac1.2007-11-24
 
     #	this chunking makes 651,480 jobs
     cat << '_EOF_' > DEF
 # Opossum vs. Marmoset
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_M=50
 
 # TARGET: Opossum MonDom4
 SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
 SEQ1_LEN=/cluster/data/monDom4/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=100
 
 # QUERY: Marmoset calJac1, largest contig 2,551,648, 49,724 contigs
 SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit
 SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
 SEQ2_CHUNK=2000000
 SEQ2_LIMIT=200
 SEQ2_LAP=0
 
 BASE=/cluster/data/monDom4/bed/blastzCalJac1.2007-11-24
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     # XXX - can run Opossum only on pk since the chroms are so large
     time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=5000 \
 	-stop=cat -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
 	> cat.log 2>&1 &
     #	real 2581m49.853s
 # Completed: 651480 of 651480 jobs
 # CPU time in finished jobs:   25871528s  431192.14m  7186.54h  299.44d  0.820 y
 # IO & Wait Time:               2695828s   44930.46m   748.84h   31.20d  0.085 y
 # Average job time:                  44s       0.73m     0.01h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            1804s      30.07m     0.50h    0.02d
 # Submission to last job:        146641s    2444.02m    40.73h    1.70d
     time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=5000 \
 	-continue=chainRun -stop=download \
 	-chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
 	> chainRun.log 2>&1 &
     #	real    1810m38.601s
     # the 10 chaining jobs on kki take about 35 hours for the longest job
 # Completed: 10 of 10 jobs
 # CPU time in finished jobs:     369079s    6151.32m   102.52h    4.27d  0.012 y
 # IO & Wait Time:                   385s       6.41m     0.11h    0.00d  0.000 y
 # Average job time:               36946s     615.77m    10.26h    0.43d
 # Longest finished job:          103069s    1717.82m    28.63h    1.19d
 # Submission to last job:        103069s    1717.82m    28.63h    1.19d
 
     cat fb.monDom4.chainCalJac1Link.txt
     #	386915334 bases of 3501643220 (11.050%) in intersection
 
     time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=5000 \
 	-continue=cleanup \
 	-chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
 	> cleanup.log 2>&1 &
 
     mkdir /cluster/data/calJac1/bed/blastz.monDom4.swap
     cd /cluster/data/calJac1/bed/blastz.monDom4.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/monDom4/bed/blastzCalJac1.2007-11-24/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-swap -bigClusterHub=pk > swap.log 2>&1 &
     #	real    498m24.344s
     #	this failed during the load because the chainLink table became
     #	too large.  These were loaded manually with an sql statement to
     #	start the table definition, then a load data local infile using
     #	the .tab files left over from the failed load.  Note the extra
     #	definitions on the chainMonDom4Link table
 CREATE TABLE chainMonDom4 (
   bin smallint(5) unsigned NOT NULL default '0',
     score double not null,	# score of chain
     tName varchar(255) not null,	# Target sequence name
     tSize int unsigned not null,	# Target sequence size
     tStart int unsigned not null,	# Alignment start position in target
     tEnd int unsigned not null,	# Alignment end position in target
     qName varchar(255) not null,	# Query sequence name
     qSize int unsigned not null,	# Query sequence size
     qStrand char(1) not null,	# Query strand
     qStart int unsigned not null,	# Alignment start position in query
     qEnd int unsigned not null,	# Alignment end position in query
     id int unsigned not null,	# chain id
               #Indices
   KEY tName (tName(16),bin),
   KEY id (id)
 ) TYPE=MyISAM;
 
     time nice -n +19 hgsql -e \
 	"load data local infile \"chain.tab\" into table chainMonDom4;" calJac1
     #	real    8m10.273s
 
 CREATE TABLE chainMonDom4Link (
   bin smallint(5) unsigned NOT NULL default 0,
   tName varchar(255) NOT NULL default '',
   tStart int(10) unsigned NOT NULL default 0,
   tEnd int(10) unsigned NOT NULL default 0,
   qStart int(10) unsigned NOT NULL default 0,
   chainId int(10) unsigned NOT NULL default 0,
   KEY tName (tName(16),bin),
   KEY chainId (chainId)
 ) ENGINE=MyISAM max_rows=160000000 avg_row_length=55 pack_keys=1 CHARSET=latin1;
 
     time nice -n +19 hgsql -e \
       "load data local infile \"link.tab\" into table chainMonDom4Link;" calJac1
     #	this one took a number of hours
 
     #	finish the nets and load
     time nice -n +19 netClass -verbose=0 -noAr noClass.net \
 	calJac1 monDom4 calJac1.monDom4.net
     #	real    4m18.898s
     time nice -n +19 netFilter -minGap=10 calJac1.monDom4.net \
 	| hgLoadNet -verbose=0 calJac1 netMonDom4 stdin
     #	real    0m54.559s
     cd /cluster/data/calJac1/bed/blastz.monDom4.swap
     time nice -n +19 featureBits calJac1 chainMonDom4Link \
 	> fb.calJac1.chainMonDom4Link.txt 2>&1
     #	real    25m7.373s
     cat fb.calJac1.chainMonDom4Link.txt
     #	391248654 bases of 2929139385 (13.357%) in intersection
 
     #	continuing
     cd /cluster/data/calJac1/bed/blastz.monDom4.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/monDom4/bed/blastzCalJac1.2007-11-24/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-continue=download -swap -bigClusterHub=pk > download.log 2>&1 &
 
 ###########################################################################
 ############################################################################
 # TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
 
 see doc/builds.txt for specific details.
 ############################################################################
 ############################################################################
 # TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
 
 see doc/builds.txt for specific details.
 ############################################################################
+############################################################################
+# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)
+
+vertebrate-wide transMap alignments were built  Tracks are created and loaded
+by a single Makefile. This is available from:
+   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
+
+see doc/builds.txt for specific details.
+############################################################################